1 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
2 ; RUN: opt -S -aarch64-sve-intrinsic-opts -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck %s
4 declare <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 immarg)
5 declare <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 immarg)
6 declare <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 immarg)
7 declare <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 immarg)
9 declare <vscale x 16 x i32> @llvm.aarch64.sve.ld1.nxv16i32(<vscale x 16 x i1>, ptr)
10 declare <vscale x 2 x i32> @llvm.aarch64.sve.ld1.nxv2i32(<vscale x 2 x i1>, ptr)
11 declare <vscale x 4 x i32> @llvm.aarch64.sve.ld1.nxv4i32(<vscale x 4 x i1>, ptr)
12 declare <vscale x 8 x i16> @llvm.aarch64.sve.ld1.nxv8i16(<vscale x 8 x i1>, ptr)
13 declare <vscale x 8 x i32> @llvm.aarch64.sve.ld1.nxv8i32(<vscale x 8 x i1>, ptr)
15 declare <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1>)
16 declare <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1>)
18 ; Two calls to the SVE ptrue intrinsic. %1 is redundant, and can be expressed as an SVE reinterpret of %3 via
19 ; convert.{to,from}.svbool.
20 define <vscale x 8 x i32> @coalesce_test_basic(ptr %addr) {
21 ; CHECK-LABEL: @coalesce_test_basic(
22 ; CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
23 ; CHECK-NEXT: [[TMP2:%.*]] = call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv8i1(<vscale x 8 x i1> [[TMP1]])
24 ; CHECK-NEXT: [[TMP3:%.*]] = call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[TMP2]])
25 ; CHECK-NEXT: [[TMP4:%.*]] = call <vscale x 4 x i32> @llvm.aarch64.sve.ld1.nxv4i32(<vscale x 4 x i1> [[TMP3]], ptr [[ADDR:%.*]])
26 ; CHECK-NEXT: [[TMP5:%.*]] = call <vscale x 8 x i32> @llvm.aarch64.sve.ld1.nxv8i32(<vscale x 8 x i1> [[TMP1]], ptr [[ADDR]])
27 ; CHECK-NEXT: ret <vscale x 8 x i32> [[TMP5]]
29 %1 = call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
30 %2 = call <vscale x 4 x i32> @llvm.aarch64.sve.ld1.nxv4i32(<vscale x 4 x i1> %1, ptr %addr)
31 %3 = call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
32 %4 = call <vscale x 8 x i32> @llvm.aarch64.sve.ld1.nxv8i32(<vscale x 8 x i1> %3, ptr %addr)
33 ret <vscale x 8 x i32> %4
36 ; Two calls to the SVE ptrue intrinsic with the SV_POW2 pattern. This should reduce to the same output as
37 ; coalesce_test_basic.
38 define <vscale x 8 x i32> @coalesce_test_pow2(ptr %addr) {
39 ; CHECK-LABEL: @coalesce_test_pow2(
40 ; CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 0)
41 ; CHECK-NEXT: [[TMP2:%.*]] = call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv8i1(<vscale x 8 x i1> [[TMP1]])
42 ; CHECK-NEXT: [[TMP3:%.*]] = call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[TMP2]])
43 ; CHECK-NEXT: [[TMP4:%.*]] = call <vscale x 4 x i32> @llvm.aarch64.sve.ld1.nxv4i32(<vscale x 4 x i1> [[TMP3]], ptr [[ADDR:%.*]])
44 ; CHECK-NEXT: [[TMP5:%.*]] = call <vscale x 8 x i32> @llvm.aarch64.sve.ld1.nxv8i32(<vscale x 8 x i1> [[TMP1]], ptr [[ADDR]])
45 ; CHECK-NEXT: ret <vscale x 8 x i32> [[TMP5]]
47 %1 = call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 0)
48 %2 = call <vscale x 4 x i32> @llvm.aarch64.sve.ld1.nxv4i32(<vscale x 4 x i1> %1, ptr %addr)
49 %3 = call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 0)
50 %4 = call <vscale x 8 x i32> @llvm.aarch64.sve.ld1.nxv8i32(<vscale x 8 x i1> %3, ptr %addr)
51 ret <vscale x 8 x i32> %4
54 ; Four calls to the SVE ptrue intrinsic; two with the SV_ALL patterns, and two with the SV_POW2 pattern. The
55 ; two SV_ALL ptrue intrinsics should be coalesced, and the two SV_POW2 intrinsics should be colaesced.
56 define <vscale x 8 x i32> @coalesce_test_all_and_pow2(ptr %addr) {
57 ; CHECK-LABEL: @coalesce_test_all_and_pow2(
58 ; CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 0)
59 ; CHECK-NEXT: [[TMP2:%.*]] = call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv8i1(<vscale x 8 x i1> [[TMP1]])
60 ; CHECK-NEXT: [[TMP3:%.*]] = call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[TMP2]])
61 ; CHECK-NEXT: [[TMP4:%.*]] = call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
62 ; CHECK-NEXT: [[TMP5:%.*]] = call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv8i1(<vscale x 8 x i1> [[TMP4]])
63 ; CHECK-NEXT: [[TMP6:%.*]] = call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[TMP5]])
64 ; CHECK-NEXT: [[TMP7:%.*]] = call <vscale x 4 x i32> @llvm.aarch64.sve.ld1.nxv4i32(<vscale x 4 x i1> [[TMP3]], ptr [[ADDR:%.*]])
65 ; CHECK-NEXT: [[TMP8:%.*]] = call <vscale x 8 x i32> @llvm.aarch64.sve.ld1.nxv8i32(<vscale x 8 x i1> [[TMP1]], ptr [[ADDR]])
66 ; CHECK-NEXT: [[TMP9:%.*]] = call <vscale x 4 x i32> @llvm.aarch64.sve.ld1.nxv4i32(<vscale x 4 x i1> [[TMP6]], ptr [[ADDR]])
67 ; CHECK-NEXT: [[TMP10:%.*]] = call <vscale x 8 x i32> @llvm.aarch64.sve.ld1.nxv8i32(<vscale x 8 x i1> [[TMP4]], ptr [[ADDR]])
68 ; CHECK-NEXT: ret <vscale x 8 x i32> [[TMP10]]
70 %1 = call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 0)
71 %2 = call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 0)
72 %3 = call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
73 %4 = call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
75 %5 = call <vscale x 4 x i32> @llvm.aarch64.sve.ld1.nxv4i32(<vscale x 4 x i1> %1, ptr %addr)
76 %6 = call <vscale x 8 x i32> @llvm.aarch64.sve.ld1.nxv8i32(<vscale x 8 x i1> %2, ptr %addr)
77 %7 = call <vscale x 4 x i32> @llvm.aarch64.sve.ld1.nxv4i32(<vscale x 4 x i1> %3, ptr %addr)
78 %8 = call <vscale x 8 x i32> @llvm.aarch64.sve.ld1.nxv8i32(<vscale x 8 x i1> %4, ptr %addr)
79 ret <vscale x 8 x i32> %8
83 ; Two calls to the SVE ptrue intrinsic: one with the SV_ALL pattern, another with the SV_POW2 pattern. The
84 ; patterns are incompatible, so they should not be coalesced.
85 define <vscale x 8 x i32> @coalesce_test_pattern_mismatch2(ptr %addr) {
86 ; CHECK-LABEL: @coalesce_test_pattern_mismatch2(
87 ; CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 0)
88 ; CHECK-NEXT: [[TMP2:%.*]] = call <vscale x 4 x i32> @llvm.aarch64.sve.ld1.nxv4i32(<vscale x 4 x i1> [[TMP1]], ptr [[ADDR:%.*]])
89 ; CHECK-NEXT: [[TMP3:%.*]] = call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
90 ; CHECK-NEXT: [[TMP4:%.*]] = call <vscale x 8 x i32> @llvm.aarch64.sve.ld1.nxv8i32(<vscale x 8 x i1> [[TMP3]], ptr [[ADDR]])
91 ; CHECK-NEXT: ret <vscale x 8 x i32> [[TMP4]]
93 %1 = call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 0)
94 %2 = call <vscale x 4 x i32> @llvm.aarch64.sve.ld1.nxv4i32(<vscale x 4 x i1> %1, ptr %addr)
95 %3 = call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
96 %4 = call <vscale x 8 x i32> @llvm.aarch64.sve.ld1.nxv8i32(<vscale x 8 x i1> %3, ptr %addr)
97 ret <vscale x 8 x i32> %4
100 ; Two calls to the SVE ptrue intrinsic with the SV_VL1 pattern. This pattern is not currently recognised, so
101 ; nothing should be done here.
102 define <vscale x 8 x i32> @coalesce_test_bad_pattern(ptr %addr) {
103 ; CHECK-LABEL: @coalesce_test_bad_pattern(
104 ; CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 1)
105 ; CHECK-NEXT: [[TMP2:%.*]] = call <vscale x 4 x i32> @llvm.aarch64.sve.ld1.nxv4i32(<vscale x 4 x i1> [[TMP1]], ptr [[ADDR:%.*]])
106 ; CHECK-NEXT: [[TMP3:%.*]] = call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 1)
107 ; CHECK-NEXT: [[TMP4:%.*]] = call <vscale x 8 x i32> @llvm.aarch64.sve.ld1.nxv8i32(<vscale x 8 x i1> [[TMP3]], ptr [[ADDR]])
108 ; CHECK-NEXT: ret <vscale x 8 x i32> [[TMP4]]
110 %1 = call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 1)
111 %2 = call <vscale x 4 x i32> @llvm.aarch64.sve.ld1.nxv4i32(<vscale x 4 x i1> %1, ptr %addr)
112 %3 = call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 1)
113 %4 = call <vscale x 8 x i32> @llvm.aarch64.sve.ld1.nxv8i32(<vscale x 8 x i1> %3, ptr %addr)
114 ret <vscale x 8 x i32> %4
117 ; Four calls to the SVE ptrue intrinsic. %7 is the most encompassing, and the others can be expressed as an
118 ; SVE reinterprets of %7 via convert.{to,from}.svbool.
119 define <vscale x 16 x i32> @coalesce_test_multiple(ptr %addr) {
120 ; CHECK-LABEL: @coalesce_test_multiple(
121 ; CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 31)
122 ; CHECK-NEXT: [[TMP2:%.*]] = call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv16i1(<vscale x 16 x i1> [[TMP1]])
123 ; CHECK-NEXT: [[TMP3:%.*]] = call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[TMP2]])
124 ; CHECK-NEXT: [[TMP4:%.*]] = call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[TMP2]])
125 ; CHECK-NEXT: [[TMP5:%.*]] = call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[TMP2]])
126 ; CHECK-NEXT: [[TMP6:%.*]] = call <vscale x 2 x i32> @llvm.aarch64.sve.ld1.nxv2i32(<vscale x 2 x i1> [[TMP5]], ptr [[ADDR:%.*]])
127 ; CHECK-NEXT: [[TMP7:%.*]] = call <vscale x 4 x i32> @llvm.aarch64.sve.ld1.nxv4i32(<vscale x 4 x i1> [[TMP4]], ptr [[ADDR]])
128 ; CHECK-NEXT: [[TMP8:%.*]] = call <vscale x 8 x i32> @llvm.aarch64.sve.ld1.nxv8i32(<vscale x 8 x i1> [[TMP3]], ptr [[ADDR]])
129 ; CHECK-NEXT: [[TMP9:%.*]] = call <vscale x 16 x i32> @llvm.aarch64.sve.ld1.nxv16i32(<vscale x 16 x i1> [[TMP1]], ptr [[ADDR]])
130 ; CHECK-NEXT: ret <vscale x 16 x i32> [[TMP9]]
132 %1 = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
133 %2 = call <vscale x 2 x i32> @llvm.aarch64.sve.ld1.nxv2i32(<vscale x 2 x i1> %1, ptr %addr)
134 %3 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
135 %4 = call <vscale x 4 x i32> @llvm.aarch64.sve.ld1.nxv4i32(<vscale x 4 x i1> %3, ptr %addr)
136 %5 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
137 %6 = call <vscale x 8 x i32> @llvm.aarch64.sve.ld1.nxv8i32(<vscale x 8 x i1> %5, ptr %addr)
138 %7 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 31)
139 %8 = call <vscale x 16 x i32> @llvm.aarch64.sve.ld1.nxv16i32(<vscale x 16 x i1> %7, ptr %addr)
140 ret <vscale x 16 x i32> %8
143 ; Two calls to the SVE ptrue intrinsic which are both of the same size. In this case, one should be identified
144 ; as redundant and rewritten and an SVE reinterpret of the other via the convert.{to,from}.svbool intrinsics.
145 ; This introduces a redundant conversion which will then be eliminated.
146 define <vscale x 4 x i32> @coalesce_test_same_size(ptr %addr) {
147 ; CHECK-LABEL: @coalesce_test_same_size(
148 ; CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
149 ; CHECK-NEXT: [[TMP2:%.*]] = call <vscale x 4 x i32> @llvm.aarch64.sve.ld1.nxv4i32(<vscale x 4 x i1> [[TMP1]], ptr [[ADDR:%.*]])
150 ; CHECK-NEXT: [[TMP3:%.*]] = call <vscale x 4 x i32> @llvm.aarch64.sve.ld1.nxv4i32(<vscale x 4 x i1> [[TMP1]], ptr [[ADDR]])
151 ; CHECK-NEXT: ret <vscale x 4 x i32> [[TMP3]]
153 %1 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
154 %2 = call <vscale x 4 x i32> @llvm.aarch64.sve.ld1.nxv4i32(<vscale x 4 x i1> %1, ptr %addr)
155 %3 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
156 %4 = call <vscale x 4 x i32> @llvm.aarch64.sve.ld1.nxv4i32(<vscale x 4 x i1> %3, ptr %addr)
157 ret <vscale x 4 x i32> %4
160 ; Two calls to the SVE ptrue intrinsic, but neither can be eliminated; %1 is promoted to become %3, which
161 ; means eliminating this call to the SVE ptrue intrinsic would involve creating a longer, irreducible chain of
162 ; conversions. Better codegen is achieved by just leaving the ptrue as-is.
163 define <vscale x 8 x i16> @coalesce_test_promoted_ptrue(ptr %addr1, ptr %addr2) {
164 ; CHECK-LABEL: @coalesce_test_promoted_ptrue(
165 ; CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
166 ; CHECK-NEXT: [[TMP2:%.*]] = call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
167 ; CHECK-NEXT: [[TMP3:%.*]] = call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1> [[TMP2]])
168 ; CHECK-NEXT: [[TMP4:%.*]] = call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[TMP3]])
169 ; CHECK-NEXT: [[TMP5:%.*]] = call <vscale x 4 x i32> @llvm.aarch64.sve.ld1.nxv4i32(<vscale x 4 x i1> [[TMP2]], ptr [[ADDR1:%.*]])
170 ; CHECK-NEXT: [[TMP6:%.*]] = call <vscale x 8 x i16> @llvm.aarch64.sve.ld1.nxv8i16(<vscale x 8 x i1> [[TMP4]], ptr [[ADDR2:%.*]])
171 ; CHECK-NEXT: [[TMP7:%.*]] = call <vscale x 8 x i16> @llvm.aarch64.sve.ld1.nxv8i16(<vscale x 8 x i1> [[TMP1]], ptr [[ADDR2]])
172 ; CHECK-NEXT: ret <vscale x 8 x i16> [[TMP7]]
174 %1 = call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
175 %2 = call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1> %1)
176 %3 = call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %2)
178 %4 = call <vscale x 4 x i32> @llvm.aarch64.sve.ld1.nxv4i32(<vscale x 4 x i1> %1, ptr %addr1)
179 %5 = call <vscale x 8 x i16> @llvm.aarch64.sve.ld1.nxv8i16(<vscale x 8 x i1> %3, ptr %addr2)
181 %6 = call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
182 %7 = call <vscale x 8 x i16> @llvm.aarch64.sve.ld1.nxv8i16(<vscale x 8 x i1> %6, ptr %addr2)
183 ret <vscale x 8 x i16> %7