1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE2
3 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX
4 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX
5 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX
6 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefix=AVX
; Sum of absolute differences over 8 bytes, accumulated in i32 lanes.
; The absolute value is spelled as "diff > -1 ? diff : 0 - diff", i.e. the
; signed greater-or-equal-to-zero form of the compare.
; Both the SSE2 and the AVX check prefixes expect two 8-byte loads feeding a
; single (v)psadbw, with the low 32 bits of the result moved to eax.
; NOTE(review): the entry block, the branch into for.body, the return and the
; closing brace are not visible in this excerpt.
8 define i32 @sad8_32bit_icmp_sge(ptr nocapture readonly %cur, ptr nocapture readonly %ref, i32 %stride) local_unnamed_addr #0 {
9 ; SSE2-LABEL: sad8_32bit_icmp_sge:
10 ; SSE2: # %bb.0: # %entry
11 ; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
12 ; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
13 ; SSE2-NEXT: psadbw %xmm0, %xmm1
14 ; SSE2-NEXT: movd %xmm1, %eax
17 ; AVX-LABEL: sad8_32bit_icmp_sge:
18 ; AVX: # %bb.0: # %entry
19 ; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
20 ; AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
21 ; AVX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
22 ; AVX-NEXT: vmovd %xmm0, %eax
; No use of %idx.ext is visible in this excerpt.
26 %idx.ext = zext i32 %stride to i64
29 for.body: ; preds = %entry
; Widen both 8-byte inputs to i32 lanes and take the signed difference.
30 %0 = load <8 x i8>, ptr %cur, align 1
31 %1 = zext <8 x i8> %0 to <8 x i32>
32 %2 = load <8 x i8>, ptr %ref, align 1
33 %3 = zext <8 x i8> %2 to <8 x i32>
34 %4 = sub nsw <8 x i32> %1, %3
; Per-lane absolute value: keep the difference when it is > -1, otherwise
; take its negation.
35 %5 = icmp sgt <8 x i32> %4, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
36 %6 = sub nsw <8 x i32> zeroinitializer, %4
37 %7 = select <8 x i1> %5, <8 x i32> %4, <8 x i32> %6
; Horizontal add: fold lanes 4-7 onto 0-3, then 2-3 onto 0-1, then 1 onto 0,
; so lane 0 ends up holding the sum of all eight lanes.
38 %rdx.shuf = shufflevector <8 x i32> %7, <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
39 %bin.rdx = add <8 x i32> %7, %rdx.shuf
40 %rdx.shuf229 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
41 %bin.rdx230 = add <8 x i32> %bin.rdx, %rdx.shuf229
42 %rdx.shuf231 = shufflevector <8 x i32> %bin.rdx230, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
43 %bin.rdx232 = add <8 x i32> %bin.rdx230, %rdx.shuf231
44 %8 = extractelement <8 x i32> %bin.rdx232, i32 0
; Sum of absolute differences over 8 bytes, accumulated in i32 lanes.
; The absolute value is spelled as "diff > 0 ? diff : 0 - diff" (strict signed
; greater-than against zero; a zero difference takes the negated arm, which is
; still zero).
; Both the SSE2 and the AVX check prefixes expect two 8-byte loads feeding a
; single (v)psadbw, with the low 32 bits of the result moved to eax.
; NOTE(review): the entry block, the branch into for.body, the return and the
; closing brace are not visible in this excerpt.
48 define i32 @sad8_32bit_icmp_sgt(ptr nocapture readonly %cur, ptr nocapture readonly %ref, i32 %stride) local_unnamed_addr #1 {
49 ; SSE2-LABEL: sad8_32bit_icmp_sgt:
50 ; SSE2: # %bb.0: # %entry
51 ; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
52 ; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
53 ; SSE2-NEXT: psadbw %xmm0, %xmm1
54 ; SSE2-NEXT: movd %xmm1, %eax
57 ; AVX-LABEL: sad8_32bit_icmp_sgt:
58 ; AVX: # %bb.0: # %entry
59 ; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
60 ; AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
61 ; AVX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
62 ; AVX-NEXT: vmovd %xmm0, %eax
; No use of %idx.ext is visible in this excerpt.
65 %idx.ext = zext i32 %stride to i64
68 for.body: ; preds = %entry
; Widen both 8-byte inputs to i32 lanes and take the signed difference.
69 %0 = load <8 x i8>, ptr %cur, align 1
70 %1 = zext <8 x i8> %0 to <8 x i32>
71 %2 = load <8 x i8>, ptr %ref, align 1
72 %3 = zext <8 x i8> %2 to <8 x i32>
73 %4 = sub nsw <8 x i32> %1, %3
; Per-lane absolute value: keep the difference when it is > 0, otherwise take
; its negation.
74 %5 = icmp sgt <8 x i32> %4, zeroinitializer
75 %6 = sub nsw <8 x i32> zeroinitializer, %4
76 %7 = select <8 x i1> %5, <8 x i32> %4, <8 x i32> %6
; Horizontal add: fold lanes 4-7 onto 0-3, then 2-3 onto 0-1, then 1 onto 0,
; so lane 0 ends up holding the sum of all eight lanes.
77 %rdx.shuf = shufflevector <8 x i32> %7, <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
78 %bin.rdx = add <8 x i32> %7, %rdx.shuf
79 %rdx.shuf229 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
80 %bin.rdx230 = add <8 x i32> %bin.rdx, %rdx.shuf229
81 %rdx.shuf231 = shufflevector <8 x i32> %bin.rdx230, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
82 %bin.rdx232 = add <8 x i32> %bin.rdx230, %rdx.shuf231
83 %8 = extractelement <8 x i32> %bin.rdx232, i32 0
; Sum of absolute differences over 8 bytes, accumulated in i32 lanes.
; The absolute value is spelled as "diff < 1 ? 0 - diff : diff", i.e. the
; signed less-or-equal-to-zero form of the compare with the select arms
; swapped relative to the greater-than variants.
; Both the SSE2 and the AVX check prefixes expect two 8-byte loads feeding a
; single (v)psadbw, with the low 32 bits of the result moved to eax.
; NOTE(review): the entry block, the branch into for.body, the return and the
; closing brace are not visible in this excerpt.
87 define i32 @sad8_32bit_icmp_sle(ptr nocapture readonly %cur, ptr nocapture readonly %ref, i32 %stride) local_unnamed_addr #2 {
88 ; SSE2-LABEL: sad8_32bit_icmp_sle:
89 ; SSE2: # %bb.0: # %entry
90 ; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
91 ; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
92 ; SSE2-NEXT: psadbw %xmm0, %xmm1
93 ; SSE2-NEXT: movd %xmm1, %eax
96 ; AVX-LABEL: sad8_32bit_icmp_sle:
97 ; AVX: # %bb.0: # %entry
98 ; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
99 ; AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
100 ; AVX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
101 ; AVX-NEXT: vmovd %xmm0, %eax
; No use of %idx.ext is visible in this excerpt.
104 %idx.ext = zext i32 %stride to i64
107 for.body: ; preds = %entry
; Widen both 8-byte inputs to i32 lanes and take the signed difference.
108 %0 = load <8 x i8>, ptr %cur, align 1
109 %1 = zext <8 x i8> %0 to <8 x i32>
110 %2 = load <8 x i8>, ptr %ref, align 1
111 %3 = zext <8 x i8> %2 to <8 x i32>
112 %4 = sub nsw <8 x i32> %1, %3
; Per-lane absolute value: take the negation when the difference is < 1,
; otherwise keep the difference.
113 %5 = icmp slt <8 x i32> %4, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
114 %6 = sub nsw <8 x i32> zeroinitializer, %4
115 %7 = select <8 x i1> %5, <8 x i32> %6, <8 x i32> %4
; Horizontal add: fold lanes 4-7 onto 0-3, then 2-3 onto 0-1, then 1 onto 0,
; so lane 0 ends up holding the sum of all eight lanes.
116 %rdx.shuf = shufflevector <8 x i32> %7, <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
117 %bin.rdx = add <8 x i32> %7, %rdx.shuf
118 %rdx.shuf229 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
119 %bin.rdx230 = add <8 x i32> %bin.rdx, %rdx.shuf229
120 %rdx.shuf231 = shufflevector <8 x i32> %bin.rdx230, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
121 %bin.rdx232 = add <8 x i32> %bin.rdx230, %rdx.shuf231
122 %8 = extractelement <8 x i32> %bin.rdx232, i32 0
; Sum of absolute differences over 8 bytes, accumulated in i32 lanes.
; The absolute value is spelled as "diff < 0 ? 0 - diff : diff" (strict signed
; less-than against zero).
; Both the SSE2 and the AVX check prefixes expect two 8-byte loads feeding a
; single (v)psadbw, with the low 32 bits of the result moved to eax.
; NOTE(review): the entry block, the branch into for.body, the return and the
; closing brace are not visible in this excerpt.
126 define i32 @sad8_32bit_icmp_slt(ptr nocapture readonly %cur, ptr nocapture readonly %ref, i32 %stride) local_unnamed_addr #3 {
127 ; SSE2-LABEL: sad8_32bit_icmp_slt:
128 ; SSE2: # %bb.0: # %entry
129 ; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
130 ; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
131 ; SSE2-NEXT: psadbw %xmm0, %xmm1
132 ; SSE2-NEXT: movd %xmm1, %eax
135 ; AVX-LABEL: sad8_32bit_icmp_slt:
136 ; AVX: # %bb.0: # %entry
137 ; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
138 ; AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
139 ; AVX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
140 ; AVX-NEXT: vmovd %xmm0, %eax
; No use of %idx.ext is visible in this excerpt.
143 %idx.ext = zext i32 %stride to i64
146 for.body: ; preds = %entry
; Widen both 8-byte inputs to i32 lanes and take the signed difference.
147 %0 = load <8 x i8>, ptr %cur, align 1
148 %1 = zext <8 x i8> %0 to <8 x i32>
149 %2 = load <8 x i8>, ptr %ref, align 1
150 %3 = zext <8 x i8> %2 to <8 x i32>
151 %4 = sub nsw <8 x i32> %1, %3
; Per-lane absolute value: take the negation when the difference is < 0,
; otherwise keep the difference.
152 %5 = icmp slt <8 x i32> %4, zeroinitializer
153 %6 = sub nsw <8 x i32> zeroinitializer, %4
154 %7 = select <8 x i1> %5, <8 x i32> %6, <8 x i32> %4
; Horizontal add: fold lanes 4-7 onto 0-3, then 2-3 onto 0-1, then 1 onto 0,
; so lane 0 ends up holding the sum of all eight lanes.
155 %rdx.shuf = shufflevector <8 x i32> %7, <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
156 %bin.rdx = add <8 x i32> %7, %rdx.shuf
157 %rdx.shuf229 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
158 %bin.rdx230 = add <8 x i32> %bin.rdx, %rdx.shuf229
159 %rdx.shuf231 = shufflevector <8 x i32> %bin.rdx230, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
160 %bin.rdx232 = add <8 x i32> %bin.rdx230, %rdx.shuf231
161 %8 = extractelement <8 x i32> %bin.rdx232, i32 0
; Sum of absolute differences over 8 bytes with a 64-bit result.
; The absolute difference is computed in i32 lanes ("diff < 0 ? 0 - diff :
; diff"), then sign-extended to i64 before the horizontal add.
; Both the SSE2 and the AVX check prefixes expect two 8-byte loads feeding a
; single (v)psadbw, with the low 64 bits of the result moved to rax.
; NOTE(review): the entry block, the branch into for.body, the return and the
; closing brace are not visible in this excerpt.
165 define i64 @sad8_64bit_icmp_sext_slt(ptr nocapture readonly %cur, ptr nocapture readonly %ref, i64 %stride) local_unnamed_addr #4 {
166 ; SSE2-LABEL: sad8_64bit_icmp_sext_slt:
167 ; SSE2: # %bb.0: # %entry
168 ; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
169 ; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
170 ; SSE2-NEXT: psadbw %xmm0, %xmm1
171 ; SSE2-NEXT: movq %xmm1, %rax
174 ; AVX-LABEL: sad8_64bit_icmp_sext_slt:
175 ; AVX: # %bb.0: # %entry
176 ; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
177 ; AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
178 ; AVX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
179 ; AVX-NEXT: vmovq %xmm0, %rax
184 for.body: ; preds = %entry
; Widen both 8-byte inputs to i32 lanes and take the signed difference.
185 %0 = load <8 x i8>, ptr %cur, align 1
186 %1 = zext <8 x i8> %0 to <8 x i32>
187 %2 = load <8 x i8>, ptr %ref, align 1
188 %3 = zext <8 x i8> %2 to <8 x i32>
189 %4 = sub nsw <8 x i32> %1, %3
; Per-lane absolute value: take the negation when the difference is < 0,
; otherwise keep the difference.
190 %5 = icmp slt <8 x i32> %4, zeroinitializer
191 %6 = sub nsw <8 x i32> zeroinitializer, %4
192 %7 = select <8 x i1> %5, <8 x i32> %6, <8 x i32> %4
; Widen the i32 absolute differences to i64 with a sign extension.
193 %8 = sext <8 x i32> %7 to <8 x i64>
; Horizontal add in i64: fold lanes 4-7 onto 0-3, then 2-3 onto 0-1, then 1
; onto 0, so lane 0 ends up holding the sum of all eight lanes.
194 %rdx.shuf = shufflevector <8 x i64> %8, <8 x i64> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
195 %bin.rdx = add <8 x i64> %rdx.shuf, %8
196 %rdx.shuf236 = shufflevector <8 x i64> %bin.rdx, <8 x i64> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
197 %bin.rdx237 = add <8 x i64> %bin.rdx, %rdx.shuf236
198 %rdx.shuf238 = shufflevector <8 x i64> %bin.rdx237, <8 x i64> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
199 %bin.rdx239 = add <8 x i64> %bin.rdx237, %rdx.shuf238
200 %9 = extractelement <8 x i64> %bin.rdx239, i32 0
; Sum of absolute differences over 8 bytes with a 64-bit result.
; The absolute difference is computed in i32 lanes ("diff < 0 ? 0 - diff :
; diff"), then zero-extended to i64 before the horizontal add.
; Both the SSE2 and the AVX check prefixes expect two 8-byte loads feeding a
; single (v)psadbw, with the low 64 bits of the result moved to rax.
; NOTE(review): the entry block, the branch into for.body, the return and the
; closing brace are not visible in this excerpt.
204 define i64 @sad8_64bit_icmp_zext_slt(ptr nocapture readonly %cur, ptr nocapture readonly %ref, i64 %stride) local_unnamed_addr #4 {
205 ; SSE2-LABEL: sad8_64bit_icmp_zext_slt:
206 ; SSE2: # %bb.0: # %entry
207 ; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
208 ; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
209 ; SSE2-NEXT: psadbw %xmm0, %xmm1
210 ; SSE2-NEXT: movq %xmm1, %rax
213 ; AVX-LABEL: sad8_64bit_icmp_zext_slt:
214 ; AVX: # %bb.0: # %entry
215 ; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
216 ; AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
217 ; AVX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
218 ; AVX-NEXT: vmovq %xmm0, %rax
223 for.body: ; preds = %entry
; Widen both 8-byte inputs to i32 lanes and take the signed difference.
224 %0 = load <8 x i8>, ptr %cur, align 1
225 %1 = zext <8 x i8> %0 to <8 x i32>
226 %2 = load <8 x i8>, ptr %ref, align 1
227 %3 = zext <8 x i8> %2 to <8 x i32>
228 %4 = sub nsw <8 x i32> %1, %3
; Per-lane absolute value: take the negation when the difference is < 0,
; otherwise keep the difference.
229 %5 = icmp slt <8 x i32> %4, zeroinitializer
230 %6 = sub nsw <8 x i32> zeroinitializer, %4
231 %7 = select <8 x i1> %5, <8 x i32> %6, <8 x i32> %4
; Widen the i32 absolute differences to i64 with a zero extension.
232 %8 = zext <8 x i32> %7 to <8 x i64>
; Horizontal add in i64: fold lanes 4-7 onto 0-3, then 2-3 onto 0-1, then 1
; onto 0, so lane 0 ends up holding the sum of all eight lanes.
233 %rdx.shuf = shufflevector <8 x i64> %8, <8 x i64> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
234 %bin.rdx = add <8 x i64> %rdx.shuf, %8
235 %rdx.shuf236 = shufflevector <8 x i64> %bin.rdx, <8 x i64> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
236 %bin.rdx237 = add <8 x i64> %bin.rdx, %rdx.shuf236
237 %rdx.shuf238 = shufflevector <8 x i64> %bin.rdx237, <8 x i64> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
238 %bin.rdx239 = add <8 x i64> %bin.rdx237, %rdx.shuf238
239 %9 = extractelement <8 x i64> %bin.rdx239, i32 0
; Sum of absolute differences over 8 bytes with a 64-bit result, where the
; widening happens early: the i8 inputs are zero-extended straight to i64 and
; the subtract, compare, select and horizontal add are all done in i64 lanes.
; Both the SSE2 and the AVX check prefixes expect two 8-byte loads feeding a
; single (v)psadbw, with the low 64 bits of the result moved to rax.
; NOTE(review): the entry block, the branch into for.body, the return and the
; closing brace are not visible in this excerpt.
243 define i64 @sad8_early_64bit_icmp_zext_slt(ptr nocapture readonly %cur, ptr nocapture readonly %ref, i64 %stride) local_unnamed_addr #4 {
244 ; SSE2-LABEL: sad8_early_64bit_icmp_zext_slt:
245 ; SSE2: # %bb.0: # %entry
246 ; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
247 ; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
248 ; SSE2-NEXT: psadbw %xmm0, %xmm1
249 ; SSE2-NEXT: movq %xmm1, %rax
252 ; AVX-LABEL: sad8_early_64bit_icmp_zext_slt:
253 ; AVX: # %bb.0: # %entry
254 ; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
255 ; AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
256 ; AVX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
257 ; AVX-NEXT: vmovq %xmm0, %rax
262 for.body: ; preds = %entry
; Widen both 8-byte inputs directly to i64 lanes and take the signed
; difference.
263 %0 = load <8 x i8>, ptr %cur, align 1
264 %1 = zext <8 x i8> %0 to <8 x i64>
265 %2 = load <8 x i8>, ptr %ref, align 1
266 %3 = zext <8 x i8> %2 to <8 x i64>
267 %4 = sub nsw <8 x i64> %1, %3
; Per-lane absolute value: take the negation when the difference is < 0,
; otherwise keep the difference.
268 %5 = icmp slt <8 x i64> %4, zeroinitializer
269 %6 = sub nsw <8 x i64> zeroinitializer, %4
270 %7 = select <8 x i1> %5, <8 x i64> %6, <8 x i64> %4
; Horizontal add in i64: fold lanes 4-7 onto 0-3, then 2-3 onto 0-1, then 1
; onto 0, so lane 0 ends up holding the sum of all eight lanes.
271 %rdx.shuf = shufflevector <8 x i64> %7, <8 x i64> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
272 %bin.rdx = add <8 x i64> %rdx.shuf, %7
273 %rdx.shuf236 = shufflevector <8 x i64> %bin.rdx, <8 x i64> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
274 %bin.rdx237 = add <8 x i64> %bin.rdx, %rdx.shuf236
275 %rdx.shuf238 = shufflevector <8 x i64> %bin.rdx237, <8 x i64> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
276 %bin.rdx239 = add <8 x i64> %bin.rdx237, %rdx.shuf238
277 %8 = extractelement <8 x i64> %bin.rdx239, i32 0