1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE2
3 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
4 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2
5 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX,AVX512,AVX512F
6 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefixes=AVX,AVX512,AVX512BW
; Input buffers for the SAD loops in this file: two 1024-byte global arrays,
; zero-initialized and declared with 16-byte alignment.
8 @a = dso_local global [1024 x i8] zeroinitializer, align 16
9 @b = dso_local global [1024 x i8] zeroinitializer, align 16
; sad_16i8 - sum of absolute differences between @a and @b, 16 bytes per loop
; iteration. Each iteration loads <16 x i8> from both arrays, zero-extends the
; bytes to <16 x i32>, forms |a - b| with an icmp/sub/select sequence and adds
; it to a vector accumulator. Once the index reaches 1024 the accumulator is
; reduced with shuffles and adds, and lane 0 is extracted.
; The assertions below expect the loop body to be lowered to psadbw/vpsadbw.
11 define dso_local i32 @sad_16i8() nounwind {
12 ; SSE2-LABEL: sad_16i8:
13 ; SSE2: # %bb.0: # %entry
14 ; SSE2-NEXT: pxor %xmm0, %xmm0
15 ; SSE2-NEXT: movq $-1024, %rax # imm = 0xFC00
16 ; SSE2-NEXT: pxor %xmm1, %xmm1
17 ; SSE2-NEXT: .p2align 4, 0x90
18 ; SSE2-NEXT: .LBB0_1: # %vector.body
19 ; SSE2-NEXT: # =>This Inner Loop Header: Depth=1
20 ; SSE2-NEXT: movdqu a+1024(%rax), %xmm2
21 ; SSE2-NEXT: movdqu b+1024(%rax), %xmm3
22 ; SSE2-NEXT: psadbw %xmm2, %xmm3
23 ; SSE2-NEXT: paddd %xmm3, %xmm1
24 ; SSE2-NEXT: addq $16, %rax
25 ; SSE2-NEXT: jne .LBB0_1
26 ; SSE2-NEXT: # %bb.2: # %middle.block
27 ; SSE2-NEXT: paddd %xmm0, %xmm1
28 ; SSE2-NEXT: paddd %xmm0, %xmm0
29 ; SSE2-NEXT: paddd %xmm1, %xmm0
30 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
31 ; SSE2-NEXT: paddd %xmm0, %xmm1
32 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
33 ; SSE2-NEXT: paddd %xmm1, %xmm0
34 ; SSE2-NEXT: movd %xmm0, %eax
37 ; AVX1-LABEL: sad_16i8:
38 ; AVX1: # %bb.0: # %entry
39 ; AVX1-NEXT: vpxor %xmm0, %xmm0, %xmm0
40 ; AVX1-NEXT: movq $-1024, %rax # imm = 0xFC00
41 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
42 ; AVX1-NEXT: .p2align 4, 0x90
43 ; AVX1-NEXT: .LBB0_1: # %vector.body
44 ; AVX1-NEXT: # =>This Inner Loop Header: Depth=1
45 ; AVX1-NEXT: vmovdqu a+1024(%rax), %xmm2
46 ; AVX1-NEXT: vpsadbw b+1024(%rax), %xmm2, %xmm2
47 ; AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm2
48 ; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
49 ; AVX1-NEXT: addq $16, %rax
50 ; AVX1-NEXT: jne .LBB0_1
51 ; AVX1-NEXT: # %bb.2: # %middle.block
52 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
53 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
54 ; AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm2
55 ; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm0
56 ; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0
57 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
58 ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
59 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
60 ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
61 ; AVX1-NEXT: vmovd %xmm0, %eax
62 ; AVX1-NEXT: vzeroupper
65 ; AVX2-LABEL: sad_16i8:
66 ; AVX2: # %bb.0: # %entry
67 ; AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0
68 ; AVX2-NEXT: movq $-1024, %rax # imm = 0xFC00
69 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
70 ; AVX2-NEXT: .p2align 4, 0x90
71 ; AVX2-NEXT: .LBB0_1: # %vector.body
72 ; AVX2-NEXT: # =>This Inner Loop Header: Depth=1
73 ; AVX2-NEXT: vmovdqu a+1024(%rax), %xmm2
74 ; AVX2-NEXT: vpsadbw b+1024(%rax), %xmm2, %xmm2
75 ; AVX2-NEXT: vpaddd %ymm1, %ymm2, %ymm1
76 ; AVX2-NEXT: addq $16, %rax
77 ; AVX2-NEXT: jne .LBB0_1
78 ; AVX2-NEXT: # %bb.2: # %middle.block
79 ; AVX2-NEXT: vpaddd %ymm0, %ymm1, %ymm0
80 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
81 ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
82 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
83 ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
84 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
85 ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
86 ; AVX2-NEXT: vmovd %xmm0, %eax
87 ; AVX2-NEXT: vzeroupper
90 ; AVX512-LABEL: sad_16i8:
91 ; AVX512: # %bb.0: # %entry
92 ; AVX512-NEXT: vpxor %xmm0, %xmm0, %xmm0
93 ; AVX512-NEXT: movq $-1024, %rax # imm = 0xFC00
94 ; AVX512-NEXT: .p2align 4, 0x90
95 ; AVX512-NEXT: .LBB0_1: # %vector.body
96 ; AVX512-NEXT: # =>This Inner Loop Header: Depth=1
97 ; AVX512-NEXT: vmovdqu a+1024(%rax), %xmm1
98 ; AVX512-NEXT: vpsadbw b+1024(%rax), %xmm1, %xmm1
99 ; AVX512-NEXT: vpaddd %zmm0, %zmm1, %zmm0
100 ; AVX512-NEXT: addq $16, %rax
101 ; AVX512-NEXT: jne .LBB0_1
102 ; AVX512-NEXT: # %bb.2: # %middle.block
103 ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
104 ; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0
105 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
106 ; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
107 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
108 ; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
109 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
110 ; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
111 ; AVX512-NEXT: vmovd %xmm0, %eax
112 ; AVX512-NEXT: vzeroupper
115 br label %vector.body
; Loop-carried values: the byte offset into both arrays and the <16 x i32>
; running sum (both start at zero).
118 %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
119 %vec.phi = phi <16 x i32> [ zeroinitializer, %entry ], [ %10, %vector.body ]
; Load 16 bytes from each array at %index and widen every byte to an i32 lane.
; The loads are only marked align 4, which is consistent with the unaligned
; movdqu/vmovdqu forms expected in the assertions above.
120 %0 = getelementptr inbounds [1024 x i8], [1024 x i8]* @a, i64 0, i64 %index
121 %1 = bitcast i8* %0 to <16 x i8>*
122 %wide.load = load <16 x i8>, <16 x i8>* %1, align 4
123 %2 = zext <16 x i8> %wide.load to <16 x i32>
124 %3 = getelementptr inbounds [1024 x i8], [1024 x i8]* @b, i64 0, i64 %index
125 %4 = bitcast i8* %3 to <16 x i8>*
126 %wide.load1 = load <16 x i8>, <16 x i8>* %4, align 4
127 %5 = zext <16 x i8> %wide.load1 to <16 x i32>
; Per-lane absolute difference: keep %6 where it is > -1, otherwise use 0 - %6.
128 %6 = sub nsw <16 x i32> %2, %5
129 %7 = icmp sgt <16 x i32> %6, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
130 %8 = sub nsw <16 x i32> zeroinitializer, %6
131 %9 = select <16 x i1> %7, <16 x i32> %6, <16 x i32> %8
; Accumulate, advance by 16 bytes, and leave the loop once the index is 1024.
132 %10 = add nsw <16 x i32> %9, %vec.phi
133 %index.next = add i64 %index, 16
134 %11 = icmp eq i64 %index.next, 1024
135 br i1 %11, label %middle.block, label %vector.body
; Horizontal reduction: each step adds the upper half of the still-live lanes
; onto the lower half (8, 4, 2, then 1 lane); lane 0 then holds the total.
138 %rdx.shuf = shufflevector <16 x i32> %10, <16 x i32> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
139 %bin.rdx = add <16 x i32> %10, %rdx.shuf
140 %rdx.shuf2 = shufflevector <16 x i32> %bin.rdx, <16 x i32> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
141 %bin.rdx2 = add <16 x i32> %bin.rdx, %rdx.shuf2
142 %rdx.shuf3 = shufflevector <16 x i32> %bin.rdx2, <16 x i32> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
143 %bin.rdx3 = add <16 x i32> %bin.rdx2, %rdx.shuf3
144 %rdx.shuf4 = shufflevector <16 x i32> %bin.rdx3, <16 x i32> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
145 %bin.rdx4 = add <16 x i32> %bin.rdx3, %rdx.shuf4
146 %12 = extractelement <16 x i32> %bin.rdx4, i32 0
150 define dso_local i32 @sad_32i8() nounwind {
151 ; SSE2-LABEL: sad_32i8:
152 ; SSE2: # %bb.0: # %entry
153 ; SSE2-NEXT: pxor %xmm0, %xmm0
154 ; SSE2-NEXT: movq $-1024, %rax # imm = 0xFC00
155 ; SSE2-NEXT: pxor %xmm2, %xmm2
156 ; SSE2-NEXT: pxor %xmm1, %xmm1
157 ; SSE2-NEXT: .p2align 4, 0x90
158 ; SSE2-NEXT: .LBB1_1: # %vector.body
159 ; SSE2-NEXT: # =>This Inner Loop Header: Depth=1
160 ; SSE2-NEXT: movdqa a+1024(%rax), %xmm3
161 ; SSE2-NEXT: psadbw b+1024(%rax), %xmm3
162 ; SSE2-NEXT: paddd %xmm3, %xmm2
163 ; SSE2-NEXT: movdqa a+1040(%rax), %xmm3
164 ; SSE2-NEXT: psadbw b+1040(%rax), %xmm3
165 ; SSE2-NEXT: paddd %xmm3, %xmm1
166 ; SSE2-NEXT: addq $32, %rax
167 ; SSE2-NEXT: jne .LBB1_1
168 ; SSE2-NEXT: # %bb.2: # %middle.block
169 ; SSE2-NEXT: paddd %xmm0, %xmm1
170 ; SSE2-NEXT: paddd %xmm0, %xmm2
171 ; SSE2-NEXT: paddd %xmm0, %xmm0
172 ; SSE2-NEXT: paddd %xmm0, %xmm1
173 ; SSE2-NEXT: paddd %xmm0, %xmm1
174 ; SSE2-NEXT: paddd %xmm2, %xmm1
175 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
176 ; SSE2-NEXT: paddd %xmm1, %xmm0
177 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
178 ; SSE2-NEXT: paddd %xmm0, %xmm1
179 ; SSE2-NEXT: movd %xmm1, %eax
182 ; AVX1-LABEL: sad_32i8:
183 ; AVX1: # %bb.0: # %entry
184 ; AVX1-NEXT: vpxor %xmm0, %xmm0, %xmm0
185 ; AVX1-NEXT: movq $-1024, %rax # imm = 0xFC00
186 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
187 ; AVX1-NEXT: .p2align 4, 0x90
188 ; AVX1-NEXT: .LBB1_1: # %vector.body
189 ; AVX1-NEXT: # =>This Inner Loop Header: Depth=1
190 ; AVX1-NEXT: vmovdqa a+1024(%rax), %xmm2
191 ; AVX1-NEXT: vpsadbw b+1024(%rax), %xmm2, %xmm2
192 ; AVX1-NEXT: vmovdqa a+1040(%rax), %xmm3
193 ; AVX1-NEXT: vpsadbw b+1040(%rax), %xmm3, %xmm3
194 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
195 ; AVX1-NEXT: vpaddd %xmm4, %xmm3, %xmm3
196 ; AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1
197 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
198 ; AVX1-NEXT: addq $32, %rax
199 ; AVX1-NEXT: jne .LBB1_1
200 ; AVX1-NEXT: # %bb.2: # %middle.block
201 ; AVX1-NEXT: vpaddd %xmm0, %xmm0, %xmm2
202 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
203 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
204 ; AVX1-NEXT: vpaddd %xmm4, %xmm4, %xmm5
205 ; AVX1-NEXT: vpaddd %xmm5, %xmm4, %xmm4
206 ; AVX1-NEXT: vpaddd %xmm4, %xmm3, %xmm3
207 ; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm0
208 ; AVX1-NEXT: vpaddd %xmm3, %xmm0, %xmm0
209 ; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0
210 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
211 ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
212 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
213 ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
214 ; AVX1-NEXT: vmovd %xmm0, %eax
215 ; AVX1-NEXT: vzeroupper
218 ; AVX2-LABEL: sad_32i8:
219 ; AVX2: # %bb.0: # %entry
220 ; AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0
221 ; AVX2-NEXT: movq $-1024, %rax # imm = 0xFC00
222 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
223 ; AVX2-NEXT: .p2align 4, 0x90
224 ; AVX2-NEXT: .LBB1_1: # %vector.body
225 ; AVX2-NEXT: # =>This Inner Loop Header: Depth=1
226 ; AVX2-NEXT: vmovdqa a+1024(%rax), %ymm2
227 ; AVX2-NEXT: vpsadbw b+1024(%rax), %ymm2, %ymm2
228 ; AVX2-NEXT: vpaddd %ymm1, %ymm2, %ymm1
229 ; AVX2-NEXT: addq $32, %rax
230 ; AVX2-NEXT: jne .LBB1_1
231 ; AVX2-NEXT: # %bb.2: # %middle.block
232 ; AVX2-NEXT: vpaddd %ymm0, %ymm1, %ymm1
233 ; AVX2-NEXT: vpaddd %ymm0, %ymm0, %ymm0
234 ; AVX2-NEXT: vpaddd %ymm0, %ymm1, %ymm0
235 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
236 ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
237 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
238 ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
239 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
240 ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
241 ; AVX2-NEXT: vmovd %xmm0, %eax
242 ; AVX2-NEXT: vzeroupper
245 ; AVX512-LABEL: sad_32i8:
246 ; AVX512: # %bb.0: # %entry
247 ; AVX512-NEXT: vpxor %xmm0, %xmm0, %xmm0
248 ; AVX512-NEXT: movq $-1024, %rax # imm = 0xFC00
249 ; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
250 ; AVX512-NEXT: .p2align 4, 0x90
251 ; AVX512-NEXT: .LBB1_1: # %vector.body
252 ; AVX512-NEXT: # =>This Inner Loop Header: Depth=1
253 ; AVX512-NEXT: vmovdqa a+1024(%rax), %ymm2
254 ; AVX512-NEXT: vpsadbw b+1024(%rax), %ymm2, %ymm2
255 ; AVX512-NEXT: vpaddd %zmm1, %zmm2, %zmm1
256 ; AVX512-NEXT: addq $32, %rax
257 ; AVX512-NEXT: jne .LBB1_1
258 ; AVX512-NEXT: # %bb.2: # %middle.block
259 ; AVX512-NEXT: vpaddd %zmm0, %zmm1, %zmm0
260 ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
261 ; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0
262 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
263 ; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
264 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
265 ; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
266 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
267 ; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
268 ; AVX512-NEXT: vmovd %xmm0, %eax
269 ; AVX512-NEXT: vzeroupper
272 br label %vector.body
275 %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
276 %vec.phi = phi <32 x i32> [ zeroinitializer, %entry ], [ %10, %vector.body ]
277 %0 = getelementptr inbounds [1024 x i8], [1024 x i8]* @a, i64 0, i64 %index
278 %1 = bitcast i8* %0 to <32 x i8>*
279 %wide.load = load <32 x i8>, <32 x i8>* %1, align 32
280 %2 = zext <32 x i8> %wide.load to <32 x i32>
281 %3 = getelementptr inbounds [1024 x i8], [1024 x i8]* @b, i64 0, i64 %index
282 %4 = bitcast i8* %3 to <32 x i8>*
283 %wide.load1 = load <32 x i8>, <32 x i8>* %4, align 32
284 %5 = zext <32 x i8> %wide.load1 to <32 x i32>
285 %6 = sub nsw <32 x i32> %2, %5
286 %7 = icmp sgt <32 x i32> %6, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
287 %8 = sub nsw <32 x i32> zeroinitializer, %6
288 %9 = select <32 x i1> %7, <32 x i32> %6, <32 x i32> %8
289 %10 = add nsw <32 x i32> %9, %vec.phi
290 %index.next = add i64 %index, 32
291 %11 = icmp eq i64 %index.next, 1024
292 br i1 %11, label %middle.block, label %vector.body
295 %rdx.shuf = shufflevector <32 x i32> %10, <32 x i32> undef, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
296 %bin.rdx = add <32 x i32> %10, %rdx.shuf
297 %rdx.shuf2 = shufflevector <32 x i32> %bin.rdx, <32 x i32> undef, <32 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
298 %bin.rdx2 = add <32 x i32> %bin.rdx, %rdx.shuf2
299 %rdx.shuf3 = shufflevector <32 x i32> %bin.rdx2, <32 x i32> undef, <32 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
300 %bin.rdx3 = add <32 x i32> %bin.rdx2, %rdx.shuf3
301 %rdx.shuf4 = shufflevector <32 x i32> %bin.rdx3, <32 x i32> undef, <32 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
302 %bin.rdx4 = add <32 x i32> %bin.rdx3, %rdx.shuf4
303 %rdx.shuf5 = shufflevector <32 x i32> %bin.rdx4, <32 x i32> undef, <32 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
304 %bin.rdx5 = add <32 x i32> %bin.rdx4, %rdx.shuf5
305 %12 = extractelement <32 x i32> %bin.rdx5, i32 0
309 define dso_local i32 @sad_avx64i8() nounwind {
310 ; SSE2-LABEL: sad_avx64i8:
311 ; SSE2: # %bb.0: # %entry
312 ; SSE2-NEXT: pxor %xmm4, %xmm4
313 ; SSE2-NEXT: movq $-1024, %rax # imm = 0xFC00
314 ; SSE2-NEXT: pxor %xmm0, %xmm0
315 ; SSE2-NEXT: pxor %xmm3, %xmm3
316 ; SSE2-NEXT: pxor %xmm2, %xmm2
317 ; SSE2-NEXT: pxor %xmm1, %xmm1
318 ; SSE2-NEXT: .p2align 4, 0x90
319 ; SSE2-NEXT: .LBB2_1: # %vector.body
320 ; SSE2-NEXT: # =>This Inner Loop Header: Depth=1
321 ; SSE2-NEXT: movdqa a+1024(%rax), %xmm5
322 ; SSE2-NEXT: psadbw b+1024(%rax), %xmm5
323 ; SSE2-NEXT: paddd %xmm5, %xmm0
324 ; SSE2-NEXT: movdqa a+1040(%rax), %xmm5
325 ; SSE2-NEXT: psadbw b+1040(%rax), %xmm5
326 ; SSE2-NEXT: paddd %xmm5, %xmm3
327 ; SSE2-NEXT: movdqa a+1056(%rax), %xmm5
328 ; SSE2-NEXT: psadbw b+1056(%rax), %xmm5
329 ; SSE2-NEXT: paddd %xmm5, %xmm2
330 ; SSE2-NEXT: movdqa a+1072(%rax), %xmm5
331 ; SSE2-NEXT: psadbw b+1072(%rax), %xmm5
332 ; SSE2-NEXT: paddd %xmm5, %xmm1
333 ; SSE2-NEXT: addq $64, %rax
334 ; SSE2-NEXT: jne .LBB2_1
335 ; SSE2-NEXT: # %bb.2: # %middle.block
336 ; SSE2-NEXT: paddd %xmm4, %xmm2
337 ; SSE2-NEXT: pxor %xmm5, %xmm5
338 ; SSE2-NEXT: paddd %xmm5, %xmm5
339 ; SSE2-NEXT: paddd %xmm4, %xmm0
340 ; SSE2-NEXT: paddd %xmm4, %xmm1
341 ; SSE2-NEXT: paddd %xmm4, %xmm3
342 ; SSE2-NEXT: paddd %xmm5, %xmm1
343 ; SSE2-NEXT: paddd %xmm5, %xmm2
344 ; SSE2-NEXT: paddd %xmm5, %xmm2
345 ; SSE2-NEXT: paddd %xmm5, %xmm1
346 ; SSE2-NEXT: paddd %xmm3, %xmm1
347 ; SSE2-NEXT: paddd %xmm2, %xmm1
348 ; SSE2-NEXT: paddd %xmm0, %xmm1
349 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
350 ; SSE2-NEXT: paddd %xmm1, %xmm0
351 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
352 ; SSE2-NEXT: paddd %xmm0, %xmm1
353 ; SSE2-NEXT: movd %xmm1, %eax
356 ; AVX1-LABEL: sad_avx64i8:
357 ; AVX1: # %bb.0: # %entry
358 ; AVX1-NEXT: vpxor %xmm8, %xmm8, %xmm8
359 ; AVX1-NEXT: movq $-1024, %rax # imm = 0xFC00
360 ; AVX1-NEXT: vpxor %xmm0, %xmm0, %xmm0
361 ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
362 ; AVX1-NEXT: .p2align 4, 0x90
363 ; AVX1-NEXT: .LBB2_1: # %vector.body
364 ; AVX1-NEXT: # =>This Inner Loop Header: Depth=1
365 ; AVX1-NEXT: vmovdqa a+1024(%rax), %xmm3
366 ; AVX1-NEXT: vpsadbw b+1024(%rax), %xmm3, %xmm3
367 ; AVX1-NEXT: vmovdqa a+1040(%rax), %xmm4
368 ; AVX1-NEXT: vpsadbw b+1040(%rax), %xmm4, %xmm4
369 ; AVX1-NEXT: vmovdqa a+1056(%rax), %xmm5
370 ; AVX1-NEXT: vpsadbw b+1056(%rax), %xmm5, %xmm5
371 ; AVX1-NEXT: vmovdqa a+1072(%rax), %xmm6
372 ; AVX1-NEXT: vpsadbw b+1072(%rax), %xmm6, %xmm6
373 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm7
374 ; AVX1-NEXT: vpaddd %xmm7, %xmm6, %xmm6
375 ; AVX1-NEXT: vpaddd %xmm2, %xmm5, %xmm2
376 ; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm2, %ymm2
377 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
378 ; AVX1-NEXT: vpaddd %xmm5, %xmm4, %xmm4
379 ; AVX1-NEXT: vpaddd %xmm0, %xmm3, %xmm0
380 ; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
381 ; AVX1-NEXT: addq $64, %rax
382 ; AVX1-NEXT: jne .LBB2_1
383 ; AVX1-NEXT: # %bb.2: # %middle.block
384 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3
385 ; AVX1-NEXT: vextractf128 $1, %ymm8, %xmm4
386 ; AVX1-NEXT: vpaddd %xmm4, %xmm4, %xmm5
387 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6
388 ; AVX1-NEXT: vpaddd %xmm8, %xmm8, %xmm7
389 ; AVX1-NEXT: vpaddd %xmm8, %xmm8, %xmm1
390 ; AVX1-NEXT: vpaddd %xmm1, %xmm8, %xmm1
391 ; AVX1-NEXT: vpaddd %xmm7, %xmm8, %xmm7
392 ; AVX1-NEXT: vpaddd %xmm7, %xmm2, %xmm2
393 ; AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1
394 ; AVX1-NEXT: vpaddd %xmm5, %xmm4, %xmm2
395 ; AVX1-NEXT: vpaddd %xmm2, %xmm3, %xmm3
396 ; AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm2
397 ; AVX1-NEXT: vpaddd %xmm2, %xmm6, %xmm2
398 ; AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1
399 ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
400 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
401 ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
402 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
403 ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
404 ; AVX1-NEXT: vmovd %xmm0, %eax
405 ; AVX1-NEXT: vzeroupper
408 ; AVX2-LABEL: sad_avx64i8:
409 ; AVX2: # %bb.0: # %entry
410 ; AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0
411 ; AVX2-NEXT: movq $-1024, %rax # imm = 0xFC00
412 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
413 ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
414 ; AVX2-NEXT: .p2align 4, 0x90
415 ; AVX2-NEXT: .LBB2_1: # %vector.body
416 ; AVX2-NEXT: # =>This Inner Loop Header: Depth=1
417 ; AVX2-NEXT: vmovdqa a+1024(%rax), %ymm3
418 ; AVX2-NEXT: vpsadbw b+1024(%rax), %ymm3, %ymm3
419 ; AVX2-NEXT: vpaddd %ymm1, %ymm3, %ymm1
420 ; AVX2-NEXT: vmovdqa a+1056(%rax), %ymm3
421 ; AVX2-NEXT: vpsadbw b+1056(%rax), %ymm3, %ymm3
422 ; AVX2-NEXT: vpaddd %ymm2, %ymm3, %ymm2
423 ; AVX2-NEXT: addq $64, %rax
424 ; AVX2-NEXT: jne .LBB2_1
425 ; AVX2-NEXT: # %bb.2: # %middle.block
426 ; AVX2-NEXT: vpaddd %ymm0, %ymm2, %ymm2
427 ; AVX2-NEXT: vpaddd %ymm0, %ymm0, %ymm3
428 ; AVX2-NEXT: vpaddd %ymm0, %ymm1, %ymm0
429 ; AVX2-NEXT: vpaddd %ymm3, %ymm2, %ymm1
430 ; AVX2-NEXT: vpaddd %ymm1, %ymm3, %ymm1
431 ; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
432 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
433 ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
434 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
435 ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
436 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
437 ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
438 ; AVX2-NEXT: vmovd %xmm0, %eax
439 ; AVX2-NEXT: vzeroupper
442 ; AVX512F-LABEL: sad_avx64i8:
443 ; AVX512F: # %bb.0: # %entry
444 ; AVX512F-NEXT: vpxor %xmm0, %xmm0, %xmm0
445 ; AVX512F-NEXT: movq $-1024, %rax # imm = 0xFC00
446 ; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
447 ; AVX512F-NEXT: .p2align 4, 0x90
448 ; AVX512F-NEXT: .LBB2_1: # %vector.body
449 ; AVX512F-NEXT: # =>This Inner Loop Header: Depth=1
450 ; AVX512F-NEXT: vmovdqa a+1024(%rax), %ymm2
451 ; AVX512F-NEXT: vpsadbw b+1024(%rax), %ymm2, %ymm2
452 ; AVX512F-NEXT: vmovdqa a+1056(%rax), %ymm3
453 ; AVX512F-NEXT: vpsadbw b+1056(%rax), %ymm3, %ymm3
454 ; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2
455 ; AVX512F-NEXT: vpaddd %zmm1, %zmm2, %zmm1
456 ; AVX512F-NEXT: addq $64, %rax
457 ; AVX512F-NEXT: jne .LBB2_1
458 ; AVX512F-NEXT: # %bb.2: # %middle.block
459 ; AVX512F-NEXT: vpaddd %zmm0, %zmm1, %zmm1
460 ; AVX512F-NEXT: vpaddd %zmm0, %zmm0, %zmm0
461 ; AVX512F-NEXT: vpaddd %zmm0, %zmm1, %zmm0
462 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
463 ; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm0
464 ; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
465 ; AVX512F-NEXT: vpaddd %xmm1, %xmm0, %xmm0
466 ; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
467 ; AVX512F-NEXT: vpaddd %xmm1, %xmm0, %xmm0
468 ; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
469 ; AVX512F-NEXT: vpaddd %xmm1, %xmm0, %xmm0
470 ; AVX512F-NEXT: vmovd %xmm0, %eax
471 ; AVX512F-NEXT: vzeroupper
474 ; AVX512BW-LABEL: sad_avx64i8:
475 ; AVX512BW: # %bb.0: # %entry
476 ; AVX512BW-NEXT: vpxor %xmm0, %xmm0, %xmm0
477 ; AVX512BW-NEXT: movq $-1024, %rax # imm = 0xFC00
478 ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
479 ; AVX512BW-NEXT: .p2align 4, 0x90
480 ; AVX512BW-NEXT: .LBB2_1: # %vector.body
481 ; AVX512BW-NEXT: # =>This Inner Loop Header: Depth=1
482 ; AVX512BW-NEXT: vmovdqa64 a+1024(%rax), %zmm2
483 ; AVX512BW-NEXT: vpsadbw b+1024(%rax), %zmm2, %zmm2
484 ; AVX512BW-NEXT: vpaddd %zmm1, %zmm2, %zmm1
485 ; AVX512BW-NEXT: addq $64, %rax
486 ; AVX512BW-NEXT: jne .LBB2_1
487 ; AVX512BW-NEXT: # %bb.2: # %middle.block
488 ; AVX512BW-NEXT: vpaddd %zmm0, %zmm1, %zmm1
489 ; AVX512BW-NEXT: vpaddd %zmm0, %zmm0, %zmm0
490 ; AVX512BW-NEXT: vpaddd %zmm0, %zmm1, %zmm0
491 ; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
492 ; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm0
493 ; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
494 ; AVX512BW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
495 ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
496 ; AVX512BW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
497 ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
498 ; AVX512BW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
499 ; AVX512BW-NEXT: vmovd %xmm0, %eax
500 ; AVX512BW-NEXT: vzeroupper
501 ; AVX512BW-NEXT: retq
503 br label %vector.body
506 %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
507 %vec.phi = phi <64 x i32> [ zeroinitializer, %entry ], [ %10, %vector.body ]
508 %0 = getelementptr inbounds [1024 x i8], [1024 x i8]* @a, i64 0, i64 %index
509 %1 = bitcast i8* %0 to <64 x i8>*
510 %wide.load = load <64 x i8>, <64 x i8>* %1, align 64
511 %2 = zext <64 x i8> %wide.load to <64 x i32>
512 %3 = getelementptr inbounds [1024 x i8], [1024 x i8]* @b, i64 0, i64 %index
513 %4 = bitcast i8* %3 to <64 x i8>*
514 %wide.load1 = load <64 x i8>, <64 x i8>* %4, align 64
515 %5 = zext <64 x i8> %wide.load1 to <64 x i32>
516 %6 = sub nsw <64 x i32> %2, %5
517 %7 = icmp sgt <64 x i32> %6, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
518 %8 = sub nsw <64 x i32> zeroinitializer, %6
519 %9 = select <64 x i1> %7, <64 x i32> %6, <64 x i32> %8
520 %10 = add nsw <64 x i32> %9, %vec.phi
521 %index.next = add i64 %index, 64
522 %11 = icmp eq i64 %index.next, 1024
523 br i1 %11, label %middle.block, label %vector.body
526 %rdx.shuf = shufflevector <64 x i32> %10, <64 x i32> undef, <64 x i32> <i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
527 %bin.rdx = add <64 x i32> %10, %rdx.shuf
528 %rdx.shuf2 = shufflevector <64 x i32> %bin.rdx, <64 x i32> undef, <64 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
529 %bin.rdx2 = add <64 x i32> %bin.rdx, %rdx.shuf2
530 %rdx.shuf3 = shufflevector <64 x i32> %bin.rdx2, <64 x i32> undef, <64 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
531 %bin.rdx3 = add <64 x i32> %bin.rdx2, %rdx.shuf3
532 %rdx.shuf4 = shufflevector <64 x i32> %bin.rdx3, <64 x i32> undef, <64 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
533 %bin.rdx4 = add <64 x i32> %bin.rdx3, %rdx.shuf4
534 %rdx.shuf5 = shufflevector <64 x i32> %bin.rdx4, <64 x i32> undef, <64 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
535 %bin.rdx5 = add <64 x i32> %bin.rdx4, %rdx.shuf5
536 %rdx.shuf6 = shufflevector <64 x i32> %bin.rdx5, <64 x i32> undef, <64 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
537 %bin.rdx6 = add <64 x i32> %bin.rdx5, %rdx.shuf6
538 %12 = extractelement <64 x i32> %bin.rdx6, i32 0
; sad_2i8 - sum of absolute differences between @a and @b, only 2 bytes per
; loop iteration (<2 x i8> loads, <2 x i32> accumulator). This exercises the
; SAD lowering on a vector much narrower than a 128-bit register.
; The assertions below expect each operand to be loaded with a 32-bit movd and
; then cut down to its low 16 bits before psadbw: a pand with [65535,0,0,0] on
; sse2, and a vpblendw against a zeroed register on avx.
542 define dso_local i32 @sad_2i8() nounwind {
543 ; SSE2-LABEL: sad_2i8:
544 ; SSE2: # %bb.0: # %entry
545 ; SSE2-NEXT: pxor %xmm0, %xmm0
546 ; SSE2-NEXT: movq $-1024, %rax # imm = 0xFC00
547 ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,0,0]
548 ; SSE2-NEXT: .p2align 4, 0x90
549 ; SSE2-NEXT: .LBB3_1: # %vector.body
550 ; SSE2-NEXT: # =>This Inner Loop Header: Depth=1
551 ; SSE2-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
552 ; SSE2-NEXT: movd {{.*#+}} xmm3 = mem[0],zero,zero,zero
553 ; SSE2-NEXT: pand %xmm1, %xmm2
554 ; SSE2-NEXT: pand %xmm1, %xmm3
555 ; SSE2-NEXT: psadbw %xmm2, %xmm3
556 ; SSE2-NEXT: paddd %xmm3, %xmm0
557 ; SSE2-NEXT: addq $2, %rax
558 ; SSE2-NEXT: jne .LBB3_1
559 ; SSE2-NEXT: # %bb.2: # %middle.block
560 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
561 ; SSE2-NEXT: paddd %xmm0, %xmm1
562 ; SSE2-NEXT: movd %xmm1, %eax
565 ; AVX-LABEL: sad_2i8:
566 ; AVX: # %bb.0: # %entry
567 ; AVX-NEXT: vpxor %xmm0, %xmm0, %xmm0
568 ; AVX-NEXT: movq $-1024, %rax # imm = 0xFC00
569 ; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
570 ; AVX-NEXT: .p2align 4, 0x90
571 ; AVX-NEXT: .LBB3_1: # %vector.body
572 ; AVX-NEXT: # =>This Inner Loop Header: Depth=1
573 ; AVX-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero
574 ; AVX-NEXT: vmovd {{.*#+}} xmm3 = mem[0],zero,zero,zero
575 ; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm0[1,2,3,4,5,6,7]
576 ; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm0[1,2,3,4,5,6,7]
577 ; AVX-NEXT: vpsadbw %xmm3, %xmm2, %xmm2
578 ; AVX-NEXT: vpaddd %xmm1, %xmm2, %xmm1
579 ; AVX-NEXT: addq $2, %rax
580 ; AVX-NEXT: jne .LBB3_1
581 ; AVX-NEXT: # %bb.2: # %middle.block
582 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
583 ; AVX-NEXT: vpaddd %xmm0, %xmm1, %xmm0
584 ; AVX-NEXT: vmovd %xmm0, %eax
587 br label %vector.body
; Loop-carried values: the byte offset into both arrays and the <2 x i32>
; running sum (both start at zero).
590 %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
591 %vec.phi = phi <2 x i32> [ zeroinitializer, %entry ], [ %10, %vector.body ]
; Load 2 bytes from each array at %index and widen each byte to an i32 lane.
592 %0 = getelementptr inbounds [1024 x i8], [1024 x i8]* @a, i64 0, i64 %index
593 %1 = bitcast i8* %0 to <2 x i8>*
594 %wide.load = load <2 x i8>, <2 x i8>* %1, align 4
595 %2 = zext <2 x i8> %wide.load to <2 x i32>
596 %3 = getelementptr inbounds [1024 x i8], [1024 x i8]* @b, i64 0, i64 %index
597 %4 = bitcast i8* %3 to <2 x i8>*
598 %wide.load1 = load <2 x i8>, <2 x i8>* %4, align 4
599 %5 = zext <2 x i8> %wide.load1 to <2 x i32>
; Per-lane absolute difference: keep %6 where it is > -1, otherwise use 0 - %6.
600 %6 = sub nsw <2 x i32> %2, %5
601 %7 = icmp sgt <2 x i32> %6, <i32 -1, i32 -1>
602 %8 = sub nsw <2 x i32> zeroinitializer, %6
603 %9 = select <2 x i1> %7, <2 x i32> %6, <2 x i32> %8
; Accumulate, advance by 2 bytes, and leave the loop once the index is 1024.
604 %10 = add nsw <2 x i32> %9, %vec.phi
605 %index.next = add i64 %index, 2
606 %11 = icmp eq i64 %index.next, 1024
607 br i1 %11, label %middle.block, label %vector.body
; Horizontal reduction: add lane 1 onto lane 0, then extract lane 0.
610 %rdx.shuf = shufflevector <2 x i32> %10, <2 x i32> undef, <2 x i32> <i32 1, i32 undef>
611 %bin.rdx = add <2 x i32> %10, %rdx.shuf
612 %12 = extractelement <2 x i32> %bin.rdx, i32 0
; sad_4i8 walks the two 1024-byte globals @a and @b four bytes per iteration,
; accumulates the per-element absolute differences in a <4 x i32> phi, and
; reduces that vector to a scalar after the loop. The assertions below expect
; the whole loop body to be selected as one psadbw/vpsadbw plus one accumulate.
616 define dso_local i32 @sad_4i8() nounwind {
617 ; SSE2-LABEL: sad_4i8:
618 ; SSE2: # %bb.0: # %entry
619 ; SSE2-NEXT: pxor %xmm0, %xmm0
620 ; SSE2-NEXT: movq $-1024, %rax # imm = 0xFC00
621 ; SSE2-NEXT: .p2align 4, 0x90
622 ; SSE2-NEXT: .LBB4_1: # %vector.body
623 ; SSE2-NEXT: # =>This Inner Loop Header: Depth=1
624 ; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
625 ; SSE2-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
626 ; SSE2-NEXT: psadbw %xmm1, %xmm2
627 ; SSE2-NEXT: paddd %xmm2, %xmm0
628 ; SSE2-NEXT: addq $4, %rax
629 ; SSE2-NEXT: jne .LBB4_1
630 ; SSE2-NEXT: # %bb.2: # %middle.block
631 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
632 ; SSE2-NEXT: paddd %xmm0, %xmm1
633 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
634 ; SSE2-NEXT: paddd %xmm1, %xmm0
635 ; SSE2-NEXT: movd %xmm0, %eax
638 ; AVX-LABEL: sad_4i8:
639 ; AVX: # %bb.0: # %entry
640 ; AVX-NEXT: vpxor %xmm0, %xmm0, %xmm0
641 ; AVX-NEXT: movq $-1024, %rax # imm = 0xFC00
642 ; AVX-NEXT: .p2align 4, 0x90
643 ; AVX-NEXT: .LBB4_1: # %vector.body
644 ; AVX-NEXT: # =>This Inner Loop Header: Depth=1
645 ; AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
646 ; AVX-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero
647 ; AVX-NEXT: vpsadbw %xmm2, %xmm1, %xmm1
648 ; AVX-NEXT: vpaddd %xmm0, %xmm1, %xmm0
649 ; AVX-NEXT: addq $4, %rax
650 ; AVX-NEXT: jne .LBB4_1
651 ; AVX-NEXT: # %bb.2: # %middle.block
652 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
653 ; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
654 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
655 ; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
656 ; AVX-NEXT: vmovd %xmm0, %eax
659 br label %vector.body
; %index is the byte offset used for both arrays; %vec.phi is the running sum.
662 %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
663 %vec.phi = phi <4 x i32> [ zeroinitializer, %entry ], [ %10, %vector.body ]
; Load four bytes from each array and zero-extend every element to i32.
664 %0 = getelementptr inbounds [1024 x i8], [1024 x i8]* @a, i64 0, i64 %index
665 %1 = bitcast i8* %0 to <4 x i8>*
666 %wide.load = load <4 x i8>, <4 x i8>* %1, align 4
667 %2 = zext <4 x i8> %wide.load to <4 x i32>
668 %3 = getelementptr inbounds [1024 x i8], [1024 x i8]* @b, i64 0, i64 %index
669 %4 = bitcast i8* %3 to <4 x i8>*
670 %wide.load1 = load <4 x i8>, <4 x i8>* %4, align 4
671 %5 = zext <4 x i8> %wide.load1 to <4 x i32>
; Absolute difference: keep %6 where it is greater than -1, else use 0 - %6.
672 %6 = sub nsw <4 x i32> %2, %5
673 %7 = icmp sgt <4 x i32> %6, <i32 -1, i32 -1, i32 -1, i32 -1>
674 %8 = sub nsw <4 x i32> zeroinitializer, %6
675 %9 = select <4 x i1> %7, <4 x i32> %6, <4 x i32> %8
676 %10 = add nsw <4 x i32> %9, %vec.phi
; Advance by four bytes and leave the loop once all 1024 bytes are consumed.
677 %index.next = add i64 %index, 4
678 %11 = icmp eq i64 %index.next, 1024
679 br i1 %11, label %middle.block, label %vector.body
; Horizontal reduction: add elements 2,3 onto 0,1, then element 1 onto element 0.
682 %h2 = shufflevector <4 x i32> %10, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
683 %sum2 = add <4 x i32> %10, %h2
684 %h3 = shufflevector <4 x i32> %sum2, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
685 %sum3 = add <4 x i32> %sum2, %h3
686 %sum = extractelement <4 x i32> %sum3, i32 0
; sad_nonloop_4i8 is a straight-line (no loop) sum of absolute differences of
; the <4 x i8> values loaded from %p and %q. The assertions below expect two
; 32-bit loads, one psadbw/vpsadbw and a move of the low element to %eax, with
; no shuffle/add reduction left over.
691 define dso_local i32 @sad_nonloop_4i8(<4 x i8>* nocapture readonly %p, i64, <4 x i8>* nocapture readonly %q) local_unnamed_addr #0 {
692 ; SSE2-LABEL: sad_nonloop_4i8:
694 ; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
695 ; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
696 ; SSE2-NEXT: psadbw %xmm0, %xmm1
697 ; SSE2-NEXT: movd %xmm1, %eax
700 ; AVX-LABEL: sad_nonloop_4i8:
702 ; AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
703 ; AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
704 ; AVX-NEXT: vpsadbw %xmm0, %xmm1, %xmm0
705 ; AVX-NEXT: vmovd %xmm0, %eax
; Load both byte vectors (align 1) and zero-extend every element to i32.
707 %v1 = load <4 x i8>, <4 x i8>* %p, align 1
708 %z1 = zext <4 x i8> %v1 to <4 x i32>
709 %v2 = load <4 x i8>, <4 x i8>* %q, align 1
710 %z2 = zext <4 x i8> %v2 to <4 x i32>
; Absolute difference. Despite its name, %isneg is true where %sub is greater
; than -1, so the select keeps %sub there and takes %neg (0 - %sub) otherwise.
711 %sub = sub nsw <4 x i32> %z1, %z2
712 %isneg = icmp sgt <4 x i32> %sub, <i32 -1, i32 -1, i32 -1, i32 -1>
713 %neg = sub nsw <4 x i32> zeroinitializer, %sub
714 %abs = select <4 x i1> %isneg, <4 x i32> %sub, <4 x i32> %neg
; Horizontal reduction: add elements 2,3 onto 0,1, then element 1 onto element 0.
715 %h2 = shufflevector <4 x i32> %abs, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
716 %sum2 = add <4 x i32> %abs, %h2
717 %h3 = shufflevector <4 x i32> %sum2, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
718 %sum3 = add <4 x i32> %sum2, %h3
719 %sum = extractelement <4 x i32> %sum3, i32 0
; sad_nonloop_8i8 is a straight-line sum of absolute differences of the
; <8 x i8> values loaded from %p and %q. The assertions below expect two 64-bit
; loads, one psadbw/vpsadbw and a move of the low element to %eax.
723 define dso_local i32 @sad_nonloop_8i8(<8 x i8>* nocapture readonly %p, i64, <8 x i8>* nocapture readonly %q) local_unnamed_addr #0 {
724 ; SSE2-LABEL: sad_nonloop_8i8:
726 ; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
727 ; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
728 ; SSE2-NEXT: psadbw %xmm0, %xmm1
729 ; SSE2-NEXT: movd %xmm1, %eax
732 ; AVX-LABEL: sad_nonloop_8i8:
734 ; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
735 ; AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
736 ; AVX-NEXT: vpsadbw %xmm0, %xmm1, %xmm0
737 ; AVX-NEXT: vmovd %xmm0, %eax
; Load both byte vectors (align 1) and zero-extend every element to i32.
739 %v1 = load <8 x i8>, <8 x i8>* %p, align 1
740 %z1 = zext <8 x i8> %v1 to <8 x i32>
741 %v2 = load <8 x i8>, <8 x i8>* %q, align 1
742 %z2 = zext <8 x i8> %v2 to <8 x i32>
; Absolute difference. Despite its name, %isneg is true where %sub is greater
; than -1, so the select keeps %sub there and takes %neg (0 - %sub) otherwise.
743 %sub = sub nsw <8 x i32> %z1, %z2
744 %isneg = icmp sgt <8 x i32> %sub, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
745 %neg = sub nsw <8 x i32> zeroinitializer, %sub
746 %abs = select <8 x i1> %isneg, <8 x i32> %sub, <8 x i32> %neg
; Horizontal reduction in three halving steps (4..7, then 2..3, then 1) down to
; element 0.
747 %h1 = shufflevector <8 x i32> %abs, <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
748 %sum1 = add <8 x i32> %abs, %h1
749 %h2 = shufflevector <8 x i32> %sum1, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
750 %sum2 = add <8 x i32> %sum1, %h2
751 %h3 = shufflevector <8 x i32> %sum2, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
752 %sum3 = add <8 x i32> %sum2, %h3
753 %sum = extractelement <8 x i32> %sum3, i32 0
; sad_nonloop_16i8 is a straight-line sum of absolute differences of the
; <16 x i8> values loaded from %p and %q. The assertions below expect one
; psadbw/vpsadbw followed by a single shuffle + 64-bit add that combines the
; two partial sums. The operands are (%rdi) and (%rdx), which is consistent
; with the unnamed i64 parameter sitting between %p and %q.
757 define dso_local i32 @sad_nonloop_16i8(<16 x i8>* nocapture readonly %p, i64, <16 x i8>* nocapture readonly %q) local_unnamed_addr #0 {
758 ; SSE2-LABEL: sad_nonloop_16i8:
760 ; SSE2-NEXT: movdqu (%rdi), %xmm0
761 ; SSE2-NEXT: movdqu (%rdx), %xmm1
762 ; SSE2-NEXT: psadbw %xmm0, %xmm1
763 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
764 ; SSE2-NEXT: paddq %xmm1, %xmm0
765 ; SSE2-NEXT: movd %xmm0, %eax
768 ; AVX-LABEL: sad_nonloop_16i8:
770 ; AVX-NEXT: vmovdqu (%rdi), %xmm0
771 ; AVX-NEXT: vpsadbw (%rdx), %xmm0, %xmm0
772 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
773 ; AVX-NEXT: vpaddq %xmm1, %xmm0, %xmm0
774 ; AVX-NEXT: vmovd %xmm0, %eax
; Load both byte vectors (align 1) and zero-extend every element to i32.
776 %v1 = load <16 x i8>, <16 x i8>* %p, align 1
777 %z1 = zext <16 x i8> %v1 to <16 x i32>
778 %v2 = load <16 x i8>, <16 x i8>* %q, align 1
779 %z2 = zext <16 x i8> %v2 to <16 x i32>
; Absolute difference. Despite its name, %isneg is true where %sub is greater
; than -1, so the select keeps %sub there and takes %neg (0 - %sub) otherwise.
780 %sub = sub nsw <16 x i32> %z1, %z2
781 %isneg = icmp sgt <16 x i32> %sub, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
782 %neg = sub nsw <16 x i32> zeroinitializer, %sub
783 %abs = select <16 x i1> %isneg, <16 x i32> %sub, <16 x i32> %neg
; Horizontal reduction in four halving steps (8..15, 4..7, 2..3, 1) down to
; element 0.
784 %h0 = shufflevector <16 x i32> %abs, <16 x i32> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
785 %sum0 = add <16 x i32> %abs, %h0
786 %h1 = shufflevector <16 x i32> %sum0, <16 x i32> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
787 %sum1 = add <16 x i32> %sum0, %h1
788 %h2 = shufflevector <16 x i32> %sum1, <16 x i32> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
789 %sum2 = add <16 x i32> %sum1, %h2
790 %h3 = shufflevector <16 x i32> %sum2, <16 x i32> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
791 %sum3 = add <16 x i32> %sum2, %h3
792 %sum = extractelement <16 x i32> %sum3, i32 0
796 define dso_local i32 @sad_nonloop_32i8(<32 x i8>* nocapture readonly %p, i64, <32 x i8>* nocapture readonly %q) local_unnamed_addr #0 {
797 ; SSE2-LABEL: sad_nonloop_32i8:
799 ; SSE2-NEXT: movdqu (%rdx), %xmm0
800 ; SSE2-NEXT: movdqu 16(%rdx), %xmm1
801 ; SSE2-NEXT: movdqu (%rdi), %xmm2
802 ; SSE2-NEXT: psadbw %xmm0, %xmm2
803 ; SSE2-NEXT: movdqu 16(%rdi), %xmm0
804 ; SSE2-NEXT: psadbw %xmm1, %xmm0
805 ; SSE2-NEXT: paddq %xmm2, %xmm0
806 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
807 ; SSE2-NEXT: paddq %xmm0, %xmm1
808 ; SSE2-NEXT: movd %xmm1, %eax
811 ; AVX1-LABEL: sad_nonloop_32i8:
813 ; AVX1-NEXT: vmovdqu (%rdi), %xmm0
814 ; AVX1-NEXT: vmovdqu 16(%rdi), %xmm1
815 ; AVX1-NEXT: vpsadbw 16(%rdx), %xmm1, %xmm1
816 ; AVX1-NEXT: vpsadbw (%rdx), %xmm0, %xmm0
817 ; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0
818 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
819 ; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0
820 ; AVX1-NEXT: vmovd %xmm0, %eax
823 ; AVX2-LABEL: sad_nonloop_32i8:
825 ; AVX2-NEXT: vmovdqu (%rdi), %ymm0
826 ; AVX2-NEXT: vpsadbw (%rdx), %ymm0, %ymm0
827 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
828 ; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0
829 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
830 ; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0
831 ; AVX2-NEXT: vmovd %xmm0, %eax
832 ; AVX2-NEXT: vzeroupper
835 ; AVX512-LABEL: sad_nonloop_32i8:
837 ; AVX512-NEXT: vmovdqu (%rdi), %ymm0
838 ; AVX512-NEXT: vpsadbw (%rdx), %ymm0, %ymm0
839 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
840 ; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm0
841 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
842 ; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm0
843 ; AVX512-NEXT: vmovd %xmm0, %eax
844 ; AVX512-NEXT: vzeroupper
846 %v1 = load <32 x i8>, <32 x i8>* %p, align 1
847 %z1 = zext <32 x i8> %v1 to <32 x i32>
848 %v2 = load <32 x i8>, <32 x i8>* %q, align 1
849 %z2 = zext <32 x i8> %v2 to <32 x i32>
850 %sub = sub nsw <32 x i32> %z1, %z2
851 %isneg = icmp sgt <32 x i32> %sub, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
852 %neg = sub nsw <32 x i32> zeroinitializer, %sub
853 %abs = select <32 x i1> %isneg, <32 x i32> %sub, <32 x i32> %neg
854 %h32 = shufflevector <32 x i32> %abs, <32 x i32> undef, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
855 %sum32 = add <32 x i32> %abs, %h32
856 %h0 = shufflevector <32 x i32> %sum32, <32 x i32> undef, <32 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
857 %sum0 = add <32 x i32> %sum32, %h0
858 %h1 = shufflevector <32 x i32> %sum0, <32 x i32> undef, <32 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
859 %sum1 = add <32 x i32> %sum0, %h1
860 %h2 = shufflevector <32 x i32> %sum1, <32 x i32> undef, <32 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
861 %sum2 = add <32 x i32> %sum1, %h2
862 %h3 = shufflevector <32 x i32> %sum2, <32 x i32> undef, <32 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
863 %sum3 = add <32 x i32> %sum2, %h3
864 %sum = extractelement <32 x i32> %sum3, i32 0
868 define dso_local i32 @sad_nonloop_64i8(<64 x i8>* nocapture readonly %p, i64, <64 x i8>* nocapture readonly %q) local_unnamed_addr #0 {
869 ; SSE2-LABEL: sad_nonloop_64i8:
871 ; SSE2-NEXT: movdqu (%rdx), %xmm0
872 ; SSE2-NEXT: movdqu 16(%rdx), %xmm1
873 ; SSE2-NEXT: movdqu 32(%rdx), %xmm2
874 ; SSE2-NEXT: movdqu 48(%rdx), %xmm3
875 ; SSE2-NEXT: movdqu (%rdi), %xmm4
876 ; SSE2-NEXT: psadbw %xmm0, %xmm4
877 ; SSE2-NEXT: movdqu 16(%rdi), %xmm0
878 ; SSE2-NEXT: psadbw %xmm1, %xmm0
879 ; SSE2-NEXT: movdqu 32(%rdi), %xmm1
880 ; SSE2-NEXT: psadbw %xmm2, %xmm1
881 ; SSE2-NEXT: movdqu 48(%rdi), %xmm2
882 ; SSE2-NEXT: psadbw %xmm3, %xmm2
883 ; SSE2-NEXT: paddq %xmm0, %xmm2
884 ; SSE2-NEXT: paddq %xmm1, %xmm2
885 ; SSE2-NEXT: paddq %xmm4, %xmm2
886 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
887 ; SSE2-NEXT: paddq %xmm2, %xmm0
888 ; SSE2-NEXT: movd %xmm0, %eax
891 ; AVX1-LABEL: sad_nonloop_64i8:
893 ; AVX1-NEXT: vmovdqu (%rdi), %xmm0
894 ; AVX1-NEXT: vmovdqu 16(%rdi), %xmm1
895 ; AVX1-NEXT: vmovdqu 32(%rdi), %xmm2
896 ; AVX1-NEXT: vmovdqu 48(%rdi), %xmm3
897 ; AVX1-NEXT: vpsadbw 48(%rdx), %xmm3, %xmm3
898 ; AVX1-NEXT: vpsadbw 16(%rdx), %xmm1, %xmm1
899 ; AVX1-NEXT: vpaddq %xmm3, %xmm1, %xmm1
900 ; AVX1-NEXT: vpsadbw 32(%rdx), %xmm2, %xmm2
901 ; AVX1-NEXT: vpaddq %xmm1, %xmm2, %xmm1
902 ; AVX1-NEXT: vpsadbw (%rdx), %xmm0, %xmm0
903 ; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0
904 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
905 ; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0
906 ; AVX1-NEXT: vmovd %xmm0, %eax
909 ; AVX2-LABEL: sad_nonloop_64i8:
911 ; AVX2-NEXT: vmovdqu (%rdi), %ymm0
912 ; AVX2-NEXT: vmovdqu 32(%rdi), %ymm1
913 ; AVX2-NEXT: vpsadbw 32(%rdx), %ymm1, %ymm1
914 ; AVX2-NEXT: vpsadbw (%rdx), %ymm0, %ymm0
915 ; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm0
916 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
917 ; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0
918 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
919 ; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0
920 ; AVX2-NEXT: vmovd %xmm0, %eax
921 ; AVX2-NEXT: vzeroupper
924 ; AVX512F-LABEL: sad_nonloop_64i8:
926 ; AVX512F-NEXT: vmovdqu (%rdi), %ymm0
927 ; AVX512F-NEXT: vmovdqu 32(%rdi), %ymm1
928 ; AVX512F-NEXT: vpsadbw 32(%rdx), %ymm1, %ymm1
929 ; AVX512F-NEXT: vpsadbw (%rdx), %ymm0, %ymm0
930 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
931 ; AVX512F-NEXT: vpaddq %zmm1, %zmm0, %zmm0
932 ; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
933 ; AVX512F-NEXT: vpaddq %xmm1, %xmm0, %xmm0
934 ; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
935 ; AVX512F-NEXT: vpaddq %xmm1, %xmm0, %xmm0
936 ; AVX512F-NEXT: vmovd %xmm0, %eax
937 ; AVX512F-NEXT: vzeroupper
940 ; AVX512BW-LABEL: sad_nonloop_64i8:
942 ; AVX512BW-NEXT: vmovdqu64 (%rdi), %zmm0
943 ; AVX512BW-NEXT: vpsadbw (%rdx), %zmm0, %zmm0
944 ; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
945 ; AVX512BW-NEXT: vpaddq %zmm1, %zmm0, %zmm0
946 ; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
947 ; AVX512BW-NEXT: vpaddq %xmm1, %xmm0, %xmm0
948 ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
949 ; AVX512BW-NEXT: vpaddq %xmm1, %xmm0, %xmm0
950 ; AVX512BW-NEXT: vmovd %xmm0, %eax
951 ; AVX512BW-NEXT: vzeroupper
952 ; AVX512BW-NEXT: retq
953 %v1 = load <64 x i8>, <64 x i8>* %p, align 1
954 %z1 = zext <64 x i8> %v1 to <64 x i32>
955 %v2 = load <64 x i8>, <64 x i8>* %q, align 1
956 %z2 = zext <64 x i8> %v2 to <64 x i32>
957 %sub = sub nsw <64 x i32> %z1, %z2
958 %isneg = icmp sgt <64 x i32> %sub, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
959 %neg = sub nsw <64 x i32> zeroinitializer, %sub
960 %abs = select <64 x i1> %isneg, <64 x i32> %sub, <64 x i32> %neg
961 %h64 = shufflevector <64 x i32> %abs, <64 x i32> undef, <64 x i32> <i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
962 %sum64 = add <64 x i32> %abs, %h64
963 %h32 = shufflevector <64 x i32> %sum64, <64 x i32> undef, <64 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
964 %sum32 = add <64 x i32> %sum64, %h32
965 %h0 = shufflevector <64 x i32> %sum32, <64 x i32> undef, <64 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
966 %sum0 = add <64 x i32> %sum32, %h0
967 %h1 = shufflevector <64 x i32> %sum0, <64 x i32> undef, <64 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
968 %sum1 = add <64 x i32> %sum0, %h1
969 %h2 = shufflevector <64 x i32> %sum1, <64 x i32> undef, <64 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
970 %sum2 = add <64 x i32> %sum1, %h2
971 %h3 = shufflevector <64 x i32> %sum2, <64 x i32> undef, <64 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
972 %sum3 = add <64 x i32> %sum2, %h3
973 %sum = extractelement <64 x i32> %sum3, i32 0
977 ; This contains an unrolled sad loop with a non-zero initial value.
978 ; DAGCombiner reassociation previously rewrote the adds to move the constant vector further down the tree. This resulted in the vector-reduction flag being lost.
; sad_unroll_nonzero_initial sums two 16-byte absolute-difference vectors
; (%arg vs %arg1 and %arg2 vs %arg3) together with the constant vector
; <1, 0, ..., 0>, then reduces the result to a scalar. The assertions below
; expect both halves to become psadbw/vpsadbw, with the constant applied as one
; constant-pool paddd/vpaddd after the two results are added.
979 define dso_local i32 @sad_unroll_nonzero_initial(<16 x i8>* %arg, <16 x i8>* %arg1, <16 x i8>* %arg2, <16 x i8>* %arg3) {
980 ; SSE2-LABEL: sad_unroll_nonzero_initial:
981 ; SSE2: # %bb.0: # %bb
982 ; SSE2-NEXT: movdqu (%rdi), %xmm0
983 ; SSE2-NEXT: movdqu (%rsi), %xmm1
984 ; SSE2-NEXT: psadbw %xmm0, %xmm1
985 ; SSE2-NEXT: movdqu (%rdx), %xmm0
986 ; SSE2-NEXT: movdqu (%rcx), %xmm2
987 ; SSE2-NEXT: psadbw %xmm0, %xmm2
988 ; SSE2-NEXT: paddd %xmm1, %xmm2
989 ; SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
990 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
991 ; SSE2-NEXT: paddd %xmm2, %xmm0
992 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
993 ; SSE2-NEXT: paddd %xmm0, %xmm1
994 ; SSE2-NEXT: movd %xmm1, %eax
997 ; AVX-LABEL: sad_unroll_nonzero_initial:
998 ; AVX: # %bb.0: # %bb
999 ; AVX-NEXT: vmovdqu (%rdi), %xmm0
1000 ; AVX-NEXT: vpsadbw (%rsi), %xmm0, %xmm0
1001 ; AVX-NEXT: vmovdqu (%rdx), %xmm1
1002 ; AVX-NEXT: vpsadbw (%rcx), %xmm1, %xmm1
1003 ; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
1004 ; AVX-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1005 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
1006 ; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
1007 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
1008 ; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
1009 ; AVX-NEXT: vmovd %xmm0, %eax
; First absolute difference: |%arg - %arg1| per element, widened to i32. Here
; the compare is slt 0, so the select takes the negated value where %tmp7 is
; negative.
1012 %tmp = load <16 x i8>, <16 x i8>* %arg, align 1
1013 %tmp4 = load <16 x i8>, <16 x i8>* %arg1, align 1
1014 %tmp5 = zext <16 x i8> %tmp to <16 x i32>
1015 %tmp6 = zext <16 x i8> %tmp4 to <16 x i32>
1016 %tmp7 = sub nsw <16 x i32> %tmp5, %tmp6
1017 %tmp8 = icmp slt <16 x i32> %tmp7, zeroinitializer
1018 %tmp9 = sub nsw <16 x i32> zeroinitializer, %tmp7
1019 %tmp10 = select <16 x i1> %tmp8, <16 x i32> %tmp9, <16 x i32> %tmp7
; Non-zero initial value: adds 1 to element 0 only.
1020 %tmp11 = add nuw nsw <16 x i32> %tmp10, <i32 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
; Second absolute difference: |%arg2 - %arg3| per element, added onto %tmp11.
1021 %tmp12 = load <16 x i8>, <16 x i8>* %arg2, align 1
1022 %tmp13 = load <16 x i8>, <16 x i8>* %arg3, align 1
1023 %tmp14 = zext <16 x i8> %tmp12 to <16 x i32>
1024 %tmp15 = zext <16 x i8> %tmp13 to <16 x i32>
1025 %tmp16 = sub nsw <16 x i32> %tmp14, %tmp15
1026 %tmp17 = icmp slt <16 x i32> %tmp16, zeroinitializer
1027 %tmp18 = sub nsw <16 x i32> zeroinitializer, %tmp16
1028 %tmp19 = select <16 x i1> %tmp17, <16 x i32> %tmp18, <16 x i32> %tmp16
1029 %tmp20 = add nuw nsw <16 x i32> %tmp19, %tmp11
; Horizontal reduction in four halving steps (8..15, 4..7, 2..3, 1) down to
; element 0.
1030 %tmp21 = shufflevector <16 x i32> %tmp20, <16 x i32> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
1031 %tmp22 = add <16 x i32> %tmp20, %tmp21
1032 %tmp23 = shufflevector <16 x i32> %tmp22, <16 x i32> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
1033 %tmp24 = add <16 x i32> %tmp22, %tmp23
1034 %tmp25 = shufflevector <16 x i32> %tmp24, <16 x i32> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
1035 %tmp26 = add <16 x i32> %tmp24, %tmp25
1036 %tmp27 = shufflevector <16 x i32> %tmp26, <16 x i32> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
1037 %tmp28 = add <16 x i32> %tmp26, %tmp27
1038 %tmp29 = extractelement <16 x i32> %tmp28, i64 0
1042 ; This test contains two absolute difference patterns joined by an add. The result of that add is then reduced to a single element.
1043 ; SelectionDAGBuilder should tag the joining add as a vector reduction. We need to recognize that both sides can use psadbw.
; sad_double_reduction adds two 16-byte absolute-difference vectors (%arg vs
; %arg1 and %arg2 vs %arg3) and reduces the sum to a scalar. The assertions
; below expect one psadbw/vpsadbw per pair. The last reduction step is checked
; as por/vpor for SSE2, AVX1 and AVX2 but as vpaddd for AVX512.
; NOTE(review): presumably OR is equivalent to ADD there because element 1 is
; known to be zero after psadbw - TODO confirm.
1044 define dso_local i32 @sad_double_reduction(<16 x i8>* %arg, <16 x i8>* %arg1, <16 x i8>* %arg2, <16 x i8>* %arg3) {
1045 ; SSE2-LABEL: sad_double_reduction:
1046 ; SSE2: # %bb.0: # %bb
1047 ; SSE2-NEXT: movdqu (%rdi), %xmm0
1048 ; SSE2-NEXT: movdqu (%rsi), %xmm1
1049 ; SSE2-NEXT: psadbw %xmm0, %xmm1
1050 ; SSE2-NEXT: movdqu (%rdx), %xmm0
1051 ; SSE2-NEXT: movdqu (%rcx), %xmm2
1052 ; SSE2-NEXT: psadbw %xmm0, %xmm2
1053 ; SSE2-NEXT: paddd %xmm1, %xmm2
1054 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
1055 ; SSE2-NEXT: paddd %xmm2, %xmm0
1056 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
1057 ; SSE2-NEXT: por %xmm0, %xmm1
1058 ; SSE2-NEXT: movd %xmm1, %eax
1061 ; AVX1-LABEL: sad_double_reduction:
1062 ; AVX1: # %bb.0: # %bb
1063 ; AVX1-NEXT: vmovdqu (%rdi), %xmm0
1064 ; AVX1-NEXT: vpsadbw (%rsi), %xmm0, %xmm0
1065 ; AVX1-NEXT: vmovdqu (%rdx), %xmm1
1066 ; AVX1-NEXT: vpsadbw (%rcx), %xmm1, %xmm1
1067 ; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0
1068 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
1069 ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
1070 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
1071 ; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
1072 ; AVX1-NEXT: vmovd %xmm0, %eax
1075 ; AVX2-LABEL: sad_double_reduction:
1076 ; AVX2: # %bb.0: # %bb
1077 ; AVX2-NEXT: vmovdqu (%rdi), %xmm0
1078 ; AVX2-NEXT: vpsadbw (%rsi), %xmm0, %xmm0
1079 ; AVX2-NEXT: vmovdqu (%rdx), %xmm1
1080 ; AVX2-NEXT: vpsadbw (%rcx), %xmm1, %xmm1
1081 ; AVX2-NEXT: vpaddd %xmm0, %xmm1, %xmm0
1082 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
1083 ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
1084 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
1085 ; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
1086 ; AVX2-NEXT: vmovd %xmm0, %eax
1089 ; AVX512-LABEL: sad_double_reduction:
1090 ; AVX512: # %bb.0: # %bb
1091 ; AVX512-NEXT: vmovdqu (%rdi), %xmm0
1092 ; AVX512-NEXT: vpsadbw (%rsi), %xmm0, %xmm0
1093 ; AVX512-NEXT: vmovdqu (%rdx), %xmm1
1094 ; AVX512-NEXT: vpsadbw (%rcx), %xmm1, %xmm1
1095 ; AVX512-NEXT: vpaddd %xmm0, %xmm1, %xmm0
1096 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
1097 ; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
1098 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
1099 ; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
1100 ; AVX512-NEXT: vmovd %xmm0, %eax
; First absolute difference: |%arg - %arg1| per element, widened to i32. The
; compare is slt 0, so the select takes the negated value where %tmp7 is
; negative.
1103 %tmp = load <16 x i8>, <16 x i8>* %arg, align 1
1104 %tmp4 = load <16 x i8>, <16 x i8>* %arg1, align 1
1105 %tmp5 = zext <16 x i8> %tmp to <16 x i32>
1106 %tmp6 = zext <16 x i8> %tmp4 to <16 x i32>
1107 %tmp7 = sub nsw <16 x i32> %tmp5, %tmp6
1108 %tmp8 = icmp slt <16 x i32> %tmp7, zeroinitializer
1109 %tmp9 = sub nsw <16 x i32> zeroinitializer, %tmp7
1110 %tmp10 = select <16 x i1> %tmp8, <16 x i32> %tmp9, <16 x i32> %tmp7
; Second absolute difference: |%arg2 - %arg3| per element.
1111 %tmp11 = load <16 x i8>, <16 x i8>* %arg2, align 1
1112 %tmp12 = load <16 x i8>, <16 x i8>* %arg3, align 1
1113 %tmp13 = zext <16 x i8> %tmp11 to <16 x i32>
1114 %tmp14 = zext <16 x i8> %tmp12 to <16 x i32>
1115 %tmp15 = sub nsw <16 x i32> %tmp13, %tmp14
1116 %tmp16 = icmp slt <16 x i32> %tmp15, zeroinitializer
1117 %tmp17 = sub nsw <16 x i32> zeroinitializer, %tmp15
1118 %tmp18 = select <16 x i1> %tmp16, <16 x i32> %tmp17, <16 x i32> %tmp15
; The joining add: combines the two absolute-difference vectors before the
; horizontal reduction.
1119 %tmp19 = add nuw nsw <16 x i32> %tmp18, %tmp10
; Horizontal reduction in four halving steps (8..15, 4..7, 2..3, 1) down to
; element 0.
1120 %tmp20 = shufflevector <16 x i32> %tmp19, <16 x i32> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
1121 %tmp21 = add <16 x i32> %tmp19, %tmp20
1122 %tmp22 = shufflevector <16 x i32> %tmp21, <16 x i32> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
1123 %tmp23 = add <16 x i32> %tmp21, %tmp22
1124 %tmp24 = shufflevector <16 x i32> %tmp23, <16 x i32> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
1125 %tmp25 = add <16 x i32> %tmp23, %tmp24
1126 %tmp26 = shufflevector <16 x i32> %tmp25, <16 x i32> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
1127 %tmp27 = add <16 x i32> %tmp25, %tmp26
1128 %tmp28 = extractelement <16 x i32> %tmp27, i64 0