1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse4.2 | FileCheck %s --check-prefix=X86
3 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.2 | FileCheck %s --check-prefix=X64
; Declarations of the SSE4.2 string-compare intrinsics called by the tests in this file.
; The pcmpestr* forms take an i32 after each vector operand (the tests pass %lhs_len / %rhs_len there);
; the pcmpistr* forms take only the two vectors. The trailing i8 receives a constant (24 or 25) at every call site.
; The *ric128 and *ri128 variants return i32; the *rm128 variants return <16 x i8>.
5 declare i32 @llvm.x86.sse42.pcmpestric128(<16 x i8> %lhs, i32, <16 x i8>, i32, i8)
6 declare i32 @llvm.x86.sse42.pcmpestri128(<16 x i8> %lhs, i32, <16 x i8>, i32, i8)
7 declare <16 x i8> @llvm.x86.sse42.pcmpestrm128(<16 x i8> %lhs, i32, <16 x i8>, i32, i8)
8 declare i32 @llvm.x86.sse42.pcmpistric128(<16 x i8> %lhs, <16 x i8>, i8)
9 declare i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %lhs, <16 x i8>, i8)
10 declare <16 x i8> @llvm.x86.sse42.pcmpistrm128(<16 x i8> %lhs, <16 x i8>, i8)
; Test: explicit-length compare returning a flag, with both <16 x i8> operands passed by value.
; The expected assembly below is autogenerated (see the first line of this file); regenerate it rather than editing by hand.
12 define i1 @pcmpestri_reg_eq_i8(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len) nounwind {
13 ; X86-LABEL: pcmpestri_reg_eq_i8:
14 ; X86: # %bb.0: # %entry
15 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
16 ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
17 ; X86-NEXT: pcmpestri $24, %xmm1, %xmm0
21 ; X64-LABEL: pcmpestri_reg_eq_i8:
22 ; X64: # %bb.0: # %entry
23 ; X64-NEXT: movl %esi, %edx
24 ; X64-NEXT: movl %edi, %eax
25 ; X64-NEXT: pcmpestri $24, %xmm1, %xmm0
; pcmpestric128 yields an i32; the test result is whether that value equals 0.
; Both targets are expected to place the two lengths in %eax / %edx before the pcmpestri.
29 %c = call i32 @llvm.x86.sse42.pcmpestric128(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, i8 24)
30 %result = icmp eq i32 %c, 0
; Test: explicit-length compare returning the index, with both <16 x i8> operands passed by value.
; Expected output on both targets: lengths in %eax / %edx, one pcmpestri $24, then the index copied from %ecx to %eax.
34 define i32 @pcmpestri_reg_idx_i8(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len) nounwind {
35 ; X86-LABEL: pcmpestri_reg_idx_i8:
36 ; X86: # %bb.0: # %entry
37 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
38 ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
39 ; X86-NEXT: pcmpestri $24, %xmm1, %xmm0
40 ; X86-NEXT: movl %ecx, %eax
43 ; X64-LABEL: pcmpestri_reg_idx_i8:
44 ; X64: # %bb.0: # %entry
45 ; X64-NEXT: movl %esi, %edx
46 ; X64-NEXT: movl %edi, %eax
47 ; X64-NEXT: pcmpestri $24, %xmm1, %xmm0
48 ; X64-NEXT: movl %ecx, %eax
; The i32 produced by pcmpestri128 is the only value computed in this function.
51 %idx = call i32 @llvm.x86.sse42.pcmpestri128(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, i8 24)
; Test: explicit-length index compare on by-value <16 x i8> operands, followed by a variable-index
; extract from both vectors and a byte subtraction. When the index equals 16 the result is 0 instead.
; The expected assembly performs the variable-index extracts by spilling each vector to a stack slot
; (movdqa) and loading one byte at the masked index (andl $15).
55 define i32 @pcmpestri_reg_diff_i8(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len) nounwind {
56 ; X86-LABEL: pcmpestri_reg_diff_i8:
57 ; X86: # %bb.0: # %entry
58 ; X86-NEXT: pushl %ebp
59 ; X86-NEXT: movl %esp, %ebp
60 ; X86-NEXT: andl $-16, %esp
61 ; X86-NEXT: subl $48, %esp
62 ; X86-NEXT: movl 8(%ebp), %eax
63 ; X86-NEXT: movl 12(%ebp), %edx
64 ; X86-NEXT: pcmpestri $24, %xmm1, %xmm0
65 ; X86-NEXT: cmpl $16, %ecx
66 ; X86-NEXT: jne .LBB2_2
68 ; X86-NEXT: xorl %eax, %eax
69 ; X86-NEXT: jmp .LBB2_3
70 ; X86-NEXT: .LBB2_2: # %compare
71 ; X86-NEXT: movdqa %xmm0, (%esp)
72 ; X86-NEXT: andl $15, %ecx
73 ; X86-NEXT: movzbl (%esp,%ecx), %eax
74 ; X86-NEXT: movdqa %xmm1, {{[0-9]+}}(%esp)
75 ; X86-NEXT: subb 16(%esp,%ecx), %al
76 ; X86-NEXT: .LBB2_3: # %exit
77 ; X86-NEXT: movzbl %al, %eax
78 ; X86-NEXT: movl %ebp, %esp
82 ; X64-LABEL: pcmpestri_reg_diff_i8:
83 ; X64: # %bb.0: # %entry
84 ; X64-NEXT: movl %esi, %edx
85 ; X64-NEXT: movl %edi, %eax
86 ; X64-NEXT: pcmpestri $24, %xmm1, %xmm0
87 ; X64-NEXT: # kill: def $ecx killed $ecx def $rcx
88 ; X64-NEXT: cmpl $16, %ecx
89 ; X64-NEXT: jne .LBB2_2
91 ; X64-NEXT: xorl %eax, %eax
92 ; X64-NEXT: movzbl %al, %eax
94 ; X64-NEXT: .LBB2_2: # %compare
95 ; X64-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp)
96 ; X64-NEXT: andl $15, %ecx
97 ; X64-NEXT: movzbl -24(%rsp,%rcx), %eax
98 ; X64-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp)
99 ; X64-NEXT: subb -40(%rsp,%rcx), %al
100 ; X64-NEXT: movzbl %al, %eax
; An index of 16 branches straight to %exit; any other index goes to %compare.
103 %idx = call i32 @llvm.x86.sse42.pcmpestri128(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, i8 24)
104 %eq = icmp eq i32 %idx, 16
105 br i1 %eq, label %exit, label %compare
; %compare: difference of the two bytes selected by the returned index.
108 %lhs_c = extractelement <16 x i8> %lhs, i32 %idx
109 %rhs_c = extractelement <16 x i8> %rhs, i32 %idx
110 %sub = sub i8 %lhs_c, %rhs_c
; %exit: 0 when arriving from %entry, otherwise the byte difference; widened to i32 with zext.
114 %result = phi i8 [ 0, %entry ], [ %sub, %compare ]
115 %result_ext = zext i8 %result to i32
; Test: explicit-length compare returning a flag, with both <16 x i8> operands loaded through pointers.
; In the expected assembly the first operand is loaded with movdqu and the second is used directly
; as the memory operand of pcmpestri.
119 define i1 @pcmpestri_mem_eq_i8(ptr %lhs_ptr, i32 %lhs_len, ptr %rhs_ptr, i32 %rhs_len) nounwind {
120 ; X86-LABEL: pcmpestri_mem_eq_i8:
121 ; X86: # %bb.0: # %entry
122 ; X86-NEXT: pushl %esi
123 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
124 ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
125 ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
126 ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
127 ; X86-NEXT: movdqu (%esi), %xmm0
128 ; X86-NEXT: pcmpestri $24, (%ecx), %xmm0
129 ; X86-NEXT: setae %al
130 ; X86-NEXT: popl %esi
133 ; X64-LABEL: pcmpestri_mem_eq_i8:
134 ; X64: # %bb.0: # %entry
135 ; X64-NEXT: movq %rdx, %r8
136 ; X64-NEXT: movl %esi, %eax
137 ; X64-NEXT: movdqu (%rdi), %xmm0
138 ; X64-NEXT: movl %ecx, %edx
139 ; X64-NEXT: pcmpestri $24, (%r8), %xmm0
140 ; X64-NEXT: setae %al
; Both loads are declared with align 1, i.e. no 16-byte alignment is promised to the backend.
143 %lhs = load <16 x i8>, ptr %lhs_ptr, align 1
144 %rhs = load <16 x i8>, ptr %rhs_ptr, align 1
; The comparison of the i32 result against 0 is expected to become a setae after the pcmpestri.
145 %c = call i32 @llvm.x86.sse42.pcmpestric128(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, i8 24)
146 %result = icmp eq i32 %c, 0
; Test: explicit-length compare returning the index, with both <16 x i8> operands loaded through pointers.
; In the expected assembly the second operand is used directly as the memory operand of pcmpestri
; and the index is copied from %ecx to %eax.
150 define i32 @pcmpestri_mem_idx_i8(ptr %lhs_ptr, i32 %lhs_len, ptr %rhs_ptr, i32 %rhs_len) nounwind {
151 ; X86-LABEL: pcmpestri_mem_idx_i8:
152 ; X86: # %bb.0: # %entry
153 ; X86-NEXT: pushl %esi
154 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
155 ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
156 ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
157 ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
158 ; X86-NEXT: movdqu (%esi), %xmm0
159 ; X86-NEXT: pcmpestri $24, (%ecx), %xmm0
160 ; X86-NEXT: movl %ecx, %eax
161 ; X86-NEXT: popl %esi
164 ; X64-LABEL: pcmpestri_mem_idx_i8:
165 ; X64: # %bb.0: # %entry
166 ; X64-NEXT: movq %rdx, %r8
167 ; X64-NEXT: movl %esi, %eax
168 ; X64-NEXT: movdqu (%rdi), %xmm0
169 ; X64-NEXT: movl %ecx, %edx
170 ; X64-NEXT: pcmpestri $24, (%r8), %xmm0
171 ; X64-NEXT: movl %ecx, %eax
; Both loads are declared with align 1, i.e. no 16-byte alignment is promised to the backend.
174 %lhs = load <16 x i8>, ptr %lhs_ptr, align 1
175 %rhs = load <16 x i8>, ptr %rhs_ptr, align 1
176 %idx = call i32 @llvm.x86.sse42.pcmpestri128(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, i8 24)
; Test: explicit-length index compare on <16 x i8> operands loaded through pointers, followed by a
; variable-index extract from both vectors and a byte subtraction. An index of 16 yields 0 instead.
; Here both operands are expected to be loaded into registers with movdqu (neither is used as a
; memory operand of pcmpestri), since the loaded vectors are reused by the extracts.
180 define i32 @pcmpestri_mem_diff_i8(ptr %lhs_ptr, i32 %lhs_len, ptr %rhs_ptr, i32 %rhs_len) nounwind {
181 ; X86-LABEL: pcmpestri_mem_diff_i8:
182 ; X86: # %bb.0: # %entry
183 ; X86-NEXT: pushl %ebp
184 ; X86-NEXT: movl %esp, %ebp
185 ; X86-NEXT: pushl %esi
186 ; X86-NEXT: andl $-16, %esp
187 ; X86-NEXT: subl $48, %esp
188 ; X86-NEXT: movl 12(%ebp), %eax
189 ; X86-NEXT: movl 20(%ebp), %edx
190 ; X86-NEXT: movl 16(%ebp), %ecx
191 ; X86-NEXT: movl 8(%ebp), %esi
192 ; X86-NEXT: movdqu (%esi), %xmm1
193 ; X86-NEXT: movdqu (%ecx), %xmm0
194 ; X86-NEXT: pcmpestri $24, %xmm0, %xmm1
195 ; X86-NEXT: cmpl $16, %ecx
196 ; X86-NEXT: jne .LBB5_2
198 ; X86-NEXT: xorl %eax, %eax
199 ; X86-NEXT: jmp .LBB5_3
200 ; X86-NEXT: .LBB5_2: # %compare
201 ; X86-NEXT: movdqa %xmm1, (%esp)
202 ; X86-NEXT: andl $15, %ecx
203 ; X86-NEXT: movzbl (%esp,%ecx), %eax
204 ; X86-NEXT: movdqa %xmm0, {{[0-9]+}}(%esp)
205 ; X86-NEXT: subb 16(%esp,%ecx), %al
206 ; X86-NEXT: .LBB5_3: # %exit
207 ; X86-NEXT: movzbl %al, %eax
208 ; X86-NEXT: leal -4(%ebp), %esp
209 ; X86-NEXT: popl %esi
210 ; X86-NEXT: popl %ebp
213 ; X64-LABEL: pcmpestri_mem_diff_i8:
214 ; X64: # %bb.0: # %entry
215 ; X64-NEXT: movl %esi, %eax
216 ; X64-NEXT: movdqu (%rdi), %xmm1
217 ; X64-NEXT: movdqu (%rdx), %xmm0
218 ; X64-NEXT: movl %ecx, %edx
219 ; X64-NEXT: pcmpestri $24, %xmm0, %xmm1
220 ; X64-NEXT: # kill: def $ecx killed $ecx def $rcx
221 ; X64-NEXT: cmpl $16, %ecx
222 ; X64-NEXT: jne .LBB5_2
224 ; X64-NEXT: xorl %eax, %eax
225 ; X64-NEXT: movzbl %al, %eax
227 ; X64-NEXT: .LBB5_2: # %compare
228 ; X64-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp)
229 ; X64-NEXT: andl $15, %ecx
230 ; X64-NEXT: movzbl -24(%rsp,%rcx), %eax
231 ; X64-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp)
232 ; X64-NEXT: subb -40(%rsp,%rcx), %al
233 ; X64-NEXT: movzbl %al, %eax
; Both loads are declared with align 1.
236 %lhs = load <16 x i8>, ptr %lhs_ptr, align 1
237 %rhs = load <16 x i8>, ptr %rhs_ptr, align 1
; An index of 16 branches straight to %exit; any other index goes to %compare.
238 %idx = call i32 @llvm.x86.sse42.pcmpestri128(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, i8 24)
239 %eq = icmp eq i32 %idx, 16
240 br i1 %eq, label %exit, label %compare
; %compare: difference of the two bytes selected by the returned index.
243 %lhs_c = extractelement <16 x i8> %lhs, i32 %idx
244 %rhs_c = extractelement <16 x i8> %rhs, i32 %idx
245 %sub = sub i8 %lhs_c, %rhs_c
; %exit: 0 when arriving from %entry, otherwise the byte difference; widened to i32 with zext.
249 %result = phi i8 [ 0, %entry ], [ %sub, %compare ]
250 %result_ext = zext i8 %result to i32
; Test: explicit-length compare returning a flag, with <8 x i16> operands passed by value and
; bitcast to <16 x i8> for the intrinsic call.
; TODO(review): this test passes imm8 24, whereas pcmpestri_mem_eq_i16 passes 25 — confirm the difference is intended.
254 define i1 @pcmpestri_reg_eq_i16(<8 x i16> %lhs, i32 %lhs_len, <8 x i16> %rhs, i32 %rhs_len) nounwind {
255 ; X86-LABEL: pcmpestri_reg_eq_i16:
256 ; X86: # %bb.0: # %entry
257 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
258 ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
259 ; X86-NEXT: pcmpestri $24, %xmm1, %xmm0
260 ; X86-NEXT: setae %al
263 ; X64-LABEL: pcmpestri_reg_eq_i16:
264 ; X64: # %bb.0: # %entry
265 ; X64-NEXT: movl %esi, %edx
266 ; X64-NEXT: movl %edi, %eax
267 ; X64-NEXT: pcmpestri $24, %xmm1, %xmm0
268 ; X64-NEXT: setae %al
; The bitcasts only adapt the vector type to the intrinsic signature; no instruction is expected for them.
271 %lhs_cast = bitcast <8 x i16> %lhs to <16 x i8>
272 %rhs_cast = bitcast <8 x i16> %rhs to <16 x i8>
273 %c = call i32 @llvm.x86.sse42.pcmpestric128(<16 x i8> %lhs_cast, i32 %lhs_len, <16 x i8> %rhs_cast, i32 %rhs_len, i8 24)
274 %result = icmp eq i32 %c, 0
; Test: explicit-length compare returning the index, with <8 x i16> operands passed by value and
; bitcast to <16 x i8> for the intrinsic call.
; TODO(review): this test passes imm8 24, whereas pcmpestri_mem_idx_i16 passes 25 — confirm the difference is intended.
278 define i32 @pcmpestri_reg_idx_i16(<8 x i16> %lhs, i32 %lhs_len, <8 x i16> %rhs, i32 %rhs_len) nounwind {
279 ; X86-LABEL: pcmpestri_reg_idx_i16:
280 ; X86: # %bb.0: # %entry
281 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
282 ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
283 ; X86-NEXT: pcmpestri $24, %xmm1, %xmm0
284 ; X86-NEXT: movl %ecx, %eax
287 ; X64-LABEL: pcmpestri_reg_idx_i16:
288 ; X64: # %bb.0: # %entry
289 ; X64-NEXT: movl %esi, %edx
290 ; X64-NEXT: movl %edi, %eax
291 ; X64-NEXT: pcmpestri $24, %xmm1, %xmm0
292 ; X64-NEXT: movl %ecx, %eax
; The bitcasts only adapt the vector type to the intrinsic signature; no instruction is expected for them.
295 %lhs_cast = bitcast <8 x i16> %lhs to <16 x i8>
296 %rhs_cast = bitcast <8 x i16> %rhs to <16 x i8>
297 %idx = call i32 @llvm.x86.sse42.pcmpestri128(<16 x i8> %lhs_cast, i32 %lhs_len, <16 x i8> %rhs_cast, i32 %rhs_len, i8 24)
; Test: explicit-length index compare on by-value <8 x i16> operands (bitcast to <16 x i8> for the
; call), followed by a variable-index extract of an i16 from each vector and a 16-bit subtraction.
; An index of 16 yields 0 instead.
; TODO(review): this test passes imm8 24 and compares the index against 16 while indexing <8 x i16>;
; pcmpestri_mem_diff_i16 passes 25 and compares against 8 — confirm the difference is intended.
301 define i32 @pcmpestri_reg_diff_i16(<8 x i16> %lhs, i32 %lhs_len, <8 x i16> %rhs, i32 %rhs_len) nounwind {
302 ; X86-LABEL: pcmpestri_reg_diff_i16:
303 ; X86: # %bb.0: # %entry
304 ; X86-NEXT: pushl %ebp
305 ; X86-NEXT: movl %esp, %ebp
306 ; X86-NEXT: andl $-16, %esp
307 ; X86-NEXT: subl $48, %esp
308 ; X86-NEXT: movl 8(%ebp), %eax
309 ; X86-NEXT: movl 12(%ebp), %edx
310 ; X86-NEXT: pcmpestri $24, %xmm1, %xmm0
311 ; X86-NEXT: cmpl $16, %ecx
312 ; X86-NEXT: jne .LBB8_2
314 ; X86-NEXT: xorl %eax, %eax
315 ; X86-NEXT: jmp .LBB8_3
316 ; X86-NEXT: .LBB8_2: # %compare
317 ; X86-NEXT: movdqa %xmm0, (%esp)
318 ; X86-NEXT: addl %ecx, %ecx
319 ; X86-NEXT: andl $14, %ecx
320 ; X86-NEXT: movzwl (%esp,%ecx), %eax
321 ; X86-NEXT: movdqa %xmm1, {{[0-9]+}}(%esp)
322 ; X86-NEXT: subw 16(%esp,%ecx), %ax
323 ; X86-NEXT: .LBB8_3: # %exit
324 ; X86-NEXT: movzwl %ax, %eax
325 ; X86-NEXT: movl %ebp, %esp
326 ; X86-NEXT: popl %ebp
329 ; X64-LABEL: pcmpestri_reg_diff_i16:
330 ; X64: # %bb.0: # %entry
331 ; X64-NEXT: movl %esi, %edx
332 ; X64-NEXT: movl %edi, %eax
333 ; X64-NEXT: pcmpestri $24, %xmm1, %xmm0
334 ; X64-NEXT: # kill: def $ecx killed $ecx def $rcx
335 ; X64-NEXT: cmpl $16, %ecx
336 ; X64-NEXT: jne .LBB8_2
338 ; X64-NEXT: xorl %eax, %eax
339 ; X64-NEXT: movzwl %ax, %eax
341 ; X64-NEXT: .LBB8_2: # %compare
342 ; X64-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp)
343 ; X64-NEXT: andl $7, %ecx
344 ; X64-NEXT: movzwl -24(%rsp,%rcx,2), %eax
345 ; X64-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp)
346 ; X64-NEXT: subw -40(%rsp,%rcx,2), %ax
347 ; X64-NEXT: movzwl %ax, %eax
; The intrinsic sees the bitcast <16 x i8> values; the extracts below use the original <8 x i16> values.
350 %lhs_cast = bitcast <8 x i16> %lhs to <16 x i8>
351 %rhs_cast = bitcast <8 x i16> %rhs to <16 x i8>
352 %idx = call i32 @llvm.x86.sse42.pcmpestri128(<16 x i8> %lhs_cast, i32 %lhs_len, <16 x i8> %rhs_cast, i32 %rhs_len, i8 24)
353 %eq = icmp eq i32 %idx, 16
354 br i1 %eq, label %exit, label %compare
; %compare: difference of the two i16 elements selected by the returned index.
357 %lhs_c = extractelement <8 x i16> %lhs, i32 %idx
358 %rhs_c = extractelement <8 x i16> %rhs, i32 %idx
359 %sub = sub i16 %lhs_c, %rhs_c
; %exit: 0 when arriving from %entry, otherwise the i16 difference; widened to i32 with zext.
363 %result = phi i16 [ 0, %entry ], [ %sub, %compare ]
364 %result_ext = zext i16 %result to i32
; Test: explicit-length compare returning a flag, with <8 x i16> operands loaded through pointers
; and bitcast to <16 x i8> for the intrinsic call. The immediate is 25 in this test.
; In the expected assembly the second operand is used directly as the memory operand of pcmpestri.
368 define i1 @pcmpestri_mem_eq_i16(ptr %lhs_ptr, i32 %lhs_len, ptr %rhs_ptr, i32 %rhs_len) nounwind {
369 ; X86-LABEL: pcmpestri_mem_eq_i16:
370 ; X86: # %bb.0: # %entry
371 ; X86-NEXT: pushl %esi
372 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
373 ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
374 ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
375 ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
376 ; X86-NEXT: movdqu (%esi), %xmm0
377 ; X86-NEXT: pcmpestri $25, (%ecx), %xmm0
378 ; X86-NEXT: setae %al
379 ; X86-NEXT: popl %esi
382 ; X64-LABEL: pcmpestri_mem_eq_i16:
383 ; X64: # %bb.0: # %entry
384 ; X64-NEXT: movq %rdx, %r8
385 ; X64-NEXT: movl %esi, %eax
386 ; X64-NEXT: movdqu (%rdi), %xmm0
387 ; X64-NEXT: movl %ecx, %edx
388 ; X64-NEXT: pcmpestri $25, (%r8), %xmm0
389 ; X64-NEXT: setae %al
; Both loads are declared with align 1.
392 %lhs = load <8 x i16>, ptr %lhs_ptr, align 1
393 %rhs = load <8 x i16>, ptr %rhs_ptr, align 1
394 %lhs_cast = bitcast <8 x i16> %lhs to <16 x i8>
395 %rhs_cast = bitcast <8 x i16> %rhs to <16 x i8>
; The comparison of the i32 result against 0 is expected to become a setae after the pcmpestri.
396 %c = call i32 @llvm.x86.sse42.pcmpestric128(<16 x i8> %lhs_cast, i32 %lhs_len, <16 x i8> %rhs_cast, i32 %rhs_len, i8 25)
397 %result = icmp eq i32 %c, 0
; Test: explicit-length compare returning the index, with <8 x i16> operands loaded through pointers
; and bitcast to <16 x i8> for the intrinsic call. The immediate is 25 in this test.
; In the expected assembly the second operand is used directly as the memory operand of pcmpestri.
401 define i32 @pcmpestri_mem_idx_i16(ptr %lhs_ptr, i32 %lhs_len, ptr %rhs_ptr, i32 %rhs_len) nounwind {
402 ; X86-LABEL: pcmpestri_mem_idx_i16:
403 ; X86: # %bb.0: # %entry
404 ; X86-NEXT: pushl %esi
405 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
406 ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
407 ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
408 ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
409 ; X86-NEXT: movdqu (%esi), %xmm0
410 ; X86-NEXT: pcmpestri $25, (%ecx), %xmm0
411 ; X86-NEXT: movl %ecx, %eax
412 ; X86-NEXT: popl %esi
415 ; X64-LABEL: pcmpestri_mem_idx_i16:
416 ; X64: # %bb.0: # %entry
417 ; X64-NEXT: movq %rdx, %r8
418 ; X64-NEXT: movl %esi, %eax
419 ; X64-NEXT: movdqu (%rdi), %xmm0
420 ; X64-NEXT: movl %ecx, %edx
421 ; X64-NEXT: pcmpestri $25, (%r8), %xmm0
422 ; X64-NEXT: movl %ecx, %eax
; Both loads are declared with align 1.
425 %lhs = load <8 x i16>, ptr %lhs_ptr, align 1
426 %rhs = load <8 x i16>, ptr %rhs_ptr, align 1
427 %lhs_cast = bitcast <8 x i16> %lhs to <16 x i8>
428 %rhs_cast = bitcast <8 x i16> %rhs to <16 x i8>
429 %idx = call i32 @llvm.x86.sse42.pcmpestri128(<16 x i8> %lhs_cast, i32 %lhs_len, <16 x i8> %rhs_cast, i32 %rhs_len, i8 25)
; Test: explicit-length index compare (imm8 25) on <8 x i16> operands loaded through pointers,
; followed by a variable-index extract of an i16 from each vector and a 16-bit subtraction.
; An index of 8 yields 0 instead.
; Both operands are expected to be loaded into registers with movdqu, and the extracts go through
; stack spills with a masked index (i686: doubled index masked with 14; x86-64: index masked with 7, scale 2).
433 define i32 @pcmpestri_mem_diff_i16(ptr %lhs_ptr, i32 %lhs_len, ptr %rhs_ptr, i32 %rhs_len) nounwind {
434 ; X86-LABEL: pcmpestri_mem_diff_i16:
435 ; X86: # %bb.0: # %entry
436 ; X86-NEXT: pushl %ebp
437 ; X86-NEXT: movl %esp, %ebp
438 ; X86-NEXT: pushl %esi
439 ; X86-NEXT: andl $-16, %esp
440 ; X86-NEXT: subl $48, %esp
441 ; X86-NEXT: movl 12(%ebp), %eax
442 ; X86-NEXT: movl 20(%ebp), %edx
443 ; X86-NEXT: movl 16(%ebp), %ecx
444 ; X86-NEXT: movl 8(%ebp), %esi
445 ; X86-NEXT: movdqu (%esi), %xmm1
446 ; X86-NEXT: movdqu (%ecx), %xmm0
447 ; X86-NEXT: pcmpestri $25, %xmm0, %xmm1
448 ; X86-NEXT: cmpl $8, %ecx
449 ; X86-NEXT: jne .LBB11_2
451 ; X86-NEXT: xorl %eax, %eax
452 ; X86-NEXT: jmp .LBB11_3
453 ; X86-NEXT: .LBB11_2: # %compare
454 ; X86-NEXT: movdqa %xmm1, (%esp)
455 ; X86-NEXT: addl %ecx, %ecx
456 ; X86-NEXT: andl $14, %ecx
457 ; X86-NEXT: movzwl (%esp,%ecx), %eax
458 ; X86-NEXT: movdqa %xmm0, {{[0-9]+}}(%esp)
459 ; X86-NEXT: subw 16(%esp,%ecx), %ax
460 ; X86-NEXT: .LBB11_3: # %exit
461 ; X86-NEXT: movzwl %ax, %eax
462 ; X86-NEXT: leal -4(%ebp), %esp
463 ; X86-NEXT: popl %esi
464 ; X86-NEXT: popl %ebp
467 ; X64-LABEL: pcmpestri_mem_diff_i16:
468 ; X64: # %bb.0: # %entry
469 ; X64-NEXT: movl %esi, %eax
470 ; X64-NEXT: movdqu (%rdi), %xmm1
471 ; X64-NEXT: movdqu (%rdx), %xmm0
472 ; X64-NEXT: movl %ecx, %edx
473 ; X64-NEXT: pcmpestri $25, %xmm0, %xmm1
474 ; X64-NEXT: # kill: def $ecx killed $ecx def $rcx
475 ; X64-NEXT: cmpl $8, %ecx
476 ; X64-NEXT: jne .LBB11_2
478 ; X64-NEXT: xorl %eax, %eax
479 ; X64-NEXT: movzwl %ax, %eax
481 ; X64-NEXT: .LBB11_2: # %compare
482 ; X64-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp)
483 ; X64-NEXT: andl $7, %ecx
484 ; X64-NEXT: movzwl -24(%rsp,%rcx,2), %eax
485 ; X64-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp)
486 ; X64-NEXT: subw -40(%rsp,%rcx,2), %ax
487 ; X64-NEXT: movzwl %ax, %eax
; Both loads are declared with align 1.
490 %lhs = load <8 x i16>, ptr %lhs_ptr, align 1
491 %rhs = load <8 x i16>, ptr %rhs_ptr, align 1
; The intrinsic sees the bitcast <16 x i8> values; the extracts below use the loaded <8 x i16> values.
492 %lhs_cast = bitcast <8 x i16> %lhs to <16 x i8>
493 %rhs_cast = bitcast <8 x i16> %rhs to <16 x i8>
494 %idx = call i32 @llvm.x86.sse42.pcmpestri128(<16 x i8> %lhs_cast, i32 %lhs_len, <16 x i8> %rhs_cast, i32 %rhs_len, i8 25)
; An index of 8 branches straight to %exit; any other index goes to %compare.
495 %eq = icmp eq i32 %idx, 8
496 br i1 %eq, label %exit, label %compare
; %compare: difference of the two i16 elements selected by the returned index.
499 %lhs_c = extractelement <8 x i16> %lhs, i32 %idx
500 %rhs_c = extractelement <8 x i16> %rhs, i32 %idx
501 %sub = sub i16 %lhs_c, %rhs_c
; %exit: 0 when arriving from %entry, otherwise the i16 difference; widened to i32 with zext.
505 %result = phi i16 [ 0, %entry ], [ %sub, %compare ]
506 %result_ext = zext i16 %result to i32
; Test: implicit-length compare returning a flag, with both <16 x i8> operands passed by value.
; No length arguments exist here, so the expected assembly is just pcmpistri $24 followed by setae.
510 define i1 @pcmpistri_reg_eq_i8(<16 x i8> %lhs, <16 x i8> %rhs) nounwind {
511 ; X86-LABEL: pcmpistri_reg_eq_i8:
512 ; X86: # %bb.0: # %entry
513 ; X86-NEXT: pcmpistri $24, %xmm1, %xmm0
514 ; X86-NEXT: setae %al
517 ; X64-LABEL: pcmpistri_reg_eq_i8:
518 ; X64: # %bb.0: # %entry
519 ; X64-NEXT: pcmpistri $24, %xmm1, %xmm0
520 ; X64-NEXT: setae %al
; pcmpistric128 yields an i32; the test result is whether that value equals 0.
523 %c = call i32 @llvm.x86.sse42.pcmpistric128(<16 x i8> %lhs, <16 x i8> %rhs, i8 24)
524 %result = icmp eq i32 %c, 0
; Test: implicit-length compare returning the index, with both <16 x i8> operands passed by value.
; Expected output on both targets: pcmpistri $24, then the index copied from %ecx to %eax.
528 define i32 @pcmpistri_reg_idx_i8(<16 x i8> %lhs, <16 x i8> %rhs) nounwind {
529 ; X86-LABEL: pcmpistri_reg_idx_i8:
530 ; X86: # %bb.0: # %entry
531 ; X86-NEXT: pcmpistri $24, %xmm1, %xmm0
532 ; X86-NEXT: movl %ecx, %eax
535 ; X64-LABEL: pcmpistri_reg_idx_i8:
536 ; X64: # %bb.0: # %entry
537 ; X64-NEXT: pcmpistri $24, %xmm1, %xmm0
538 ; X64-NEXT: movl %ecx, %eax
; The i32 produced by pcmpistri128 is the only value computed in this function.
541 %idx = call i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %lhs, <16 x i8> %rhs, i8 24)
; Test: implicit-length index compare on by-value <16 x i8> operands, followed by a variable-index
; extract from both vectors and a byte subtraction. An index of 16 yields 0 instead.
; In the expected i686 output the aligned stack frame (pushl %ebp / andl $-16 / subl $48) is set up
; only inside the %compare block, after the pcmpistri and the index test.
545 define i32 @pcmpistri_reg_diff_i8(<16 x i8> %lhs, <16 x i8> %rhs) nounwind {
546 ; X86-LABEL: pcmpistri_reg_diff_i8:
547 ; X86: # %bb.0: # %entry
548 ; X86-NEXT: pcmpistri $24, %xmm1, %xmm0
549 ; X86-NEXT: cmpl $16, %ecx
550 ; X86-NEXT: jne .LBB14_2
552 ; X86-NEXT: xorl %eax, %eax
553 ; X86-NEXT: movzbl %al, %eax
555 ; X86-NEXT: .LBB14_2: # %compare
556 ; X86-NEXT: pushl %ebp
557 ; X86-NEXT: movl %esp, %ebp
558 ; X86-NEXT: andl $-16, %esp
559 ; X86-NEXT: subl $48, %esp
560 ; X86-NEXT: movdqa %xmm0, (%esp)
561 ; X86-NEXT: andl $15, %ecx
562 ; X86-NEXT: movzbl (%esp,%ecx), %eax
563 ; X86-NEXT: movdqa %xmm1, {{[0-9]+}}(%esp)
564 ; X86-NEXT: subb 16(%esp,%ecx), %al
565 ; X86-NEXT: movl %ebp, %esp
566 ; X86-NEXT: popl %ebp
567 ; X86-NEXT: movzbl %al, %eax
570 ; X64-LABEL: pcmpistri_reg_diff_i8:
571 ; X64: # %bb.0: # %entry
572 ; X64-NEXT: pcmpistri $24, %xmm1, %xmm0
573 ; X64-NEXT: # kill: def $ecx killed $ecx def $rcx
574 ; X64-NEXT: cmpl $16, %ecx
575 ; X64-NEXT: jne .LBB14_2
577 ; X64-NEXT: xorl %eax, %eax
578 ; X64-NEXT: movzbl %al, %eax
580 ; X64-NEXT: .LBB14_2: # %compare
581 ; X64-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp)
582 ; X64-NEXT: andl $15, %ecx
583 ; X64-NEXT: movzbl -24(%rsp,%rcx), %eax
584 ; X64-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp)
585 ; X64-NEXT: subb -40(%rsp,%rcx), %al
586 ; X64-NEXT: movzbl %al, %eax
; An index of 16 branches straight to %exit; any other index goes to %compare.
589 %idx = call i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %lhs, <16 x i8> %rhs, i8 24)
590 %eq = icmp eq i32 %idx, 16
591 br i1 %eq, label %exit, label %compare
; %compare: difference of the two bytes selected by the returned index.
594 %lhs_c = extractelement <16 x i8> %lhs, i32 %idx
595 %rhs_c = extractelement <16 x i8> %rhs, i32 %idx
596 %sub = sub i8 %lhs_c, %rhs_c
; %exit: 0 when arriving from %entry, otherwise the byte difference; widened to i32 with zext.
600 %result = phi i8 [ 0, %entry ], [ %sub, %compare ]
601 %result_ext = zext i8 %result to i32
; Test: implicit-length compare returning a flag, with both <16 x i8> operands loaded through pointers.
; In the expected assembly the first operand is loaded with movdqu and the second is used directly
; as the memory operand of pcmpistri.
605 define i1 @pcmpistri_mem_eq_i8(ptr %lhs_ptr, ptr %rhs_ptr) nounwind {
606 ; X86-LABEL: pcmpistri_mem_eq_i8:
607 ; X86: # %bb.0: # %entry
608 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
609 ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
610 ; X86-NEXT: movdqu (%ecx), %xmm0
611 ; X86-NEXT: pcmpistri $24, (%eax), %xmm0
612 ; X86-NEXT: setae %al
615 ; X64-LABEL: pcmpistri_mem_eq_i8:
616 ; X64: # %bb.0: # %entry
617 ; X64-NEXT: movdqu (%rdi), %xmm0
618 ; X64-NEXT: pcmpistri $24, (%rsi), %xmm0
619 ; X64-NEXT: setae %al
; Both loads are declared with align 1.
622 %lhs = load <16 x i8>, ptr %lhs_ptr, align 1
623 %rhs = load <16 x i8>, ptr %rhs_ptr, align 1
; The comparison of the i32 result against 0 is expected to become a setae after the pcmpistri.
624 %c = call i32 @llvm.x86.sse42.pcmpistric128(<16 x i8> %lhs, <16 x i8> %rhs, i8 24)
625 %result = icmp eq i32 %c, 0
; Test: implicit-length compare returning the index, with both <16 x i8> operands loaded through pointers.
; In the expected assembly the second operand is used directly as the memory operand of pcmpistri
; and the index is copied from %ecx to %eax.
629 define i32 @pcmpistri_mem_idx_i8(ptr %lhs_ptr, ptr %rhs_ptr) nounwind {
630 ; X86-LABEL: pcmpistri_mem_idx_i8:
631 ; X86: # %bb.0: # %entry
632 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
633 ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
634 ; X86-NEXT: movdqu (%ecx), %xmm0
635 ; X86-NEXT: pcmpistri $24, (%eax), %xmm0
636 ; X86-NEXT: movl %ecx, %eax
639 ; X64-LABEL: pcmpistri_mem_idx_i8:
640 ; X64: # %bb.0: # %entry
641 ; X64-NEXT: movdqu (%rdi), %xmm0
642 ; X64-NEXT: pcmpistri $24, (%rsi), %xmm0
643 ; X64-NEXT: movl %ecx, %eax
; Both loads are declared with align 1.
646 %lhs = load <16 x i8>, ptr %lhs_ptr, align 1
647 %rhs = load <16 x i8>, ptr %rhs_ptr, align 1
648 %idx = call i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %lhs, <16 x i8> %rhs, i8 24)
; Test: implicit-length index compare on <16 x i8> operands loaded through pointers, followed by a
; variable-index extract from both vectors and a byte subtraction. An index of 16 yields 0 instead.
; Both operands are expected to be loaded into registers with movdqu (neither is used as a memory
; operand of pcmpistri), since the loaded vectors are reused by the extracts.
652 define i32 @pcmpistri_mem_diff_i8(ptr %lhs_ptr, ptr %rhs_ptr) nounwind {
653 ; X86-LABEL: pcmpistri_mem_diff_i8:
654 ; X86: # %bb.0: # %entry
655 ; X86-NEXT: pushl %ebp
656 ; X86-NEXT: movl %esp, %ebp
657 ; X86-NEXT: andl $-16, %esp
658 ; X86-NEXT: subl $48, %esp
659 ; X86-NEXT: movl 12(%ebp), %eax
660 ; X86-NEXT: movl 8(%ebp), %ecx
661 ; X86-NEXT: movdqu (%ecx), %xmm1
662 ; X86-NEXT: movdqu (%eax), %xmm0
663 ; X86-NEXT: pcmpistri $24, %xmm0, %xmm1
664 ; X86-NEXT: cmpl $16, %ecx
665 ; X86-NEXT: jne .LBB17_2
667 ; X86-NEXT: xorl %eax, %eax
668 ; X86-NEXT: jmp .LBB17_3
669 ; X86-NEXT: .LBB17_2: # %compare
670 ; X86-NEXT: movdqa %xmm1, (%esp)
671 ; X86-NEXT: andl $15, %ecx
672 ; X86-NEXT: movzbl (%esp,%ecx), %eax
673 ; X86-NEXT: movdqa %xmm0, {{[0-9]+}}(%esp)
674 ; X86-NEXT: subb 16(%esp,%ecx), %al
675 ; X86-NEXT: .LBB17_3: # %exit
676 ; X86-NEXT: movzbl %al, %eax
677 ; X86-NEXT: movl %ebp, %esp
678 ; X86-NEXT: popl %ebp
681 ; X64-LABEL: pcmpistri_mem_diff_i8:
682 ; X64: # %bb.0: # %entry
683 ; X64-NEXT: movdqu (%rdi), %xmm1
684 ; X64-NEXT: movdqu (%rsi), %xmm0
685 ; X64-NEXT: pcmpistri $24, %xmm0, %xmm1
686 ; X64-NEXT: # kill: def $ecx killed $ecx def $rcx
687 ; X64-NEXT: cmpl $16, %ecx
688 ; X64-NEXT: jne .LBB17_2
690 ; X64-NEXT: xorl %eax, %eax
691 ; X64-NEXT: movzbl %al, %eax
693 ; X64-NEXT: .LBB17_2: # %compare
694 ; X64-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp)
695 ; X64-NEXT: andl $15, %ecx
696 ; X64-NEXT: movzbl -24(%rsp,%rcx), %eax
697 ; X64-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp)
698 ; X64-NEXT: subb -40(%rsp,%rcx), %al
699 ; X64-NEXT: movzbl %al, %eax
; Both loads are declared with align 1.
702 %lhs = load <16 x i8>, ptr %lhs_ptr, align 1
703 %rhs = load <16 x i8>, ptr %rhs_ptr, align 1
; An index of 16 branches straight to %exit; any other index goes to %compare.
704 %idx = call i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %lhs, <16 x i8> %rhs, i8 24)
705 %eq = icmp eq i32 %idx, 16
706 br i1 %eq, label %exit, label %compare
; %compare: difference of the two bytes selected by the returned index.
709 %lhs_c = extractelement <16 x i8> %lhs, i32 %idx
710 %rhs_c = extractelement <16 x i8> %rhs, i32 %idx
711 %sub = sub i8 %lhs_c, %rhs_c
; %exit: 0 when arriving from %entry, otherwise the byte difference; widened to i32 with zext.
715 %result = phi i8 [ 0, %entry ], [ %sub, %compare ]
716 %result_ext = zext i8 %result to i32
; Test: implicit-length compare returning a flag, with <8 x i16> operands passed by value and
; bitcast to <16 x i8> for the intrinsic call.
; TODO(review): this test passes imm8 24, whereas pcmpistri_mem_eq_i16 passes 25 — confirm the difference is intended.
720 define i1 @pcmpistri_reg_eq_i16(<8 x i16> %lhs, <8 x i16> %rhs) nounwind {
721 ; X86-LABEL: pcmpistri_reg_eq_i16:
722 ; X86: # %bb.0: # %entry
723 ; X86-NEXT: pcmpistri $24, %xmm1, %xmm0
724 ; X86-NEXT: setae %al
727 ; X64-LABEL: pcmpistri_reg_eq_i16:
728 ; X64: # %bb.0: # %entry
729 ; X64-NEXT: pcmpistri $24, %xmm1, %xmm0
730 ; X64-NEXT: setae %al
; The bitcasts only adapt the vector type to the intrinsic signature; no instruction is expected for them.
733 %lhs_cast = bitcast <8 x i16> %lhs to <16 x i8>
734 %rhs_cast = bitcast <8 x i16> %rhs to <16 x i8>
735 %c = call i32 @llvm.x86.sse42.pcmpistric128(<16 x i8> %lhs_cast, <16 x i8> %rhs_cast, i8 24)
736 %result = icmp eq i32 %c, 0
; Test: implicit-length compare returning the index, with <8 x i16> operands passed by value and
; bitcast to <16 x i8> for the intrinsic call.
; TODO(review): this test passes imm8 24, whereas pcmpistri_mem_idx_i16 passes 25 — confirm the difference is intended.
740 define i32 @pcmpistri_reg_idx_i16(<8 x i16> %lhs, <8 x i16> %rhs) nounwind {
741 ; X86-LABEL: pcmpistri_reg_idx_i16:
742 ; X86: # %bb.0: # %entry
743 ; X86-NEXT: pcmpistri $24, %xmm1, %xmm0
744 ; X86-NEXT: movl %ecx, %eax
747 ; X64-LABEL: pcmpistri_reg_idx_i16:
748 ; X64: # %bb.0: # %entry
749 ; X64-NEXT: pcmpistri $24, %xmm1, %xmm0
750 ; X64-NEXT: movl %ecx, %eax
; The bitcasts only adapt the vector type to the intrinsic signature; no instruction is expected for them.
753 %lhs_cast = bitcast <8 x i16> %lhs to <16 x i8>
754 %rhs_cast = bitcast <8 x i16> %rhs to <16 x i8>
755 %idx = call i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %lhs_cast, <16 x i8> %rhs_cast, i8 24)
; Test: implicit-length index compare on by-value <8 x i16> operands (bitcast to <16 x i8> for the
; call), followed by a variable-index extract of an i16 from each vector and a 16-bit subtraction.
; An index of 16 yields 0 instead.
; TODO(review): this test passes imm8 24 and compares the index against 16 while indexing <8 x i16>;
; pcmpistri_mem_diff_i16 passes 25 and compares against 8 — confirm the difference is intended.
759 define i32 @pcmpistri_reg_diff_i16(<8 x i16> %lhs, <8 x i16> %rhs) nounwind {
760 ; X86-LABEL: pcmpistri_reg_diff_i16:
761 ; X86: # %bb.0: # %entry
762 ; X86-NEXT: pcmpistri $24, %xmm1, %xmm0
763 ; X86-NEXT: cmpl $16, %ecx
764 ; X86-NEXT: jne .LBB20_2
766 ; X86-NEXT: xorl %eax, %eax
767 ; X86-NEXT: movzwl %ax, %eax
769 ; X86-NEXT: .LBB20_2: # %compare
770 ; X86-NEXT: pushl %ebp
771 ; X86-NEXT: movl %esp, %ebp
772 ; X86-NEXT: andl $-16, %esp
773 ; X86-NEXT: subl $48, %esp
774 ; X86-NEXT: movdqa %xmm0, (%esp)
775 ; X86-NEXT: addl %ecx, %ecx
776 ; X86-NEXT: andl $14, %ecx
777 ; X86-NEXT: movzwl (%esp,%ecx), %eax
778 ; X86-NEXT: movdqa %xmm1, {{[0-9]+}}(%esp)
779 ; X86-NEXT: subw 16(%esp,%ecx), %ax
780 ; X86-NEXT: movl %ebp, %esp
781 ; X86-NEXT: popl %ebp
782 ; X86-NEXT: movzwl %ax, %eax
785 ; X64-LABEL: pcmpistri_reg_diff_i16:
786 ; X64: # %bb.0: # %entry
787 ; X64-NEXT: pcmpistri $24, %xmm1, %xmm0
788 ; X64-NEXT: # kill: def $ecx killed $ecx def $rcx
789 ; X64-NEXT: cmpl $16, %ecx
790 ; X64-NEXT: jne .LBB20_2
792 ; X64-NEXT: xorl %eax, %eax
793 ; X64-NEXT: movzwl %ax, %eax
795 ; X64-NEXT: .LBB20_2: # %compare
796 ; X64-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp)
797 ; X64-NEXT: andl $7, %ecx
798 ; X64-NEXT: movzwl -24(%rsp,%rcx,2), %eax
799 ; X64-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp)
800 ; X64-NEXT: subw -40(%rsp,%rcx,2), %ax
801 ; X64-NEXT: movzwl %ax, %eax
; The intrinsic sees the bitcast <16 x i8> values; the extracts below use the original <8 x i16> values.
804 %lhs_cast = bitcast <8 x i16> %lhs to <16 x i8>
805 %rhs_cast = bitcast <8 x i16> %rhs to <16 x i8>
806 %idx = call i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %lhs_cast, <16 x i8> %rhs_cast, i8 24)
807 %eq = icmp eq i32 %idx, 16
808 br i1 %eq, label %exit, label %compare
; %compare: difference of the two i16 elements selected by the returned index.
811 %lhs_c = extractelement <8 x i16> %lhs, i32 %idx
812 %rhs_c = extractelement <8 x i16> %rhs, i32 %idx
813 %sub = sub i16 %lhs_c, %rhs_c
; %exit: 0 when arriving from %entry, otherwise the i16 difference; widened to i32 with zext.
817 %result = phi i16 [ 0, %entry ], [ %sub, %compare ]
818 %result_ext = zext i16 %result to i32
; Test: implicit-length compare returning a flag, with <8 x i16> operands loaded through pointers
; and bitcast to <16 x i8> for the intrinsic call. The immediate is 25 in this test.
; In the expected assembly the second operand is used directly as the memory operand of pcmpistri.
822 define i1 @pcmpistri_mem_eq_i16(ptr %lhs_ptr, ptr %rhs_ptr) nounwind {
823 ; X86-LABEL: pcmpistri_mem_eq_i16:
824 ; X86: # %bb.0: # %entry
825 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
826 ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
827 ; X86-NEXT: movdqu (%ecx), %xmm0
828 ; X86-NEXT: pcmpistri $25, (%eax), %xmm0
829 ; X86-NEXT: setae %al
832 ; X64-LABEL: pcmpistri_mem_eq_i16:
833 ; X64: # %bb.0: # %entry
834 ; X64-NEXT: movdqu (%rdi), %xmm0
835 ; X64-NEXT: pcmpistri $25, (%rsi), %xmm0
836 ; X64-NEXT: setae %al
; Both loads are declared with align 1.
839 %lhs = load <8 x i16>, ptr %lhs_ptr, align 1
840 %rhs = load <8 x i16>, ptr %rhs_ptr, align 1
841 %lhs_cast = bitcast <8 x i16> %lhs to <16 x i8>
842 %rhs_cast = bitcast <8 x i16> %rhs to <16 x i8>
; The comparison of the i32 result against 0 is expected to become a setae after the pcmpistri.
843 %c = call i32 @llvm.x86.sse42.pcmpistric128(<16 x i8> %lhs_cast, <16 x i8> %rhs_cast, i8 25)
844 %result = icmp eq i32 %c, 0
; Test: implicit-length compare returning the index, with <8 x i16> operands loaded through pointers
; and bitcast to <16 x i8> for the intrinsic call. The immediate is 25 in this test.
; In the expected assembly the second operand is used directly as the memory operand of pcmpistri.
848 define i32 @pcmpistri_mem_idx_i16(ptr %lhs_ptr, ptr %rhs_ptr) nounwind {
849 ; X86-LABEL: pcmpistri_mem_idx_i16:
850 ; X86: # %bb.0: # %entry
851 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
852 ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
853 ; X86-NEXT: movdqu (%ecx), %xmm0
854 ; X86-NEXT: pcmpistri $25, (%eax), %xmm0
855 ; X86-NEXT: movl %ecx, %eax
858 ; X64-LABEL: pcmpistri_mem_idx_i16:
859 ; X64: # %bb.0: # %entry
860 ; X64-NEXT: movdqu (%rdi), %xmm0
861 ; X64-NEXT: pcmpistri $25, (%rsi), %xmm0
862 ; X64-NEXT: movl %ecx, %eax
; Both loads are declared with align 1.
865 %lhs = load <8 x i16>, ptr %lhs_ptr, align 1
866 %rhs = load <8 x i16>, ptr %rhs_ptr, align 1
867 %lhs_cast = bitcast <8 x i16> %lhs to <16 x i8>
868 %rhs_cast = bitcast <8 x i16> %rhs to <16 x i8>
869 %idx = call i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %lhs_cast, <16 x i8> %rhs_cast, i8 25)
; Test: implicit-length index compare (imm8 25) on <8 x i16> operands loaded through pointers,
; followed by a variable-index extract of an i16 from each vector and a 16-bit subtraction.
; An index of 8 yields 0 instead.
; Both operands are expected to be loaded into registers with movdqu, and the extracts go through
; stack spills with a masked index (i686: doubled index masked with 14; x86-64: index masked with 7, scale 2).
873 define i32 @pcmpistri_mem_diff_i16(ptr %lhs_ptr, ptr %rhs_ptr) nounwind {
874 ; X86-LABEL: pcmpistri_mem_diff_i16:
875 ; X86: # %bb.0: # %entry
876 ; X86-NEXT: pushl %ebp
877 ; X86-NEXT: movl %esp, %ebp
878 ; X86-NEXT: andl $-16, %esp
879 ; X86-NEXT: subl $48, %esp
880 ; X86-NEXT: movl 12(%ebp), %eax
881 ; X86-NEXT: movl 8(%ebp), %ecx
882 ; X86-NEXT: movdqu (%ecx), %xmm1
883 ; X86-NEXT: movdqu (%eax), %xmm0
884 ; X86-NEXT: pcmpistri $25, %xmm0, %xmm1
885 ; X86-NEXT: cmpl $8, %ecx
886 ; X86-NEXT: jne .LBB23_2
888 ; X86-NEXT: xorl %eax, %eax
889 ; X86-NEXT: jmp .LBB23_3
890 ; X86-NEXT: .LBB23_2: # %compare
891 ; X86-NEXT: movdqa %xmm1, (%esp)
892 ; X86-NEXT: addl %ecx, %ecx
893 ; X86-NEXT: andl $14, %ecx
894 ; X86-NEXT: movzwl (%esp,%ecx), %eax
895 ; X86-NEXT: movdqa %xmm0, {{[0-9]+}}(%esp)
896 ; X86-NEXT: subw 16(%esp,%ecx), %ax
897 ; X86-NEXT: .LBB23_3: # %exit
898 ; X86-NEXT: movzwl %ax, %eax
899 ; X86-NEXT: movl %ebp, %esp
900 ; X86-NEXT: popl %ebp
903 ; X64-LABEL: pcmpistri_mem_diff_i16:
904 ; X64: # %bb.0: # %entry
905 ; X64-NEXT: movdqu (%rdi), %xmm1
906 ; X64-NEXT: movdqu (%rsi), %xmm0
907 ; X64-NEXT: pcmpistri $25, %xmm0, %xmm1
908 ; X64-NEXT: # kill: def $ecx killed $ecx def $rcx
909 ; X64-NEXT: cmpl $8, %ecx
910 ; X64-NEXT: jne .LBB23_2
912 ; X64-NEXT: xorl %eax, %eax
913 ; X64-NEXT: movzwl %ax, %eax
915 ; X64-NEXT: .LBB23_2: # %compare
916 ; X64-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp)
917 ; X64-NEXT: andl $7, %ecx
918 ; X64-NEXT: movzwl -24(%rsp,%rcx,2), %eax
919 ; X64-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp)
920 ; X64-NEXT: subw -40(%rsp,%rcx,2), %ax
921 ; X64-NEXT: movzwl %ax, %eax
; Both loads are declared with align 1.
924 %lhs = load <8 x i16>, ptr %lhs_ptr, align 1
925 %rhs = load <8 x i16>, ptr %rhs_ptr, align 1
; The intrinsic sees the bitcast <16 x i8> values; the extracts below use the loaded <8 x i16> values.
926 %lhs_cast = bitcast <8 x i16> %lhs to <16 x i8>
927 %rhs_cast = bitcast <8 x i16> %rhs to <16 x i8>
928 %idx = call i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %lhs_cast, <16 x i8> %rhs_cast, i8 25)
; An index of 8 branches straight to %exit; any other index goes to %compare.
929 %eq = icmp eq i32 %idx, 8
930 br i1 %eq, label %exit, label %compare
; %compare: difference of the two i16 elements selected by the returned index.
933 %lhs_c = extractelement <8 x i16> %lhs, i32 %idx
934 %rhs_c = extractelement <8 x i16> %rhs, i32 %idx
935 %sub = sub i16 %lhs_c, %rhs_c
; %exit: 0 when arriving from %entry, otherwise the i16 difference; widened to i32 with zext.
939 %result = phi i16 [ 0, %entry ], [ %sub, %compare ]
940 %result_ext = zext i16 %result to i32
; Test: the flag-returning and index-returning explicit-length intrinsics are called with identical
; operands and immediate; the index is stored through %iptr and the flag through %fptr.
; The expected assembly contains a single pcmpestri $24 serving both results: the index is stored
; from %ecx, and the flag register is zeroed with xorl before the pcmpestri.
944 define void @pcmpestr_index_flag(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, ptr %iptr, ptr %fptr) nounwind {
945 ; X86-LABEL: pcmpestr_index_flag:
946 ; X86: # %bb.0: # %entry
947 ; X86-NEXT: pushl %ebx
948 ; X86-NEXT: pushl %edi
949 ; X86-NEXT: pushl %esi
950 ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
951 ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
952 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
953 ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
954 ; X86-NEXT: xorl %ebx, %ebx
955 ; X86-NEXT: pcmpestri $24, %xmm1, %xmm0
957 ; X86-NEXT: movl %ecx, (%edi)
958 ; X86-NEXT: movl %ebx, (%esi)
959 ; X86-NEXT: popl %esi
960 ; X86-NEXT: popl %edi
961 ; X86-NEXT: popl %ebx
964 ; X64-LABEL: pcmpestr_index_flag:
965 ; X64: # %bb.0: # %entry
966 ; X64-NEXT: movq %rcx, %r8
967 ; X64-NEXT: movq %rdx, %r9
968 ; X64-NEXT: movl %esi, %edx
969 ; X64-NEXT: movl %edi, %eax
970 ; X64-NEXT: xorl %esi, %esi
971 ; X64-NEXT: pcmpestri $24, %xmm1, %xmm0
972 ; X64-NEXT: setb %sil
973 ; X64-NEXT: movl %ecx, (%r9)
974 ; X64-NEXT: movl %esi, (%r8)
; Two separate intrinsic calls in the IR; the checks above expect them to be merged into one instruction.
977 %flag = call i32 @llvm.x86.sse42.pcmpestric128(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, i8 24)
978 %index = call i32 @llvm.x86.sse42.pcmpestri128(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, i8 24)
979 store i32 %index, ptr %iptr
980 store i32 %flag, ptr %fptr
; Test: the flag-returning and mask-returning explicit-length intrinsics are called with identical
; operands and immediate; the mask is stored through %mptr and the flag through %fptr.
; The expected assembly contains a single pcmpestrm $24 serving both results: the mask is stored
; from %xmm0, and the flag register is zeroed with xorl before the pcmpestrm.
984 define void @pcmpestr_mask_flag(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, ptr %mptr, ptr %fptr) nounwind {
985 ; X86-LABEL: pcmpestr_mask_flag:
986 ; X86: # %bb.0: # %entry
987 ; X86-NEXT: pushl %ebx
988 ; X86-NEXT: pushl %esi
989 ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
990 ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
991 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
992 ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
993 ; X86-NEXT: xorl %ebx, %ebx
994 ; X86-NEXT: pcmpestrm $24, %xmm1, %xmm0
996 ; X86-NEXT: movdqa %xmm0, (%esi)
997 ; X86-NEXT: movl %ebx, (%ecx)
998 ; X86-NEXT: popl %esi
999 ; X86-NEXT: popl %ebx
1002 ; X64-LABEL: pcmpestr_mask_flag:
1003 ; X64: # %bb.0: # %entry
1004 ; X64-NEXT: movq %rdx, %r8
1005 ; X64-NEXT: movl %esi, %edx
1006 ; X64-NEXT: movl %edi, %eax
1007 ; X64-NEXT: xorl %esi, %esi
1008 ; X64-NEXT: pcmpestrm $24, %xmm1, %xmm0
1009 ; X64-NEXT: setb %sil
1010 ; X64-NEXT: movdqa %xmm0, (%r8)
1011 ; X64-NEXT: movl %esi, (%rcx)
; Two separate intrinsic calls in the IR; the checks above expect them to be merged into one instruction.
1014 %flag = call i32 @llvm.x86.sse42.pcmpestric128(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, i8 24)
1015 %mask = call <16 x i8> @llvm.x86.sse42.pcmpestrm128(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, i8 24)
; The mask store carries no explicit alignment; the checks expect an aligned movdqa store.
1016 store <16 x i8> %mask, ptr %mptr
1017 store i32 %flag, ptr %fptr
; Requests both the index result (pcmpestri128) and the mask result
; (pcmpestrm128) of the same explicit-length compare (same operands, control
; byte 24). The mask is stored to %mptr and the index to %iptr.
; The checks on both targets expect two instructions here, pcmpestrm and
; pcmpestri. %lhs is first copied to %xmm2 and pcmpestri runs on that copy,
; while %xmm0 (stored to %mptr afterwards) carries the mask; the index is
; stored from %ecx.
1021 define void @pcmpestr_mask_index(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, ptr %mptr, ptr %iptr) nounwind {
1022 ; X86-LABEL: pcmpestr_mask_index:
1023 ; X86: # %bb.0: # %entry
1024 ; X86-NEXT: pushl %edi
1025 ; X86-NEXT: pushl %esi
1026 ; X86-NEXT: movdqa %xmm0, %xmm2
1027 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
1028 ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
1029 ; X86-NEXT: pcmpestrm $24, %xmm1, %xmm0
1030 ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
1031 ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
1032 ; X86-NEXT: pcmpestri $24, %xmm1, %xmm2
1033 ; X86-NEXT: movdqa %xmm0, (%edi)
1034 ; X86-NEXT: movl %ecx, (%esi)
1035 ; X86-NEXT: popl %esi
1036 ; X86-NEXT: popl %edi
1039 ; X64-LABEL: pcmpestr_mask_index:
1040 ; X64: # %bb.0: # %entry
1041 ; X64-NEXT: movq %rcx, %r8
1042 ; X64-NEXT: movq %rdx, %r9
1043 ; X64-NEXT: movl %esi, %edx
1044 ; X64-NEXT: movl %edi, %eax
1045 ; X64-NEXT: movdqa %xmm0, %xmm2
1046 ; X64-NEXT: pcmpestrm $24, %xmm1, %xmm0
1047 ; X64-NEXT: pcmpestri $24, %xmm1, %xmm2
1048 ; X64-NEXT: movdqa %xmm0, (%r9)
1049 ; X64-NEXT: movl %ecx, (%r8)
1052 %index = call i32 @llvm.x86.sse42.pcmpestri128(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, i8 24)
1053 %mask = call <16 x i8> @llvm.x86.sse42.pcmpestrm128(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, i8 24)
1054 store <16 x i8> %mask, ptr %mptr
1055 store i32 %index, ptr %iptr
; Requests all three results of the same explicit-length compare (same
; operands, control byte 24): index (pcmpestri128), mask (pcmpestrm128) and
; carry flag (pcmpestric128). They are stored to %iptr, %mptr and %fptr.
; The checks on both targets expect exactly two compare instructions:
; pcmpestrm on %xmm0 for the mask, then pcmpestri on a copy of %lhs held in
; %xmm2 for the index. The flag is taken from the second instruction: a
; scratch register is zeroed just before pcmpestri and setb fills it right
; after, so no third compare is emitted for the flag call.
1059 define void @pcmpestr_mask_index_flag(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, ptr %mptr, ptr %iptr, ptr %fptr) nounwind {
1060 ; X86-LABEL: pcmpestr_mask_index_flag:
1061 ; X86: # %bb.0: # %entry
1062 ; X86-NEXT: pushl %ebp
1063 ; X86-NEXT: pushl %ebx
1064 ; X86-NEXT: pushl %edi
1065 ; X86-NEXT: pushl %esi
1066 ; X86-NEXT: movdqa %xmm0, %xmm2
1067 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
1068 ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
1069 ; X86-NEXT: pcmpestrm $24, %xmm1, %xmm0
1070 ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
1071 ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
1072 ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
1073 ; X86-NEXT: xorl %ebx, %ebx
1074 ; X86-NEXT: pcmpestri $24, %xmm1, %xmm2
1075 ; X86-NEXT: setb %bl
1076 ; X86-NEXT: movdqa %xmm0, (%ebp)
1077 ; X86-NEXT: movl %ecx, (%edi)
1078 ; X86-NEXT: movl %ebx, (%esi)
1079 ; X86-NEXT: popl %esi
1080 ; X86-NEXT: popl %edi
1081 ; X86-NEXT: popl %ebx
1082 ; X86-NEXT: popl %ebp
1085 ; X64-LABEL: pcmpestr_mask_index_flag:
1086 ; X64: # %bb.0: # %entry
1087 ; X64-NEXT: movq %rcx, %r9
1088 ; X64-NEXT: movq %rdx, %r10
1089 ; X64-NEXT: movl %esi, %edx
1090 ; X64-NEXT: movl %edi, %eax
1091 ; X64-NEXT: movdqa %xmm0, %xmm2
1092 ; X64-NEXT: pcmpestrm $24, %xmm1, %xmm0
1093 ; X64-NEXT: xorl %esi, %esi
1094 ; X64-NEXT: pcmpestri $24, %xmm1, %xmm2
1095 ; X64-NEXT: setb %sil
1096 ; X64-NEXT: movdqa %xmm0, (%r10)
1097 ; X64-NEXT: movl %ecx, (%r9)
1098 ; X64-NEXT: movl %esi, (%r8)
1101 %index = call i32 @llvm.x86.sse42.pcmpestri128(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, i8 24)
1102 %mask = call <16 x i8> @llvm.x86.sse42.pcmpestrm128(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, i8 24)
1103 %flag = call i32 @llvm.x86.sse42.pcmpestric128(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, i8 24)
1104 store <16 x i8> %mask, ptr %mptr
1105 store i32 %index, ptr %iptr
1106 store i32 %flag, ptr %fptr
; Implicit-length variant: requests the carry-flag result (pcmpistric128) and
; the index result (pcmpistri128) of the same compare (same %lhs, %rhs and
; control byte 24). The index is stored to %iptr and the flag to %fptr.
; The checks on both targets expect a single pcmpistri to serve both calls:
; %eax is zeroed before the instruction, setb fills %al right after it, and
; the index is stored from %ecx.
1110 define void @pcmpistr_index_flag(<16 x i8> %lhs, <16 x i8> %rhs, ptr %iptr, ptr %fptr) nounwind {
1111 ; X86-LABEL: pcmpistr_index_flag:
1112 ; X86: # %bb.0: # %entry
1113 ; X86-NEXT: pushl %esi
1114 ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
1115 ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
1116 ; X86-NEXT: xorl %eax, %eax
1117 ; X86-NEXT: pcmpistri $24, %xmm1, %xmm0
1118 ; X86-NEXT: setb %al
1119 ; X86-NEXT: movl %ecx, (%esi)
1120 ; X86-NEXT: movl %eax, (%edx)
1121 ; X86-NEXT: popl %esi
1124 ; X64-LABEL: pcmpistr_index_flag:
1125 ; X64: # %bb.0: # %entry
1126 ; X64-NEXT: xorl %eax, %eax
1127 ; X64-NEXT: pcmpistri $24, %xmm1, %xmm0
1128 ; X64-NEXT: setb %al
1129 ; X64-NEXT: movl %ecx, (%rdi)
1130 ; X64-NEXT: movl %eax, (%rsi)
1133 %flag = call i32 @llvm.x86.sse42.pcmpistric128(<16 x i8> %lhs, <16 x i8> %rhs, i8 24)
1134 %index = call i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %lhs, <16 x i8> %rhs, i8 24)
1135 store i32 %index, ptr %iptr
1136 store i32 %flag, ptr %fptr
; Implicit-length variant: requests the carry-flag result (pcmpistric128) and
; the mask result (pcmpistrm128) of the same compare (same %lhs, %rhs and
; control byte 24). The mask is stored to %mptr and the flag to %fptr.
; The checks on both targets expect a single pcmpistrm to serve both calls:
; %eax is zeroed before the instruction, setb fills %al right after it, and
; the mask is stored from %xmm0.
1140 define void @pcmpistr_mask_flag(<16 x i8> %lhs, <16 x i8> %rhs, ptr %mptr, ptr %fptr) nounwind {
1141 ; X86-LABEL: pcmpistr_mask_flag:
1142 ; X86: # %bb.0: # %entry
1143 ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
1144 ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
1145 ; X86-NEXT: xorl %eax, %eax
1146 ; X86-NEXT: pcmpistrm $24, %xmm1, %xmm0
1147 ; X86-NEXT: setb %al
1148 ; X86-NEXT: movdqa %xmm0, (%edx)
1149 ; X86-NEXT: movl %eax, (%ecx)
1152 ; X64-LABEL: pcmpistr_mask_flag:
1153 ; X64: # %bb.0: # %entry
1154 ; X64-NEXT: xorl %eax, %eax
1155 ; X64-NEXT: pcmpistrm $24, %xmm1, %xmm0
1156 ; X64-NEXT: setb %al
1157 ; X64-NEXT: movdqa %xmm0, (%rdi)
1158 ; X64-NEXT: movl %eax, (%rsi)
1161 %flag = call i32 @llvm.x86.sse42.pcmpistric128(<16 x i8> %lhs, <16 x i8> %rhs, i8 24)
1162 %mask = call <16 x i8> @llvm.x86.sse42.pcmpistrm128(<16 x i8> %lhs, <16 x i8> %rhs, i8 24)
1163 store <16 x i8> %mask, ptr %mptr
1164 store i32 %flag, ptr %fptr
; Implicit-length variant: requests the index result (pcmpistri128) and the
; mask result (pcmpistrm128) of the same compare (same %lhs, %rhs and control
; byte 24). The mask is stored to %mptr and the index to %iptr.
; The checks on both targets expect two instructions, pcmpistri followed by
; pcmpistrm, both operating directly on %xmm0/%xmm1 with no register copy in
; between. The index is stored from %ecx and the mask from %xmm0.
1168 define void @pcmpistr_mask_index(<16 x i8> %lhs, <16 x i8> %rhs, ptr %mptr, ptr %iptr) nounwind {
1169 ; X86-LABEL: pcmpistr_mask_index:
1170 ; X86: # %bb.0: # %entry
1171 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
1172 ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
1173 ; X86-NEXT: pcmpistri $24, %xmm1, %xmm0
1174 ; X86-NEXT: pcmpistrm $24, %xmm1, %xmm0
1175 ; X86-NEXT: movdqa %xmm0, (%edx)
1176 ; X86-NEXT: movl %ecx, (%eax)
1179 ; X64-LABEL: pcmpistr_mask_index:
1180 ; X64: # %bb.0: # %entry
1181 ; X64-NEXT: pcmpistri $24, %xmm1, %xmm0
1182 ; X64-NEXT: pcmpistrm $24, %xmm1, %xmm0
1183 ; X64-NEXT: movdqa %xmm0, (%rdi)
1184 ; X64-NEXT: movl %ecx, (%rsi)
1187 %index = call i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %lhs, <16 x i8> %rhs, i8 24)
1188 %mask = call <16 x i8> @llvm.x86.sse42.pcmpistrm128(<16 x i8> %lhs, <16 x i8> %rhs, i8 24)
1189 store <16 x i8> %mask, ptr %mptr
1190 store i32 %index, ptr %iptr
; Implicit-length variant requesting all three results of the same compare
; (same %lhs, %rhs and control byte 24): index (pcmpistri128), mask
; (pcmpistrm128) and carry flag (pcmpistric128). They are stored to %iptr,
; %mptr and %fptr.
; The checks on both targets expect exactly two compare instructions:
; pcmpistrm on %xmm0 for the mask, then pcmpistri on a copy of %lhs held in
; %xmm2 for the index. The flag comes from the second instruction: a scratch
; register is zeroed just before pcmpistri and setb fills it right after.
1194 define void @pcmpistr_mask_index_flag(<16 x i8> %lhs, <16 x i8> %rhs, ptr %mptr, ptr %iptr, ptr %fptr) nounwind {
1195 ; X86-LABEL: pcmpistr_mask_index_flag:
1196 ; X86: # %bb.0: # %entry
1197 ; X86-NEXT: pushl %ebx
1198 ; X86-NEXT: pushl %esi
1199 ; X86-NEXT: movdqa %xmm0, %xmm2
1200 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
1201 ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
1202 ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
1203 ; X86-NEXT: pcmpistrm $24, %xmm1, %xmm0
1204 ; X86-NEXT: xorl %ebx, %ebx
1205 ; X86-NEXT: pcmpistri $24, %xmm1, %xmm2
1206 ; X86-NEXT: setb %bl
1207 ; X86-NEXT: movdqa %xmm0, (%esi)
1208 ; X86-NEXT: movl %ecx, (%edx)
1209 ; X86-NEXT: movl %ebx, (%eax)
1210 ; X86-NEXT: popl %esi
1211 ; X86-NEXT: popl %ebx
1214 ; X64-LABEL: pcmpistr_mask_index_flag:
1215 ; X64: # %bb.0: # %entry
1216 ; X64-NEXT: movdqa %xmm0, %xmm2
1217 ; X64-NEXT: pcmpistrm $24, %xmm1, %xmm0
1218 ; X64-NEXT: xorl %eax, %eax
1219 ; X64-NEXT: pcmpistri $24, %xmm1, %xmm2
1220 ; X64-NEXT: setb %al
1221 ; X64-NEXT: movdqa %xmm0, (%rdi)
1222 ; X64-NEXT: movl %ecx, (%rsi)
1223 ; X64-NEXT: movl %eax, (%rdx)
1226 %index = call i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %lhs, <16 x i8> %rhs, i8 24)
1227 %mask = call <16 x i8> @llvm.x86.sse42.pcmpistrm128(<16 x i8> %lhs, <16 x i8> %rhs, i8 24)
1228 %flag = call i32 @llvm.x86.sse42.pcmpistric128(<16 x i8> %lhs, <16 x i8> %rhs, i8 24)
1229 store <16 x i8> %mask, ptr %mptr
1230 store i32 %index, ptr %iptr
1231 store i32 %flag, ptr %fptr
1235 ; Make sure we don't fold loads when we need to emit pcmpistrm and pcmpistri.
; Here %rhs comes from an align-1 load of %rhsptr and feeds all three
; intrinsics (index, mask, carry flag; control byte 24), whose results are
; stored to %iptr, %mptr and %fptr.
; The checks on both targets expect the load to stay a separate movdqu into
; %xmm2, with that register then used as the source operand of both
; pcmpistrm and pcmpistri; neither compare takes a memory operand. %lhs is
; copied to %xmm1 for the pcmpistri, and the flag is produced by the
; zero-then-setb pair placed around that second instruction.
1236 define void @pcmpistr_mask_index_flag_load(<16 x i8> %lhs, ptr %rhsptr, ptr %mptr, ptr %iptr, ptr %fptr) nounwind {
1237 ; X86-LABEL: pcmpistr_mask_index_flag_load:
1238 ; X86: # %bb.0: # %entry
1239 ; X86-NEXT: pushl %ebx
1240 ; X86-NEXT: pushl %esi
1241 ; X86-NEXT: movdqa %xmm0, %xmm1
1242 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
1243 ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
1244 ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
1245 ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
1246 ; X86-NEXT: movdqu (%ecx), %xmm2
1247 ; X86-NEXT: pcmpistrm $24, %xmm2, %xmm0
1248 ; X86-NEXT: xorl %ebx, %ebx
1249 ; X86-NEXT: pcmpistri $24, %xmm2, %xmm1
1250 ; X86-NEXT: setb %bl
1251 ; X86-NEXT: movdqa %xmm0, (%esi)
1252 ; X86-NEXT: movl %ecx, (%edx)
1253 ; X86-NEXT: movl %ebx, (%eax)
1254 ; X86-NEXT: popl %esi
1255 ; X86-NEXT: popl %ebx
1258 ; X64-LABEL: pcmpistr_mask_index_flag_load:
1259 ; X64: # %bb.0: # %entry
1260 ; X64-NEXT: movq %rcx, %rax
1261 ; X64-NEXT: movdqa %xmm0, %xmm1
1262 ; X64-NEXT: movdqu (%rdi), %xmm2
1263 ; X64-NEXT: pcmpistrm $24, %xmm2, %xmm0
1264 ; X64-NEXT: xorl %edi, %edi
1265 ; X64-NEXT: pcmpistri $24, %xmm2, %xmm1
1266 ; X64-NEXT: setb %dil
1267 ; X64-NEXT: movdqa %xmm0, (%rsi)
1268 ; X64-NEXT: movl %ecx, (%rdx)
1269 ; X64-NEXT: movl %edi, (%rax)
1272 %rhs = load <16 x i8>, ptr %rhsptr, align 1
1273 %index = call i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %lhs, <16 x i8> %rhs, i8 24)
1274 %mask = call <16 x i8> @llvm.x86.sse42.pcmpistrm128(<16 x i8> %lhs, <16 x i8> %rhs, i8 24)
1275 %flag = call i32 @llvm.x86.sse42.pcmpistric128(<16 x i8> %lhs, <16 x i8> %rhs, i8 24)
1276 store <16 x i8> %mask, ptr %mptr
1277 store i32 %index, ptr %iptr
1278 store i32 %flag, ptr %fptr
1282 ; Make sure we don't fold nontemporal loads.
1283 define i32 @pcmpestri_nontemporal(<16 x i8> %lhs, i32 %lhs_len, ptr %rhsptr, i32 %rhs_len) nounwind {
1284 ; X86-LABEL: pcmpestri_nontemporal:
1285 ; X86: # %bb.0: # %entry
1286 ; X86-NEXT: pushl %ebx
1287 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
1288 ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
1289 ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
1290 ; X86-NEXT: movntdqa (%ecx), %xmm1
1291 ; X86-NEXT: xorl %ebx, %ebx
1292 ; X86-NEXT: pcmpestri $24, %xmm1, %xmm0
1293 ; X86-NEXT: setb %bl
1294 ; X86-NEXT: movl %ebx, %eax
1295 ; X86-NEXT: popl %ebx
1298 ; X64-LABEL: pcmpestri_nontemporal:
1299 ; X64: # %bb.0: # %entry
1300 ; X64-NEXT: movl %edi, %eax
1301 ; X64-NEXT: movntdqa (%rsi), %xmm1
1302 ; X64-NEXT: xorl %esi, %esi
1303 ; X64-NEXT: pcmpestri $24, %xmm1, %xmm0
1304 ; X64-NEXT: setb %sil
1305 ; X64-NEXT: movl %esi, %eax
1308 %rhs = load <16 x i8>, ptr %rhsptr, align 16, !nontemporal !0
1309 %flag = call i32 @llvm.x86.sse42.pcmpestric128(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, i8 24)