1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse4.2 | FileCheck %s --check-prefix=X86
3 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.2 | FileCheck %s --check-prefix=X64
; SSE4.2 string-compare intrinsics exercised by the tests in this file.
; The pcmpestr* forms take an explicit i32 length after each vector operand; the
; pcmpistr* forms take only the two vectors. The trailing i8 is the control byte.
; In the tests, the *ri128 result is used as an index, the *ric128 result as a
; flag, and the *rm128 result as a <16 x i8> mask.
; NOTE(review): per the Intel SDM imm8 layout, control byte 24 (0x18) should mean
; unsigned bytes / equal-each / negative polarity, and 25 (0x19) the same with
; unsigned words -- confirm against the manual.
5 declare i32 @llvm.x86.sse42.pcmpestric128(<16 x i8> %lhs, i32, <16 x i8>, i32, i8)
6 declare i32 @llvm.x86.sse42.pcmpestri128(<16 x i8> %lhs, i32, <16 x i8>, i32, i8)
7 declare <16 x i8> @llvm.x86.sse42.pcmpestrm128(<16 x i8> %lhs, i32, <16 x i8>, i32, i8)
8 declare i32 @llvm.x86.sse42.pcmpistric128(<16 x i8> %lhs, <16 x i8>, i8)
9 declare i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %lhs, <16 x i8>, i8)
10 declare <16 x i8> @llvm.x86.sse42.pcmpistrm128(<16 x i8> %lhs, <16 x i8>, i8)
; Explicit-length compare of two by-value <16 x i8> vectors: the i32 result of
; pcmpestric128 (control byte 24) is tested for equality with zero.
; The expected code moves the two lengths into EAX/EDX before the pcmpestri.
12 define i1 @pcmpestri_reg_eq_i8(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len) nounwind {
13 ; X86-LABEL: pcmpestri_reg_eq_i8:
14 ; X86: # %bb.0: # %entry
15 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
16 ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
17 ; X86-NEXT: pcmpestri $24, %xmm1, %xmm0
21 ; X64-LABEL: pcmpestri_reg_eq_i8:
22 ; X64: # %bb.0: # %entry
23 ; X64-NEXT: movl %esi, %edx
24 ; X64-NEXT: movl %edi, %eax
25 ; X64-NEXT: pcmpestri $24, %xmm1, %xmm0
29 %c = call i32 @llvm.x86.sse42.pcmpestric128(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, i8 24)
30 %result = icmp eq i32 %c, 0
; Explicit-length compare of two by-value <16 x i8> vectors via pcmpestri128
; (control byte 24), producing the i32 index.
; The expected code copies ECX into EAX right after the pcmpestri.
35 define i32 @pcmpestri_reg_idx_i8(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len) nounwind {
36 ; X86-LABEL: pcmpestri_reg_idx_i8:
37 ; X86: # %bb.0: # %entry
38 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
39 ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
40 ; X86-NEXT: pcmpestri $24, %xmm1, %xmm0
41 ; X86-NEXT: movl %ecx, %eax
44 ; X64-LABEL: pcmpestri_reg_idx_i8:
45 ; X64: # %bb.0: # %entry
46 ; X64-NEXT: movl %esi, %edx
47 ; X64-NEXT: movl %edi, %eax
48 ; X64-NEXT: pcmpestri $24, %xmm1, %xmm0
49 ; X64-NEXT: movl %ecx, %eax
52 %idx = call i32 @llvm.x86.sse42.pcmpestri128(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, i8 24)
; Explicit-length compare followed by an element difference on by-value vectors.
; If the pcmpestri128 index equals 16 the phi yields 0; otherwise the i8 elements
; at %idx are extracted from both vectors and subtracted. The phi value is then
; zero-extended to i32.
; The variable-index extracts show up in the expected code as both vectors being
; stored to the stack and read back at an index masked with 15.
56 define i32 @pcmpestri_reg_diff_i8(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len) nounwind {
57 ; X86-LABEL: pcmpestri_reg_diff_i8:
58 ; X86: # %bb.0: # %entry
59 ; X86-NEXT: pushl %ebp
60 ; X86-NEXT: movl %esp, %ebp
61 ; X86-NEXT: andl $-16, %esp
62 ; X86-NEXT: subl $48, %esp
63 ; X86-NEXT: movl 8(%ebp), %eax
64 ; X86-NEXT: movl 12(%ebp), %edx
65 ; X86-NEXT: pcmpestri $24, %xmm1, %xmm0
66 ; X86-NEXT: cmpl $16, %ecx
67 ; X86-NEXT: jne .LBB2_2
69 ; X86-NEXT: xorl %eax, %eax
70 ; X86-NEXT: jmp .LBB2_3
71 ; X86-NEXT: .LBB2_2: # %compare
72 ; X86-NEXT: movdqa %xmm0, (%esp)
73 ; X86-NEXT: andl $15, %ecx
74 ; X86-NEXT: movb (%esp,%ecx), %al
75 ; X86-NEXT: movdqa %xmm1, {{[0-9]+}}(%esp)
76 ; X86-NEXT: subb 16(%esp,%ecx), %al
77 ; X86-NEXT: .LBB2_3: # %exit
78 ; X86-NEXT: movzbl %al, %eax
79 ; X86-NEXT: movl %ebp, %esp
83 ; X64-LABEL: pcmpestri_reg_diff_i8:
84 ; X64: # %bb.0: # %entry
85 ; X64-NEXT: movl %esi, %edx
86 ; X64-NEXT: movl %edi, %eax
87 ; X64-NEXT: pcmpestri $24, %xmm1, %xmm0
88 ; X64-NEXT: # kill: def $ecx killed $ecx def $rcx
89 ; X64-NEXT: cmpl $16, %ecx
90 ; X64-NEXT: jne .LBB2_2
92 ; X64-NEXT: xorl %eax, %eax
93 ; X64-NEXT: movzbl %al, %eax
95 ; X64-NEXT: .LBB2_2: # %compare
96 ; X64-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp)
97 ; X64-NEXT: andl $15, %ecx
98 ; X64-NEXT: movb -24(%rsp,%rcx), %al
99 ; X64-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp)
100 ; X64-NEXT: subb -40(%rsp,%rcx), %al
101 ; X64-NEXT: movzbl %al, %eax
104 %idx = call i32 @llvm.x86.sse42.pcmpestri128(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, i8 24)
105 %eq = icmp eq i32 %idx, 16
106 br i1 %eq, label %exit, label %compare
109 %lhs_c = extractelement <16 x i8> %lhs, i32 %idx
110 %rhs_c = extractelement <16 x i8> %rhs, i32 %idx
111 %sub = sub i8 %lhs_c, %rhs_c
115 %result = phi i8 [ 0, %entry ], [ %sub, %compare ]
116 %result_ext = zext i8 %result to i32
; Memory-operand form of the flag test: both <16 x i8> vectors are loaded through
; bitcast i8* pointers with align 1, then pcmpestric128 (control byte 24) is
; compared with zero.
; The expected code loads only the first vector with movdqu; the second is used
; directly as the memory operand of pcmpestri, followed by setae.
120 define i1 @pcmpestri_mem_eq_i8(i8* %lhs_ptr, i32 %lhs_len, i8* %rhs_ptr, i32 %rhs_len) nounwind {
121 ; X86-LABEL: pcmpestri_mem_eq_i8:
122 ; X86: # %bb.0: # %entry
123 ; X86-NEXT: pushl %esi
124 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
125 ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
126 ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
127 ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
128 ; X86-NEXT: movdqu (%esi), %xmm0
129 ; X86-NEXT: pcmpestri $24, (%ecx), %xmm0
130 ; X86-NEXT: setae %al
131 ; X86-NEXT: popl %esi
134 ; X64-LABEL: pcmpestri_mem_eq_i8:
135 ; X64: # %bb.0: # %entry
136 ; X64-NEXT: movq %rdx, %r8
137 ; X64-NEXT: movl %esi, %eax
138 ; X64-NEXT: movdqu (%rdi), %xmm0
139 ; X64-NEXT: movl %ecx, %edx
140 ; X64-NEXT: pcmpestri $24, (%r8), %xmm0
141 ; X64-NEXT: setae %al
144 %lhs_vptr = bitcast i8* %lhs_ptr to <16 x i8>*
145 %lhs = load <16 x i8>, <16 x i8>* %lhs_vptr, align 1
146 %rhs_vptr = bitcast i8* %rhs_ptr to <16 x i8>*
147 %rhs = load <16 x i8>, <16 x i8>* %rhs_vptr, align 1
148 %c = call i32 @llvm.x86.sse42.pcmpestric128(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, i8 24)
149 %result = icmp eq i32 %c, 0
; Memory-operand form of the index test: both <16 x i8> vectors are loaded through
; bitcast i8* pointers with align 1 and passed to pcmpestri128 (control byte 24).
; The expected code uses the second vector as pcmpestri's memory operand and
; copies ECX into EAX afterwards.
153 define i32 @pcmpestri_mem_idx_i8(i8* %lhs_ptr, i32 %lhs_len, i8* %rhs_ptr, i32 %rhs_len) nounwind {
154 ; X86-LABEL: pcmpestri_mem_idx_i8:
155 ; X86: # %bb.0: # %entry
156 ; X86-NEXT: pushl %esi
157 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
158 ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
159 ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
160 ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
161 ; X86-NEXT: movdqu (%esi), %xmm0
162 ; X86-NEXT: pcmpestri $24, (%ecx), %xmm0
163 ; X86-NEXT: movl %ecx, %eax
164 ; X86-NEXT: popl %esi
167 ; X64-LABEL: pcmpestri_mem_idx_i8:
168 ; X64: # %bb.0: # %entry
169 ; X64-NEXT: movq %rdx, %r8
170 ; X64-NEXT: movl %esi, %eax
171 ; X64-NEXT: movdqu (%rdi), %xmm0
172 ; X64-NEXT: movl %ecx, %edx
173 ; X64-NEXT: pcmpestri $24, (%r8), %xmm0
174 ; X64-NEXT: movl %ecx, %eax
177 %lhs_vptr = bitcast i8* %lhs_ptr to <16 x i8>*
178 %lhs = load <16 x i8>, <16 x i8>* %lhs_vptr, align 1
179 %rhs_vptr = bitcast i8* %rhs_ptr to <16 x i8>*
180 %rhs = load <16 x i8>, <16 x i8>* %rhs_vptr, align 1
181 %idx = call i32 @llvm.x86.sse42.pcmpestri128(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, i8 24)
; Memory-operand form of the compare-then-difference test. Both <16 x i8> vectors
; are loaded with align 1 and used twice: as operands of pcmpestri128 (control
; byte 24) and as sources of the extractelement at %idx.
; If the index equals 16 the phi yields 0, otherwise the difference of the two
; extracted bytes; the phi value is zero-extended to i32.
; In the expected code both vectors are loaded into XMM registers with movdqu
; and pcmpestri takes register operands only.
185 define i32 @pcmpestri_mem_diff_i8(i8* %lhs_ptr, i32 %lhs_len, i8* %rhs_ptr, i32 %rhs_len) nounwind {
186 ; X86-LABEL: pcmpestri_mem_diff_i8:
187 ; X86: # %bb.0: # %entry
188 ; X86-NEXT: pushl %ebp
189 ; X86-NEXT: movl %esp, %ebp
190 ; X86-NEXT: pushl %esi
191 ; X86-NEXT: andl $-16, %esp
192 ; X86-NEXT: subl $48, %esp
193 ; X86-NEXT: movl 12(%ebp), %eax
194 ; X86-NEXT: movl 20(%ebp), %edx
195 ; X86-NEXT: movl 16(%ebp), %ecx
196 ; X86-NEXT: movl 8(%ebp), %esi
197 ; X86-NEXT: movdqu (%esi), %xmm1
198 ; X86-NEXT: movdqu (%ecx), %xmm0
199 ; X86-NEXT: pcmpestri $24, %xmm0, %xmm1
200 ; X86-NEXT: cmpl $16, %ecx
201 ; X86-NEXT: jne .LBB5_2
203 ; X86-NEXT: xorl %eax, %eax
204 ; X86-NEXT: jmp .LBB5_3
205 ; X86-NEXT: .LBB5_2: # %compare
206 ; X86-NEXT: movdqa %xmm1, (%esp)
207 ; X86-NEXT: andl $15, %ecx
208 ; X86-NEXT: movb (%esp,%ecx), %al
209 ; X86-NEXT: movdqa %xmm0, {{[0-9]+}}(%esp)
210 ; X86-NEXT: subb 16(%esp,%ecx), %al
211 ; X86-NEXT: .LBB5_3: # %exit
212 ; X86-NEXT: movzbl %al, %eax
213 ; X86-NEXT: leal -4(%ebp), %esp
214 ; X86-NEXT: popl %esi
215 ; X86-NEXT: popl %ebp
218 ; X64-LABEL: pcmpestri_mem_diff_i8:
219 ; X64: # %bb.0: # %entry
220 ; X64-NEXT: movl %esi, %eax
221 ; X64-NEXT: movdqu (%rdi), %xmm1
222 ; X64-NEXT: movdqu (%rdx), %xmm0
223 ; X64-NEXT: movl %ecx, %edx
224 ; X64-NEXT: pcmpestri $24, %xmm0, %xmm1
225 ; X64-NEXT: # kill: def $ecx killed $ecx def $rcx
226 ; X64-NEXT: cmpl $16, %ecx
227 ; X64-NEXT: jne .LBB5_2
229 ; X64-NEXT: xorl %eax, %eax
230 ; X64-NEXT: movzbl %al, %eax
232 ; X64-NEXT: .LBB5_2: # %compare
233 ; X64-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp)
234 ; X64-NEXT: andl $15, %ecx
235 ; X64-NEXT: movb -24(%rsp,%rcx), %al
236 ; X64-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp)
237 ; X64-NEXT: subb -40(%rsp,%rcx), %al
238 ; X64-NEXT: movzbl %al, %eax
241 %lhs_vptr = bitcast i8* %lhs_ptr to <16 x i8>*
242 %lhs = load <16 x i8>, <16 x i8>* %lhs_vptr, align 1
243 %rhs_vptr = bitcast i8* %rhs_ptr to <16 x i8>*
244 %rhs = load <16 x i8>, <16 x i8>* %rhs_vptr, align 1
245 %idx = call i32 @llvm.x86.sse42.pcmpestri128(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, i8 24)
246 %eq = icmp eq i32 %idx, 16
247 br i1 %eq, label %exit, label %compare
250 %lhs_c = extractelement <16 x i8> %lhs, i32 %idx
251 %rhs_c = extractelement <16 x i8> %rhs, i32 %idx
252 %sub = sub i8 %lhs_c, %rhs_c
256 %result = phi i8 [ 0, %entry ], [ %sub, %compare ]
257 %result_ext = zext i8 %result to i32
; Flag test on by-value <8 x i16> vectors: both are bitcast to <16 x i8> to match
; the intrinsic signature, then pcmpestric128 is compared with zero.
; The control byte here is 24, the same value the i8 tests use.
; The expected code is a pcmpestri followed by setae.
260 define i1 @pcmpestri_reg_eq_i16(<8 x i16> %lhs, i32 %lhs_len, <8 x i16> %rhs, i32 %rhs_len) nounwind {
261 ; X86-LABEL: pcmpestri_reg_eq_i16:
262 ; X86: # %bb.0: # %entry
263 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
264 ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
265 ; X86-NEXT: pcmpestri $24, %xmm1, %xmm0
266 ; X86-NEXT: setae %al
270 ; X64-LABEL: pcmpestri_reg_eq_i16:
271 ; X64: # %bb.0: # %entry
272 ; X64-NEXT: movl %esi, %edx
273 ; X64-NEXT: movl %edi, %eax
274 ; X64-NEXT: pcmpestri $24, %xmm1, %xmm0
275 ; X64-NEXT: setae %al
278 %lhs_cast = bitcast <8 x i16> %lhs to <16 x i8>
279 %rhs_cast = bitcast <8 x i16> %rhs to <16 x i8>
280 %c = call i32 @llvm.x86.sse42.pcmpestric128(<16 x i8> %lhs_cast, i32 %lhs_len, <16 x i8> %rhs_cast, i32 %rhs_len, i8 24)
281 %result = icmp eq i32 %c, 0
; Index test on by-value <8 x i16> vectors: both are bitcast to <16 x i8> and
; passed to pcmpestri128 with control byte 24.
; The expected code copies ECX into EAX after the pcmpestri.
285 define i32 @pcmpestri_reg_idx_i16(<8 x i16> %lhs, i32 %lhs_len, <8 x i16> %rhs, i32 %rhs_len) nounwind {
286 ; X86-LABEL: pcmpestri_reg_idx_i16:
287 ; X86: # %bb.0: # %entry
288 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
289 ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
290 ; X86-NEXT: pcmpestri $24, %xmm1, %xmm0
291 ; X86-NEXT: movl %ecx, %eax
294 ; X64-LABEL: pcmpestri_reg_idx_i16:
295 ; X64: # %bb.0: # %entry
296 ; X64-NEXT: movl %esi, %edx
297 ; X64-NEXT: movl %edi, %eax
298 ; X64-NEXT: pcmpestri $24, %xmm1, %xmm0
299 ; X64-NEXT: movl %ecx, %eax
302 %lhs_cast = bitcast <8 x i16> %lhs to <16 x i8>
303 %rhs_cast = bitcast <8 x i16> %rhs to <16 x i8>
304 %idx = call i32 @llvm.x86.sse42.pcmpestri128(<16 x i8> %lhs_cast, i32 %lhs_len, <16 x i8> %rhs_cast, i32 %rhs_len, i8 24)
; Compare-then-difference test on by-value <8 x i16> vectors. The vectors are
; bitcast to <16 x i8> for pcmpestri128; when the index is not the sentinel, the
; i16 elements at %idx are extracted from the original vectors and subtracted.
; The phi value (0 on the sentinel path) is zero-extended to i32.
; The expected code masks the index to the 8 i16 lanes (byte offset masked with
; 14 on i686, index masked with 7 and scaled by 2 on x86_64).
; NOTE(review): this test uses control byte 24 and sentinel 16, as the i8 tests
; do, while pcmpestri_mem_diff_i16 uses 25 and 8. Confirm this is intentional;
; changing it requires regenerating the assertions.
307 define i32 @pcmpestri_reg_diff_i16(<8 x i16> %lhs, i32 %lhs_len, <8 x i16> %rhs, i32 %rhs_len) nounwind {
308 ; X86-LABEL: pcmpestri_reg_diff_i16:
309 ; X86: # %bb.0: # %entry
310 ; X86-NEXT: pushl %ebp
311 ; X86-NEXT: movl %esp, %ebp
312 ; X86-NEXT: andl $-16, %esp
313 ; X86-NEXT: subl $48, %esp
314 ; X86-NEXT: movl 8(%ebp), %eax
315 ; X86-NEXT: movl 12(%ebp), %edx
316 ; X86-NEXT: pcmpestri $24, %xmm1, %xmm0
317 ; X86-NEXT: cmpl $16, %ecx
318 ; X86-NEXT: jne .LBB8_2
320 ; X86-NEXT: xorl %eax, %eax
321 ; X86-NEXT: jmp .LBB8_3
322 ; X86-NEXT: .LBB8_2: # %compare
323 ; X86-NEXT: movdqa %xmm0, (%esp)
324 ; X86-NEXT: addl %ecx, %ecx
325 ; X86-NEXT: andl $14, %ecx
326 ; X86-NEXT: movzwl (%esp,%ecx), %eax
327 ; X86-NEXT: movdqa %xmm1, {{[0-9]+}}(%esp)
328 ; X86-NEXT: subw 16(%esp,%ecx), %ax
329 ; X86-NEXT: .LBB8_3: # %exit
330 ; X86-NEXT: movzwl %ax, %eax
331 ; X86-NEXT: movl %ebp, %esp
332 ; X86-NEXT: popl %ebp
335 ; X64-LABEL: pcmpestri_reg_diff_i16:
336 ; X64: # %bb.0: # %entry
337 ; X64-NEXT: movl %esi, %edx
338 ; X64-NEXT: movl %edi, %eax
339 ; X64-NEXT: pcmpestri $24, %xmm1, %xmm0
340 ; X64-NEXT: # kill: def $ecx killed $ecx def $rcx
341 ; X64-NEXT: cmpl $16, %ecx
342 ; X64-NEXT: jne .LBB8_2
344 ; X64-NEXT: xorl %eax, %eax
345 ; X64-NEXT: movzwl %ax, %eax
347 ; X64-NEXT: .LBB8_2: # %compare
348 ; X64-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp)
349 ; X64-NEXT: andl $7, %ecx
350 ; X64-NEXT: movzwl -24(%rsp,%rcx,2), %eax
351 ; X64-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp)
352 ; X64-NEXT: subw -40(%rsp,%rcx,2), %ax
353 ; X64-NEXT: movzwl %ax, %eax
356 %lhs_cast = bitcast <8 x i16> %lhs to <16 x i8>
357 %rhs_cast = bitcast <8 x i16> %rhs to <16 x i8>
358 %idx = call i32 @llvm.x86.sse42.pcmpestri128(<16 x i8> %lhs_cast, i32 %lhs_len, <16 x i8> %rhs_cast, i32 %rhs_len, i8 24)
359 %eq = icmp eq i32 %idx, 16
360 br i1 %eq, label %exit, label %compare
363 %lhs_c = extractelement <8 x i16> %lhs, i32 %idx
364 %rhs_c = extractelement <8 x i16> %rhs, i32 %idx
365 %sub = sub i16 %lhs_c, %rhs_c
369 %result = phi i16 [ 0, %entry ], [ %sub, %compare ]
370 %result_ext = zext i16 %result to i32
; Memory-operand flag test on i16 data: both <8 x i16> vectors are loaded through
; bitcast i16* pointers with align 1, bitcast to <16 x i8>, and passed to
; pcmpestric128 with control byte 25; the result is compared with zero.
; The expected code uses the second vector as pcmpestri's memory operand,
; followed by setae.
374 define i1 @pcmpestri_mem_eq_i16(i16* %lhs_ptr, i32 %lhs_len, i16* %rhs_ptr, i32 %rhs_len) nounwind {
375 ; X86-LABEL: pcmpestri_mem_eq_i16:
376 ; X86: # %bb.0: # %entry
377 ; X86-NEXT: pushl %esi
378 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
379 ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
380 ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
381 ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
382 ; X86-NEXT: movdqu (%esi), %xmm0
383 ; X86-NEXT: pcmpestri $25, (%ecx), %xmm0
384 ; X86-NEXT: setae %al
385 ; X86-NEXT: popl %esi
388 ; X64-LABEL: pcmpestri_mem_eq_i16:
389 ; X64: # %bb.0: # %entry
390 ; X64-NEXT: movq %rdx, %r8
391 ; X64-NEXT: movl %esi, %eax
392 ; X64-NEXT: movdqu (%rdi), %xmm0
393 ; X64-NEXT: movl %ecx, %edx
394 ; X64-NEXT: pcmpestri $25, (%r8), %xmm0
395 ; X64-NEXT: setae %al
398 %lhs_vptr = bitcast i16* %lhs_ptr to <8 x i16>*
399 %lhs = load <8 x i16>, <8 x i16>* %lhs_vptr, align 1
400 %rhs_vptr = bitcast i16* %rhs_ptr to <8 x i16>*
401 %rhs = load <8 x i16>, <8 x i16>* %rhs_vptr, align 1
402 %lhs_cast = bitcast <8 x i16> %lhs to <16 x i8>
403 %rhs_cast = bitcast <8 x i16> %rhs to <16 x i8>
404 %c = call i32 @llvm.x86.sse42.pcmpestric128(<16 x i8> %lhs_cast, i32 %lhs_len, <16 x i8> %rhs_cast, i32 %rhs_len, i8 25)
405 %result = icmp eq i32 %c, 0
; Memory-operand index test on i16 data: both <8 x i16> vectors are loaded through
; bitcast i16* pointers with align 1, bitcast to <16 x i8>, and passed to
; pcmpestri128 with control byte 25.
; The expected code uses the second vector as pcmpestri's memory operand and
; copies ECX into EAX afterwards.
409 define i32 @pcmpestri_mem_idx_i16(i16* %lhs_ptr, i32 %lhs_len, i16* %rhs_ptr, i32 %rhs_len) nounwind {
410 ; X86-LABEL: pcmpestri_mem_idx_i16:
411 ; X86: # %bb.0: # %entry
412 ; X86-NEXT: pushl %esi
413 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
414 ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
415 ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
416 ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
417 ; X86-NEXT: movdqu (%esi), %xmm0
418 ; X86-NEXT: pcmpestri $25, (%ecx), %xmm0
419 ; X86-NEXT: movl %ecx, %eax
420 ; X86-NEXT: popl %esi
423 ; X64-LABEL: pcmpestri_mem_idx_i16:
424 ; X64: # %bb.0: # %entry
425 ; X64-NEXT: movq %rdx, %r8
426 ; X64-NEXT: movl %esi, %eax
427 ; X64-NEXT: movdqu (%rdi), %xmm0
428 ; X64-NEXT: movl %ecx, %edx
429 ; X64-NEXT: pcmpestri $25, (%r8), %xmm0
430 ; X64-NEXT: movl %ecx, %eax
433 %lhs_vptr = bitcast i16* %lhs_ptr to <8 x i16>*
434 %lhs = load <8 x i16>, <8 x i16>* %lhs_vptr, align 1
435 %rhs_vptr = bitcast i16* %rhs_ptr to <8 x i16>*
436 %rhs = load <8 x i16>, <8 x i16>* %rhs_vptr, align 1
437 %lhs_cast = bitcast <8 x i16> %lhs to <16 x i8>
438 %rhs_cast = bitcast <8 x i16> %rhs to <16 x i8>
439 %idx = call i32 @llvm.x86.sse42.pcmpestri128(<16 x i8> %lhs_cast, i32 %lhs_len, <16 x i8> %rhs_cast, i32 %rhs_len, i8 25)
; Memory-operand compare-then-difference test on i16 data. Both <8 x i16> vectors
; are loaded with align 1, bitcast to <16 x i8> for pcmpestri128 (control byte
; 25), and also indexed by extractelement at %idx.
; If the index equals 8 the phi yields 0, otherwise the difference of the two
; extracted i16 elements; the phi value is zero-extended to i32.
; In the expected code both vectors are loaded into XMM registers with movdqu
; and pcmpestri takes register operands only.
443 define i32 @pcmpestri_mem_diff_i16(i16* %lhs_ptr, i32 %lhs_len, i16* %rhs_ptr, i32 %rhs_len) nounwind {
444 ; X86-LABEL: pcmpestri_mem_diff_i16:
445 ; X86: # %bb.0: # %entry
446 ; X86-NEXT: pushl %ebp
447 ; X86-NEXT: movl %esp, %ebp
448 ; X86-NEXT: pushl %esi
449 ; X86-NEXT: andl $-16, %esp
450 ; X86-NEXT: subl $48, %esp
451 ; X86-NEXT: movl 12(%ebp), %eax
452 ; X86-NEXT: movl 20(%ebp), %edx
453 ; X86-NEXT: movl 16(%ebp), %ecx
454 ; X86-NEXT: movl 8(%ebp), %esi
455 ; X86-NEXT: movdqu (%esi), %xmm1
456 ; X86-NEXT: movdqu (%ecx), %xmm0
457 ; X86-NEXT: pcmpestri $25, %xmm0, %xmm1
458 ; X86-NEXT: cmpl $8, %ecx
459 ; X86-NEXT: jne .LBB11_2
461 ; X86-NEXT: xorl %eax, %eax
462 ; X86-NEXT: jmp .LBB11_3
463 ; X86-NEXT: .LBB11_2: # %compare
464 ; X86-NEXT: movdqa %xmm1, (%esp)
465 ; X86-NEXT: addl %ecx, %ecx
466 ; X86-NEXT: andl $14, %ecx
467 ; X86-NEXT: movzwl (%esp,%ecx), %eax
468 ; X86-NEXT: movdqa %xmm0, {{[0-9]+}}(%esp)
469 ; X86-NEXT: subw 16(%esp,%ecx), %ax
470 ; X86-NEXT: .LBB11_3: # %exit
471 ; X86-NEXT: movzwl %ax, %eax
472 ; X86-NEXT: leal -4(%ebp), %esp
473 ; X86-NEXT: popl %esi
474 ; X86-NEXT: popl %ebp
477 ; X64-LABEL: pcmpestri_mem_diff_i16:
478 ; X64: # %bb.0: # %entry
479 ; X64-NEXT: movl %esi, %eax
480 ; X64-NEXT: movdqu (%rdi), %xmm1
481 ; X64-NEXT: movdqu (%rdx), %xmm0
482 ; X64-NEXT: movl %ecx, %edx
483 ; X64-NEXT: pcmpestri $25, %xmm0, %xmm1
484 ; X64-NEXT: # kill: def $ecx killed $ecx def $rcx
485 ; X64-NEXT: cmpl $8, %ecx
486 ; X64-NEXT: jne .LBB11_2
488 ; X64-NEXT: xorl %eax, %eax
489 ; X64-NEXT: movzwl %ax, %eax
491 ; X64-NEXT: .LBB11_2: # %compare
492 ; X64-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp)
493 ; X64-NEXT: andl $7, %ecx
494 ; X64-NEXT: movzwl -24(%rsp,%rcx,2), %eax
495 ; X64-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp)
496 ; X64-NEXT: subw -40(%rsp,%rcx,2), %ax
497 ; X64-NEXT: movzwl %ax, %eax
500 %lhs_vptr = bitcast i16* %lhs_ptr to <8 x i16>*
501 %lhs = load <8 x i16>, <8 x i16>* %lhs_vptr, align 1
502 %rhs_vptr = bitcast i16* %rhs_ptr to <8 x i16>*
503 %rhs = load <8 x i16>, <8 x i16>* %rhs_vptr, align 1
504 %lhs_cast = bitcast <8 x i16> %lhs to <16 x i8>
505 %rhs_cast = bitcast <8 x i16> %rhs to <16 x i8>
506 %idx = call i32 @llvm.x86.sse42.pcmpestri128(<16 x i8> %lhs_cast, i32 %lhs_len, <16 x i8> %rhs_cast, i32 %rhs_len, i8 25)
507 %eq = icmp eq i32 %idx, 8
508 br i1 %eq, label %exit, label %compare
511 %lhs_c = extractelement <8 x i16> %lhs, i32 %idx
512 %rhs_c = extractelement <8 x i16> %rhs, i32 %idx
513 %sub = sub i16 %lhs_c, %rhs_c
517 %result = phi i16 [ 0, %entry ], [ %sub, %compare ]
518 %result_ext = zext i16 %result to i32
; Implicit-length flag test on by-value <16 x i8> vectors: the i32 result of
; pcmpistric128 (control byte 24) is compared with zero.
; No length operands are passed, so the expected code is just pcmpistri followed
; by setae.
522 define i1 @pcmpistri_reg_eq_i8(<16 x i8> %lhs, <16 x i8> %rhs) nounwind {
523 ; X86-LABEL: pcmpistri_reg_eq_i8:
524 ; X86: # %bb.0: # %entry
525 ; X86-NEXT: pcmpistri $24, %xmm1, %xmm0
526 ; X86-NEXT: setae %al
529 ; X64-LABEL: pcmpistri_reg_eq_i8:
530 ; X64: # %bb.0: # %entry
531 ; X64-NEXT: pcmpistri $24, %xmm1, %xmm0
532 ; X64-NEXT: setae %al
535 %c = call i32 @llvm.x86.sse42.pcmpistric128(<16 x i8> %lhs, <16 x i8> %rhs, i8 24)
536 %result = icmp eq i32 %c, 0
; Implicit-length index test on by-value <16 x i8> vectors via pcmpistri128
; (control byte 24).
; The expected code is pcmpistri followed by a copy of ECX into EAX.
540 define i32 @pcmpistri_reg_idx_i8(<16 x i8> %lhs, <16 x i8> %rhs) nounwind {
541 ; X86-LABEL: pcmpistri_reg_idx_i8:
542 ; X86: # %bb.0: # %entry
543 ; X86-NEXT: pcmpistri $24, %xmm1, %xmm0
544 ; X86-NEXT: movl %ecx, %eax
547 ; X64-LABEL: pcmpistri_reg_idx_i8:
548 ; X64: # %bb.0: # %entry
549 ; X64-NEXT: pcmpistri $24, %xmm1, %xmm0
550 ; X64-NEXT: movl %ecx, %eax
553 %idx = call i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %lhs, <16 x i8> %rhs, i8 24)
; Implicit-length compare-then-difference test on by-value <16 x i8> vectors.
; If the pcmpistri128 index equals 16 the phi yields 0; otherwise the bytes at
; %idx are extracted from both vectors and subtracted. The phi value is
; zero-extended to i32.
; In the expected i686 code the aligned stack frame used for the variable-index
; extracts is set up inside the %compare block, not at function entry.
557 define i32 @pcmpistri_reg_diff_i8(<16 x i8> %lhs, <16 x i8> %rhs) nounwind {
558 ; X86-LABEL: pcmpistri_reg_diff_i8:
559 ; X86: # %bb.0: # %entry
560 ; X86-NEXT: pcmpistri $24, %xmm1, %xmm0
561 ; X86-NEXT: cmpl $16, %ecx
562 ; X86-NEXT: jne .LBB14_2
564 ; X86-NEXT: xorl %eax, %eax
565 ; X86-NEXT: movzbl %al, %eax
567 ; X86-NEXT: .LBB14_2: # %compare
568 ; X86-NEXT: pushl %ebp
569 ; X86-NEXT: movl %esp, %ebp
570 ; X86-NEXT: andl $-16, %esp
571 ; X86-NEXT: subl $48, %esp
572 ; X86-NEXT: movdqa %xmm0, (%esp)
573 ; X86-NEXT: andl $15, %ecx
574 ; X86-NEXT: movb (%esp,%ecx), %al
575 ; X86-NEXT: movdqa %xmm1, {{[0-9]+}}(%esp)
576 ; X86-NEXT: subb 16(%esp,%ecx), %al
577 ; X86-NEXT: movl %ebp, %esp
578 ; X86-NEXT: popl %ebp
579 ; X86-NEXT: movzbl %al, %eax
582 ; X64-LABEL: pcmpistri_reg_diff_i8:
583 ; X64: # %bb.0: # %entry
584 ; X64-NEXT: pcmpistri $24, %xmm1, %xmm0
585 ; X64-NEXT: # kill: def $ecx killed $ecx def $rcx
586 ; X64-NEXT: cmpl $16, %ecx
587 ; X64-NEXT: jne .LBB14_2
589 ; X64-NEXT: xorl %eax, %eax
590 ; X64-NEXT: movzbl %al, %eax
592 ; X64-NEXT: .LBB14_2: # %compare
593 ; X64-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp)
594 ; X64-NEXT: andl $15, %ecx
595 ; X64-NEXT: movb -24(%rsp,%rcx), %al
596 ; X64-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp)
597 ; X64-NEXT: subb -40(%rsp,%rcx), %al
598 ; X64-NEXT: movzbl %al, %eax
601 %idx = call i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %lhs, <16 x i8> %rhs, i8 24)
602 %eq = icmp eq i32 %idx, 16
603 br i1 %eq, label %exit, label %compare
606 %lhs_c = extractelement <16 x i8> %lhs, i32 %idx
607 %rhs_c = extractelement <16 x i8> %rhs, i32 %idx
608 %sub = sub i8 %lhs_c, %rhs_c
612 %result = phi i8 [ 0, %entry ], [ %sub, %compare ]
613 %result_ext = zext i8 %result to i32
; Implicit-length flag test with memory operands: both <16 x i8> vectors are
; loaded through bitcast i8* pointers with align 1, then pcmpistric128 (control
; byte 24) is compared with zero.
; The expected code loads the first vector with movdqu and uses the second as
; pcmpistri's memory operand, followed by setae.
617 define i1 @pcmpistri_mem_eq_i8(i8* %lhs_ptr, i8* %rhs_ptr) nounwind {
618 ; X86-LABEL: pcmpistri_mem_eq_i8:
619 ; X86: # %bb.0: # %entry
620 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
621 ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
622 ; X86-NEXT: movdqu (%ecx), %xmm0
623 ; X86-NEXT: pcmpistri $24, (%eax), %xmm0
624 ; X86-NEXT: setae %al
627 ; X64-LABEL: pcmpistri_mem_eq_i8:
628 ; X64: # %bb.0: # %entry
629 ; X64-NEXT: movdqu (%rdi), %xmm0
630 ; X64-NEXT: pcmpistri $24, (%rsi), %xmm0
631 ; X64-NEXT: setae %al
634 %lhs_vptr = bitcast i8* %lhs_ptr to <16 x i8>*
635 %lhs = load <16 x i8>, <16 x i8>* %lhs_vptr, align 1
636 %rhs_vptr = bitcast i8* %rhs_ptr to <16 x i8>*
637 %rhs = load <16 x i8>, <16 x i8>* %rhs_vptr, align 1
638 %c = call i32 @llvm.x86.sse42.pcmpistric128(<16 x i8> %lhs, <16 x i8> %rhs, i8 24)
639 %result = icmp eq i32 %c, 0
; Implicit-length index test with memory operands: both <16 x i8> vectors are
; loaded through bitcast i8* pointers with align 1 and passed to pcmpistri128
; (control byte 24).
; The expected code uses the second vector as pcmpistri's memory operand and
; copies ECX into EAX afterwards.
643 define i32 @pcmpistri_mem_idx_i8(i8* %lhs_ptr, i8* %rhs_ptr) nounwind {
644 ; X86-LABEL: pcmpistri_mem_idx_i8:
645 ; X86: # %bb.0: # %entry
646 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
647 ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
648 ; X86-NEXT: movdqu (%ecx), %xmm0
649 ; X86-NEXT: pcmpistri $24, (%eax), %xmm0
650 ; X86-NEXT: movl %ecx, %eax
653 ; X64-LABEL: pcmpistri_mem_idx_i8:
654 ; X64: # %bb.0: # %entry
655 ; X64-NEXT: movdqu (%rdi), %xmm0
656 ; X64-NEXT: pcmpistri $24, (%rsi), %xmm0
657 ; X64-NEXT: movl %ecx, %eax
660 %lhs_vptr = bitcast i8* %lhs_ptr to <16 x i8>*
661 %lhs = load <16 x i8>, <16 x i8>* %lhs_vptr, align 1
662 %rhs_vptr = bitcast i8* %rhs_ptr to <16 x i8>*
663 %rhs = load <16 x i8>, <16 x i8>* %rhs_vptr, align 1
664 %idx = call i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %lhs, <16 x i8> %rhs, i8 24)
; Implicit-length compare-then-difference test with memory operands. Both
; <16 x i8> vectors are loaded with align 1 and used both by pcmpistri128
; (control byte 24) and by the extractelement at %idx.
; If the index equals 16 the phi yields 0, otherwise the difference of the two
; extracted bytes; the phi value is zero-extended to i32.
; In the expected code both vectors are loaded into XMM registers with movdqu
; and pcmpistri takes register operands only.
668 define i32 @pcmpistri_mem_diff_i8(i8* %lhs_ptr, i8* %rhs_ptr) nounwind {
669 ; X86-LABEL: pcmpistri_mem_diff_i8:
670 ; X86: # %bb.0: # %entry
671 ; X86-NEXT: pushl %ebp
672 ; X86-NEXT: movl %esp, %ebp
673 ; X86-NEXT: andl $-16, %esp
674 ; X86-NEXT: subl $48, %esp
675 ; X86-NEXT: movl 12(%ebp), %eax
676 ; X86-NEXT: movl 8(%ebp), %ecx
677 ; X86-NEXT: movdqu (%ecx), %xmm1
678 ; X86-NEXT: movdqu (%eax), %xmm0
679 ; X86-NEXT: pcmpistri $24, %xmm0, %xmm1
680 ; X86-NEXT: cmpl $16, %ecx
681 ; X86-NEXT: jne .LBB17_2
683 ; X86-NEXT: xorl %eax, %eax
684 ; X86-NEXT: jmp .LBB17_3
685 ; X86-NEXT: .LBB17_2: # %compare
686 ; X86-NEXT: movdqa %xmm1, (%esp)
687 ; X86-NEXT: andl $15, %ecx
688 ; X86-NEXT: movb (%esp,%ecx), %al
689 ; X86-NEXT: movdqa %xmm0, {{[0-9]+}}(%esp)
690 ; X86-NEXT: subb 16(%esp,%ecx), %al
691 ; X86-NEXT: .LBB17_3: # %exit
692 ; X86-NEXT: movzbl %al, %eax
693 ; X86-NEXT: movl %ebp, %esp
694 ; X86-NEXT: popl %ebp
697 ; X64-LABEL: pcmpistri_mem_diff_i8:
698 ; X64: # %bb.0: # %entry
699 ; X64-NEXT: movdqu (%rdi), %xmm1
700 ; X64-NEXT: movdqu (%rsi), %xmm0
701 ; X64-NEXT: pcmpistri $24, %xmm0, %xmm1
702 ; X64-NEXT: # kill: def $ecx killed $ecx def $rcx
703 ; X64-NEXT: cmpl $16, %ecx
704 ; X64-NEXT: jne .LBB17_2
706 ; X64-NEXT: xorl %eax, %eax
707 ; X64-NEXT: movzbl %al, %eax
709 ; X64-NEXT: .LBB17_2: # %compare
710 ; X64-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp)
711 ; X64-NEXT: andl $15, %ecx
712 ; X64-NEXT: movb -24(%rsp,%rcx), %al
713 ; X64-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp)
714 ; X64-NEXT: subb -40(%rsp,%rcx), %al
715 ; X64-NEXT: movzbl %al, %eax
718 %lhs_vptr = bitcast i8* %lhs_ptr to <16 x i8>*
719 %lhs = load <16 x i8>, <16 x i8>* %lhs_vptr, align 1
720 %rhs_vptr = bitcast i8* %rhs_ptr to <16 x i8>*
721 %rhs = load <16 x i8>, <16 x i8>* %rhs_vptr, align 1
722 %idx = call i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %lhs, <16 x i8> %rhs, i8 24)
723 %eq = icmp eq i32 %idx, 16
724 br i1 %eq, label %exit, label %compare
727 %lhs_c = extractelement <16 x i8> %lhs, i32 %idx
728 %rhs_c = extractelement <16 x i8> %rhs, i32 %idx
729 %sub = sub i8 %lhs_c, %rhs_c
733 %result = phi i8 [ 0, %entry ], [ %sub, %compare ]
734 %result_ext = zext i8 %result to i32
; Implicit-length flag test on by-value <8 x i16> vectors: both are bitcast to
; <16 x i8>, then pcmpistric128 is compared with zero.
; The control byte here is 24, the same value the i8 tests use.
; The expected code is pcmpistri followed by setae.
738 define i1 @pcmpistri_reg_eq_i16(<8 x i16> %lhs, <8 x i16> %rhs) nounwind {
739 ; X86-LABEL: pcmpistri_reg_eq_i16:
740 ; X86: # %bb.0: # %entry
741 ; X86-NEXT: pcmpistri $24, %xmm1, %xmm0
742 ; X86-NEXT: setae %al
745 ; X64-LABEL: pcmpistri_reg_eq_i16:
746 ; X64: # %bb.0: # %entry
747 ; X64-NEXT: pcmpistri $24, %xmm1, %xmm0
748 ; X64-NEXT: setae %al
751 %lhs_cast = bitcast <8 x i16> %lhs to <16 x i8>
752 %rhs_cast = bitcast <8 x i16> %rhs to <16 x i8>
753 %c = call i32 @llvm.x86.sse42.pcmpistric128(<16 x i8> %lhs_cast, <16 x i8> %rhs_cast, i8 24)
754 %result = icmp eq i32 %c, 0
; Implicit-length index test on by-value <8 x i16> vectors: both are bitcast to
; <16 x i8> and passed to pcmpistri128 with control byte 24.
; The expected code is pcmpistri followed by a copy of ECX into EAX.
758 define i32 @pcmpistri_reg_idx_i16(<8 x i16> %lhs, <8 x i16> %rhs) nounwind {
759 ; X86-LABEL: pcmpistri_reg_idx_i16:
760 ; X86: # %bb.0: # %entry
761 ; X86-NEXT: pcmpistri $24, %xmm1, %xmm0
762 ; X86-NEXT: movl %ecx, %eax
765 ; X64-LABEL: pcmpistri_reg_idx_i16:
766 ; X64: # %bb.0: # %entry
767 ; X64-NEXT: pcmpistri $24, %xmm1, %xmm0
768 ; X64-NEXT: movl %ecx, %eax
771 %lhs_cast = bitcast <8 x i16> %lhs to <16 x i8>
772 %rhs_cast = bitcast <8 x i16> %rhs to <16 x i8>
773 %idx = call i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %lhs_cast, <16 x i8> %rhs_cast, i8 24)
; Implicit-length compare-then-difference test on by-value <8 x i16> vectors.
; The vectors are bitcast to <16 x i8> for pcmpistri128; when the index is not
; the sentinel, the i16 elements at %idx are extracted from the original vectors
; and subtracted. The phi value (0 on the sentinel path) is zero-extended to i32.
; In the expected i686 code the aligned stack frame is set up inside the
; %compare block, not at function entry.
; NOTE(review): this test uses control byte 24 and sentinel 16, as the i8 tests
; do, while pcmpistri_mem_diff_i16 uses 25 and 8. Confirm this is intentional;
; changing it requires regenerating the assertions.
777 define i32 @pcmpistri_reg_diff_i16(<8 x i16> %lhs, <8 x i16> %rhs) nounwind {
778 ; X86-LABEL: pcmpistri_reg_diff_i16:
779 ; X86: # %bb.0: # %entry
780 ; X86-NEXT: pcmpistri $24, %xmm1, %xmm0
781 ; X86-NEXT: cmpl $16, %ecx
782 ; X86-NEXT: jne .LBB20_2
784 ; X86-NEXT: xorl %eax, %eax
785 ; X86-NEXT: movzwl %ax, %eax
787 ; X86-NEXT: .LBB20_2: # %compare
788 ; X86-NEXT: pushl %ebp
789 ; X86-NEXT: movl %esp, %ebp
790 ; X86-NEXT: andl $-16, %esp
791 ; X86-NEXT: subl $48, %esp
792 ; X86-NEXT: movdqa %xmm0, (%esp)
793 ; X86-NEXT: addl %ecx, %ecx
794 ; X86-NEXT: andl $14, %ecx
795 ; X86-NEXT: movzwl (%esp,%ecx), %eax
796 ; X86-NEXT: movdqa %xmm1, {{[0-9]+}}(%esp)
797 ; X86-NEXT: subw 16(%esp,%ecx), %ax
798 ; X86-NEXT: movl %ebp, %esp
799 ; X86-NEXT: popl %ebp
800 ; X86-NEXT: movzwl %ax, %eax
803 ; X64-LABEL: pcmpistri_reg_diff_i16:
804 ; X64: # %bb.0: # %entry
805 ; X64-NEXT: pcmpistri $24, %xmm1, %xmm0
806 ; X64-NEXT: # kill: def $ecx killed $ecx def $rcx
807 ; X64-NEXT: cmpl $16, %ecx
808 ; X64-NEXT: jne .LBB20_2
810 ; X64-NEXT: xorl %eax, %eax
811 ; X64-NEXT: movzwl %ax, %eax
813 ; X64-NEXT: .LBB20_2: # %compare
814 ; X64-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp)
815 ; X64-NEXT: andl $7, %ecx
816 ; X64-NEXT: movzwl -24(%rsp,%rcx,2), %eax
817 ; X64-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp)
818 ; X64-NEXT: subw -40(%rsp,%rcx,2), %ax
819 ; X64-NEXT: movzwl %ax, %eax
822 %lhs_cast = bitcast <8 x i16> %lhs to <16 x i8>
823 %rhs_cast = bitcast <8 x i16> %rhs to <16 x i8>
824 %idx = call i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %lhs_cast, <16 x i8> %rhs_cast, i8 24)
825 %eq = icmp eq i32 %idx, 16
826 br i1 %eq, label %exit, label %compare
829 %lhs_c = extractelement <8 x i16> %lhs, i32 %idx
830 %rhs_c = extractelement <8 x i16> %rhs, i32 %idx
831 %sub = sub i16 %lhs_c, %rhs_c
835 %result = phi i16 [ 0, %entry ], [ %sub, %compare ]
836 %result_ext = zext i16 %result to i32
; Implicit-length flag test on i16 data with memory operands: both <8 x i16>
; vectors are loaded through bitcast i16* pointers with align 1, bitcast to
; <16 x i8>, and passed to pcmpistric128 with control byte 25; the result is
; compared with zero.
; The expected code uses the second vector as pcmpistri's memory operand,
; followed by setae.
840 define i1 @pcmpistri_mem_eq_i16(i16* %lhs_ptr, i16* %rhs_ptr) nounwind {
841 ; X86-LABEL: pcmpistri_mem_eq_i16:
842 ; X86: # %bb.0: # %entry
843 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
844 ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
845 ; X86-NEXT: movdqu (%ecx), %xmm0
846 ; X86-NEXT: pcmpistri $25, (%eax), %xmm0
847 ; X86-NEXT: setae %al
850 ; X64-LABEL: pcmpistri_mem_eq_i16:
851 ; X64: # %bb.0: # %entry
852 ; X64-NEXT: movdqu (%rdi), %xmm0
853 ; X64-NEXT: pcmpistri $25, (%rsi), %xmm0
854 ; X64-NEXT: setae %al
857 %lhs_vptr = bitcast i16* %lhs_ptr to <8 x i16>*
858 %lhs = load <8 x i16>, <8 x i16>* %lhs_vptr, align 1
859 %rhs_vptr = bitcast i16* %rhs_ptr to <8 x i16>*
860 %rhs = load <8 x i16>, <8 x i16>* %rhs_vptr, align 1
861 %lhs_cast = bitcast <8 x i16> %lhs to <16 x i8>
862 %rhs_cast = bitcast <8 x i16> %rhs to <16 x i8>
863 %c = call i32 @llvm.x86.sse42.pcmpistric128(<16 x i8> %lhs_cast, <16 x i8> %rhs_cast, i8 25)
864 %result = icmp eq i32 %c, 0
; Implicit-length index test on i16 data with memory operands: both <8 x i16>
; vectors are loaded through bitcast i16* pointers with align 1, bitcast to
; <16 x i8>, and passed to pcmpistri128 with control byte 25.
; The expected code uses the second vector as pcmpistri's memory operand and
; copies ECX into EAX afterwards.
868 define i32 @pcmpistri_mem_idx_i16(i16* %lhs_ptr, i16* %rhs_ptr) nounwind {
869 ; X86-LABEL: pcmpistri_mem_idx_i16:
870 ; X86: # %bb.0: # %entry
871 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
872 ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
873 ; X86-NEXT: movdqu (%ecx), %xmm0
874 ; X86-NEXT: pcmpistri $25, (%eax), %xmm0
875 ; X86-NEXT: movl %ecx, %eax
878 ; X64-LABEL: pcmpistri_mem_idx_i16:
879 ; X64: # %bb.0: # %entry
880 ; X64-NEXT: movdqu (%rdi), %xmm0
881 ; X64-NEXT: pcmpistri $25, (%rsi), %xmm0
882 ; X64-NEXT: movl %ecx, %eax
885 %lhs_vptr = bitcast i16* %lhs_ptr to <8 x i16>*
886 %lhs = load <8 x i16>, <8 x i16>* %lhs_vptr, align 1
887 %rhs_vptr = bitcast i16* %rhs_ptr to <8 x i16>*
888 %rhs = load <8 x i16>, <8 x i16>* %rhs_vptr, align 1
889 %lhs_cast = bitcast <8 x i16> %lhs to <16 x i8>
890 %rhs_cast = bitcast <8 x i16> %rhs to <16 x i8>
891 %idx = call i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %lhs_cast, <16 x i8> %rhs_cast, i8 25)
; Implicit-length compare-then-difference test on i16 data with memory operands.
; Both <8 x i16> vectors are loaded with align 1, bitcast to <16 x i8> for
; pcmpistri128 (control byte 25), and also indexed by extractelement at %idx.
; If the index equals 8 the phi yields 0, otherwise the difference of the two
; extracted i16 elements; the phi value is zero-extended to i32.
; In the expected code both vectors are loaded into XMM registers with movdqu
; and pcmpistri takes register operands only.
895 define i32 @pcmpistri_mem_diff_i16(i16* %lhs_ptr, i16* %rhs_ptr) nounwind {
896 ; X86-LABEL: pcmpistri_mem_diff_i16:
897 ; X86: # %bb.0: # %entry
898 ; X86-NEXT: pushl %ebp
899 ; X86-NEXT: movl %esp, %ebp
900 ; X86-NEXT: andl $-16, %esp
901 ; X86-NEXT: subl $48, %esp
902 ; X86-NEXT: movl 12(%ebp), %eax
903 ; X86-NEXT: movl 8(%ebp), %ecx
904 ; X86-NEXT: movdqu (%ecx), %xmm1
905 ; X86-NEXT: movdqu (%eax), %xmm0
906 ; X86-NEXT: pcmpistri $25, %xmm0, %xmm1
907 ; X86-NEXT: cmpl $8, %ecx
908 ; X86-NEXT: jne .LBB23_2
910 ; X86-NEXT: xorl %eax, %eax
911 ; X86-NEXT: jmp .LBB23_3
912 ; X86-NEXT: .LBB23_2: # %compare
913 ; X86-NEXT: movdqa %xmm1, (%esp)
914 ; X86-NEXT: addl %ecx, %ecx
915 ; X86-NEXT: andl $14, %ecx
916 ; X86-NEXT: movzwl (%esp,%ecx), %eax
917 ; X86-NEXT: movdqa %xmm0, {{[0-9]+}}(%esp)
918 ; X86-NEXT: subw 16(%esp,%ecx), %ax
919 ; X86-NEXT: .LBB23_3: # %exit
920 ; X86-NEXT: movzwl %ax, %eax
921 ; X86-NEXT: movl %ebp, %esp
922 ; X86-NEXT: popl %ebp
925 ; X64-LABEL: pcmpistri_mem_diff_i16:
926 ; X64: # %bb.0: # %entry
927 ; X64-NEXT: movdqu (%rdi), %xmm1
928 ; X64-NEXT: movdqu (%rsi), %xmm0
929 ; X64-NEXT: pcmpistri $25, %xmm0, %xmm1
930 ; X64-NEXT: # kill: def $ecx killed $ecx def $rcx
931 ; X64-NEXT: cmpl $8, %ecx
932 ; X64-NEXT: jne .LBB23_2
934 ; X64-NEXT: xorl %eax, %eax
935 ; X64-NEXT: movzwl %ax, %eax
937 ; X64-NEXT: .LBB23_2: # %compare
938 ; X64-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp)
939 ; X64-NEXT: andl $7, %ecx
940 ; X64-NEXT: movzwl -24(%rsp,%rcx,2), %eax
941 ; X64-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp)
942 ; X64-NEXT: subw -40(%rsp,%rcx,2), %ax
943 ; X64-NEXT: movzwl %ax, %eax
946 %lhs_vptr = bitcast i16* %lhs_ptr to <8 x i16>*
947 %lhs = load <8 x i16>, <8 x i16>* %lhs_vptr, align 1
948 %rhs_vptr = bitcast i16* %rhs_ptr to <8 x i16>*
949 %rhs = load <8 x i16>, <8 x i16>* %rhs_vptr, align 1
950 %lhs_cast = bitcast <8 x i16> %lhs to <16 x i8>
951 %rhs_cast = bitcast <8 x i16> %rhs to <16 x i8>
952 %idx = call i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %lhs_cast, <16 x i8> %rhs_cast, i8 25)
953 %eq = icmp eq i32 %idx, 8
954 br i1 %eq, label %exit, label %compare
957 %lhs_c = extractelement <8 x i16> %lhs, i32 %idx
958 %rhs_c = extractelement <8 x i16> %rhs, i32 %idx
959 %sub = sub i16 %lhs_c, %rhs_c
963 %result = phi i16 [ 0, %entry ], [ %sub, %compare ]
964 %result_ext = zext i16 %result to i32
; Calls pcmpestric128 and pcmpestri128 with identical operands and control byte
; 24, then stores the index through %iptr and the flag through %fptr.
; The expected code contains a single pcmpestri for both calls: ECX is stored
; as the index, and a register zeroed with xor before the compare is stored as
; the flag.
968 define void @pcmpestr_index_flag(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, i32* %iptr, i32* %fptr) nounwind {
969 ; X86-LABEL: pcmpestr_index_flag:
970 ; X86: # %bb.0: # %entry
971 ; X86-NEXT: pushl %ebx
972 ; X86-NEXT: pushl %edi
973 ; X86-NEXT: pushl %esi
974 ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
975 ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
976 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
977 ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
978 ; X86-NEXT: xorl %ebx, %ebx
979 ; X86-NEXT: pcmpestri $24, %xmm1, %xmm0
981 ; X86-NEXT: movl %ecx, (%edi)
982 ; X86-NEXT: movl %ebx, (%esi)
983 ; X86-NEXT: popl %esi
984 ; X86-NEXT: popl %edi
985 ; X86-NEXT: popl %ebx
988 ; X64-LABEL: pcmpestr_index_flag:
989 ; X64: # %bb.0: # %entry
990 ; X64-NEXT: movq %rcx, %r8
991 ; X64-NEXT: movq %rdx, %r9
992 ; X64-NEXT: movl %esi, %edx
993 ; X64-NEXT: movl %edi, %eax
994 ; X64-NEXT: xorl %esi, %esi
995 ; X64-NEXT: pcmpestri $24, %xmm1, %xmm0
996 ; X64-NEXT: setb %sil
997 ; X64-NEXT: movl %ecx, (%r9)
998 ; X64-NEXT: movl %esi, (%r8)
1001 %flag = call i32 @llvm.x86.sse42.pcmpestric128(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, i8 24)
1002 %index = call i32 @llvm.x86.sse42.pcmpestri128(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, i8 24)
1003 store i32 %index, i32* %iptr
1004 store i32 %flag, i32* %fptr
; Requests both the flag result (pcmpestric) and the mask result (pcmpestrm)
; for identical operands and control byte 24, then stores the mask through
; %mptr and the flag through %fptr. The expected assembly on both targets is
; one pcmpestrm followed by setb, with the mask stored from %xmm0.
1008 define void @pcmpestr_mask_flag(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, <16 x i8>* %mptr, i32* %fptr) nounwind {
1009 ; X86-LABEL: pcmpestr_mask_flag:
1010 ; X86: # %bb.0: # %entry
1011 ; X86-NEXT: pushl %ebx
1012 ; X86-NEXT: pushl %esi
1013 ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
1014 ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
1015 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
1016 ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
1017 ; X86-NEXT: xorl %ebx, %ebx
1018 ; X86-NEXT: pcmpestrm $24, %xmm1, %xmm0
1019 ; X86-NEXT: setb %bl
1020 ; X86-NEXT: movdqa %xmm0, (%esi)
1021 ; X86-NEXT: movl %ebx, (%ecx)
1022 ; X86-NEXT: popl %esi
1023 ; X86-NEXT: popl %ebx
1026 ; X64-LABEL: pcmpestr_mask_flag:
1027 ; X64: # %bb.0: # %entry
1028 ; X64-NEXT: movq %rdx, %r8
1029 ; X64-NEXT: movl %esi, %edx
1030 ; X64-NEXT: movl %edi, %eax
1031 ; X64-NEXT: xorl %esi, %esi
1032 ; X64-NEXT: pcmpestrm $24, %xmm1, %xmm0
1033 ; X64-NEXT: setb %sil
1034 ; X64-NEXT: movdqa %xmm0, (%r8)
1035 ; X64-NEXT: movl %esi, (%rcx)
; Both calls take the same arguments; only the intrinsic differs.
1038 %flag = call i32 @llvm.x86.sse42.pcmpestric128(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, i8 24)
1039 %mask = call <16 x i8> @llvm.x86.sse42.pcmpestrm128(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, i8 24)
1040 store <16 x i8> %mask, <16 x i8>* %mptr
1041 store i32 %flag, i32* %fptr
; Requests both the index result (pcmpestri) and the mask result (pcmpestrm)
; for identical operands and control byte 24, then stores the mask through
; %mptr and the index through %iptr. The expected assembly issues both
; instructions: %xmm0 is first copied to %xmm2, pcmpestrm runs on %xmm0,
; pcmpestri runs on the %xmm2 copy, and the mask is stored from %xmm0 while
; the index is stored from %ecx.
1045 define void @pcmpestr_mask_index(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, <16 x i8>* %mptr, i32* %iptr) nounwind {
1046 ; X86-LABEL: pcmpestr_mask_index:
1047 ; X86: # %bb.0: # %entry
1048 ; X86-NEXT: pushl %edi
1049 ; X86-NEXT: pushl %esi
1050 ; X86-NEXT: movdqa %xmm0, %xmm2
1051 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
1052 ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
1053 ; X86-NEXT: pcmpestrm $24, %xmm1, %xmm0
1054 ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
1055 ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
1056 ; X86-NEXT: pcmpestri $24, %xmm1, %xmm2
1057 ; X86-NEXT: movdqa %xmm0, (%edi)
1058 ; X86-NEXT: movl %ecx, (%esi)
1059 ; X86-NEXT: popl %esi
1060 ; X86-NEXT: popl %edi
1063 ; X64-LABEL: pcmpestr_mask_index:
1064 ; X64: # %bb.0: # %entry
1065 ; X64-NEXT: movq %rcx, %r8
1066 ; X64-NEXT: movq %rdx, %r9
1067 ; X64-NEXT: movl %esi, %edx
1068 ; X64-NEXT: movl %edi, %eax
1069 ; X64-NEXT: movdqa %xmm0, %xmm2
1070 ; X64-NEXT: pcmpestrm $24, %xmm1, %xmm0
1071 ; X64-NEXT: pcmpestri $24, %xmm1, %xmm2
1072 ; X64-NEXT: movdqa %xmm0, (%r9)
1073 ; X64-NEXT: movl %ecx, (%r8)
; Both calls take the same arguments; only the intrinsic differs.
1076 %index = call i32 @llvm.x86.sse42.pcmpestri128(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, i8 24)
1077 %mask = call <16 x i8> @llvm.x86.sse42.pcmpestrm128(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, i8 24)
1078 store <16 x i8> %mask, <16 x i8>* %mptr
1079 store i32 %index, i32* %iptr
; Requests all three results (index, mask and flag) of the explicit-length
; string compare for identical operands and control byte 24, then stores them
; through %mptr, %iptr and %fptr. The expected assembly on both targets copies
; %xmm0 to %xmm2, runs pcmpestrm on %xmm0, then runs pcmpestri on the copy and
; reads the flag with setb immediately after that second instruction.
1083 define void @pcmpestr_mask_index_flag(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, <16 x i8>* %mptr, i32* %iptr, i32* %fptr) nounwind {
1084 ; X86-LABEL: pcmpestr_mask_index_flag:
1085 ; X86: # %bb.0: # %entry
1086 ; X86-NEXT: pushl %ebp
1087 ; X86-NEXT: pushl %ebx
1088 ; X86-NEXT: pushl %edi
1089 ; X86-NEXT: pushl %esi
1090 ; X86-NEXT: movdqa %xmm0, %xmm2
1091 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
1092 ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
1093 ; X86-NEXT: pcmpestrm $24, %xmm1, %xmm0
1094 ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
1095 ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
1096 ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
1097 ; X86-NEXT: xorl %ebx, %ebx
1098 ; X86-NEXT: pcmpestri $24, %xmm1, %xmm2
1099 ; X86-NEXT: setb %bl
1100 ; X86-NEXT: movdqa %xmm0, (%ebp)
1101 ; X86-NEXT: movl %ecx, (%edi)
1102 ; X86-NEXT: movl %ebx, (%esi)
1103 ; X86-NEXT: popl %esi
1104 ; X86-NEXT: popl %edi
1105 ; X86-NEXT: popl %ebx
1106 ; X86-NEXT: popl %ebp
1109 ; X64-LABEL: pcmpestr_mask_index_flag:
1110 ; X64: # %bb.0: # %entry
1111 ; X64-NEXT: movq %rcx, %r9
1112 ; X64-NEXT: movq %rdx, %r10
1113 ; X64-NEXT: movl %esi, %edx
1114 ; X64-NEXT: movl %edi, %eax
1115 ; X64-NEXT: movdqa %xmm0, %xmm2
1116 ; X64-NEXT: pcmpestrm $24, %xmm1, %xmm0
1117 ; X64-NEXT: xorl %esi, %esi
1118 ; X64-NEXT: pcmpestri $24, %xmm1, %xmm2
1119 ; X64-NEXT: setb %sil
1120 ; X64-NEXT: movdqa %xmm0, (%r10)
1121 ; X64-NEXT: movl %ecx, (%r9)
1122 ; X64-NEXT: movl %esi, (%r8)
; All three calls take the same arguments; only the intrinsic differs.
1125 %index = call i32 @llvm.x86.sse42.pcmpestri128(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, i8 24)
1126 %mask = call <16 x i8> @llvm.x86.sse42.pcmpestrm128(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, i8 24)
1127 %flag = call i32 @llvm.x86.sse42.pcmpestric128(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, i8 24)
1128 store <16 x i8> %mask, <16 x i8>* %mptr
1129 store i32 %index, i32* %iptr
1130 store i32 %flag, i32* %fptr
; Variant of the index+flag test using the pcmpistr* intrinsics, which take no
; explicit length operands. Requests the flag (pcmpistric) and the index
; (pcmpistri) for identical operands and control byte 24, then stores the index
; through %iptr and the flag through %fptr. The expected assembly on both
; targets is one pcmpistri followed by setb, with the index taken from %ecx.
1134 define void @pcmpistr_index_flag(<16 x i8> %lhs, <16 x i8> %rhs, i32* %iptr, i32* %fptr) nounwind {
1135 ; X86-LABEL: pcmpistr_index_flag:
1136 ; X86: # %bb.0: # %entry
1137 ; X86-NEXT: pushl %ebx
1138 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
1139 ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
1140 ; X86-NEXT: xorl %ebx, %ebx
1141 ; X86-NEXT: pcmpistri $24, %xmm1, %xmm0
1142 ; X86-NEXT: setb %bl
1143 ; X86-NEXT: movl %ecx, (%edx)
1144 ; X86-NEXT: movl %ebx, (%eax)
1145 ; X86-NEXT: popl %ebx
1148 ; X64-LABEL: pcmpistr_index_flag:
1149 ; X64: # %bb.0: # %entry
1150 ; X64-NEXT: xorl %eax, %eax
1151 ; X64-NEXT: pcmpistri $24, %xmm1, %xmm0
1152 ; X64-NEXT: setb %al
1153 ; X64-NEXT: movl %ecx, (%rdi)
1154 ; X64-NEXT: movl %eax, (%rsi)
; Both calls take the same arguments; only the intrinsic differs.
1157 %flag = call i32 @llvm.x86.sse42.pcmpistric128(<16 x i8> %lhs, <16 x i8> %rhs, i8 24)
1158 %index = call i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %lhs, <16 x i8> %rhs, i8 24)
1159 store i32 %index, i32* %iptr
1160 store i32 %flag, i32* %fptr
; Variant of the mask+flag test using the pcmpistr* intrinsics, which take no
; explicit length operands. Requests the flag (pcmpistric) and the mask
; (pcmpistrm) for identical operands and control byte 24, then stores the mask
; through %mptr and the flag through %fptr. The expected assembly on both
; targets is one pcmpistrm followed by setb, with the mask stored from %xmm0.
1164 define void @pcmpistr_mask_flag(<16 x i8> %lhs, <16 x i8> %rhs, <16 x i8>* %mptr, i32* %fptr) nounwind {
1165 ; X86-LABEL: pcmpistr_mask_flag:
1166 ; X86: # %bb.0: # %entry
1167 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
1168 ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
1169 ; X86-NEXT: xorl %edx, %edx
1170 ; X86-NEXT: pcmpistrm $24, %xmm1, %xmm0
1171 ; X86-NEXT: setb %dl
1172 ; X86-NEXT: movdqa %xmm0, (%ecx)
1173 ; X86-NEXT: movl %edx, (%eax)
1176 ; X64-LABEL: pcmpistr_mask_flag:
1177 ; X64: # %bb.0: # %entry
1178 ; X64-NEXT: xorl %eax, %eax
1179 ; X64-NEXT: pcmpistrm $24, %xmm1, %xmm0
1180 ; X64-NEXT: setb %al
1181 ; X64-NEXT: movdqa %xmm0, (%rdi)
1182 ; X64-NEXT: movl %eax, (%rsi)
; Both calls take the same arguments; only the intrinsic differs.
1185 %flag = call i32 @llvm.x86.sse42.pcmpistric128(<16 x i8> %lhs, <16 x i8> %rhs, i8 24)
1186 %mask = call <16 x i8> @llvm.x86.sse42.pcmpistrm128(<16 x i8> %lhs, <16 x i8> %rhs, i8 24)
1187 store <16 x i8> %mask, <16 x i8>* %mptr
1188 store i32 %flag, i32* %fptr
; Variant of the mask+index test using the pcmpistr* intrinsics, which take no
; explicit length operands. Requests the index (pcmpistri) and the mask
; (pcmpistrm) for identical operands and control byte 24, then stores the mask
; through %mptr and the index through %iptr. The expected assembly issues both
; instructions on the same registers, pcmpistri first and pcmpistrm second,
; with no register copy in between.
1192 define void @pcmpistr_mask_index(<16 x i8> %lhs, <16 x i8> %rhs, <16 x i8>* %mptr, i32* %iptr) nounwind {
1193 ; X86-LABEL: pcmpistr_mask_index:
1194 ; X86: # %bb.0: # %entry
1195 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
1196 ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
1197 ; X86-NEXT: pcmpistri $24, %xmm1, %xmm0
1198 ; X86-NEXT: pcmpistrm $24, %xmm1, %xmm0
1199 ; X86-NEXT: movdqa %xmm0, (%edx)
1200 ; X86-NEXT: movl %ecx, (%eax)
1203 ; X64-LABEL: pcmpistr_mask_index:
1204 ; X64: # %bb.0: # %entry
1205 ; X64-NEXT: pcmpistri $24, %xmm1, %xmm0
1206 ; X64-NEXT: pcmpistrm $24, %xmm1, %xmm0
1207 ; X64-NEXT: movdqa %xmm0, (%rdi)
1208 ; X64-NEXT: movl %ecx, (%rsi)
; Both calls take the same arguments; only the intrinsic differs.
1211 %index = call i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %lhs, <16 x i8> %rhs, i8 24)
1212 %mask = call <16 x i8> @llvm.x86.sse42.pcmpistrm128(<16 x i8> %lhs, <16 x i8> %rhs, i8 24)
1213 store <16 x i8> %mask, <16 x i8>* %mptr
1214 store i32 %index, i32* %iptr
; Requests all three results (index, mask and flag) from the pcmpistr*
; intrinsics for identical operands and control byte 24, then stores them
; through %mptr, %iptr and %fptr. The expected assembly on both targets copies
; %xmm0 to %xmm2, runs pcmpistrm on %xmm0, then runs pcmpistri on the copy and
; reads the flag with setb immediately after that second instruction.
1218 define void @pcmpistr_mask_index_flag(<16 x i8> %lhs, <16 x i8> %rhs, <16 x i8>* %mptr, i32* %iptr, i32* %fptr) nounwind {
1219 ; X86-LABEL: pcmpistr_mask_index_flag:
1220 ; X86: # %bb.0: # %entry
1221 ; X86-NEXT: pushl %ebx
1222 ; X86-NEXT: pushl %esi
1223 ; X86-NEXT: movdqa %xmm0, %xmm2
1224 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
1225 ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
1226 ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
1227 ; X86-NEXT: pcmpistrm $24, %xmm1, %xmm0
1228 ; X86-NEXT: xorl %ebx, %ebx
1229 ; X86-NEXT: pcmpistri $24, %xmm1, %xmm2
1230 ; X86-NEXT: setb %bl
1231 ; X86-NEXT: movdqa %xmm0, (%esi)
1232 ; X86-NEXT: movl %ecx, (%edx)
1233 ; X86-NEXT: movl %ebx, (%eax)
1234 ; X86-NEXT: popl %esi
1235 ; X86-NEXT: popl %ebx
1238 ; X64-LABEL: pcmpistr_mask_index_flag:
1239 ; X64: # %bb.0: # %entry
1240 ; X64-NEXT: movdqa %xmm0, %xmm2
1241 ; X64-NEXT: pcmpistrm $24, %xmm1, %xmm0
1242 ; X64-NEXT: xorl %eax, %eax
1243 ; X64-NEXT: pcmpistri $24, %xmm1, %xmm2
1244 ; X64-NEXT: setb %al
1245 ; X64-NEXT: movdqa %xmm0, (%rdi)
1246 ; X64-NEXT: movl %ecx, (%rsi)
1247 ; X64-NEXT: movl %eax, (%rdx)
; All three calls take the same arguments; only the intrinsic differs.
1250 %index = call i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %lhs, <16 x i8> %rhs, i8 24)
1251 %mask = call <16 x i8> @llvm.x86.sse42.pcmpistrm128(<16 x i8> %lhs, <16 x i8> %rhs, i8 24)
1252 %flag = call i32 @llvm.x86.sse42.pcmpistric128(<16 x i8> %lhs, <16 x i8> %rhs, i8 24)
1253 store <16 x i8> %mask, <16 x i8>* %mptr
1254 store i32 %index, i32* %iptr
1255 store i32 %flag, i32* %fptr
1259 ; Make sure we don't fold loads when we need to emit pcmpistrm and pcmpistri.
; Here %rhs comes from an align-1 vector load and is used by all three
; intrinsic calls. The expected assembly on both targets loads it once into
; %xmm2 with movdqu and then passes %xmm2 as a register operand to both
; pcmpistrm and pcmpistri, instead of using a memory operand on either one.
1260 define void @pcmpistr_mask_index_flag_load(<16 x i8> %lhs, <16 x i8>* %rhsptr, <16 x i8>* %mptr, i32* %iptr, i32* %fptr) nounwind {
1261 ; X86-LABEL: pcmpistr_mask_index_flag_load:
1262 ; X86: # %bb.0: # %entry
1263 ; X86-NEXT: pushl %ebx
1264 ; X86-NEXT: pushl %esi
1265 ; X86-NEXT: movdqa %xmm0, %xmm1
1266 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
1267 ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
1268 ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
1269 ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
1270 ; X86-NEXT: movdqu (%ecx), %xmm2
1271 ; X86-NEXT: pcmpistrm $24, %xmm2, %xmm0
1272 ; X86-NEXT: xorl %ebx, %ebx
1273 ; X86-NEXT: pcmpistri $24, %xmm2, %xmm1
1274 ; X86-NEXT: setb %bl
1275 ; X86-NEXT: movdqa %xmm0, (%esi)
1276 ; X86-NEXT: movl %ecx, (%edx)
1277 ; X86-NEXT: movl %ebx, (%eax)
1278 ; X86-NEXT: popl %esi
1279 ; X86-NEXT: popl %ebx
1282 ; X64-LABEL: pcmpistr_mask_index_flag_load:
1283 ; X64: # %bb.0: # %entry
1284 ; X64-NEXT: movq %rcx, %rax
1285 ; X64-NEXT: movdqa %xmm0, %xmm1
1286 ; X64-NEXT: movdqu (%rdi), %xmm2
1287 ; X64-NEXT: pcmpistrm $24, %xmm2, %xmm0
1288 ; X64-NEXT: xorl %edi, %edi
1289 ; X64-NEXT: pcmpistri $24, %xmm2, %xmm1
1290 ; X64-NEXT: setb %dil
1291 ; X64-NEXT: movdqa %xmm0, (%rsi)
1292 ; X64-NEXT: movl %ecx, (%rdx)
1293 ; X64-NEXT: movl %edi, (%rax)
; The loaded value feeds every call below, so it has more than one user.
1296 %rhs = load <16 x i8>, <16 x i8>* %rhsptr, align 1
1297 %index = call i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %lhs, <16 x i8> %rhs, i8 24)
1298 %mask = call <16 x i8> @llvm.x86.sse42.pcmpistrm128(<16 x i8> %lhs, <16 x i8> %rhs, i8 24)
1299 %flag = call i32 @llvm.x86.sse42.pcmpistric128(<16 x i8> %lhs, <16 x i8> %rhs, i8 24)
1300 store <16 x i8> %mask, <16 x i8>* %mptr
1301 store i32 %index, i32* %iptr
1302 store i32 %flag, i32* %fptr
1306 ; Make sure we don't fold nontemporal loads.
1307 define i32 @pcmpestri_nontemporal(<16 x i8> %lhs, i32 %lhs_len, <16 x i8>* %rhsptr, i32 %rhs_len) nounwind {
1308 ; X86-LABEL: pcmpestri_nontemporal:
1309 ; X86: # %bb.0: # %entry
1310 ; X86-NEXT: pushl %ebx
1311 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
1312 ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
1313 ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
1314 ; X86-NEXT: movntdqa (%ecx), %xmm1
1315 ; X86-NEXT: xorl %ebx, %ebx
1316 ; X86-NEXT: pcmpestri $24, %xmm1, %xmm0
1317 ; X86-NEXT: setb %bl
1318 ; X86-NEXT: movl %ebx, %eax
1319 ; X86-NEXT: popl %ebx
1322 ; X64-LABEL: pcmpestri_nontemporal:
1323 ; X64: # %bb.0: # %entry
1324 ; X64-NEXT: movl %edi, %eax
1325 ; X64-NEXT: movntdqa (%rsi), %xmm1
1326 ; X64-NEXT: xorl %esi, %esi
1327 ; X64-NEXT: pcmpestri $24, %xmm1, %xmm0
1328 ; X64-NEXT: setb %sil
1329 ; X64-NEXT: movl %esi, %eax
1332 %rhs = load <16 x i8>, <16 x i8>* %rhsptr, align 16, !nontemporal !0
1333 %flag = call i32 @llvm.x86.sse42.pcmpestric128(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, i8 24)