1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -mtriple=x86_64-unknown-unknown < %s -mattr=sse | FileCheck %s --check-prefix=SSE
3 ; RUN: llc -mtriple=x86_64-unknown-unknown < %s -mattr=sse2 | FileCheck %s --check-prefix=SSE
4 ; RUN: llc -mtriple=x86_64-unknown-unknown < %s -mattr=sse2,-slow-unaligned-mem-16 | FileCheck %s --check-prefix=SSE2FAST
5 ; RUN: llc -mtriple=x86_64-unknown-unknown < %s -mattr=avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
6 ; RUN: llc -mtriple=x86_64-unknown-unknown < %s -mattr=avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
8 ; https://llvm.org/bugs/show_bug.cgi?id=27100
; 16-byte memset of constant byte 42 through __memset_chk (object size -1 =
; unknown, so the check is elided): lowered to inline stores, not a libcall.
; SSE: two 8-byte stores of the splatted byte (imm 0x2A2A...2A).
; SSE2FAST/AVX: one unaligned 16-byte vector store.
10 define void @memset_16_nonzero_bytes(i8* %x) {
11 ; SSE-LABEL: memset_16_nonzero_bytes:
13 ; SSE-NEXT: movabsq $3038287259199220266, %rax # imm = 0x2A2A2A2A2A2A2A2A
14 ; SSE-NEXT: movq %rax, 8(%rdi)
15 ; SSE-NEXT: movq %rax, (%rdi)
18 ; SSE2FAST-LABEL: memset_16_nonzero_bytes:
20 ; SSE2FAST-NEXT: movaps {{.*#+}} xmm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]
21 ; SSE2FAST-NEXT: movups %xmm0, (%rdi)
24 ; AVX-LABEL: memset_16_nonzero_bytes:
26 ; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]
27 ; AVX-NEXT: vmovups %xmm0, (%rdi)
29 %call = tail call i8* @__memset_chk(i8* %x, i32 42, i64 16, i64 -1)
; 32-byte constant memset: SSE uses four 8-byte scalar stores; SSE2FAST two
; unaligned 16-byte stores; AVX one 32-byte ymm store (with vzeroupper since
; a 256-bit register was used).
33 define void @memset_32_nonzero_bytes(i8* %x) {
34 ; SSE-LABEL: memset_32_nonzero_bytes:
36 ; SSE-NEXT: movabsq $3038287259199220266, %rax # imm = 0x2A2A2A2A2A2A2A2A
37 ; SSE-NEXT: movq %rax, 24(%rdi)
38 ; SSE-NEXT: movq %rax, 16(%rdi)
39 ; SSE-NEXT: movq %rax, 8(%rdi)
40 ; SSE-NEXT: movq %rax, (%rdi)
43 ; SSE2FAST-LABEL: memset_32_nonzero_bytes:
45 ; SSE2FAST-NEXT: movaps {{.*#+}} xmm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]
46 ; SSE2FAST-NEXT: movups %xmm0, 16(%rdi)
47 ; SSE2FAST-NEXT: movups %xmm0, (%rdi)
50 ; AVX-LABEL: memset_32_nonzero_bytes:
52 ; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]
53 ; AVX-NEXT: vmovups %ymm0, (%rdi)
54 ; AVX-NEXT: vzeroupper
56 %call = tail call i8* @__memset_chk(i8* %x, i32 42, i64 32, i64 -1)
; 64-byte constant memset: still fully inlined on all targets — eight movq
; (SSE), four unaligned xmm stores (SSE2FAST), or two ymm stores (AVX).
60 define void @memset_64_nonzero_bytes(i8* %x) {
61 ; SSE-LABEL: memset_64_nonzero_bytes:
63 ; SSE-NEXT: movabsq $3038287259199220266, %rax # imm = 0x2A2A2A2A2A2A2A2A
64 ; SSE-NEXT: movq %rax, 56(%rdi)
65 ; SSE-NEXT: movq %rax, 48(%rdi)
66 ; SSE-NEXT: movq %rax, 40(%rdi)
67 ; SSE-NEXT: movq %rax, 32(%rdi)
68 ; SSE-NEXT: movq %rax, 24(%rdi)
69 ; SSE-NEXT: movq %rax, 16(%rdi)
70 ; SSE-NEXT: movq %rax, 8(%rdi)
71 ; SSE-NEXT: movq %rax, (%rdi)
74 ; SSE2FAST-LABEL: memset_64_nonzero_bytes:
76 ; SSE2FAST-NEXT: movaps {{.*#+}} xmm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]
77 ; SSE2FAST-NEXT: movups %xmm0, 48(%rdi)
78 ; SSE2FAST-NEXT: movups %xmm0, 32(%rdi)
79 ; SSE2FAST-NEXT: movups %xmm0, 16(%rdi)
80 ; SSE2FAST-NEXT: movups %xmm0, (%rdi)
83 ; AVX-LABEL: memset_64_nonzero_bytes:
85 ; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]
86 ; AVX-NEXT: vmovups %ymm0, 32(%rdi)
87 ; AVX-NEXT: vmovups %ymm0, (%rdi)
88 ; AVX-NEXT: vzeroupper
90 %call = tail call i8* @__memset_chk(i8* %x, i32 42, i64 64, i64 -1)
; 128-byte constant memset: the largest size still inlined by the scalar SSE
; path — sixteen movq (SSE), eight xmm stores (SSE2FAST), four ymm stores (AVX).
94 define void @memset_128_nonzero_bytes(i8* %x) {
95 ; SSE-LABEL: memset_128_nonzero_bytes:
97 ; SSE-NEXT: movabsq $3038287259199220266, %rax # imm = 0x2A2A2A2A2A2A2A2A
98 ; SSE-NEXT: movq %rax, 120(%rdi)
99 ; SSE-NEXT: movq %rax, 112(%rdi)
100 ; SSE-NEXT: movq %rax, 104(%rdi)
101 ; SSE-NEXT: movq %rax, 96(%rdi)
102 ; SSE-NEXT: movq %rax, 88(%rdi)
103 ; SSE-NEXT: movq %rax, 80(%rdi)
104 ; SSE-NEXT: movq %rax, 72(%rdi)
105 ; SSE-NEXT: movq %rax, 64(%rdi)
106 ; SSE-NEXT: movq %rax, 56(%rdi)
107 ; SSE-NEXT: movq %rax, 48(%rdi)
108 ; SSE-NEXT: movq %rax, 40(%rdi)
109 ; SSE-NEXT: movq %rax, 32(%rdi)
110 ; SSE-NEXT: movq %rax, 24(%rdi)
111 ; SSE-NEXT: movq %rax, 16(%rdi)
112 ; SSE-NEXT: movq %rax, 8(%rdi)
113 ; SSE-NEXT: movq %rax, (%rdi)
116 ; SSE2FAST-LABEL: memset_128_nonzero_bytes:
118 ; SSE2FAST-NEXT: movaps {{.*#+}} xmm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]
119 ; SSE2FAST-NEXT: movups %xmm0, 112(%rdi)
120 ; SSE2FAST-NEXT: movups %xmm0, 96(%rdi)
121 ; SSE2FAST-NEXT: movups %xmm0, 80(%rdi)
122 ; SSE2FAST-NEXT: movups %xmm0, 64(%rdi)
123 ; SSE2FAST-NEXT: movups %xmm0, 48(%rdi)
124 ; SSE2FAST-NEXT: movups %xmm0, 32(%rdi)
125 ; SSE2FAST-NEXT: movups %xmm0, 16(%rdi)
126 ; SSE2FAST-NEXT: movups %xmm0, (%rdi)
127 ; SSE2FAST-NEXT: retq
129 ; AVX-LABEL: memset_128_nonzero_bytes:
131 ; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]
132 ; AVX-NEXT: vmovups %ymm0, 96(%rdi)
133 ; AVX-NEXT: vmovups %ymm0, 64(%rdi)
134 ; AVX-NEXT: vmovups %ymm0, 32(%rdi)
135 ; AVX-NEXT: vmovups %ymm0, (%rdi)
136 ; AVX-NEXT: vzeroupper
138 %call = tail call i8* @__memset_chk(i8* %x, i32 42, i64 128, i64 -1)
; 256-byte constant memset: past the inline-store threshold for plain SSE, so
; that path falls back to calling memset (note the stack-alignment push/pop);
; SSE2FAST and AVX still inline it with 16 xmm / 8 ymm unaligned stores.
142 define void @memset_256_nonzero_bytes(i8* %x) {
143 ; SSE-LABEL: memset_256_nonzero_bytes:
145 ; SSE-NEXT: pushq %rax
146 ; SSE-NEXT: .cfi_def_cfa_offset 16
147 ; SSE-NEXT: movl $42, %esi
148 ; SSE-NEXT: movl $256, %edx # imm = 0x100
149 ; SSE-NEXT: callq memset
150 ; SSE-NEXT: popq %rax
153 ; SSE2FAST-LABEL: memset_256_nonzero_bytes:
155 ; SSE2FAST-NEXT: movaps {{.*#+}} xmm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]
156 ; SSE2FAST-NEXT: movups %xmm0, 240(%rdi)
157 ; SSE2FAST-NEXT: movups %xmm0, 224(%rdi)
158 ; SSE2FAST-NEXT: movups %xmm0, 208(%rdi)
159 ; SSE2FAST-NEXT: movups %xmm0, 192(%rdi)
160 ; SSE2FAST-NEXT: movups %xmm0, 176(%rdi)
161 ; SSE2FAST-NEXT: movups %xmm0, 160(%rdi)
162 ; SSE2FAST-NEXT: movups %xmm0, 144(%rdi)
163 ; SSE2FAST-NEXT: movups %xmm0, 128(%rdi)
164 ; SSE2FAST-NEXT: movups %xmm0, 112(%rdi)
165 ; SSE2FAST-NEXT: movups %xmm0, 96(%rdi)
166 ; SSE2FAST-NEXT: movups %xmm0, 80(%rdi)
167 ; SSE2FAST-NEXT: movups %xmm0, 64(%rdi)
168 ; SSE2FAST-NEXT: movups %xmm0, 48(%rdi)
169 ; SSE2FAST-NEXT: movups %xmm0, 32(%rdi)
170 ; SSE2FAST-NEXT: movups %xmm0, 16(%rdi)
171 ; SSE2FAST-NEXT: movups %xmm0, (%rdi)
172 ; SSE2FAST-NEXT: retq
174 ; AVX-LABEL: memset_256_nonzero_bytes:
176 ; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]
177 ; AVX-NEXT: vmovups %ymm0, 224(%rdi)
178 ; AVX-NEXT: vmovups %ymm0, 192(%rdi)
179 ; AVX-NEXT: vmovups %ymm0, 160(%rdi)
180 ; AVX-NEXT: vmovups %ymm0, 128(%rdi)
181 ; AVX-NEXT: vmovups %ymm0, 96(%rdi)
182 ; AVX-NEXT: vmovups %ymm0, 64(%rdi)
183 ; AVX-NEXT: vmovups %ymm0, 32(%rdi)
184 ; AVX-NEXT: vmovups %ymm0, (%rdi)
185 ; AVX-NEXT: vzeroupper
187 %call = tail call i8* @__memset_chk(i8* %x, i32 42, i64 256, i64 -1)
; Fortified memset: (dst, fill, len, object_size). The tests pass -1
; (unknown object size), which allows the call to lower like plain memset.
191 declare i8* @__memset_chk(i8*, i32, i64, i64)
193 ; Repeat with a non-constant value for the stores.
; 16-byte memset with a runtime fill byte %c (llvm.memset intrinsic, align 1).
; SSE splats the byte across a GPR by multiplying with 0x0101010101010101;
; SSE2FAST splats via punpcklbw/pshuflw/pshufd; AVX1 broadcasts with vpshufb
; against a zero mask; AVX2 uses vpbroadcastb. All then store unaligned.
195 define void @memset_16_nonconst_bytes(i8* %x, i8 %c) {
196 ; SSE-LABEL: memset_16_nonconst_bytes:
198 ; SSE-NEXT: movzbl %sil, %eax
199 ; SSE-NEXT: movabsq $72340172838076673, %rcx # imm = 0x101010101010101
200 ; SSE-NEXT: imulq %rax, %rcx
201 ; SSE-NEXT: movq %rcx, 8(%rdi)
202 ; SSE-NEXT: movq %rcx, (%rdi)
205 ; SSE2FAST-LABEL: memset_16_nonconst_bytes:
207 ; SSE2FAST-NEXT: movd %esi, %xmm0
208 ; SSE2FAST-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
209 ; SSE2FAST-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
210 ; SSE2FAST-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
211 ; SSE2FAST-NEXT: movdqu %xmm0, (%rdi)
212 ; SSE2FAST-NEXT: retq
214 ; AVX1-LABEL: memset_16_nonconst_bytes:
216 ; AVX1-NEXT: vmovd %esi, %xmm0
217 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
218 ; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0
219 ; AVX1-NEXT: vmovdqu %xmm0, (%rdi)
222 ; AVX2-LABEL: memset_16_nonconst_bytes:
224 ; AVX2-NEXT: vmovd %esi, %xmm0
225 ; AVX2-NEXT: vpbroadcastb %xmm0, %xmm0
226 ; AVX2-NEXT: vmovdqu %xmm0, (%rdi)
228 tail call void @llvm.memset.p0i8.i64(i8* %x, i8 %c, i64 16, i32 1, i1 false)
; 32-byte non-constant memset: same splat strategies as the 16-byte case, then
; four movq (SSE), two movdqu (SSE2FAST), or one ymm store. AVX1 widens the
; xmm splat with vinsertf128 since it lacks vpbroadcastb.
232 define void @memset_32_nonconst_bytes(i8* %x, i8 %c) {
233 ; SSE-LABEL: memset_32_nonconst_bytes:
235 ; SSE-NEXT: movzbl %sil, %eax
236 ; SSE-NEXT: movabsq $72340172838076673, %rcx # imm = 0x101010101010101
237 ; SSE-NEXT: imulq %rax, %rcx
238 ; SSE-NEXT: movq %rcx, 24(%rdi)
239 ; SSE-NEXT: movq %rcx, 16(%rdi)
240 ; SSE-NEXT: movq %rcx, 8(%rdi)
241 ; SSE-NEXT: movq %rcx, (%rdi)
244 ; SSE2FAST-LABEL: memset_32_nonconst_bytes:
246 ; SSE2FAST-NEXT: movd %esi, %xmm0
247 ; SSE2FAST-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
248 ; SSE2FAST-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
249 ; SSE2FAST-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
250 ; SSE2FAST-NEXT: movdqu %xmm0, 16(%rdi)
251 ; SSE2FAST-NEXT: movdqu %xmm0, (%rdi)
252 ; SSE2FAST-NEXT: retq
254 ; AVX1-LABEL: memset_32_nonconst_bytes:
256 ; AVX1-NEXT: vmovd %esi, %xmm0
257 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
258 ; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0
259 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
260 ; AVX1-NEXT: vmovups %ymm0, (%rdi)
261 ; AVX1-NEXT: vzeroupper
264 ; AVX2-LABEL: memset_32_nonconst_bytes:
266 ; AVX2-NEXT: vmovd %esi, %xmm0
267 ; AVX2-NEXT: vpbroadcastb %xmm0, %ymm0
268 ; AVX2-NEXT: vmovdqu %ymm0, (%rdi)
269 ; AVX2-NEXT: vzeroupper
271 tail call void @llvm.memset.p0i8.i64(i8* %x, i8 %c, i64 32, i32 1, i1 false)
; 64-byte non-constant memset: splat once, then eight movq (SSE), four movdqu
; (SSE2FAST), or two ymm stores (AVX1/AVX2).
275 define void @memset_64_nonconst_bytes(i8* %x, i8 %c) {
276 ; SSE-LABEL: memset_64_nonconst_bytes:
278 ; SSE-NEXT: movzbl %sil, %eax
279 ; SSE-NEXT: movabsq $72340172838076673, %rcx # imm = 0x101010101010101
280 ; SSE-NEXT: imulq %rax, %rcx
281 ; SSE-NEXT: movq %rcx, 56(%rdi)
282 ; SSE-NEXT: movq %rcx, 48(%rdi)
283 ; SSE-NEXT: movq %rcx, 40(%rdi)
284 ; SSE-NEXT: movq %rcx, 32(%rdi)
285 ; SSE-NEXT: movq %rcx, 24(%rdi)
286 ; SSE-NEXT: movq %rcx, 16(%rdi)
287 ; SSE-NEXT: movq %rcx, 8(%rdi)
288 ; SSE-NEXT: movq %rcx, (%rdi)
291 ; SSE2FAST-LABEL: memset_64_nonconst_bytes:
293 ; SSE2FAST-NEXT: movd %esi, %xmm0
294 ; SSE2FAST-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
295 ; SSE2FAST-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
296 ; SSE2FAST-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
297 ; SSE2FAST-NEXT: movdqu %xmm0, 48(%rdi)
298 ; SSE2FAST-NEXT: movdqu %xmm0, 32(%rdi)
299 ; SSE2FAST-NEXT: movdqu %xmm0, 16(%rdi)
300 ; SSE2FAST-NEXT: movdqu %xmm0, (%rdi)
301 ; SSE2FAST-NEXT: retq
303 ; AVX1-LABEL: memset_64_nonconst_bytes:
305 ; AVX1-NEXT: vmovd %esi, %xmm0
306 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
307 ; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0
308 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
309 ; AVX1-NEXT: vmovups %ymm0, 32(%rdi)
310 ; AVX1-NEXT: vmovups %ymm0, (%rdi)
311 ; AVX1-NEXT: vzeroupper
314 ; AVX2-LABEL: memset_64_nonconst_bytes:
316 ; AVX2-NEXT: vmovd %esi, %xmm0
317 ; AVX2-NEXT: vpbroadcastb %xmm0, %ymm0
318 ; AVX2-NEXT: vmovdqu %ymm0, 32(%rdi)
319 ; AVX2-NEXT: vmovdqu %ymm0, (%rdi)
320 ; AVX2-NEXT: vzeroupper
322 tail call void @llvm.memset.p0i8.i64(i8* %x, i8 %c, i64 64, i32 1, i1 false)
; 128-byte non-constant memset: splat once, then sixteen movq (SSE), eight
; movdqu (SSE2FAST), or four ymm stores (AVX1/AVX2).
326 define void @memset_128_nonconst_bytes(i8* %x, i8 %c) {
327 ; SSE-LABEL: memset_128_nonconst_bytes:
329 ; SSE-NEXT: movzbl %sil, %eax
330 ; SSE-NEXT: movabsq $72340172838076673, %rcx # imm = 0x101010101010101
331 ; SSE-NEXT: imulq %rax, %rcx
332 ; SSE-NEXT: movq %rcx, 120(%rdi)
333 ; SSE-NEXT: movq %rcx, 112(%rdi)
334 ; SSE-NEXT: movq %rcx, 104(%rdi)
335 ; SSE-NEXT: movq %rcx, 96(%rdi)
336 ; SSE-NEXT: movq %rcx, 88(%rdi)
337 ; SSE-NEXT: movq %rcx, 80(%rdi)
338 ; SSE-NEXT: movq %rcx, 72(%rdi)
339 ; SSE-NEXT: movq %rcx, 64(%rdi)
340 ; SSE-NEXT: movq %rcx, 56(%rdi)
341 ; SSE-NEXT: movq %rcx, 48(%rdi)
342 ; SSE-NEXT: movq %rcx, 40(%rdi)
343 ; SSE-NEXT: movq %rcx, 32(%rdi)
344 ; SSE-NEXT: movq %rcx, 24(%rdi)
345 ; SSE-NEXT: movq %rcx, 16(%rdi)
346 ; SSE-NEXT: movq %rcx, 8(%rdi)
347 ; SSE-NEXT: movq %rcx, (%rdi)
350 ; SSE2FAST-LABEL: memset_128_nonconst_bytes:
352 ; SSE2FAST-NEXT: movd %esi, %xmm0
353 ; SSE2FAST-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
354 ; SSE2FAST-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
355 ; SSE2FAST-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
356 ; SSE2FAST-NEXT: movdqu %xmm0, 112(%rdi)
357 ; SSE2FAST-NEXT: movdqu %xmm0, 96(%rdi)
358 ; SSE2FAST-NEXT: movdqu %xmm0, 80(%rdi)
359 ; SSE2FAST-NEXT: movdqu %xmm0, 64(%rdi)
360 ; SSE2FAST-NEXT: movdqu %xmm0, 48(%rdi)
361 ; SSE2FAST-NEXT: movdqu %xmm0, 32(%rdi)
362 ; SSE2FAST-NEXT: movdqu %xmm0, 16(%rdi)
363 ; SSE2FAST-NEXT: movdqu %xmm0, (%rdi)
364 ; SSE2FAST-NEXT: retq
366 ; AVX1-LABEL: memset_128_nonconst_bytes:
368 ; AVX1-NEXT: vmovd %esi, %xmm0
369 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
370 ; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0
371 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
372 ; AVX1-NEXT: vmovups %ymm0, 96(%rdi)
373 ; AVX1-NEXT: vmovups %ymm0, 64(%rdi)
374 ; AVX1-NEXT: vmovups %ymm0, 32(%rdi)
375 ; AVX1-NEXT: vmovups %ymm0, (%rdi)
376 ; AVX1-NEXT: vzeroupper
379 ; AVX2-LABEL: memset_128_nonconst_bytes:
381 ; AVX2-NEXT: vmovd %esi, %xmm0
382 ; AVX2-NEXT: vpbroadcastb %xmm0, %ymm0
383 ; AVX2-NEXT: vmovdqu %ymm0, 96(%rdi)
384 ; AVX2-NEXT: vmovdqu %ymm0, 64(%rdi)
385 ; AVX2-NEXT: vmovdqu %ymm0, 32(%rdi)
386 ; AVX2-NEXT: vmovdqu %ymm0, (%rdi)
387 ; AVX2-NEXT: vzeroupper
389 tail call void @llvm.memset.p0i8.i64(i8* %x, i8 %c, i64 128, i32 1, i1 false)
; 256-byte non-constant memset: plain SSE gives up on inlining and tail-calls
; memset (args already in place: rdi=dst, esi=byte, only edx needs setting);
; SSE2FAST and the AVX targets still splat and inline 16 xmm / 8 ymm stores.
393 define void @memset_256_nonconst_bytes(i8* %x, i8 %c) {
394 ; SSE-LABEL: memset_256_nonconst_bytes:
396 ; SSE-NEXT: movl $256, %edx # imm = 0x100
397 ; SSE-NEXT: jmp memset # TAILCALL
399 ; SSE2FAST-LABEL: memset_256_nonconst_bytes:
401 ; SSE2FAST-NEXT: movd %esi, %xmm0
402 ; SSE2FAST-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
403 ; SSE2FAST-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
404 ; SSE2FAST-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
405 ; SSE2FAST-NEXT: movdqu %xmm0, 240(%rdi)
406 ; SSE2FAST-NEXT: movdqu %xmm0, 224(%rdi)
407 ; SSE2FAST-NEXT: movdqu %xmm0, 208(%rdi)
408 ; SSE2FAST-NEXT: movdqu %xmm0, 192(%rdi)
409 ; SSE2FAST-NEXT: movdqu %xmm0, 176(%rdi)
410 ; SSE2FAST-NEXT: movdqu %xmm0, 160(%rdi)
411 ; SSE2FAST-NEXT: movdqu %xmm0, 144(%rdi)
412 ; SSE2FAST-NEXT: movdqu %xmm0, 128(%rdi)
413 ; SSE2FAST-NEXT: movdqu %xmm0, 112(%rdi)
414 ; SSE2FAST-NEXT: movdqu %xmm0, 96(%rdi)
415 ; SSE2FAST-NEXT: movdqu %xmm0, 80(%rdi)
416 ; SSE2FAST-NEXT: movdqu %xmm0, 64(%rdi)
417 ; SSE2FAST-NEXT: movdqu %xmm0, 48(%rdi)
418 ; SSE2FAST-NEXT: movdqu %xmm0, 32(%rdi)
419 ; SSE2FAST-NEXT: movdqu %xmm0, 16(%rdi)
420 ; SSE2FAST-NEXT: movdqu %xmm0, (%rdi)
421 ; SSE2FAST-NEXT: retq
423 ; AVX1-LABEL: memset_256_nonconst_bytes:
425 ; AVX1-NEXT: vmovd %esi, %xmm0
426 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
427 ; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0
428 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
429 ; AVX1-NEXT: vmovups %ymm0, 224(%rdi)
430 ; AVX1-NEXT: vmovups %ymm0, 192(%rdi)
431 ; AVX1-NEXT: vmovups %ymm0, 160(%rdi)
432 ; AVX1-NEXT: vmovups %ymm0, 128(%rdi)
433 ; AVX1-NEXT: vmovups %ymm0, 96(%rdi)
434 ; AVX1-NEXT: vmovups %ymm0, 64(%rdi)
435 ; AVX1-NEXT: vmovups %ymm0, 32(%rdi)
436 ; AVX1-NEXT: vmovups %ymm0, (%rdi)
437 ; AVX1-NEXT: vzeroupper
440 ; AVX2-LABEL: memset_256_nonconst_bytes:
442 ; AVX2-NEXT: vmovd %esi, %xmm0
443 ; AVX2-NEXT: vpbroadcastb %xmm0, %ymm0
444 ; AVX2-NEXT: vmovdqu %ymm0, 224(%rdi)
445 ; AVX2-NEXT: vmovdqu %ymm0, 192(%rdi)
446 ; AVX2-NEXT: vmovdqu %ymm0, 160(%rdi)
447 ; AVX2-NEXT: vmovdqu %ymm0, 128(%rdi)
448 ; AVX2-NEXT: vmovdqu %ymm0, 96(%rdi)
449 ; AVX2-NEXT: vmovdqu %ymm0, 64(%rdi)
450 ; AVX2-NEXT: vmovdqu %ymm0, 32(%rdi)
451 ; AVX2-NEXT: vmovdqu %ymm0, (%rdi)
452 ; AVX2-NEXT: vzeroupper
454 tail call void @llvm.memset.p0i8.i64(i8* %x, i8 %c, i64 256, i32 1, i1 false)
; Legacy 5-argument memset intrinsic (dst, fill, len, align, isvolatile):
; alignment is still an i32 parameter here rather than an align attribute.
458 declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i32, i1) #1