1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -mtriple=x86_64-unknown-unknown < %s -mattr=sse | FileCheck %s --check-prefix=SSE
3 ; RUN: llc -mtriple=x86_64-unknown-unknown < %s -mattr=sse2 | FileCheck %s --check-prefix=SSE
4 ; RUN: llc -mtriple=x86_64-unknown-unknown < %s -mattr=sse2,-slow-unaligned-mem-16 | FileCheck %s --check-prefix=SSE2FAST
5 ; RUN: llc -mtriple=x86_64-unknown-unknown < %s -mattr=avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
6 ; RUN: llc -mtriple=x86_64-unknown-unknown < %s -mattr=avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
7 ; RUN: llc -mtriple=x86_64-unknown-unknown < %s -mattr=avx512f -mattr=+prefer-256-bit | FileCheck %s --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512-ymm
8 ; RUN: llc -mtriple=x86_64-unknown-unknown < %s -mattr=avx512bw -mattr=+prefer-256-bit | FileCheck %s --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512-ymm
9 ; RUN: llc -mtriple=x86_64-unknown-unknown < %s -mattr=avx512dq -mattr=+prefer-256-bit | FileCheck %s --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512-ymm
10 ; RUN: llc -mtriple=x86_64-unknown-unknown < %s -mattr=avx512f -mattr=-prefer-256-bit | FileCheck %s --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512F
11 ; RUN: llc -mtriple=x86_64-unknown-unknown < %s -mattr=avx512bw -mattr=-prefer-256-bit | FileCheck %s --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512BW
13 ; https://llvm.org/bugs/show_bug.cgi?id=27100
; Constant fill of 16 bytes: calls __memset_chk(%x, 42, 16, -1).
; Every configuration is expected to expand the call into inline stores:
;   - SSE prefix: the byte is splatted into a 64-bit immediate
;     (0x2A2A2A2A2A2A2A2A) and written with two 8-byte movq stores.
;   - SSE2FAST and AVX prefixes: a 16-byte splat-of-42 constant is loaded and
;     written with a single unaligned 16-byte vector store.
15 define void @memset_16_nonzero_bytes(i8* %x) {
16 ; SSE-LABEL: memset_16_nonzero_bytes:
18 ; SSE-NEXT: movabsq $3038287259199220266, %rax # imm = 0x2A2A2A2A2A2A2A2A
19 ; SSE-NEXT: movq %rax, 8(%rdi)
20 ; SSE-NEXT: movq %rax, (%rdi)
23 ; SSE2FAST-LABEL: memset_16_nonzero_bytes:
25 ; SSE2FAST-NEXT: movaps {{.*#+}} xmm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]
26 ; SSE2FAST-NEXT: movups %xmm0, (%rdi)
29 ; AVX-LABEL: memset_16_nonzero_bytes:
31 ; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]
32 ; AVX-NEXT: vmovups %xmm0, (%rdi)
34 %call = tail call i8* @__memset_chk(i8* %x, i32 42, i64 16, i64 -1)
; Constant fill of 32 bytes: calls __memset_chk(%x, 42, 32, -1).
; Expected inline expansions:
;   - SSE prefix: four 8-byte movq stores of 0x2A2A2A2A2A2A2A2A.
;   - SSE2FAST prefix: two unaligned 16-byte xmm stores.
;   - AVX prefix: one unaligned 32-byte ymm store, followed by vzeroupper.
38 define void @memset_32_nonzero_bytes(i8* %x) {
39 ; SSE-LABEL: memset_32_nonzero_bytes:
41 ; SSE-NEXT: movabsq $3038287259199220266, %rax # imm = 0x2A2A2A2A2A2A2A2A
42 ; SSE-NEXT: movq %rax, 24(%rdi)
43 ; SSE-NEXT: movq %rax, 16(%rdi)
44 ; SSE-NEXT: movq %rax, 8(%rdi)
45 ; SSE-NEXT: movq %rax, (%rdi)
48 ; SSE2FAST-LABEL: memset_32_nonzero_bytes:
50 ; SSE2FAST-NEXT: movaps {{.*#+}} xmm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]
51 ; SSE2FAST-NEXT: movups %xmm0, 16(%rdi)
52 ; SSE2FAST-NEXT: movups %xmm0, (%rdi)
55 ; AVX-LABEL: memset_32_nonzero_bytes:
57 ; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]
58 ; AVX-NEXT: vmovups %ymm0, (%rdi)
59 ; AVX-NEXT: vzeroupper
61 %call = tail call i8* @__memset_chk(i8* %x, i32 42, i64 32, i64 -1)
; Constant fill of 64 bytes: calls __memset_chk(%x, 42, 64, -1).
; Expected inline expansions:
;   - SSE prefix: eight 8-byte movq stores of 0x2A2A2A2A2A2A2A2A.
;   - SSE2FAST prefix: four unaligned 16-byte xmm stores.
;   - AVX1, AVX2 and AVX512-ymm prefixes: two unaligned 32-byte ymm stores.
;   - AVX512F prefix: one 64-byte zmm store; the splat is materialised with
;     vbroadcastss of the 32-bit pattern 707406378 (four 42 bytes).
;   - AVX512BW prefix: one 64-byte zmm store of a 64 x 42 byte constant.
65 define void @memset_64_nonzero_bytes(i8* %x) {
66 ; SSE-LABEL: memset_64_nonzero_bytes:
68 ; SSE-NEXT: movabsq $3038287259199220266, %rax # imm = 0x2A2A2A2A2A2A2A2A
69 ; SSE-NEXT: movq %rax, 56(%rdi)
70 ; SSE-NEXT: movq %rax, 48(%rdi)
71 ; SSE-NEXT: movq %rax, 40(%rdi)
72 ; SSE-NEXT: movq %rax, 32(%rdi)
73 ; SSE-NEXT: movq %rax, 24(%rdi)
74 ; SSE-NEXT: movq %rax, 16(%rdi)
75 ; SSE-NEXT: movq %rax, 8(%rdi)
76 ; SSE-NEXT: movq %rax, (%rdi)
79 ; SSE2FAST-LABEL: memset_64_nonzero_bytes:
81 ; SSE2FAST-NEXT: movaps {{.*#+}} xmm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]
82 ; SSE2FAST-NEXT: movups %xmm0, 48(%rdi)
83 ; SSE2FAST-NEXT: movups %xmm0, 32(%rdi)
84 ; SSE2FAST-NEXT: movups %xmm0, 16(%rdi)
85 ; SSE2FAST-NEXT: movups %xmm0, (%rdi)
88 ; AVX1-LABEL: memset_64_nonzero_bytes:
90 ; AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]
91 ; AVX1-NEXT: vmovups %ymm0, 32(%rdi)
92 ; AVX1-NEXT: vmovups %ymm0, (%rdi)
93 ; AVX1-NEXT: vzeroupper
96 ; AVX2-LABEL: memset_64_nonzero_bytes:
98 ; AVX2-NEXT: vmovaps {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]
99 ; AVX2-NEXT: vmovups %ymm0, 32(%rdi)
100 ; AVX2-NEXT: vmovups %ymm0, (%rdi)
101 ; AVX2-NEXT: vzeroupper
104 ; AVX512-ymm-LABEL: memset_64_nonzero_bytes:
105 ; AVX512-ymm: # %bb.0:
106 ; AVX512-ymm-NEXT: vmovaps {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]
107 ; AVX512-ymm-NEXT: vmovups %ymm0, 32(%rdi)
108 ; AVX512-ymm-NEXT: vmovups %ymm0, (%rdi)
109 ; AVX512-ymm-NEXT: vzeroupper
110 ; AVX512-ymm-NEXT: retq
112 ; AVX512F-LABEL: memset_64_nonzero_bytes:
114 ; AVX512F-NEXT: vbroadcastss {{.*#+}} zmm0 = [707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378]
115 ; AVX512F-NEXT: vmovups %zmm0, (%rdi)
116 ; AVX512F-NEXT: vzeroupper
119 ; AVX512BW-LABEL: memset_64_nonzero_bytes:
121 ; AVX512BW-NEXT: vmovaps {{.*#+}} zmm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]
122 ; AVX512BW-NEXT: vmovups %zmm0, (%rdi)
123 ; AVX512BW-NEXT: vzeroupper
124 ; AVX512BW-NEXT: retq
126 %call = tail call i8* @__memset_chk(i8* %x, i32 42, i64 64, i64 -1)
; Constant fill of 128 bytes: calls __memset_chk(%x, 42, 128, -1).
; Expected inline expansions:
;   - SSE prefix: sixteen 8-byte movq stores of 0x2A2A2A2A2A2A2A2A.
;   - SSE2FAST prefix: eight unaligned 16-byte xmm stores.
;   - AVX1, AVX2 and AVX512-ymm prefixes: four unaligned 32-byte ymm stores.
;   - AVX512F and AVX512BW prefixes: two unaligned 64-byte zmm stores (the
;     splat constant is built with vbroadcastss resp. loaded with vmovaps).
130 define void @memset_128_nonzero_bytes(i8* %x) {
131 ; SSE-LABEL: memset_128_nonzero_bytes:
133 ; SSE-NEXT: movabsq $3038287259199220266, %rax # imm = 0x2A2A2A2A2A2A2A2A
134 ; SSE-NEXT: movq %rax, 120(%rdi)
135 ; SSE-NEXT: movq %rax, 112(%rdi)
136 ; SSE-NEXT: movq %rax, 104(%rdi)
137 ; SSE-NEXT: movq %rax, 96(%rdi)
138 ; SSE-NEXT: movq %rax, 88(%rdi)
139 ; SSE-NEXT: movq %rax, 80(%rdi)
140 ; SSE-NEXT: movq %rax, 72(%rdi)
141 ; SSE-NEXT: movq %rax, 64(%rdi)
142 ; SSE-NEXT: movq %rax, 56(%rdi)
143 ; SSE-NEXT: movq %rax, 48(%rdi)
144 ; SSE-NEXT: movq %rax, 40(%rdi)
145 ; SSE-NEXT: movq %rax, 32(%rdi)
146 ; SSE-NEXT: movq %rax, 24(%rdi)
147 ; SSE-NEXT: movq %rax, 16(%rdi)
148 ; SSE-NEXT: movq %rax, 8(%rdi)
149 ; SSE-NEXT: movq %rax, (%rdi)
152 ; SSE2FAST-LABEL: memset_128_nonzero_bytes:
154 ; SSE2FAST-NEXT: movaps {{.*#+}} xmm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]
155 ; SSE2FAST-NEXT: movups %xmm0, 112(%rdi)
156 ; SSE2FAST-NEXT: movups %xmm0, 96(%rdi)
157 ; SSE2FAST-NEXT: movups %xmm0, 80(%rdi)
158 ; SSE2FAST-NEXT: movups %xmm0, 64(%rdi)
159 ; SSE2FAST-NEXT: movups %xmm0, 48(%rdi)
160 ; SSE2FAST-NEXT: movups %xmm0, 32(%rdi)
161 ; SSE2FAST-NEXT: movups %xmm0, 16(%rdi)
162 ; SSE2FAST-NEXT: movups %xmm0, (%rdi)
163 ; SSE2FAST-NEXT: retq
165 ; AVX1-LABEL: memset_128_nonzero_bytes:
167 ; AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]
168 ; AVX1-NEXT: vmovups %ymm0, 96(%rdi)
169 ; AVX1-NEXT: vmovups %ymm0, 64(%rdi)
170 ; AVX1-NEXT: vmovups %ymm0, 32(%rdi)
171 ; AVX1-NEXT: vmovups %ymm0, (%rdi)
172 ; AVX1-NEXT: vzeroupper
175 ; AVX2-LABEL: memset_128_nonzero_bytes:
177 ; AVX2-NEXT: vmovaps {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]
178 ; AVX2-NEXT: vmovups %ymm0, 96(%rdi)
179 ; AVX2-NEXT: vmovups %ymm0, 64(%rdi)
180 ; AVX2-NEXT: vmovups %ymm0, 32(%rdi)
181 ; AVX2-NEXT: vmovups %ymm0, (%rdi)
182 ; AVX2-NEXT: vzeroupper
185 ; AVX512-ymm-LABEL: memset_128_nonzero_bytes:
186 ; AVX512-ymm: # %bb.0:
187 ; AVX512-ymm-NEXT: vmovaps {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]
188 ; AVX512-ymm-NEXT: vmovups %ymm0, 96(%rdi)
189 ; AVX512-ymm-NEXT: vmovups %ymm0, 64(%rdi)
190 ; AVX512-ymm-NEXT: vmovups %ymm0, 32(%rdi)
191 ; AVX512-ymm-NEXT: vmovups %ymm0, (%rdi)
192 ; AVX512-ymm-NEXT: vzeroupper
193 ; AVX512-ymm-NEXT: retq
195 ; AVX512F-LABEL: memset_128_nonzero_bytes:
197 ; AVX512F-NEXT: vbroadcastss {{.*#+}} zmm0 = [707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378]
198 ; AVX512F-NEXT: vmovups %zmm0, 64(%rdi)
199 ; AVX512F-NEXT: vmovups %zmm0, (%rdi)
200 ; AVX512F-NEXT: vzeroupper
203 ; AVX512BW-LABEL: memset_128_nonzero_bytes:
205 ; AVX512BW-NEXT: vmovaps {{.*#+}} zmm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]
206 ; AVX512BW-NEXT: vmovups %zmm0, 64(%rdi)
207 ; AVX512BW-NEXT: vmovups %zmm0, (%rdi)
208 ; AVX512BW-NEXT: vzeroupper
209 ; AVX512BW-NEXT: retq
210 %call = tail call i8* @__memset_chk(i8* %x, i32 42, i64 128, i64 -1)
; Constant fill of 256 bytes: calls __memset_chk(%x, 42, 256, -1).
; This is the size at which the SSE prefix stops expanding inline: its checks
; expect the length (256) in %edx, the fill value (42) in %esi and a real
; `callq memset`, wrapped in a pushq/popq of %rax.
; The remaining prefixes still expect inline stores:
;   - SSE2FAST prefix: sixteen unaligned 16-byte xmm stores.
;   - AVX1, AVX2 and AVX512-ymm prefixes: eight unaligned 32-byte ymm stores.
;   - AVX512F and AVX512BW prefixes: four unaligned 64-byte zmm stores.
214 define void @memset_256_nonzero_bytes(i8* %x) {
215 ; SSE-LABEL: memset_256_nonzero_bytes:
217 ; SSE-NEXT: pushq %rax
218 ; SSE-NEXT: .cfi_def_cfa_offset 16
219 ; SSE-NEXT: movl $256, %edx # imm = 0x100
220 ; SSE-NEXT: movl $42, %esi
221 ; SSE-NEXT: callq memset
222 ; SSE-NEXT: popq %rax
223 ; SSE-NEXT: .cfi_def_cfa_offset 8
226 ; SSE2FAST-LABEL: memset_256_nonzero_bytes:
228 ; SSE2FAST-NEXT: movaps {{.*#+}} xmm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]
229 ; SSE2FAST-NEXT: movups %xmm0, 240(%rdi)
230 ; SSE2FAST-NEXT: movups %xmm0, 224(%rdi)
231 ; SSE2FAST-NEXT: movups %xmm0, 208(%rdi)
232 ; SSE2FAST-NEXT: movups %xmm0, 192(%rdi)
233 ; SSE2FAST-NEXT: movups %xmm0, 176(%rdi)
234 ; SSE2FAST-NEXT: movups %xmm0, 160(%rdi)
235 ; SSE2FAST-NEXT: movups %xmm0, 144(%rdi)
236 ; SSE2FAST-NEXT: movups %xmm0, 128(%rdi)
237 ; SSE2FAST-NEXT: movups %xmm0, 112(%rdi)
238 ; SSE2FAST-NEXT: movups %xmm0, 96(%rdi)
239 ; SSE2FAST-NEXT: movups %xmm0, 80(%rdi)
240 ; SSE2FAST-NEXT: movups %xmm0, 64(%rdi)
241 ; SSE2FAST-NEXT: movups %xmm0, 48(%rdi)
242 ; SSE2FAST-NEXT: movups %xmm0, 32(%rdi)
243 ; SSE2FAST-NEXT: movups %xmm0, 16(%rdi)
244 ; SSE2FAST-NEXT: movups %xmm0, (%rdi)
245 ; SSE2FAST-NEXT: retq
247 ; AVX1-LABEL: memset_256_nonzero_bytes:
249 ; AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]
250 ; AVX1-NEXT: vmovups %ymm0, 224(%rdi)
251 ; AVX1-NEXT: vmovups %ymm0, 192(%rdi)
252 ; AVX1-NEXT: vmovups %ymm0, 160(%rdi)
253 ; AVX1-NEXT: vmovups %ymm0, 128(%rdi)
254 ; AVX1-NEXT: vmovups %ymm0, 96(%rdi)
255 ; AVX1-NEXT: vmovups %ymm0, 64(%rdi)
256 ; AVX1-NEXT: vmovups %ymm0, 32(%rdi)
257 ; AVX1-NEXT: vmovups %ymm0, (%rdi)
258 ; AVX1-NEXT: vzeroupper
261 ; AVX2-LABEL: memset_256_nonzero_bytes:
263 ; AVX2-NEXT: vmovaps {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]
264 ; AVX2-NEXT: vmovups %ymm0, 224(%rdi)
265 ; AVX2-NEXT: vmovups %ymm0, 192(%rdi)
266 ; AVX2-NEXT: vmovups %ymm0, 160(%rdi)
267 ; AVX2-NEXT: vmovups %ymm0, 128(%rdi)
268 ; AVX2-NEXT: vmovups %ymm0, 96(%rdi)
269 ; AVX2-NEXT: vmovups %ymm0, 64(%rdi)
270 ; AVX2-NEXT: vmovups %ymm0, 32(%rdi)
271 ; AVX2-NEXT: vmovups %ymm0, (%rdi)
272 ; AVX2-NEXT: vzeroupper
275 ; AVX512-ymm-LABEL: memset_256_nonzero_bytes:
276 ; AVX512-ymm: # %bb.0:
277 ; AVX512-ymm-NEXT: vmovaps {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]
278 ; AVX512-ymm-NEXT: vmovups %ymm0, 224(%rdi)
279 ; AVX512-ymm-NEXT: vmovups %ymm0, 192(%rdi)
280 ; AVX512-ymm-NEXT: vmovups %ymm0, 160(%rdi)
281 ; AVX512-ymm-NEXT: vmovups %ymm0, 128(%rdi)
282 ; AVX512-ymm-NEXT: vmovups %ymm0, 96(%rdi)
283 ; AVX512-ymm-NEXT: vmovups %ymm0, 64(%rdi)
284 ; AVX512-ymm-NEXT: vmovups %ymm0, 32(%rdi)
285 ; AVX512-ymm-NEXT: vmovups %ymm0, (%rdi)
286 ; AVX512-ymm-NEXT: vzeroupper
287 ; AVX512-ymm-NEXT: retq
289 ; AVX512F-LABEL: memset_256_nonzero_bytes:
291 ; AVX512F-NEXT: vbroadcastss {{.*#+}} zmm0 = [707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378]
292 ; AVX512F-NEXT: vmovups %zmm0, 192(%rdi)
293 ; AVX512F-NEXT: vmovups %zmm0, 128(%rdi)
294 ; AVX512F-NEXT: vmovups %zmm0, 64(%rdi)
295 ; AVX512F-NEXT: vmovups %zmm0, (%rdi)
296 ; AVX512F-NEXT: vzeroupper
299 ; AVX512BW-LABEL: memset_256_nonzero_bytes:
301 ; AVX512BW-NEXT: vmovaps {{.*#+}} zmm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]
302 ; AVX512BW-NEXT: vmovups %zmm0, 192(%rdi)
303 ; AVX512BW-NEXT: vmovups %zmm0, 128(%rdi)
304 ; AVX512BW-NEXT: vmovups %zmm0, 64(%rdi)
305 ; AVX512BW-NEXT: vmovups %zmm0, (%rdi)
306 ; AVX512BW-NEXT: vzeroupper
307 ; AVX512BW-NEXT: retq
308 %call = tail call i8* @__memset_chk(i8* %x, i32 42, i64 256, i64 -1)
; External function called by the constant-value tests above. As used in this
; file the operands are (destination pointer, i32 fill value, i64 length, i64
; fourth argument); the fourth argument is always -1 here.
; NOTE(review): the fourth operand is presumably the fortify "object size",
; with -1 meaning "unknown" - confirm against the __memset_chk documentation.
312 declare i8* @__memset_chk(i8*, i32, i64, i64)
314 ; Repeat with a non-constant value for the stores.
; Variable fill of 16 bytes: calls llvm.memset(%x, %c, 16, false), so the byte
; must be splatted at run time before it can be stored.
; Expected inline expansions:
;   - SSE prefix: zero-extend %sil, multiply by 0x0101010101010101 to
;     replicate the byte across 64 bits, then two 8-byte movq stores.
;   - SSE2FAST prefix: movd + punpcklbw/pshuflw/pshufd splat, then one
;     unaligned 16-byte store.
;   - AVX1 prefix: vmovd + vpshufb with an all-zero shuffle mask, one store.
;   - AVX2 and AVX512 prefixes: vmovd + vpbroadcastb, one 16-byte store.
316 define void @memset_16_nonconst_bytes(i8* %x, i8 %c) {
317 ; SSE-LABEL: memset_16_nonconst_bytes:
319 ; SSE-NEXT: # kill: def $esi killed $esi def $rsi
320 ; SSE-NEXT: movzbl %sil, %eax
321 ; SSE-NEXT: movabsq $72340172838076673, %rcx # imm = 0x101010101010101
322 ; SSE-NEXT: imulq %rax, %rcx
323 ; SSE-NEXT: movq %rcx, 8(%rdi)
324 ; SSE-NEXT: movq %rcx, (%rdi)
327 ; SSE2FAST-LABEL: memset_16_nonconst_bytes:
329 ; SSE2FAST-NEXT: movd %esi, %xmm0
330 ; SSE2FAST-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
331 ; SSE2FAST-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7]
332 ; SSE2FAST-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
333 ; SSE2FAST-NEXT: movdqu %xmm0, (%rdi)
334 ; SSE2FAST-NEXT: retq
336 ; AVX1-LABEL: memset_16_nonconst_bytes:
338 ; AVX1-NEXT: vmovd %esi, %xmm0
339 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
340 ; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0
341 ; AVX1-NEXT: vmovdqu %xmm0, (%rdi)
344 ; AVX2-LABEL: memset_16_nonconst_bytes:
346 ; AVX2-NEXT: vmovd %esi, %xmm0
347 ; AVX2-NEXT: vpbroadcastb %xmm0, %xmm0
348 ; AVX2-NEXT: vmovdqu %xmm0, (%rdi)
351 ; AVX512-LABEL: memset_16_nonconst_bytes:
353 ; AVX512-NEXT: vmovd %esi, %xmm0
354 ; AVX512-NEXT: vpbroadcastb %xmm0, %xmm0
355 ; AVX512-NEXT: vmovdqu %xmm0, (%rdi)
357 tail call void @llvm.memset.p0i8.i64(i8* %x, i8 %c, i64 16, i1 false)
; Variable fill of 32 bytes: calls llvm.memset(%x, %c, 32, false).
; Expected inline expansions:
;   - SSE prefix: imulq-by-0x0101010101010101 splat, four 8-byte movq stores.
;   - SSE2FAST prefix: shuffle-based xmm splat, two unaligned 16-byte stores.
;   - AVX1 prefix: vpshufb splat kept in xmm, two 16-byte stores (no ymm
;     store is expected for this prefix at this size).
;   - AVX2 and AVX512 prefixes: vpbroadcastb into ymm, one 32-byte store,
;     then vzeroupper.
361 define void @memset_32_nonconst_bytes(i8* %x, i8 %c) {
362 ; SSE-LABEL: memset_32_nonconst_bytes:
364 ; SSE-NEXT: # kill: def $esi killed $esi def $rsi
365 ; SSE-NEXT: movzbl %sil, %eax
366 ; SSE-NEXT: movabsq $72340172838076673, %rcx # imm = 0x101010101010101
367 ; SSE-NEXT: imulq %rax, %rcx
368 ; SSE-NEXT: movq %rcx, 24(%rdi)
369 ; SSE-NEXT: movq %rcx, 16(%rdi)
370 ; SSE-NEXT: movq %rcx, 8(%rdi)
371 ; SSE-NEXT: movq %rcx, (%rdi)
374 ; SSE2FAST-LABEL: memset_32_nonconst_bytes:
376 ; SSE2FAST-NEXT: movd %esi, %xmm0
377 ; SSE2FAST-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
378 ; SSE2FAST-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7]
379 ; SSE2FAST-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
380 ; SSE2FAST-NEXT: movdqu %xmm0, 16(%rdi)
381 ; SSE2FAST-NEXT: movdqu %xmm0, (%rdi)
382 ; SSE2FAST-NEXT: retq
384 ; AVX1-LABEL: memset_32_nonconst_bytes:
386 ; AVX1-NEXT: vmovd %esi, %xmm0
387 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
388 ; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0
389 ; AVX1-NEXT: vmovdqu %xmm0, 16(%rdi)
390 ; AVX1-NEXT: vmovdqu %xmm0, (%rdi)
393 ; AVX2-LABEL: memset_32_nonconst_bytes:
395 ; AVX2-NEXT: vmovd %esi, %xmm0
396 ; AVX2-NEXT: vpbroadcastb %xmm0, %ymm0
397 ; AVX2-NEXT: vmovdqu %ymm0, (%rdi)
398 ; AVX2-NEXT: vzeroupper
401 ; AVX512-LABEL: memset_32_nonconst_bytes:
403 ; AVX512-NEXT: vmovd %esi, %xmm0
404 ; AVX512-NEXT: vpbroadcastb %xmm0, %ymm0
405 ; AVX512-NEXT: vmovdqu %ymm0, (%rdi)
406 ; AVX512-NEXT: vzeroupper
408 tail call void @llvm.memset.p0i8.i64(i8* %x, i8 %c, i64 32, i1 false)
; Variable fill of 64 bytes: calls llvm.memset(%x, %c, 64, false).
; Expected inline expansions:
;   - SSE prefix: imulq-by-0x0101010101010101 splat, eight 8-byte movq stores.
;   - SSE2FAST prefix: shuffle-based xmm splat, four unaligned 16-byte stores.
;   - AVX1 prefix: vpshufb splat widened to ymm with vinsertf128, two 32-byte
;     stores.
;   - AVX2 and AVX512-ymm prefixes: vpbroadcastb into ymm, two 32-byte stores.
;   - AVX512F prefix: scalar splat to 32 bits (movzbl + imull by 0x01010101),
;     vpbroadcastd into zmm, one 64-byte store.
;   - AVX512BW prefix: vpbroadcastb straight from %esi into zmm, one 64-byte
;     store.
412 define void @memset_64_nonconst_bytes(i8* %x, i8 %c) {
413 ; SSE-LABEL: memset_64_nonconst_bytes:
415 ; SSE-NEXT: # kill: def $esi killed $esi def $rsi
416 ; SSE-NEXT: movzbl %sil, %eax
417 ; SSE-NEXT: movabsq $72340172838076673, %rcx # imm = 0x101010101010101
418 ; SSE-NEXT: imulq %rax, %rcx
419 ; SSE-NEXT: movq %rcx, 56(%rdi)
420 ; SSE-NEXT: movq %rcx, 48(%rdi)
421 ; SSE-NEXT: movq %rcx, 40(%rdi)
422 ; SSE-NEXT: movq %rcx, 32(%rdi)
423 ; SSE-NEXT: movq %rcx, 24(%rdi)
424 ; SSE-NEXT: movq %rcx, 16(%rdi)
425 ; SSE-NEXT: movq %rcx, 8(%rdi)
426 ; SSE-NEXT: movq %rcx, (%rdi)
429 ; SSE2FAST-LABEL: memset_64_nonconst_bytes:
431 ; SSE2FAST-NEXT: movd %esi, %xmm0
432 ; SSE2FAST-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
433 ; SSE2FAST-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7]
434 ; SSE2FAST-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
435 ; SSE2FAST-NEXT: movdqu %xmm0, 48(%rdi)
436 ; SSE2FAST-NEXT: movdqu %xmm0, 32(%rdi)
437 ; SSE2FAST-NEXT: movdqu %xmm0, 16(%rdi)
438 ; SSE2FAST-NEXT: movdqu %xmm0, (%rdi)
439 ; SSE2FAST-NEXT: retq
441 ; AVX1-LABEL: memset_64_nonconst_bytes:
443 ; AVX1-NEXT: vmovd %esi, %xmm0
444 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
445 ; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0
446 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
447 ; AVX1-NEXT: vmovups %ymm0, 32(%rdi)
448 ; AVX1-NEXT: vmovups %ymm0, (%rdi)
449 ; AVX1-NEXT: vzeroupper
452 ; AVX2-LABEL: memset_64_nonconst_bytes:
454 ; AVX2-NEXT: vmovd %esi, %xmm0
455 ; AVX2-NEXT: vpbroadcastb %xmm0, %ymm0
456 ; AVX2-NEXT: vmovdqu %ymm0, 32(%rdi)
457 ; AVX2-NEXT: vmovdqu %ymm0, (%rdi)
458 ; AVX2-NEXT: vzeroupper
461 ; AVX512-ymm-LABEL: memset_64_nonconst_bytes:
462 ; AVX512-ymm: # %bb.0:
463 ; AVX512-ymm-NEXT: vmovd %esi, %xmm0
464 ; AVX512-ymm-NEXT: vpbroadcastb %xmm0, %ymm0
465 ; AVX512-ymm-NEXT: vmovdqu %ymm0, 32(%rdi)
466 ; AVX512-ymm-NEXT: vmovdqu %ymm0, (%rdi)
467 ; AVX512-ymm-NEXT: vzeroupper
468 ; AVX512-ymm-NEXT: retq
470 ; AVX512F-LABEL: memset_64_nonconst_bytes:
472 ; AVX512F-NEXT: movzbl %sil, %eax
473 ; AVX512F-NEXT: imull $16843009, %eax, %eax # imm = 0x1010101
474 ; AVX512F-NEXT: vpbroadcastd %eax, %zmm0
475 ; AVX512F-NEXT: vmovdqu64 %zmm0, (%rdi)
476 ; AVX512F-NEXT: vzeroupper
479 ; AVX512BW-LABEL: memset_64_nonconst_bytes:
481 ; AVX512BW-NEXT: vpbroadcastb %esi, %zmm0
482 ; AVX512BW-NEXT: vmovdqu64 %zmm0, (%rdi)
483 ; AVX512BW-NEXT: vzeroupper
484 ; AVX512BW-NEXT: retq
485 tail call void @llvm.memset.p0i8.i64(i8* %x, i8 %c, i64 64, i1 false)
; Variable fill of 128 bytes: calls llvm.memset(%x, %c, 128, false).
; Expected inline expansions:
;   - SSE prefix: imulq-by-0x0101010101010101 splat, sixteen 8-byte movq
;     stores.
;   - SSE2FAST prefix: shuffle-based xmm splat, eight unaligned 16-byte stores.
;   - AVX1 prefix: vpshufb splat widened with vinsertf128, four 32-byte stores.
;   - AVX2 and AVX512-ymm prefixes: vpbroadcastb into ymm, four 32-byte stores.
;   - AVX512F prefix: movzbl + imull splat, vpbroadcastd into zmm, two 64-byte
;     stores.
;   - AVX512BW prefix: vpbroadcastb from %esi into zmm, two 64-byte stores.
489 define void @memset_128_nonconst_bytes(i8* %x, i8 %c) {
490 ; SSE-LABEL: memset_128_nonconst_bytes:
492 ; SSE-NEXT: # kill: def $esi killed $esi def $rsi
493 ; SSE-NEXT: movzbl %sil, %eax
494 ; SSE-NEXT: movabsq $72340172838076673, %rcx # imm = 0x101010101010101
495 ; SSE-NEXT: imulq %rax, %rcx
496 ; SSE-NEXT: movq %rcx, 120(%rdi)
497 ; SSE-NEXT: movq %rcx, 112(%rdi)
498 ; SSE-NEXT: movq %rcx, 104(%rdi)
499 ; SSE-NEXT: movq %rcx, 96(%rdi)
500 ; SSE-NEXT: movq %rcx, 88(%rdi)
501 ; SSE-NEXT: movq %rcx, 80(%rdi)
502 ; SSE-NEXT: movq %rcx, 72(%rdi)
503 ; SSE-NEXT: movq %rcx, 64(%rdi)
504 ; SSE-NEXT: movq %rcx, 56(%rdi)
505 ; SSE-NEXT: movq %rcx, 48(%rdi)
506 ; SSE-NEXT: movq %rcx, 40(%rdi)
507 ; SSE-NEXT: movq %rcx, 32(%rdi)
508 ; SSE-NEXT: movq %rcx, 24(%rdi)
509 ; SSE-NEXT: movq %rcx, 16(%rdi)
510 ; SSE-NEXT: movq %rcx, 8(%rdi)
511 ; SSE-NEXT: movq %rcx, (%rdi)
514 ; SSE2FAST-LABEL: memset_128_nonconst_bytes:
516 ; SSE2FAST-NEXT: movd %esi, %xmm0
517 ; SSE2FAST-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
518 ; SSE2FAST-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7]
519 ; SSE2FAST-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
520 ; SSE2FAST-NEXT: movdqu %xmm0, 112(%rdi)
521 ; SSE2FAST-NEXT: movdqu %xmm0, 96(%rdi)
522 ; SSE2FAST-NEXT: movdqu %xmm0, 80(%rdi)
523 ; SSE2FAST-NEXT: movdqu %xmm0, 64(%rdi)
524 ; SSE2FAST-NEXT: movdqu %xmm0, 48(%rdi)
525 ; SSE2FAST-NEXT: movdqu %xmm0, 32(%rdi)
526 ; SSE2FAST-NEXT: movdqu %xmm0, 16(%rdi)
527 ; SSE2FAST-NEXT: movdqu %xmm0, (%rdi)
528 ; SSE2FAST-NEXT: retq
530 ; AVX1-LABEL: memset_128_nonconst_bytes:
532 ; AVX1-NEXT: vmovd %esi, %xmm0
533 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
534 ; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0
535 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
536 ; AVX1-NEXT: vmovups %ymm0, 96(%rdi)
537 ; AVX1-NEXT: vmovups %ymm0, 64(%rdi)
538 ; AVX1-NEXT: vmovups %ymm0, 32(%rdi)
539 ; AVX1-NEXT: vmovups %ymm0, (%rdi)
540 ; AVX1-NEXT: vzeroupper
543 ; AVX2-LABEL: memset_128_nonconst_bytes:
545 ; AVX2-NEXT: vmovd %esi, %xmm0
546 ; AVX2-NEXT: vpbroadcastb %xmm0, %ymm0
547 ; AVX2-NEXT: vmovdqu %ymm0, 96(%rdi)
548 ; AVX2-NEXT: vmovdqu %ymm0, 64(%rdi)
549 ; AVX2-NEXT: vmovdqu %ymm0, 32(%rdi)
550 ; AVX2-NEXT: vmovdqu %ymm0, (%rdi)
551 ; AVX2-NEXT: vzeroupper
554 ; AVX512-ymm-LABEL: memset_128_nonconst_bytes:
555 ; AVX512-ymm: # %bb.0:
556 ; AVX512-ymm-NEXT: vmovd %esi, %xmm0
557 ; AVX512-ymm-NEXT: vpbroadcastb %xmm0, %ymm0
558 ; AVX512-ymm-NEXT: vmovdqu %ymm0, 96(%rdi)
559 ; AVX512-ymm-NEXT: vmovdqu %ymm0, 64(%rdi)
560 ; AVX512-ymm-NEXT: vmovdqu %ymm0, 32(%rdi)
561 ; AVX512-ymm-NEXT: vmovdqu %ymm0, (%rdi)
562 ; AVX512-ymm-NEXT: vzeroupper
563 ; AVX512-ymm-NEXT: retq
565 ; AVX512F-LABEL: memset_128_nonconst_bytes:
567 ; AVX512F-NEXT: movzbl %sil, %eax
568 ; AVX512F-NEXT: imull $16843009, %eax, %eax # imm = 0x1010101
569 ; AVX512F-NEXT: vpbroadcastd %eax, %zmm0
570 ; AVX512F-NEXT: vmovdqu64 %zmm0, 64(%rdi)
571 ; AVX512F-NEXT: vmovdqu64 %zmm0, (%rdi)
572 ; AVX512F-NEXT: vzeroupper
575 ; AVX512BW-LABEL: memset_128_nonconst_bytes:
577 ; AVX512BW-NEXT: vpbroadcastb %esi, %zmm0
578 ; AVX512BW-NEXT: vmovdqu64 %zmm0, 64(%rdi)
579 ; AVX512BW-NEXT: vmovdqu64 %zmm0, (%rdi)
580 ; AVX512BW-NEXT: vzeroupper
581 ; AVX512BW-NEXT: retq
582 tail call void @llvm.memset.p0i8.i64(i8* %x, i8 %c, i64 128, i1 false)
; Variable fill of 256 bytes: calls llvm.memset(%x, %c, 256, false).
; For the SSE prefix the checks expect no inline expansion at this size: the
; length (256) is placed in %edx and the function tail-calls memset with a
; plain `jmp`.
; The remaining prefixes still expect inline stores:
;   - SSE2FAST prefix: shuffle-based xmm splat, sixteen unaligned 16-byte
;     stores.
;   - AVX1 prefix: vpshufb splat widened with vinsertf128, eight 32-byte
;     stores.
;   - AVX2 and AVX512-ymm prefixes: vpbroadcastb into ymm, eight 32-byte
;     stores.
;   - AVX512F prefix: movzbl + imull splat, vpbroadcastd into zmm, four
;     64-byte stores.
;   - AVX512BW prefix: vpbroadcastb from %esi into zmm, four 64-byte stores.
586 define void @memset_256_nonconst_bytes(i8* %x, i8 %c) {
587 ; SSE-LABEL: memset_256_nonconst_bytes:
589 ; SSE-NEXT: movl $256, %edx # imm = 0x100
590 ; SSE-NEXT: jmp memset # TAILCALL
592 ; SSE2FAST-LABEL: memset_256_nonconst_bytes:
594 ; SSE2FAST-NEXT: movd %esi, %xmm0
595 ; SSE2FAST-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
596 ; SSE2FAST-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7]
597 ; SSE2FAST-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
598 ; SSE2FAST-NEXT: movdqu %xmm0, 240(%rdi)
599 ; SSE2FAST-NEXT: movdqu %xmm0, 224(%rdi)
600 ; SSE2FAST-NEXT: movdqu %xmm0, 208(%rdi)
601 ; SSE2FAST-NEXT: movdqu %xmm0, 192(%rdi)
602 ; SSE2FAST-NEXT: movdqu %xmm0, 176(%rdi)
603 ; SSE2FAST-NEXT: movdqu %xmm0, 160(%rdi)
604 ; SSE2FAST-NEXT: movdqu %xmm0, 144(%rdi)
605 ; SSE2FAST-NEXT: movdqu %xmm0, 128(%rdi)
606 ; SSE2FAST-NEXT: movdqu %xmm0, 112(%rdi)
607 ; SSE2FAST-NEXT: movdqu %xmm0, 96(%rdi)
608 ; SSE2FAST-NEXT: movdqu %xmm0, 80(%rdi)
609 ; SSE2FAST-NEXT: movdqu %xmm0, 64(%rdi)
610 ; SSE2FAST-NEXT: movdqu %xmm0, 48(%rdi)
611 ; SSE2FAST-NEXT: movdqu %xmm0, 32(%rdi)
612 ; SSE2FAST-NEXT: movdqu %xmm0, 16(%rdi)
613 ; SSE2FAST-NEXT: movdqu %xmm0, (%rdi)
614 ; SSE2FAST-NEXT: retq
616 ; AVX1-LABEL: memset_256_nonconst_bytes:
618 ; AVX1-NEXT: vmovd %esi, %xmm0
619 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
620 ; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0
621 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
622 ; AVX1-NEXT: vmovups %ymm0, 224(%rdi)
623 ; AVX1-NEXT: vmovups %ymm0, 192(%rdi)
624 ; AVX1-NEXT: vmovups %ymm0, 160(%rdi)
625 ; AVX1-NEXT: vmovups %ymm0, 128(%rdi)
626 ; AVX1-NEXT: vmovups %ymm0, 96(%rdi)
627 ; AVX1-NEXT: vmovups %ymm0, 64(%rdi)
628 ; AVX1-NEXT: vmovups %ymm0, 32(%rdi)
629 ; AVX1-NEXT: vmovups %ymm0, (%rdi)
630 ; AVX1-NEXT: vzeroupper
633 ; AVX2-LABEL: memset_256_nonconst_bytes:
635 ; AVX2-NEXT: vmovd %esi, %xmm0
636 ; AVX2-NEXT: vpbroadcastb %xmm0, %ymm0
637 ; AVX2-NEXT: vmovdqu %ymm0, 224(%rdi)
638 ; AVX2-NEXT: vmovdqu %ymm0, 192(%rdi)
639 ; AVX2-NEXT: vmovdqu %ymm0, 160(%rdi)
640 ; AVX2-NEXT: vmovdqu %ymm0, 128(%rdi)
641 ; AVX2-NEXT: vmovdqu %ymm0, 96(%rdi)
642 ; AVX2-NEXT: vmovdqu %ymm0, 64(%rdi)
643 ; AVX2-NEXT: vmovdqu %ymm0, 32(%rdi)
644 ; AVX2-NEXT: vmovdqu %ymm0, (%rdi)
645 ; AVX2-NEXT: vzeroupper
648 ; AVX512-ymm-LABEL: memset_256_nonconst_bytes:
649 ; AVX512-ymm: # %bb.0:
650 ; AVX512-ymm-NEXT: vmovd %esi, %xmm0
651 ; AVX512-ymm-NEXT: vpbroadcastb %xmm0, %ymm0
652 ; AVX512-ymm-NEXT: vmovdqu %ymm0, 224(%rdi)
653 ; AVX512-ymm-NEXT: vmovdqu %ymm0, 192(%rdi)
654 ; AVX512-ymm-NEXT: vmovdqu %ymm0, 160(%rdi)
655 ; AVX512-ymm-NEXT: vmovdqu %ymm0, 128(%rdi)
656 ; AVX512-ymm-NEXT: vmovdqu %ymm0, 96(%rdi)
657 ; AVX512-ymm-NEXT: vmovdqu %ymm0, 64(%rdi)
658 ; AVX512-ymm-NEXT: vmovdqu %ymm0, 32(%rdi)
659 ; AVX512-ymm-NEXT: vmovdqu %ymm0, (%rdi)
660 ; AVX512-ymm-NEXT: vzeroupper
661 ; AVX512-ymm-NEXT: retq
663 ; AVX512F-LABEL: memset_256_nonconst_bytes:
665 ; AVX512F-NEXT: movzbl %sil, %eax
666 ; AVX512F-NEXT: imull $16843009, %eax, %eax # imm = 0x1010101
667 ; AVX512F-NEXT: vpbroadcastd %eax, %zmm0
668 ; AVX512F-NEXT: vmovdqu64 %zmm0, 192(%rdi)
669 ; AVX512F-NEXT: vmovdqu64 %zmm0, 128(%rdi)
670 ; AVX512F-NEXT: vmovdqu64 %zmm0, 64(%rdi)
671 ; AVX512F-NEXT: vmovdqu64 %zmm0, (%rdi)
672 ; AVX512F-NEXT: vzeroupper
675 ; AVX512BW-LABEL: memset_256_nonconst_bytes:
677 ; AVX512BW-NEXT: vpbroadcastb %esi, %zmm0
678 ; AVX512BW-NEXT: vmovdqu64 %zmm0, 192(%rdi)
679 ; AVX512BW-NEXT: vmovdqu64 %zmm0, 128(%rdi)
680 ; AVX512BW-NEXT: vmovdqu64 %zmm0, 64(%rdi)
681 ; AVX512BW-NEXT: vmovdqu64 %zmm0, (%rdi)
682 ; AVX512BW-NEXT: vzeroupper
683 ; AVX512BW-NEXT: retq
684 tail call void @llvm.memset.p0i8.i64(i8* %x, i8 %c, i64 256, i1 false)
; memset intrinsic used by the non-constant-value tests above. Operands are
; (destination pointer, i8 fill byte, i64 length, i1 flag); every call in this
; file passes `false` for the i1 operand.
; NOTE(review): the i1 operand is the "is volatile" flag per the LLVM LangRef;
; attribute group #1 is not defined in the visible part of the file - confirm
; it is defined elsewhere.
688 declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i1) #1