; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefixes=SSE
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1OR2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX1OR2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX,AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl | FileCheck %s --check-prefixes=AVX,AVX512VL
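
; Check that adjacent <2 x float> load/fdiv/store sequences are widened into
; larger vector operations where the subtarget finds it profitable.

; Two adjacent <2 x float> divisions: SSE keeps two separate 64-bit results,
; while AVX merges them with vmovlhps and issues a single 128-bit store.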
define void @widen_fdiv_v2f32_v4f32(ptr %a0, ptr %b0, ptr %c0) {
; SSE-LABEL: widen_fdiv_v2f32_v4f32:
; SSE: # %bb.0:
; SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; SSE-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
; SSE-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero
; SSE-NEXT: divps %xmm2, %xmm0
; SSE-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero
; SSE-NEXT: divps %xmm2, %xmm1
; SSE-NEXT: movlps %xmm0, (%rdx)
; SSE-NEXT: movlps %xmm1, 8(%rdx)
; SSE-NEXT: retq
;
; AVX-LABEL: widen_fdiv_v2f32_v4f32:
; AVX: # %bb.0:
; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
; AVX-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero
; AVX-NEXT: vdivps %xmm2, %xmm0, %xmm0
; AVX-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero
; AVX-NEXT: vdivps %xmm2, %xmm1, %xmm1
; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX-NEXT: vmovups %xmm0, (%rdx)
; AVX-NEXT: retq
  %a2 = getelementptr inbounds i8, ptr %a0, i64 8
  %b2 = getelementptr inbounds i8, ptr %b0, i64 8
  %c2 = getelementptr inbounds i8, ptr %c0, i64 8
  %va0 = load <2 x float>, ptr %a0, align 4
  %vb0 = load <2 x float>, ptr %b0, align 4
  %va2 = load <2 x float>, ptr %a2, align 4
  %vb2 = load <2 x float>, ptr %b2, align 4
  %vc0 = fdiv <2 x float> %va0, %vb0
  %vc2 = fdiv <2 x float> %va2, %vb2
  store <2 x float> %vc0, ptr %c0, align 4
  store <2 x float> %vc2, ptr %c2, align 4
  ret void
}
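
; Four adjacent <2 x float> divisions: AVX1/AVX2 and AVX512F widen to a single
; 256-bit vdivps, AVX512VL reassembles the 256-bit result from 64-bit pieces,
; and SSE stays split into four 64-bit operations.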
define void @widen_fdiv_v2f32_v8f32(ptr %a0, ptr %b0, ptr %c0) {
; SSE-LABEL: widen_fdiv_v2f32_v8f32:
; SSE: # %bb.0:
; SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; SSE-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
; SSE-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero
; SSE-NEXT: movsd {{.*#+}} xmm3 = mem[0],zero
; SSE-NEXT: movsd {{.*#+}} xmm4 = mem[0],zero
; SSE-NEXT: divps %xmm4, %xmm0
; SSE-NEXT: movsd {{.*#+}} xmm4 = mem[0],zero
; SSE-NEXT: divps %xmm4, %xmm1
; SSE-NEXT: movsd {{.*#+}} xmm4 = mem[0],zero
; SSE-NEXT: divps %xmm4, %xmm2
; SSE-NEXT: movsd {{.*#+}} xmm4 = mem[0],zero
; SSE-NEXT: divps %xmm4, %xmm3
; SSE-NEXT: movlps %xmm0, (%rdx)
; SSE-NEXT: movlps %xmm1, 8(%rdx)
; SSE-NEXT: movlps %xmm2, 16(%rdx)
; SSE-NEXT: movlps %xmm3, 24(%rdx)
; SSE-NEXT: retq
;
; AVX1OR2-LABEL: widen_fdiv_v2f32_v8f32:
; AVX1OR2: # %bb.0:
; AVX1OR2-NEXT: vmovups (%rdi), %ymm0
; AVX1OR2-NEXT: vdivps (%rsi), %ymm0, %ymm0
; AVX1OR2-NEXT: vmovups %ymm0, (%rdx)
; AVX1OR2-NEXT: vzeroupper
; AVX1OR2-NEXT: retq
;
; AVX512F-LABEL: widen_fdiv_v2f32_v8f32:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovups (%rdi), %ymm0
; AVX512F-NEXT: vdivps (%rsi), %ymm0, %ymm0
; AVX512F-NEXT: vmovups %ymm0, (%rdx)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: widen_fdiv_v2f32_v8f32:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX512VL-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
; AVX512VL-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero
; AVX512VL-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero
; AVX512VL-NEXT: vmovsd {{.*#+}} xmm4 = mem[0],zero
; AVX512VL-NEXT: vmovsd {{.*#+}} xmm5 = mem[0],zero
; AVX512VL-NEXT: vdivps %xmm5, %xmm1, %xmm1
; AVX512VL-NEXT: vmovsd {{.*#+}} xmm5 = mem[0],zero
; AVX512VL-NEXT: vmovsd {{.*#+}} xmm6 = mem[0],zero
; AVX512VL-NEXT: vdivps %xmm6, %xmm3, %xmm3
; AVX512VL-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3
; AVX512VL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX512VL-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3]
; AVX512VL-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm3
; AVX512VL-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX512VL-NEXT: vdivps %ymm3, %ymm0, %ymm0
; AVX512VL-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
; AVX512VL-NEXT: vmovups %ymm0, (%rdx)
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
  %a2 = getelementptr inbounds i8, ptr %a0, i64 8
  %b2 = getelementptr inbounds i8, ptr %b0, i64 8
  %c2 = getelementptr inbounds i8, ptr %c0, i64 8
  %a4 = getelementptr inbounds i8, ptr %a0, i64 16
  %b4 = getelementptr inbounds i8, ptr %b0, i64 16
  %c4 = getelementptr inbounds i8, ptr %c0, i64 16
  %a6 = getelementptr inbounds i8, ptr %a0, i64 24
  %b6 = getelementptr inbounds i8, ptr %b0, i64 24
  %c6 = getelementptr inbounds i8, ptr %c0, i64 24
  %va0 = load <2 x float>, ptr %a0, align 4
  %vb0 = load <2 x float>, ptr %b0, align 4
  %va2 = load <2 x float>, ptr %a2, align 4
  %vb2 = load <2 x float>, ptr %b2, align 4
  %va4 = load <2 x float>, ptr %a4, align 4
  %vb4 = load <2 x float>, ptr %b4, align 4
  %va6 = load <2 x float>, ptr %a6, align 4
  %vb6 = load <2 x float>, ptr %b6, align 4
  %vc0 = fdiv <2 x float> %va0, %vb0
  %vc2 = fdiv <2 x float> %va2, %vb2
  %vc4 = fdiv <2 x float> %va4, %vb4
  %vc6 = fdiv <2 x float> %va6, %vb6
  store <2 x float> %vc0, ptr %c0, align 4
  store <2 x float> %vc2, ptr %c2, align 4
  store <2 x float> %vc4, ptr %c4, align 4
  store <2 x float> %vc6, ptr %c6, align 4
  ret void
}
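
; Eight adjacent <2 x float> divisions: AVX1/AVX2 widen to two 256-bit vdivps,
; AVX512F and AVX512VL divide the 64-bit pieces and rebuild a single 512-bit
; store, and SSE stays split into eight 64-bit operations.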
define void @widen_fdiv_v2f32_v16f32(ptr %a0, ptr %b0, ptr %c0) {
; SSE-LABEL: widen_fdiv_v2f32_v16f32:
; SSE: # %bb.0:
; SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; SSE-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
; SSE-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero
; SSE-NEXT: movsd {{.*#+}} xmm3 = mem[0],zero
; SSE-NEXT: movsd {{.*#+}} xmm4 = mem[0],zero
; SSE-NEXT: divps %xmm4, %xmm0
; SSE-NEXT: movsd {{.*#+}} xmm4 = mem[0],zero
; SSE-NEXT: divps %xmm4, %xmm1
; SSE-NEXT: movsd {{.*#+}} xmm4 = mem[0],zero
; SSE-NEXT: divps %xmm4, %xmm2
; SSE-NEXT: movsd {{.*#+}} xmm4 = mem[0],zero
; SSE-NEXT: divps %xmm4, %xmm3
; SSE-NEXT: movsd {{.*#+}} xmm4 = mem[0],zero
; SSE-NEXT: movsd {{.*#+}} xmm5 = mem[0],zero
; SSE-NEXT: divps %xmm5, %xmm4
; SSE-NEXT: movsd {{.*#+}} xmm5 = mem[0],zero
; SSE-NEXT: movsd {{.*#+}} xmm6 = mem[0],zero
; SSE-NEXT: divps %xmm6, %xmm5
; SSE-NEXT: movsd {{.*#+}} xmm6 = mem[0],zero
; SSE-NEXT: movsd {{.*#+}} xmm7 = mem[0],zero
; SSE-NEXT: divps %xmm7, %xmm6
; SSE-NEXT: movsd {{.*#+}} xmm7 = mem[0],zero
; SSE-NEXT: movsd {{.*#+}} xmm8 = mem[0],zero
; SSE-NEXT: divps %xmm8, %xmm7
; SSE-NEXT: movlps %xmm0, (%rdx)
; SSE-NEXT: movlps %xmm1, 8(%rdx)
; SSE-NEXT: movlps %xmm2, 16(%rdx)
; SSE-NEXT: movlps %xmm3, 24(%rdx)
; SSE-NEXT: movlps %xmm4, 32(%rdx)
; SSE-NEXT: movlps %xmm5, 40(%rdx)
; SSE-NEXT: movlps %xmm6, 48(%rdx)
; SSE-NEXT: movlps %xmm7, 56(%rdx)
; SSE-NEXT: retq
;
; AVX1OR2-LABEL: widen_fdiv_v2f32_v16f32:
; AVX1OR2: # %bb.0:
; AVX1OR2-NEXT: vmovups (%rdi), %ymm0
; AVX1OR2-NEXT: vmovups 32(%rdi), %ymm1
; AVX1OR2-NEXT: vdivps (%rsi), %ymm0, %ymm0
; AVX1OR2-NEXT: vdivps 32(%rsi), %ymm1, %ymm1
; AVX1OR2-NEXT: vmovups %ymm0, (%rdx)
; AVX1OR2-NEXT: vmovups %ymm1, 32(%rdx)
; AVX1OR2-NEXT: vzeroupper
; AVX1OR2-NEXT: retq
;
; AVX512F-LABEL: widen_fdiv_v2f32_v16f32:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX512F-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
; AVX512F-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero
; AVX512F-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero
; AVX512F-NEXT: vmovsd {{.*#+}} xmm4 = mem[0],zero
; AVX512F-NEXT: vdivps %xmm4, %xmm0, %xmm0
; AVX512F-NEXT: vmovsd {{.*#+}} xmm4 = mem[0],zero
; AVX512F-NEXT: vdivps %xmm4, %xmm1, %xmm1
; AVX512F-NEXT: vmovsd {{.*#+}} xmm4 = mem[0],zero
; AVX512F-NEXT: vdivps %xmm4, %xmm2, %xmm2
; AVX512F-NEXT: vmovsd {{.*#+}} xmm4 = mem[0],zero
; AVX512F-NEXT: vdivps %xmm4, %xmm3, %xmm3
; AVX512F-NEXT: vmovsd {{.*#+}} xmm4 = mem[0],zero
; AVX512F-NEXT: vmovsd {{.*#+}} xmm5 = mem[0],zero
; AVX512F-NEXT: vdivps %xmm5, %xmm4, %xmm4
; AVX512F-NEXT: vmovsd {{.*#+}} xmm5 = mem[0],zero
; AVX512F-NEXT: vmovsd {{.*#+}} xmm6 = mem[0],zero
; AVX512F-NEXT: vdivps %xmm6, %xmm5, %xmm5
; AVX512F-NEXT: vmovsd {{.*#+}} xmm6 = mem[0],zero
; AVX512F-NEXT: vmovsd {{.*#+}} xmm7 = mem[0],zero
; AVX512F-NEXT: vdivps %xmm7, %xmm6, %xmm6
; AVX512F-NEXT: vmovsd {{.*#+}} xmm7 = mem[0],zero
; AVX512F-NEXT: vmovsd {{.*#+}} xmm8 = mem[0],zero
; AVX512F-NEXT: vdivps %xmm8, %xmm7, %xmm7
; AVX512F-NEXT: vinsertf32x4 $1, %xmm7, %zmm6, %zmm6
; AVX512F-NEXT: vinsertf32x4 $1, %xmm5, %zmm4, %zmm4
; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,2,8,10,0,2,8,10]
; AVX512F-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3]
; AVX512F-NEXT: vpermt2pd %zmm6, %zmm5, %zmm4
; AVX512F-NEXT: vinsertf32x4 $1, %xmm3, %zmm2, %zmm2
; AVX512F-NEXT: vinsertf32x4 $1, %xmm1, %zmm0, %zmm0
; AVX512F-NEXT: vpermt2pd %zmm2, %zmm5, %zmm0
; AVX512F-NEXT: vinsertf64x4 $0, %ymm0, %zmm4, %zmm0
; AVX512F-NEXT: vmovupd %zmm0, (%rdx)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: widen_fdiv_v2f32_v16f32:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX512VL-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
; AVX512VL-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero
; AVX512VL-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero
; AVX512VL-NEXT: vmovsd {{.*#+}} xmm4 = mem[0],zero
; AVX512VL-NEXT: vdivps %xmm4, %xmm0, %xmm0
; AVX512VL-NEXT: vmovsd {{.*#+}} xmm4 = mem[0],zero
; AVX512VL-NEXT: vdivps %xmm4, %xmm1, %xmm1
; AVX512VL-NEXT: vmovsd {{.*#+}} xmm4 = mem[0],zero
; AVX512VL-NEXT: vdivps %xmm4, %xmm2, %xmm2
; AVX512VL-NEXT: vmovsd {{.*#+}} xmm4 = mem[0],zero
; AVX512VL-NEXT: vdivps %xmm4, %xmm3, %xmm3
; AVX512VL-NEXT: vmovsd {{.*#+}} xmm4 = mem[0],zero
; AVX512VL-NEXT: vmovsd {{.*#+}} xmm5 = mem[0],zero
; AVX512VL-NEXT: vdivps %xmm5, %xmm4, %xmm4
; AVX512VL-NEXT: vmovsd {{.*#+}} xmm5 = mem[0],zero
; AVX512VL-NEXT: vmovsd {{.*#+}} xmm6 = mem[0],zero
; AVX512VL-NEXT: vdivps %xmm6, %xmm5, %xmm5
; AVX512VL-NEXT: vmovsd {{.*#+}} xmm6 = mem[0],zero
; AVX512VL-NEXT: vmovsd {{.*#+}} xmm7 = mem[0],zero
; AVX512VL-NEXT: vdivps %xmm7, %xmm6, %xmm6
; AVX512VL-NEXT: vmovsd {{.*#+}} xmm7 = mem[0],zero
; AVX512VL-NEXT: vmovsd {{.*#+}} xmm8 = mem[0],zero
; AVX512VL-NEXT: vdivps %xmm8, %xmm7, %xmm7
; AVX512VL-NEXT: vinsertf32x4 $1, %xmm7, %zmm6, %zmm6
; AVX512VL-NEXT: vinsertf32x4 $1, %xmm5, %zmm4, %zmm4
; AVX512VL-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,2,8,10,0,2,8,10]
; AVX512VL-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3]
; AVX512VL-NEXT: vpermi2pd %zmm6, %zmm4, %zmm5
; AVX512VL-NEXT: vinsertf32x4 $1, %xmm3, %zmm2, %zmm2
; AVX512VL-NEXT: vinsertf32x4 $1, %xmm1, %zmm0, %zmm0
; AVX512VL-NEXT: vmovapd {{.*#+}} ymm1 = [0,2,4,6]
; AVX512VL-NEXT: vpermi2pd %ymm2, %ymm0, %ymm1
; AVX512VL-NEXT: vinsertf64x4 $0, %ymm1, %zmm5, %zmm0
; AVX512VL-NEXT: vmovupd %zmm0, (%rdx)
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
  %a2 = getelementptr inbounds i8, ptr %a0, i64 8
  %b2 = getelementptr inbounds i8, ptr %b0, i64 8
  %c2 = getelementptr inbounds i8, ptr %c0, i64 8
  %a4 = getelementptr inbounds i8, ptr %a0, i64 16
  %b4 = getelementptr inbounds i8, ptr %b0, i64 16
  %c4 = getelementptr inbounds i8, ptr %c0, i64 16
  %a6 = getelementptr inbounds i8, ptr %a0, i64 24
  %b6 = getelementptr inbounds i8, ptr %b0, i64 24
  %c6 = getelementptr inbounds i8, ptr %c0, i64 24
  %a8 = getelementptr inbounds i8, ptr %a0, i64 32
  %b8 = getelementptr inbounds i8, ptr %b0, i64 32
  %c8 = getelementptr inbounds i8, ptr %c0, i64 32
  %a10 = getelementptr inbounds i8, ptr %a0, i64 40
  %b10 = getelementptr inbounds i8, ptr %b0, i64 40
  %c10 = getelementptr inbounds i8, ptr %c0, i64 40
  %a12 = getelementptr inbounds i8, ptr %a0, i64 48
  %b12 = getelementptr inbounds i8, ptr %b0, i64 48
  %c12 = getelementptr inbounds i8, ptr %c0, i64 48
  %a14 = getelementptr inbounds i8, ptr %a0, i64 56
  %b14 = getelementptr inbounds i8, ptr %b0, i64 56
  %c14 = getelementptr inbounds i8, ptr %c0, i64 56
  %va0 = load <2 x float>, ptr %a0, align 4
  %vb0 = load <2 x float>, ptr %b0, align 4
  %va2 = load <2 x float>, ptr %a2, align 4
  %vb2 = load <2 x float>, ptr %b2, align 4
  %va4 = load <2 x float>, ptr %a4, align 4
  %vb4 = load <2 x float>, ptr %b4, align 4
  %va6 = load <2 x float>, ptr %a6, align 4
  %vb6 = load <2 x float>, ptr %b6, align 4
  %va8 = load <2 x float>, ptr %a8, align 4
  %vb8 = load <2 x float>, ptr %b8, align 4
  %va10 = load <2 x float>, ptr %a10, align 4
  %vb10 = load <2 x float>, ptr %b10, align 4
  %va12 = load <2 x float>, ptr %a12, align 4
  %vb12 = load <2 x float>, ptr %b12, align 4
  %va14 = load <2 x float>, ptr %a14, align 4
  %vb14 = load <2 x float>, ptr %b14, align 4
  %vc0 = fdiv <2 x float> %va0, %vb0
  %vc2 = fdiv <2 x float> %va2, %vb2
  %vc4 = fdiv <2 x float> %va4, %vb4
  %vc6 = fdiv <2 x float> %va6, %vb6
  %vc8 = fdiv <2 x float> %va8, %vb8
  %vc10 = fdiv <2 x float> %va10, %vb10
  %vc12 = fdiv <2 x float> %va12, %vb12
  %vc14 = fdiv <2 x float> %va14, %vb14
  store <2 x float> %vc0, ptr %c0, align 4
  store <2 x float> %vc2, ptr %c2, align 4
  store <2 x float> %vc4, ptr %c4, align 4
  store <2 x float> %vc6, ptr %c6, align 4
  store <2 x float> %vc8, ptr %c8, align 4
  store <2 x float> %vc10, ptr %c10, align 4
  store <2 x float> %vc12, ptr %c12, align 4
  store <2 x float> %vc14, ptr %c14, align 4
  ret void
}