1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X86-SSE
3 ; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=X86-AVX,X86-AVX1
4 ; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=X86-AVX,X86-AVX2
5 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X64-SSE
6 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=X64-AVX,X64-AVX1
7 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=X64-AVX,X64-AVX2
9 @c = external dso_local global ptr, align 8
11 ; %val1 = load <2 x i8>
12 ; %op1 = zext <2 x i8> %val1 to <2 x i32>
13 ; %val2 = load <2 x i8>
14 ; %op2 = zext <2 x i8> %val2 to <2 x i32>
15 ; %rst = mul <2 x i32> %op1, %op2
; Widening multiply of two <2 x i8> vectors. Both inputs are read at byte offset
; %index from %a and %b, zero-extended to i32 lanes and multiplied; the <2 x i32>
; product is stored at i32 element %index of the buffer whose address is loaded
; from the global @c.
; Lowering pinned by the assertions below: the SSE2 runs zero-extend by
; interleaving with a zeroed register (punpcklbw), multiply with pmullw and widen
; the 16-bit products with punpcklwd; the AVX runs use vpmovzxbd followed by
; vpmaddwd.
; The assertion lines are autogenerated (see the NOTE on the first line of this
; file); regenerate them with the script instead of editing them by hand.
17 define void @mul_2xi8(ptr nocapture readonly %a, ptr nocapture readonly %b, i64 %index) nounwind {
18 ; X86-SSE-LABEL: mul_2xi8:
19 ; X86-SSE: # %bb.0: # %entry
20 ; X86-SSE-NEXT: pushl %esi
21 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
22 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
23 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx
24 ; X86-SSE-NEXT: movl c, %esi
25 ; X86-SSE-NEXT: movzwl (%edx,%eax), %edx
26 ; X86-SSE-NEXT: movd %edx, %xmm0
27 ; X86-SSE-NEXT: movzwl (%ecx,%eax), %ecx
28 ; X86-SSE-NEXT: movd %ecx, %xmm1
29 ; X86-SSE-NEXT: pxor %xmm2, %xmm2
30 ; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
31 ; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
32 ; X86-SSE-NEXT: pmullw %xmm0, %xmm1
33 ; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
34 ; X86-SSE-NEXT: movq %xmm1, (%esi,%eax,4)
35 ; X86-SSE-NEXT: popl %esi
38 ; X86-AVX-LABEL: mul_2xi8:
39 ; X86-AVX: # %bb.0: # %entry
40 ; X86-AVX-NEXT: pushl %esi
41 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
42 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
43 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx
44 ; X86-AVX-NEXT: movl c, %esi
45 ; X86-AVX-NEXT: movzwl (%edx,%ecx), %edx
46 ; X86-AVX-NEXT: vmovd %edx, %xmm0
47 ; X86-AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
48 ; X86-AVX-NEXT: movzwl (%eax,%ecx), %eax
49 ; X86-AVX-NEXT: vmovd %eax, %xmm1
50 ; X86-AVX-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
51 ; X86-AVX-NEXT: vpmaddwd %xmm0, %xmm1, %xmm0
52 ; X86-AVX-NEXT: vmovq %xmm0, (%esi,%ecx,4)
53 ; X86-AVX-NEXT: popl %esi
56 ; X64-SSE-LABEL: mul_2xi8:
57 ; X64-SSE: # %bb.0: # %entry
58 ; X64-SSE-NEXT: movq c(%rip), %rax
59 ; X64-SSE-NEXT: movzwl (%rdi,%rdx), %ecx
60 ; X64-SSE-NEXT: movd %ecx, %xmm0
61 ; X64-SSE-NEXT: movzwl (%rsi,%rdx), %ecx
62 ; X64-SSE-NEXT: movd %ecx, %xmm1
63 ; X64-SSE-NEXT: pxor %xmm2, %xmm2
64 ; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
65 ; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
66 ; X64-SSE-NEXT: pmullw %xmm0, %xmm1
67 ; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
68 ; X64-SSE-NEXT: movq %xmm1, (%rax,%rdx,4)
71 ; X64-AVX-LABEL: mul_2xi8:
72 ; X64-AVX: # %bb.0: # %entry
73 ; X64-AVX-NEXT: movq c(%rip), %rax
74 ; X64-AVX-NEXT: movzwl (%rdi,%rdx), %ecx
75 ; X64-AVX-NEXT: vmovd %ecx, %xmm0
76 ; X64-AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
77 ; X64-AVX-NEXT: movzwl (%rsi,%rdx), %ecx
78 ; X64-AVX-NEXT: vmovd %ecx, %xmm1
79 ; X64-AVX-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
80 ; X64-AVX-NEXT: vpmaddwd %xmm0, %xmm1, %xmm0
81 ; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rdx,4)
; %pre is the destination base pointer, read from the global @c.
84 %pre = load ptr, ptr @c
; First operand: <2 x i8> at %a + %index bytes (align 1), widened to i32 lanes.
85 %tmp6 = getelementptr inbounds i8, ptr %a, i64 %index
86 %wide.load = load <2 x i8>, ptr %tmp6, align 1
87 %tmp8 = zext <2 x i8> %wide.load to <2 x i32>
; Second operand: same shape, read from %b at the same byte offset.
88 %tmp10 = getelementptr inbounds i8, ptr %b, i64 %index
89 %wide.load17 = load <2 x i8>, ptr %tmp10, align 1
90 %tmp12 = zext <2 x i8> %wide.load17 to <2 x i32>
; The multiply carries both the nuw and nsw (no-wrap) flags.
91 %tmp13 = mul nuw nsw <2 x i32> %tmp12, %tmp8
; This GEP is i32-typed, so the store lands 4 * %index bytes past %pre.
92 %tmp14 = getelementptr inbounds i32, ptr %pre, i64 %index
93 store <2 x i32> %tmp13, ptr %tmp14, align 4
97 ; %val1 = load <4 x i8>
98 ; %op1 = zext <4 x i8> %val1 to <4 x i32>
99 ; %val2 = load <4 x i8>
100 ; %op2 = zext <4 x i8> %val2 to <4 x i32>
101 ; %rst = mul <4 x i32> %op1, %op2
; Widening multiply of two <4 x i8> vectors. Both inputs are read at byte offset
; %index from %a and %b, zero-extended to i32 lanes and multiplied; the <4 x i32>
; product is stored at i32 element %index of the buffer whose address is loaded
; from the global @c.
; Lowering pinned by the assertions below: the SSE2 runs load each operand with
; movd, zero-extend via punpcklbw against a zeroed register, multiply with pmullw
; and widen with punpcklwd; the AVX runs fold each load into vpmovzxbd and
; multiply with a single vpmaddwd.
; The assertion lines are autogenerated (see the NOTE on the first line of this
; file); regenerate them with the script instead of editing them by hand.
103 define void @mul_4xi8(ptr nocapture readonly %a, ptr nocapture readonly %b, i64 %index) nounwind {
104 ; X86-SSE-LABEL: mul_4xi8:
105 ; X86-SSE: # %bb.0: # %entry
106 ; X86-SSE-NEXT: pushl %esi
107 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
108 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
109 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx
110 ; X86-SSE-NEXT: movl c, %esi
111 ; X86-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
112 ; X86-SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
113 ; X86-SSE-NEXT: pxor %xmm2, %xmm2
114 ; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
115 ; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
116 ; X86-SSE-NEXT: pmullw %xmm0, %xmm1
117 ; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
118 ; X86-SSE-NEXT: movdqu %xmm1, (%esi,%ecx,4)
119 ; X86-SSE-NEXT: popl %esi
122 ; X86-AVX-LABEL: mul_4xi8:
123 ; X86-AVX: # %bb.0: # %entry
124 ; X86-AVX-NEXT: pushl %esi
125 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
126 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
127 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx
128 ; X86-AVX-NEXT: movl c, %esi
129 ; X86-AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
130 ; X86-AVX-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
131 ; X86-AVX-NEXT: vpmaddwd %xmm0, %xmm1, %xmm0
132 ; X86-AVX-NEXT: vmovdqu %xmm0, (%esi,%ecx,4)
133 ; X86-AVX-NEXT: popl %esi
136 ; X64-SSE-LABEL: mul_4xi8:
137 ; X64-SSE: # %bb.0: # %entry
138 ; X64-SSE-NEXT: movq c(%rip), %rax
139 ; X64-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
140 ; X64-SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
141 ; X64-SSE-NEXT: pxor %xmm2, %xmm2
142 ; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
143 ; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
144 ; X64-SSE-NEXT: pmullw %xmm0, %xmm1
145 ; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
146 ; X64-SSE-NEXT: movdqu %xmm1, (%rax,%rdx,4)
149 ; X64-AVX-LABEL: mul_4xi8:
150 ; X64-AVX: # %bb.0: # %entry
151 ; X64-AVX-NEXT: movq c(%rip), %rax
152 ; X64-AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
153 ; X64-AVX-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
154 ; X64-AVX-NEXT: vpmaddwd %xmm0, %xmm1, %xmm0
155 ; X64-AVX-NEXT: vmovdqu %xmm0, (%rax,%rdx,4)
; %pre is the destination base pointer, read from the global @c.
158 %pre = load ptr, ptr @c
; First operand: <4 x i8> at %a + %index bytes (align 1), widened to i32 lanes.
159 %tmp6 = getelementptr inbounds i8, ptr %a, i64 %index
160 %wide.load = load <4 x i8>, ptr %tmp6, align 1
161 %tmp8 = zext <4 x i8> %wide.load to <4 x i32>
; Second operand: same shape, read from %b at the same byte offset.
162 %tmp10 = getelementptr inbounds i8, ptr %b, i64 %index
163 %wide.load17 = load <4 x i8>, ptr %tmp10, align 1
164 %tmp12 = zext <4 x i8> %wide.load17 to <4 x i32>
; The multiply carries both the nuw and nsw (no-wrap) flags.
165 %tmp13 = mul nuw nsw <4 x i32> %tmp12, %tmp8
; This GEP is i32-typed, so the store lands 4 * %index bytes past %pre.
166 %tmp14 = getelementptr inbounds i32, ptr %pre, i64 %index
167 store <4 x i32> %tmp13, ptr %tmp14, align 4
171 ; %val1 = load <8 x i8>
172 ; %op1 = zext <8 x i8> %val1 to <8 x i32>
173 ; %val2 = load <8 x i8>
174 ; %op2 = zext <8 x i8> %val2 to <8 x i32>
175 ; %rst = mul <8 x i32> %op1, %op2
; Widening multiply of two <8 x i8> vectors. Both inputs are read at byte offset
; %index from %a and %b, zero-extended to i32 lanes and multiplied; the <8 x i32>
; product is stored at i32 element %index of the buffer whose address is loaded
; from the global @c.
; Lowering pinned by the assertions below:
;  - SSE2 runs: movq loads, punpcklbw against zero, one pmullw, then
;    punpcklwd/punpckhwd to widen, written out as two 16-byte stores.
;  - AVX1 runs: four 128-bit vpmovzxbd loads, two vpmaddwd, two 16-byte stores.
;  - AVX2 runs: two 256-bit vpmovzxbd loads, one vpmaddwd, one 32-byte store,
;    followed by vzeroupper before returning.
; The assertion lines are autogenerated (see the NOTE on the first line of this
; file); regenerate them with the script instead of editing them by hand.
177 define void @mul_8xi8(ptr nocapture readonly %a, ptr nocapture readonly %b, i64 %index) nounwind {
178 ; X86-SSE-LABEL: mul_8xi8:
179 ; X86-SSE: # %bb.0: # %entry
180 ; X86-SSE-NEXT: pushl %esi
181 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx
182 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
183 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %esi
184 ; X86-SSE-NEXT: movl c, %ecx
185 ; X86-SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
186 ; X86-SSE-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
187 ; X86-SSE-NEXT: pxor %xmm2, %xmm2
188 ; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
189 ; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
190 ; X86-SSE-NEXT: pmullw %xmm0, %xmm1
191 ; X86-SSE-NEXT: movdqa %xmm1, %xmm0
192 ; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
193 ; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
194 ; X86-SSE-NEXT: movdqu %xmm1, 16(%ecx,%eax,4)
195 ; X86-SSE-NEXT: movdqu %xmm0, (%ecx,%eax,4)
196 ; X86-SSE-NEXT: popl %esi
199 ; X86-AVX1-LABEL: mul_8xi8:
200 ; X86-AVX1: # %bb.0: # %entry
201 ; X86-AVX1-NEXT: pushl %esi
202 ; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax
203 ; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %ecx
204 ; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %edx
205 ; X86-AVX1-NEXT: movl c, %esi
206 ; X86-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
207 ; X86-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
208 ; X86-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
209 ; X86-AVX1-NEXT: vpmaddwd %xmm0, %xmm2, %xmm0
210 ; X86-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
211 ; X86-AVX1-NEXT: vpmaddwd %xmm1, %xmm2, %xmm1
212 ; X86-AVX1-NEXT: vmovdqu %xmm0, 16(%esi,%ecx,4)
213 ; X86-AVX1-NEXT: vmovdqu %xmm1, (%esi,%ecx,4)
214 ; X86-AVX1-NEXT: popl %esi
215 ; X86-AVX1-NEXT: retl
217 ; X86-AVX2-LABEL: mul_8xi8:
218 ; X86-AVX2: # %bb.0: # %entry
219 ; X86-AVX2-NEXT: pushl %esi
220 ; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax
221 ; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %ecx
222 ; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %edx
223 ; X86-AVX2-NEXT: movl c, %esi
224 ; X86-AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
225 ; X86-AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
226 ; X86-AVX2-NEXT: vpmaddwd %ymm0, %ymm1, %ymm0
227 ; X86-AVX2-NEXT: vmovdqu %ymm0, (%esi,%ecx,4)
228 ; X86-AVX2-NEXT: popl %esi
229 ; X86-AVX2-NEXT: vzeroupper
230 ; X86-AVX2-NEXT: retl
232 ; X64-SSE-LABEL: mul_8xi8:
233 ; X64-SSE: # %bb.0: # %entry
234 ; X64-SSE-NEXT: movq c(%rip), %rax
235 ; X64-SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
236 ; X64-SSE-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
237 ; X64-SSE-NEXT: pxor %xmm2, %xmm2
238 ; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
239 ; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
240 ; X64-SSE-NEXT: pmullw %xmm0, %xmm1
241 ; X64-SSE-NEXT: movdqa %xmm1, %xmm0
242 ; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
243 ; X64-SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
244 ; X64-SSE-NEXT: movdqu %xmm1, 16(%rax,%rdx,4)
245 ; X64-SSE-NEXT: movdqu %xmm0, (%rax,%rdx,4)
248 ; X64-AVX1-LABEL: mul_8xi8:
249 ; X64-AVX1: # %bb.0: # %entry
250 ; X64-AVX1-NEXT: movq c(%rip), %rax
251 ; X64-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
252 ; X64-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
253 ; X64-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
254 ; X64-AVX1-NEXT: vpmaddwd %xmm0, %xmm2, %xmm0
255 ; X64-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
256 ; X64-AVX1-NEXT: vpmaddwd %xmm1, %xmm2, %xmm1
257 ; X64-AVX1-NEXT: vmovdqu %xmm0, 16(%rax,%rdx,4)
258 ; X64-AVX1-NEXT: vmovdqu %xmm1, (%rax,%rdx,4)
259 ; X64-AVX1-NEXT: retq
261 ; X64-AVX2-LABEL: mul_8xi8:
262 ; X64-AVX2: # %bb.0: # %entry
263 ; X64-AVX2-NEXT: movq c(%rip), %rax
264 ; X64-AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
265 ; X64-AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
266 ; X64-AVX2-NEXT: vpmaddwd %ymm0, %ymm1, %ymm0
267 ; X64-AVX2-NEXT: vmovdqu %ymm0, (%rax,%rdx,4)
268 ; X64-AVX2-NEXT: vzeroupper
269 ; X64-AVX2-NEXT: retq
; %pre is the destination base pointer, read from the global @c.
271 %pre = load ptr, ptr @c
; First operand: <8 x i8> at %a + %index bytes (align 1), widened to i32 lanes.
272 %tmp6 = getelementptr inbounds i8, ptr %a, i64 %index
273 %wide.load = load <8 x i8>, ptr %tmp6, align 1
274 %tmp8 = zext <8 x i8> %wide.load to <8 x i32>
; Second operand: same shape, read from %b at the same byte offset.
275 %tmp10 = getelementptr inbounds i8, ptr %b, i64 %index
276 %wide.load17 = load <8 x i8>, ptr %tmp10, align 1
277 %tmp12 = zext <8 x i8> %wide.load17 to <8 x i32>
; The multiply carries both the nuw and nsw (no-wrap) flags.
278 %tmp13 = mul nuw nsw <8 x i32> %tmp12, %tmp8
; This GEP is i32-typed, so the store lands 4 * %index bytes past %pre.
279 %tmp14 = getelementptr inbounds i32, ptr %pre, i64 %index
280 store <8 x i32> %tmp13, ptr %tmp14, align 4
284 ; %val1 = load <16 x i8>
285 ; %op1 = zext <16 x i8> %val1 to <16 x i32>
286 ; %val2 = load <16 x i8>
287 ; %op2 = zext <16 x i8> %val2 to <16 x i32>
288 ; %rst = mul <16 x i32> %op1, %op2
; Widening multiply of two <16 x i8> vectors. Both inputs are read at byte offset
; %index from %a and %b, zero-extended to i32 lanes and multiplied; the
; <16 x i32> product is stored at i32 element %index of the buffer whose address
; is loaded from the global @c.
; Lowering pinned by the assertions below:
;  - SSE2 runs: two movdqu loads, each split into low/high halves with
;    punpcklbw/punpckhbw against zero, two pmullw, punpcklwd/punpckhwd to widen,
;    and four 16-byte stores.
;  - AVX1 runs: eight 128-bit vpmovzxbd loads, four vpmaddwd, four 16-byte stores.
;  - AVX2 runs: four 256-bit vpmovzxbd loads, two vpmaddwd, two 32-byte stores,
;    followed by vzeroupper before returning.
; The assertion lines are autogenerated (see the NOTE on the first line of this
; file); regenerate them with the script instead of editing them by hand.
290 define void @mul_16xi8(ptr nocapture readonly %a, ptr nocapture readonly %b, i64 %index) nounwind {
291 ; X86-SSE-LABEL: mul_16xi8:
292 ; X86-SSE: # %bb.0: # %entry
293 ; X86-SSE-NEXT: pushl %esi
294 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx
295 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
296 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %esi
297 ; X86-SSE-NEXT: movl c, %ecx
298 ; X86-SSE-NEXT: movdqu (%esi,%eax), %xmm3
299 ; X86-SSE-NEXT: movdqu (%edx,%eax), %xmm0
300 ; X86-SSE-NEXT: pxor %xmm1, %xmm1
301 ; X86-SSE-NEXT: movdqa %xmm3, %xmm4
302 ; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3],xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7]
303 ; X86-SSE-NEXT: movdqa %xmm0, %xmm2
304 ; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
305 ; X86-SSE-NEXT: pmullw %xmm4, %xmm2
306 ; X86-SSE-NEXT: movdqa %xmm2, %xmm4
307 ; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3]
308 ; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
309 ; X86-SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15]
310 ; X86-SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
311 ; X86-SSE-NEXT: pmullw %xmm3, %xmm0
312 ; X86-SSE-NEXT: movdqa %xmm0, %xmm3
313 ; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
314 ; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
315 ; X86-SSE-NEXT: movdqu %xmm0, 48(%ecx,%eax,4)
316 ; X86-SSE-NEXT: movdqu %xmm3, 32(%ecx,%eax,4)
317 ; X86-SSE-NEXT: movdqu %xmm2, 16(%ecx,%eax,4)
318 ; X86-SSE-NEXT: movdqu %xmm4, (%ecx,%eax,4)
319 ; X86-SSE-NEXT: popl %esi
322 ; X86-AVX1-LABEL: mul_16xi8:
323 ; X86-AVX1: # %bb.0: # %entry
324 ; X86-AVX1-NEXT: pushl %esi
325 ; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %edx
326 ; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax
327 ; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %esi
328 ; X86-AVX1-NEXT: movl c, %ecx
329 ; X86-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
330 ; X86-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
331 ; X86-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
332 ; X86-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
333 ; X86-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
334 ; X86-AVX1-NEXT: vpmaddwd %xmm0, %xmm4, %xmm0
335 ; X86-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
336 ; X86-AVX1-NEXT: vpmaddwd %xmm1, %xmm4, %xmm1
337 ; X86-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
338 ; X86-AVX1-NEXT: vpmaddwd %xmm2, %xmm4, %xmm2
339 ; X86-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
340 ; X86-AVX1-NEXT: vpmaddwd %xmm3, %xmm4, %xmm3
341 ; X86-AVX1-NEXT: vmovdqu %xmm0, 48(%ecx,%eax,4)
342 ; X86-AVX1-NEXT: vmovdqu %xmm1, 32(%ecx,%eax,4)
343 ; X86-AVX1-NEXT: vmovdqu %xmm2, 16(%ecx,%eax,4)
344 ; X86-AVX1-NEXT: vmovdqu %xmm3, (%ecx,%eax,4)
345 ; X86-AVX1-NEXT: popl %esi
346 ; X86-AVX1-NEXT: retl
348 ; X86-AVX2-LABEL: mul_16xi8:
349 ; X86-AVX2: # %bb.0: # %entry
350 ; X86-AVX2-NEXT: pushl %esi
351 ; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax
352 ; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %ecx
353 ; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %edx
354 ; X86-AVX2-NEXT: movl c, %esi
355 ; X86-AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
356 ; X86-AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
357 ; X86-AVX2-NEXT: vpmovzxbd {{.*#+}} ymm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
358 ; X86-AVX2-NEXT: vpmaddwd %ymm0, %ymm2, %ymm0
359 ; X86-AVX2-NEXT: vpmovzxbd {{.*#+}} ymm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
360 ; X86-AVX2-NEXT: vpmaddwd %ymm1, %ymm2, %ymm1
361 ; X86-AVX2-NEXT: vmovdqu %ymm0, 32(%esi,%ecx,4)
362 ; X86-AVX2-NEXT: vmovdqu %ymm1, (%esi,%ecx,4)
363 ; X86-AVX2-NEXT: popl %esi
364 ; X86-AVX2-NEXT: vzeroupper
365 ; X86-AVX2-NEXT: retl
367 ; X64-SSE-LABEL: mul_16xi8:
368 ; X64-SSE: # %bb.0: # %entry
369 ; X64-SSE-NEXT: movq c(%rip), %rax
370 ; X64-SSE-NEXT: movdqu (%rdi,%rdx), %xmm0
371 ; X64-SSE-NEXT: movdqu (%rsi,%rdx), %xmm1
372 ; X64-SSE-NEXT: pxor %xmm2, %xmm2
373 ; X64-SSE-NEXT: movdqa %xmm0, %xmm3
374 ; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
375 ; X64-SSE-NEXT: movdqa %xmm1, %xmm4
376 ; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
377 ; X64-SSE-NEXT: pmullw %xmm3, %xmm4
378 ; X64-SSE-NEXT: movdqa %xmm4, %xmm3
379 ; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
380 ; X64-SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
381 ; X64-SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
382 ; X64-SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
383 ; X64-SSE-NEXT: pmullw %xmm0, %xmm1
384 ; X64-SSE-NEXT: movdqa %xmm1, %xmm0
385 ; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
386 ; X64-SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
387 ; X64-SSE-NEXT: movdqu %xmm1, 48(%rax,%rdx,4)
388 ; X64-SSE-NEXT: movdqu %xmm0, 32(%rax,%rdx,4)
389 ; X64-SSE-NEXT: movdqu %xmm4, 16(%rax,%rdx,4)
390 ; X64-SSE-NEXT: movdqu %xmm3, (%rax,%rdx,4)
393 ; X64-AVX1-LABEL: mul_16xi8:
394 ; X64-AVX1: # %bb.0: # %entry
395 ; X64-AVX1-NEXT: movq c(%rip), %rax
396 ; X64-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
397 ; X64-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
398 ; X64-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
399 ; X64-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
400 ; X64-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
401 ; X64-AVX1-NEXT: vpmaddwd %xmm0, %xmm4, %xmm0
402 ; X64-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
403 ; X64-AVX1-NEXT: vpmaddwd %xmm1, %xmm4, %xmm1
404 ; X64-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
405 ; X64-AVX1-NEXT: vpmaddwd %xmm2, %xmm4, %xmm2
406 ; X64-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
407 ; X64-AVX1-NEXT: vpmaddwd %xmm3, %xmm4, %xmm3
408 ; X64-AVX1-NEXT: vmovdqu %xmm0, 48(%rax,%rdx,4)
409 ; X64-AVX1-NEXT: vmovdqu %xmm1, 32(%rax,%rdx,4)
410 ; X64-AVX1-NEXT: vmovdqu %xmm2, 16(%rax,%rdx,4)
411 ; X64-AVX1-NEXT: vmovdqu %xmm3, (%rax,%rdx,4)
412 ; X64-AVX1-NEXT: retq
414 ; X64-AVX2-LABEL: mul_16xi8:
415 ; X64-AVX2: # %bb.0: # %entry
416 ; X64-AVX2-NEXT: movq c(%rip), %rax
417 ; X64-AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
418 ; X64-AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
419 ; X64-AVX2-NEXT: vpmovzxbd {{.*#+}} ymm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
420 ; X64-AVX2-NEXT: vpmaddwd %ymm0, %ymm2, %ymm0
421 ; X64-AVX2-NEXT: vpmovzxbd {{.*#+}} ymm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
422 ; X64-AVX2-NEXT: vpmaddwd %ymm1, %ymm2, %ymm1
423 ; X64-AVX2-NEXT: vmovdqu %ymm0, 32(%rax,%rdx,4)
424 ; X64-AVX2-NEXT: vmovdqu %ymm1, (%rax,%rdx,4)
425 ; X64-AVX2-NEXT: vzeroupper
426 ; X64-AVX2-NEXT: retq
; %pre is the destination base pointer, read from the global @c.
428 %pre = load ptr, ptr @c
; First operand: <16 x i8> at %a + %index bytes (align 1), widened to i32 lanes.
429 %tmp6 = getelementptr inbounds i8, ptr %a, i64 %index
430 %wide.load = load <16 x i8>, ptr %tmp6, align 1
431 %tmp8 = zext <16 x i8> %wide.load to <16 x i32>
; Second operand: same shape, read from %b at the same byte offset.
432 %tmp10 = getelementptr inbounds i8, ptr %b, i64 %index
433 %wide.load17 = load <16 x i8>, ptr %tmp10, align 1
434 %tmp12 = zext <16 x i8> %wide.load17 to <16 x i32>
; The multiply carries both the nuw and nsw (no-wrap) flags.
435 %tmp13 = mul nuw nsw <16 x i32> %tmp12, %tmp8
; This GEP is i32-typed, so the store lands 4 * %index bytes past %pre.
436 %tmp14 = getelementptr inbounds i32, ptr %pre, i64 %index
437 store <16 x i32> %tmp13, ptr %tmp14, align 4
441 ; %val1 = load <2 x i16>
442 ; %op1 = zext <2 x i16> %val1 to <2 x i32>
443 ; %val2 = load <2 x i16>
444 ; %op2 = zext <2 x i16> %val2 to <2 x i32>
445 ; %rst = mul <2 x i32> %op1, %op2
; Widening multiply of two <2 x i16> vectors. Both inputs are read from %a and %b
; at %index, zero-extended to i32 lanes and multiplied; the <2 x i32> product is
; stored at i32 element %index of the buffer whose address is loaded from the
; global @c.
; Note that the input GEPs are i8-typed, so %index is a byte offset for the loads
; even though the elements are i16.
; Lowering pinned by the assertions below: the SSE2 runs compute the high and low
; 16-bit halves of each product with pmulhuw and pmullw and interleave them with
; punpcklwd; the AVX runs zero-extend with vpmovzxwd and multiply with vpmulld.
; The assertion lines are autogenerated (see the NOTE on the first line of this
; file); regenerate them with the script instead of editing them by hand.
447 define void @mul_2xi16(ptr nocapture readonly %a, ptr nocapture readonly %b, i64 %index) nounwind {
448 ; X86-SSE-LABEL: mul_2xi16:
449 ; X86-SSE: # %bb.0: # %entry
450 ; X86-SSE-NEXT: pushl %esi
451 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
452 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
453 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx
454 ; X86-SSE-NEXT: movl c, %esi
455 ; X86-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
456 ; X86-SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
457 ; X86-SSE-NEXT: movdqa %xmm1, %xmm2
458 ; X86-SSE-NEXT: pmulhuw %xmm0, %xmm2
459 ; X86-SSE-NEXT: pmullw %xmm0, %xmm1
460 ; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
461 ; X86-SSE-NEXT: movq %xmm1, (%esi,%ecx,4)
462 ; X86-SSE-NEXT: popl %esi
465 ; X86-AVX-LABEL: mul_2xi16:
466 ; X86-AVX: # %bb.0: # %entry
467 ; X86-AVX-NEXT: pushl %esi
468 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
469 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
470 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx
471 ; X86-AVX-NEXT: movl c, %esi
472 ; X86-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
473 ; X86-AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
474 ; X86-AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
475 ; X86-AVX-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
476 ; X86-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0
477 ; X86-AVX-NEXT: vmovq %xmm0, (%esi,%ecx,4)
478 ; X86-AVX-NEXT: popl %esi
481 ; X64-SSE-LABEL: mul_2xi16:
482 ; X64-SSE: # %bb.0: # %entry
483 ; X64-SSE-NEXT: movq c(%rip), %rax
484 ; X64-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
485 ; X64-SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
486 ; X64-SSE-NEXT: movdqa %xmm1, %xmm2
487 ; X64-SSE-NEXT: pmulhuw %xmm0, %xmm2
488 ; X64-SSE-NEXT: pmullw %xmm0, %xmm1
489 ; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
490 ; X64-SSE-NEXT: movq %xmm1, (%rax,%rdx,4)
493 ; X64-AVX-LABEL: mul_2xi16:
494 ; X64-AVX: # %bb.0: # %entry
495 ; X64-AVX-NEXT: movq c(%rip), %rax
496 ; X64-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
497 ; X64-AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
498 ; X64-AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
499 ; X64-AVX-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
500 ; X64-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0
501 ; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rdx,4)
; %pre is the destination base pointer, read from the global @c.
504 %pre = load ptr, ptr @c
; First operand: <2 x i16> at %a + %index bytes; align 1, so the i16 elements
; are not assumed to be naturally aligned. Widened to i32 lanes.
505 %tmp6 = getelementptr inbounds i8, ptr %a, i64 %index
506 %wide.load = load <2 x i16>, ptr %tmp6, align 1
507 %tmp8 = zext <2 x i16> %wide.load to <2 x i32>
; Second operand: same shape, read from %b at the same byte offset.
508 %tmp10 = getelementptr inbounds i8, ptr %b, i64 %index
509 %wide.load17 = load <2 x i16>, ptr %tmp10, align 1
510 %tmp12 = zext <2 x i16> %wide.load17 to <2 x i32>
; The multiply carries both the nuw and nsw (no-wrap) flags.
511 %tmp13 = mul nuw nsw <2 x i32> %tmp12, %tmp8
; This GEP is i32-typed, so the store lands 4 * %index bytes past %pre.
512 %tmp14 = getelementptr inbounds i32, ptr %pre, i64 %index
513 store <2 x i32> %tmp13, ptr %tmp14, align 4
517 ; %val1 = load <4 x i16>
518 ; %op1 = zext <4 x i16> %val1 to <4 x i32>
519 ; %val2 = load <4 x i16>
520 ; %op2 = zext <4 x i16> %val2 to <4 x i32>
521 ; %rst = mul <4 x i32> %op1, %op2
; Widening multiply of two <4 x i16> vectors. Both inputs are read from %a and %b
; at %index, zero-extended to i32 lanes and multiplied; the <4 x i32> product is
; stored at i32 element %index of the buffer whose address is loaded from the
; global @c.
; Note that the input GEPs are i8-typed, so %index is a byte offset for the loads
; even though the elements are i16.
; Lowering pinned by the assertions below: the SSE2 runs load each operand with
; movq, compute the high and low 16-bit halves of each product with pmulhuw and
; pmullw and interleave them with punpcklwd; the AVX runs fold each load into
; vpmovzxwd and multiply with vpmulld.
; The assertion lines are autogenerated (see the NOTE on the first line of this
; file); regenerate them with the script instead of editing them by hand.
523 define void @mul_4xi16(ptr nocapture readonly %a, ptr nocapture readonly %b, i64 %index) nounwind {
524 ; X86-SSE-LABEL: mul_4xi16:
525 ; X86-SSE: # %bb.0: # %entry
526 ; X86-SSE-NEXT: pushl %esi
527 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
528 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
529 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx
530 ; X86-SSE-NEXT: movl c, %esi
531 ; X86-SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
532 ; X86-SSE-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
533 ; X86-SSE-NEXT: movdqa %xmm1, %xmm2
534 ; X86-SSE-NEXT: pmulhuw %xmm0, %xmm2
535 ; X86-SSE-NEXT: pmullw %xmm0, %xmm1
536 ; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
537 ; X86-SSE-NEXT: movdqu %xmm1, (%esi,%ecx,4)
538 ; X86-SSE-NEXT: popl %esi
541 ; X86-AVX-LABEL: mul_4xi16:
542 ; X86-AVX: # %bb.0: # %entry
543 ; X86-AVX-NEXT: pushl %esi
544 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
545 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
546 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx
547 ; X86-AVX-NEXT: movl c, %esi
548 ; X86-AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
549 ; X86-AVX-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
550 ; X86-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0
551 ; X86-AVX-NEXT: vmovdqu %xmm0, (%esi,%ecx,4)
552 ; X86-AVX-NEXT: popl %esi
555 ; X64-SSE-LABEL: mul_4xi16:
556 ; X64-SSE: # %bb.0: # %entry
557 ; X64-SSE-NEXT: movq c(%rip), %rax
558 ; X64-SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
559 ; X64-SSE-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
560 ; X64-SSE-NEXT: movdqa %xmm1, %xmm2
561 ; X64-SSE-NEXT: pmulhuw %xmm0, %xmm2
562 ; X64-SSE-NEXT: pmullw %xmm0, %xmm1
563 ; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
564 ; X64-SSE-NEXT: movdqu %xmm1, (%rax,%rdx,4)
567 ; X64-AVX-LABEL: mul_4xi16:
568 ; X64-AVX: # %bb.0: # %entry
569 ; X64-AVX-NEXT: movq c(%rip), %rax
570 ; X64-AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
571 ; X64-AVX-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
572 ; X64-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0
573 ; X64-AVX-NEXT: vmovdqu %xmm0, (%rax,%rdx,4)
; %pre is the destination base pointer, read from the global @c.
576 %pre = load ptr, ptr @c
; First operand: <4 x i16> at %a + %index bytes; align 1, so the i16 elements
; are not assumed to be naturally aligned. Widened to i32 lanes.
577 %tmp6 = getelementptr inbounds i8, ptr %a, i64 %index
578 %wide.load = load <4 x i16>, ptr %tmp6, align 1
579 %tmp8 = zext <4 x i16> %wide.load to <4 x i32>
; Second operand: same shape, read from %b at the same byte offset.
580 %tmp10 = getelementptr inbounds i8, ptr %b, i64 %index
581 %wide.load17 = load <4 x i16>, ptr %tmp10, align 1
582 %tmp12 = zext <4 x i16> %wide.load17 to <4 x i32>
; The multiply carries both the nuw and nsw (no-wrap) flags.
583 %tmp13 = mul nuw nsw <4 x i32> %tmp12, %tmp8
; This GEP is i32-typed, so the store lands 4 * %index bytes past %pre.
584 %tmp14 = getelementptr inbounds i32, ptr %pre, i64 %index
585 store <4 x i32> %tmp13, ptr %tmp14, align 4
589 ; %val1 = load <8 x i16>
590 ; %op1 = zext<8 x i32> %val1
591 ; %val2 = load <8 x i16>
592 ; %op2 = zext<8 x i32> %val2
593 ; %rst = mul <8 x i32> %op1, %op2
; mul_8xi16: multiply two zero-extended <8 x i16> vectors as <8 x i32>.
;   %a, %b - base pointers; each <8 x i16> operand is loaded (align 1) from
;            the address %index bytes past the pointer (i8 getelementptr).
;   %index - byte offset for both loads and i32 element index for the store.
; The <8 x i32> product is stored (align 4) at element %index of the array
; whose base pointer is loaded from the global @c.
; Expected lowering, per the assertions below: the SSE2 configurations build
; the 32-bit products from pmullw and pmulhuw results interleaved with
; punpcklwd/punpckhwd; the AVX1 configurations use four 128-bit vpmovzxwd
; loads and two vpmulld; the AVX2 configurations use two 256-bit vpmovzxwd
; loads, one vpmulld, and a vzeroupper before returning.
595 define void @mul_8xi16(ptr nocapture readonly %a, ptr nocapture readonly %b, i64 %index) nounwind {
596 ; X86-SSE-LABEL: mul_8xi16:
597 ; X86-SSE: # %bb.0: # %entry
598 ; X86-SSE-NEXT: pushl %esi
599 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
600 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
601 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx
602 ; X86-SSE-NEXT: movl c, %esi
603 ; X86-SSE-NEXT: movdqu (%edx,%ecx), %xmm0
604 ; X86-SSE-NEXT: movdqu (%eax,%ecx), %xmm1
605 ; X86-SSE-NEXT: movdqa %xmm1, %xmm2
606 ; X86-SSE-NEXT: pmulhuw %xmm0, %xmm2
607 ; X86-SSE-NEXT: pmullw %xmm0, %xmm1
608 ; X86-SSE-NEXT: movdqa %xmm1, %xmm0
609 ; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
610 ; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
611 ; X86-SSE-NEXT: movdqu %xmm1, 16(%esi,%ecx,4)
612 ; X86-SSE-NEXT: movdqu %xmm0, (%esi,%ecx,4)
613 ; X86-SSE-NEXT: popl %esi
616 ; X86-AVX1-LABEL: mul_8xi16:
617 ; X86-AVX1: # %bb.0: # %entry
618 ; X86-AVX1-NEXT: pushl %esi
619 ; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax
620 ; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %ecx
621 ; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %edx
622 ; X86-AVX1-NEXT: movl c, %esi
623 ; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
624 ; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
625 ; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
626 ; X86-AVX1-NEXT: vpmulld %xmm0, %xmm2, %xmm0
627 ; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
628 ; X86-AVX1-NEXT: vpmulld %xmm1, %xmm2, %xmm1
629 ; X86-AVX1-NEXT: vmovdqu %xmm0, 16(%esi,%ecx,4)
630 ; X86-AVX1-NEXT: vmovdqu %xmm1, (%esi,%ecx,4)
631 ; X86-AVX1-NEXT: popl %esi
632 ; X86-AVX1-NEXT: retl
634 ; X86-AVX2-LABEL: mul_8xi16:
635 ; X86-AVX2: # %bb.0: # %entry
636 ; X86-AVX2-NEXT: pushl %esi
637 ; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax
638 ; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %ecx
639 ; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %edx
640 ; X86-AVX2-NEXT: movl c, %esi
641 ; X86-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
642 ; X86-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
643 ; X86-AVX2-NEXT: vpmulld %ymm0, %ymm1, %ymm0
644 ; X86-AVX2-NEXT: vmovdqu %ymm0, (%esi,%ecx,4)
645 ; X86-AVX2-NEXT: popl %esi
646 ; X86-AVX2-NEXT: vzeroupper
647 ; X86-AVX2-NEXT: retl
649 ; X64-SSE-LABEL: mul_8xi16:
650 ; X64-SSE: # %bb.0: # %entry
651 ; X64-SSE-NEXT: movq c(%rip), %rax
652 ; X64-SSE-NEXT: movdqu (%rdi,%rdx), %xmm0
653 ; X64-SSE-NEXT: movdqu (%rsi,%rdx), %xmm1
654 ; X64-SSE-NEXT: movdqa %xmm1, %xmm2
655 ; X64-SSE-NEXT: pmulhuw %xmm0, %xmm2
656 ; X64-SSE-NEXT: pmullw %xmm0, %xmm1
657 ; X64-SSE-NEXT: movdqa %xmm1, %xmm0
658 ; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
659 ; X64-SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
660 ; X64-SSE-NEXT: movdqu %xmm1, 16(%rax,%rdx,4)
661 ; X64-SSE-NEXT: movdqu %xmm0, (%rax,%rdx,4)
664 ; X64-AVX1-LABEL: mul_8xi16:
665 ; X64-AVX1: # %bb.0: # %entry
666 ; X64-AVX1-NEXT: movq c(%rip), %rax
667 ; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
668 ; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
669 ; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
670 ; X64-AVX1-NEXT: vpmulld %xmm0, %xmm2, %xmm0
671 ; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
672 ; X64-AVX1-NEXT: vpmulld %xmm1, %xmm2, %xmm1
673 ; X64-AVX1-NEXT: vmovdqu %xmm0, 16(%rax,%rdx,4)
674 ; X64-AVX1-NEXT: vmovdqu %xmm1, (%rax,%rdx,4)
675 ; X64-AVX1-NEXT: retq
677 ; X64-AVX2-LABEL: mul_8xi16:
678 ; X64-AVX2: # %bb.0: # %entry
679 ; X64-AVX2-NEXT: movq c(%rip), %rax
680 ; X64-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
681 ; X64-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
682 ; X64-AVX2-NEXT: vpmulld %ymm0, %ymm1, %ymm0
683 ; X64-AVX2-NEXT: vmovdqu %ymm0, (%rax,%rdx,4)
684 ; X64-AVX2-NEXT: vzeroupper
685 ; X64-AVX2-NEXT: retq
; The base of the destination i32 array is read from the global @c.
687 %pre = load ptr, ptr @c
; Both operands are addressed in bytes (i8 getelementptr) and loaded with
; align 1, so the expected code uses unaligned vector loads.
688 %tmp6 = getelementptr inbounds i8, ptr %a, i64 %index
689 %wide.load = load <8 x i16>, ptr %tmp6, align 1
690 %tmp8 = zext <8 x i16> %wide.load to <8 x i32>
691 %tmp10 = getelementptr inbounds i8, ptr %b, i64 %index
692 %wide.load17 = load <8 x i16>, ptr %tmp10, align 1
693 %tmp12 = zext <8 x i16> %wide.load17 to <8 x i32>
694 %tmp13 = mul nuw nsw <8 x i32> %tmp12, %tmp8
; The store is indexed in i32 elements, which is where the scale-4 addressing
; in the assertions above comes from.
695 %tmp14 = getelementptr inbounds i32, ptr %pre, i64 %index
696 store <8 x i32> %tmp13, ptr %tmp14, align 4
700 ; %val1 = load <16 x i16>
701 ; %op1 = zext<16 x i32> %val1
702 ; %val2 = load <16 x i16>
703 ; %op2 = zext<16 x i32> %val2
704 ; %rst = mul <16 x i32> %op1, %op2
; mul_16xi16: multiply two zero-extended <16 x i16> vectors as <16 x i32>.
;   %a, %b - base pointers; each <16 x i16> operand is loaded (align 1) from
;            the address %index bytes past the pointer (i8 getelementptr).
;   %index - byte offset for both loads and i32 element index for the store.
; The <16 x i32> product is stored (align 4) at element %index of the array
; whose base pointer is loaded from the global @c.
; Expected lowering, per the assertions below: the SSE2 configurations handle
; the input as two 128-bit halves, each using pmulhuw/pmullw plus
; punpckhwd/punpcklwd, and write four 16-byte stores; the AVX1 configurations
; use eight 128-bit vpmovzxwd loads and four vpmulld; the AVX2 configurations
; use four 256-bit vpmovzxwd loads, two vpmulld, and a vzeroupper before
; returning.
706 define void @mul_16xi16(ptr nocapture readonly %a, ptr nocapture readonly %b, i64 %index) nounwind {
707 ; X86-SSE-LABEL: mul_16xi16:
708 ; X86-SSE: # %bb.0: # %entry
709 ; X86-SSE-NEXT: pushl %esi
710 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx
711 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
712 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %esi
713 ; X86-SSE-NEXT: movl c, %ecx
714 ; X86-SSE-NEXT: movdqu (%esi,%eax), %xmm2
715 ; X86-SSE-NEXT: movdqu 16(%esi,%eax), %xmm3
716 ; X86-SSE-NEXT: movdqu (%edx,%eax), %xmm0
717 ; X86-SSE-NEXT: movdqu 16(%edx,%eax), %xmm1
718 ; X86-SSE-NEXT: movdqa %xmm0, %xmm4
719 ; X86-SSE-NEXT: pmulhuw %xmm2, %xmm4
720 ; X86-SSE-NEXT: pmullw %xmm2, %xmm0
721 ; X86-SSE-NEXT: movdqa %xmm0, %xmm2
722 ; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
723 ; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
724 ; X86-SSE-NEXT: movdqa %xmm1, %xmm4
725 ; X86-SSE-NEXT: pmulhuw %xmm3, %xmm4
726 ; X86-SSE-NEXT: pmullw %xmm3, %xmm1
727 ; X86-SSE-NEXT: movdqa %xmm1, %xmm3
728 ; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
729 ; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
730 ; X86-SSE-NEXT: movdqu %xmm1, 32(%ecx,%eax,4)
731 ; X86-SSE-NEXT: movdqu %xmm3, 48(%ecx,%eax,4)
732 ; X86-SSE-NEXT: movdqu %xmm0, (%ecx,%eax,4)
733 ; X86-SSE-NEXT: movdqu %xmm2, 16(%ecx,%eax,4)
734 ; X86-SSE-NEXT: popl %esi
737 ; X86-AVX1-LABEL: mul_16xi16:
738 ; X86-AVX1: # %bb.0: # %entry
739 ; X86-AVX1-NEXT: pushl %esi
740 ; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %edx
741 ; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax
742 ; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %esi
743 ; X86-AVX1-NEXT: movl c, %ecx
744 ; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
745 ; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
746 ; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
747 ; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
748 ; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
749 ; X86-AVX1-NEXT: vpmulld %xmm0, %xmm4, %xmm0
750 ; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
751 ; X86-AVX1-NEXT: vpmulld %xmm1, %xmm4, %xmm1
752 ; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
753 ; X86-AVX1-NEXT: vpmulld %xmm2, %xmm4, %xmm2
754 ; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
755 ; X86-AVX1-NEXT: vpmulld %xmm3, %xmm4, %xmm3
756 ; X86-AVX1-NEXT: vmovdqu %xmm0, 48(%ecx,%eax,4)
757 ; X86-AVX1-NEXT: vmovdqu %xmm1, 32(%ecx,%eax,4)
758 ; X86-AVX1-NEXT: vmovdqu %xmm2, 16(%ecx,%eax,4)
759 ; X86-AVX1-NEXT: vmovdqu %xmm3, (%ecx,%eax,4)
760 ; X86-AVX1-NEXT: popl %esi
761 ; X86-AVX1-NEXT: retl
763 ; X86-AVX2-LABEL: mul_16xi16:
764 ; X86-AVX2: # %bb.0: # %entry
765 ; X86-AVX2-NEXT: pushl %esi
766 ; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax
767 ; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %ecx
768 ; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %edx
769 ; X86-AVX2-NEXT: movl c, %esi
770 ; X86-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
771 ; X86-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
772 ; X86-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
773 ; X86-AVX2-NEXT: vpmulld %ymm0, %ymm2, %ymm0
774 ; X86-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
775 ; X86-AVX2-NEXT: vpmulld %ymm1, %ymm2, %ymm1
776 ; X86-AVX2-NEXT: vmovdqu %ymm0, 32(%esi,%ecx,4)
777 ; X86-AVX2-NEXT: vmovdqu %ymm1, (%esi,%ecx,4)
778 ; X86-AVX2-NEXT: popl %esi
779 ; X86-AVX2-NEXT: vzeroupper
780 ; X86-AVX2-NEXT: retl
782 ; X64-SSE-LABEL: mul_16xi16:
783 ; X64-SSE: # %bb.0: # %entry
784 ; X64-SSE-NEXT: movq c(%rip), %rax
785 ; X64-SSE-NEXT: movdqu (%rdi,%rdx), %xmm0
786 ; X64-SSE-NEXT: movdqu 16(%rdi,%rdx), %xmm1
787 ; X64-SSE-NEXT: movdqu (%rsi,%rdx), %xmm2
788 ; X64-SSE-NEXT: movdqu 16(%rsi,%rdx), %xmm3
789 ; X64-SSE-NEXT: movdqa %xmm2, %xmm4
790 ; X64-SSE-NEXT: pmulhuw %xmm0, %xmm4
791 ; X64-SSE-NEXT: pmullw %xmm0, %xmm2
792 ; X64-SSE-NEXT: movdqa %xmm2, %xmm0
793 ; X64-SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
794 ; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
795 ; X64-SSE-NEXT: movdqa %xmm3, %xmm4
796 ; X64-SSE-NEXT: pmulhuw %xmm1, %xmm4
797 ; X64-SSE-NEXT: pmullw %xmm1, %xmm3
798 ; X64-SSE-NEXT: movdqa %xmm3, %xmm1
799 ; X64-SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
800 ; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
801 ; X64-SSE-NEXT: movdqu %xmm3, 32(%rax,%rdx,4)
802 ; X64-SSE-NEXT: movdqu %xmm1, 48(%rax,%rdx,4)
803 ; X64-SSE-NEXT: movdqu %xmm2, (%rax,%rdx,4)
804 ; X64-SSE-NEXT: movdqu %xmm0, 16(%rax,%rdx,4)
807 ; X64-AVX1-LABEL: mul_16xi16:
808 ; X64-AVX1: # %bb.0: # %entry
809 ; X64-AVX1-NEXT: movq c(%rip), %rax
810 ; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
811 ; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
812 ; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
813 ; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
814 ; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
815 ; X64-AVX1-NEXT: vpmulld %xmm0, %xmm4, %xmm0
816 ; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
817 ; X64-AVX1-NEXT: vpmulld %xmm1, %xmm4, %xmm1
818 ; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
819 ; X64-AVX1-NEXT: vpmulld %xmm2, %xmm4, %xmm2
820 ; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
821 ; X64-AVX1-NEXT: vpmulld %xmm3, %xmm4, %xmm3
822 ; X64-AVX1-NEXT: vmovdqu %xmm0, 48(%rax,%rdx,4)
823 ; X64-AVX1-NEXT: vmovdqu %xmm1, 32(%rax,%rdx,4)
824 ; X64-AVX1-NEXT: vmovdqu %xmm2, 16(%rax,%rdx,4)
825 ; X64-AVX1-NEXT: vmovdqu %xmm3, (%rax,%rdx,4)
826 ; X64-AVX1-NEXT: retq
828 ; X64-AVX2-LABEL: mul_16xi16:
829 ; X64-AVX2: # %bb.0: # %entry
830 ; X64-AVX2-NEXT: movq c(%rip), %rax
831 ; X64-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
832 ; X64-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
833 ; X64-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
834 ; X64-AVX2-NEXT: vpmulld %ymm0, %ymm2, %ymm0
835 ; X64-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
836 ; X64-AVX2-NEXT: vpmulld %ymm1, %ymm2, %ymm1
837 ; X64-AVX2-NEXT: vmovdqu %ymm0, 32(%rax,%rdx,4)
838 ; X64-AVX2-NEXT: vmovdqu %ymm1, (%rax,%rdx,4)
839 ; X64-AVX2-NEXT: vzeroupper
840 ; X64-AVX2-NEXT: retq
; The base of the destination i32 array is read from the global @c.
842 %pre = load ptr, ptr @c
; Both operands are addressed in bytes (i8 getelementptr) and loaded with
; align 1, so the expected code uses unaligned vector loads.
843 %tmp6 = getelementptr inbounds i8, ptr %a, i64 %index
844 %wide.load = load <16 x i16>, ptr %tmp6, align 1
845 %tmp8 = zext <16 x i16> %wide.load to <16 x i32>
846 %tmp10 = getelementptr inbounds i8, ptr %b, i64 %index
847 %wide.load17 = load <16 x i16>, ptr %tmp10, align 1
848 %tmp12 = zext <16 x i16> %wide.load17 to <16 x i32>
849 %tmp13 = mul nuw nsw <16 x i32> %tmp12, %tmp8
; The 64-byte result is indexed in i32 elements, so the expected stores use
; scale-4 addressing with displacements 0, 16, 32 and 48 (or 0 and 32 for the
; 256-bit stores).
850 %tmp14 = getelementptr inbounds i32, ptr %pre, i64 %index
851 store <16 x i32> %tmp13, ptr %tmp14, align 4
855 ; %val1 = load <2 x i8>
856 ; %op1 = sext<2 x i32> %val1
857 ; %val2 = load <2 x i8>
858 ; %op2 = sext<2 x i32> %val2
859 ; %rst = mul <2 x i32> %op1, %op2
; mul_2xi8_sext: multiply two sign-extended <2 x i8> vectors as <2 x i32>.
;   %a, %b - base pointers; each <2 x i8> operand is loaded (align 1) from
;            the address %index bytes past the pointer.
;   %index - byte offset for both loads and i32 element index for the store.
; The <2 x i32> product is stored (align 4) at element %index of the array
; whose base pointer is loaded from the global @c.
; Expected lowering, per the assertions below: each 2-byte operand is fetched
; with a 16-bit scalar load (movzwl) and moved into a vector register. The
; SSE2 configurations widen bytes to words with punpcklbw + psraw $8,
; multiply with pmullw, then widen the 16-bit products with pshuflw +
; psrad $16. The AVX configurations use vpmovsxbd and vpmulld. Both write the
; 8-byte result with a single (v)movq.
; NOTE(review): the mul carries nuw although its operands are sign-extended;
; presumably kept as-is because the flags feed the lowering being tested -
; confirm before changing them.
861 define void @mul_2xi8_sext(ptr nocapture readonly %a, ptr nocapture readonly %b, i64 %index) nounwind {
862 ; X86-SSE-LABEL: mul_2xi8_sext:
863 ; X86-SSE: # %bb.0: # %entry
864 ; X86-SSE-NEXT: pushl %esi
865 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx
866 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
867 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %esi
868 ; X86-SSE-NEXT: movl c, %ecx
869 ; X86-SSE-NEXT: movzwl (%esi,%eax), %esi
870 ; X86-SSE-NEXT: movd %esi, %xmm0
871 ; X86-SSE-NEXT: movzwl (%edx,%eax), %edx
872 ; X86-SSE-NEXT: movd %edx, %xmm1
873 ; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
874 ; X86-SSE-NEXT: psraw $8, %xmm0
875 ; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
876 ; X86-SSE-NEXT: psraw $8, %xmm1
877 ; X86-SSE-NEXT: pmullw %xmm0, %xmm1
878 ; X86-SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,0,2,1,4,5,6,7]
879 ; X86-SSE-NEXT: psrad $16, %xmm0
880 ; X86-SSE-NEXT: movq %xmm0, (%ecx,%eax,4)
881 ; X86-SSE-NEXT: popl %esi
884 ; X86-AVX-LABEL: mul_2xi8_sext:
885 ; X86-AVX: # %bb.0: # %entry
886 ; X86-AVX-NEXT: pushl %esi
887 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
888 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
889 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx
890 ; X86-AVX-NEXT: movl c, %esi
891 ; X86-AVX-NEXT: movzwl (%edx,%ecx), %edx
892 ; X86-AVX-NEXT: vmovd %edx, %xmm0
893 ; X86-AVX-NEXT: vpmovsxbd %xmm0, %xmm0
894 ; X86-AVX-NEXT: movzwl (%eax,%ecx), %eax
895 ; X86-AVX-NEXT: vmovd %eax, %xmm1
896 ; X86-AVX-NEXT: vpmovsxbd %xmm1, %xmm1
897 ; X86-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0
898 ; X86-AVX-NEXT: vmovq %xmm0, (%esi,%ecx,4)
899 ; X86-AVX-NEXT: popl %esi
902 ; X64-SSE-LABEL: mul_2xi8_sext:
903 ; X64-SSE: # %bb.0: # %entry
904 ; X64-SSE-NEXT: movq c(%rip), %rax
905 ; X64-SSE-NEXT: movzwl (%rdi,%rdx), %ecx
906 ; X64-SSE-NEXT: movd %ecx, %xmm0
907 ; X64-SSE-NEXT: movzwl (%rsi,%rdx), %ecx
908 ; X64-SSE-NEXT: movd %ecx, %xmm1
909 ; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
910 ; X64-SSE-NEXT: psraw $8, %xmm0
911 ; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
912 ; X64-SSE-NEXT: psraw $8, %xmm1
913 ; X64-SSE-NEXT: pmullw %xmm0, %xmm1
914 ; X64-SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,0,2,1,4,5,6,7]
915 ; X64-SSE-NEXT: psrad $16, %xmm0
916 ; X64-SSE-NEXT: movq %xmm0, (%rax,%rdx,4)
919 ; X64-AVX-LABEL: mul_2xi8_sext:
920 ; X64-AVX: # %bb.0: # %entry
921 ; X64-AVX-NEXT: movq c(%rip), %rax
922 ; X64-AVX-NEXT: movzwl (%rdi,%rdx), %ecx
923 ; X64-AVX-NEXT: vmovd %ecx, %xmm0
924 ; X64-AVX-NEXT: vpmovsxbd %xmm0, %xmm0
925 ; X64-AVX-NEXT: movzwl (%rsi,%rdx), %ecx
926 ; X64-AVX-NEXT: vmovd %ecx, %xmm1
927 ; X64-AVX-NEXT: vpmovsxbd %xmm1, %xmm1
928 ; X64-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0
929 ; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rdx,4)
; The base of the destination i32 array is read from the global @c.
932 %pre = load ptr, ptr @c
; Each operand is only two bytes wide, which is why the expected code fetches
; it with a 16-bit scalar load rather than a vector load.
933 %tmp6 = getelementptr inbounds i8, ptr %a, i64 %index
934 %wide.load = load <2 x i8>, ptr %tmp6, align 1
935 %tmp8 = sext <2 x i8> %wide.load to <2 x i32>
936 %tmp10 = getelementptr inbounds i8, ptr %b, i64 %index
937 %wide.load17 = load <2 x i8>, ptr %tmp10, align 1
938 %tmp12 = sext <2 x i8> %wide.load17 to <2 x i32>
939 %tmp13 = mul nuw nsw <2 x i32> %tmp12, %tmp8
; The store is indexed in i32 elements, which is where the scale-4 addressing
; in the assertions above comes from.
940 %tmp14 = getelementptr inbounds i32, ptr %pre, i64 %index
941 store <2 x i32> %tmp13, ptr %tmp14, align 4
945 ; %val1 = load <2 x i8>
946 ; %op1 = sext<2 x i32> %val1
947 ; %val2 = load <2 x i8>
948 ; %op2 = zext<2 x i32> %val2
949 ; %rst = mul <2 x i32> %op1, %op2
; mul_2xi8_sext_zext: multiply a sign-extended <2 x i8> vector by a
; zero-extended <2 x i8> vector as <2 x i32>.
;   %a     - base pointer of the operand that is sign-extended.
;   %b     - base pointer of the operand that is zero-extended.
;   %index - byte offset for both loads and i32 element index for the store.
; Both operands are loaded with align 1; the <2 x i32> product is stored
; (align 4) at element %index of the array whose base pointer is loaded from
; the global @c.
; Expected lowering, per the assertions below: the SSE2 configurations
; zero-extend one operand by interleaving with a zeroed register, sign-extend
; the other with punpcklbw + psraw $8, and multiply with pmaddwd. The AVX
; configurations use vpmovsxbd, vpmovzxbd and vpmaddwd. Both write the 8-byte
; result with a single (v)movq.
; NOTE(review): the mul carries nuw although one operand is sign-extended;
; presumably kept as-is because the flags feed the lowering being tested -
; confirm before changing them.
951 define void @mul_2xi8_sext_zext(ptr nocapture readonly %a, ptr nocapture readonly %b, i64 %index) nounwind {
952 ; X86-SSE-LABEL: mul_2xi8_sext_zext:
953 ; X86-SSE: # %bb.0: # %entry
954 ; X86-SSE-NEXT: pushl %esi
955 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx
956 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
957 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %esi
958 ; X86-SSE-NEXT: movl c, %ecx
959 ; X86-SSE-NEXT: movzwl (%esi,%eax), %esi
960 ; X86-SSE-NEXT: movd %esi, %xmm0
961 ; X86-SSE-NEXT: movzwl (%edx,%eax), %edx
962 ; X86-SSE-NEXT: movd %edx, %xmm1
963 ; X86-SSE-NEXT: pxor %xmm2, %xmm2
964 ; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
965 ; X86-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7]
966 ; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
967 ; X86-SSE-NEXT: psraw $8, %xmm0
968 ; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
969 ; X86-SSE-NEXT: pmaddwd %xmm1, %xmm0
970 ; X86-SSE-NEXT: movq %xmm0, (%ecx,%eax,4)
971 ; X86-SSE-NEXT: popl %esi
974 ; X86-AVX-LABEL: mul_2xi8_sext_zext:
975 ; X86-AVX: # %bb.0: # %entry
976 ; X86-AVX-NEXT: pushl %esi
977 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
978 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
979 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx
980 ; X86-AVX-NEXT: movl c, %esi
981 ; X86-AVX-NEXT: movzwl (%edx,%ecx), %edx
982 ; X86-AVX-NEXT: vmovd %edx, %xmm0
983 ; X86-AVX-NEXT: vpmovsxbd %xmm0, %xmm0
984 ; X86-AVX-NEXT: movzwl (%eax,%ecx), %eax
985 ; X86-AVX-NEXT: vmovd %eax, %xmm1
986 ; X86-AVX-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
987 ; X86-AVX-NEXT: vpmaddwd %xmm0, %xmm1, %xmm0
988 ; X86-AVX-NEXT: vmovq %xmm0, (%esi,%ecx,4)
989 ; X86-AVX-NEXT: popl %esi
992 ; X64-SSE-LABEL: mul_2xi8_sext_zext:
993 ; X64-SSE: # %bb.0: # %entry
994 ; X64-SSE-NEXT: movq c(%rip), %rax
995 ; X64-SSE-NEXT: movzwl (%rdi,%rdx), %ecx
996 ; X64-SSE-NEXT: movd %ecx, %xmm0
997 ; X64-SSE-NEXT: movzwl (%rsi,%rdx), %ecx
998 ; X64-SSE-NEXT: movd %ecx, %xmm1
999 ; X64-SSE-NEXT: pxor %xmm2, %xmm2
1000 ; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
1001 ; X64-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7]
1002 ; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1003 ; X64-SSE-NEXT: psraw $8, %xmm0
1004 ; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
1005 ; X64-SSE-NEXT: pmaddwd %xmm1, %xmm0
1006 ; X64-SSE-NEXT: movq %xmm0, (%rax,%rdx,4)
1007 ; X64-SSE-NEXT: retq
1009 ; X64-AVX-LABEL: mul_2xi8_sext_zext:
1010 ; X64-AVX: # %bb.0: # %entry
1011 ; X64-AVX-NEXT: movq c(%rip), %rax
1012 ; X64-AVX-NEXT: movzwl (%rdi,%rdx), %ecx
1013 ; X64-AVX-NEXT: vmovd %ecx, %xmm0
1014 ; X64-AVX-NEXT: vpmovsxbd %xmm0, %xmm0
1015 ; X64-AVX-NEXT: movzwl (%rsi,%rdx), %ecx
1016 ; X64-AVX-NEXT: vmovd %ecx, %xmm1
1017 ; X64-AVX-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
1018 ; X64-AVX-NEXT: vpmaddwd %xmm0, %xmm1, %xmm0
1019 ; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rdx,4)
1020 ; X64-AVX-NEXT: retq
; The base of the destination i32 array is read from the global @c.
1022 %pre = load ptr, ptr @c
; The %a operand is sign-extended and the %b operand is zero-extended; this
; mixed extension is the case this function exercises.
1023 %tmp6 = getelementptr inbounds i8, ptr %a, i64 %index
1024 %wide.load = load <2 x i8>, ptr %tmp6, align 1
1025 %tmp8 = sext <2 x i8> %wide.load to <2 x i32>
1026 %tmp10 = getelementptr inbounds i8, ptr %b, i64 %index
1027 %wide.load17 = load <2 x i8>, ptr %tmp10, align 1
1028 %tmp12 = zext <2 x i8> %wide.load17 to <2 x i32>
1029 %tmp13 = mul nuw nsw <2 x i32> %tmp12, %tmp8
; The store is indexed in i32 elements, which is where the scale-4 addressing
; in the assertions above comes from.
1030 %tmp14 = getelementptr inbounds i32, ptr %pre, i64 %index
1031 store <2 x i32> %tmp13, ptr %tmp14, align 4
1035 ; %val1 = load <2 x i16>
1036 ; %op1 = sext<2 x i32> %val1
1037 ; %val2 = load <2 x i16>
1038 ; %op2 = sext<2 x i32> %val2
1039 ; %rst = mul <2 x i32> %op1, %op2
; mul_2xi16_sext: multiply two sign-extended <2 x i16> vectors as <2 x i32>.
;   %a, %b - base pointers; each <2 x i16> operand is loaded (align 1) from
;            the address %index bytes past the pointer (i8 getelementptr).
;   %index - byte offset for both loads and i32 element index for the store.
; The <2 x i32> product is stored (align 4) at element %index of the array
; whose base pointer is loaded from the global @c.
; Expected lowering, per the assertions below: each 4-byte operand is fetched
; with a (v)movd load. The SSE2 configurations spread the 16-bit inputs into
; 32-bit lanes with punpcklwd (against a zeroed register) and pshuflw, then
; multiply with pmaddwd. The AVX configurations use vpmovzxwd on both operands
; followed by vpmaddwd. Both write the 8-byte result with a single (v)movq.
; NOTE(review): the mul carries nuw although its operands are sign-extended;
; presumably kept as-is because the flags feed the lowering being tested -
; confirm before changing them.
1041 define void @mul_2xi16_sext(ptr nocapture readonly %a, ptr nocapture readonly %b, i64 %index) nounwind {
1042 ; X86-SSE-LABEL: mul_2xi16_sext:
1043 ; X86-SSE: # %bb.0: # %entry
1044 ; X86-SSE-NEXT: pushl %esi
1045 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
1046 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
1047 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx
1048 ; X86-SSE-NEXT: movl c, %esi
1049 ; X86-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1050 ; X86-SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
1051 ; X86-SSE-NEXT: pxor %xmm2, %xmm2
1052 ; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
1053 ; X86-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7]
1054 ; X86-SSE-NEXT: pmaddwd %xmm0, %xmm1
1055 ; X86-SSE-NEXT: movq %xmm1, (%esi,%ecx,4)
1056 ; X86-SSE-NEXT: popl %esi
1057 ; X86-SSE-NEXT: retl
1059 ; X86-AVX-LABEL: mul_2xi16_sext:
1060 ; X86-AVX: # %bb.0: # %entry
1061 ; X86-AVX-NEXT: pushl %esi
1062 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
1063 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
1064 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx
1065 ; X86-AVX-NEXT: movl c, %esi
1066 ; X86-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1067 ; X86-AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
1068 ; X86-AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
1069 ; X86-AVX-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
1070 ; X86-AVX-NEXT: vpmaddwd %xmm0, %xmm1, %xmm0
1071 ; X86-AVX-NEXT: vmovq %xmm0, (%esi,%ecx,4)
1072 ; X86-AVX-NEXT: popl %esi
1073 ; X86-AVX-NEXT: retl
1075 ; X64-SSE-LABEL: mul_2xi16_sext:
1076 ; X64-SSE: # %bb.0: # %entry
1077 ; X64-SSE-NEXT: movq c(%rip), %rax
1078 ; X64-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1079 ; X64-SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
1080 ; X64-SSE-NEXT: pxor %xmm2, %xmm2
1081 ; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
1082 ; X64-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7]
1083 ; X64-SSE-NEXT: pmaddwd %xmm0, %xmm1
1084 ; X64-SSE-NEXT: movq %xmm1, (%rax,%rdx,4)
1085 ; X64-SSE-NEXT: retq
1087 ; X64-AVX-LABEL: mul_2xi16_sext:
1088 ; X64-AVX: # %bb.0: # %entry
1089 ; X64-AVX-NEXT: movq c(%rip), %rax
1090 ; X64-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1091 ; X64-AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
1092 ; X64-AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
1093 ; X64-AVX-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
1094 ; X64-AVX-NEXT: vpmaddwd %xmm0, %xmm1, %xmm0
1095 ; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rdx,4)
1096 ; X64-AVX-NEXT: retq
; The base of the destination i32 array is read from the global @c.
1098 %pre = load ptr, ptr @c
; Both operands are addressed in bytes (i8 getelementptr) and loaded with
; align 1; each is four bytes wide, matching the (v)movd loads above.
1099 %tmp6 = getelementptr inbounds i8, ptr %a, i64 %index
1100 %wide.load = load <2 x i16>, ptr %tmp6, align 1
1101 %tmp8 = sext <2 x i16> %wide.load to <2 x i32>
1102 %tmp10 = getelementptr inbounds i8, ptr %b, i64 %index
1103 %wide.load17 = load <2 x i16>, ptr %tmp10, align 1
1104 %tmp12 = sext <2 x i16> %wide.load17 to <2 x i32>
1105 %tmp13 = mul nuw nsw <2 x i32> %tmp12, %tmp8
; The store is indexed in i32 elements, which is where the scale-4 addressing
; in the assertions above comes from.
1106 %tmp14 = getelementptr inbounds i32, ptr %pre, i64 %index
1107 store <2 x i32> %tmp13, ptr %tmp14, align 4
1111 ; %val1 = load <2 x i16>
1112 ; %op1 = sext<2 x i32> %val1
1113 ; %val2 = load <2 x i16>
1114 ; %op2 = zext<2 x i32> %val2
1115 ; %rst = mul <2 x i32> %op1, %op2
; mul_2xi16_sext_zext: multiply a sign-extended <2 x i16> vector by a
; zero-extended <2 x i16> vector as <2 x i32>.
;   %a     - base pointer of the operand that is sign-extended.
;   %b     - base pointer of the operand that is zero-extended.
;   %index - byte offset for both loads and i32 element index for the store.
; Both operands are loaded with align 1; the <2 x i32> product is stored
; (align 4) at element %index of the array whose base pointer is loaded from
; the global @c.
; Expected lowering, per the assertions below: the SSE2 configurations
; sign-extend one operand with pshuflw + psrad $16, zero-extend the other with
; punpcklwd against a zeroed register, and perform the 32-bit multiply with
; two pmuludq whose results are recombined via pshufd/punpckldq. The AVX
; configurations use vpmovsxwd, vpmovzxwd and vpmulld. Both write the 8-byte
; result with a single (v)movq.
; NOTE(review): the mul carries nuw although one operand is sign-extended;
; presumably kept as-is because the flags feed the lowering being tested -
; confirm before changing them.
1117 define void @mul_2xi16_sext_zext(ptr nocapture readonly %a, ptr nocapture readonly %b, i64 %index) nounwind {
1118 ; X86-SSE-LABEL: mul_2xi16_sext_zext:
1119 ; X86-SSE: # %bb.0: # %entry
1120 ; X86-SSE-NEXT: pushl %esi
1121 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx
1122 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
1123 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %esi
1124 ; X86-SSE-NEXT: movl c, %ecx
1125 ; X86-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1126 ; X86-SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7]
1127 ; X86-SSE-NEXT: psrad $16, %xmm0
1128 ; X86-SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
1129 ; X86-SSE-NEXT: pxor %xmm2, %xmm2
1130 ; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
1131 ; X86-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
1132 ; X86-SSE-NEXT: pmuludq %xmm0, %xmm1
1133 ; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
1134 ; X86-SSE-NEXT: pmuludq %xmm2, %xmm0
1135 ; X86-SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
1136 ; X86-SSE-NEXT: movq %xmm1, (%ecx,%eax,4)
1137 ; X86-SSE-NEXT: popl %esi
1138 ; X86-SSE-NEXT: retl
1140 ; X86-AVX-LABEL: mul_2xi16_sext_zext:
1141 ; X86-AVX: # %bb.0: # %entry
1142 ; X86-AVX-NEXT: pushl %esi
1143 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
1144 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
1145 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx
1146 ; X86-AVX-NEXT: movl c, %esi
1147 ; X86-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1148 ; X86-AVX-NEXT: vpmovsxwd %xmm0, %xmm0
1149 ; X86-AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
1150 ; X86-AVX-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
1151 ; X86-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0
1152 ; X86-AVX-NEXT: vmovq %xmm0, (%esi,%ecx,4)
1153 ; X86-AVX-NEXT: popl %esi
1154 ; X86-AVX-NEXT: retl
1156 ; X64-SSE-LABEL: mul_2xi16_sext_zext:
1157 ; X64-SSE: # %bb.0: # %entry
1158 ; X64-SSE-NEXT: movq c(%rip), %rax
1159 ; X64-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1160 ; X64-SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7]
1161 ; X64-SSE-NEXT: psrad $16, %xmm0
1162 ; X64-SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
1163 ; X64-SSE-NEXT: pxor %xmm2, %xmm2
1164 ; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
1165 ; X64-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
1166 ; X64-SSE-NEXT: pmuludq %xmm0, %xmm1
1167 ; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
1168 ; X64-SSE-NEXT: pmuludq %xmm2, %xmm0
1169 ; X64-SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
1170 ; X64-SSE-NEXT: movq %xmm1, (%rax,%rdx,4)
1171 ; X64-SSE-NEXT: retq
1173 ; X64-AVX-LABEL: mul_2xi16_sext_zext:
1174 ; X64-AVX: # %bb.0: # %entry
1175 ; X64-AVX-NEXT: movq c(%rip), %rax
1176 ; X64-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1177 ; X64-AVX-NEXT: vpmovsxwd %xmm0, %xmm0
1178 ; X64-AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
1179 ; X64-AVX-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
1180 ; X64-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0
1181 ; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rdx,4)
1182 ; X64-AVX-NEXT: retq
; The base of the destination i32 array is read from the global @c.
1184 %pre = load ptr, ptr @c
; The %a operand is sign-extended and the %b operand is zero-extended; this
; mixed extension is the case this function exercises.
1185 %tmp6 = getelementptr inbounds i8, ptr %a, i64 %index
1186 %wide.load = load <2 x i16>, ptr %tmp6, align 1
1187 %tmp8 = sext <2 x i16> %wide.load to <2 x i32>
1188 %tmp10 = getelementptr inbounds i8, ptr %b, i64 %index
1189 %wide.load17 = load <2 x i16>, ptr %tmp10, align 1
1190 %tmp12 = zext <2 x i16> %wide.load17 to <2 x i32>
1191 %tmp13 = mul nuw nsw <2 x i32> %tmp12, %tmp8
; The store is indexed in i32 elements, which is where the scale-4 addressing
; in the assertions above comes from.
1192 %tmp14 = getelementptr inbounds i32, ptr %pre, i64 %index
1193 store <2 x i32> %tmp13, ptr %tmp14, align 4
1197 ; %val1 = load <16 x i16>
1198 ; %op1 = sext<16 x i32> %val1
1199 ; %val2 = load <16 x i16>
1200 ; %op2 = sext<16 x i32> %val2
1201 ; %rst = mul <16 x i32> %op1, %op2
1203 define void @mul_16xi16_sext(ptr nocapture readonly %a, ptr nocapture readonly %b, i64 %index) nounwind {
1204 ; X86-SSE-LABEL: mul_16xi16_sext:
1205 ; X86-SSE: # %bb.0: # %entry
1206 ; X86-SSE-NEXT: pushl %esi
1207 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx
1208 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
1209 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %esi
1210 ; X86-SSE-NEXT: movl c, %ecx
1211 ; X86-SSE-NEXT: movdqu (%esi,%eax), %xmm2
1212 ; X86-SSE-NEXT: movdqu 16(%esi,%eax), %xmm3
1213 ; X86-SSE-NEXT: movdqu (%edx,%eax), %xmm0
1214 ; X86-SSE-NEXT: movdqu 16(%edx,%eax), %xmm1
1215 ; X86-SSE-NEXT: movdqa %xmm0, %xmm4
1216 ; X86-SSE-NEXT: pmulhw %xmm2, %xmm4
1217 ; X86-SSE-NEXT: pmullw %xmm2, %xmm0
1218 ; X86-SSE-NEXT: movdqa %xmm0, %xmm2
1219 ; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
1220 ; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
1221 ; X86-SSE-NEXT: movdqa %xmm1, %xmm4
1222 ; X86-SSE-NEXT: pmulhw %xmm3, %xmm4
1223 ; X86-SSE-NEXT: pmullw %xmm3, %xmm1
1224 ; X86-SSE-NEXT: movdqa %xmm1, %xmm3
1225 ; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
1226 ; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
1227 ; X86-SSE-NEXT: movdqu %xmm1, 32(%ecx,%eax,4)
1228 ; X86-SSE-NEXT: movdqu %xmm3, 48(%ecx,%eax,4)
1229 ; X86-SSE-NEXT: movdqu %xmm0, (%ecx,%eax,4)
1230 ; X86-SSE-NEXT: movdqu %xmm2, 16(%ecx,%eax,4)
1231 ; X86-SSE-NEXT: popl %esi
1232 ; X86-SSE-NEXT: retl
1234 ; X86-AVX1-LABEL: mul_16xi16_sext:
1235 ; X86-AVX1: # %bb.0: # %entry
1236 ; X86-AVX1-NEXT: pushl %esi
1237 ; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %edx
1238 ; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax
1239 ; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %esi
1240 ; X86-AVX1-NEXT: movl c, %ecx
1241 ; X86-AVX1-NEXT: vpmovsxwd 24(%esi,%eax), %xmm0
1242 ; X86-AVX1-NEXT: vpmovsxwd 16(%esi,%eax), %xmm1
1243 ; X86-AVX1-NEXT: vpmovsxwd 8(%esi,%eax), %xmm2
1244 ; X86-AVX1-NEXT: vpmovsxwd (%esi,%eax), %xmm3
1245 ; X86-AVX1-NEXT: vpmovsxwd 24(%edx,%eax), %xmm4
1246 ; X86-AVX1-NEXT: vpmulld %xmm0, %xmm4, %xmm0
1247 ; X86-AVX1-NEXT: vpmovsxwd 16(%edx,%eax), %xmm4
1248 ; X86-AVX1-NEXT: vpmulld %xmm1, %xmm4, %xmm1
1249 ; X86-AVX1-NEXT: vpmovsxwd 8(%edx,%eax), %xmm4
1250 ; X86-AVX1-NEXT: vpmulld %xmm2, %xmm4, %xmm2
1251 ; X86-AVX1-NEXT: vpmovsxwd (%edx,%eax), %xmm4
1252 ; X86-AVX1-NEXT: vpmulld %xmm3, %xmm4, %xmm3
1253 ; X86-AVX1-NEXT: vmovdqu %xmm0, 48(%ecx,%eax,4)
1254 ; X86-AVX1-NEXT: vmovdqu %xmm1, 32(%ecx,%eax,4)
1255 ; X86-AVX1-NEXT: vmovdqu %xmm2, 16(%ecx,%eax,4)
1256 ; X86-AVX1-NEXT: vmovdqu %xmm3, (%ecx,%eax,4)
1257 ; X86-AVX1-NEXT: popl %esi
1258 ; X86-AVX1-NEXT: retl
1260 ; X86-AVX2-LABEL: mul_16xi16_sext:
1261 ; X86-AVX2: # %bb.0: # %entry
1262 ; X86-AVX2-NEXT: pushl %esi
1263 ; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax
1264 ; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %ecx
1265 ; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %edx
1266 ; X86-AVX2-NEXT: movl c, %esi
1267 ; X86-AVX2-NEXT: vpmovsxwd 16(%edx,%ecx), %ymm0
1268 ; X86-AVX2-NEXT: vpmovsxwd (%edx,%ecx), %ymm1
1269 ; X86-AVX2-NEXT: vpmovsxwd 16(%eax,%ecx), %ymm2
1270 ; X86-AVX2-NEXT: vpmulld %ymm0, %ymm2, %ymm0
1271 ; X86-AVX2-NEXT: vpmovsxwd (%eax,%ecx), %ymm2
1272 ; X86-AVX2-NEXT: vpmulld %ymm1, %ymm2, %ymm1
1273 ; X86-AVX2-NEXT: vmovdqu %ymm0, 32(%esi,%ecx,4)
1274 ; X86-AVX2-NEXT: vmovdqu %ymm1, (%esi,%ecx,4)
1275 ; X86-AVX2-NEXT: popl %esi
1276 ; X86-AVX2-NEXT: vzeroupper
1277 ; X86-AVX2-NEXT: retl
1279 ; X64-SSE-LABEL: mul_16xi16_sext:
1280 ; X64-SSE: # %bb.0: # %entry
1281 ; X64-SSE-NEXT: movq c(%rip), %rax
1282 ; X64-SSE-NEXT: movdqu (%rdi,%rdx), %xmm0
1283 ; X64-SSE-NEXT: movdqu 16(%rdi,%rdx), %xmm1
1284 ; X64-SSE-NEXT: movdqu (%rsi,%rdx), %xmm2
1285 ; X64-SSE-NEXT: movdqu 16(%rsi,%rdx), %xmm3
1286 ; X64-SSE-NEXT: movdqa %xmm2, %xmm4
1287 ; X64-SSE-NEXT: pmulhw %xmm0, %xmm4
1288 ; X64-SSE-NEXT: pmullw %xmm0, %xmm2
1289 ; X64-SSE-NEXT: movdqa %xmm2, %xmm0
1290 ; X64-SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
1291 ; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
1292 ; X64-SSE-NEXT: movdqa %xmm3, %xmm4
1293 ; X64-SSE-NEXT: pmulhw %xmm1, %xmm4
1294 ; X64-SSE-NEXT: pmullw %xmm1, %xmm3
1295 ; X64-SSE-NEXT: movdqa %xmm3, %xmm1
1296 ; X64-SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
1297 ; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
1298 ; X64-SSE-NEXT: movdqu %xmm3, 32(%rax,%rdx,4)
1299 ; X64-SSE-NEXT: movdqu %xmm1, 48(%rax,%rdx,4)
1300 ; X64-SSE-NEXT: movdqu %xmm2, (%rax,%rdx,4)
1301 ; X64-SSE-NEXT: movdqu %xmm0, 16(%rax,%rdx,4)
1302 ; X64-SSE-NEXT: retq
1304 ; X64-AVX1-LABEL: mul_16xi16_sext:
1305 ; X64-AVX1: # %bb.0: # %entry
1306 ; X64-AVX1-NEXT: movq c(%rip), %rax
1307 ; X64-AVX1-NEXT: vpmovsxwd 24(%rdi,%rdx), %xmm0
1308 ; X64-AVX1-NEXT: vpmovsxwd 16(%rdi,%rdx), %xmm1
1309 ; X64-AVX1-NEXT: vpmovsxwd 8(%rdi,%rdx), %xmm2
1310 ; X64-AVX1-NEXT: vpmovsxwd (%rdi,%rdx), %xmm3
1311 ; X64-AVX1-NEXT: vpmovsxwd 24(%rsi,%rdx), %xmm4
1312 ; X64-AVX1-NEXT: vpmulld %xmm0, %xmm4, %xmm0
1313 ; X64-AVX1-NEXT: vpmovsxwd 16(%rsi,%rdx), %xmm4
1314 ; X64-AVX1-NEXT: vpmulld %xmm1, %xmm4, %xmm1
1315 ; X64-AVX1-NEXT: vpmovsxwd 8(%rsi,%rdx), %xmm4
1316 ; X64-AVX1-NEXT: vpmulld %xmm2, %xmm4, %xmm2
1317 ; X64-AVX1-NEXT: vpmovsxwd (%rsi,%rdx), %xmm4
1318 ; X64-AVX1-NEXT: vpmulld %xmm3, %xmm4, %xmm3
1319 ; X64-AVX1-NEXT: vmovdqu %xmm0, 48(%rax,%rdx,4)
1320 ; X64-AVX1-NEXT: vmovdqu %xmm1, 32(%rax,%rdx,4)
1321 ; X64-AVX1-NEXT: vmovdqu %xmm2, 16(%rax,%rdx,4)
1322 ; X64-AVX1-NEXT: vmovdqu %xmm3, (%rax,%rdx,4)
1323 ; X64-AVX1-NEXT: retq
1325 ; X64-AVX2-LABEL: mul_16xi16_sext:
1326 ; X64-AVX2: # %bb.0: # %entry
1327 ; X64-AVX2-NEXT: movq c(%rip), %rax
1328 ; X64-AVX2-NEXT: vpmovsxwd 16(%rdi,%rdx), %ymm0
1329 ; X64-AVX2-NEXT: vpmovsxwd (%rdi,%rdx), %ymm1
1330 ; X64-AVX2-NEXT: vpmovsxwd 16(%rsi,%rdx), %ymm2
1331 ; X64-AVX2-NEXT: vpmulld %ymm0, %ymm2, %ymm0
1332 ; X64-AVX2-NEXT: vpmovsxwd (%rsi,%rdx), %ymm2
1333 ; X64-AVX2-NEXT: vpmulld %ymm1, %ymm2, %ymm1
1334 ; X64-AVX2-NEXT: vmovdqu %ymm0, 32(%rax,%rdx,4)
1335 ; X64-AVX2-NEXT: vmovdqu %ymm1, (%rax,%rdx,4)
1336 ; X64-AVX2-NEXT: vzeroupper
1337 ; X64-AVX2-NEXT: retq
1339 %pre = load ptr, ptr @c
1340 %tmp6 = getelementptr inbounds i8, ptr %a, i64 %index
1341 %wide.load = load <16 x i16>, ptr %tmp6, align 1
1342 %tmp8 = sext <16 x i16> %wide.load to <16 x i32>
1343 %tmp10 = getelementptr inbounds i8, ptr %b, i64 %index
1344 %wide.load17 = load <16 x i16>, ptr %tmp10, align 1
1345 %tmp12 = sext <16 x i16> %wide.load17 to <16 x i32>
1346 %tmp13 = mul nuw nsw <16 x i32> %tmp12, %tmp8
1347 %tmp14 = getelementptr inbounds i32, ptr %pre, i64 %index
1348 store <16 x i32> %tmp13, ptr %tmp14, align 4
1352 ; %val = load <2 x i8>
1353 ; %op1 = zext<2 x i32> %val
1354 ; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (0 ~ 255)
1355 ; %rst = mul <2 x i32> %op1, %op2
;
; The function loads two bytes from %a + %index (a byte offset), zero-extends
; them to <2 x i32>, multiplies by <i32 0, i32 255> and stores the product at
; element %index of the i32 array that the global @c points to.
; All check blocks below (SSE2 and AVX, 32- and 64-bit) expect the multiply to
; be emitted as one (v)pmaddwd against a constant-pool operand.
1357 define void @mul_2xi8_varconst1(ptr nocapture readonly %a, i64 %index) {
1358 ; X86-SSE-LABEL: mul_2xi8_varconst1:
1359 ; X86-SSE: # %bb.0: # %entry
1360 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
1361 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
1362 ; X86-SSE-NEXT: movl c, %edx
1363 ; X86-SSE-NEXT: movzwl (%ecx,%eax), %ecx
1364 ; X86-SSE-NEXT: movd %ecx, %xmm0
1365 ; X86-SSE-NEXT: pxor %xmm1, %xmm1
1366 ; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
1367 ; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
1368 ; X86-SSE-NEXT: pmaddwd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
1369 ; X86-SSE-NEXT: movq %xmm0, (%edx,%eax,4)
1370 ; X86-SSE-NEXT: retl
1372 ; X86-AVX-LABEL: mul_2xi8_varconst1:
1373 ; X86-AVX: # %bb.0: # %entry
1374 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
1375 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
1376 ; X86-AVX-NEXT: movl c, %edx
1377 ; X86-AVX-NEXT: movzwl (%ecx,%eax), %ecx
1378 ; X86-AVX-NEXT: vmovd %ecx, %xmm0
1379 ; X86-AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
1380 ; X86-AVX-NEXT: vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
1381 ; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4)
1382 ; X86-AVX-NEXT: retl
1384 ; X64-SSE-LABEL: mul_2xi8_varconst1:
1385 ; X64-SSE: # %bb.0: # %entry
1386 ; X64-SSE-NEXT: movq c(%rip), %rax
1387 ; X64-SSE-NEXT: movzwl (%rdi,%rsi), %ecx
1388 ; X64-SSE-NEXT: movd %ecx, %xmm0
1389 ; X64-SSE-NEXT: pxor %xmm1, %xmm1
1390 ; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
1391 ; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
1392 ; X64-SSE-NEXT: pmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
1393 ; X64-SSE-NEXT: movq %xmm0, (%rax,%rsi,4)
1394 ; X64-SSE-NEXT: retq
1396 ; X64-AVX-LABEL: mul_2xi8_varconst1:
1397 ; X64-AVX: # %bb.0: # %entry
1398 ; X64-AVX-NEXT: movq c(%rip), %rax
1399 ; X64-AVX-NEXT: movzwl (%rdi,%rsi), %ecx
1400 ; X64-AVX-NEXT: vmovd %ecx, %xmm0
1401 ; X64-AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
1402 ; X64-AVX-NEXT: vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1403 ; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4)
1404 ; X64-AVX-NEXT: retq
1406 %pre = load ptr, ptr @c
1407 %tmp6 = getelementptr inbounds i8, ptr %a, i64 %index
1408 %wide.load = load <2 x i8>, ptr %tmp6, align 1
1409 %tmp8 = zext <2 x i8> %wide.load to <2 x i32>
1410 %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 0, i32 255>
1411 %tmp14 = getelementptr inbounds i32, ptr %pre, i64 %index
1412 store <2 x i32> %tmp13, ptr %tmp14, align 4
1416 ; %val = load <2 x i8>
1417 ; %op1 = sext<2 x i32> %val
1418 ; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (-128 ~ 127)
1419 ; %rst = mul <2 x i32> %op1, %op2
;
; The function loads two bytes from %a + %index (a byte offset), sign-extends
; them to <2 x i32>, multiplies by <i32 -128, i32 127> and stores the product
; at element %index of the i32 array that the global @c points to.
; The SSE2 check blocks expect the sign extension as punpcklbw/punpcklwd
; followed by psrad $24, the AVX check blocks as vpmovsxbd; all of them expect
; the multiply to be emitted as one (v)pmaddwd against a constant-pool operand.
1421 define void @mul_2xi8_varconst2(ptr nocapture readonly %a, i64 %index) {
1422 ; X86-SSE-LABEL: mul_2xi8_varconst2:
1423 ; X86-SSE: # %bb.0: # %entry
1424 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
1425 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
1426 ; X86-SSE-NEXT: movl c, %edx
1427 ; X86-SSE-NEXT: movzwl (%ecx,%eax), %ecx
1428 ; X86-SSE-NEXT: movd %ecx, %xmm0
1429 ; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1430 ; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
1431 ; X86-SSE-NEXT: psrad $24, %xmm0
1432 ; X86-SSE-NEXT: pmaddwd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
1433 ; X86-SSE-NEXT: movq %xmm0, (%edx,%eax,4)
1434 ; X86-SSE-NEXT: retl
1436 ; X86-AVX-LABEL: mul_2xi8_varconst2:
1437 ; X86-AVX: # %bb.0: # %entry
1438 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
1439 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
1440 ; X86-AVX-NEXT: movl c, %edx
1441 ; X86-AVX-NEXT: movzwl (%ecx,%eax), %ecx
1442 ; X86-AVX-NEXT: vmovd %ecx, %xmm0
1443 ; X86-AVX-NEXT: vpmovsxbd %xmm0, %xmm0
1444 ; X86-AVX-NEXT: vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
1445 ; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4)
1446 ; X86-AVX-NEXT: retl
1448 ; X64-SSE-LABEL: mul_2xi8_varconst2:
1449 ; X64-SSE: # %bb.0: # %entry
1450 ; X64-SSE-NEXT: movq c(%rip), %rax
1451 ; X64-SSE-NEXT: movzwl (%rdi,%rsi), %ecx
1452 ; X64-SSE-NEXT: movd %ecx, %xmm0
1453 ; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1454 ; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
1455 ; X64-SSE-NEXT: psrad $24, %xmm0
1456 ; X64-SSE-NEXT: pmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
1457 ; X64-SSE-NEXT: movq %xmm0, (%rax,%rsi,4)
1458 ; X64-SSE-NEXT: retq
1460 ; X64-AVX-LABEL: mul_2xi8_varconst2:
1461 ; X64-AVX: # %bb.0: # %entry
1462 ; X64-AVX-NEXT: movq c(%rip), %rax
1463 ; X64-AVX-NEXT: movzwl (%rdi,%rsi), %ecx
1464 ; X64-AVX-NEXT: vmovd %ecx, %xmm0
1465 ; X64-AVX-NEXT: vpmovsxbd %xmm0, %xmm0
1466 ; X64-AVX-NEXT: vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1467 ; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4)
1468 ; X64-AVX-NEXT: retq
1470 %pre = load ptr, ptr @c
1471 %tmp6 = getelementptr inbounds i8, ptr %a, i64 %index
1472 %wide.load = load <2 x i8>, ptr %tmp6, align 1
1473 %tmp8 = sext <2 x i8> %wide.load to <2 x i32>
1474 %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 -128, i32 127>
1475 %tmp14 = getelementptr inbounds i32, ptr %pre, i64 %index
1476 store <2 x i32> %tmp13, ptr %tmp14, align 4
1480 ; %val = load <2 x i8>
1481 ; %op1 = zext<2 x i32> %val
1482 ; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (0 ~ 256)
1483 ; %rst = mul <2 x i32> %op1, %op2
;
; The function loads two bytes from %a + %index (a byte offset), zero-extends
; them to <2 x i32>, multiplies by <i32 0, i32 256> and stores the product at
; element %index of the i32 array that the global @c points to.
; The constant 256 is one above the largest value an unsigned byte can hold.
; All check blocks below (SSE2 and AVX, 32- and 64-bit) still expect the
; multiply to be emitted as one (v)pmaddwd against a constant-pool operand.
1485 define void @mul_2xi8_varconst3(ptr nocapture readonly %a, i64 %index) {
1486 ; X86-SSE-LABEL: mul_2xi8_varconst3:
1487 ; X86-SSE: # %bb.0: # %entry
1488 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
1489 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
1490 ; X86-SSE-NEXT: movl c, %edx
1491 ; X86-SSE-NEXT: movzwl (%ecx,%eax), %ecx
1492 ; X86-SSE-NEXT: movd %ecx, %xmm0
1493 ; X86-SSE-NEXT: pxor %xmm1, %xmm1
1494 ; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
1495 ; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
1496 ; X86-SSE-NEXT: pmaddwd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
1497 ; X86-SSE-NEXT: movq %xmm0, (%edx,%eax,4)
1498 ; X86-SSE-NEXT: retl
1500 ; X86-AVX-LABEL: mul_2xi8_varconst3:
1501 ; X86-AVX: # %bb.0: # %entry
1502 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
1503 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
1504 ; X86-AVX-NEXT: movl c, %edx
1505 ; X86-AVX-NEXT: movzwl (%ecx,%eax), %ecx
1506 ; X86-AVX-NEXT: vmovd %ecx, %xmm0
1507 ; X86-AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
1508 ; X86-AVX-NEXT: vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
1509 ; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4)
1510 ; X86-AVX-NEXT: retl
1512 ; X64-SSE-LABEL: mul_2xi8_varconst3:
1513 ; X64-SSE: # %bb.0: # %entry
1514 ; X64-SSE-NEXT: movq c(%rip), %rax
1515 ; X64-SSE-NEXT: movzwl (%rdi,%rsi), %ecx
1516 ; X64-SSE-NEXT: movd %ecx, %xmm0
1517 ; X64-SSE-NEXT: pxor %xmm1, %xmm1
1518 ; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
1519 ; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
1520 ; X64-SSE-NEXT: pmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
1521 ; X64-SSE-NEXT: movq %xmm0, (%rax,%rsi,4)
1522 ; X64-SSE-NEXT: retq
1524 ; X64-AVX-LABEL: mul_2xi8_varconst3:
1525 ; X64-AVX: # %bb.0: # %entry
1526 ; X64-AVX-NEXT: movq c(%rip), %rax
1527 ; X64-AVX-NEXT: movzwl (%rdi,%rsi), %ecx
1528 ; X64-AVX-NEXT: vmovd %ecx, %xmm0
1529 ; X64-AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
1530 ; X64-AVX-NEXT: vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1531 ; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4)
1532 ; X64-AVX-NEXT: retq
1534 %pre = load ptr, ptr @c
1535 %tmp6 = getelementptr inbounds i8, ptr %a, i64 %index
1536 %wide.load = load <2 x i8>, ptr %tmp6, align 1
1537 %tmp8 = zext <2 x i8> %wide.load to <2 x i32>
1538 %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 0, i32 256>
1539 %tmp14 = getelementptr inbounds i32, ptr %pre, i64 %index
1540 store <2 x i32> %tmp13, ptr %tmp14, align 4
1544 ; %val = load <2 x i8>
1545 ; %op1 = zext<2 x i32> %val
1546 ; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (-1 ~ 255)
1547 ; %rst = mul <2 x i32> %op1, %op2
;
; The function loads two bytes from %a + %index (a byte offset), zero-extends
; them to <2 x i32>, multiplies by <i32 -1, i32 255> and stores the product at
; element %index of the i32 array that the global @c points to.
; The constant -1 is one below the smallest value an unsigned byte can hold.
; All check blocks below (SSE2 and AVX, 32- and 64-bit) still expect the
; multiply to be emitted as one (v)pmaddwd against a constant-pool operand.
1549 define void @mul_2xi8_varconst4(ptr nocapture readonly %a, i64 %index) {
1550 ; X86-SSE-LABEL: mul_2xi8_varconst4:
1551 ; X86-SSE: # %bb.0: # %entry
1552 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
1553 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
1554 ; X86-SSE-NEXT: movl c, %edx
1555 ; X86-SSE-NEXT: movzwl (%ecx,%eax), %ecx
1556 ; X86-SSE-NEXT: movd %ecx, %xmm0
1557 ; X86-SSE-NEXT: pxor %xmm1, %xmm1
1558 ; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
1559 ; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
1560 ; X86-SSE-NEXT: pmaddwd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
1561 ; X86-SSE-NEXT: movq %xmm0, (%edx,%eax,4)
1562 ; X86-SSE-NEXT: retl
1564 ; X86-AVX-LABEL: mul_2xi8_varconst4:
1565 ; X86-AVX: # %bb.0: # %entry
1566 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
1567 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
1568 ; X86-AVX-NEXT: movl c, %edx
1569 ; X86-AVX-NEXT: movzwl (%ecx,%eax), %ecx
1570 ; X86-AVX-NEXT: vmovd %ecx, %xmm0
1571 ; X86-AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
1572 ; X86-AVX-NEXT: vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
1573 ; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4)
1574 ; X86-AVX-NEXT: retl
1576 ; X64-SSE-LABEL: mul_2xi8_varconst4:
1577 ; X64-SSE: # %bb.0: # %entry
1578 ; X64-SSE-NEXT: movq c(%rip), %rax
1579 ; X64-SSE-NEXT: movzwl (%rdi,%rsi), %ecx
1580 ; X64-SSE-NEXT: movd %ecx, %xmm0
1581 ; X64-SSE-NEXT: pxor %xmm1, %xmm1
1582 ; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
1583 ; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
1584 ; X64-SSE-NEXT: pmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
1585 ; X64-SSE-NEXT: movq %xmm0, (%rax,%rsi,4)
1586 ; X64-SSE-NEXT: retq
1588 ; X64-AVX-LABEL: mul_2xi8_varconst4:
1589 ; X64-AVX: # %bb.0: # %entry
1590 ; X64-AVX-NEXT: movq c(%rip), %rax
1591 ; X64-AVX-NEXT: movzwl (%rdi,%rsi), %ecx
1592 ; X64-AVX-NEXT: vmovd %ecx, %xmm0
1593 ; X64-AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
1594 ; X64-AVX-NEXT: vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1595 ; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4)
1596 ; X64-AVX-NEXT: retq
1598 %pre = load ptr, ptr @c
1599 %tmp6 = getelementptr inbounds i8, ptr %a, i64 %index
1600 %wide.load = load <2 x i8>, ptr %tmp6, align 1
1601 %tmp8 = zext <2 x i8> %wide.load to <2 x i32>
1602 %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 -1, i32 255>
1603 %tmp14 = getelementptr inbounds i32, ptr %pre, i64 %index
1604 store <2 x i32> %tmp13, ptr %tmp14, align 4
1608 ; %val = load <2 x i8>
1609 ; %op1 = sext<2 x i32> %val
1610 ; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (-129 ~ 127)
1611 ; %rst = mul <2 x i32> %op1, %op2
;
; The function loads two bytes from %a + %index (a byte offset), sign-extends
; them to <2 x i32>, multiplies by <i32 -129, i32 127> and stores the product
; at element %index of the i32 array that the global @c points to.
; The constant -129 is one below the smallest value a signed byte can hold.
; The SSE2 check blocks expect the sign extension as punpcklbw/punpcklwd
; followed by psrad $24, the AVX check blocks as vpmovsxbd; all of them still
; expect the multiply as one (v)pmaddwd against a constant-pool operand.
1613 define void @mul_2xi8_varconst5(ptr nocapture readonly %a, i64 %index) {
1614 ; X86-SSE-LABEL: mul_2xi8_varconst5:
1615 ; X86-SSE: # %bb.0: # %entry
1616 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
1617 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
1618 ; X86-SSE-NEXT: movl c, %edx
1619 ; X86-SSE-NEXT: movzwl (%ecx,%eax), %ecx
1620 ; X86-SSE-NEXT: movd %ecx, %xmm0
1621 ; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1622 ; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
1623 ; X86-SSE-NEXT: psrad $24, %xmm0
1624 ; X86-SSE-NEXT: pmaddwd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
1625 ; X86-SSE-NEXT: movq %xmm0, (%edx,%eax,4)
1626 ; X86-SSE-NEXT: retl
1628 ; X86-AVX-LABEL: mul_2xi8_varconst5:
1629 ; X86-AVX: # %bb.0: # %entry
1630 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
1631 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
1632 ; X86-AVX-NEXT: movl c, %edx
1633 ; X86-AVX-NEXT: movzwl (%ecx,%eax), %ecx
1634 ; X86-AVX-NEXT: vmovd %ecx, %xmm0
1635 ; X86-AVX-NEXT: vpmovsxbd %xmm0, %xmm0
1636 ; X86-AVX-NEXT: vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
1637 ; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4)
1638 ; X86-AVX-NEXT: retl
1640 ; X64-SSE-LABEL: mul_2xi8_varconst5:
1641 ; X64-SSE: # %bb.0: # %entry
1642 ; X64-SSE-NEXT: movq c(%rip), %rax
1643 ; X64-SSE-NEXT: movzwl (%rdi,%rsi), %ecx
1644 ; X64-SSE-NEXT: movd %ecx, %xmm0
1645 ; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1646 ; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
1647 ; X64-SSE-NEXT: psrad $24, %xmm0
1648 ; X64-SSE-NEXT: pmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
1649 ; X64-SSE-NEXT: movq %xmm0, (%rax,%rsi,4)
1650 ; X64-SSE-NEXT: retq
1652 ; X64-AVX-LABEL: mul_2xi8_varconst5:
1653 ; X64-AVX: # %bb.0: # %entry
1654 ; X64-AVX-NEXT: movq c(%rip), %rax
1655 ; X64-AVX-NEXT: movzwl (%rdi,%rsi), %ecx
1656 ; X64-AVX-NEXT: vmovd %ecx, %xmm0
1657 ; X64-AVX-NEXT: vpmovsxbd %xmm0, %xmm0
1658 ; X64-AVX-NEXT: vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1659 ; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4)
1660 ; X64-AVX-NEXT: retq
1662 %pre = load ptr, ptr @c
1663 %tmp6 = getelementptr inbounds i8, ptr %a, i64 %index
1664 %wide.load = load <2 x i8>, ptr %tmp6, align 1
1665 %tmp8 = sext <2 x i8> %wide.load to <2 x i32>
1666 %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 -129, i32 127>
1667 %tmp14 = getelementptr inbounds i32, ptr %pre, i64 %index
1668 store <2 x i32> %tmp13, ptr %tmp14, align 4
1672 ; %val = load <2 x i8>
1673 ; %op1 = sext<2 x i32> %val
1674 ; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (-128 ~ 128)
1675 ; %rst = mul <2 x i32> %op1, %op2
;
; The function loads two bytes from %a + %index (a byte offset), sign-extends
; them to <2 x i32>, multiplies by <i32 -128, i32 128> and stores the product
; at element %index of the i32 array that the global @c points to.
; The constant 128 is one above the largest value a signed byte can hold.
; The SSE2 check blocks expect the sign extension as punpcklbw/punpcklwd
; followed by psrad $24, the AVX check blocks as vpmovsxbd; all of them still
; expect the multiply as one (v)pmaddwd against a constant-pool operand.
1677 define void @mul_2xi8_varconst6(ptr nocapture readonly %a, i64 %index) {
1678 ; X86-SSE-LABEL: mul_2xi8_varconst6:
1679 ; X86-SSE: # %bb.0: # %entry
1680 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
1681 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
1682 ; X86-SSE-NEXT: movl c, %edx
1683 ; X86-SSE-NEXT: movzwl (%ecx,%eax), %ecx
1684 ; X86-SSE-NEXT: movd %ecx, %xmm0
1685 ; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1686 ; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
1687 ; X86-SSE-NEXT: psrad $24, %xmm0
1688 ; X86-SSE-NEXT: pmaddwd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
1689 ; X86-SSE-NEXT: movq %xmm0, (%edx,%eax,4)
1690 ; X86-SSE-NEXT: retl
1692 ; X86-AVX-LABEL: mul_2xi8_varconst6:
1693 ; X86-AVX: # %bb.0: # %entry
1694 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
1695 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
1696 ; X86-AVX-NEXT: movl c, %edx
1697 ; X86-AVX-NEXT: movzwl (%ecx,%eax), %ecx
1698 ; X86-AVX-NEXT: vmovd %ecx, %xmm0
1699 ; X86-AVX-NEXT: vpmovsxbd %xmm0, %xmm0
1700 ; X86-AVX-NEXT: vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
1701 ; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4)
1702 ; X86-AVX-NEXT: retl
1704 ; X64-SSE-LABEL: mul_2xi8_varconst6:
1705 ; X64-SSE: # %bb.0: # %entry
1706 ; X64-SSE-NEXT: movq c(%rip), %rax
1707 ; X64-SSE-NEXT: movzwl (%rdi,%rsi), %ecx
1708 ; X64-SSE-NEXT: movd %ecx, %xmm0
1709 ; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1710 ; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
1711 ; X64-SSE-NEXT: psrad $24, %xmm0
1712 ; X64-SSE-NEXT: pmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
1713 ; X64-SSE-NEXT: movq %xmm0, (%rax,%rsi,4)
1714 ; X64-SSE-NEXT: retq
1716 ; X64-AVX-LABEL: mul_2xi8_varconst6:
1717 ; X64-AVX: # %bb.0: # %entry
1718 ; X64-AVX-NEXT: movq c(%rip), %rax
1719 ; X64-AVX-NEXT: movzwl (%rdi,%rsi), %ecx
1720 ; X64-AVX-NEXT: vmovd %ecx, %xmm0
1721 ; X64-AVX-NEXT: vpmovsxbd %xmm0, %xmm0
1722 ; X64-AVX-NEXT: vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1723 ; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4)
1724 ; X64-AVX-NEXT: retq
1726 %pre = load ptr, ptr @c
1727 %tmp6 = getelementptr inbounds i8, ptr %a, i64 %index
1728 %wide.load = load <2 x i8>, ptr %tmp6, align 1
1729 %tmp8 = sext <2 x i8> %wide.load to <2 x i32>
1730 %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 -128, i32 128>
1731 %tmp14 = getelementptr inbounds i32, ptr %pre, i64 %index
1732 store <2 x i32> %tmp13, ptr %tmp14, align 4
1736 ; %val = load <2 x i16>
1737 ; %op1 = zext<2 x i32> %val
1738 ; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (0 ~ 65535)
1739 ; %rst = mul <2 x i32> %op1, %op2
;
; The function loads a <2 x i16> from %a + %index (a byte offset),
; zero-extends it to <2 x i32>, multiplies by <i32 0, i32 65535> and stores the
; product at element %index of the i32 array that the global @c points to.
; The SSE2 check blocks expect a 16-bit multiply pair (pmulhuw for the high
; halves, pmullw for the low halves) whose results are interleaved with
; punpcklwd. The AVX check blocks expect vpmovzxwd followed by vpmulld against
; a constant-pool operand.
1741 define void @mul_2xi16_varconst1(ptr nocapture readonly %a, i64 %index) {
1742 ; X86-SSE-LABEL: mul_2xi16_varconst1:
1743 ; X86-SSE: # %bb.0: # %entry
1744 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
1745 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
1746 ; X86-SSE-NEXT: movl c, %edx
1747 ; X86-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1748 ; X86-SSE-NEXT: movdqa {{.*#+}} xmm1 = <0,65535,u,u,u,u,u,u>
1749 ; X86-SSE-NEXT: movdqa %xmm0, %xmm2
1750 ; X86-SSE-NEXT: pmulhuw %xmm1, %xmm2
1751 ; X86-SSE-NEXT: pmullw %xmm1, %xmm0
1752 ; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
1753 ; X86-SSE-NEXT: movq %xmm0, (%edx,%eax,4)
1754 ; X86-SSE-NEXT: retl
1756 ; X86-AVX-LABEL: mul_2xi16_varconst1:
1757 ; X86-AVX: # %bb.0: # %entry
1758 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
1759 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
1760 ; X86-AVX-NEXT: movl c, %edx
1761 ; X86-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1762 ; X86-AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
1763 ; X86-AVX-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
1764 ; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4)
1765 ; X86-AVX-NEXT: retl
1767 ; X64-SSE-LABEL: mul_2xi16_varconst1:
1768 ; X64-SSE: # %bb.0: # %entry
1769 ; X64-SSE-NEXT: movq c(%rip), %rax
1770 ; X64-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1771 ; X64-SSE-NEXT: movdqa {{.*#+}} xmm1 = <0,65535,u,u,u,u,u,u>
1772 ; X64-SSE-NEXT: movdqa %xmm0, %xmm2
1773 ; X64-SSE-NEXT: pmulhuw %xmm1, %xmm2
1774 ; X64-SSE-NEXT: pmullw %xmm1, %xmm0
1775 ; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
1776 ; X64-SSE-NEXT: movq %xmm0, (%rax,%rsi,4)
1777 ; X64-SSE-NEXT: retq
1779 ; X64-AVX-LABEL: mul_2xi16_varconst1:
1780 ; X64-AVX: # %bb.0: # %entry
1781 ; X64-AVX-NEXT: movq c(%rip), %rax
1782 ; X64-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1783 ; X64-AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
1784 ; X64-AVX-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1785 ; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4)
1786 ; X64-AVX-NEXT: retq
1788 %pre = load ptr, ptr @c
1789 %tmp6 = getelementptr inbounds i8, ptr %a, i64 %index
1790 %wide.load = load <2 x i16>, ptr %tmp6, align 1
1791 %tmp8 = zext <2 x i16> %wide.load to <2 x i32>
1792 %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 0, i32 65535>
1793 %tmp14 = getelementptr inbounds i32, ptr %pre, i64 %index
1794 store <2 x i32> %tmp13, ptr %tmp14, align 4
1798 ; %val = load <2 x i16>
1799 ; %op1 = sext<2 x i32> %val
1800 ; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (-32768 ~ 32767)
1801 ; %rst = mul <2 x i32> %op1, %op2
;
; The function loads a <2 x i16> from %a + %index (a byte offset),
; sign-extends it to <2 x i32>, multiplies by <i32 -32768, i32 32767> and
; stores the product at element %index of the i32 array that the global @c
; points to.
; All check blocks expect the multiply as one (v)pmaddwd against a
; constant-pool operand. The SSE2 blocks first rearrange the loaded words with
; pshuflw; the AVX blocks first widen them with a zero-extending vpmovzxwd even
; though the IR uses sext (presumably safe because pmaddwd treats each 16-bit
; half as signed on its own; confirm against the pmaddwd definition).
1803 define void @mul_2xi16_varconst2(ptr nocapture readonly %a, i64 %index) {
1804 ; X86-SSE-LABEL: mul_2xi16_varconst2:
1805 ; X86-SSE: # %bb.0: # %entry
1806 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
1807 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
1808 ; X86-SSE-NEXT: movl c, %edx
1809 ; X86-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1810 ; X86-SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7]
1811 ; X86-SSE-NEXT: pmaddwd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
1812 ; X86-SSE-NEXT: movq %xmm0, (%edx,%eax,4)
1813 ; X86-SSE-NEXT: retl
1815 ; X86-AVX-LABEL: mul_2xi16_varconst2:
1816 ; X86-AVX: # %bb.0: # %entry
1817 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
1818 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
1819 ; X86-AVX-NEXT: movl c, %edx
1820 ; X86-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1821 ; X86-AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
1822 ; X86-AVX-NEXT: vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
1823 ; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4)
1824 ; X86-AVX-NEXT: retl
1826 ; X64-SSE-LABEL: mul_2xi16_varconst2:
1827 ; X64-SSE: # %bb.0: # %entry
1828 ; X64-SSE-NEXT: movq c(%rip), %rax
1829 ; X64-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1830 ; X64-SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7]
1831 ; X64-SSE-NEXT: pmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
1832 ; X64-SSE-NEXT: movq %xmm0, (%rax,%rsi,4)
1833 ; X64-SSE-NEXT: retq
1835 ; X64-AVX-LABEL: mul_2xi16_varconst2:
1836 ; X64-AVX: # %bb.0: # %entry
1837 ; X64-AVX-NEXT: movq c(%rip), %rax
1838 ; X64-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1839 ; X64-AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
1840 ; X64-AVX-NEXT: vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1841 ; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4)
1842 ; X64-AVX-NEXT: retq
1844 %pre = load ptr, ptr @c
1845 %tmp6 = getelementptr inbounds i8, ptr %a, i64 %index
1846 %wide.load = load <2 x i16>, ptr %tmp6, align 1
1847 %tmp8 = sext <2 x i16> %wide.load to <2 x i32>
1848 %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 -32768, i32 32767>
1849 %tmp14 = getelementptr inbounds i32, ptr %pre, i64 %index
1850 store <2 x i32> %tmp13, ptr %tmp14, align 4
1854 ; %val = load <2 x i16>
1855 ; %op1 = zext<2 x i32> %val
1856 ; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (0 ~ 65536)
1857 ; %rst = mul <2 x i32> %op1, %op2
;
; The function loads a <2 x i16> from %a + %index (a byte offset),
; zero-extends it to <2 x i32>, multiplies by <i32 0, i32 65536> and stores the
; product at element %index of the i32 array that the global @c points to.
; The constant 65536 is one above the largest unsigned 16-bit value.
; The SSE2 check blocks expect psrld $16, a pmuludq against a constant-pool
; operand and psllq $32. The AVX check blocks expect vpmovzxwd followed by
; vpmulld against a constant-pool operand.
1859 define void @mul_2xi16_varconst3(ptr nocapture readonly %a, i64 %index) {
1860 ; X86-SSE-LABEL: mul_2xi16_varconst3:
1861 ; X86-SSE: # %bb.0: # %entry
1862 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
1863 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
1864 ; X86-SSE-NEXT: movl c, %edx
1865 ; X86-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1866 ; X86-SSE-NEXT: psrld $16, %xmm0
1867 ; X86-SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
1868 ; X86-SSE-NEXT: psllq $32, %xmm0
1869 ; X86-SSE-NEXT: movq %xmm0, (%edx,%eax,4)
1870 ; X86-SSE-NEXT: retl
1872 ; X86-AVX-LABEL: mul_2xi16_varconst3:
1873 ; X86-AVX: # %bb.0: # %entry
1874 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
1875 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
1876 ; X86-AVX-NEXT: movl c, %edx
1877 ; X86-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1878 ; X86-AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
1879 ; X86-AVX-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
1880 ; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4)
1881 ; X86-AVX-NEXT: retl
1883 ; X64-SSE-LABEL: mul_2xi16_varconst3:
1884 ; X64-SSE: # %bb.0: # %entry
1885 ; X64-SSE-NEXT: movq c(%rip), %rax
1886 ; X64-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1887 ; X64-SSE-NEXT: psrld $16, %xmm0
1888 ; X64-SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
1889 ; X64-SSE-NEXT: psllq $32, %xmm0
1890 ; X64-SSE-NEXT: movq %xmm0, (%rax,%rsi,4)
1891 ; X64-SSE-NEXT: retq
1893 ; X64-AVX-LABEL: mul_2xi16_varconst3:
1894 ; X64-AVX: # %bb.0: # %entry
1895 ; X64-AVX-NEXT: movq c(%rip), %rax
1896 ; X64-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1897 ; X64-AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
1898 ; X64-AVX-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1899 ; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4)
1900 ; X64-AVX-NEXT: retq
1902 %pre = load ptr, ptr @c
1903 %tmp6 = getelementptr inbounds i8, ptr %a, i64 %index
1904 %wide.load = load <2 x i16>, ptr %tmp6, align 1
1905 %tmp8 = zext <2 x i16> %wide.load to <2 x i32>
1906 %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 0, i32 65536>
1907 %tmp14 = getelementptr inbounds i32, ptr %pre, i64 %index
1908 store <2 x i32> %tmp13, ptr %tmp14, align 4
1912 ; %val = load <2 x i16>
1913 ; %op1 = sext<2 x i32> %val
1914 ; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (0 ~ 32768)
1915 ; %rst = mul <2 x i32> %op1, %op2
;
; The function loads a <2 x i16> from %a + %index (a byte offset),
; sign-extends it to <2 x i32>, multiplies by <i32 0, i32 32768> and stores the
; product at element %index of the i32 array that the global @c points to.
; The constant 32768 is one above the largest signed 16-bit value.
; The SSE2 check blocks expect psrad $16, a pmuludq against a constant-pool
; operand and psllq $32. The AVX check blocks expect vpmovsxwd followed by
; vpmulld against a constant-pool operand.
1917 define void @mul_2xi16_varconst4(ptr nocapture readonly %a, i64 %index) {
1918 ; X86-SSE-LABEL: mul_2xi16_varconst4:
1919 ; X86-SSE: # %bb.0: # %entry
1920 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
1921 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
1922 ; X86-SSE-NEXT: movl c, %edx
1923 ; X86-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1924 ; X86-SSE-NEXT: psrad $16, %xmm0
1925 ; X86-SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
1926 ; X86-SSE-NEXT: psllq $32, %xmm0
1927 ; X86-SSE-NEXT: movq %xmm0, (%edx,%eax,4)
1928 ; X86-SSE-NEXT: retl
1930 ; X86-AVX-LABEL: mul_2xi16_varconst4:
1931 ; X86-AVX: # %bb.0: # %entry
1932 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
1933 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
1934 ; X86-AVX-NEXT: movl c, %edx
1935 ; X86-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1936 ; X86-AVX-NEXT: vpmovsxwd %xmm0, %xmm0
1937 ; X86-AVX-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
1938 ; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4)
1939 ; X86-AVX-NEXT: retl
1941 ; X64-SSE-LABEL: mul_2xi16_varconst4:
1942 ; X64-SSE: # %bb.0: # %entry
1943 ; X64-SSE-NEXT: movq c(%rip), %rax
1944 ; X64-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1945 ; X64-SSE-NEXT: psrad $16, %xmm0
1946 ; X64-SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
1947 ; X64-SSE-NEXT: psllq $32, %xmm0
1948 ; X64-SSE-NEXT: movq %xmm0, (%rax,%rsi,4)
1949 ; X64-SSE-NEXT: retq
1951 ; X64-AVX-LABEL: mul_2xi16_varconst4:
1952 ; X64-AVX: # %bb.0: # %entry
1953 ; X64-AVX-NEXT: movq c(%rip), %rax
1954 ; X64-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1955 ; X64-AVX-NEXT: vpmovsxwd %xmm0, %xmm0
1956 ; X64-AVX-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1957 ; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4)
1958 ; X64-AVX-NEXT: retq
1960 %pre = load ptr, ptr @c
1961 %tmp6 = getelementptr inbounds i8, ptr %a, i64 %index
1962 %wide.load = load <2 x i16>, ptr %tmp6, align 1
1963 %tmp8 = sext <2 x i16> %wide.load to <2 x i32>
1964 %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 0, i32 32768>
1965 %tmp14 = getelementptr inbounds i32, ptr %pre, i64 %index
1966 store <2 x i32> %tmp13, ptr %tmp14, align 4
1974 define void @PR34947(ptr %p0, ptr %p1) nounwind {
1975 ; X86-SSE-LABEL: PR34947:
1977 ; X86-SSE-NEXT: pushl %ebp
1978 ; X86-SSE-NEXT: pushl %ebx
1979 ; X86-SSE-NEXT: pushl %edi
1980 ; X86-SSE-NEXT: pushl %esi
1981 ; X86-SSE-NEXT: pushl %eax
1982 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
1983 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
1984 ; X86-SSE-NEXT: movzwl 16(%eax), %edx
1985 ; X86-SSE-NEXT: movl %edx, (%esp) # 4-byte Spill
1986 ; X86-SSE-NEXT: movdqa (%eax), %xmm3
1987 ; X86-SSE-NEXT: movdqa (%ecx), %xmm0
1988 ; X86-SSE-NEXT: movdqa 16(%ecx), %xmm1
1989 ; X86-SSE-NEXT: pxor %xmm5, %xmm5
1990 ; X86-SSE-NEXT: movdqa %xmm3, %xmm2
1991 ; X86-SSE-NEXT: pextrw $7, %xmm3, %eax
1992 ; X86-SSE-NEXT: pextrw $4, %xmm3, %edi
1993 ; X86-SSE-NEXT: pextrw $0, %xmm3, %ebp
1994 ; X86-SSE-NEXT: pextrw $1, %xmm3, %esi
1995 ; X86-SSE-NEXT: pextrw $3, %xmm3, %ebx
1996 ; X86-SSE-NEXT: movdqa %xmm3, %xmm4
1997 ; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
1998 ; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7]
1999 ; X86-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[3,3,3,3]
2000 ; X86-SSE-NEXT: movd %xmm3, %ecx
2001 ; X86-SSE-NEXT: xorl %edx, %edx
2002 ; X86-SSE-NEXT: divl %ecx
2003 ; X86-SSE-NEXT: movd %edx, %xmm3
2004 ; X86-SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm2[2,3,2,3]
2005 ; X86-SSE-NEXT: movd %xmm5, %eax
2006 ; X86-SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[2,3,2,3]
2007 ; X86-SSE-NEXT: movd %xmm5, %ecx
2008 ; X86-SSE-NEXT: xorl %edx, %edx
2009 ; X86-SSE-NEXT: divl %ecx
2010 ; X86-SSE-NEXT: movd %edx, %xmm5
2011 ; X86-SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1]
2012 ; X86-SSE-NEXT: movl %edi, %eax
2013 ; X86-SSE-NEXT: xorl %edx, %edx
2014 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edi
2015 ; X86-SSE-NEXT: divl 16(%edi)
2016 ; X86-SSE-NEXT: movd %edx, %xmm3
2017 ; X86-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,1,1]
2018 ; X86-SSE-NEXT: movd %xmm2, %eax
2019 ; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1]
2020 ; X86-SSE-NEXT: movd %xmm1, %ecx
2021 ; X86-SSE-NEXT: xorl %edx, %edx
2022 ; X86-SSE-NEXT: divl %ecx
2023 ; X86-SSE-NEXT: movd %edx, %xmm1
2024 ; X86-SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
2025 ; X86-SSE-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm5[0]
2026 ; X86-SSE-NEXT: movl %ebp, %eax
2027 ; X86-SSE-NEXT: xorl %edx, %edx
2028 ; X86-SSE-NEXT: divl (%edi)
2029 ; X86-SSE-NEXT: movd %edx, %xmm1
2030 ; X86-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
2031 ; X86-SSE-NEXT: movd %xmm2, %ecx
2032 ; X86-SSE-NEXT: movl %esi, %eax
2033 ; X86-SSE-NEXT: xorl %edx, %edx
2034 ; X86-SSE-NEXT: divl %ecx
2035 ; X86-SSE-NEXT: movd %edx, %xmm2
2036 ; X86-SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
2037 ; X86-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[3,3,3,3]
2038 ; X86-SSE-NEXT: movd %xmm2, %ecx
2039 ; X86-SSE-NEXT: movl %ebx, %eax
2040 ; X86-SSE-NEXT: xorl %edx, %edx
2041 ; X86-SSE-NEXT: divl %ecx
2042 ; X86-SSE-NEXT: movd %edx, %xmm2
2043 ; X86-SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,2,3]
2044 ; X86-SSE-NEXT: movd %xmm4, %eax
2045 ; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
2046 ; X86-SSE-NEXT: movd %xmm0, %ecx
2047 ; X86-SSE-NEXT: xorl %edx, %edx
2048 ; X86-SSE-NEXT: divl %ecx
2049 ; X86-SSE-NEXT: movd %edx, %xmm0
2050 ; X86-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
2051 ; X86-SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
2052 ; X86-SSE-NEXT: movl (%esp), %eax # 4-byte Reload
2053 ; X86-SSE-NEXT: xorl %edx, %edx
2054 ; X86-SSE-NEXT: divl 32(%edi)
2055 ; X86-SSE-NEXT: movdqa {{.*#+}} xmm2 = [8199,8199,8199,8199]
2056 ; X86-SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3]
2057 ; X86-SSE-NEXT: pmuludq %xmm2, %xmm1
2058 ; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
2059 ; X86-SSE-NEXT: pmuludq %xmm2, %xmm4
2060 ; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,2,2,3]
2061 ; X86-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
2062 ; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
2063 ; X86-SSE-NEXT: pmuludq %xmm2, %xmm3
2064 ; X86-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
2065 ; X86-SSE-NEXT: pmuludq %xmm2, %xmm1
2066 ; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
2067 ; X86-SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
2068 ; X86-SSE-NEXT: imull $8199, %edx, %eax # imm = 0x2007
2069 ; X86-SSE-NEXT: movl %eax, (%eax)
2070 ; X86-SSE-NEXT: movdqa %xmm3, (%eax)
2071 ; X86-SSE-NEXT: movdqa %xmm0, (%eax)
2072 ; X86-SSE-NEXT: addl $4, %esp
2073 ; X86-SSE-NEXT: popl %esi
2074 ; X86-SSE-NEXT: popl %edi
2075 ; X86-SSE-NEXT: popl %ebx
2076 ; X86-SSE-NEXT: popl %ebp
2077 ; X86-SSE-NEXT: retl
2079 ; X86-AVX1-LABEL: PR34947:
2080 ; X86-AVX1: # %bb.0:
2081 ; X86-AVX1-NEXT: pushl %ebp
2082 ; X86-AVX1-NEXT: pushl %ebx
2083 ; X86-AVX1-NEXT: pushl %edi
2084 ; X86-AVX1-NEXT: pushl %esi
2085 ; X86-AVX1-NEXT: subl $16, %esp
2086 ; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %ecx
2087 ; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax
2088 ; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
2089 ; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
2090 ; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
2091 ; X86-AVX1-NEXT: vmovd %xmm2, %eax
2092 ; X86-AVX1-NEXT: xorl %edx, %edx
2093 ; X86-AVX1-NEXT: divl 32(%ecx)
2094 ; X86-AVX1-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
2095 ; X86-AVX1-NEXT: vpextrd $3, %xmm1, %eax
2096 ; X86-AVX1-NEXT: xorl %edx, %edx
2097 ; X86-AVX1-NEXT: divl 28(%ecx)
2098 ; X86-AVX1-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
2099 ; X86-AVX1-NEXT: vpextrd $2, %xmm1, %eax
2100 ; X86-AVX1-NEXT: xorl %edx, %edx
2101 ; X86-AVX1-NEXT: divl 24(%ecx)
2102 ; X86-AVX1-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
2103 ; X86-AVX1-NEXT: vpextrd $1, %xmm1, %eax
2104 ; X86-AVX1-NEXT: xorl %edx, %edx
2105 ; X86-AVX1-NEXT: divl 20(%ecx)
2106 ; X86-AVX1-NEXT: movl %edx, (%esp) # 4-byte Spill
2107 ; X86-AVX1-NEXT: vmovd %xmm1, %eax
2108 ; X86-AVX1-NEXT: xorl %edx, %edx
2109 ; X86-AVX1-NEXT: divl 16(%ecx)
2110 ; X86-AVX1-NEXT: movl %edx, %ebp
2111 ; X86-AVX1-NEXT: vpextrd $3, %xmm0, %eax
2112 ; X86-AVX1-NEXT: xorl %edx, %edx
2113 ; X86-AVX1-NEXT: divl 12(%ecx)
2114 ; X86-AVX1-NEXT: movl %edx, %ebx
2115 ; X86-AVX1-NEXT: vpextrd $2, %xmm0, %eax
2116 ; X86-AVX1-NEXT: xorl %edx, %edx
2117 ; X86-AVX1-NEXT: divl 8(%ecx)
2118 ; X86-AVX1-NEXT: movl %edx, %esi
2119 ; X86-AVX1-NEXT: vpextrd $1, %xmm0, %eax
2120 ; X86-AVX1-NEXT: xorl %edx, %edx
2121 ; X86-AVX1-NEXT: divl 4(%ecx)
2122 ; X86-AVX1-NEXT: movl %edx, %edi
2123 ; X86-AVX1-NEXT: vmovd %xmm0, %eax
2124 ; X86-AVX1-NEXT: xorl %edx, %edx
2125 ; X86-AVX1-NEXT: divl (%ecx)
2126 ; X86-AVX1-NEXT: vmovd %edx, %xmm0
2127 ; X86-AVX1-NEXT: vpinsrd $1, %edi, %xmm0, %xmm0
2128 ; X86-AVX1-NEXT: vpinsrd $2, %esi, %xmm0, %xmm0
2129 ; X86-AVX1-NEXT: vpinsrd $3, %ebx, %xmm0, %xmm0
2130 ; X86-AVX1-NEXT: vmovd %ebp, %xmm1
2131 ; X86-AVX1-NEXT: vpinsrd $1, (%esp), %xmm1, %xmm1 # 4-byte Folded Reload
2132 ; X86-AVX1-NEXT: vpinsrd $2, {{[-0-9]+}}(%e{{[sb]}}p), %xmm1, %xmm1 # 4-byte Folded Reload
2133 ; X86-AVX1-NEXT: vpinsrd $3, {{[-0-9]+}}(%e{{[sb]}}p), %xmm1, %xmm1 # 4-byte Folded Reload
2134 ; X86-AVX1-NEXT: imull $8199, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
2135 ; X86-AVX1-NEXT: # imm = 0x2007
2136 ; X86-AVX1-NEXT: movl %eax, (%eax)
2137 ; X86-AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [8199,8199,8199,8199]
2138 ; X86-AVX1-NEXT: vpmulld %xmm2, %xmm0, %xmm0
2139 ; X86-AVX1-NEXT: vpmulld %xmm2, %xmm1, %xmm1
2140 ; X86-AVX1-NEXT: vmovdqa %xmm1, (%eax)
2141 ; X86-AVX1-NEXT: vmovdqa %xmm0, (%eax)
2142 ; X86-AVX1-NEXT: addl $16, %esp
2143 ; X86-AVX1-NEXT: popl %esi
2144 ; X86-AVX1-NEXT: popl %edi
2145 ; X86-AVX1-NEXT: popl %ebx
2146 ; X86-AVX1-NEXT: popl %ebp
2147 ; X86-AVX1-NEXT: retl
2149 ; X86-AVX2-LABEL: PR34947:
2150 ; X86-AVX2: # %bb.0:
2151 ; X86-AVX2-NEXT: pushl %esi
2152 ; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %esi
2153 ; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax
2154 ; X86-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
2155 ; X86-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
2156 ; X86-AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
2157 ; X86-AVX2-NEXT: vpextrd $1, %xmm2, %eax
2158 ; X86-AVX2-NEXT: xorl %edx, %edx
2159 ; X86-AVX2-NEXT: divl 20(%esi)
2160 ; X86-AVX2-NEXT: movl %edx, %ecx
2161 ; X86-AVX2-NEXT: vmovd %xmm2, %eax
2162 ; X86-AVX2-NEXT: xorl %edx, %edx
2163 ; X86-AVX2-NEXT: divl 16(%esi)
2164 ; X86-AVX2-NEXT: vmovd %edx, %xmm3
2165 ; X86-AVX2-NEXT: vpinsrd $1, %ecx, %xmm3, %xmm3
2166 ; X86-AVX2-NEXT: vpextrd $2, %xmm2, %eax
2167 ; X86-AVX2-NEXT: xorl %edx, %edx
2168 ; X86-AVX2-NEXT: divl 24(%esi)
2169 ; X86-AVX2-NEXT: vpinsrd $2, %edx, %xmm3, %xmm3
2170 ; X86-AVX2-NEXT: vpextrd $3, %xmm2, %eax
2171 ; X86-AVX2-NEXT: xorl %edx, %edx
2172 ; X86-AVX2-NEXT: divl 28(%esi)
2173 ; X86-AVX2-NEXT: vpinsrd $3, %edx, %xmm3, %xmm2
2174 ; X86-AVX2-NEXT: vpextrd $1, %xmm1, %eax
2175 ; X86-AVX2-NEXT: xorl %edx, %edx
2176 ; X86-AVX2-NEXT: divl 4(%esi)
2177 ; X86-AVX2-NEXT: movl %edx, %ecx
2178 ; X86-AVX2-NEXT: vmovd %xmm1, %eax
2179 ; X86-AVX2-NEXT: xorl %edx, %edx
2180 ; X86-AVX2-NEXT: divl (%esi)
2181 ; X86-AVX2-NEXT: vmovd %edx, %xmm3
2182 ; X86-AVX2-NEXT: vpinsrd $1, %ecx, %xmm3, %xmm3
2183 ; X86-AVX2-NEXT: vpextrd $2, %xmm1, %eax
2184 ; X86-AVX2-NEXT: xorl %edx, %edx
2185 ; X86-AVX2-NEXT: divl 8(%esi)
2186 ; X86-AVX2-NEXT: vpinsrd $2, %edx, %xmm3, %xmm3
2187 ; X86-AVX2-NEXT: vpextrd $3, %xmm1, %eax
2188 ; X86-AVX2-NEXT: xorl %edx, %edx
2189 ; X86-AVX2-NEXT: divl 12(%esi)
2190 ; X86-AVX2-NEXT: vpinsrd $3, %edx, %xmm3, %xmm1
2191 ; X86-AVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
2192 ; X86-AVX2-NEXT: vmovd %xmm0, %eax
2193 ; X86-AVX2-NEXT: xorl %edx, %edx
2194 ; X86-AVX2-NEXT: divl 32(%esi)
2195 ; X86-AVX2-NEXT: vpbroadcastd {{.*#+}} ymm0 = [8199,8199,8199,8199,8199,8199,8199,8199]
2196 ; X86-AVX2-NEXT: vpmulld %ymm0, %ymm1, %ymm0
2197 ; X86-AVX2-NEXT: imull $8199, %edx, %eax # imm = 0x2007
2198 ; X86-AVX2-NEXT: movl %eax, (%eax)
2199 ; X86-AVX2-NEXT: vmovdqa %ymm0, (%eax)
2200 ; X86-AVX2-NEXT: popl %esi
2201 ; X86-AVX2-NEXT: vzeroupper
2202 ; X86-AVX2-NEXT: retl
2204 ; X64-SSE-LABEL: PR34947:
2206 ; X64-SSE-NEXT: movzwl 16(%rdi), %ecx
2207 ; X64-SSE-NEXT: movdqa (%rdi), %xmm3
2208 ; X64-SSE-NEXT: movdqa (%rsi), %xmm0
2209 ; X64-SSE-NEXT: movdqa 16(%rsi), %xmm1
2210 ; X64-SSE-NEXT: pxor %xmm5, %xmm5
2211 ; X64-SSE-NEXT: movdqa %xmm3, %xmm2
2212 ; X64-SSE-NEXT: pextrw $7, %xmm3, %eax
2213 ; X64-SSE-NEXT: pextrw $4, %xmm3, %r8d
2214 ; X64-SSE-NEXT: pextrw $0, %xmm3, %r10d
2215 ; X64-SSE-NEXT: pextrw $1, %xmm3, %edi
2216 ; X64-SSE-NEXT: pextrw $3, %xmm3, %r9d
2217 ; X64-SSE-NEXT: movdqa %xmm3, %xmm4
2218 ; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
2219 ; X64-SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7]
2220 ; X64-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[3,3,3,3]
2221 ; X64-SSE-NEXT: movd %xmm3, %r11d
2222 ; X64-SSE-NEXT: xorl %edx, %edx
2223 ; X64-SSE-NEXT: divl %r11d
2224 ; X64-SSE-NEXT: movd %edx, %xmm3
2225 ; X64-SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm2[2,3,2,3]
2226 ; X64-SSE-NEXT: movd %xmm5, %eax
2227 ; X64-SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[2,3,2,3]
2228 ; X64-SSE-NEXT: movd %xmm5, %r11d
2229 ; X64-SSE-NEXT: xorl %edx, %edx
2230 ; X64-SSE-NEXT: divl %r11d
2231 ; X64-SSE-NEXT: movd %edx, %xmm5
2232 ; X64-SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1]
2233 ; X64-SSE-NEXT: movl %r8d, %eax
2234 ; X64-SSE-NEXT: xorl %edx, %edx
2235 ; X64-SSE-NEXT: divl 16(%rsi)
2236 ; X64-SSE-NEXT: movd %edx, %xmm3
2237 ; X64-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,1,1]
2238 ; X64-SSE-NEXT: movd %xmm2, %eax
2239 ; X64-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1]
2240 ; X64-SSE-NEXT: movd %xmm1, %r8d
2241 ; X64-SSE-NEXT: xorl %edx, %edx
2242 ; X64-SSE-NEXT: divl %r8d
2243 ; X64-SSE-NEXT: movd %edx, %xmm1
2244 ; X64-SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
2245 ; X64-SSE-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm5[0]
2246 ; X64-SSE-NEXT: movl %r10d, %eax
2247 ; X64-SSE-NEXT: xorl %edx, %edx
2248 ; X64-SSE-NEXT: divl (%rsi)
2249 ; X64-SSE-NEXT: movd %edx, %xmm1
2250 ; X64-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
2251 ; X64-SSE-NEXT: movd %xmm2, %r8d
2252 ; X64-SSE-NEXT: movl %edi, %eax
2253 ; X64-SSE-NEXT: xorl %edx, %edx
2254 ; X64-SSE-NEXT: divl %r8d
2255 ; X64-SSE-NEXT: movd %edx, %xmm2
2256 ; X64-SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
2257 ; X64-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[3,3,3,3]
2258 ; X64-SSE-NEXT: movd %xmm2, %edi
2259 ; X64-SSE-NEXT: movl %r9d, %eax
2260 ; X64-SSE-NEXT: xorl %edx, %edx
2261 ; X64-SSE-NEXT: divl %edi
2262 ; X64-SSE-NEXT: movd %edx, %xmm2
2263 ; X64-SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,2,3]
2264 ; X64-SSE-NEXT: movd %xmm4, %eax
2265 ; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
2266 ; X64-SSE-NEXT: movd %xmm0, %edi
2267 ; X64-SSE-NEXT: xorl %edx, %edx
2268 ; X64-SSE-NEXT: divl %edi
2269 ; X64-SSE-NEXT: movd %edx, %xmm0
2270 ; X64-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
2271 ; X64-SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
2272 ; X64-SSE-NEXT: movl %ecx, %eax
2273 ; X64-SSE-NEXT: xorl %edx, %edx
2274 ; X64-SSE-NEXT: divl 32(%rsi)
2275 ; X64-SSE-NEXT: movdqa {{.*#+}} xmm0 = [8199,8199,8199,8199]
2276 ; X64-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
2277 ; X64-SSE-NEXT: pmuludq %xmm0, %xmm1
2278 ; X64-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
2279 ; X64-SSE-NEXT: pmuludq %xmm0, %xmm2
2280 ; X64-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
2281 ; X64-SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
2282 ; X64-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
2283 ; X64-SSE-NEXT: pmuludq %xmm0, %xmm3
2284 ; X64-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
2285 ; X64-SSE-NEXT: pmuludq %xmm0, %xmm2
2286 ; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
2287 ; X64-SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1]
2288 ; X64-SSE-NEXT: imull $8199, %edx, %eax # imm = 0x2007
2289 ; X64-SSE-NEXT: movl %eax, (%rax)
2290 ; X64-SSE-NEXT: movdqa %xmm3, (%rax)
2291 ; X64-SSE-NEXT: movdqa %xmm1, (%rax)
2292 ; X64-SSE-NEXT: retq
2294 ; X64-AVX1-LABEL: PR34947:
2295 ; X64-AVX1: # %bb.0:
2296 ; X64-AVX1-NEXT: pushq %rbp
2297 ; X64-AVX1-NEXT: pushq %rbx
2298 ; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
2299 ; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
2300 ; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
2301 ; X64-AVX1-NEXT: vmovd %xmm2, %eax
2302 ; X64-AVX1-NEXT: xorl %edx, %edx
2303 ; X64-AVX1-NEXT: divl 32(%rsi)
2304 ; X64-AVX1-NEXT: movl %edx, %ecx
2305 ; X64-AVX1-NEXT: vpextrd $3, %xmm1, %eax
2306 ; X64-AVX1-NEXT: xorl %edx, %edx
2307 ; X64-AVX1-NEXT: divl 28(%rsi)
2308 ; X64-AVX1-NEXT: movl %edx, %edi
2309 ; X64-AVX1-NEXT: vpextrd $2, %xmm1, %eax
2310 ; X64-AVX1-NEXT: xorl %edx, %edx
2311 ; X64-AVX1-NEXT: divl 24(%rsi)
2312 ; X64-AVX1-NEXT: movl %edx, %r8d
2313 ; X64-AVX1-NEXT: vpextrd $1, %xmm1, %eax
2314 ; X64-AVX1-NEXT: xorl %edx, %edx
2315 ; X64-AVX1-NEXT: divl 20(%rsi)
2316 ; X64-AVX1-NEXT: movl %edx, %r9d
2317 ; X64-AVX1-NEXT: vmovd %xmm1, %eax
2318 ; X64-AVX1-NEXT: xorl %edx, %edx
2319 ; X64-AVX1-NEXT: divl 16(%rsi)
2320 ; X64-AVX1-NEXT: movl %edx, %r10d
2321 ; X64-AVX1-NEXT: vpextrd $3, %xmm0, %eax
2322 ; X64-AVX1-NEXT: xorl %edx, %edx
2323 ; X64-AVX1-NEXT: divl 12(%rsi)
2324 ; X64-AVX1-NEXT: movl %edx, %r11d
2325 ; X64-AVX1-NEXT: vpextrd $2, %xmm0, %eax
2326 ; X64-AVX1-NEXT: xorl %edx, %edx
2327 ; X64-AVX1-NEXT: divl 8(%rsi)
2328 ; X64-AVX1-NEXT: movl %edx, %ebx
2329 ; X64-AVX1-NEXT: vpextrd $1, %xmm0, %eax
2330 ; X64-AVX1-NEXT: xorl %edx, %edx
2331 ; X64-AVX1-NEXT: divl 4(%rsi)
2332 ; X64-AVX1-NEXT: movl %edx, %ebp
2333 ; X64-AVX1-NEXT: vmovd %xmm0, %eax
2334 ; X64-AVX1-NEXT: xorl %edx, %edx
2335 ; X64-AVX1-NEXT: divl (%rsi)
2336 ; X64-AVX1-NEXT: vmovd %edx, %xmm0
2337 ; X64-AVX1-NEXT: vpinsrd $1, %ebp, %xmm0, %xmm0
2338 ; X64-AVX1-NEXT: vpinsrd $2, %ebx, %xmm0, %xmm0
2339 ; X64-AVX1-NEXT: vpinsrd $3, %r11d, %xmm0, %xmm0
2340 ; X64-AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [8199,8199,8199,8199]
2341 ; X64-AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0
2342 ; X64-AVX1-NEXT: vmovd %r10d, %xmm2
2343 ; X64-AVX1-NEXT: vpinsrd $1, %r9d, %xmm2, %xmm2
2344 ; X64-AVX1-NEXT: vpinsrd $2, %r8d, %xmm2, %xmm2
2345 ; X64-AVX1-NEXT: vpinsrd $3, %edi, %xmm2, %xmm2
2346 ; X64-AVX1-NEXT: vpmulld %xmm1, %xmm2, %xmm1
2347 ; X64-AVX1-NEXT: imull $8199, %ecx, %eax # imm = 0x2007
2348 ; X64-AVX1-NEXT: movl %eax, (%rax)
2349 ; X64-AVX1-NEXT: vmovdqa %xmm1, (%rax)
2350 ; X64-AVX1-NEXT: vmovdqa %xmm0, (%rax)
2351 ; X64-AVX1-NEXT: popq %rbx
2352 ; X64-AVX1-NEXT: popq %rbp
2353 ; X64-AVX1-NEXT: retq
2355 ; X64-AVX2-LABEL: PR34947:
2356 ; X64-AVX2: # %bb.0:
2357 ; X64-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
2358 ; X64-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
2359 ; X64-AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
2360 ; X64-AVX2-NEXT: vpextrd $1, %xmm2, %eax
2361 ; X64-AVX2-NEXT: xorl %edx, %edx
2362 ; X64-AVX2-NEXT: divl 20(%rsi)
2363 ; X64-AVX2-NEXT: movl %edx, %ecx
2364 ; X64-AVX2-NEXT: vmovd %xmm2, %eax
2365 ; X64-AVX2-NEXT: xorl %edx, %edx
2366 ; X64-AVX2-NEXT: divl 16(%rsi)
2367 ; X64-AVX2-NEXT: vmovd %edx, %xmm3
2368 ; X64-AVX2-NEXT: vpinsrd $1, %ecx, %xmm3, %xmm3
2369 ; X64-AVX2-NEXT: vpextrd $2, %xmm2, %eax
2370 ; X64-AVX2-NEXT: xorl %edx, %edx
2371 ; X64-AVX2-NEXT: divl 24(%rsi)
2372 ; X64-AVX2-NEXT: vpinsrd $2, %edx, %xmm3, %xmm3
2373 ; X64-AVX2-NEXT: vpextrd $3, %xmm2, %eax
2374 ; X64-AVX2-NEXT: xorl %edx, %edx
2375 ; X64-AVX2-NEXT: divl 28(%rsi)
2376 ; X64-AVX2-NEXT: vpinsrd $3, %edx, %xmm3, %xmm2
2377 ; X64-AVX2-NEXT: vpextrd $1, %xmm1, %eax
2378 ; X64-AVX2-NEXT: xorl %edx, %edx
2379 ; X64-AVX2-NEXT: divl 4(%rsi)
2380 ; X64-AVX2-NEXT: movl %edx, %ecx
2381 ; X64-AVX2-NEXT: vmovd %xmm1, %eax
2382 ; X64-AVX2-NEXT: xorl %edx, %edx
2383 ; X64-AVX2-NEXT: divl (%rsi)
2384 ; X64-AVX2-NEXT: vmovd %edx, %xmm3
2385 ; X64-AVX2-NEXT: vpinsrd $1, %ecx, %xmm3, %xmm3
2386 ; X64-AVX2-NEXT: vpextrd $2, %xmm1, %eax
2387 ; X64-AVX2-NEXT: xorl %edx, %edx
2388 ; X64-AVX2-NEXT: divl 8(%rsi)
2389 ; X64-AVX2-NEXT: vpinsrd $2, %edx, %xmm3, %xmm3
2390 ; X64-AVX2-NEXT: vpextrd $3, %xmm1, %eax
2391 ; X64-AVX2-NEXT: xorl %edx, %edx
2392 ; X64-AVX2-NEXT: divl 12(%rsi)
2393 ; X64-AVX2-NEXT: vpinsrd $3, %edx, %xmm3, %xmm1
2394 ; X64-AVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
2395 ; X64-AVX2-NEXT: vmovd %xmm0, %eax
2396 ; X64-AVX2-NEXT: xorl %edx, %edx
2397 ; X64-AVX2-NEXT: divl 32(%rsi)
2398 ; X64-AVX2-NEXT: vpbroadcastd {{.*#+}} ymm0 = [8199,8199,8199,8199,8199,8199,8199,8199]
2399 ; X64-AVX2-NEXT: vpmulld %ymm0, %ymm1, %ymm0
2400 ; X64-AVX2-NEXT: imull $8199, %edx, %eax # imm = 0x2007
2401 ; X64-AVX2-NEXT: movl %eax, (%rax)
2402 ; X64-AVX2-NEXT: vmovdqa %ymm0, (%rax)
2403 ; X64-AVX2-NEXT: vzeroupper
2404 ; X64-AVX2-NEXT: retq
2405 %a0 = load <9 x i16>, ptr %p0, align 64
2406 %a1 = load <9 x i32>, ptr %p1, align 64
2407 %ext0 = zext <9 x i16> %a0 to <9 x i32>
2408 %rem = urem <9 x i32> %ext0, %a1
2409 %mul = mul <9 x i32> <i32 8199, i32 8199, i32 8199, i32 8199, i32 8199, i32 8199, i32 8199, i32 8199, i32 8199>, %rem
2410 store <9 x i32> %mul, ptr undef, align 64