; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X86 --check-prefix=X86-SSE
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=X86 --check-prefix=X86-AVX --check-prefix=X86-AVX1
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=X86 --check-prefix=X86-AVX --check-prefix=X86-AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X64 --check-prefix=X64-SSE
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX --check-prefix=X64-AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX --check-prefix=X64-AVX2

@c = external global i32*, align 8

; %val1 = load <2 x i8>
; %op1 = zext<2 x i32> %val1
; %val2 = load <2 x i8>
; %op2 = zext<2 x i32> %val2
; %rst = mul <2 x i32> %op1, %op2
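;
; Both inputs are zero-extended from i8, so each product fits in 16 bits. The
; SSE2 lowering below therefore multiplies once with pmullw and zero-extends
; the 16-bit results to i32; the AVX lowering widens with vpmovzxbd and uses
; vpmaddwd, which is safe here because the high 16 bits of every lane are zero.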
;
define void @mul_2xi8(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) nounwind {
; X86-SSE-LABEL: mul_2xi8:
; X86-SSE: # %bb.0: # %entry
; X86-SSE-NEXT: pushl %esi
; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-SSE-NEXT: movl c, %esi
; X86-SSE-NEXT: movzwl (%edx,%ecx), %edx
; X86-SSE-NEXT: movd %edx, %xmm0
; X86-SSE-NEXT: movzwl (%eax,%ecx), %eax
; X86-SSE-NEXT: movd %eax, %xmm1
; X86-SSE-NEXT: pxor %xmm2, %xmm2
; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; X86-SSE-NEXT: pmullw %xmm0, %xmm1
; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; X86-SSE-NEXT: movq %xmm1, (%esi,%ecx,4)
; X86-SSE-NEXT: popl %esi
; X86-SSE-NEXT: retl
;
; X86-AVX-LABEL: mul_2xi8:
; X86-AVX: # %bb.0: # %entry
; X86-AVX-NEXT: pushl %esi
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-AVX-NEXT: movl c, %esi
; X86-AVX-NEXT: movzwl (%edx,%ecx), %edx
; X86-AVX-NEXT: vmovd %edx, %xmm0
; X86-AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; X86-AVX-NEXT: movzwl (%eax,%ecx), %eax
; X86-AVX-NEXT: vmovd %eax, %xmm1
; X86-AVX-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
; X86-AVX-NEXT: vpmaddwd %xmm0, %xmm1, %xmm0
; X86-AVX-NEXT: vmovq %xmm0, (%esi,%ecx,4)
; X86-AVX-NEXT: popl %esi
; X86-AVX-NEXT: retl
;
; X64-SSE-LABEL: mul_2xi8:
; X64-SSE: # %bb.0: # %entry
; X64-SSE-NEXT: movq {{.*}}(%rip), %rax
; X64-SSE-NEXT: movzwl (%rdi,%rdx), %ecx
; X64-SSE-NEXT: movd %ecx, %xmm0
; X64-SSE-NEXT: movzwl (%rsi,%rdx), %ecx
; X64-SSE-NEXT: movd %ecx, %xmm1
; X64-SSE-NEXT: pxor %xmm2, %xmm2
; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; X64-SSE-NEXT: pmullw %xmm0, %xmm1
; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; X64-SSE-NEXT: movq %xmm1, (%rax,%rdx,4)
; X64-SSE-NEXT: retq
;
; X64-AVX-LABEL: mul_2xi8:
; X64-AVX: # %bb.0: # %entry
; X64-AVX-NEXT: movq {{.*}}(%rip), %rax
; X64-AVX-NEXT: movzwl (%rdi,%rdx), %ecx
; X64-AVX-NEXT: vmovd %ecx, %xmm0
; X64-AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; X64-AVX-NEXT: movzwl (%rsi,%rdx), %ecx
; X64-AVX-NEXT: vmovd %ecx, %xmm1
; X64-AVX-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
; X64-AVX-NEXT: vpmaddwd %xmm0, %xmm1, %xmm0
; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rdx,4)
; X64-AVX-NEXT: retq
entry:
  %pre = load i32*, i32** @c
  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
  %tmp7 = bitcast i8* %tmp6 to <2 x i8>*
  %wide.load = load <2 x i8>, <2 x i8>* %tmp7, align 1
  %tmp8 = zext <2 x i8> %wide.load to <2 x i32>
  %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
  %tmp11 = bitcast i8* %tmp10 to <2 x i8>*
  %wide.load17 = load <2 x i8>, <2 x i8>* %tmp11, align 1
  %tmp12 = zext <2 x i8> %wide.load17 to <2 x i32>
  %tmp13 = mul nuw nsw <2 x i32> %tmp12, %tmp8
  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
  %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
  store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
  ret void
}

; %val1 = load <4 x i8>
; %op1 = zext<4 x i32> %val1
; %val2 = load <4 x i8>
; %op2 = zext<4 x i32> %val2
; %rst = mul <4 x i32> %op1, %op2
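;
; Same zero-extended narrowing as mul_2xi8, but with four elements the inputs
; are loaded straight from memory (movd/vpmovzxbd) and the result is a full
; unaligned 16-byte store (movdqu/vmovdqu).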
;
define void @mul_4xi8(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) nounwind {
; X86-SSE-LABEL: mul_4xi8:
; X86-SSE: # %bb.0: # %entry
; X86-SSE-NEXT: pushl %esi
; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-SSE-NEXT: movl c, %esi
; X86-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X86-SSE-NEXT: pxor %xmm2, %xmm2
; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; X86-SSE-NEXT: pmullw %xmm0, %xmm1
; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; X86-SSE-NEXT: movdqu %xmm1, (%esi,%ecx,4)
; X86-SSE-NEXT: popl %esi
; X86-SSE-NEXT: retl
;
; X86-AVX-LABEL: mul_4xi8:
; X86-AVX: # %bb.0: # %entry
; X86-AVX-NEXT: pushl %esi
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-AVX-NEXT: movl c, %esi
; X86-AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; X86-AVX-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; X86-AVX-NEXT: vpmaddwd %xmm0, %xmm1, %xmm0
; X86-AVX-NEXT: vmovdqu %xmm0, (%esi,%ecx,4)
; X86-AVX-NEXT: popl %esi
; X86-AVX-NEXT: retl
;
; X64-SSE-LABEL: mul_4xi8:
; X64-SSE: # %bb.0: # %entry
; X64-SSE-NEXT: movq {{.*}}(%rip), %rax
; X64-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X64-SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X64-SSE-NEXT: pxor %xmm2, %xmm2
; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; X64-SSE-NEXT: pmullw %xmm0, %xmm1
; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; X64-SSE-NEXT: movdqu %xmm1, (%rax,%rdx,4)
; X64-SSE-NEXT: retq
;
; X64-AVX-LABEL: mul_4xi8:
; X64-AVX: # %bb.0: # %entry
; X64-AVX-NEXT: movq {{.*}}(%rip), %rax
; X64-AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; X64-AVX-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; X64-AVX-NEXT: vpmaddwd %xmm0, %xmm1, %xmm0
; X64-AVX-NEXT: vmovdqu %xmm0, (%rax,%rdx,4)
; X64-AVX-NEXT: retq
entry:
  %pre = load i32*, i32** @c
  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
  %tmp7 = bitcast i8* %tmp6 to <4 x i8>*
  %wide.load = load <4 x i8>, <4 x i8>* %tmp7, align 1
  %tmp8 = zext <4 x i8> %wide.load to <4 x i32>
  %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
  %tmp11 = bitcast i8* %tmp10 to <4 x i8>*
  %wide.load17 = load <4 x i8>, <4 x i8>* %tmp11, align 1
  %tmp12 = zext <4 x i8> %wide.load17 to <4 x i32>
  %tmp13 = mul nuw nsw <4 x i32> %tmp12, %tmp8
  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
  %tmp15 = bitcast i32* %tmp14 to <4 x i32>*
  store <4 x i32> %tmp13, <4 x i32>* %tmp15, align 4
  ret void
}

; %val1 = load <8 x i8>
; %op1 = zext<8 x i32> %val1
; %val2 = load <8 x i8>
; %op2 = zext<8 x i32> %val2
; %rst = mul <8 x i32> %op1, %op2
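;
; Eight elements no longer fit in one xmm result: SSE2 unpacks the 16-bit
; products into low and high i32 halves and stores twice, AVX1 splits the work
; across two xmm vpmaddwd ops, and AVX2 can use a single ymm vpmaddwd.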
;
define void @mul_8xi8(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) nounwind {
; X86-SSE-LABEL: mul_8xi8:
; X86-SSE: # %bb.0: # %entry
; X86-SSE-NEXT: pushl %esi
; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-SSE-NEXT: movl c, %esi
; X86-SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; X86-SSE-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
; X86-SSE-NEXT: pxor %xmm2, %xmm2
; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; X86-SSE-NEXT: pmullw %xmm0, %xmm1
; X86-SSE-NEXT: movdqa %xmm1, %xmm0
; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; X86-SSE-NEXT: movdqu %xmm1, 16(%esi,%ecx,4)
; X86-SSE-NEXT: movdqu %xmm0, (%esi,%ecx,4)
; X86-SSE-NEXT: popl %esi
; X86-SSE-NEXT: retl
;
; X86-AVX1-LABEL: mul_8xi8:
; X86-AVX1: # %bb.0: # %entry
; X86-AVX1-NEXT: pushl %esi
; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-AVX1-NEXT: movl c, %esi
; X86-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; X86-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; X86-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; X86-AVX1-NEXT: vpmaddwd %xmm0, %xmm2, %xmm0
; X86-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; X86-AVX1-NEXT: vpmaddwd %xmm1, %xmm2, %xmm1
; X86-AVX1-NEXT: vmovdqu %xmm0, 16(%esi,%ecx,4)
; X86-AVX1-NEXT: vmovdqu %xmm1, (%esi,%ecx,4)
; X86-AVX1-NEXT: popl %esi
; X86-AVX1-NEXT: retl
;
; X86-AVX2-LABEL: mul_8xi8:
; X86-AVX2: # %bb.0: # %entry
; X86-AVX2-NEXT: pushl %esi
; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-AVX2-NEXT: movl c, %esi
; X86-AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
; X86-AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
; X86-AVX2-NEXT: vpmaddwd %ymm0, %ymm1, %ymm0
; X86-AVX2-NEXT: vmovdqu %ymm0, (%esi,%ecx,4)
; X86-AVX2-NEXT: popl %esi
; X86-AVX2-NEXT: vzeroupper
; X86-AVX2-NEXT: retl
;
; X64-SSE-LABEL: mul_8xi8:
; X64-SSE: # %bb.0: # %entry
; X64-SSE-NEXT: movq {{.*}}(%rip), %rax
; X64-SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; X64-SSE-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
; X64-SSE-NEXT: pxor %xmm2, %xmm2
; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; X64-SSE-NEXT: pmullw %xmm0, %xmm1
; X64-SSE-NEXT: movdqa %xmm1, %xmm0
; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; X64-SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; X64-SSE-NEXT: movdqu %xmm1, 16(%rax,%rdx,4)
; X64-SSE-NEXT: movdqu %xmm0, (%rax,%rdx,4)
; X64-SSE-NEXT: retq
;
; X64-AVX1-LABEL: mul_8xi8:
; X64-AVX1: # %bb.0: # %entry
; X64-AVX1-NEXT: movq {{.*}}(%rip), %rax
; X64-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; X64-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; X64-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; X64-AVX1-NEXT: vpmaddwd %xmm0, %xmm2, %xmm0
; X64-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; X64-AVX1-NEXT: vpmaddwd %xmm1, %xmm2, %xmm1
; X64-AVX1-NEXT: vmovdqu %xmm0, 16(%rax,%rdx,4)
; X64-AVX1-NEXT: vmovdqu %xmm1, (%rax,%rdx,4)
; X64-AVX1-NEXT: retq
;
; X64-AVX2-LABEL: mul_8xi8:
; X64-AVX2: # %bb.0: # %entry
; X64-AVX2-NEXT: movq {{.*}}(%rip), %rax
; X64-AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
; X64-AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
; X64-AVX2-NEXT: vpmaddwd %ymm0, %ymm1, %ymm0
; X64-AVX2-NEXT: vmovdqu %ymm0, (%rax,%rdx,4)
; X64-AVX2-NEXT: vzeroupper
; X64-AVX2-NEXT: retq
entry:
  %pre = load i32*, i32** @c
  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
  %tmp7 = bitcast i8* %tmp6 to <8 x i8>*
  %wide.load = load <8 x i8>, <8 x i8>* %tmp7, align 1
  %tmp8 = zext <8 x i8> %wide.load to <8 x i32>
  %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
  %tmp11 = bitcast i8* %tmp10 to <8 x i8>*
  %wide.load17 = load <8 x i8>, <8 x i8>* %tmp11, align 1
  %tmp12 = zext <8 x i8> %wide.load17 to <8 x i32>
  %tmp13 = mul nuw nsw <8 x i32> %tmp12, %tmp8
  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
  %tmp15 = bitcast i32* %tmp14 to <8 x i32>*
  store <8 x i32> %tmp13, <8 x i32>* %tmp15, align 4
  ret void
}

; %val1 = load <16 x i8>
; %op1 = zext<16 x i32> %val1
; %val2 = load <16 x i8>
; %op2 = zext<16 x i32> %val2
; %rst = mul <16 x i32> %op1, %op2
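;
; The full 16-byte case: SSE2 uses both punpcklbw and punpckhbw to form two
; 8 x i16 multiplies and four i32 stores; AVX1 needs four xmm vpmaddwd ops,
; AVX2 two ymm ones.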
;
define void @mul_16xi8(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) nounwind {
; X86-SSE-LABEL: mul_16xi8:
; X86-SSE: # %bb.0: # %entry
; X86-SSE-NEXT: pushl %esi
; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-SSE-NEXT: movl c, %esi
; X86-SSE-NEXT: movdqu (%edx,%ecx), %xmm0
; X86-SSE-NEXT: movdqu (%eax,%ecx), %xmm1
; X86-SSE-NEXT: pxor %xmm2, %xmm2
; X86-SSE-NEXT: movdqa %xmm0, %xmm3
; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
; X86-SSE-NEXT: movdqa %xmm1, %xmm4
; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
; X86-SSE-NEXT: pmullw %xmm3, %xmm4
; X86-SSE-NEXT: movdqa %xmm4, %xmm3
; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
; X86-SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
; X86-SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
; X86-SSE-NEXT: pmullw %xmm0, %xmm1
; X86-SSE-NEXT: movdqa %xmm1, %xmm0
; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; X86-SSE-NEXT: movdqu %xmm1, 48(%esi,%ecx,4)
; X86-SSE-NEXT: movdqu %xmm0, 32(%esi,%ecx,4)
; X86-SSE-NEXT: movdqu %xmm4, 16(%esi,%ecx,4)
; X86-SSE-NEXT: movdqu %xmm3, (%esi,%ecx,4)
; X86-SSE-NEXT: popl %esi
; X86-SSE-NEXT: retl
;
; X86-AVX1-LABEL: mul_16xi8:
; X86-AVX1: # %bb.0: # %entry
; X86-AVX1-NEXT: pushl %esi
; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-AVX1-NEXT: movl c, %esi
; X86-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; X86-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; X86-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; X86-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; X86-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; X86-AVX1-NEXT: vpmaddwd %xmm0, %xmm4, %xmm0
; X86-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; X86-AVX1-NEXT: vpmaddwd %xmm1, %xmm4, %xmm1
; X86-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; X86-AVX1-NEXT: vpmaddwd %xmm2, %xmm4, %xmm2
; X86-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; X86-AVX1-NEXT: vpmaddwd %xmm3, %xmm4, %xmm3
; X86-AVX1-NEXT: vmovdqu %xmm0, 48(%esi,%ecx,4)
; X86-AVX1-NEXT: vmovdqu %xmm1, 32(%esi,%ecx,4)
; X86-AVX1-NEXT: vmovdqu %xmm2, 16(%esi,%ecx,4)
; X86-AVX1-NEXT: vmovdqu %xmm3, (%esi,%ecx,4)
; X86-AVX1-NEXT: popl %esi
; X86-AVX1-NEXT: retl
;
; X86-AVX2-LABEL: mul_16xi8:
; X86-AVX2: # %bb.0: # %entry
; X86-AVX2-NEXT: pushl %esi
; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-AVX2-NEXT: movl c, %esi
; X86-AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
; X86-AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
; X86-AVX2-NEXT: vpmovzxbd {{.*#+}} ymm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
; X86-AVX2-NEXT: vpmaddwd %ymm0, %ymm2, %ymm0
; X86-AVX2-NEXT: vpmovzxbd {{.*#+}} ymm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
; X86-AVX2-NEXT: vpmaddwd %ymm1, %ymm2, %ymm1
; X86-AVX2-NEXT: vmovdqu %ymm0, 32(%esi,%ecx,4)
; X86-AVX2-NEXT: vmovdqu %ymm1, (%esi,%ecx,4)
; X86-AVX2-NEXT: popl %esi
; X86-AVX2-NEXT: vzeroupper
; X86-AVX2-NEXT: retl
;
; X64-SSE-LABEL: mul_16xi8:
; X64-SSE: # %bb.0: # %entry
; X64-SSE-NEXT: movq {{.*}}(%rip), %rax
; X64-SSE-NEXT: movdqu (%rdi,%rdx), %xmm0
; X64-SSE-NEXT: movdqu (%rsi,%rdx), %xmm1
; X64-SSE-NEXT: pxor %xmm2, %xmm2
; X64-SSE-NEXT: movdqa %xmm0, %xmm3
; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
; X64-SSE-NEXT: movdqa %xmm1, %xmm4
; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
; X64-SSE-NEXT: pmullw %xmm3, %xmm4
; X64-SSE-NEXT: movdqa %xmm4, %xmm3
; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
; X64-SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
; X64-SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
; X64-SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
; X64-SSE-NEXT: pmullw %xmm0, %xmm1
; X64-SSE-NEXT: movdqa %xmm1, %xmm0
; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; X64-SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; X64-SSE-NEXT: movdqu %xmm1, 48(%rax,%rdx,4)
; X64-SSE-NEXT: movdqu %xmm0, 32(%rax,%rdx,4)
; X64-SSE-NEXT: movdqu %xmm4, 16(%rax,%rdx,4)
; X64-SSE-NEXT: movdqu %xmm3, (%rax,%rdx,4)
; X64-SSE-NEXT: retq
;
; X64-AVX1-LABEL: mul_16xi8:
; X64-AVX1: # %bb.0: # %entry
; X64-AVX1-NEXT: movq {{.*}}(%rip), %rax
; X64-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; X64-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; X64-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; X64-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; X64-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; X64-AVX1-NEXT: vpmaddwd %xmm0, %xmm4, %xmm0
; X64-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; X64-AVX1-NEXT: vpmaddwd %xmm1, %xmm4, %xmm1
; X64-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; X64-AVX1-NEXT: vpmaddwd %xmm2, %xmm4, %xmm2
; X64-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; X64-AVX1-NEXT: vpmaddwd %xmm3, %xmm4, %xmm3
; X64-AVX1-NEXT: vmovdqu %xmm0, 48(%rax,%rdx,4)
; X64-AVX1-NEXT: vmovdqu %xmm1, 32(%rax,%rdx,4)
; X64-AVX1-NEXT: vmovdqu %xmm2, 16(%rax,%rdx,4)
; X64-AVX1-NEXT: vmovdqu %xmm3, (%rax,%rdx,4)
; X64-AVX1-NEXT: retq
;
; X64-AVX2-LABEL: mul_16xi8:
; X64-AVX2: # %bb.0: # %entry
; X64-AVX2-NEXT: movq {{.*}}(%rip), %rax
; X64-AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
; X64-AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
; X64-AVX2-NEXT: vpmovzxbd {{.*#+}} ymm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
; X64-AVX2-NEXT: vpmaddwd %ymm0, %ymm2, %ymm0
; X64-AVX2-NEXT: vpmovzxbd {{.*#+}} ymm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
; X64-AVX2-NEXT: vpmaddwd %ymm1, %ymm2, %ymm1
; X64-AVX2-NEXT: vmovdqu %ymm0, 32(%rax,%rdx,4)
; X64-AVX2-NEXT: vmovdqu %ymm1, (%rax,%rdx,4)
; X64-AVX2-NEXT: vzeroupper
; X64-AVX2-NEXT: retq
entry:
  %pre = load i32*, i32** @c
  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
  %tmp7 = bitcast i8* %tmp6 to <16 x i8>*
  %wide.load = load <16 x i8>, <16 x i8>* %tmp7, align 1
  %tmp8 = zext <16 x i8> %wide.load to <16 x i32>
  %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
  %tmp11 = bitcast i8* %tmp10 to <16 x i8>*
  %wide.load17 = load <16 x i8>, <16 x i8>* %tmp11, align 1
  %tmp12 = zext <16 x i8> %wide.load17 to <16 x i32>
  %tmp13 = mul nuw nsw <16 x i32> %tmp12, %tmp8
  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
  %tmp15 = bitcast i32* %tmp14 to <16 x i32>*
  store <16 x i32> %tmp13, <16 x i32>* %tmp15, align 4
  ret void
}

; %val1 = load <2 x i16>
; %op1 = zext<2 x i32> %val1
; %val2 = load <2 x i16>
; %op2 = zext<2 x i32> %val2
; %rst = mul <2 x i32> %op1, %op2
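;
; A product of two zero-extended i16 values needs all 32 bits, so SSE2 builds
; it from the pmullw (low half) and pmulhuw (unsigned high half) results
; interleaved with punpcklwd; AVX widens with vpmovzxwd and uses vpmulld.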
;
define void @mul_2xi16(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) nounwind {
; X86-SSE-LABEL: mul_2xi16:
; X86-SSE: # %bb.0: # %entry
; X86-SSE-NEXT: pushl %esi
; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-SSE-NEXT: movl c, %esi
; X86-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X86-SSE-NEXT: movdqa %xmm1, %xmm2
; X86-SSE-NEXT: pmulhuw %xmm0, %xmm2
; X86-SSE-NEXT: pmullw %xmm0, %xmm1
; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; X86-SSE-NEXT: movq %xmm1, (%esi,%ecx,4)
; X86-SSE-NEXT: popl %esi
; X86-SSE-NEXT: retl
;
; X86-AVX-LABEL: mul_2xi16:
; X86-AVX: # %bb.0: # %entry
; X86-AVX-NEXT: pushl %esi
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-AVX-NEXT: movl c, %esi
; X86-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; X86-AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X86-AVX-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; X86-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0
; X86-AVX-NEXT: vmovq %xmm0, (%esi,%ecx,4)
; X86-AVX-NEXT: popl %esi
; X86-AVX-NEXT: retl
;
; X64-SSE-LABEL: mul_2xi16:
; X64-SSE: # %bb.0: # %entry
; X64-SSE-NEXT: movq {{.*}}(%rip), %rax
; X64-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X64-SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X64-SSE-NEXT: movdqa %xmm1, %xmm2
; X64-SSE-NEXT: pmulhuw %xmm0, %xmm2
; X64-SSE-NEXT: pmullw %xmm0, %xmm1
; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; X64-SSE-NEXT: movq %xmm1, (%rax,%rdx,4)
; X64-SSE-NEXT: retq
;
; X64-AVX-LABEL: mul_2xi16:
; X64-AVX: # %bb.0: # %entry
; X64-AVX-NEXT: movq {{.*}}(%rip), %rax
; X64-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X64-AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; X64-AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X64-AVX-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; X64-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0
; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rdx,4)
; X64-AVX-NEXT: retq
entry:
  %pre = load i32*, i32** @c
  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
  %tmp7 = bitcast i8* %tmp6 to <2 x i16>*
  %wide.load = load <2 x i16>, <2 x i16>* %tmp7, align 1
  %tmp8 = zext <2 x i16> %wide.load to <2 x i32>
  %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
  %tmp11 = bitcast i8* %tmp10 to <2 x i16>*
  %wide.load17 = load <2 x i16>, <2 x i16>* %tmp11, align 1
  %tmp12 = zext <2 x i16> %wide.load17 to <2 x i32>
  %tmp13 = mul nuw nsw <2 x i32> %tmp12, %tmp8
  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
  %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
  store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
  ret void
}

; %val1 = load <4 x i16>
; %op1 = zext<4 x i32> %val1
; %val2 = load <4 x i16>
; %op2 = zext<4 x i32> %val2
; %rst = mul <4 x i32> %op1, %op2
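;
; Same pmullw/pmulhuw interleave as mul_2xi16, now loading four lanes with
; movq and storing a full xmm register.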
;
define void @mul_4xi16(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) nounwind {
; X86-SSE-LABEL: mul_4xi16:
; X86-SSE: # %bb.0: # %entry
; X86-SSE-NEXT: pushl %esi
; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-SSE-NEXT: movl c, %esi
; X86-SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; X86-SSE-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
; X86-SSE-NEXT: movdqa %xmm1, %xmm2
; X86-SSE-NEXT: pmulhuw %xmm0, %xmm2
; X86-SSE-NEXT: pmullw %xmm0, %xmm1
; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; X86-SSE-NEXT: movdqu %xmm1, (%esi,%ecx,4)
; X86-SSE-NEXT: popl %esi
; X86-SSE-NEXT: retl
;
; X86-AVX-LABEL: mul_4xi16:
; X86-AVX: # %bb.0: # %entry
; X86-AVX-NEXT: pushl %esi
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-AVX-NEXT: movl c, %esi
; X86-AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; X86-AVX-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; X86-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0
; X86-AVX-NEXT: vmovdqu %xmm0, (%esi,%ecx,4)
; X86-AVX-NEXT: popl %esi
; X86-AVX-NEXT: retl
;
; X64-SSE-LABEL: mul_4xi16:
; X64-SSE: # %bb.0: # %entry
; X64-SSE-NEXT: movq {{.*}}(%rip), %rax
; X64-SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; X64-SSE-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
; X64-SSE-NEXT: movdqa %xmm1, %xmm2
; X64-SSE-NEXT: pmulhuw %xmm0, %xmm2
; X64-SSE-NEXT: pmullw %xmm0, %xmm1
; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; X64-SSE-NEXT: movdqu %xmm1, (%rax,%rdx,4)
; X64-SSE-NEXT: retq
;
; X64-AVX-LABEL: mul_4xi16:
; X64-AVX: # %bb.0: # %entry
; X64-AVX-NEXT: movq {{.*}}(%rip), %rax
; X64-AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; X64-AVX-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; X64-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0
; X64-AVX-NEXT: vmovdqu %xmm0, (%rax,%rdx,4)
; X64-AVX-NEXT: retq
entry:
  %pre = load i32*, i32** @c
  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
  %tmp7 = bitcast i8* %tmp6 to <4 x i16>*
  %wide.load = load <4 x i16>, <4 x i16>* %tmp7, align 1
  %tmp8 = zext <4 x i16> %wide.load to <4 x i32>
  %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
  %tmp11 = bitcast i8* %tmp10 to <4 x i16>*
  %wide.load17 = load <4 x i16>, <4 x i16>* %tmp11, align 1
  %tmp12 = zext <4 x i16> %wide.load17 to <4 x i32>
  %tmp13 = mul nuw nsw <4 x i32> %tmp12, %tmp8
  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
  %tmp15 = bitcast i32* %tmp14 to <4 x i32>*
  store <4 x i32> %tmp13, <4 x i32>* %tmp15, align 4
  ret void
}

; %val1 = load <8 x i16>
; %op1 = zext<8 x i32> %val1
; %val2 = load <8 x i16>
; %op2 = zext<8 x i32> %val2
; %rst = mul <8 x i32> %op1, %op2
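;
; With eight i16 lanes SSE2 interleaves both the low and high halves of the
; pmullw/pmulhuw results and stores twice; AVX1 uses two xmm vpmulld ops and
; AVX2 a single ymm vpmulld.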
;
define void @mul_8xi16(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) nounwind {
; X86-SSE-LABEL: mul_8xi16:
; X86-SSE: # %bb.0: # %entry
; X86-SSE-NEXT: pushl %esi
; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-SSE-NEXT: movl c, %esi
; X86-SSE-NEXT: movdqu (%edx,%ecx), %xmm0
; X86-SSE-NEXT: movdqu (%eax,%ecx), %xmm1
; X86-SSE-NEXT: movdqa %xmm1, %xmm2
; X86-SSE-NEXT: pmulhuw %xmm0, %xmm2
; X86-SSE-NEXT: pmullw %xmm0, %xmm1
; X86-SSE-NEXT: movdqa %xmm1, %xmm0
; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; X86-SSE-NEXT: movdqu %xmm1, 16(%esi,%ecx,4)
; X86-SSE-NEXT: movdqu %xmm0, (%esi,%ecx,4)
; X86-SSE-NEXT: popl %esi
; X86-SSE-NEXT: retl
;
; X86-AVX1-LABEL: mul_8xi16:
; X86-AVX1: # %bb.0: # %entry
; X86-AVX1-NEXT: pushl %esi
; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-AVX1-NEXT: movl c, %esi
; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; X86-AVX1-NEXT: vpmulld %xmm0, %xmm2, %xmm0
; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; X86-AVX1-NEXT: vpmulld %xmm1, %xmm2, %xmm1
; X86-AVX1-NEXT: vmovdqu %xmm0, 16(%esi,%ecx,4)
; X86-AVX1-NEXT: vmovdqu %xmm1, (%esi,%ecx,4)
; X86-AVX1-NEXT: popl %esi
; X86-AVX1-NEXT: retl
;
; X86-AVX2-LABEL: mul_8xi16:
; X86-AVX2: # %bb.0: # %entry
; X86-AVX2-NEXT: pushl %esi
; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-AVX2-NEXT: movl c, %esi
; X86-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
; X86-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
; X86-AVX2-NEXT: vpmulld %ymm0, %ymm1, %ymm0
; X86-AVX2-NEXT: vmovdqu %ymm0, (%esi,%ecx,4)
; X86-AVX2-NEXT: popl %esi
; X86-AVX2-NEXT: vzeroupper
; X86-AVX2-NEXT: retl
;
; X64-SSE-LABEL: mul_8xi16:
; X64-SSE: # %bb.0: # %entry
; X64-SSE-NEXT: movq {{.*}}(%rip), %rax
; X64-SSE-NEXT: movdqu (%rdi,%rdx), %xmm0
; X64-SSE-NEXT: movdqu (%rsi,%rdx), %xmm1
; X64-SSE-NEXT: movdqa %xmm1, %xmm2
; X64-SSE-NEXT: pmulhuw %xmm0, %xmm2
; X64-SSE-NEXT: pmullw %xmm0, %xmm1
; X64-SSE-NEXT: movdqa %xmm1, %xmm0
; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; X64-SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; X64-SSE-NEXT: movdqu %xmm1, 16(%rax,%rdx,4)
; X64-SSE-NEXT: movdqu %xmm0, (%rax,%rdx,4)
; X64-SSE-NEXT: retq
;
; X64-AVX1-LABEL: mul_8xi16:
; X64-AVX1: # %bb.0: # %entry
; X64-AVX1-NEXT: movq {{.*}}(%rip), %rax
; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; X64-AVX1-NEXT: vpmulld %xmm0, %xmm2, %xmm0
; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; X64-AVX1-NEXT: vpmulld %xmm1, %xmm2, %xmm1
; X64-AVX1-NEXT: vmovdqu %xmm0, 16(%rax,%rdx,4)
; X64-AVX1-NEXT: vmovdqu %xmm1, (%rax,%rdx,4)
; X64-AVX1-NEXT: retq
;
; X64-AVX2-LABEL: mul_8xi16:
; X64-AVX2: # %bb.0: # %entry
; X64-AVX2-NEXT: movq {{.*}}(%rip), %rax
; X64-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
; X64-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
; X64-AVX2-NEXT: vpmulld %ymm0, %ymm1, %ymm0
; X64-AVX2-NEXT: vmovdqu %ymm0, (%rax,%rdx,4)
; X64-AVX2-NEXT: vzeroupper
; X64-AVX2-NEXT: retq
entry:
  %pre = load i32*, i32** @c
  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
  %tmp7 = bitcast i8* %tmp6 to <8 x i16>*
  %wide.load = load <8 x i16>, <8 x i16>* %tmp7, align 1
  %tmp8 = zext <8 x i16> %wide.load to <8 x i32>
  %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
  %tmp11 = bitcast i8* %tmp10 to <8 x i16>*
  %wide.load17 = load <8 x i16>, <8 x i16>* %tmp11, align 1
  %tmp12 = zext <8 x i16> %wide.load17 to <8 x i32>
  %tmp13 = mul nuw nsw <8 x i32> %tmp12, %tmp8
  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
  %tmp15 = bitcast i32* %tmp14 to <8 x i32>*
  store <8 x i32> %tmp13, <8 x i32>* %tmp15, align 4
  ret void
}

; %val1 = load <16 x i16>
; %op1 = zext<16 x i32> %val1
; %val2 = load <16 x i16>
; %op2 = zext<16 x i32> %val2
; %rst = mul <16 x i32> %op1, %op2
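;
; Sixteen lanes: SSE2 repeats the pmullw/pmulhuw expansion for two source
; registers and stores four times; AVX1 needs four xmm vpmulld ops, AVX2 two
; ymm ones.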
;
define void @mul_16xi16(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) nounwind {
; X86-SSE-LABEL: mul_16xi16:
; X86-SSE: # %bb.0: # %entry
; X86-SSE-NEXT: pushl %esi
; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-SSE-NEXT: movl c, %esi
; X86-SSE-NEXT: movdqu (%edx,%ecx), %xmm0
; X86-SSE-NEXT: movdqu 16(%edx,%ecx), %xmm1
; X86-SSE-NEXT: movdqu (%eax,%ecx), %xmm2
; X86-SSE-NEXT: movdqu 16(%eax,%ecx), %xmm3
; X86-SSE-NEXT: movdqa %xmm2, %xmm4
; X86-SSE-NEXT: pmulhuw %xmm0, %xmm4
; X86-SSE-NEXT: pmullw %xmm0, %xmm2
; X86-SSE-NEXT: movdqa %xmm2, %xmm0
; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
; X86-SSE-NEXT: movdqa %xmm3, %xmm4
; X86-SSE-NEXT: pmulhuw %xmm1, %xmm4
; X86-SSE-NEXT: pmullw %xmm1, %xmm3
; X86-SSE-NEXT: movdqa %xmm3, %xmm1
; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
; X86-SSE-NEXT: movdqu %xmm3, 32(%esi,%ecx,4)
; X86-SSE-NEXT: movdqu %xmm1, 48(%esi,%ecx,4)
; X86-SSE-NEXT: movdqu %xmm2, (%esi,%ecx,4)
; X86-SSE-NEXT: movdqu %xmm0, 16(%esi,%ecx,4)
; X86-SSE-NEXT: popl %esi
; X86-SSE-NEXT: retl
;
; X86-AVX1-LABEL: mul_16xi16:
; X86-AVX1: # %bb.0: # %entry
; X86-AVX1-NEXT: pushl %esi
; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-AVX1-NEXT: movl c, %esi
; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; X86-AVX1-NEXT: vpmulld %xmm0, %xmm4, %xmm0
; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; X86-AVX1-NEXT: vpmulld %xmm1, %xmm4, %xmm1
; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; X86-AVX1-NEXT: vpmulld %xmm2, %xmm4, %xmm2
; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; X86-AVX1-NEXT: vpmulld %xmm3, %xmm4, %xmm3
; X86-AVX1-NEXT: vmovdqu %xmm0, 48(%esi,%ecx,4)
; X86-AVX1-NEXT: vmovdqu %xmm1, 32(%esi,%ecx,4)
; X86-AVX1-NEXT: vmovdqu %xmm2, 16(%esi,%ecx,4)
; X86-AVX1-NEXT: vmovdqu %xmm3, (%esi,%ecx,4)
; X86-AVX1-NEXT: popl %esi
; X86-AVX1-NEXT: retl
;
; X86-AVX2-LABEL: mul_16xi16:
; X86-AVX2: # %bb.0: # %entry
; X86-AVX2-NEXT: pushl %esi
; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-AVX2-NEXT: movl c, %esi
; X86-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
; X86-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
; X86-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
; X86-AVX2-NEXT: vpmulld %ymm0, %ymm2, %ymm0
; X86-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
; X86-AVX2-NEXT: vpmulld %ymm1, %ymm2, %ymm1
; X86-AVX2-NEXT: vmovdqu %ymm0, 32(%esi,%ecx,4)
; X86-AVX2-NEXT: vmovdqu %ymm1, (%esi,%ecx,4)
; X86-AVX2-NEXT: popl %esi
; X86-AVX2-NEXT: vzeroupper
; X86-AVX2-NEXT: retl
;
; X64-SSE-LABEL: mul_16xi16:
; X64-SSE: # %bb.0: # %entry
; X64-SSE-NEXT: movq {{.*}}(%rip), %rax
; X64-SSE-NEXT: movdqu (%rdi,%rdx), %xmm0
; X64-SSE-NEXT: movdqu 16(%rdi,%rdx), %xmm1
; X64-SSE-NEXT: movdqu (%rsi,%rdx), %xmm2
; X64-SSE-NEXT: movdqu 16(%rsi,%rdx), %xmm3
; X64-SSE-NEXT: movdqa %xmm2, %xmm4
; X64-SSE-NEXT: pmulhuw %xmm0, %xmm4
; X64-SSE-NEXT: pmullw %xmm0, %xmm2
; X64-SSE-NEXT: movdqa %xmm2, %xmm0
; X64-SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
; X64-SSE-NEXT: movdqa %xmm3, %xmm4
; X64-SSE-NEXT: pmulhuw %xmm1, %xmm4
; X64-SSE-NEXT: pmullw %xmm1, %xmm3
; X64-SSE-NEXT: movdqa %xmm3, %xmm1
; X64-SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
; X64-SSE-NEXT: movdqu %xmm3, 32(%rax,%rdx,4)
; X64-SSE-NEXT: movdqu %xmm1, 48(%rax,%rdx,4)
; X64-SSE-NEXT: movdqu %xmm2, (%rax,%rdx,4)
; X64-SSE-NEXT: movdqu %xmm0, 16(%rax,%rdx,4)
; X64-SSE-NEXT: retq
;
; X64-AVX1-LABEL: mul_16xi16:
; X64-AVX1: # %bb.0: # %entry
; X64-AVX1-NEXT: movq {{.*}}(%rip), %rax
; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; X64-AVX1-NEXT: vpmulld %xmm0, %xmm4, %xmm0
; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; X64-AVX1-NEXT: vpmulld %xmm1, %xmm4, %xmm1
; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; X64-AVX1-NEXT: vpmulld %xmm2, %xmm4, %xmm2
; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; X64-AVX1-NEXT: vpmulld %xmm3, %xmm4, %xmm3
; X64-AVX1-NEXT: vmovdqu %xmm0, 48(%rax,%rdx,4)
; X64-AVX1-NEXT: vmovdqu %xmm1, 32(%rax,%rdx,4)
; X64-AVX1-NEXT: vmovdqu %xmm2, 16(%rax,%rdx,4)
; X64-AVX1-NEXT: vmovdqu %xmm3, (%rax,%rdx,4)
; X64-AVX1-NEXT: retq
;
; X64-AVX2-LABEL: mul_16xi16:
; X64-AVX2: # %bb.0: # %entry
; X64-AVX2-NEXT: movq {{.*}}(%rip), %rax
; X64-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
; X64-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
; X64-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
; X64-AVX2-NEXT: vpmulld %ymm0, %ymm2, %ymm0
; X64-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
; X64-AVX2-NEXT: vpmulld %ymm1, %ymm2, %ymm1
; X64-AVX2-NEXT: vmovdqu %ymm0, 32(%rax,%rdx,4)
; X64-AVX2-NEXT: vmovdqu %ymm1, (%rax,%rdx,4)
; X64-AVX2-NEXT: vzeroupper
; X64-AVX2-NEXT: retq
entry:
  %pre = load i32*, i32** @c
  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
  %tmp7 = bitcast i8* %tmp6 to <16 x i16>*
  %wide.load = load <16 x i16>, <16 x i16>* %tmp7, align 1
  %tmp8 = zext <16 x i16> %wide.load to <16 x i32>
  %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
  %tmp11 = bitcast i8* %tmp10 to <16 x i16>*
  %wide.load17 = load <16 x i16>, <16 x i16>* %tmp11, align 1
  %tmp12 = zext <16 x i16> %wide.load17 to <16 x i32>
  %tmp13 = mul nuw nsw <16 x i32> %tmp12, %tmp8
  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
  %tmp15 = bitcast i32* %tmp14 to <16 x i32>*
  store <16 x i32> %tmp13, <16 x i32>* %tmp15, align 4
  ret void
}

; %val1 = load <2 x i8>
; %op1 = sext<2 x i32> %val1
; %val2 = load <2 x i8>
; %op2 = sext<2 x i32> %val2
; %rst = mul <2 x i32> %op1, %op2
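;
; Sign-extended i8 products also fit in 16 bits ([-128,127] * [-128,127] stays
; within i16 range), so SSE2 sign-extends to i16 with punpcklbw+psraw $8,
; multiplies with pmullw, and sign-extends the result with punpcklwd+psrad $16;
; AVX uses vpmovsxbd followed by vpmulld.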
;
define void @mul_2xi8_sext(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) nounwind {
; X86-SSE-LABEL: mul_2xi8_sext:
; X86-SSE: # %bb.0: # %entry
; X86-SSE-NEXT: pushl %esi
; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-SSE-NEXT: movl c, %esi
; X86-SSE-NEXT: movzwl (%edx,%ecx), %edx
; X86-SSE-NEXT: movd %edx, %xmm0
; X86-SSE-NEXT: movzwl (%eax,%ecx), %eax
; X86-SSE-NEXT: movd %eax, %xmm1
; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; X86-SSE-NEXT: psraw $8, %xmm0
; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; X86-SSE-NEXT: psraw $8, %xmm1
; X86-SSE-NEXT: pmullw %xmm0, %xmm1
; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; X86-SSE-NEXT: psrad $16, %xmm0
; X86-SSE-NEXT: movq %xmm0, (%esi,%ecx,4)
; X86-SSE-NEXT: popl %esi
; X86-SSE-NEXT: retl
;
; X86-AVX-LABEL: mul_2xi8_sext:
; X86-AVX: # %bb.0: # %entry
; X86-AVX-NEXT: pushl %esi
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-AVX-NEXT: movl c, %esi
; X86-AVX-NEXT: movzwl (%edx,%ecx), %edx
; X86-AVX-NEXT: vmovd %edx, %xmm0
; X86-AVX-NEXT: vpmovsxbd %xmm0, %xmm0
; X86-AVX-NEXT: movzwl (%eax,%ecx), %eax
; X86-AVX-NEXT: vmovd %eax, %xmm1
; X86-AVX-NEXT: vpmovsxbd %xmm1, %xmm1
; X86-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0
; X86-AVX-NEXT: vmovq %xmm0, (%esi,%ecx,4)
; X86-AVX-NEXT: popl %esi
; X86-AVX-NEXT: retl
;
; X64-SSE-LABEL: mul_2xi8_sext:
; X64-SSE: # %bb.0: # %entry
; X64-SSE-NEXT: movq {{.*}}(%rip), %rax
; X64-SSE-NEXT: movzwl (%rdi,%rdx), %ecx
; X64-SSE-NEXT: movd %ecx, %xmm0
; X64-SSE-NEXT: movzwl (%rsi,%rdx), %ecx
; X64-SSE-NEXT: movd %ecx, %xmm1
; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; X64-SSE-NEXT: psraw $8, %xmm0
; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; X64-SSE-NEXT: psraw $8, %xmm1
; X64-SSE-NEXT: pmullw %xmm0, %xmm1
; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; X64-SSE-NEXT: psrad $16, %xmm0
; X64-SSE-NEXT: movq %xmm0, (%rax,%rdx,4)
; X64-SSE-NEXT: retq
;
; X64-AVX-LABEL: mul_2xi8_sext:
; X64-AVX: # %bb.0: # %entry
; X64-AVX-NEXT: movq {{.*}}(%rip), %rax
; X64-AVX-NEXT: movzwl (%rdi,%rdx), %ecx
; X64-AVX-NEXT: vmovd %ecx, %xmm0
; X64-AVX-NEXT: vpmovsxbd %xmm0, %xmm0
; X64-AVX-NEXT: movzwl (%rsi,%rdx), %ecx
; X64-AVX-NEXT: vmovd %ecx, %xmm1
; X64-AVX-NEXT: vpmovsxbd %xmm1, %xmm1
; X64-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0
; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rdx,4)
; X64-AVX-NEXT: retq
entry:
  %pre = load i32*, i32** @c
  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
  %tmp7 = bitcast i8* %tmp6 to <2 x i8>*
  %wide.load = load <2 x i8>, <2 x i8>* %tmp7, align 1
  %tmp8 = sext <2 x i8> %wide.load to <2 x i32>
  %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
  %tmp11 = bitcast i8* %tmp10 to <2 x i8>*
  %wide.load17 = load <2 x i8>, <2 x i8>* %tmp11, align 1
  %tmp12 = sext <2 x i8> %wide.load17 to <2 x i32>
  %tmp13 = mul nuw nsw <2 x i32> %tmp12, %tmp8
  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
  %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
  store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
  ret void
}

; %val1 = load <2 x i8>
; %op1 = sext<2 x i32> %val1
; %val2 = load <2 x i8>
; %op2 = zext<2 x i32> %val2
; %rst = mul <2 x i32> %op1, %op2
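;
; Mixed extensions: both operands still fit in a signed i16, so SSE2 forms the
; full 32-bit product from the pmullw/pmulhw (signed high half) pair; AVX
; widens each side with its matching vpmovsxbd/vpmovzxbd and uses vpmulld.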
;
define void @mul_2xi8_sext_zext(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) nounwind {
; X86-SSE-LABEL: mul_2xi8_sext_zext:
; X86-SSE: # %bb.0: # %entry
; X86-SSE-NEXT: pushl %esi
; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-SSE-NEXT: movl c, %esi
; X86-SSE-NEXT: movzwl (%edx,%ecx), %edx
; X86-SSE-NEXT: movd %edx, %xmm0
; X86-SSE-NEXT: movzwl (%eax,%ecx), %eax
; X86-SSE-NEXT: movd %eax, %xmm1
; X86-SSE-NEXT: pxor %xmm2, %xmm2
; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; X86-SSE-NEXT: psraw $8, %xmm0
; X86-SSE-NEXT: movdqa %xmm1, %xmm2
; X86-SSE-NEXT: pmulhw %xmm0, %xmm2
; X86-SSE-NEXT: pmullw %xmm1, %xmm0
; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; X86-SSE-NEXT: movq %xmm0, (%esi,%ecx,4)
; X86-SSE-NEXT: popl %esi
; X86-SSE-NEXT: retl
;
; X86-AVX-LABEL: mul_2xi8_sext_zext:
; X86-AVX: # %bb.0: # %entry
; X86-AVX-NEXT: pushl %esi
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-AVX-NEXT: movl c, %esi
; X86-AVX-NEXT: movzwl (%edx,%ecx), %edx
; X86-AVX-NEXT: vmovd %edx, %xmm0
; X86-AVX-NEXT: vpmovsxbd %xmm0, %xmm0
; X86-AVX-NEXT: movzwl (%eax,%ecx), %eax
; X86-AVX-NEXT: vmovd %eax, %xmm1
; X86-AVX-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
; X86-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0
; X86-AVX-NEXT: vmovq %xmm0, (%esi,%ecx,4)
; X86-AVX-NEXT: popl %esi
; X86-AVX-NEXT: retl
;
; X64-SSE-LABEL: mul_2xi8_sext_zext:
; X64-SSE: # %bb.0: # %entry
; X64-SSE-NEXT: movq {{.*}}(%rip), %rax
; X64-SSE-NEXT: movzwl (%rdi,%rdx), %ecx
; X64-SSE-NEXT: movd %ecx, %xmm0
; X64-SSE-NEXT: movzwl (%rsi,%rdx), %ecx
; X64-SSE-NEXT: movd %ecx, %xmm1
; X64-SSE-NEXT: pxor %xmm2, %xmm2
; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; X64-SSE-NEXT: psraw $8, %xmm0
; X64-SSE-NEXT: movdqa %xmm1, %xmm2
; X64-SSE-NEXT: pmulhw %xmm0, %xmm2
; X64-SSE-NEXT: pmullw %xmm1, %xmm0
; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; X64-SSE-NEXT: movq %xmm0, (%rax,%rdx,4)
; X64-SSE-NEXT: retq
;
; X64-AVX-LABEL: mul_2xi8_sext_zext:
; X64-AVX: # %bb.0: # %entry
; X64-AVX-NEXT: movq {{.*}}(%rip), %rax
; X64-AVX-NEXT: movzwl (%rdi,%rdx), %ecx
; X64-AVX-NEXT: vmovd %ecx, %xmm0
; X64-AVX-NEXT: vpmovsxbd %xmm0, %xmm0
; X64-AVX-NEXT: movzwl (%rsi,%rdx), %ecx
; X64-AVX-NEXT: vmovd %ecx, %xmm1
; X64-AVX-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
; X64-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0
; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rdx,4)
; X64-AVX-NEXT: retq
entry:
  %pre = load i32*, i32** @c
  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
  %tmp7 = bitcast i8* %tmp6 to <2 x i8>*
  %wide.load = load <2 x i8>, <2 x i8>* %tmp7, align 1
  %tmp8 = sext <2 x i8> %wide.load to <2 x i32>
  %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
  %tmp11 = bitcast i8* %tmp10 to <2 x i8>*
  %wide.load17 = load <2 x i8>, <2 x i8>* %tmp11, align 1
  %tmp12 = zext <2 x i8> %wide.load17 to <2 x i32>
  %tmp13 = mul nuw nsw <2 x i32> %tmp12, %tmp8
  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
  %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
  store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
  ret void
}

; %val1 = load <2 x i16>
; %op1 = sext<2 x i32> %val1
; %val2 = load <2 x i16>
; %op2 = sext<2 x i32> %val2
; %rst = mul <2 x i32> %op1, %op2
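;
; A signed 32-bit product of two sign-extended i16 values: SSE2 interleaves
; the pmullw and pmulhw (signed high half) results; AVX uses vpmovsxwd and
; vpmulld.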
;
define void @mul_2xi16_sext(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) nounwind {
; X86-SSE-LABEL: mul_2xi16_sext:
; X86-SSE: # %bb.0: # %entry
; X86-SSE-NEXT: pushl %esi
; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-SSE-NEXT: movl c, %esi
; X86-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X86-SSE-NEXT: movdqa %xmm1, %xmm2
; X86-SSE-NEXT: pmulhw %xmm0, %xmm2
; X86-SSE-NEXT: pmullw %xmm0, %xmm1
; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; X86-SSE-NEXT: movq %xmm1, (%esi,%ecx,4)
; X86-SSE-NEXT: popl %esi
; X86-SSE-NEXT: retl
;
; X86-AVX-LABEL: mul_2xi16_sext:
; X86-AVX: # %bb.0: # %entry
; X86-AVX-NEXT: pushl %esi
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-AVX-NEXT: movl c, %esi
; X86-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-AVX-NEXT: vpmovsxwd %xmm0, %xmm0
; X86-AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X86-AVX-NEXT: vpmovsxwd %xmm1, %xmm1
; X86-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0
; X86-AVX-NEXT: vmovq %xmm0, (%esi,%ecx,4)
; X86-AVX-NEXT: popl %esi
; X86-AVX-NEXT: retl
;
; X64-SSE-LABEL: mul_2xi16_sext:
; X64-SSE: # %bb.0: # %entry
; X64-SSE-NEXT: movq {{.*}}(%rip), %rax
; X64-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X64-SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X64-SSE-NEXT: movdqa %xmm1, %xmm2
; X64-SSE-NEXT: pmulhw %xmm0, %xmm2
; X64-SSE-NEXT: pmullw %xmm0, %xmm1
; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; X64-SSE-NEXT: movq %xmm1, (%rax,%rdx,4)
; X64-SSE-NEXT: retq
;
; X64-AVX-LABEL: mul_2xi16_sext:
; X64-AVX: # %bb.0: # %entry
; X64-AVX-NEXT: movq {{.*}}(%rip), %rax
; X64-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X64-AVX-NEXT: vpmovsxwd %xmm0, %xmm0
; X64-AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X64-AVX-NEXT: vpmovsxwd %xmm1, %xmm1
; X64-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0
; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rdx,4)
; X64-AVX-NEXT: retq
entry:
  %pre = load i32*, i32** @c
  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
  %tmp7 = bitcast i8* %tmp6 to <2 x i16>*
  %wide.load = load <2 x i16>, <2 x i16>* %tmp7, align 1
  %tmp8 = sext <2 x i16> %wide.load to <2 x i32>
  %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
  %tmp11 = bitcast i8* %tmp10 to <2 x i16>*
  %wide.load17 = load <2 x i16>, <2 x i16>* %tmp11, align 1
  %tmp12 = sext <2 x i16> %wide.load17 to <2 x i32>
  %tmp13 = mul nuw nsw <2 x i32> %tmp12, %tmp8
  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
  %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
  store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
  ret void
}

; %val1 = load <2 x i16>
; %op1 = sext<2 x i32> %val1
; %val2 = load <2 x i16>
; %op2 = zext<2 x i32> %val2
; %rst = mul <2 x i32> %op1, %op2
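; (Mixing sext and zext operands defeats the 16-bit shortcut on SSE: the checks
; below widen both sides to 32 bits and multiply via the pmuludq/pshufd sequence.)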
;
define void @mul_2xi16_sext_zext(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) nounwind {
; X86-SSE-LABEL: mul_2xi16_sext_zext:
; X86-SSE: # %bb.0: # %entry
; X86-SSE-NEXT: pushl %esi
; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-SSE-NEXT: movl c, %esi
; X86-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7]
; X86-SSE-NEXT: psrad $16, %xmm0
; X86-SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X86-SSE-NEXT: pxor %xmm2, %xmm2
; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; X86-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
; X86-SSE-NEXT: pmuludq %xmm0, %xmm1
; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
; X86-SSE-NEXT: pmuludq %xmm2, %xmm0
; X86-SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; X86-SSE-NEXT: movq %xmm1, (%esi,%ecx,4)
; X86-SSE-NEXT: popl %esi
; X86-SSE-NEXT: retl
;
; X86-AVX-LABEL: mul_2xi16_sext_zext:
; X86-AVX: # %bb.0: # %entry
; X86-AVX-NEXT: pushl %esi
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-AVX-NEXT: movl c, %esi
; X86-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-AVX-NEXT: vpmovsxwd %xmm0, %xmm0
; X86-AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X86-AVX-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; X86-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0
; X86-AVX-NEXT: vmovq %xmm0, (%esi,%ecx,4)
; X86-AVX-NEXT: popl %esi
; X86-AVX-NEXT: retl
;
; X64-SSE-LABEL: mul_2xi16_sext_zext:
; X64-SSE: # %bb.0: # %entry
; X64-SSE-NEXT: movq {{.*}}(%rip), %rax
; X64-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X64-SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7]
; X64-SSE-NEXT: psrad $16, %xmm0
; X64-SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X64-SSE-NEXT: pxor %xmm2, %xmm2
; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; X64-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
; X64-SSE-NEXT: pmuludq %xmm0, %xmm1
; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
; X64-SSE-NEXT: pmuludq %xmm2, %xmm0
; X64-SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; X64-SSE-NEXT: movq %xmm1, (%rax,%rdx,4)
; X64-SSE-NEXT: retq
;
; X64-AVX-LABEL: mul_2xi16_sext_zext:
; X64-AVX: # %bb.0: # %entry
; X64-AVX-NEXT: movq {{.*}}(%rip), %rax
; X64-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X64-AVX-NEXT: vpmovsxwd %xmm0, %xmm0
; X64-AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X64-AVX-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; X64-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0
; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rdx,4)
; X64-AVX-NEXT: retq
entry:
  %pre = load i32*, i32** @c
  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
  %tmp7 = bitcast i8* %tmp6 to <2 x i16>*
  %wide.load = load <2 x i16>, <2 x i16>* %tmp7, align 1
  %tmp8 = sext <2 x i16> %wide.load to <2 x i32>
  %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
  %tmp11 = bitcast i8* %tmp10 to <2 x i16>*
  %wide.load17 = load <2 x i16>, <2 x i16>* %tmp11, align 1
  %tmp12 = zext <2 x i16> %wide.load17 to <2 x i32>
  %tmp13 = mul nuw nsw <2 x i32> %tmp12, %tmp8
  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
  %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
  store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
  ret void
}

; %val1 = load <16 x i16>
; %op1 = sext<16 x i32> %val1
; %val2 = load <16 x i16>
; %op2 = sext<16 x i32> %val2
; %rst = mul <16 x i32> %op1, %op2
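; (Same sext/sext pattern at full width: SSE uses two pmullw/pmulhw pairs, AVX1
; four 128-bit vpmovsxwd+vpmulld steps, and AVX2 two 256-bit ones.)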
;
define void @mul_16xi16_sext(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) nounwind {
; X86-SSE-LABEL: mul_16xi16_sext:
; X86-SSE: # %bb.0: # %entry
; X86-SSE-NEXT: pushl %esi
; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-SSE-NEXT: movl c, %esi
; X86-SSE-NEXT: movdqu (%edx,%ecx), %xmm0
; X86-SSE-NEXT: movdqu 16(%edx,%ecx), %xmm1
; X86-SSE-NEXT: movdqu (%eax,%ecx), %xmm2
; X86-SSE-NEXT: movdqu 16(%eax,%ecx), %xmm3
; X86-SSE-NEXT: movdqa %xmm2, %xmm4
; X86-SSE-NEXT: pmulhw %xmm0, %xmm4
; X86-SSE-NEXT: pmullw %xmm0, %xmm2
; X86-SSE-NEXT: movdqa %xmm2, %xmm0
; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
; X86-SSE-NEXT: movdqa %xmm3, %xmm4
; X86-SSE-NEXT: pmulhw %xmm1, %xmm4
; X86-SSE-NEXT: pmullw %xmm1, %xmm3
; X86-SSE-NEXT: movdqa %xmm3, %xmm1
; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
; X86-SSE-NEXT: movdqu %xmm3, 32(%esi,%ecx,4)
; X86-SSE-NEXT: movdqu %xmm1, 48(%esi,%ecx,4)
; X86-SSE-NEXT: movdqu %xmm2, (%esi,%ecx,4)
; X86-SSE-NEXT: movdqu %xmm0, 16(%esi,%ecx,4)
; X86-SSE-NEXT: popl %esi
; X86-SSE-NEXT: retl
;
; X86-AVX1-LABEL: mul_16xi16_sext:
; X86-AVX1: # %bb.0: # %entry
; X86-AVX1-NEXT: pushl %esi
; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-AVX1-NEXT: movl c, %esi
; X86-AVX1-NEXT: vpmovsxwd 24(%edx,%ecx), %xmm0
; X86-AVX1-NEXT: vpmovsxwd 16(%edx,%ecx), %xmm1
; X86-AVX1-NEXT: vpmovsxwd 8(%edx,%ecx), %xmm2
; X86-AVX1-NEXT: vpmovsxwd (%edx,%ecx), %xmm3
; X86-AVX1-NEXT: vpmovsxwd 24(%eax,%ecx), %xmm4
; X86-AVX1-NEXT: vpmulld %xmm0, %xmm4, %xmm0
; X86-AVX1-NEXT: vpmovsxwd 16(%eax,%ecx), %xmm4
; X86-AVX1-NEXT: vpmulld %xmm1, %xmm4, %xmm1
; X86-AVX1-NEXT: vpmovsxwd 8(%eax,%ecx), %xmm4
; X86-AVX1-NEXT: vpmulld %xmm2, %xmm4, %xmm2
; X86-AVX1-NEXT: vpmovsxwd (%eax,%ecx), %xmm4
; X86-AVX1-NEXT: vpmulld %xmm3, %xmm4, %xmm3
; X86-AVX1-NEXT: vmovdqu %xmm0, 48(%esi,%ecx,4)
; X86-AVX1-NEXT: vmovdqu %xmm1, 32(%esi,%ecx,4)
; X86-AVX1-NEXT: vmovdqu %xmm2, 16(%esi,%ecx,4)
; X86-AVX1-NEXT: vmovdqu %xmm3, (%esi,%ecx,4)
; X86-AVX1-NEXT: popl %esi
; X86-AVX1-NEXT: retl
;
; X86-AVX2-LABEL: mul_16xi16_sext:
; X86-AVX2: # %bb.0: # %entry
; X86-AVX2-NEXT: pushl %esi
; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-AVX2-NEXT: movl c, %esi
; X86-AVX2-NEXT: vpmovsxwd 16(%edx,%ecx), %ymm0
; X86-AVX2-NEXT: vpmovsxwd (%edx,%ecx), %ymm1
; X86-AVX2-NEXT: vpmovsxwd 16(%eax,%ecx), %ymm2
; X86-AVX2-NEXT: vpmulld %ymm0, %ymm2, %ymm0
; X86-AVX2-NEXT: vpmovsxwd (%eax,%ecx), %ymm2
; X86-AVX2-NEXT: vpmulld %ymm1, %ymm2, %ymm1
; X86-AVX2-NEXT: vmovdqu %ymm0, 32(%esi,%ecx,4)
; X86-AVX2-NEXT: vmovdqu %ymm1, (%esi,%ecx,4)
; X86-AVX2-NEXT: popl %esi
; X86-AVX2-NEXT: vzeroupper
; X86-AVX2-NEXT: retl
;
; X64-SSE-LABEL: mul_16xi16_sext:
; X64-SSE: # %bb.0: # %entry
; X64-SSE-NEXT: movq {{.*}}(%rip), %rax
; X64-SSE-NEXT: movdqu (%rdi,%rdx), %xmm0
; X64-SSE-NEXT: movdqu 16(%rdi,%rdx), %xmm1
; X64-SSE-NEXT: movdqu (%rsi,%rdx), %xmm2
; X64-SSE-NEXT: movdqu 16(%rsi,%rdx), %xmm3
; X64-SSE-NEXT: movdqa %xmm2, %xmm4
; X64-SSE-NEXT: pmulhw %xmm0, %xmm4
; X64-SSE-NEXT: pmullw %xmm0, %xmm2
; X64-SSE-NEXT: movdqa %xmm2, %xmm0
; X64-SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
; X64-SSE-NEXT: movdqa %xmm3, %xmm4
; X64-SSE-NEXT: pmulhw %xmm1, %xmm4
; X64-SSE-NEXT: pmullw %xmm1, %xmm3
; X64-SSE-NEXT: movdqa %xmm3, %xmm1
; X64-SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
; X64-SSE-NEXT: movdqu %xmm3, 32(%rax,%rdx,4)
; X64-SSE-NEXT: movdqu %xmm1, 48(%rax,%rdx,4)
; X64-SSE-NEXT: movdqu %xmm2, (%rax,%rdx,4)
; X64-SSE-NEXT: movdqu %xmm0, 16(%rax,%rdx,4)
; X64-SSE-NEXT: retq
;
; X64-AVX1-LABEL: mul_16xi16_sext:
; X64-AVX1: # %bb.0: # %entry
; X64-AVX1-NEXT: movq {{.*}}(%rip), %rax
; X64-AVX1-NEXT: vpmovsxwd 24(%rdi,%rdx), %xmm0
; X64-AVX1-NEXT: vpmovsxwd 16(%rdi,%rdx), %xmm1
; X64-AVX1-NEXT: vpmovsxwd 8(%rdi,%rdx), %xmm2
; X64-AVX1-NEXT: vpmovsxwd (%rdi,%rdx), %xmm3
; X64-AVX1-NEXT: vpmovsxwd 24(%rsi,%rdx), %xmm4
; X64-AVX1-NEXT: vpmulld %xmm0, %xmm4, %xmm0
; X64-AVX1-NEXT: vpmovsxwd 16(%rsi,%rdx), %xmm4
; X64-AVX1-NEXT: vpmulld %xmm1, %xmm4, %xmm1
; X64-AVX1-NEXT: vpmovsxwd 8(%rsi,%rdx), %xmm4
; X64-AVX1-NEXT: vpmulld %xmm2, %xmm4, %xmm2
; X64-AVX1-NEXT: vpmovsxwd (%rsi,%rdx), %xmm4
; X64-AVX1-NEXT: vpmulld %xmm3, %xmm4, %xmm3
; X64-AVX1-NEXT: vmovdqu %xmm0, 48(%rax,%rdx,4)
; X64-AVX1-NEXT: vmovdqu %xmm1, 32(%rax,%rdx,4)
; X64-AVX1-NEXT: vmovdqu %xmm2, 16(%rax,%rdx,4)
; X64-AVX1-NEXT: vmovdqu %xmm3, (%rax,%rdx,4)
; X64-AVX1-NEXT: retq
;
; X64-AVX2-LABEL: mul_16xi16_sext:
; X64-AVX2: # %bb.0: # %entry
; X64-AVX2-NEXT: movq {{.*}}(%rip), %rax
; X64-AVX2-NEXT: vpmovsxwd 16(%rdi,%rdx), %ymm0
; X64-AVX2-NEXT: vpmovsxwd (%rdi,%rdx), %ymm1
; X64-AVX2-NEXT: vpmovsxwd 16(%rsi,%rdx), %ymm2
; X64-AVX2-NEXT: vpmulld %ymm0, %ymm2, %ymm0
; X64-AVX2-NEXT: vpmovsxwd (%rsi,%rdx), %ymm2
; X64-AVX2-NEXT: vpmulld %ymm1, %ymm2, %ymm1
; X64-AVX2-NEXT: vmovdqu %ymm0, 32(%rax,%rdx,4)
; X64-AVX2-NEXT: vmovdqu %ymm1, (%rax,%rdx,4)
; X64-AVX2-NEXT: vzeroupper
; X64-AVX2-NEXT: retq
entry:
  %pre = load i32*, i32** @c
  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
  %tmp7 = bitcast i8* %tmp6 to <16 x i16>*
  %wide.load = load <16 x i16>, <16 x i16>* %tmp7, align 1
  %tmp8 = sext <16 x i16> %wide.load to <16 x i32>
  %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
  %tmp11 = bitcast i8* %tmp10 to <16 x i16>*
  %wide.load17 = load <16 x i16>, <16 x i16>* %tmp11, align 1
  %tmp12 = sext <16 x i16> %wide.load17 to <16 x i32>
  %tmp13 = mul nuw nsw <16 x i32> %tmp12, %tmp8
  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
  %tmp15 = bitcast i32* %tmp14 to <16 x i32>*
  store <16 x i32> %tmp13, <16 x i32>* %tmp15, align 4
  ret void
}

; %val = load <2 x i8>
; %op1 = zext<2 x i32> %val
; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (0 ~ 255)
; %rst = mul <2 x i32> %op1, %op2
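; (Constants in [0, 255] match the zero-extended byte range, so the checks below
; show the multiply folding to pmaddwd against a constant-pool vector.)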
;
define void @mul_2xi8_varconst1(i8* nocapture readonly %a, i64 %index) {
; X86-SSE-LABEL: mul_2xi8_varconst1:
; X86-SSE: # %bb.0: # %entry
; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-SSE-NEXT: movl c, %edx
; X86-SSE-NEXT: movzwl (%ecx,%eax), %ecx
; X86-SSE-NEXT: movd %ecx, %xmm0
; X86-SSE-NEXT: pxor %xmm1, %xmm1
; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; X86-SSE-NEXT: pmaddwd {{\.LCPI.*}}, %xmm0
; X86-SSE-NEXT: movq %xmm0, (%edx,%eax,4)
; X86-SSE-NEXT: retl
;
; X86-AVX-LABEL: mul_2xi8_varconst1:
; X86-AVX: # %bb.0: # %entry
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-AVX-NEXT: movl c, %edx
; X86-AVX-NEXT: movzwl (%ecx,%eax), %ecx
; X86-AVX-NEXT: vmovd %ecx, %xmm0
; X86-AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; X86-AVX-NEXT: vpmaddwd {{\.LCPI.*}}, %xmm0, %xmm0
; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4)
; X86-AVX-NEXT: retl
;
; X64-SSE-LABEL: mul_2xi8_varconst1:
; X64-SSE: # %bb.0: # %entry
; X64-SSE-NEXT: movq {{.*}}(%rip), %rax
; X64-SSE-NEXT: movzwl (%rdi,%rsi), %ecx
; X64-SSE-NEXT: movd %ecx, %xmm0
; X64-SSE-NEXT: pxor %xmm1, %xmm1
; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; X64-SSE-NEXT: pmaddwd {{.*}}(%rip), %xmm0
; X64-SSE-NEXT: movq %xmm0, (%rax,%rsi,4)
; X64-SSE-NEXT: retq
;
; X64-AVX-LABEL: mul_2xi8_varconst1:
; X64-AVX: # %bb.0: # %entry
; X64-AVX-NEXT: movq {{.*}}(%rip), %rax
; X64-AVX-NEXT: movzwl (%rdi,%rsi), %ecx
; X64-AVX-NEXT: vmovd %ecx, %xmm0
; X64-AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; X64-AVX-NEXT: vpmaddwd {{.*}}(%rip), %xmm0, %xmm0
; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4)
; X64-AVX-NEXT: retq
entry:
  %pre = load i32*, i32** @c
  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
  %tmp7 = bitcast i8* %tmp6 to <2 x i8>*
  %wide.load = load <2 x i8>, <2 x i8>* %tmp7, align 1
  %tmp8 = zext <2 x i8> %wide.load to <2 x i32>
  %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 0, i32 255>
  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
  %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
  store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
  ret void
}

; %val = load <2 x i8>
; %op1 = sext<2 x i32> %val
; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (-128 ~ 127)
; %rst = mul <2 x i32> %op1, %op2
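; (Constants in [-128, 127] keep every product within i16 range, so SSE multiplies
; once with pmullw and sign-extends the result via punpcklwd+psrad; AVX uses
; vpmovsxbd+vpmulld.)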
;
define void @mul_2xi8_varconst2(i8* nocapture readonly %a, i64 %index) {
; X86-SSE-LABEL: mul_2xi8_varconst2:
; X86-SSE: # %bb.0: # %entry
; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-SSE-NEXT: movl c, %edx
; X86-SSE-NEXT: movzwl (%ecx,%eax), %ecx
; X86-SSE-NEXT: movd %ecx, %xmm0
; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; X86-SSE-NEXT: psraw $8, %xmm0
; X86-SSE-NEXT: pmullw {{\.LCPI.*}}, %xmm0
; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; X86-SSE-NEXT: psrad $16, %xmm0
; X86-SSE-NEXT: movq %xmm0, (%edx,%eax,4)
; X86-SSE-NEXT: retl
;
; X86-AVX-LABEL: mul_2xi8_varconst2:
; X86-AVX: # %bb.0: # %entry
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-AVX-NEXT: movl c, %edx
; X86-AVX-NEXT: movzwl (%ecx,%eax), %ecx
; X86-AVX-NEXT: vmovd %ecx, %xmm0
; X86-AVX-NEXT: vpmovsxbd %xmm0, %xmm0
; X86-AVX-NEXT: vpmulld {{\.LCPI.*}}, %xmm0, %xmm0
; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4)
; X86-AVX-NEXT: retl
;
; X64-SSE-LABEL: mul_2xi8_varconst2:
; X64-SSE: # %bb.0: # %entry
; X64-SSE-NEXT: movq {{.*}}(%rip), %rax
; X64-SSE-NEXT: movzwl (%rdi,%rsi), %ecx
; X64-SSE-NEXT: movd %ecx, %xmm0
; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; X64-SSE-NEXT: psraw $8, %xmm0
; X64-SSE-NEXT: pmullw {{.*}}(%rip), %xmm0
; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; X64-SSE-NEXT: psrad $16, %xmm0
; X64-SSE-NEXT: movq %xmm0, (%rax,%rsi,4)
; X64-SSE-NEXT: retq
;
; X64-AVX-LABEL: mul_2xi8_varconst2:
; X64-AVX: # %bb.0: # %entry
; X64-AVX-NEXT: movq {{.*}}(%rip), %rax
; X64-AVX-NEXT: movzwl (%rdi,%rsi), %ecx
; X64-AVX-NEXT: vmovd %ecx, %xmm0
; X64-AVX-NEXT: vpmovsxbd %xmm0, %xmm0
; X64-AVX-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4)
; X64-AVX-NEXT: retq
entry:
  %pre = load i32*, i32** @c
  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
  %tmp7 = bitcast i8* %tmp6 to <2 x i8>*
  %wide.load = load <2 x i8>, <2 x i8>* %tmp7, align 1
  %tmp8 = sext <2 x i8> %wide.load to <2 x i32>
  %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 -128, i32 127>
  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
  %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
  store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
  ret void
}

; %val = load <2 x i8>
; %op1 = zext<2 x i32> %val
; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (0 ~ 256)
; %rst = mul <2 x i32> %op1, %op2
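; (256 no longer fits in a byte but still fits in a 16-bit lane, and pmaddwd
; produces 32-bit products, so the pmaddwd lowering below still applies.)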
;
define void @mul_2xi8_varconst3(i8* nocapture readonly %a, i64 %index) {
; X86-SSE-LABEL: mul_2xi8_varconst3:
; X86-SSE: # %bb.0: # %entry
; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-SSE-NEXT: movl c, %edx
; X86-SSE-NEXT: movzwl (%ecx,%eax), %ecx
; X86-SSE-NEXT: movd %ecx, %xmm0
; X86-SSE-NEXT: pxor %xmm1, %xmm1
; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; X86-SSE-NEXT: pmaddwd {{\.LCPI.*}}, %xmm0
; X86-SSE-NEXT: movq %xmm0, (%edx,%eax,4)
; X86-SSE-NEXT: retl
;
; X86-AVX-LABEL: mul_2xi8_varconst3:
; X86-AVX: # %bb.0: # %entry
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-AVX-NEXT: movl c, %edx
; X86-AVX-NEXT: movzwl (%ecx,%eax), %ecx
; X86-AVX-NEXT: vmovd %ecx, %xmm0
; X86-AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; X86-AVX-NEXT: vpmaddwd {{\.LCPI.*}}, %xmm0, %xmm0
; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4)
; X86-AVX-NEXT: retl
;
; X64-SSE-LABEL: mul_2xi8_varconst3:
; X64-SSE: # %bb.0: # %entry
; X64-SSE-NEXT: movq {{.*}}(%rip), %rax
; X64-SSE-NEXT: movzwl (%rdi,%rsi), %ecx
; X64-SSE-NEXT: movd %ecx, %xmm0
; X64-SSE-NEXT: pxor %xmm1, %xmm1
; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; X64-SSE-NEXT: pmaddwd {{.*}}(%rip), %xmm0
; X64-SSE-NEXT: movq %xmm0, (%rax,%rsi,4)
; X64-SSE-NEXT: retq
;
; X64-AVX-LABEL: mul_2xi8_varconst3:
; X64-AVX: # %bb.0: # %entry
; X64-AVX-NEXT: movq {{.*}}(%rip), %rax
; X64-AVX-NEXT: movzwl (%rdi,%rsi), %ecx
; X64-AVX-NEXT: vmovd %ecx, %xmm0
; X64-AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; X64-AVX-NEXT: vpmaddwd {{.*}}(%rip), %xmm0, %xmm0
; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4)
; X64-AVX-NEXT: retq
entry:
  %pre = load i32*, i32** @c
  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
  %tmp7 = bitcast i8* %tmp6 to <2 x i8>*
  %wide.load = load <2 x i8>, <2 x i8>* %tmp7, align 1
  %tmp8 = zext <2 x i8> %wide.load to <2 x i32>
  %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 0, i32 256>
  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
  %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
  store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
  ret void
}

; %val = load <2 x i8>
; %op1 = zext<2 x i32> %val
; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (-1 ~ 255)
; %rst = mul <2 x i32> %op1, %op2
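; (With -1 in the constant vector the pmaddwd shortcut is not used; the checks
; below rebuild the full 32-bit product from a pmullw/pmulhw pair, with 65535
; standing in for -1 as an i16 immediate.)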
;
define void @mul_2xi8_varconst4(i8* nocapture readonly %a, i64 %index) {
; X86-SSE-LABEL: mul_2xi8_varconst4:
; X86-SSE: # %bb.0: # %entry
; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-SSE-NEXT: movl c, %edx
; X86-SSE-NEXT: movzwl (%ecx,%eax), %ecx
; X86-SSE-NEXT: movd %ecx, %xmm0
; X86-SSE-NEXT: pxor %xmm1, %xmm1
; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; X86-SSE-NEXT: movdqa {{.*#+}} xmm1 = <65535,255,u,u,u,u,u,u>
; X86-SSE-NEXT: movdqa %xmm0, %xmm2
; X86-SSE-NEXT: pmulhw %xmm1, %xmm2
; X86-SSE-NEXT: pmullw %xmm1, %xmm0
; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; X86-SSE-NEXT: movq %xmm0, (%edx,%eax,4)
; X86-SSE-NEXT: retl
;
; X86-AVX-LABEL: mul_2xi8_varconst4:
; X86-AVX: # %bb.0: # %entry
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-AVX-NEXT: movl c, %edx
; X86-AVX-NEXT: movzwl (%ecx,%eax), %ecx
; X86-AVX-NEXT: vmovd %ecx, %xmm0
; X86-AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; X86-AVX-NEXT: vpmulld {{\.LCPI.*}}, %xmm0, %xmm0
; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4)
; X86-AVX-NEXT: retl
;
; X64-SSE-LABEL: mul_2xi8_varconst4:
; X64-SSE: # %bb.0: # %entry
; X64-SSE-NEXT: movq {{.*}}(%rip), %rax
; X64-SSE-NEXT: movzwl (%rdi,%rsi), %ecx
; X64-SSE-NEXT: movd %ecx, %xmm0
; X64-SSE-NEXT: pxor %xmm1, %xmm1
; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; X64-SSE-NEXT: movdqa {{.*#+}} xmm1 = <65535,255,u,u,u,u,u,u>
; X64-SSE-NEXT: movdqa %xmm0, %xmm2
; X64-SSE-NEXT: pmulhw %xmm1, %xmm2
; X64-SSE-NEXT: pmullw %xmm1, %xmm0
; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; X64-SSE-NEXT: movq %xmm0, (%rax,%rsi,4)
; X64-SSE-NEXT: retq
;
; X64-AVX-LABEL: mul_2xi8_varconst4:
; X64-AVX: # %bb.0: # %entry
; X64-AVX-NEXT: movq {{.*}}(%rip), %rax
; X64-AVX-NEXT: movzwl (%rdi,%rsi), %ecx
; X64-AVX-NEXT: vmovd %ecx, %xmm0
; X64-AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; X64-AVX-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4)
; X64-AVX-NEXT: retq
entry:
  %pre = load i32*, i32** @c
  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
  %tmp7 = bitcast i8* %tmp6 to <2 x i8>*
  %wide.load = load <2 x i8>, <2 x i8>* %tmp7, align 1
  %tmp8 = zext <2 x i8> %wide.load to <2 x i32>
  %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 -1, i32 255>
  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
  %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
  store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
  ret void
}

; %val = load <2 x i8>
; %op1 = sext<2 x i32> %val
; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (-129 ~ 127)
; %rst = mul <2 x i32> %op1, %op2
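; (-129 is just outside the i8 range, so the single-pmullw lowering of
; mul_2xi8_varconst2 does not fire; 65407 in the checks below is -129 as i16.)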
;
define void @mul_2xi8_varconst5(i8* nocapture readonly %a, i64 %index) {
; X86-SSE-LABEL: mul_2xi8_varconst5:
; X86-SSE: # %bb.0: # %entry
; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-SSE-NEXT: movl c, %edx
; X86-SSE-NEXT: movzwl (%ecx,%eax), %ecx
; X86-SSE-NEXT: movd %ecx, %xmm0
; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; X86-SSE-NEXT: psraw $8, %xmm0
; X86-SSE-NEXT: movdqa {{.*#+}} xmm1 = <65407,127,u,u,u,u,u,u>
; X86-SSE-NEXT: movdqa %xmm0, %xmm2
; X86-SSE-NEXT: pmulhw %xmm1, %xmm2
; X86-SSE-NEXT: pmullw %xmm1, %xmm0
; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; X86-SSE-NEXT: movq %xmm0, (%edx,%eax,4)
; X86-SSE-NEXT: retl
;
; X86-AVX-LABEL: mul_2xi8_varconst5:
; X86-AVX: # %bb.0: # %entry
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-AVX-NEXT: movl c, %edx
; X86-AVX-NEXT: movzwl (%ecx,%eax), %ecx
; X86-AVX-NEXT: vmovd %ecx, %xmm0
; X86-AVX-NEXT: vpmovsxbd %xmm0, %xmm0
; X86-AVX-NEXT: vpmulld {{\.LCPI.*}}, %xmm0, %xmm0
; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4)
; X86-AVX-NEXT: retl
;
; X64-SSE-LABEL: mul_2xi8_varconst5:
; X64-SSE: # %bb.0: # %entry
; X64-SSE-NEXT: movq {{.*}}(%rip), %rax
; X64-SSE-NEXT: movzwl (%rdi,%rsi), %ecx
; X64-SSE-NEXT: movd %ecx, %xmm0
; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; X64-SSE-NEXT: psraw $8, %xmm0
; X64-SSE-NEXT: movdqa {{.*#+}} xmm1 = <65407,127,u,u,u,u,u,u>
; X64-SSE-NEXT: movdqa %xmm0, %xmm2
; X64-SSE-NEXT: pmulhw %xmm1, %xmm2
; X64-SSE-NEXT: pmullw %xmm1, %xmm0
; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; X64-SSE-NEXT: movq %xmm0, (%rax,%rsi,4)
; X64-SSE-NEXT: retq
;
; X64-AVX-LABEL: mul_2xi8_varconst5:
; X64-AVX: # %bb.0: # %entry
; X64-AVX-NEXT: movq {{.*}}(%rip), %rax
; X64-AVX-NEXT: movzwl (%rdi,%rsi), %ecx
; X64-AVX-NEXT: vmovd %ecx, %xmm0
; X64-AVX-NEXT: vpmovsxbd %xmm0, %xmm0
; X64-AVX-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4)
; X64-AVX-NEXT: retq
entry:
  %pre = load i32*, i32** @c
  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
  %tmp7 = bitcast i8* %tmp6 to <2 x i8>*
  %wide.load = load <2 x i8>, <2 x i8>* %tmp7, align 1
  %tmp8 = sext <2 x i8> %wide.load to <2 x i32>
  %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 -129, i32 127>
  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
  %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
  store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
  ret void
}

; %val = load <2 x i8>
; %op1 = sext<2 x i32> %val
; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (-128 ~ 128)
; %rst = mul <2 x i32> %op1, %op2
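; (128 is one past the i8 range, so again a full pmullw/pmulhw product is
; emitted; 65408 below is -128 as an unsigned 16-bit immediate.)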
;
define void @mul_2xi8_varconst6(i8* nocapture readonly %a, i64 %index) {
; X86-SSE-LABEL: mul_2xi8_varconst6:
; X86-SSE: # %bb.0: # %entry
; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-SSE-NEXT: movl c, %edx
; X86-SSE-NEXT: movzwl (%ecx,%eax), %ecx
; X86-SSE-NEXT: movd %ecx, %xmm0
; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; X86-SSE-NEXT: psraw $8, %xmm0
; X86-SSE-NEXT: movdqa {{.*#+}} xmm1 = <65408,128,u,u,u,u,u,u>
; X86-SSE-NEXT: movdqa %xmm0, %xmm2
; X86-SSE-NEXT: pmulhw %xmm1, %xmm2
; X86-SSE-NEXT: pmullw %xmm1, %xmm0
; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; X86-SSE-NEXT: movq %xmm0, (%edx,%eax,4)
; X86-SSE-NEXT: retl
;
; X86-AVX-LABEL: mul_2xi8_varconst6:
; X86-AVX: # %bb.0: # %entry
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-AVX-NEXT: movl c, %edx
; X86-AVX-NEXT: movzwl (%ecx,%eax), %ecx
; X86-AVX-NEXT: vmovd %ecx, %xmm0
; X86-AVX-NEXT: vpmovsxbd %xmm0, %xmm0
; X86-AVX-NEXT: vpmulld {{\.LCPI.*}}, %xmm0, %xmm0
; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4)
; X86-AVX-NEXT: retl
;
; X64-SSE-LABEL: mul_2xi8_varconst6:
; X64-SSE: # %bb.0: # %entry
; X64-SSE-NEXT: movq {{.*}}(%rip), %rax
; X64-SSE-NEXT: movzwl (%rdi,%rsi), %ecx
; X64-SSE-NEXT: movd %ecx, %xmm0
; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; X64-SSE-NEXT: psraw $8, %xmm0
; X64-SSE-NEXT: movdqa {{.*#+}} xmm1 = <65408,128,u,u,u,u,u,u>
; X64-SSE-NEXT: movdqa %xmm0, %xmm2
; X64-SSE-NEXT: pmulhw %xmm1, %xmm2
; X64-SSE-NEXT: pmullw %xmm1, %xmm0
; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; X64-SSE-NEXT: movq %xmm0, (%rax,%rsi,4)
; X64-SSE-NEXT: retq
;
; X64-AVX-LABEL: mul_2xi8_varconst6:
; X64-AVX: # %bb.0: # %entry
; X64-AVX-NEXT: movq {{.*}}(%rip), %rax
; X64-AVX-NEXT: movzwl (%rdi,%rsi), %ecx
; X64-AVX-NEXT: vmovd %ecx, %xmm0
; X64-AVX-NEXT: vpmovsxbd %xmm0, %xmm0
; X64-AVX-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4)
; X64-AVX-NEXT: retq
entry:
  %pre = load i32*, i32** @c
  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
  %tmp7 = bitcast i8* %tmp6 to <2 x i8>*
  %wide.load = load <2 x i8>, <2 x i8>* %tmp7, align 1
  %tmp8 = sext <2 x i8> %wide.load to <2 x i32>
  %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 -128, i32 128>
  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
  %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
  store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
  ret void
}

; %val = load <2 x i16>
; %op1 = zext<2 x i32> %val
; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (0 ~ 65535)
; %rst = mul <2 x i32> %op1, %op2
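; (Zero-extended i16 data times constants up to 65535: the unsigned high half
; comes from pmulhuw, paired with pmullw; AVX widens with vpmovzxwd and uses
; vpmulld.)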
;
define void @mul_2xi16_varconst1(i8* nocapture readonly %a, i64 %index) {
; X86-SSE-LABEL: mul_2xi16_varconst1:
; X86-SSE: # %bb.0: # %entry
; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-SSE-NEXT: movl c, %edx
; X86-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-SSE-NEXT: movdqa {{.*#+}} xmm1 = <0,65535,u,u,u,u,u,u>
; X86-SSE-NEXT: movdqa %xmm0, %xmm2
; X86-SSE-NEXT: pmulhuw %xmm1, %xmm2
; X86-SSE-NEXT: pmullw %xmm1, %xmm0
; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; X86-SSE-NEXT: movq %xmm0, (%edx,%eax,4)
; X86-SSE-NEXT: retl
;
; X86-AVX-LABEL: mul_2xi16_varconst1:
; X86-AVX: # %bb.0: # %entry
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-AVX-NEXT: movl c, %edx
; X86-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; X86-AVX-NEXT: vpmulld {{\.LCPI.*}}, %xmm0, %xmm0
; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4)
; X86-AVX-NEXT: retl
;
; X64-SSE-LABEL: mul_2xi16_varconst1:
; X64-SSE: # %bb.0: # %entry
; X64-SSE-NEXT: movq {{.*}}(%rip), %rax
; X64-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X64-SSE-NEXT: movdqa {{.*#+}} xmm1 = <0,65535,u,u,u,u,u,u>
; X64-SSE-NEXT: movdqa %xmm0, %xmm2
; X64-SSE-NEXT: pmulhuw %xmm1, %xmm2
; X64-SSE-NEXT: pmullw %xmm1, %xmm0
; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; X64-SSE-NEXT: movq %xmm0, (%rax,%rsi,4)
; X64-SSE-NEXT: retq
;
; X64-AVX-LABEL: mul_2xi16_varconst1:
; X64-AVX: # %bb.0: # %entry
; X64-AVX-NEXT: movq {{.*}}(%rip), %rax
; X64-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X64-AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; X64-AVX-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4)
; X64-AVX-NEXT: retq
entry:
  %pre = load i32*, i32** @c
  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
  %tmp7 = bitcast i8* %tmp6 to <2 x i16>*
  %wide.load = load <2 x i16>, <2 x i16>* %tmp7, align 1
  %tmp8 = zext <2 x i16> %wide.load to <2 x i32>
  %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 0, i32 65535>
  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
  %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
  store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
  ret void
}

; %val = load <2 x i16>
; %op1 = sext<2 x i32> %val
; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (-32768 ~ 32767)
; %rst = mul <2 x i32> %op1, %op2
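; (The sign-extended variant swaps pmulhuw for the signed pmulhw; otherwise the
; lowering matches mul_2xi16_varconst1.)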
;
define void @mul_2xi16_varconst2(i8* nocapture readonly %a, i64 %index) {
; X86-SSE-LABEL: mul_2xi16_varconst2:
; X86-SSE: # %bb.0: # %entry
; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-SSE-NEXT: movl c, %edx
; X86-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-SSE-NEXT: movdqa {{.*#+}} xmm1 = <32768,32767,u,u,u,u,u,u>
; X86-SSE-NEXT: movdqa %xmm0, %xmm2
; X86-SSE-NEXT: pmulhw %xmm1, %xmm2
; X86-SSE-NEXT: pmullw %xmm1, %xmm0
; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; X86-SSE-NEXT: movq %xmm0, (%edx,%eax,4)
; X86-SSE-NEXT: retl
;
; X86-AVX-LABEL: mul_2xi16_varconst2:
; X86-AVX: # %bb.0: # %entry
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-AVX-NEXT: movl c, %edx
; X86-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-AVX-NEXT: vpmovsxwd %xmm0, %xmm0
; X86-AVX-NEXT: vpmulld {{\.LCPI.*}}, %xmm0, %xmm0
; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4)
; X86-AVX-NEXT: retl
;
; X64-SSE-LABEL: mul_2xi16_varconst2:
; X64-SSE: # %bb.0: # %entry
; X64-SSE-NEXT: movq {{.*}}(%rip), %rax
; X64-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X64-SSE-NEXT: movdqa {{.*#+}} xmm1 = <32768,32767,u,u,u,u,u,u>
; X64-SSE-NEXT: movdqa %xmm0, %xmm2
; X64-SSE-NEXT: pmulhw %xmm1, %xmm2
; X64-SSE-NEXT: pmullw %xmm1, %xmm0
; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; X64-SSE-NEXT: movq %xmm0, (%rax,%rsi,4)
; X64-SSE-NEXT: retq
;
; X64-AVX-LABEL: mul_2xi16_varconst2:
; X64-AVX: # %bb.0: # %entry
; X64-AVX-NEXT: movq {{.*}}(%rip), %rax
; X64-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X64-AVX-NEXT: vpmovsxwd %xmm0, %xmm0
; X64-AVX-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4)
; X64-AVX-NEXT: retq
entry:
  %pre = load i32*, i32** @c
  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
  %tmp7 = bitcast i8* %tmp6 to <2 x i16>*
  %wide.load = load <2 x i16>, <2 x i16>* %tmp7, align 1
  %tmp8 = sext <2 x i16> %wide.load to <2 x i32>
  %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 -32768, i32 32767>
  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
  %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
  store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
  ret void
}

; %val = load <2 x i16>
; %op1 = zext<2 x i32> %val
; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (0 ~ 65536)
; %rst = mul <2 x i32> %op1, %op2
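; (65536 does not fit in a 16-bit lane at all, so the checks below fall back to
; the generic 32-bit pmuludq/pshufd sequence on SSE.)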
;
define void @mul_2xi16_varconst3(i8* nocapture readonly %a, i64 %index) {
; X86-SSE-LABEL: mul_2xi16_varconst3:
; X86-SSE: # %bb.0: # %entry
; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-SSE-NEXT: movl c, %edx
; X86-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-SSE-NEXT: pxor %xmm1, %xmm1
; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; X86-SSE-NEXT: movdqa {{.*#+}} xmm1 = <0,65536,u,u>
; X86-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; X86-SSE-NEXT: pmuludq %xmm1, %xmm0
; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; X86-SSE-NEXT: pmuludq %xmm2, %xmm1
; X86-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X86-SSE-NEXT: movq %xmm0, (%edx,%eax,4)
; X86-SSE-NEXT: retl
;
; X86-AVX-LABEL: mul_2xi16_varconst3:
; X86-AVX: # %bb.0: # %entry
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-AVX-NEXT: movl c, %edx
; X86-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; X86-AVX-NEXT: vpmulld {{\.LCPI.*}}, %xmm0, %xmm0
; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4)
; X86-AVX-NEXT: retl
;
; X64-SSE-LABEL: mul_2xi16_varconst3:
; X64-SSE: # %bb.0: # %entry
; X64-SSE-NEXT: movq {{.*}}(%rip), %rax
; X64-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X64-SSE-NEXT: pxor %xmm1, %xmm1
; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; X64-SSE-NEXT: movdqa {{.*#+}} xmm1 = <0,65536,u,u>
; X64-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; X64-SSE-NEXT: pmuludq %xmm1, %xmm0
; X64-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; X64-SSE-NEXT: pmuludq %xmm2, %xmm1
; X64-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X64-SSE-NEXT: movq %xmm0, (%rax,%rsi,4)
; X64-SSE-NEXT: retq
;
; X64-AVX-LABEL: mul_2xi16_varconst3:
; X64-AVX: # %bb.0: # %entry
; X64-AVX-NEXT: movq {{.*}}(%rip), %rax
; X64-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X64-AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; X64-AVX-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4)
; X64-AVX-NEXT: retq
entry:
  %pre = load i32*, i32** @c
  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
  %tmp7 = bitcast i8* %tmp6 to <2 x i16>*
  %wide.load = load <2 x i16>, <2 x i16>* %tmp7, align 1
  %tmp8 = zext <2 x i16> %wide.load to <2 x i32>
  %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 0, i32 65536>
  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
  %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
  store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
  ret void
}

; %val = load <2 x i16>
; %op1 = sext<2 x i32> %val
; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (0 ~ 32768)
; %rst = mul <2 x i32> %op1, %op2
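; (32768 overflows a signed 16-bit lane, so the sign-extended input is widened
; with psrad and multiplied via the pmuludq sequence rather than pmulhw.)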
;
define void @mul_2xi16_varconst4(i8* nocapture readonly %a, i64 %index) {
; X86-SSE-LABEL: mul_2xi16_varconst4:
; X86-SSE: # %bb.0: # %entry
; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-SSE-NEXT: movl c, %edx
; X86-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7]
; X86-SSE-NEXT: psrad $16, %xmm0
; X86-SSE-NEXT: movdqa {{.*#+}} xmm1 = <0,32768,u,u>
; X86-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; X86-SSE-NEXT: pmuludq %xmm1, %xmm0
; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; X86-SSE-NEXT: pmuludq %xmm2, %xmm1
; X86-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X86-SSE-NEXT: movq %xmm0, (%edx,%eax,4)
; X86-SSE-NEXT: retl
;
; X86-AVX-LABEL: mul_2xi16_varconst4:
; X86-AVX: # %bb.0: # %entry
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-AVX-NEXT: movl c, %edx
; X86-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-AVX-NEXT: vpmovsxwd %xmm0, %xmm0
; X86-AVX-NEXT: vpmulld {{\.LCPI.*}}, %xmm0, %xmm0
; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4)
; X86-AVX-NEXT: retl
;
; X64-SSE-LABEL: mul_2xi16_varconst4:
; X64-SSE: # %bb.0: # %entry
; X64-SSE-NEXT: movq {{.*}}(%rip), %rax
; X64-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X64-SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7]
; X64-SSE-NEXT: psrad $16, %xmm0
; X64-SSE-NEXT: movdqa {{.*#+}} xmm1 = <0,32768,u,u>
; X64-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; X64-SSE-NEXT: pmuludq %xmm1, %xmm0
; X64-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; X64-SSE-NEXT: pmuludq %xmm2, %xmm1
; X64-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X64-SSE-NEXT: movq %xmm0, (%rax,%rsi,4)
; X64-SSE-NEXT: retq
;
; X64-AVX-LABEL: mul_2xi16_varconst4:
; X64-AVX: # %bb.0: # %entry
; X64-AVX-NEXT: movq {{.*}}(%rip), %rax
; X64-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X64-AVX-NEXT: vpmovsxwd %xmm0, %xmm0
; X64-AVX-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4)
; X64-AVX-NEXT: retq
entry:
  %pre = load i32*, i32** @c
  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
  %tmp7 = bitcast i8* %tmp6 to <2 x i16>*
  %wide.load = load <2 x i16>, <2 x i16>* %tmp7, align 1
  %tmp8 = sext <2 x i16> %wide.load to <2 x i32>
  %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 0, i32 32768>
  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
  %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
  store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
  ret void
}

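; PR34947 - unsigned remainder is not shrunk the way the multiplies above are:
; the checks below scalarize the <9 x i32> urem into divl instructions and only
; then multiply the remainders by 8199 (0x2007).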
2081 define void @PR34947(<9 x i16>* %p0, <9 x i32>* %p1) nounwind {
2082 ; X86-SSE-LABEL: PR34947:
2084 ; X86-SSE-NEXT: pushl %esi
2085 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
2086 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
2087 ; X86-SSE-NEXT: movdqa (%eax), %xmm5
2088 ; X86-SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
2089 ; X86-SSE-NEXT: movdqa (%ecx), %xmm2
2090 ; X86-SSE-NEXT: movdqa 16(%ecx), %xmm6
2091 ; X86-SSE-NEXT: pxor %xmm0, %xmm0
2092 ; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
2093 ; X86-SSE-NEXT: movdqa %xmm5, %xmm4
2094 ; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3]
2095 ; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7]
2096 ; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[3,1,2,3]
2097 ; X86-SSE-NEXT: movd %xmm0, %eax
2098 ; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[3,1,2,3]
2099 ; X86-SSE-NEXT: movd %xmm0, %esi
2100 ; X86-SSE-NEXT: xorl %edx, %edx
2101 ; X86-SSE-NEXT: divl %esi
2102 ; X86-SSE-NEXT: movd %edx, %xmm0
2103 ; X86-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm5[2,3,0,1]
2104 ; X86-SSE-NEXT: movd %xmm3, %eax
2105 ; X86-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm6[2,3,0,1]
2106 ; X86-SSE-NEXT: movd %xmm3, %esi
2107 ; X86-SSE-NEXT: xorl %edx, %edx
2108 ; X86-SSE-NEXT: divl %esi
2109 ; X86-SSE-NEXT: movd %edx, %xmm7
2110 ; X86-SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1]
2111 ; X86-SSE-NEXT: movd %xmm5, %eax
2112 ; X86-SSE-NEXT: movd %xmm6, %esi
2113 ; X86-SSE-NEXT: xorl %edx, %edx
2114 ; X86-SSE-NEXT: divl %esi
2115 ; X86-SSE-NEXT: movd %edx, %xmm3
2116 ; X86-SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,2,3]
2117 ; X86-SSE-NEXT: movd %xmm5, %eax
2118 ; X86-SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,2,3]
2119 ; X86-SSE-NEXT: movd %xmm5, %esi
2120 ; X86-SSE-NEXT: xorl %edx, %edx
2121 ; X86-SSE-NEXT: divl %esi
2122 ; X86-SSE-NEXT: movd %edx, %xmm5
2123 ; X86-SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1]
2124 ; X86-SSE-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm7[0]
2125 ; X86-SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm4[3,1,2,3]
2126 ; X86-SSE-NEXT: movd %xmm6, %eax
2127 ; X86-SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm2[3,1,2,3]
2128 ; X86-SSE-NEXT: movd %xmm6, %esi
2129 ; X86-SSE-NEXT: xorl %edx, %edx
2130 ; X86-SSE-NEXT: divl %esi
2131 ; X86-SSE-NEXT: movd %edx, %xmm6
2132 ; X86-SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm4[2,3,0,1]
2133 ; X86-SSE-NEXT: movd %xmm7, %eax
2134 ; X86-SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm2[2,3,0,1]
2135 ; X86-SSE-NEXT: movd %xmm7, %esi
2136 ; X86-SSE-NEXT: xorl %edx, %edx
2137 ; X86-SSE-NEXT: divl %esi
2138 ; X86-SSE-NEXT: movd %edx, %xmm7
2139 ; X86-SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1]
2140 ; X86-SSE-NEXT: movd %xmm4, %eax
2141 ; X86-SSE-NEXT: movd %xmm2, %esi
2142 ; X86-SSE-NEXT: xorl %edx, %edx
2143 ; X86-SSE-NEXT: divl %esi
2144 ; X86-SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,2,3]
2145 ; X86-SSE-NEXT: movd %xmm4, %eax
2146 ; X86-SSE-NEXT: movd %edx, %xmm4
2147 ; X86-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,2,3]
2148 ; X86-SSE-NEXT: movd %xmm2, %esi
2149 ; X86-SSE-NEXT: xorl %edx, %edx
2150 ; X86-SSE-NEXT: divl %esi
2151 ; X86-SSE-NEXT: movd %edx, %xmm2
2152 ; X86-SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
2153 ; X86-SSE-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm7[0]
2154 ; X86-SSE-NEXT: movd %xmm1, %eax
2155 ; X86-SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm6[0,0]
2156 ; X86-SSE-NEXT: movdqa {{.*#+}} xmm1 = [8199,8199,8199,8199]
2157 ; X86-SSE-NEXT: pmuludq %xmm1, %xmm4
2158 ; X86-SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
2159 ; X86-SSE-NEXT: pmuludq %xmm1, %xmm2
2160 ; X86-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
2161 ; X86-SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
2162 ; X86-SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,0],xmm0[0,0]
2163 ; X86-SSE-NEXT: pmuludq %xmm1, %xmm3
2164 ; X86-SSE-NEXT: pmuludq %xmm1, %xmm5
2165 ; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
2166 ; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[0,2,2,3]
2167 ; X86-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
2168 ; X86-SSE-NEXT: xorl %edx, %edx
2169 ; X86-SSE-NEXT: divl 32(%ecx)
2170 ; X86-SSE-NEXT: movdqa %xmm0, (%eax)
2171 ; X86-SSE-NEXT: movdqa %xmm4, (%eax)
2172 ; X86-SSE-NEXT: imull $8199, %edx, %eax # imm = 0x2007
2173 ; X86-SSE-NEXT: movl %eax, (%eax)
2174 ; X86-SSE-NEXT: popl %esi
2175 ; X86-SSE-NEXT: retl
2177 ; X86-AVX1-LABEL: PR34947:
2178 ; X86-AVX1: # %bb.0:
2179 ; X86-AVX1-NEXT: pushl %ebp
2180 ; X86-AVX1-NEXT: pushl %ebx
2181 ; X86-AVX1-NEXT: pushl %edi
2182 ; X86-AVX1-NEXT: pushl %esi
2183 ; X86-AVX1-NEXT: subl $16, %esp
2184 ; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %ecx
2185 ; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax
2186 ; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
2187 ; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
2188 ; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
2189 ; X86-AVX1-NEXT: vmovd %xmm1, %eax
2190 ; X86-AVX1-NEXT: xorl %edx, %edx
2191 ; X86-AVX1-NEXT: divl 32(%ecx)
2192 ; X86-AVX1-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
2193 ; X86-AVX1-NEXT: vpextrd $3, %xmm2, %eax
2194 ; X86-AVX1-NEXT: vmovdqa (%ecx), %xmm1
2195 ; X86-AVX1-NEXT: vmovdqa 16(%ecx), %xmm3
2196 ; X86-AVX1-NEXT: vpextrd $3, %xmm3, %ecx
2197 ; X86-AVX1-NEXT: xorl %edx, %edx
2198 ; X86-AVX1-NEXT: divl %ecx
2199 ; X86-AVX1-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
2200 ; X86-AVX1-NEXT: vpextrd $2, %xmm2, %eax
2201 ; X86-AVX1-NEXT: vpextrd $2, %xmm3, %ecx
2202 ; X86-AVX1-NEXT: xorl %edx, %edx
2203 ; X86-AVX1-NEXT: divl %ecx
2204 ; X86-AVX1-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
2205 ; X86-AVX1-NEXT: vpextrd $1, %xmm2, %eax
2206 ; X86-AVX1-NEXT: vpextrd $1, %xmm3, %ecx
2207 ; X86-AVX1-NEXT: xorl %edx, %edx
2208 ; X86-AVX1-NEXT: divl %ecx
2209 ; X86-AVX1-NEXT: movl %edx, (%esp) # 4-byte Spill
2210 ; X86-AVX1-NEXT: vmovd %xmm2, %eax
2211 ; X86-AVX1-NEXT: vmovd %xmm3, %ecx
2212 ; X86-AVX1-NEXT: xorl %edx, %edx
2213 ; X86-AVX1-NEXT: divl %ecx
2214 ; X86-AVX1-NEXT: movl %edx, %ebp
2215 ; X86-AVX1-NEXT: vpextrd $3, %xmm0, %eax
2216 ; X86-AVX1-NEXT: vpextrd $3, %xmm1, %ecx
2217 ; X86-AVX1-NEXT: xorl %edx, %edx
2218 ; X86-AVX1-NEXT: divl %ecx
2219 ; X86-AVX1-NEXT: movl %edx, %ebx
2220 ; X86-AVX1-NEXT: vpextrd $2, %xmm0, %eax
2221 ; X86-AVX1-NEXT: vpextrd $2, %xmm1, %esi
2222 ; X86-AVX1-NEXT: xorl %edx, %edx
2223 ; X86-AVX1-NEXT: divl %esi
2224 ; X86-AVX1-NEXT: movl %edx, %esi
2225 ; X86-AVX1-NEXT: vpextrd $1, %xmm0, %eax
2226 ; X86-AVX1-NEXT: vpextrd $1, %xmm1, %edi
2227 ; X86-AVX1-NEXT: xorl %edx, %edx
2228 ; X86-AVX1-NEXT: divl %edi
2229 ; X86-AVX1-NEXT: movl %edx, %edi
2230 ; X86-AVX1-NEXT: vmovd %xmm0, %eax
2231 ; X86-AVX1-NEXT: vmovd %xmm1, %ecx
2232 ; X86-AVX1-NEXT: xorl %edx, %edx
2233 ; X86-AVX1-NEXT: divl %ecx
2234 ; X86-AVX1-NEXT: vmovd %edx, %xmm0
2235 ; X86-AVX1-NEXT: vpinsrd $1, %edi, %xmm0, %xmm0
2236 ; X86-AVX1-NEXT: vpinsrd $2, %esi, %xmm0, %xmm0
2237 ; X86-AVX1-NEXT: vpinsrd $3, %ebx, %xmm0, %xmm0
2238 ; X86-AVX1-NEXT: vmovd %ebp, %xmm1
2239 ; X86-AVX1-NEXT: vpinsrd $1, (%esp), %xmm1, %xmm1 # 4-byte Folded Reload
2240 ; X86-AVX1-NEXT: vpinsrd $2, {{[-0-9]+}}(%e{{[sb]}}p), %xmm1, %xmm1 # 4-byte Folded Reload
2241 ; X86-AVX1-NEXT: vpinsrd $3, {{[-0-9]+}}(%e{{[sb]}}p), %xmm1, %xmm1 # 4-byte Folded Reload
2242 ; X86-AVX1-NEXT: imull $8199, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
2243 ; X86-AVX1-NEXT: # imm = 0x2007
2244 ; X86-AVX1-NEXT: movl %eax, (%eax)
2245 ; X86-AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [8199,8199,8199,8199]
2246 ; X86-AVX1-NEXT: vpmulld %xmm2, %xmm0, %xmm0
2247 ; X86-AVX1-NEXT: vpmulld %xmm2, %xmm1, %xmm1
2248 ; X86-AVX1-NEXT: vmovdqa %xmm1, (%eax)
2249 ; X86-AVX1-NEXT: vmovdqa %xmm0, (%eax)
2250 ; X86-AVX1-NEXT: addl $16, %esp
2251 ; X86-AVX1-NEXT: popl %esi
2252 ; X86-AVX1-NEXT: popl %edi
2253 ; X86-AVX1-NEXT: popl %ebx
2254 ; X86-AVX1-NEXT: popl %ebp
2255 ; X86-AVX1-NEXT: retl
2257 ; X86-AVX2-LABEL: PR34947:
2258 ; X86-AVX2: # %bb.0:
2259 ; X86-AVX2-NEXT: pushl %edi
2260 ; X86-AVX2-NEXT: pushl %esi
2261 ; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %esi
2262 ; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax
2263 ; X86-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
2264 ; X86-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
2265 ; X86-AVX2-NEXT: vmovdqa (%esi), %xmm2
2266 ; X86-AVX2-NEXT: vmovdqa 16(%esi), %xmm3
2267 ; X86-AVX2-NEXT: vpextrd $1, %xmm3, %ecx
2268 ; X86-AVX2-NEXT: vextracti128 $1, %ymm1, %xmm4
2269 ; X86-AVX2-NEXT: vpextrd $1, %xmm4, %eax
2270 ; X86-AVX2-NEXT: xorl %edx, %edx
2271 ; X86-AVX2-NEXT: divl %ecx
2272 ; X86-AVX2-NEXT: movl %edx, %ecx
2273 ; X86-AVX2-NEXT: vmovd %xmm3, %edi
2274 ; X86-AVX2-NEXT: vmovd %xmm4, %eax
2275 ; X86-AVX2-NEXT: xorl %edx, %edx
2276 ; X86-AVX2-NEXT: divl %edi
2277 ; X86-AVX2-NEXT: vmovd %edx, %xmm5
2278 ; X86-AVX2-NEXT: vpinsrd $1, %ecx, %xmm5, %xmm5
2279 ; X86-AVX2-NEXT: vpextrd $2, %xmm3, %ecx
2280 ; X86-AVX2-NEXT: vpextrd $2, %xmm4, %eax
2281 ; X86-AVX2-NEXT: xorl %edx, %edx
2282 ; X86-AVX2-NEXT: divl %ecx
2283 ; X86-AVX2-NEXT: vpinsrd $2, %edx, %xmm5, %xmm5
2284 ; X86-AVX2-NEXT: vpextrd $3, %xmm3, %ecx
2285 ; X86-AVX2-NEXT: vpextrd $3, %xmm4, %eax
2286 ; X86-AVX2-NEXT: xorl %edx, %edx
2287 ; X86-AVX2-NEXT: divl %ecx
2288 ; X86-AVX2-NEXT: vpinsrd $3, %edx, %xmm5, %xmm3
2289 ; X86-AVX2-NEXT: vpextrd $1, %xmm2, %ecx
2290 ; X86-AVX2-NEXT: vpextrd $1, %xmm1, %eax
2291 ; X86-AVX2-NEXT: xorl %edx, %edx
2292 ; X86-AVX2-NEXT: divl %ecx
2293 ; X86-AVX2-NEXT: movl %edx, %ecx
2294 ; X86-AVX2-NEXT: vmovd %xmm2, %edi
2295 ; X86-AVX2-NEXT: vmovd %xmm1, %eax
2296 ; X86-AVX2-NEXT: xorl %edx, %edx
2297 ; X86-AVX2-NEXT: divl %edi
2298 ; X86-AVX2-NEXT: vmovd %edx, %xmm4
2299 ; X86-AVX2-NEXT: vpinsrd $1, %ecx, %xmm4, %xmm4
2300 ; X86-AVX2-NEXT: vpextrd $2, %xmm2, %ecx
2301 ; X86-AVX2-NEXT: vpextrd $2, %xmm1, %eax
2302 ; X86-AVX2-NEXT: xorl %edx, %edx
2303 ; X86-AVX2-NEXT: divl %ecx
2304 ; X86-AVX2-NEXT: vpinsrd $2, %edx, %xmm4, %xmm4
2305 ; X86-AVX2-NEXT: vpextrd $3, %xmm2, %ecx
2306 ; X86-AVX2-NEXT: vpextrd $3, %xmm1, %eax
2307 ; X86-AVX2-NEXT: xorl %edx, %edx
2308 ; X86-AVX2-NEXT: divl %ecx
2309 ; X86-AVX2-NEXT: vpinsrd $3, %edx, %xmm4, %xmm1
2310 ; X86-AVX2-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1
2311 ; X86-AVX2-NEXT: vmovd %xmm0, %eax
2312 ; X86-AVX2-NEXT: xorl %edx, %edx
2313 ; X86-AVX2-NEXT: divl 32(%esi)
2314 ; X86-AVX2-NEXT: vpbroadcastd {{.*#+}} ymm0 = [8199,8199,8199,8199,8199,8199,8199,8199]
2315 ; X86-AVX2-NEXT: vpmulld %ymm0, %ymm1, %ymm0
2316 ; X86-AVX2-NEXT: imull $8199, %edx, %eax # imm = 0x2007
2317 ; X86-AVX2-NEXT: movl %eax, (%eax)
2318 ; X86-AVX2-NEXT: vmovdqa %ymm0, (%eax)
2319 ; X86-AVX2-NEXT: popl %esi
2320 ; X86-AVX2-NEXT: popl %edi
2321 ; X86-AVX2-NEXT: vzeroupper
2322 ; X86-AVX2-NEXT: retl
2324 ; X64-SSE-LABEL: PR34947:
; X64-SSE-NEXT: movdqa (%rdi), %xmm5
; X64-SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X64-SSE-NEXT: movdqa (%rsi), %xmm2
; X64-SSE-NEXT: movdqa 16(%rsi), %xmm6
; X64-SSE-NEXT: pxor %xmm0, %xmm0
; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; X64-SSE-NEXT: movdqa %xmm5, %xmm3
; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3]
; X64-SSE-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7]
; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[3,1,2,3]
; X64-SSE-NEXT: movd %xmm0, %eax
; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[3,1,2,3]
; X64-SSE-NEXT: movd %xmm0, %ecx
; X64-SSE-NEXT: xorl %edx, %edx
; X64-SSE-NEXT: divl %ecx
; X64-SSE-NEXT: movd %edx, %xmm8
; X64-SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[2,3,0,1]
; X64-SSE-NEXT: movd %xmm4, %eax
; X64-SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm6[2,3,0,1]
; X64-SSE-NEXT: movd %xmm4, %ecx
; X64-SSE-NEXT: xorl %edx, %edx
; X64-SSE-NEXT: divl %ecx
; X64-SSE-NEXT: movd %edx, %xmm7
; X64-SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1]
; X64-SSE-NEXT: movd %xmm5, %eax
; X64-SSE-NEXT: movd %xmm6, %ecx
; X64-SSE-NEXT: xorl %edx, %edx
; X64-SSE-NEXT: divl %ecx
; X64-SSE-NEXT: movd %edx, %xmm4
; X64-SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,2,3]
; X64-SSE-NEXT: movd %xmm5, %eax
; X64-SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,2,3]
; X64-SSE-NEXT: movd %xmm5, %ecx
; X64-SSE-NEXT: xorl %edx, %edx
; X64-SSE-NEXT: divl %ecx
; X64-SSE-NEXT: movd %edx, %xmm5
; X64-SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
; X64-SSE-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm7[0]
; X64-SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm3[3,1,2,3]
; X64-SSE-NEXT: movd %xmm6, %eax
; X64-SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm2[3,1,2,3]
; X64-SSE-NEXT: movd %xmm6, %ecx
; X64-SSE-NEXT: xorl %edx, %edx
; X64-SSE-NEXT: divl %ecx
; X64-SSE-NEXT: movd %edx, %xmm6
; X64-SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm3[2,3,0,1]
; X64-SSE-NEXT: movd %xmm7, %eax
; X64-SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm2[2,3,0,1]
; X64-SSE-NEXT: movd %xmm7, %ecx
; X64-SSE-NEXT: xorl %edx, %edx
; X64-SSE-NEXT: divl %ecx
; X64-SSE-NEXT: movd %edx, %xmm7
; X64-SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1]
; X64-SSE-NEXT: movd %xmm3, %eax
; X64-SSE-NEXT: movd %xmm2, %ecx
; X64-SSE-NEXT: xorl %edx, %edx
; X64-SSE-NEXT: divl %ecx
; X64-SSE-NEXT: movd %edx, %xmm0
; X64-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,2,3]
; X64-SSE-NEXT: movd %xmm3, %eax
; X64-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,2,3]
; X64-SSE-NEXT: movd %xmm2, %ecx
; X64-SSE-NEXT: xorl %edx, %edx
; X64-SSE-NEXT: divl %ecx
; X64-SSE-NEXT: movd %edx, %xmm2
; X64-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; X64-SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm7[0]
; X64-SSE-NEXT: movd %xmm1, %eax
; X64-SSE-NEXT: xorl %edx, %edx
; X64-SSE-NEXT: divl 32(%rsi)
; X64-SSE-NEXT: movdqa {{.*#+}} xmm1 = [8199,8199,8199,8199]
; X64-SSE-NEXT: pmuludq %xmm1, %xmm0
; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X64-SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm6[0,0]
; X64-SSE-NEXT: pmuludq %xmm1, %xmm2
; X64-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; X64-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; X64-SSE-NEXT: pmuludq %xmm1, %xmm4
; X64-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,2,2,3]
; X64-SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,0],xmm8[0,0]
; X64-SSE-NEXT: pmuludq %xmm1, %xmm5
; X64-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[0,2,2,3]
; X64-SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; X64-SSE-NEXT: imull $8199, %edx, %eax # imm = 0x2007
; X64-SSE-NEXT: movl %eax, (%rax)
; X64-SSE-NEXT: movdqa %xmm2, (%rax)
; X64-SSE-NEXT: movdqa %xmm0, (%rax)
; X64-SSE-NEXT: retq
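; Note: the SSE lowering above rebuilds the remainder vectors with movd plus
; punpckldq/punpcklqdq, while the AVX lowerings below insert each remainder
; directly with vpinsrd.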
;
; X64-AVX1-LABEL: PR34947:
; X64-AVX1: # %bb.0:
; X64-AVX1-NEXT: pushq %rbp
; X64-AVX1-NEXT: pushq %rbx
; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; X64-AVX1-NEXT: vmovd %xmm1, %eax
; X64-AVX1-NEXT: xorl %edx, %edx
; X64-AVX1-NEXT: divl 32(%rsi)
; X64-AVX1-NEXT: movl %edx, %r8d
; X64-AVX1-NEXT: vpextrd $3, %xmm2, %eax
; X64-AVX1-NEXT: vmovdqa (%rsi), %xmm1
; X64-AVX1-NEXT: vmovdqa 16(%rsi), %xmm3
; X64-AVX1-NEXT: vpextrd $3, %xmm3, %ecx
; X64-AVX1-NEXT: xorl %edx, %edx
; X64-AVX1-NEXT: divl %ecx
; X64-AVX1-NEXT: movl %edx, %r9d
; X64-AVX1-NEXT: vpextrd $2, %xmm2, %eax
; X64-AVX1-NEXT: vpextrd $2, %xmm3, %ecx
; X64-AVX1-NEXT: xorl %edx, %edx
; X64-AVX1-NEXT: divl %ecx
; X64-AVX1-NEXT: movl %edx, %r10d
; X64-AVX1-NEXT: vpextrd $1, %xmm2, %eax
; X64-AVX1-NEXT: vpextrd $1, %xmm3, %ecx
; X64-AVX1-NEXT: xorl %edx, %edx
; X64-AVX1-NEXT: divl %ecx
; X64-AVX1-NEXT: movl %edx, %r11d
; X64-AVX1-NEXT: vmovd %xmm2, %eax
; X64-AVX1-NEXT: vmovd %xmm3, %ecx
; X64-AVX1-NEXT: xorl %edx, %edx
; X64-AVX1-NEXT: divl %ecx
; X64-AVX1-NEXT: movl %edx, %esi
; X64-AVX1-NEXT: vpextrd $3, %xmm0, %eax
; X64-AVX1-NEXT: vpextrd $3, %xmm1, %ecx
; X64-AVX1-NEXT: xorl %edx, %edx
; X64-AVX1-NEXT: divl %ecx
; X64-AVX1-NEXT: movl %edx, %edi
; X64-AVX1-NEXT: vpextrd $2, %xmm0, %eax
; X64-AVX1-NEXT: vpextrd $2, %xmm1, %ecx
; X64-AVX1-NEXT: xorl %edx, %edx
; X64-AVX1-NEXT: divl %ecx
; X64-AVX1-NEXT: movl %edx, %ecx
; X64-AVX1-NEXT: vpextrd $1, %xmm0, %eax
; X64-AVX1-NEXT: vpextrd $1, %xmm1, %ebx
; X64-AVX1-NEXT: xorl %edx, %edx
; X64-AVX1-NEXT: divl %ebx
; X64-AVX1-NEXT: movl %edx, %ebx
; X64-AVX1-NEXT: vmovd %xmm0, %eax
; X64-AVX1-NEXT: vmovd %xmm1, %ebp
; X64-AVX1-NEXT: xorl %edx, %edx
; X64-AVX1-NEXT: divl %ebp
; X64-AVX1-NEXT: vmovd %edx, %xmm0
; X64-AVX1-NEXT: vpinsrd $1, %ebx, %xmm0, %xmm0
; X64-AVX1-NEXT: vpinsrd $2, %ecx, %xmm0, %xmm0
; X64-AVX1-NEXT: vpinsrd $3, %edi, %xmm0, %xmm0
; X64-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [8199,8199,8199,8199]
; X64-AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0
; X64-AVX1-NEXT: vmovd %esi, %xmm2
; X64-AVX1-NEXT: vpinsrd $1, %r11d, %xmm2, %xmm2
; X64-AVX1-NEXT: vpinsrd $2, %r10d, %xmm2, %xmm2
; X64-AVX1-NEXT: vpinsrd $3, %r9d, %xmm2, %xmm2
; X64-AVX1-NEXT: vpmulld %xmm1, %xmm2, %xmm1
; X64-AVX1-NEXT: imull $8199, %r8d, %eax # imm = 0x2007
; X64-AVX1-NEXT: movl %eax, (%rax)
; X64-AVX1-NEXT: vmovdqa %xmm1, (%rax)
; X64-AVX1-NEXT: vmovdqa %xmm0, (%rax)
; X64-AVX1-NEXT: popq %rbx
; X64-AVX1-NEXT: popq %rbp
; X64-AVX1-NEXT: retq
;
; X64-AVX2-LABEL: PR34947:
; X64-AVX2: # %bb.0:
; X64-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
; X64-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
; X64-AVX2-NEXT: vmovdqa (%rsi), %xmm2
; X64-AVX2-NEXT: vmovdqa 16(%rsi), %xmm3
; X64-AVX2-NEXT: vpextrd $1, %xmm3, %ecx
; X64-AVX2-NEXT: vextracti128 $1, %ymm1, %xmm4
; X64-AVX2-NEXT: vpextrd $1, %xmm4, %eax
; X64-AVX2-NEXT: xorl %edx, %edx
; X64-AVX2-NEXT: divl %ecx
; X64-AVX2-NEXT: movl %edx, %ecx
; X64-AVX2-NEXT: vmovd %xmm3, %edi
; X64-AVX2-NEXT: vmovd %xmm4, %eax
; X64-AVX2-NEXT: xorl %edx, %edx
; X64-AVX2-NEXT: divl %edi
; X64-AVX2-NEXT: vmovd %edx, %xmm5
; X64-AVX2-NEXT: vpinsrd $1, %ecx, %xmm5, %xmm5
; X64-AVX2-NEXT: vpextrd $2, %xmm3, %ecx
; X64-AVX2-NEXT: vpextrd $2, %xmm4, %eax
; X64-AVX2-NEXT: xorl %edx, %edx
; X64-AVX2-NEXT: divl %ecx
; X64-AVX2-NEXT: vpinsrd $2, %edx, %xmm5, %xmm5
; X64-AVX2-NEXT: vpextrd $3, %xmm3, %ecx
; X64-AVX2-NEXT: vpextrd $3, %xmm4, %eax
; X64-AVX2-NEXT: xorl %edx, %edx
; X64-AVX2-NEXT: divl %ecx
; X64-AVX2-NEXT: vpinsrd $3, %edx, %xmm5, %xmm3
; X64-AVX2-NEXT: vpextrd $1, %xmm2, %ecx
; X64-AVX2-NEXT: vpextrd $1, %xmm1, %eax
; X64-AVX2-NEXT: xorl %edx, %edx
; X64-AVX2-NEXT: divl %ecx
; X64-AVX2-NEXT: movl %edx, %ecx
; X64-AVX2-NEXT: vmovd %xmm2, %edi
; X64-AVX2-NEXT: vmovd %xmm1, %eax
; X64-AVX2-NEXT: xorl %edx, %edx
; X64-AVX2-NEXT: divl %edi
; X64-AVX2-NEXT: vmovd %edx, %xmm4
; X64-AVX2-NEXT: vpinsrd $1, %ecx, %xmm4, %xmm4
; X64-AVX2-NEXT: vpextrd $2, %xmm2, %ecx
; X64-AVX2-NEXT: vpextrd $2, %xmm1, %eax
; X64-AVX2-NEXT: xorl %edx, %edx
; X64-AVX2-NEXT: divl %ecx
; X64-AVX2-NEXT: vpinsrd $2, %edx, %xmm4, %xmm4
; X64-AVX2-NEXT: vpextrd $3, %xmm2, %ecx
; X64-AVX2-NEXT: vpextrd $3, %xmm1, %eax
; X64-AVX2-NEXT: xorl %edx, %edx
; X64-AVX2-NEXT: divl %ecx
; X64-AVX2-NEXT: vpinsrd $3, %edx, %xmm4, %xmm1
; X64-AVX2-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1
; X64-AVX2-NEXT: vmovd %xmm0, %eax
; X64-AVX2-NEXT: xorl %edx, %edx
; X64-AVX2-NEXT: divl 32(%rsi)
; X64-AVX2-NEXT: vpbroadcastd {{.*#+}} ymm0 = [8199,8199,8199,8199,8199,8199,8199,8199]
; X64-AVX2-NEXT: vpmulld %ymm0, %ymm1, %ymm0
; X64-AVX2-NEXT: imull $8199, %edx, %eax # imm = 0x2007
; X64-AVX2-NEXT: movl %eax, (%rax)
; X64-AVX2-NEXT: vmovdqa %ymm0, (%rax)
; X64-AVX2-NEXT: vzeroupper
; X64-AVX2-NEXT: retq
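; x86 has no vector integer division, so each lane of the <9 x i32> urem is
; extracted and divided with a scalar xorl/divl pair; only the multiply by
; the 8199 splat stays vectorized (pmuludq under SSE, vpmulld under AVX),
; with the ninth lane handled by a scalar divl and imull.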
  %a0 = load <9 x i16>, <9 x i16>* %p0, align 64
  %a1 = load <9 x i32>, <9 x i32>* %p1, align 64
  %ext0 = zext <9 x i16> %a0 to <9 x i32>
  %rem = urem <9 x i32> %ext0, %a1
  %mul = mul <9 x i32> <i32 8199, i32 8199, i32 8199, i32 8199, i32 8199, i32 8199, i32 8199, i32 8199, i32 8199>, %rem
  store <9 x i32> %mul, <9 x i32>* undef, align 64
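; Per-lane scalar equivalent of the computation above (an illustrative
; sketch; %x stands for one lane of %a0 and %y for the matching lane of %a1):
;   %ext = zext i16 %x to i32
;   %rem.s = urem i32 %ext, %y
;   %mul.s = mul i32 %rem.s, 8199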