1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X86 --check-prefix=X86-SSE
3 ; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=X86 --check-prefix=X86-AVX --check-prefix=X86-AVX1
4 ; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=X86 --check-prefix=X86-AVX --check-prefix=X86-AVX2
5 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X64 --check-prefix=X64-SSE
6 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX --check-prefix=X64-AVX1
7 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX --check-prefix=X64-AVX2
9 @c = external global i32*, align 8
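; The tests below share one shape: load two narrow vectors from %a and %b at
; %index, zero- or sign-extend them to <N x i32>, multiply, and store the
; result through the pointer loaded from @c. The autogenerated checks record
; the multiply sequence each target picks (pmullw/pmulhuw plus unpacks on
; SSE; pmaddwd, pmuludq, or pmulld on AVX).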
11 ; %val1 = load <2 x i8>
12 ; %op1 = zext<2 x i32> %val1
13 ; %val2 = load <2 x i8>
14 ; %op2 = zext<2 x i32> %val2
15 ; %rst = mul <2 x i32> %op1, %op2
17 define void @mul_2xi8(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) nounwind {
18 ; X86-SSE-LABEL: mul_2xi8:
19 ; X86-SSE: # %bb.0: # %entry
20 ; X86-SSE-NEXT: pushl %esi
21 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
22 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
23 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx
24 ; X86-SSE-NEXT: movl c, %esi
25 ; X86-SSE-NEXT: movzwl (%edx,%ecx), %edx
26 ; X86-SSE-NEXT: movd %edx, %xmm0
27 ; X86-SSE-NEXT: movzwl (%eax,%ecx), %eax
28 ; X86-SSE-NEXT: movd %eax, %xmm1
29 ; X86-SSE-NEXT: pxor %xmm2, %xmm2
30 ; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
31 ; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
32 ; X86-SSE-NEXT: pmullw %xmm0, %xmm1
33 ; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
34 ; X86-SSE-NEXT: movq %xmm1, (%esi,%ecx,4)
35 ; X86-SSE-NEXT: popl %esi
38 ; X86-AVX-LABEL: mul_2xi8:
39 ; X86-AVX: # %bb.0: # %entry
40 ; X86-AVX-NEXT: pushl %esi
41 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
42 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
43 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx
44 ; X86-AVX-NEXT: movl c, %esi
45 ; X86-AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
46 ; X86-AVX-NEXT: vpmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
47 ; X86-AVX-NEXT: vpmuludq %xmm0, %xmm1, %xmm0
48 ; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
49 ; X86-AVX-NEXT: vmovq %xmm0, (%esi,%ecx,4)
50 ; X86-AVX-NEXT: popl %esi
53 ; X64-SSE-LABEL: mul_2xi8:
54 ; X64-SSE: # %bb.0: # %entry
55 ; X64-SSE-NEXT: movq {{.*}}(%rip), %rax
56 ; X64-SSE-NEXT: movzwl (%rdi,%rdx), %ecx
57 ; X64-SSE-NEXT: movd %ecx, %xmm0
58 ; X64-SSE-NEXT: movzwl (%rsi,%rdx), %ecx
59 ; X64-SSE-NEXT: movd %ecx, %xmm1
60 ; X64-SSE-NEXT: pxor %xmm2, %xmm2
61 ; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
62 ; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
63 ; X64-SSE-NEXT: pmullw %xmm0, %xmm1
64 ; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
65 ; X64-SSE-NEXT: movq %xmm1, (%rax,%rdx,4)
68 ; X64-AVX-LABEL: mul_2xi8:
69 ; X64-AVX: # %bb.0: # %entry
70 ; X64-AVX-NEXT: movq {{.*}}(%rip), %rax
71 ; X64-AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
72 ; X64-AVX-NEXT: vpmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
73 ; X64-AVX-NEXT: vpmuludq %xmm0, %xmm1, %xmm0
74 ; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
75 ; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rdx,4)
; X64-AVX-NEXT: retq
entry:
78 %pre = load i32*, i32** @c
79 %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
80 %tmp7 = bitcast i8* %tmp6 to <2 x i8>*
81 %wide.load = load <2 x i8>, <2 x i8>* %tmp7, align 1
82 %tmp8 = zext <2 x i8> %wide.load to <2 x i32>
83 %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
84 %tmp11 = bitcast i8* %tmp10 to <2 x i8>*
85 %wide.load17 = load <2 x i8>, <2 x i8>* %tmp11, align 1
86 %tmp12 = zext <2 x i8> %wide.load17 to <2 x i32>
87 %tmp13 = mul nuw nsw <2 x i32> %tmp12, %tmp8
88 %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
89 %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
90 store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
  ret void
}
94 ; %val1 = load <4 x i8>
95 ; %op1 = zext<4 x i32> %val1
96 ; %val2 = load <4 x i8>
97 ; %op2 = zext<4 x i32> %val2
98 ; %rst = mul <4 x i32> %op1, %op2
100 define void @mul_4xi8(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) nounwind {
101 ; X86-SSE-LABEL: mul_4xi8:
102 ; X86-SSE: # %bb.0: # %entry
103 ; X86-SSE-NEXT: pushl %esi
104 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
105 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
106 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx
107 ; X86-SSE-NEXT: movl c, %esi
108 ; X86-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
109 ; X86-SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
110 ; X86-SSE-NEXT: pxor %xmm2, %xmm2
111 ; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
112 ; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
113 ; X86-SSE-NEXT: pmullw %xmm0, %xmm1
114 ; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
115 ; X86-SSE-NEXT: movdqu %xmm1, (%esi,%ecx,4)
116 ; X86-SSE-NEXT: popl %esi
119 ; X86-AVX-LABEL: mul_4xi8:
120 ; X86-AVX: # %bb.0: # %entry
121 ; X86-AVX-NEXT: pushl %esi
122 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
123 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
124 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx
125 ; X86-AVX-NEXT: movl c, %esi
126 ; X86-AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
127 ; X86-AVX-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
128 ; X86-AVX-NEXT: vpmaddwd %xmm0, %xmm1, %xmm0
129 ; X86-AVX-NEXT: vmovdqu %xmm0, (%esi,%ecx,4)
130 ; X86-AVX-NEXT: popl %esi
133 ; X64-SSE-LABEL: mul_4xi8:
134 ; X64-SSE: # %bb.0: # %entry
135 ; X64-SSE-NEXT: movq {{.*}}(%rip), %rax
136 ; X64-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
137 ; X64-SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
138 ; X64-SSE-NEXT: pxor %xmm2, %xmm2
139 ; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
140 ; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
141 ; X64-SSE-NEXT: pmullw %xmm0, %xmm1
142 ; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
143 ; X64-SSE-NEXT: movdqu %xmm1, (%rax,%rdx,4)
146 ; X64-AVX-LABEL: mul_4xi8:
147 ; X64-AVX: # %bb.0: # %entry
148 ; X64-AVX-NEXT: movq {{.*}}(%rip), %rax
149 ; X64-AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
150 ; X64-AVX-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
151 ; X64-AVX-NEXT: vpmaddwd %xmm0, %xmm1, %xmm0
152 ; X64-AVX-NEXT: vmovdqu %xmm0, (%rax,%rdx,4)
; X64-AVX-NEXT: retq
entry:
155 %pre = load i32*, i32** @c
156 %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
157 %tmp7 = bitcast i8* %tmp6 to <4 x i8>*
158 %wide.load = load <4 x i8>, <4 x i8>* %tmp7, align 1
159 %tmp8 = zext <4 x i8> %wide.load to <4 x i32>
160 %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
161 %tmp11 = bitcast i8* %tmp10 to <4 x i8>*
162 %wide.load17 = load <4 x i8>, <4 x i8>* %tmp11, align 1
163 %tmp12 = zext <4 x i8> %wide.load17 to <4 x i32>
164 %tmp13 = mul nuw nsw <4 x i32> %tmp12, %tmp8
165 %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
166 %tmp15 = bitcast i32* %tmp14 to <4 x i32>*
167 store <4 x i32> %tmp13, <4 x i32>* %tmp15, align 4
  ret void
}
171 ; %val1 = load <8 x i8>
172 ; %op1 = zext<8 x i32> %val1
173 ; %val2 = load <8 x i8>
174 ; %op2 = zext<8 x i32> %val2
175 ; %rst = mul <8 x i32> %op1, %op2
177 define void @mul_8xi8(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) nounwind {
178 ; X86-SSE-LABEL: mul_8xi8:
179 ; X86-SSE: # %bb.0: # %entry
180 ; X86-SSE-NEXT: pushl %esi
181 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
182 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
183 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx
184 ; X86-SSE-NEXT: movl c, %esi
185 ; X86-SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
186 ; X86-SSE-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
187 ; X86-SSE-NEXT: pxor %xmm2, %xmm2
188 ; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
189 ; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
190 ; X86-SSE-NEXT: pmullw %xmm0, %xmm1
191 ; X86-SSE-NEXT: movdqa %xmm1, %xmm0
192 ; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
193 ; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
194 ; X86-SSE-NEXT: movdqu %xmm1, 16(%esi,%ecx,4)
195 ; X86-SSE-NEXT: movdqu %xmm0, (%esi,%ecx,4)
196 ; X86-SSE-NEXT: popl %esi
199 ; X86-AVX1-LABEL: mul_8xi8:
200 ; X86-AVX1: # %bb.0: # %entry
201 ; X86-AVX1-NEXT: pushl %esi
202 ; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax
203 ; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %ecx
204 ; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %edx
205 ; X86-AVX1-NEXT: movl c, %esi
206 ; X86-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
207 ; X86-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
208 ; X86-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
209 ; X86-AVX1-NEXT: vpmaddwd %xmm0, %xmm2, %xmm0
210 ; X86-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
211 ; X86-AVX1-NEXT: vpmaddwd %xmm1, %xmm2, %xmm1
212 ; X86-AVX1-NEXT: vmovdqu %xmm0, 16(%esi,%ecx,4)
213 ; X86-AVX1-NEXT: vmovdqu %xmm1, (%esi,%ecx,4)
214 ; X86-AVX1-NEXT: popl %esi
215 ; X86-AVX1-NEXT: retl
217 ; X86-AVX2-LABEL: mul_8xi8:
218 ; X86-AVX2: # %bb.0: # %entry
219 ; X86-AVX2-NEXT: pushl %esi
220 ; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax
221 ; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %ecx
222 ; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %edx
223 ; X86-AVX2-NEXT: movl c, %esi
224 ; X86-AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
225 ; X86-AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
226 ; X86-AVX2-NEXT: vpmaddwd %ymm0, %ymm1, %ymm0
227 ; X86-AVX2-NEXT: vmovdqu %ymm0, (%esi,%ecx,4)
228 ; X86-AVX2-NEXT: popl %esi
229 ; X86-AVX2-NEXT: vzeroupper
230 ; X86-AVX2-NEXT: retl
232 ; X64-SSE-LABEL: mul_8xi8:
233 ; X64-SSE: # %bb.0: # %entry
234 ; X64-SSE-NEXT: movq {{.*}}(%rip), %rax
235 ; X64-SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
236 ; X64-SSE-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
237 ; X64-SSE-NEXT: pxor %xmm2, %xmm2
238 ; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
239 ; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
240 ; X64-SSE-NEXT: pmullw %xmm0, %xmm1
241 ; X64-SSE-NEXT: movdqa %xmm1, %xmm0
242 ; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
243 ; X64-SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
244 ; X64-SSE-NEXT: movdqu %xmm1, 16(%rax,%rdx,4)
245 ; X64-SSE-NEXT: movdqu %xmm0, (%rax,%rdx,4)
248 ; X64-AVX1-LABEL: mul_8xi8:
249 ; X64-AVX1: # %bb.0: # %entry
250 ; X64-AVX1-NEXT: movq {{.*}}(%rip), %rax
251 ; X64-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
252 ; X64-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
253 ; X64-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
254 ; X64-AVX1-NEXT: vpmaddwd %xmm0, %xmm2, %xmm0
255 ; X64-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
256 ; X64-AVX1-NEXT: vpmaddwd %xmm1, %xmm2, %xmm1
257 ; X64-AVX1-NEXT: vmovdqu %xmm0, 16(%rax,%rdx,4)
258 ; X64-AVX1-NEXT: vmovdqu %xmm1, (%rax,%rdx,4)
259 ; X64-AVX1-NEXT: retq
261 ; X64-AVX2-LABEL: mul_8xi8:
262 ; X64-AVX2: # %bb.0: # %entry
263 ; X64-AVX2-NEXT: movq {{.*}}(%rip), %rax
264 ; X64-AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
265 ; X64-AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
266 ; X64-AVX2-NEXT: vpmaddwd %ymm0, %ymm1, %ymm0
267 ; X64-AVX2-NEXT: vmovdqu %ymm0, (%rax,%rdx,4)
268 ; X64-AVX2-NEXT: vzeroupper
269 ; X64-AVX2-NEXT: retq
entry:
271 %pre = load i32*, i32** @c
272 %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
273 %tmp7 = bitcast i8* %tmp6 to <8 x i8>*
274 %wide.load = load <8 x i8>, <8 x i8>* %tmp7, align 1
275 %tmp8 = zext <8 x i8> %wide.load to <8 x i32>
276 %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
277 %tmp11 = bitcast i8* %tmp10 to <8 x i8>*
278 %wide.load17 = load <8 x i8>, <8 x i8>* %tmp11, align 1
279 %tmp12 = zext <8 x i8> %wide.load17 to <8 x i32>
280 %tmp13 = mul nuw nsw <8 x i32> %tmp12, %tmp8
281 %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
282 %tmp15 = bitcast i32* %tmp14 to <8 x i32>*
283 store <8 x i32> %tmp13, <8 x i32>* %tmp15, align 4
  ret void
}
287 ; %val1 = load <16 x i8>
288 ; %op1 = zext<16 x i32> %val1
289 ; %val2 = load <16 x i8>
290 ; %op2 = zext<16 x i32> %val2
291 ; %rst = mul <16 x i32> %op1, %op2
293 define void @mul_16xi8(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) nounwind {
294 ; X86-SSE-LABEL: mul_16xi8:
295 ; X86-SSE: # %bb.0: # %entry
296 ; X86-SSE-NEXT: pushl %esi
297 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
298 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
299 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx
300 ; X86-SSE-NEXT: movl c, %esi
301 ; X86-SSE-NEXT: movdqu (%edx,%ecx), %xmm0
302 ; X86-SSE-NEXT: movdqu (%eax,%ecx), %xmm1
303 ; X86-SSE-NEXT: pxor %xmm2, %xmm2
304 ; X86-SSE-NEXT: movdqa %xmm0, %xmm3
305 ; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
306 ; X86-SSE-NEXT: movdqa %xmm1, %xmm4
307 ; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
308 ; X86-SSE-NEXT: pmullw %xmm3, %xmm4
309 ; X86-SSE-NEXT: movdqa %xmm4, %xmm3
310 ; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
311 ; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
312 ; X86-SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
313 ; X86-SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
314 ; X86-SSE-NEXT: pmullw %xmm0, %xmm1
315 ; X86-SSE-NEXT: movdqa %xmm1, %xmm0
316 ; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
317 ; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
318 ; X86-SSE-NEXT: movdqu %xmm1, 48(%esi,%ecx,4)
319 ; X86-SSE-NEXT: movdqu %xmm0, 32(%esi,%ecx,4)
320 ; X86-SSE-NEXT: movdqu %xmm4, 16(%esi,%ecx,4)
321 ; X86-SSE-NEXT: movdqu %xmm3, (%esi,%ecx,4)
322 ; X86-SSE-NEXT: popl %esi
325 ; X86-AVX1-LABEL: mul_16xi8:
326 ; X86-AVX1: # %bb.0: # %entry
327 ; X86-AVX1-NEXT: pushl %esi
328 ; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax
329 ; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %ecx
330 ; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %edx
331 ; X86-AVX1-NEXT: movl c, %esi
332 ; X86-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
333 ; X86-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
334 ; X86-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
335 ; X86-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
336 ; X86-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
337 ; X86-AVX1-NEXT: vpmaddwd %xmm0, %xmm4, %xmm0
338 ; X86-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
339 ; X86-AVX1-NEXT: vpmaddwd %xmm1, %xmm4, %xmm1
340 ; X86-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
341 ; X86-AVX1-NEXT: vpmaddwd %xmm2, %xmm4, %xmm2
342 ; X86-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
343 ; X86-AVX1-NEXT: vpmaddwd %xmm3, %xmm4, %xmm3
344 ; X86-AVX1-NEXT: vmovdqu %xmm0, 48(%esi,%ecx,4)
345 ; X86-AVX1-NEXT: vmovdqu %xmm1, 32(%esi,%ecx,4)
346 ; X86-AVX1-NEXT: vmovdqu %xmm2, 16(%esi,%ecx,4)
347 ; X86-AVX1-NEXT: vmovdqu %xmm3, (%esi,%ecx,4)
348 ; X86-AVX1-NEXT: popl %esi
349 ; X86-AVX1-NEXT: retl
351 ; X86-AVX2-LABEL: mul_16xi8:
352 ; X86-AVX2: # %bb.0: # %entry
353 ; X86-AVX2-NEXT: pushl %esi
354 ; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax
355 ; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %ecx
356 ; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %edx
357 ; X86-AVX2-NEXT: movl c, %esi
358 ; X86-AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
359 ; X86-AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
360 ; X86-AVX2-NEXT: vpmovzxbd {{.*#+}} ymm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
361 ; X86-AVX2-NEXT: vpmaddwd %ymm0, %ymm2, %ymm0
362 ; X86-AVX2-NEXT: vpmovzxbd {{.*#+}} ymm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
363 ; X86-AVX2-NEXT: vpmaddwd %ymm1, %ymm2, %ymm1
364 ; X86-AVX2-NEXT: vmovdqu %ymm0, 32(%esi,%ecx,4)
365 ; X86-AVX2-NEXT: vmovdqu %ymm1, (%esi,%ecx,4)
366 ; X86-AVX2-NEXT: popl %esi
367 ; X86-AVX2-NEXT: vzeroupper
368 ; X86-AVX2-NEXT: retl
370 ; X64-SSE-LABEL: mul_16xi8:
371 ; X64-SSE: # %bb.0: # %entry
372 ; X64-SSE-NEXT: movq {{.*}}(%rip), %rax
373 ; X64-SSE-NEXT: movdqu (%rdi,%rdx), %xmm0
374 ; X64-SSE-NEXT: movdqu (%rsi,%rdx), %xmm1
375 ; X64-SSE-NEXT: pxor %xmm2, %xmm2
376 ; X64-SSE-NEXT: movdqa %xmm0, %xmm3
377 ; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
378 ; X64-SSE-NEXT: movdqa %xmm1, %xmm4
379 ; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
380 ; X64-SSE-NEXT: pmullw %xmm3, %xmm4
381 ; X64-SSE-NEXT: movdqa %xmm4, %xmm3
382 ; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
383 ; X64-SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
384 ; X64-SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
385 ; X64-SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
386 ; X64-SSE-NEXT: pmullw %xmm0, %xmm1
387 ; X64-SSE-NEXT: movdqa %xmm1, %xmm0
388 ; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
389 ; X64-SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
390 ; X64-SSE-NEXT: movdqu %xmm1, 48(%rax,%rdx,4)
391 ; X64-SSE-NEXT: movdqu %xmm0, 32(%rax,%rdx,4)
392 ; X64-SSE-NEXT: movdqu %xmm4, 16(%rax,%rdx,4)
393 ; X64-SSE-NEXT: movdqu %xmm3, (%rax,%rdx,4)
396 ; X64-AVX1-LABEL: mul_16xi8:
397 ; X64-AVX1: # %bb.0: # %entry
398 ; X64-AVX1-NEXT: movq {{.*}}(%rip), %rax
399 ; X64-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
400 ; X64-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
401 ; X64-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
402 ; X64-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
403 ; X64-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
404 ; X64-AVX1-NEXT: vpmaddwd %xmm0, %xmm4, %xmm0
405 ; X64-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
406 ; X64-AVX1-NEXT: vpmaddwd %xmm1, %xmm4, %xmm1
407 ; X64-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
408 ; X64-AVX1-NEXT: vpmaddwd %xmm2, %xmm4, %xmm2
409 ; X64-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
410 ; X64-AVX1-NEXT: vpmaddwd %xmm3, %xmm4, %xmm3
411 ; X64-AVX1-NEXT: vmovdqu %xmm0, 48(%rax,%rdx,4)
412 ; X64-AVX1-NEXT: vmovdqu %xmm1, 32(%rax,%rdx,4)
413 ; X64-AVX1-NEXT: vmovdqu %xmm2, 16(%rax,%rdx,4)
414 ; X64-AVX1-NEXT: vmovdqu %xmm3, (%rax,%rdx,4)
415 ; X64-AVX1-NEXT: retq
417 ; X64-AVX2-LABEL: mul_16xi8:
418 ; X64-AVX2: # %bb.0: # %entry
419 ; X64-AVX2-NEXT: movq {{.*}}(%rip), %rax
420 ; X64-AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
421 ; X64-AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
422 ; X64-AVX2-NEXT: vpmovzxbd {{.*#+}} ymm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
423 ; X64-AVX2-NEXT: vpmaddwd %ymm0, %ymm2, %ymm0
424 ; X64-AVX2-NEXT: vpmovzxbd {{.*#+}} ymm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
425 ; X64-AVX2-NEXT: vpmaddwd %ymm1, %ymm2, %ymm1
426 ; X64-AVX2-NEXT: vmovdqu %ymm0, 32(%rax,%rdx,4)
427 ; X64-AVX2-NEXT: vmovdqu %ymm1, (%rax,%rdx,4)
428 ; X64-AVX2-NEXT: vzeroupper
429 ; X64-AVX2-NEXT: retq
entry:
431 %pre = load i32*, i32** @c
432 %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
433 %tmp7 = bitcast i8* %tmp6 to <16 x i8>*
434 %wide.load = load <16 x i8>, <16 x i8>* %tmp7, align 1
435 %tmp8 = zext <16 x i8> %wide.load to <16 x i32>
436 %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
437 %tmp11 = bitcast i8* %tmp10 to <16 x i8>*
438 %wide.load17 = load <16 x i8>, <16 x i8>* %tmp11, align 1
439 %tmp12 = zext <16 x i8> %wide.load17 to <16 x i32>
440 %tmp13 = mul nuw nsw <16 x i32> %tmp12, %tmp8
441 %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
442 %tmp15 = bitcast i32* %tmp14 to <16 x i32>*
443 store <16 x i32> %tmp13, <16 x i32>* %tmp15, align 4
  ret void
}
447 ; %val1 = load <2 x i16>
448 ; %op1 = zext<2 x i32> %val1
449 ; %val2 = load <2 x i16>
450 ; %op2 = zext<2 x i32> %val2
451 ; %rst = mul <2 x i32> %op1, %op2
453 define void @mul_2xi16(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) nounwind {
454 ; X86-SSE-LABEL: mul_2xi16:
455 ; X86-SSE: # %bb.0: # %entry
456 ; X86-SSE-NEXT: pushl %esi
457 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
458 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
459 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx
460 ; X86-SSE-NEXT: movl c, %esi
461 ; X86-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
462 ; X86-SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
463 ; X86-SSE-NEXT: movdqa %xmm1, %xmm2
464 ; X86-SSE-NEXT: pmulhuw %xmm0, %xmm2
465 ; X86-SSE-NEXT: pmullw %xmm0, %xmm1
466 ; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
467 ; X86-SSE-NEXT: movq %xmm1, (%esi,%ecx,4)
468 ; X86-SSE-NEXT: popl %esi
471 ; X86-AVX-LABEL: mul_2xi16:
472 ; X86-AVX: # %bb.0: # %entry
473 ; X86-AVX-NEXT: pushl %esi
474 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
475 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
476 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx
477 ; X86-AVX-NEXT: movl c, %esi
478 ; X86-AVX-NEXT: vpmovzxwq {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero
479 ; X86-AVX-NEXT: vpmovzxwq {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero
480 ; X86-AVX-NEXT: vpmuludq %xmm0, %xmm1, %xmm0
481 ; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
482 ; X86-AVX-NEXT: vmovq %xmm0, (%esi,%ecx,4)
483 ; X86-AVX-NEXT: popl %esi
486 ; X64-SSE-LABEL: mul_2xi16:
487 ; X64-SSE: # %bb.0: # %entry
488 ; X64-SSE-NEXT: movq {{.*}}(%rip), %rax
489 ; X64-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
490 ; X64-SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
491 ; X64-SSE-NEXT: movdqa %xmm1, %xmm2
492 ; X64-SSE-NEXT: pmulhuw %xmm0, %xmm2
493 ; X64-SSE-NEXT: pmullw %xmm0, %xmm1
494 ; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
495 ; X64-SSE-NEXT: movq %xmm1, (%rax,%rdx,4)
498 ; X64-AVX-LABEL: mul_2xi16:
499 ; X64-AVX: # %bb.0: # %entry
500 ; X64-AVX-NEXT: movq {{.*}}(%rip), %rax
501 ; X64-AVX-NEXT: vpmovzxwq {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero
502 ; X64-AVX-NEXT: vpmovzxwq {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero
503 ; X64-AVX-NEXT: vpmuludq %xmm0, %xmm1, %xmm0
504 ; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
505 ; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rdx,4)
; X64-AVX-NEXT: retq
entry:
508 %pre = load i32*, i32** @c
509 %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
510 %tmp7 = bitcast i8* %tmp6 to <2 x i16>*
511 %wide.load = load <2 x i16>, <2 x i16>* %tmp7, align 1
512 %tmp8 = zext <2 x i16> %wide.load to <2 x i32>
513 %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
514 %tmp11 = bitcast i8* %tmp10 to <2 x i16>*
515 %wide.load17 = load <2 x i16>, <2 x i16>* %tmp11, align 1
516 %tmp12 = zext <2 x i16> %wide.load17 to <2 x i32>
517 %tmp13 = mul nuw nsw <2 x i32> %tmp12, %tmp8
518 %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
519 %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
520 store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
  ret void
}
524 ; %val1 = load <4 x i16>
525 ; %op1 = zext<4 x i32> %val1
526 ; %val2 = load <4 x i16>
527 ; %op2 = zext<4 x i32> %val2
528 ; %rst = mul <4 x i32> %op1, %op2
530 define void @mul_4xi16(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) nounwind {
531 ; X86-SSE-LABEL: mul_4xi16:
532 ; X86-SSE: # %bb.0: # %entry
533 ; X86-SSE-NEXT: pushl %esi
534 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
535 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
536 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx
537 ; X86-SSE-NEXT: movl c, %esi
538 ; X86-SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
539 ; X86-SSE-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
540 ; X86-SSE-NEXT: movdqa %xmm1, %xmm2
541 ; X86-SSE-NEXT: pmulhuw %xmm0, %xmm2
542 ; X86-SSE-NEXT: pmullw %xmm0, %xmm1
543 ; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
544 ; X86-SSE-NEXT: movdqu %xmm1, (%esi,%ecx,4)
545 ; X86-SSE-NEXT: popl %esi
548 ; X86-AVX-LABEL: mul_4xi16:
549 ; X86-AVX: # %bb.0: # %entry
550 ; X86-AVX-NEXT: pushl %esi
551 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
552 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
553 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx
554 ; X86-AVX-NEXT: movl c, %esi
555 ; X86-AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
556 ; X86-AVX-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
557 ; X86-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0
558 ; X86-AVX-NEXT: vmovdqu %xmm0, (%esi,%ecx,4)
559 ; X86-AVX-NEXT: popl %esi
562 ; X64-SSE-LABEL: mul_4xi16:
563 ; X64-SSE: # %bb.0: # %entry
564 ; X64-SSE-NEXT: movq {{.*}}(%rip), %rax
565 ; X64-SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
566 ; X64-SSE-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
567 ; X64-SSE-NEXT: movdqa %xmm1, %xmm2
568 ; X64-SSE-NEXT: pmulhuw %xmm0, %xmm2
569 ; X64-SSE-NEXT: pmullw %xmm0, %xmm1
570 ; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
571 ; X64-SSE-NEXT: movdqu %xmm1, (%rax,%rdx,4)
574 ; X64-AVX-LABEL: mul_4xi16:
575 ; X64-AVX: # %bb.0: # %entry
576 ; X64-AVX-NEXT: movq {{.*}}(%rip), %rax
577 ; X64-AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
578 ; X64-AVX-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
579 ; X64-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0
580 ; X64-AVX-NEXT: vmovdqu %xmm0, (%rax,%rdx,4)
; X64-AVX-NEXT: retq
entry:
583 %pre = load i32*, i32** @c
584 %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
585 %tmp7 = bitcast i8* %tmp6 to <4 x i16>*
586 %wide.load = load <4 x i16>, <4 x i16>* %tmp7, align 1
587 %tmp8 = zext <4 x i16> %wide.load to <4 x i32>
588 %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
589 %tmp11 = bitcast i8* %tmp10 to <4 x i16>*
590 %wide.load17 = load <4 x i16>, <4 x i16>* %tmp11, align 1
591 %tmp12 = zext <4 x i16> %wide.load17 to <4 x i32>
592 %tmp13 = mul nuw nsw <4 x i32> %tmp12, %tmp8
593 %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
594 %tmp15 = bitcast i32* %tmp14 to <4 x i32>*
595 store <4 x i32> %tmp13, <4 x i32>* %tmp15, align 4
  ret void
}
599 ; %val1 = load <8 x i16>
600 ; %op1 = zext<8 x i32> %val1
601 ; %val2 = load <8 x i16>
602 ; %op2 = zext<8 x i32> %val2
603 ; %rst = mul <8 x i32> %op1, %op2
605 define void @mul_8xi16(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) nounwind {
606 ; X86-SSE-LABEL: mul_8xi16:
607 ; X86-SSE: # %bb.0: # %entry
608 ; X86-SSE-NEXT: pushl %esi
609 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
610 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
611 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx
612 ; X86-SSE-NEXT: movl c, %esi
613 ; X86-SSE-NEXT: movdqu (%edx,%ecx), %xmm0
614 ; X86-SSE-NEXT: movdqu (%eax,%ecx), %xmm1
615 ; X86-SSE-NEXT: movdqa %xmm1, %xmm2
616 ; X86-SSE-NEXT: pmulhuw %xmm0, %xmm2
617 ; X86-SSE-NEXT: pmullw %xmm0, %xmm1
618 ; X86-SSE-NEXT: movdqa %xmm1, %xmm0
619 ; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
620 ; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
621 ; X86-SSE-NEXT: movdqu %xmm1, 16(%esi,%ecx,4)
622 ; X86-SSE-NEXT: movdqu %xmm0, (%esi,%ecx,4)
623 ; X86-SSE-NEXT: popl %esi
626 ; X86-AVX1-LABEL: mul_8xi16:
627 ; X86-AVX1: # %bb.0: # %entry
628 ; X86-AVX1-NEXT: pushl %esi
629 ; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax
630 ; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %ecx
631 ; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %edx
632 ; X86-AVX1-NEXT: movl c, %esi
633 ; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
634 ; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
635 ; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
636 ; X86-AVX1-NEXT: vpmulld %xmm0, %xmm2, %xmm0
637 ; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
638 ; X86-AVX1-NEXT: vpmulld %xmm1, %xmm2, %xmm1
639 ; X86-AVX1-NEXT: vmovdqu %xmm0, 16(%esi,%ecx,4)
640 ; X86-AVX1-NEXT: vmovdqu %xmm1, (%esi,%ecx,4)
641 ; X86-AVX1-NEXT: popl %esi
642 ; X86-AVX1-NEXT: retl
644 ; X86-AVX2-LABEL: mul_8xi16:
645 ; X86-AVX2: # %bb.0: # %entry
646 ; X86-AVX2-NEXT: pushl %esi
647 ; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax
648 ; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %ecx
649 ; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %edx
650 ; X86-AVX2-NEXT: movl c, %esi
651 ; X86-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
652 ; X86-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
653 ; X86-AVX2-NEXT: vpmulld %ymm0, %ymm1, %ymm0
654 ; X86-AVX2-NEXT: vmovdqu %ymm0, (%esi,%ecx,4)
655 ; X86-AVX2-NEXT: popl %esi
656 ; X86-AVX2-NEXT: vzeroupper
657 ; X86-AVX2-NEXT: retl
659 ; X64-SSE-LABEL: mul_8xi16:
660 ; X64-SSE: # %bb.0: # %entry
661 ; X64-SSE-NEXT: movq {{.*}}(%rip), %rax
662 ; X64-SSE-NEXT: movdqu (%rdi,%rdx), %xmm0
663 ; X64-SSE-NEXT: movdqu (%rsi,%rdx), %xmm1
664 ; X64-SSE-NEXT: movdqa %xmm1, %xmm2
665 ; X64-SSE-NEXT: pmulhuw %xmm0, %xmm2
666 ; X64-SSE-NEXT: pmullw %xmm0, %xmm1
667 ; X64-SSE-NEXT: movdqa %xmm1, %xmm0
668 ; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
669 ; X64-SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
670 ; X64-SSE-NEXT: movdqu %xmm1, 16(%rax,%rdx,4)
671 ; X64-SSE-NEXT: movdqu %xmm0, (%rax,%rdx,4)
674 ; X64-AVX1-LABEL: mul_8xi16:
675 ; X64-AVX1: # %bb.0: # %entry
676 ; X64-AVX1-NEXT: movq {{.*}}(%rip), %rax
677 ; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
678 ; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
679 ; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
680 ; X64-AVX1-NEXT: vpmulld %xmm0, %xmm2, %xmm0
681 ; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
682 ; X64-AVX1-NEXT: vpmulld %xmm1, %xmm2, %xmm1
683 ; X64-AVX1-NEXT: vmovdqu %xmm0, 16(%rax,%rdx,4)
684 ; X64-AVX1-NEXT: vmovdqu %xmm1, (%rax,%rdx,4)
685 ; X64-AVX1-NEXT: retq
687 ; X64-AVX2-LABEL: mul_8xi16:
688 ; X64-AVX2: # %bb.0: # %entry
689 ; X64-AVX2-NEXT: movq {{.*}}(%rip), %rax
690 ; X64-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
691 ; X64-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
692 ; X64-AVX2-NEXT: vpmulld %ymm0, %ymm1, %ymm0
693 ; X64-AVX2-NEXT: vmovdqu %ymm0, (%rax,%rdx,4)
694 ; X64-AVX2-NEXT: vzeroupper
695 ; X64-AVX2-NEXT: retq
entry:
697 %pre = load i32*, i32** @c
698 %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
699 %tmp7 = bitcast i8* %tmp6 to <8 x i16>*
700 %wide.load = load <8 x i16>, <8 x i16>* %tmp7, align 1
701 %tmp8 = zext <8 x i16> %wide.load to <8 x i32>
702 %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
703 %tmp11 = bitcast i8* %tmp10 to <8 x i16>*
704 %wide.load17 = load <8 x i16>, <8 x i16>* %tmp11, align 1
705 %tmp12 = zext <8 x i16> %wide.load17 to <8 x i32>
706 %tmp13 = mul nuw nsw <8 x i32> %tmp12, %tmp8
707 %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
708 %tmp15 = bitcast i32* %tmp14 to <8 x i32>*
709 store <8 x i32> %tmp13, <8 x i32>* %tmp15, align 4
  ret void
}
713 ; %val1 = load <16 x i16>
714 ; %op1 = zext<16 x i32> %val1
715 ; %val2 = load <16 x i16>
716 ; %op2 = zext<16 x i32> %val2
717 ; %rst = mul <16 x i32> %op1, %op2
719 define void @mul_16xi16(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) nounwind {
720 ; X86-SSE-LABEL: mul_16xi16:
721 ; X86-SSE: # %bb.0: # %entry
722 ; X86-SSE-NEXT: pushl %esi
723 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
724 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
725 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx
726 ; X86-SSE-NEXT: movl c, %esi
727 ; X86-SSE-NEXT: movdqu (%edx,%ecx), %xmm0
728 ; X86-SSE-NEXT: movdqu 16(%edx,%ecx), %xmm1
729 ; X86-SSE-NEXT: movdqu (%eax,%ecx), %xmm2
730 ; X86-SSE-NEXT: movdqu 16(%eax,%ecx), %xmm3
731 ; X86-SSE-NEXT: movdqa %xmm2, %xmm4
732 ; X86-SSE-NEXT: pmulhuw %xmm0, %xmm4
733 ; X86-SSE-NEXT: pmullw %xmm0, %xmm2
734 ; X86-SSE-NEXT: movdqa %xmm2, %xmm0
735 ; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
736 ; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
737 ; X86-SSE-NEXT: movdqa %xmm3, %xmm4
738 ; X86-SSE-NEXT: pmulhuw %xmm1, %xmm4
739 ; X86-SSE-NEXT: pmullw %xmm1, %xmm3
740 ; X86-SSE-NEXT: movdqa %xmm3, %xmm1
741 ; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
742 ; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
743 ; X86-SSE-NEXT: movdqu %xmm3, 32(%esi,%ecx,4)
744 ; X86-SSE-NEXT: movdqu %xmm1, 48(%esi,%ecx,4)
745 ; X86-SSE-NEXT: movdqu %xmm2, (%esi,%ecx,4)
746 ; X86-SSE-NEXT: movdqu %xmm0, 16(%esi,%ecx,4)
747 ; X86-SSE-NEXT: popl %esi
750 ; X86-AVX1-LABEL: mul_16xi16:
751 ; X86-AVX1: # %bb.0: # %entry
752 ; X86-AVX1-NEXT: pushl %esi
753 ; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax
754 ; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %ecx
755 ; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %edx
756 ; X86-AVX1-NEXT: movl c, %esi
757 ; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
758 ; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
759 ; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
760 ; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
761 ; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
762 ; X86-AVX1-NEXT: vpmulld %xmm0, %xmm4, %xmm0
763 ; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
764 ; X86-AVX1-NEXT: vpmulld %xmm1, %xmm4, %xmm1
765 ; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
766 ; X86-AVX1-NEXT: vpmulld %xmm2, %xmm4, %xmm2
767 ; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
768 ; X86-AVX1-NEXT: vpmulld %xmm3, %xmm4, %xmm3
769 ; X86-AVX1-NEXT: vmovdqu %xmm0, 48(%esi,%ecx,4)
770 ; X86-AVX1-NEXT: vmovdqu %xmm1, 32(%esi,%ecx,4)
771 ; X86-AVX1-NEXT: vmovdqu %xmm2, 16(%esi,%ecx,4)
772 ; X86-AVX1-NEXT: vmovdqu %xmm3, (%esi,%ecx,4)
773 ; X86-AVX1-NEXT: popl %esi
774 ; X86-AVX1-NEXT: retl
776 ; X86-AVX2-LABEL: mul_16xi16:
777 ; X86-AVX2: # %bb.0: # %entry
778 ; X86-AVX2-NEXT: pushl %esi
779 ; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax
780 ; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %ecx
781 ; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %edx
782 ; X86-AVX2-NEXT: movl c, %esi
783 ; X86-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
784 ; X86-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
785 ; X86-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
786 ; X86-AVX2-NEXT: vpmulld %ymm0, %ymm2, %ymm0
787 ; X86-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
788 ; X86-AVX2-NEXT: vpmulld %ymm1, %ymm2, %ymm1
789 ; X86-AVX2-NEXT: vmovdqu %ymm0, 32(%esi,%ecx,4)
790 ; X86-AVX2-NEXT: vmovdqu %ymm1, (%esi,%ecx,4)
791 ; X86-AVX2-NEXT: popl %esi
792 ; X86-AVX2-NEXT: vzeroupper
793 ; X86-AVX2-NEXT: retl
795 ; X64-SSE-LABEL: mul_16xi16:
796 ; X64-SSE: # %bb.0: # %entry
797 ; X64-SSE-NEXT: movq {{.*}}(%rip), %rax
798 ; X64-SSE-NEXT: movdqu (%rdi,%rdx), %xmm0
799 ; X64-SSE-NEXT: movdqu 16(%rdi,%rdx), %xmm1
800 ; X64-SSE-NEXT: movdqu (%rsi,%rdx), %xmm2
801 ; X64-SSE-NEXT: movdqu 16(%rsi,%rdx), %xmm3
802 ; X64-SSE-NEXT: movdqa %xmm2, %xmm4
803 ; X64-SSE-NEXT: pmulhuw %xmm0, %xmm4
804 ; X64-SSE-NEXT: pmullw %xmm0, %xmm2
805 ; X64-SSE-NEXT: movdqa %xmm2, %xmm0
806 ; X64-SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
807 ; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
808 ; X64-SSE-NEXT: movdqa %xmm3, %xmm4
809 ; X64-SSE-NEXT: pmulhuw %xmm1, %xmm4
810 ; X64-SSE-NEXT: pmullw %xmm1, %xmm3
811 ; X64-SSE-NEXT: movdqa %xmm3, %xmm1
812 ; X64-SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
813 ; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
814 ; X64-SSE-NEXT: movdqu %xmm3, 32(%rax,%rdx,4)
815 ; X64-SSE-NEXT: movdqu %xmm1, 48(%rax,%rdx,4)
816 ; X64-SSE-NEXT: movdqu %xmm2, (%rax,%rdx,4)
817 ; X64-SSE-NEXT: movdqu %xmm0, 16(%rax,%rdx,4)
820 ; X64-AVX1-LABEL: mul_16xi16:
821 ; X64-AVX1: # %bb.0: # %entry
822 ; X64-AVX1-NEXT: movq {{.*}}(%rip), %rax
823 ; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
824 ; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
825 ; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
826 ; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
827 ; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
828 ; X64-AVX1-NEXT: vpmulld %xmm0, %xmm4, %xmm0
829 ; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
830 ; X64-AVX1-NEXT: vpmulld %xmm1, %xmm4, %xmm1
831 ; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
832 ; X64-AVX1-NEXT: vpmulld %xmm2, %xmm4, %xmm2
833 ; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
834 ; X64-AVX1-NEXT: vpmulld %xmm3, %xmm4, %xmm3
835 ; X64-AVX1-NEXT: vmovdqu %xmm0, 48(%rax,%rdx,4)
836 ; X64-AVX1-NEXT: vmovdqu %xmm1, 32(%rax,%rdx,4)
837 ; X64-AVX1-NEXT: vmovdqu %xmm2, 16(%rax,%rdx,4)
838 ; X64-AVX1-NEXT: vmovdqu %xmm3, (%rax,%rdx,4)
839 ; X64-AVX1-NEXT: retq
841 ; X64-AVX2-LABEL: mul_16xi16:
842 ; X64-AVX2: # %bb.0: # %entry
843 ; X64-AVX2-NEXT: movq {{.*}}(%rip), %rax
844 ; X64-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
845 ; X64-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
846 ; X64-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
847 ; X64-AVX2-NEXT: vpmulld %ymm0, %ymm2, %ymm0
848 ; X64-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
849 ; X64-AVX2-NEXT: vpmulld %ymm1, %ymm2, %ymm1
850 ; X64-AVX2-NEXT: vmovdqu %ymm0, 32(%rax,%rdx,4)
851 ; X64-AVX2-NEXT: vmovdqu %ymm1, (%rax,%rdx,4)
852 ; X64-AVX2-NEXT: vzeroupper
853 ; X64-AVX2-NEXT: retq
entry:
855 %pre = load i32*, i32** @c
856 %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
857 %tmp7 = bitcast i8* %tmp6 to <16 x i16>*
858 %wide.load = load <16 x i16>, <16 x i16>* %tmp7, align 1
859 %tmp8 = zext <16 x i16> %wide.load to <16 x i32>
860 %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
861 %tmp11 = bitcast i8* %tmp10 to <16 x i16>*
862 %wide.load17 = load <16 x i16>, <16 x i16>* %tmp11, align 1
863 %tmp12 = zext <16 x i16> %wide.load17 to <16 x i32>
864 %tmp13 = mul nuw nsw <16 x i32> %tmp12, %tmp8
865 %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
866 %tmp15 = bitcast i32* %tmp14 to <16 x i32>*
867 store <16 x i32> %tmp13, <16 x i32>* %tmp15, align 4
  ret void
}
871 ; %val1 = load <2 x i8>
872 ; %op1 = sext<2 x i32> %val1
873 ; %val2 = load <2 x i8>
874 ; %op2 = sext<2 x i32> %val2
875 ; %rst = mul <2 x i32> %op1, %op2
877 define void @mul_2xi8_sext(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) nounwind {
878 ; X86-SSE-LABEL: mul_2xi8_sext:
879 ; X86-SSE: # %bb.0: # %entry
880 ; X86-SSE-NEXT: pushl %esi
881 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
882 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
883 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx
884 ; X86-SSE-NEXT: movl c, %esi
885 ; X86-SSE-NEXT: movzwl (%edx,%ecx), %edx
886 ; X86-SSE-NEXT: movd %edx, %xmm0
887 ; X86-SSE-NEXT: movzwl (%eax,%ecx), %eax
888 ; X86-SSE-NEXT: movd %eax, %xmm1
889 ; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
890 ; X86-SSE-NEXT: psraw $8, %xmm0
891 ; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
892 ; X86-SSE-NEXT: psraw $8, %xmm1
893 ; X86-SSE-NEXT: pmullw %xmm0, %xmm1
894 ; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
895 ; X86-SSE-NEXT: psrad $16, %xmm0
896 ; X86-SSE-NEXT: movq %xmm0, (%esi,%ecx,4)
897 ; X86-SSE-NEXT: popl %esi
900 ; X86-AVX-LABEL: mul_2xi8_sext:
901 ; X86-AVX: # %bb.0: # %entry
902 ; X86-AVX-NEXT: pushl %esi
903 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
904 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
905 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx
906 ; X86-AVX-NEXT: movl c, %esi
907 ; X86-AVX-NEXT: vpmovsxbq (%edx,%ecx), %xmm0
908 ; X86-AVX-NEXT: vpmovsxbq (%eax,%ecx), %xmm1
909 ; X86-AVX-NEXT: vpmuludq %xmm0, %xmm1, %xmm0
910 ; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
911 ; X86-AVX-NEXT: vmovq %xmm0, (%esi,%ecx,4)
912 ; X86-AVX-NEXT: popl %esi
915 ; X64-SSE-LABEL: mul_2xi8_sext:
916 ; X64-SSE: # %bb.0: # %entry
917 ; X64-SSE-NEXT: movq {{.*}}(%rip), %rax
918 ; X64-SSE-NEXT: movzwl (%rdi,%rdx), %ecx
919 ; X64-SSE-NEXT: movd %ecx, %xmm0
920 ; X64-SSE-NEXT: movzwl (%rsi,%rdx), %ecx
921 ; X64-SSE-NEXT: movd %ecx, %xmm1
922 ; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
923 ; X64-SSE-NEXT: psraw $8, %xmm0
924 ; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
925 ; X64-SSE-NEXT: psraw $8, %xmm1
926 ; X64-SSE-NEXT: pmullw %xmm0, %xmm1
927 ; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
928 ; X64-SSE-NEXT: psrad $16, %xmm0
929 ; X64-SSE-NEXT: movq %xmm0, (%rax,%rdx,4)
932 ; X64-AVX-LABEL: mul_2xi8_sext:
933 ; X64-AVX: # %bb.0: # %entry
934 ; X64-AVX-NEXT: movq {{.*}}(%rip), %rax
935 ; X64-AVX-NEXT: vpmovsxbq (%rdi,%rdx), %xmm0
936 ; X64-AVX-NEXT: vpmovsxbq (%rsi,%rdx), %xmm1
937 ; X64-AVX-NEXT: vpmuludq %xmm0, %xmm1, %xmm0
938 ; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
939 ; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rdx,4)
; X64-AVX-NEXT: retq
entry:
942 %pre = load i32*, i32** @c
943 %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
944 %tmp7 = bitcast i8* %tmp6 to <2 x i8>*
945 %wide.load = load <2 x i8>, <2 x i8>* %tmp7, align 1
946 %tmp8 = sext <2 x i8> %wide.load to <2 x i32>
947 %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
948 %tmp11 = bitcast i8* %tmp10 to <2 x i8>*
949 %wide.load17 = load <2 x i8>, <2 x i8>* %tmp11, align 1
950 %tmp12 = sext <2 x i8> %wide.load17 to <2 x i32>
951 %tmp13 = mul nuw nsw <2 x i32> %tmp12, %tmp8
952 %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
953 %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
954 store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
  ret void
}
958 ; %val1 = load <2 x i8>
959 ; %op1 = sext<2 x i32> %val1
960 ; %val2 = load <2 x i8>
961 ; %op2 = zext<2 x i32> %val2
962 ; %rst = mul <2 x i32> %op1, %op2
964 define void @mul_2xi8_sext_zext(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) nounwind {
965 ; X86-SSE-LABEL: mul_2xi8_sext_zext:
966 ; X86-SSE: # %bb.0: # %entry
967 ; X86-SSE-NEXT: pushl %esi
968 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
969 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
970 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx
971 ; X86-SSE-NEXT: movl c, %esi
972 ; X86-SSE-NEXT: movzwl (%edx,%ecx), %edx
973 ; X86-SSE-NEXT: movd %edx, %xmm0
974 ; X86-SSE-NEXT: movzwl (%eax,%ecx), %eax
975 ; X86-SSE-NEXT: movd %eax, %xmm1
976 ; X86-SSE-NEXT: pxor %xmm2, %xmm2
977 ; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
978 ; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
979 ; X86-SSE-NEXT: psraw $8, %xmm0
980 ; X86-SSE-NEXT: movdqa %xmm1, %xmm2
981 ; X86-SSE-NEXT: pmulhw %xmm0, %xmm2
982 ; X86-SSE-NEXT: pmullw %xmm1, %xmm0
983 ; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
984 ; X86-SSE-NEXT: movq %xmm0, (%esi,%ecx,4)
985 ; X86-SSE-NEXT: popl %esi
988 ; X86-AVX-LABEL: mul_2xi8_sext_zext:
989 ; X86-AVX: # %bb.0: # %entry
990 ; X86-AVX-NEXT: pushl %esi
991 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
992 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
993 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx
994 ; X86-AVX-NEXT: movl c, %esi
995 ; X86-AVX-NEXT: vpmovsxbq (%edx,%ecx), %xmm0
996 ; X86-AVX-NEXT: vpmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
997 ; X86-AVX-NEXT: vpmuludq %xmm0, %xmm1, %xmm0
998 ; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
999 ; X86-AVX-NEXT: vmovq %xmm0, (%esi,%ecx,4)
1000 ; X86-AVX-NEXT: popl %esi
1001 ; X86-AVX-NEXT: retl
1003 ; X64-SSE-LABEL: mul_2xi8_sext_zext:
1004 ; X64-SSE: # %bb.0: # %entry
1005 ; X64-SSE-NEXT: movq {{.*}}(%rip), %rax
1006 ; X64-SSE-NEXT: movzwl (%rdi,%rdx), %ecx
1007 ; X64-SSE-NEXT: movd %ecx, %xmm0
1008 ; X64-SSE-NEXT: movzwl (%rsi,%rdx), %ecx
1009 ; X64-SSE-NEXT: movd %ecx, %xmm1
1010 ; X64-SSE-NEXT: pxor %xmm2, %xmm2
1011 ; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
1012 ; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1013 ; X64-SSE-NEXT: psraw $8, %xmm0
1014 ; X64-SSE-NEXT: movdqa %xmm1, %xmm2
1015 ; X64-SSE-NEXT: pmulhw %xmm0, %xmm2
1016 ; X64-SSE-NEXT: pmullw %xmm1, %xmm0
1017 ; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
1018 ; X64-SSE-NEXT: movq %xmm0, (%rax,%rdx,4)
1019 ; X64-SSE-NEXT: retq
1021 ; X64-AVX-LABEL: mul_2xi8_sext_zext:
1022 ; X64-AVX: # %bb.0: # %entry
1023 ; X64-AVX-NEXT: movq {{.*}}(%rip), %rax
1024 ; X64-AVX-NEXT: vpmovsxbq (%rdi,%rdx), %xmm0
1025 ; X64-AVX-NEXT: vpmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
1026 ; X64-AVX-NEXT: vpmuludq %xmm0, %xmm1, %xmm0
1027 ; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
1028 ; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rdx,4)
1029 ; X64-AVX-NEXT: retq
entry:
1031 %pre = load i32*, i32** @c
1032 %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
1033 %tmp7 = bitcast i8* %tmp6 to <2 x i8>*
1034 %wide.load = load <2 x i8>, <2 x i8>* %tmp7, align 1
1035 %tmp8 = sext <2 x i8> %wide.load to <2 x i32>
1036 %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
1037 %tmp11 = bitcast i8* %tmp10 to <2 x i8>*
1038 %wide.load17 = load <2 x i8>, <2 x i8>* %tmp11, align 1
1039 %tmp12 = zext <2 x i8> %wide.load17 to <2 x i32>
1040 %tmp13 = mul nuw nsw <2 x i32> %tmp12, %tmp8
1041 %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
1042 %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
1043 store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
  ret void
}
1047 ; %val1 = load <2 x i16>
1048 ; %op1 = sext<2 x i32> %val1
1049 ; %val2 = load <2 x i16>
1050 ; %op2 = sext<2 x i32> %val2
1051 ; %rst = mul <2 x i32> %op1, %op2
1053 define void @mul_2xi16_sext(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) nounwind {
1054 ; X86-SSE-LABEL: mul_2xi16_sext:
1055 ; X86-SSE: # %bb.0: # %entry
1056 ; X86-SSE-NEXT: pushl %esi
1057 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
1058 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
1059 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx
1060 ; X86-SSE-NEXT: movl c, %esi
1061 ; X86-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1062 ; X86-SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
1063 ; X86-SSE-NEXT: movdqa %xmm1, %xmm2
1064 ; X86-SSE-NEXT: pmulhw %xmm0, %xmm2
1065 ; X86-SSE-NEXT: pmullw %xmm0, %xmm1
1066 ; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
1067 ; X86-SSE-NEXT: movq %xmm1, (%esi,%ecx,4)
1068 ; X86-SSE-NEXT: popl %esi
1069 ; X86-SSE-NEXT: retl
1071 ; X86-AVX-LABEL: mul_2xi16_sext:
1072 ; X86-AVX: # %bb.0: # %entry
1073 ; X86-AVX-NEXT: pushl %esi
1074 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
1075 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
1076 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx
1077 ; X86-AVX-NEXT: movl c, %esi
1078 ; X86-AVX-NEXT: vpmovsxwq (%edx,%ecx), %xmm0
1079 ; X86-AVX-NEXT: vpmovsxwq (%eax,%ecx), %xmm1
1080 ; X86-AVX-NEXT: vpmuludq %xmm0, %xmm1, %xmm0
1081 ; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
1082 ; X86-AVX-NEXT: vmovq %xmm0, (%esi,%ecx,4)
1083 ; X86-AVX-NEXT: popl %esi
1084 ; X86-AVX-NEXT: retl
1086 ; X64-SSE-LABEL: mul_2xi16_sext:
1087 ; X64-SSE: # %bb.0: # %entry
1088 ; X64-SSE-NEXT: movq {{.*}}(%rip), %rax
1089 ; X64-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1090 ; X64-SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
1091 ; X64-SSE-NEXT: movdqa %xmm1, %xmm2
1092 ; X64-SSE-NEXT: pmulhw %xmm0, %xmm2
1093 ; X64-SSE-NEXT: pmullw %xmm0, %xmm1
1094 ; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
1095 ; X64-SSE-NEXT: movq %xmm1, (%rax,%rdx,4)
1096 ; X64-SSE-NEXT: retq
1098 ; X64-AVX-LABEL: mul_2xi16_sext:
1099 ; X64-AVX: # %bb.0: # %entry
1100 ; X64-AVX-NEXT: movq {{.*}}(%rip), %rax
1101 ; X64-AVX-NEXT: vpmovsxwq (%rdi,%rdx), %xmm0
1102 ; X64-AVX-NEXT: vpmovsxwq (%rsi,%rdx), %xmm1
1103 ; X64-AVX-NEXT: vpmuludq %xmm0, %xmm1, %xmm0
1104 ; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
1105 ; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rdx,4)
1106 ; X64-AVX-NEXT: retq
1107 entry:
1108 %pre = load i32*, i32** @c
1109 %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
1110 %tmp7 = bitcast i8* %tmp6 to <2 x i16>*
1111 %wide.load = load <2 x i16>, <2 x i16>* %tmp7, align 1
1112 %tmp8 = sext <2 x i16> %wide.load to <2 x i32>
1113 %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
1114 %tmp11 = bitcast i8* %tmp10 to <2 x i16>*
1115 %wide.load17 = load <2 x i16>, <2 x i16>* %tmp11, align 1
1116 %tmp12 = sext <2 x i16> %wide.load17 to <2 x i32>
1117 %tmp13 = mul nuw nsw <2 x i32> %tmp12, %tmp8
1118 %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
1119 %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
1120 store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
1121 ret void
1122 }
1124 ; %val1 = load <2 x i16>
1125 ; %op1 = sext<2 x i32> %val1
1126 ; %val2 = load <2 x i16>
1127 ; %op2 = zext<2 x i32> %val2
1128 ; %rst = mul <2 x i32> %op1, %op2
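; With one side sign-extended and the other zero-extended, neither pmulhw nor pmulhuw reproduces the high half, so the SSE checks below widen both inputs to 32 bits (psrad for the signed side, punpcklwd with zero for the unsigned side) and multiply with pmuludq.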
1130 define void @mul_2xi16_sext_zext(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) nounwind {
1131 ; X86-SSE-LABEL: mul_2xi16_sext_zext:
1132 ; X86-SSE: # %bb.0: # %entry
1133 ; X86-SSE-NEXT: pushl %esi
1134 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
1135 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
1136 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx
1137 ; X86-SSE-NEXT: movl c, %esi
1138 ; X86-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1139 ; X86-SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7]
1140 ; X86-SSE-NEXT: psrad $16, %xmm0
1141 ; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
1142 ; X86-SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
1143 ; X86-SSE-NEXT: pxor %xmm2, %xmm2
1144 ; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
1145 ; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,1,3]
1146 ; X86-SSE-NEXT: pmuludq %xmm0, %xmm1
1147 ; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
1148 ; X86-SSE-NEXT: movq %xmm0, (%esi,%ecx,4)
1149 ; X86-SSE-NEXT: popl %esi
1150 ; X86-SSE-NEXT: retl
1152 ; X86-AVX-LABEL: mul_2xi16_sext_zext:
1153 ; X86-AVX: # %bb.0: # %entry
1154 ; X86-AVX-NEXT: pushl %esi
1155 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
1156 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
1157 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx
1158 ; X86-AVX-NEXT: movl c, %esi
1159 ; X86-AVX-NEXT: vpmovsxwq (%edx,%ecx), %xmm0
1160 ; X86-AVX-NEXT: vpmovzxwq {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero
1161 ; X86-AVX-NEXT: vpmuludq %xmm0, %xmm1, %xmm0
1162 ; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
1163 ; X86-AVX-NEXT: vmovq %xmm0, (%esi,%ecx,4)
1164 ; X86-AVX-NEXT: popl %esi
1165 ; X86-AVX-NEXT: retl
1167 ; X64-SSE-LABEL: mul_2xi16_sext_zext:
1168 ; X64-SSE: # %bb.0: # %entry
1169 ; X64-SSE-NEXT: movq {{.*}}(%rip), %rax
1170 ; X64-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1171 ; X64-SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7]
1172 ; X64-SSE-NEXT: psrad $16, %xmm0
1173 ; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
1174 ; X64-SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
1175 ; X64-SSE-NEXT: pxor %xmm2, %xmm2
1176 ; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
1177 ; X64-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,1,3]
1178 ; X64-SSE-NEXT: pmuludq %xmm0, %xmm1
1179 ; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
1180 ; X64-SSE-NEXT: movq %xmm0, (%rax,%rdx,4)
1181 ; X64-SSE-NEXT: retq
1183 ; X64-AVX-LABEL: mul_2xi16_sext_zext:
1184 ; X64-AVX: # %bb.0: # %entry
1185 ; X64-AVX-NEXT: movq {{.*}}(%rip), %rax
1186 ; X64-AVX-NEXT: vpmovsxwq (%rdi,%rdx), %xmm0
1187 ; X64-AVX-NEXT: vpmovzxwq {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero
1188 ; X64-AVX-NEXT: vpmuludq %xmm0, %xmm1, %xmm0
1189 ; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
1190 ; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rdx,4)
1191 ; X64-AVX-NEXT: retq
1192 entry:
1193 %pre = load i32*, i32** @c
1194 %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
1195 %tmp7 = bitcast i8* %tmp6 to <2 x i16>*
1196 %wide.load = load <2 x i16>, <2 x i16>* %tmp7, align 1
1197 %tmp8 = sext <2 x i16> %wide.load to <2 x i32>
1198 %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
1199 %tmp11 = bitcast i8* %tmp10 to <2 x i16>*
1200 %wide.load17 = load <2 x i16>, <2 x i16>* %tmp11, align 1
1201 %tmp12 = zext <2 x i16> %wide.load17 to <2 x i32>
1202 %tmp13 = mul nuw nsw <2 x i32> %tmp12, %tmp8
1203 %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
1204 %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
1205 store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
1206 ret void
1207 }
1209 ; %val1 = load <16 x i16>
1210 ; %op1 = sext<16 x i32> %val1
1211 ; %val2 = load <16 x i16>
1212 ; %op2 = sext<16 x i32> %val2
1213 ; %rst = mul <16 x i32> %op1, %op2
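; Same sext/sext pattern at full width: SSE repeats the pmullw/pmulhw plus punpcklwd/punpckhwd expansion per 128-bit half, AVX1 uses four vpmovsxwd/vpmulld pairs, and AVX2 covers the same work with two 256-bit vpmovsxwd/vpmulld operations.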
1215 define void @mul_16xi16_sext(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) nounwind {
1216 ; X86-SSE-LABEL: mul_16xi16_sext:
1217 ; X86-SSE: # %bb.0: # %entry
1218 ; X86-SSE-NEXT: pushl %esi
1219 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
1220 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
1221 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx
1222 ; X86-SSE-NEXT: movl c, %esi
1223 ; X86-SSE-NEXT: movdqu (%edx,%ecx), %xmm0
1224 ; X86-SSE-NEXT: movdqu 16(%edx,%ecx), %xmm1
1225 ; X86-SSE-NEXT: movdqu (%eax,%ecx), %xmm2
1226 ; X86-SSE-NEXT: movdqu 16(%eax,%ecx), %xmm3
1227 ; X86-SSE-NEXT: movdqa %xmm2, %xmm4
1228 ; X86-SSE-NEXT: pmulhw %xmm0, %xmm4
1229 ; X86-SSE-NEXT: pmullw %xmm0, %xmm2
1230 ; X86-SSE-NEXT: movdqa %xmm2, %xmm0
1231 ; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
1232 ; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
1233 ; X86-SSE-NEXT: movdqa %xmm3, %xmm4
1234 ; X86-SSE-NEXT: pmulhw %xmm1, %xmm4
1235 ; X86-SSE-NEXT: pmullw %xmm1, %xmm3
1236 ; X86-SSE-NEXT: movdqa %xmm3, %xmm1
1237 ; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
1238 ; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
1239 ; X86-SSE-NEXT: movdqu %xmm3, 32(%esi,%ecx,4)
1240 ; X86-SSE-NEXT: movdqu %xmm1, 48(%esi,%ecx,4)
1241 ; X86-SSE-NEXT: movdqu %xmm2, (%esi,%ecx,4)
1242 ; X86-SSE-NEXT: movdqu %xmm0, 16(%esi,%ecx,4)
1243 ; X86-SSE-NEXT: popl %esi
1244 ; X86-SSE-NEXT: retl
1246 ; X86-AVX1-LABEL: mul_16xi16_sext:
1247 ; X86-AVX1: # %bb.0: # %entry
1248 ; X86-AVX1-NEXT: pushl %esi
1249 ; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax
1250 ; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %ecx
1251 ; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %edx
1252 ; X86-AVX1-NEXT: movl c, %esi
1253 ; X86-AVX1-NEXT: vpmovsxwd 24(%edx,%ecx), %xmm0
1254 ; X86-AVX1-NEXT: vpmovsxwd 16(%edx,%ecx), %xmm1
1255 ; X86-AVX1-NEXT: vpmovsxwd 8(%edx,%ecx), %xmm2
1256 ; X86-AVX1-NEXT: vpmovsxwd (%edx,%ecx), %xmm3
1257 ; X86-AVX1-NEXT: vpmovsxwd 24(%eax,%ecx), %xmm4
1258 ; X86-AVX1-NEXT: vpmulld %xmm0, %xmm4, %xmm0
1259 ; X86-AVX1-NEXT: vpmovsxwd 16(%eax,%ecx), %xmm4
1260 ; X86-AVX1-NEXT: vpmulld %xmm1, %xmm4, %xmm1
1261 ; X86-AVX1-NEXT: vpmovsxwd 8(%eax,%ecx), %xmm4
1262 ; X86-AVX1-NEXT: vpmulld %xmm2, %xmm4, %xmm2
1263 ; X86-AVX1-NEXT: vpmovsxwd (%eax,%ecx), %xmm4
1264 ; X86-AVX1-NEXT: vpmulld %xmm3, %xmm4, %xmm3
1265 ; X86-AVX1-NEXT: vmovdqu %xmm0, 48(%esi,%ecx,4)
1266 ; X86-AVX1-NEXT: vmovdqu %xmm1, 32(%esi,%ecx,4)
1267 ; X86-AVX1-NEXT: vmovdqu %xmm2, 16(%esi,%ecx,4)
1268 ; X86-AVX1-NEXT: vmovdqu %xmm3, (%esi,%ecx,4)
1269 ; X86-AVX1-NEXT: popl %esi
1270 ; X86-AVX1-NEXT: retl
1272 ; X86-AVX2-LABEL: mul_16xi16_sext:
1273 ; X86-AVX2: # %bb.0: # %entry
1274 ; X86-AVX2-NEXT: pushl %esi
1275 ; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax
1276 ; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %ecx
1277 ; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %edx
1278 ; X86-AVX2-NEXT: movl c, %esi
1279 ; X86-AVX2-NEXT: vpmovsxwd 16(%edx,%ecx), %ymm0
1280 ; X86-AVX2-NEXT: vpmovsxwd (%edx,%ecx), %ymm1
1281 ; X86-AVX2-NEXT: vpmovsxwd 16(%eax,%ecx), %ymm2
1282 ; X86-AVX2-NEXT: vpmulld %ymm0, %ymm2, %ymm0
1283 ; X86-AVX2-NEXT: vpmovsxwd (%eax,%ecx), %ymm2
1284 ; X86-AVX2-NEXT: vpmulld %ymm1, %ymm2, %ymm1
1285 ; X86-AVX2-NEXT: vmovdqu %ymm0, 32(%esi,%ecx,4)
1286 ; X86-AVX2-NEXT: vmovdqu %ymm1, (%esi,%ecx,4)
1287 ; X86-AVX2-NEXT: popl %esi
1288 ; X86-AVX2-NEXT: vzeroupper
1289 ; X86-AVX2-NEXT: retl
1291 ; X64-SSE-LABEL: mul_16xi16_sext:
1292 ; X64-SSE: # %bb.0: # %entry
1293 ; X64-SSE-NEXT: movq {{.*}}(%rip), %rax
1294 ; X64-SSE-NEXT: movdqu (%rdi,%rdx), %xmm0
1295 ; X64-SSE-NEXT: movdqu 16(%rdi,%rdx), %xmm1
1296 ; X64-SSE-NEXT: movdqu (%rsi,%rdx), %xmm2
1297 ; X64-SSE-NEXT: movdqu 16(%rsi,%rdx), %xmm3
1298 ; X64-SSE-NEXT: movdqa %xmm2, %xmm4
1299 ; X64-SSE-NEXT: pmulhw %xmm0, %xmm4
1300 ; X64-SSE-NEXT: pmullw %xmm0, %xmm2
1301 ; X64-SSE-NEXT: movdqa %xmm2, %xmm0
1302 ; X64-SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
1303 ; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
1304 ; X64-SSE-NEXT: movdqa %xmm3, %xmm4
1305 ; X64-SSE-NEXT: pmulhw %xmm1, %xmm4
1306 ; X64-SSE-NEXT: pmullw %xmm1, %xmm3
1307 ; X64-SSE-NEXT: movdqa %xmm3, %xmm1
1308 ; X64-SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
1309 ; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
1310 ; X64-SSE-NEXT: movdqu %xmm3, 32(%rax,%rdx,4)
1311 ; X64-SSE-NEXT: movdqu %xmm1, 48(%rax,%rdx,4)
1312 ; X64-SSE-NEXT: movdqu %xmm2, (%rax,%rdx,4)
1313 ; X64-SSE-NEXT: movdqu %xmm0, 16(%rax,%rdx,4)
1314 ; X64-SSE-NEXT: retq
1316 ; X64-AVX1-LABEL: mul_16xi16_sext:
1317 ; X64-AVX1: # %bb.0: # %entry
1318 ; X64-AVX1-NEXT: movq {{.*}}(%rip), %rax
1319 ; X64-AVX1-NEXT: vpmovsxwd 24(%rdi,%rdx), %xmm0
1320 ; X64-AVX1-NEXT: vpmovsxwd 16(%rdi,%rdx), %xmm1
1321 ; X64-AVX1-NEXT: vpmovsxwd 8(%rdi,%rdx), %xmm2
1322 ; X64-AVX1-NEXT: vpmovsxwd (%rdi,%rdx), %xmm3
1323 ; X64-AVX1-NEXT: vpmovsxwd 24(%rsi,%rdx), %xmm4
1324 ; X64-AVX1-NEXT: vpmulld %xmm0, %xmm4, %xmm0
1325 ; X64-AVX1-NEXT: vpmovsxwd 16(%rsi,%rdx), %xmm4
1326 ; X64-AVX1-NEXT: vpmulld %xmm1, %xmm4, %xmm1
1327 ; X64-AVX1-NEXT: vpmovsxwd 8(%rsi,%rdx), %xmm4
1328 ; X64-AVX1-NEXT: vpmulld %xmm2, %xmm4, %xmm2
1329 ; X64-AVX1-NEXT: vpmovsxwd (%rsi,%rdx), %xmm4
1330 ; X64-AVX1-NEXT: vpmulld %xmm3, %xmm4, %xmm3
1331 ; X64-AVX1-NEXT: vmovdqu %xmm0, 48(%rax,%rdx,4)
1332 ; X64-AVX1-NEXT: vmovdqu %xmm1, 32(%rax,%rdx,4)
1333 ; X64-AVX1-NEXT: vmovdqu %xmm2, 16(%rax,%rdx,4)
1334 ; X64-AVX1-NEXT: vmovdqu %xmm3, (%rax,%rdx,4)
1335 ; X64-AVX1-NEXT: retq
1337 ; X64-AVX2-LABEL: mul_16xi16_sext:
1338 ; X64-AVX2: # %bb.0: # %entry
1339 ; X64-AVX2-NEXT: movq {{.*}}(%rip), %rax
1340 ; X64-AVX2-NEXT: vpmovsxwd 16(%rdi,%rdx), %ymm0
1341 ; X64-AVX2-NEXT: vpmovsxwd (%rdi,%rdx), %ymm1
1342 ; X64-AVX2-NEXT: vpmovsxwd 16(%rsi,%rdx), %ymm2
1343 ; X64-AVX2-NEXT: vpmulld %ymm0, %ymm2, %ymm0
1344 ; X64-AVX2-NEXT: vpmovsxwd (%rsi,%rdx), %ymm2
1345 ; X64-AVX2-NEXT: vpmulld %ymm1, %ymm2, %ymm1
1346 ; X64-AVX2-NEXT: vmovdqu %ymm0, 32(%rax,%rdx,4)
1347 ; X64-AVX2-NEXT: vmovdqu %ymm1, (%rax,%rdx,4)
1348 ; X64-AVX2-NEXT: vzeroupper
1349 ; X64-AVX2-NEXT: retq
1350 entry:
1351 %pre = load i32*, i32** @c
1352 %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
1353 %tmp7 = bitcast i8* %tmp6 to <16 x i16>*
1354 %wide.load = load <16 x i16>, <16 x i16>* %tmp7, align 1
1355 %tmp8 = sext <16 x i16> %wide.load to <16 x i32>
1356 %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
1357 %tmp11 = bitcast i8* %tmp10 to <16 x i16>*
1358 %wide.load17 = load <16 x i16>, <16 x i16>* %tmp11, align 1
1359 %tmp12 = sext <16 x i16> %wide.load17 to <16 x i32>
1360 %tmp13 = mul nuw nsw <16 x i32> %tmp12, %tmp8
1361 %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
1362 %tmp15 = bitcast i32* %tmp14 to <16 x i32>*
1363 store <16 x i32> %tmp13, <16 x i32>* %tmp15, align 4
1364 ret void
1365 }
1367 ; %val = load <2 x i8>
1368 ; %op1 = zext<2 x i32> %val
1369 ; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (0 ~ 255)
1370 ; %rst = mul <2 x i32> %op1, %op2
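; Both the zero-extended bytes and the constants fit in 8 bits, so the 16-bit product cannot overflow; the checks below expect a single pmullw against the constant pool followed by a zero-extending punpcklwd, with no high-half multiply.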
1372 define void @mul_2xi8_varconst1(i8* nocapture readonly %a, i64 %index) {
1373 ; X86-SSE-LABEL: mul_2xi8_varconst1:
1374 ; X86-SSE: # %bb.0: # %entry
1375 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
1376 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
1377 ; X86-SSE-NEXT: movl c, %edx
1378 ; X86-SSE-NEXT: movzwl (%ecx,%eax), %ecx
1379 ; X86-SSE-NEXT: movd %ecx, %xmm0
1380 ; X86-SSE-NEXT: pxor %xmm1, %xmm1
1381 ; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
1382 ; X86-SSE-NEXT: pmullw {{\.LCPI.*}}, %xmm0
1383 ; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
1384 ; X86-SSE-NEXT: movq %xmm0, (%edx,%eax,4)
1385 ; X86-SSE-NEXT: retl
1387 ; X86-AVX-LABEL: mul_2xi8_varconst1:
1388 ; X86-AVX: # %bb.0: # %entry
1389 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
1390 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
1391 ; X86-AVX-NEXT: movl c, %edx
1392 ; X86-AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
1393 ; X86-AVX-NEXT: vpmuludq {{\.LCPI.*}}, %xmm0, %xmm0
1394 ; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
1395 ; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4)
1396 ; X86-AVX-NEXT: retl
1398 ; X64-SSE-LABEL: mul_2xi8_varconst1:
1399 ; X64-SSE: # %bb.0: # %entry
1400 ; X64-SSE-NEXT: movq {{.*}}(%rip), %rax
1401 ; X64-SSE-NEXT: movzwl (%rdi,%rsi), %ecx
1402 ; X64-SSE-NEXT: movd %ecx, %xmm0
1403 ; X64-SSE-NEXT: pxor %xmm1, %xmm1
1404 ; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
1405 ; X64-SSE-NEXT: pmullw {{.*}}(%rip), %xmm0
1406 ; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
1407 ; X64-SSE-NEXT: movq %xmm0, (%rax,%rsi,4)
1408 ; X64-SSE-NEXT: retq
1410 ; X64-AVX-LABEL: mul_2xi8_varconst1:
1411 ; X64-AVX: # %bb.0: # %entry
1412 ; X64-AVX-NEXT: movq {{.*}}(%rip), %rax
1413 ; X64-AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
1414 ; X64-AVX-NEXT: vpmuludq {{.*}}(%rip), %xmm0, %xmm0
1415 ; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
1416 ; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4)
1417 ; X64-AVX-NEXT: retq
1418 entry:
1419 %pre = load i32*, i32** @c
1420 %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
1421 %tmp7 = bitcast i8* %tmp6 to <2 x i8>*
1422 %wide.load = load <2 x i8>, <2 x i8>* %tmp7, align 1
1423 %tmp8 = zext <2 x i8> %wide.load to <2 x i32>
1424 %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 0, i32 255>
1425 %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
1426 %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
1427 store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
1428 ret void
1429 }
1431 ; %val = load <2 x i8>
1432 ; %op1 = sext<2 x i32> %val
1433 ; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (-128 ~ 127)
1434 ; %rst = mul <2 x i32> %op1, %op2
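; Sign-extended bytes times constants in the signed 8-bit range keep the product within 16 signed bits, so the checks below expect pmullw followed by a sign-extending punpcklwd/psrad sequence rather than a pmulhw.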
1436 define void @mul_2xi8_varconst2(i8* nocapture readonly %a, i64 %index) {
1437 ; X86-SSE-LABEL: mul_2xi8_varconst2:
1438 ; X86-SSE: # %bb.0: # %entry
1439 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
1440 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
1441 ; X86-SSE-NEXT: movl c, %edx
1442 ; X86-SSE-NEXT: movzwl (%ecx,%eax), %ecx
1443 ; X86-SSE-NEXT: movd %ecx, %xmm0
1444 ; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1445 ; X86-SSE-NEXT: psraw $8, %xmm0
1446 ; X86-SSE-NEXT: pmullw {{\.LCPI.*}}, %xmm0
1447 ; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
1448 ; X86-SSE-NEXT: psrad $16, %xmm0
1449 ; X86-SSE-NEXT: movq %xmm0, (%edx,%eax,4)
1450 ; X86-SSE-NEXT: retl
1452 ; X86-AVX-LABEL: mul_2xi8_varconst2:
1453 ; X86-AVX: # %bb.0: # %entry
1454 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
1455 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
1456 ; X86-AVX-NEXT: movl c, %edx
1457 ; X86-AVX-NEXT: vpmovsxbq (%ecx,%eax), %xmm0
1458 ; X86-AVX-NEXT: vpmuludq {{\.LCPI.*}}, %xmm0, %xmm0
1459 ; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
1460 ; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4)
1461 ; X86-AVX-NEXT: retl
1463 ; X64-SSE-LABEL: mul_2xi8_varconst2:
1464 ; X64-SSE: # %bb.0: # %entry
1465 ; X64-SSE-NEXT: movq {{.*}}(%rip), %rax
1466 ; X64-SSE-NEXT: movzwl (%rdi,%rsi), %ecx
1467 ; X64-SSE-NEXT: movd %ecx, %xmm0
1468 ; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1469 ; X64-SSE-NEXT: psraw $8, %xmm0
1470 ; X64-SSE-NEXT: pmullw {{.*}}(%rip), %xmm0
1471 ; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
1472 ; X64-SSE-NEXT: psrad $16, %xmm0
1473 ; X64-SSE-NEXT: movq %xmm0, (%rax,%rsi,4)
1474 ; X64-SSE-NEXT: retq
1476 ; X64-AVX-LABEL: mul_2xi8_varconst2:
1477 ; X64-AVX: # %bb.0: # %entry
1478 ; X64-AVX-NEXT: movq {{.*}}(%rip), %rax
1479 ; X64-AVX-NEXT: vpmovsxbq (%rdi,%rsi), %xmm0
1480 ; X64-AVX-NEXT: vpmuludq {{.*}}(%rip), %xmm0, %xmm0
1481 ; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
1482 ; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4)
1483 ; X64-AVX-NEXT: retq
1484 entry:
1485 %pre = load i32*, i32** @c
1486 %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
1487 %tmp7 = bitcast i8* %tmp6 to <2 x i8>*
1488 %wide.load = load <2 x i8>, <2 x i8>* %tmp7, align 1
1489 %tmp8 = sext <2 x i8> %wide.load to <2 x i32>
1490 %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 -128, i32 127>
1491 %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
1492 %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
1493 store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
1494 ret void
1495 }
1497 ; %val = load <2 x i8>
1498 ; %op1 = zext<2 x i32> %val
1499 ; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (0 ~ 256)
1500 ; %rst = mul <2 x i32> %op1, %op2
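; 256 does not fit in 8 bits, so the zero-extending single-multiply form from mul_2xi8_varconst1 no longer applies; the checks below expect both halves of the product from a pmullw/pmulhw pair.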
1502 define void @mul_2xi8_varconst3(i8* nocapture readonly %a, i64 %index) {
1503 ; X86-SSE-LABEL: mul_2xi8_varconst3:
1504 ; X86-SSE: # %bb.0: # %entry
1505 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
1506 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
1507 ; X86-SSE-NEXT: movl c, %edx
1508 ; X86-SSE-NEXT: movzwl (%ecx,%eax), %ecx
1509 ; X86-SSE-NEXT: movd %ecx, %xmm0
1510 ; X86-SSE-NEXT: pxor %xmm1, %xmm1
1511 ; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
1512 ; X86-SSE-NEXT: movdqa {{.*#+}} xmm1 = <0,256,u,u,u,u,u,u>
1513 ; X86-SSE-NEXT: movdqa %xmm0, %xmm2
1514 ; X86-SSE-NEXT: pmulhw %xmm1, %xmm2
1515 ; X86-SSE-NEXT: pmullw %xmm1, %xmm0
1516 ; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
1517 ; X86-SSE-NEXT: movq %xmm0, (%edx,%eax,4)
1518 ; X86-SSE-NEXT: retl
1520 ; X86-AVX-LABEL: mul_2xi8_varconst3:
1521 ; X86-AVX: # %bb.0: # %entry
1522 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
1523 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
1524 ; X86-AVX-NEXT: movl c, %edx
1525 ; X86-AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
1526 ; X86-AVX-NEXT: vpmuludq {{\.LCPI.*}}, %xmm0, %xmm0
1527 ; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
1528 ; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4)
1529 ; X86-AVX-NEXT: retl
1531 ; X64-SSE-LABEL: mul_2xi8_varconst3:
1532 ; X64-SSE: # %bb.0: # %entry
1533 ; X64-SSE-NEXT: movq {{.*}}(%rip), %rax
1534 ; X64-SSE-NEXT: movzwl (%rdi,%rsi), %ecx
1535 ; X64-SSE-NEXT: movd %ecx, %xmm0
1536 ; X64-SSE-NEXT: pxor %xmm1, %xmm1
1537 ; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
1538 ; X64-SSE-NEXT: movdqa {{.*#+}} xmm1 = <0,256,u,u,u,u,u,u>
1539 ; X64-SSE-NEXT: movdqa %xmm0, %xmm2
1540 ; X64-SSE-NEXT: pmulhw %xmm1, %xmm2
1541 ; X64-SSE-NEXT: pmullw %xmm1, %xmm0
1542 ; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
1543 ; X64-SSE-NEXT: movq %xmm0, (%rax,%rsi,4)
1544 ; X64-SSE-NEXT: retq
1546 ; X64-AVX-LABEL: mul_2xi8_varconst3:
1547 ; X64-AVX: # %bb.0: # %entry
1548 ; X64-AVX-NEXT: movq {{.*}}(%rip), %rax
1549 ; X64-AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
1550 ; X64-AVX-NEXT: vpmuludq {{.*}}(%rip), %xmm0, %xmm0
1551 ; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
1552 ; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4)
1553 ; X64-AVX-NEXT: retq
1554 entry:
1555 %pre = load i32*, i32** @c
1556 %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
1557 %tmp7 = bitcast i8* %tmp6 to <2 x i8>*
1558 %wide.load = load <2 x i8>, <2 x i8>* %tmp7, align 1
1559 %tmp8 = zext <2 x i8> %wide.load to <2 x i32>
1560 %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 0, i32 256>
1561 %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
1562 %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
1563 store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
1564 ret void
1565 }
1567 ; %val = load <2 x i8>
1568 ; %op1 = zext<2 x i32> %val
1569 ; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (-1 ~ 255)
1570 ; %rst = mul <2 x i32> %op1, %op2
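; The -1 element becomes 65535 as an unsigned 16-bit immediate and falls outside the 8-bit range, so the checks below again expect the pmullw/pmulhw expansion instead of the single zero-extending multiply.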
1572 define void @mul_2xi8_varconst4(i8* nocapture readonly %a, i64 %index) {
1573 ; X86-SSE-LABEL: mul_2xi8_varconst4:
1574 ; X86-SSE: # %bb.0: # %entry
1575 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
1576 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
1577 ; X86-SSE-NEXT: movl c, %edx
1578 ; X86-SSE-NEXT: movzwl (%ecx,%eax), %ecx
1579 ; X86-SSE-NEXT: movd %ecx, %xmm0
1580 ; X86-SSE-NEXT: pxor %xmm1, %xmm1
1581 ; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
1582 ; X86-SSE-NEXT: movdqa {{.*#+}} xmm1 = <65535,255,u,u,u,u,u,u>
1583 ; X86-SSE-NEXT: movdqa %xmm0, %xmm2
1584 ; X86-SSE-NEXT: pmulhw %xmm1, %xmm2
1585 ; X86-SSE-NEXT: pmullw %xmm1, %xmm0
1586 ; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
1587 ; X86-SSE-NEXT: movq %xmm0, (%edx,%eax,4)
1588 ; X86-SSE-NEXT: retl
1590 ; X86-AVX-LABEL: mul_2xi8_varconst4:
1591 ; X86-AVX: # %bb.0: # %entry
1592 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
1593 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
1594 ; X86-AVX-NEXT: movl c, %edx
1595 ; X86-AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
1596 ; X86-AVX-NEXT: vpmuludq {{\.LCPI.*}}, %xmm0, %xmm0
1597 ; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
1598 ; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4)
1599 ; X86-AVX-NEXT: retl
1601 ; X64-SSE-LABEL: mul_2xi8_varconst4:
1602 ; X64-SSE: # %bb.0: # %entry
1603 ; X64-SSE-NEXT: movq {{.*}}(%rip), %rax
1604 ; X64-SSE-NEXT: movzwl (%rdi,%rsi), %ecx
1605 ; X64-SSE-NEXT: movd %ecx, %xmm0
1606 ; X64-SSE-NEXT: pxor %xmm1, %xmm1
1607 ; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
1608 ; X64-SSE-NEXT: movdqa {{.*#+}} xmm1 = <65535,255,u,u,u,u,u,u>
1609 ; X64-SSE-NEXT: movdqa %xmm0, %xmm2
1610 ; X64-SSE-NEXT: pmulhw %xmm1, %xmm2
1611 ; X64-SSE-NEXT: pmullw %xmm1, %xmm0
1612 ; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
1613 ; X64-SSE-NEXT: movq %xmm0, (%rax,%rsi,4)
1614 ; X64-SSE-NEXT: retq
1616 ; X64-AVX-LABEL: mul_2xi8_varconst4:
1617 ; X64-AVX: # %bb.0: # %entry
1618 ; X64-AVX-NEXT: movq {{.*}}(%rip), %rax
1619 ; X64-AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
1620 ; X64-AVX-NEXT: vpmuludq {{.*}}(%rip), %xmm0, %xmm0
1621 ; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
1622 ; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4)
1623 ; X64-AVX-NEXT: retq
1624 entry:
1625 %pre = load i32*, i32** @c
1626 %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
1627 %tmp7 = bitcast i8* %tmp6 to <2 x i8>*
1628 %wide.load = load <2 x i8>, <2 x i8>* %tmp7, align 1
1629 %tmp8 = zext <2 x i8> %wide.load to <2 x i32>
1630 %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 -1, i32 255>
1631 %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
1632 %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
1633 store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
1634 ret void
1635 }
1637 ; %val = load <2 x i8>
1638 ; %op1 = sext<2 x i32> %val
1639 ; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (-129 ~ 127)
1640 ; %rst = mul <2 x i32> %op1, %op2
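; -129 is just outside the signed 8-bit range, so the sign-extending single-multiply form from mul_2xi8_varconst2 is not used; the checks below expect the pmullw/pmulhw pair.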
1642 define void @mul_2xi8_varconst5(i8* nocapture readonly %a, i64 %index) {
1643 ; X86-SSE-LABEL: mul_2xi8_varconst5:
1644 ; X86-SSE: # %bb.0: # %entry
1645 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
1646 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
1647 ; X86-SSE-NEXT: movl c, %edx
1648 ; X86-SSE-NEXT: movzwl (%ecx,%eax), %ecx
1649 ; X86-SSE-NEXT: movd %ecx, %xmm0
1650 ; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1651 ; X86-SSE-NEXT: psraw $8, %xmm0
1652 ; X86-SSE-NEXT: movdqa {{.*#+}} xmm1 = <65407,127,u,u,u,u,u,u>
1653 ; X86-SSE-NEXT: movdqa %xmm0, %xmm2
1654 ; X86-SSE-NEXT: pmulhw %xmm1, %xmm2
1655 ; X86-SSE-NEXT: pmullw %xmm1, %xmm0
1656 ; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
1657 ; X86-SSE-NEXT: movq %xmm0, (%edx,%eax,4)
1658 ; X86-SSE-NEXT: retl
1660 ; X86-AVX-LABEL: mul_2xi8_varconst5:
1661 ; X86-AVX: # %bb.0: # %entry
1662 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
1663 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
1664 ; X86-AVX-NEXT: movl c, %edx
1665 ; X86-AVX-NEXT: vpmovsxbq (%ecx,%eax), %xmm0
1666 ; X86-AVX-NEXT: vpmuludq {{\.LCPI.*}}, %xmm0, %xmm0
1667 ; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
1668 ; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4)
1669 ; X86-AVX-NEXT: retl
1671 ; X64-SSE-LABEL: mul_2xi8_varconst5:
1672 ; X64-SSE: # %bb.0: # %entry
1673 ; X64-SSE-NEXT: movq {{.*}}(%rip), %rax
1674 ; X64-SSE-NEXT: movzwl (%rdi,%rsi), %ecx
1675 ; X64-SSE-NEXT: movd %ecx, %xmm0
1676 ; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1677 ; X64-SSE-NEXT: psraw $8, %xmm0
1678 ; X64-SSE-NEXT: movdqa {{.*#+}} xmm1 = <65407,127,u,u,u,u,u,u>
1679 ; X64-SSE-NEXT: movdqa %xmm0, %xmm2
1680 ; X64-SSE-NEXT: pmulhw %xmm1, %xmm2
1681 ; X64-SSE-NEXT: pmullw %xmm1, %xmm0
1682 ; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
1683 ; X64-SSE-NEXT: movq %xmm0, (%rax,%rsi,4)
1684 ; X64-SSE-NEXT: retq
1686 ; X64-AVX-LABEL: mul_2xi8_varconst5:
1687 ; X64-AVX: # %bb.0: # %entry
1688 ; X64-AVX-NEXT: movq {{.*}}(%rip), %rax
1689 ; X64-AVX-NEXT: vpmovsxbq (%rdi,%rsi), %xmm0
1690 ; X64-AVX-NEXT: vpmuludq {{.*}}(%rip), %xmm0, %xmm0
1691 ; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
1692 ; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4)
1693 ; X64-AVX-NEXT: retq
1694 entry:
1695 %pre = load i32*, i32** @c
1696 %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
1697 %tmp7 = bitcast i8* %tmp6 to <2 x i8>*
1698 %wide.load = load <2 x i8>, <2 x i8>* %tmp7, align 1
1699 %tmp8 = sext <2 x i8> %wide.load to <2 x i32>
1700 %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 -129, i32 127>
1701 %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
1702 %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
1703 store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
1704 ret void
1705 }
1707 ; %val = load <2 x i8>
1708 ; %op1 = sext<2 x i32> %val
1709 ; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (-128 ~ 128)
1710 ; %rst = mul <2 x i32> %op1, %op2
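; 128 is likewise outside the signed 8-bit range, so the same pmullw/pmulhw expansion is expected here.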
1712 define void @mul_2xi8_varconst6(i8* nocapture readonly %a, i64 %index) {
1713 ; X86-SSE-LABEL: mul_2xi8_varconst6:
1714 ; X86-SSE: # %bb.0: # %entry
1715 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
1716 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
1717 ; X86-SSE-NEXT: movl c, %edx
1718 ; X86-SSE-NEXT: movzwl (%ecx,%eax), %ecx
1719 ; X86-SSE-NEXT: movd %ecx, %xmm0
1720 ; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1721 ; X86-SSE-NEXT: psraw $8, %xmm0
1722 ; X86-SSE-NEXT: movdqa {{.*#+}} xmm1 = <65408,128,u,u,u,u,u,u>
1723 ; X86-SSE-NEXT: movdqa %xmm0, %xmm2
1724 ; X86-SSE-NEXT: pmulhw %xmm1, %xmm2
1725 ; X86-SSE-NEXT: pmullw %xmm1, %xmm0
1726 ; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
1727 ; X86-SSE-NEXT: movq %xmm0, (%edx,%eax,4)
1728 ; X86-SSE-NEXT: retl
1730 ; X86-AVX-LABEL: mul_2xi8_varconst6:
1731 ; X86-AVX: # %bb.0: # %entry
1732 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
1733 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
1734 ; X86-AVX-NEXT: movl c, %edx
1735 ; X86-AVX-NEXT: vpmovsxbq (%ecx,%eax), %xmm0
1736 ; X86-AVX-NEXT: vpmuludq {{\.LCPI.*}}, %xmm0, %xmm0
1737 ; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
1738 ; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4)
1739 ; X86-AVX-NEXT: retl
1741 ; X64-SSE-LABEL: mul_2xi8_varconst6:
1742 ; X64-SSE: # %bb.0: # %entry
1743 ; X64-SSE-NEXT: movq {{.*}}(%rip), %rax
1744 ; X64-SSE-NEXT: movzwl (%rdi,%rsi), %ecx
1745 ; X64-SSE-NEXT: movd %ecx, %xmm0
1746 ; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1747 ; X64-SSE-NEXT: psraw $8, %xmm0
1748 ; X64-SSE-NEXT: movdqa {{.*#+}} xmm1 = <65408,128,u,u,u,u,u,u>
1749 ; X64-SSE-NEXT: movdqa %xmm0, %xmm2
1750 ; X64-SSE-NEXT: pmulhw %xmm1, %xmm2
1751 ; X64-SSE-NEXT: pmullw %xmm1, %xmm0
1752 ; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
1753 ; X64-SSE-NEXT: movq %xmm0, (%rax,%rsi,4)
1754 ; X64-SSE-NEXT: retq
1756 ; X64-AVX-LABEL: mul_2xi8_varconst6:
1757 ; X64-AVX: # %bb.0: # %entry
1758 ; X64-AVX-NEXT: movq {{.*}}(%rip), %rax
1759 ; X64-AVX-NEXT: vpmovsxbq (%rdi,%rsi), %xmm0
1760 ; X64-AVX-NEXT: vpmuludq {{.*}}(%rip), %xmm0, %xmm0
1761 ; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
1762 ; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4)
1763 ; X64-AVX-NEXT: retq
1764 entry:
1765 %pre = load i32*, i32** @c
1766 %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
1767 %tmp7 = bitcast i8* %tmp6 to <2 x i8>*
1768 %wide.load = load <2 x i8>, <2 x i8>* %tmp7, align 1
1769 %tmp8 = sext <2 x i8> %wide.load to <2 x i32>
1770 %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 -128, i32 128>
1771 %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
1772 %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
1773 store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
1774 ret void
1775 }
1777 ; %val = load <2 x i16>
1778 ; %op1 = zext<2 x i32> %val
1779 ; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (0 ~ 65535)
1780 ; %rst = mul <2 x i32> %op1, %op2
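; Zero-extended words times constants up to 65535: the checks below take the low half of each product from pmullw and the high half from the unsigned pmulhuw, interleaved back to 32 bits.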
1782 define void @mul_2xi16_varconst1(i8* nocapture readonly %a, i64 %index) {
1783 ; X86-SSE-LABEL: mul_2xi16_varconst1:
1784 ; X86-SSE: # %bb.0: # %entry
1785 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
1786 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
1787 ; X86-SSE-NEXT: movl c, %edx
1788 ; X86-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1789 ; X86-SSE-NEXT: movdqa {{.*#+}} xmm1 = <0,65535,u,u,u,u,u,u>
1790 ; X86-SSE-NEXT: movdqa %xmm0, %xmm2
1791 ; X86-SSE-NEXT: pmulhuw %xmm1, %xmm2
1792 ; X86-SSE-NEXT: pmullw %xmm1, %xmm0
1793 ; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
1794 ; X86-SSE-NEXT: movq %xmm0, (%edx,%eax,4)
1795 ; X86-SSE-NEXT: retl
1797 ; X86-AVX-LABEL: mul_2xi16_varconst1:
1798 ; X86-AVX: # %bb.0: # %entry
1799 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
1800 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
1801 ; X86-AVX-NEXT: movl c, %edx
1802 ; X86-AVX-NEXT: vpmovzxwq {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero
1803 ; X86-AVX-NEXT: vpmuludq {{\.LCPI.*}}, %xmm0, %xmm0
1804 ; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
1805 ; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4)
1806 ; X86-AVX-NEXT: retl
1808 ; X64-SSE-LABEL: mul_2xi16_varconst1:
1809 ; X64-SSE: # %bb.0: # %entry
1810 ; X64-SSE-NEXT: movq {{.*}}(%rip), %rax
1811 ; X64-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1812 ; X64-SSE-NEXT: movdqa {{.*#+}} xmm1 = <0,65535,u,u,u,u,u,u>
1813 ; X64-SSE-NEXT: movdqa %xmm0, %xmm2
1814 ; X64-SSE-NEXT: pmulhuw %xmm1, %xmm2
1815 ; X64-SSE-NEXT: pmullw %xmm1, %xmm0
1816 ; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
1817 ; X64-SSE-NEXT: movq %xmm0, (%rax,%rsi,4)
1818 ; X64-SSE-NEXT: retq
1820 ; X64-AVX-LABEL: mul_2xi16_varconst1:
1821 ; X64-AVX: # %bb.0: # %entry
1822 ; X64-AVX-NEXT: movq {{.*}}(%rip), %rax
1823 ; X64-AVX-NEXT: vpmovzxwq {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero
1824 ; X64-AVX-NEXT: vpmuludq {{.*}}(%rip), %xmm0, %xmm0
1825 ; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
1826 ; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4)
1827 ; X64-AVX-NEXT: retq
1828 entry:
1829 %pre = load i32*, i32** @c
1830 %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
1831 %tmp7 = bitcast i8* %tmp6 to <2 x i16>*
1832 %wide.load = load <2 x i16>, <2 x i16>* %tmp7, align 1
1833 %tmp8 = zext <2 x i16> %wide.load to <2 x i32>
1834 %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 0, i32 65535>
1835 %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
1836 %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
1837 store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
1838 ret void
1839 }
1841 ; %val = load <2 x i16>
1842 ; %op1 = sext<2 x i32> %val
1843 ; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (-32768 ~ 32767)
1844 ; %rst = mul <2 x i32> %op1, %op2
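; Signed counterpart of the previous case: the high half comes from pmulhw instead of pmulhuw.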
1846 define void @mul_2xi16_varconst2(i8* nocapture readonly %a, i64 %index) {
1847 ; X86-SSE-LABEL: mul_2xi16_varconst2:
1848 ; X86-SSE: # %bb.0: # %entry
1849 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
1850 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
1851 ; X86-SSE-NEXT: movl c, %edx
1852 ; X86-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1853 ; X86-SSE-NEXT: movdqa {{.*#+}} xmm1 = <32768,32767,u,u,u,u,u,u>
1854 ; X86-SSE-NEXT: movdqa %xmm0, %xmm2
1855 ; X86-SSE-NEXT: pmulhw %xmm1, %xmm2
1856 ; X86-SSE-NEXT: pmullw %xmm1, %xmm0
1857 ; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
1858 ; X86-SSE-NEXT: movq %xmm0, (%edx,%eax,4)
1859 ; X86-SSE-NEXT: retl
1861 ; X86-AVX-LABEL: mul_2xi16_varconst2:
1862 ; X86-AVX: # %bb.0: # %entry
1863 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
1864 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
1865 ; X86-AVX-NEXT: movl c, %edx
1866 ; X86-AVX-NEXT: vpmovsxwq (%ecx,%eax), %xmm0
1867 ; X86-AVX-NEXT: vpmuludq {{\.LCPI.*}}, %xmm0, %xmm0
1868 ; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
1869 ; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4)
1870 ; X86-AVX-NEXT: retl
1872 ; X64-SSE-LABEL: mul_2xi16_varconst2:
1873 ; X64-SSE: # %bb.0: # %entry
1874 ; X64-SSE-NEXT: movq {{.*}}(%rip), %rax
1875 ; X64-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1876 ; X64-SSE-NEXT: movdqa {{.*#+}} xmm1 = <32768,32767,u,u,u,u,u,u>
1877 ; X64-SSE-NEXT: movdqa %xmm0, %xmm2
1878 ; X64-SSE-NEXT: pmulhw %xmm1, %xmm2
1879 ; X64-SSE-NEXT: pmullw %xmm1, %xmm0
1880 ; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
1881 ; X64-SSE-NEXT: movq %xmm0, (%rax,%rsi,4)
1882 ; X64-SSE-NEXT: retq
1884 ; X64-AVX-LABEL: mul_2xi16_varconst2:
1885 ; X64-AVX: # %bb.0: # %entry
1886 ; X64-AVX-NEXT: movq {{.*}}(%rip), %rax
1887 ; X64-AVX-NEXT: vpmovsxwq (%rdi,%rsi), %xmm0
1888 ; X64-AVX-NEXT: vpmuludq {{.*}}(%rip), %xmm0, %xmm0
1889 ; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
1890 ; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4)
1891 ; X64-AVX-NEXT: retq
1892 entry:
1893 %pre = load i32*, i32** @c
1894 %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
1895 %tmp7 = bitcast i8* %tmp6 to <2 x i16>*
1896 %wide.load = load <2 x i16>, <2 x i16>* %tmp7, align 1
1897 %tmp8 = sext <2 x i16> %wide.load to <2 x i32>
1898 %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 -32768, i32 32767>
1899 %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
1900 %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
1901 store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
1902 ret void
1903 }
1905 ; %val = load <2 x i16>
1906 ; %op1 = zext<2 x i32> %val
1907 ; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (0 ~ 65536)
1908 ; %rst = mul <2 x i32> %op1, %op2
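; 65536 does not fit in 16 bits at all, so the checks below expect the zero-extended input to be widened and multiplied with a full 32-bit pmuludq against the constant pool.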
1910 define void @mul_2xi16_varconst3(i8* nocapture readonly %a, i64 %index) {
1911 ; X86-SSE-LABEL: mul_2xi16_varconst3:
1912 ; X86-SSE: # %bb.0: # %entry
1913 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
1914 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
1915 ; X86-SSE-NEXT: movl c, %edx
1916 ; X86-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1917 ; X86-SSE-NEXT: pxor %xmm1, %xmm1
1918 ; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
1919 ; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
1920 ; X86-SSE-NEXT: pmuludq {{\.LCPI.*}}, %xmm0
1921 ; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
1922 ; X86-SSE-NEXT: movq %xmm0, (%edx,%eax,4)
1923 ; X86-SSE-NEXT: retl
1925 ; X86-AVX-LABEL: mul_2xi16_varconst3:
1926 ; X86-AVX: # %bb.0: # %entry
1927 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
1928 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
1929 ; X86-AVX-NEXT: movl c, %edx
1930 ; X86-AVX-NEXT: vpmovzxwq {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero
1931 ; X86-AVX-NEXT: vpmuludq {{\.LCPI.*}}, %xmm0, %xmm0
1932 ; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
1933 ; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4)
1934 ; X86-AVX-NEXT: retl
1936 ; X64-SSE-LABEL: mul_2xi16_varconst3:
1937 ; X64-SSE: # %bb.0: # %entry
1938 ; X64-SSE-NEXT: movq {{.*}}(%rip), %rax
1939 ; X64-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1940 ; X64-SSE-NEXT: pxor %xmm1, %xmm1
1941 ; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
1942 ; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
1943 ; X64-SSE-NEXT: pmuludq {{.*}}(%rip), %xmm0
1944 ; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
1945 ; X64-SSE-NEXT: movq %xmm0, (%rax,%rsi,4)
1946 ; X64-SSE-NEXT: retq
1948 ; X64-AVX-LABEL: mul_2xi16_varconst3:
1949 ; X64-AVX: # %bb.0: # %entry
1950 ; X64-AVX-NEXT: movq {{.*}}(%rip), %rax
1951 ; X64-AVX-NEXT: vpmovzxwq {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero
1952 ; X64-AVX-NEXT: vpmuludq {{.*}}(%rip), %xmm0, %xmm0
1953 ; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
1954 ; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4)
1955 ; X64-AVX-NEXT: retq
1956 entry:
1957 %pre = load i32*, i32** @c
1958 %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
1959 %tmp7 = bitcast i8* %tmp6 to <2 x i16>*
1960 %wide.load = load <2 x i16>, <2 x i16>* %tmp7, align 1
1961 %tmp8 = zext <2 x i16> %wide.load to <2 x i32>
1962 %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 0, i32 65536>
1963 %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
1964 %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
1965 store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
1966 ret void
1967 }
1969 ; %val = load <2 x i16>
1970 ; %op1 = sext<2 x i32> %val
1971 ; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (0 ~ 32768)
1972 ; %rst = mul <2 x i32> %op1, %op2
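; 32768 is outside the signed 16-bit range, so the sign-extended input is likewise widened and multiplied with pmuludq.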
1974 define void @mul_2xi16_varconst4(i8* nocapture readonly %a, i64 %index) {
1975 ; X86-SSE-LABEL: mul_2xi16_varconst4:
1976 ; X86-SSE: # %bb.0: # %entry
1977 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
1978 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
1979 ; X86-SSE-NEXT: movl c, %edx
1980 ; X86-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1981 ; X86-SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7]
1982 ; X86-SSE-NEXT: psrad $16, %xmm0
1983 ; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
1984 ; X86-SSE-NEXT: pmuludq {{\.LCPI.*}}, %xmm0
1985 ; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
1986 ; X86-SSE-NEXT: movq %xmm0, (%edx,%eax,4)
1987 ; X86-SSE-NEXT: retl
1989 ; X86-AVX-LABEL: mul_2xi16_varconst4:
1990 ; X86-AVX: # %bb.0: # %entry
1991 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
1992 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
1993 ; X86-AVX-NEXT: movl c, %edx
1994 ; X86-AVX-NEXT: vpmovsxwq (%ecx,%eax), %xmm0
1995 ; X86-AVX-NEXT: vpmuludq {{\.LCPI.*}}, %xmm0, %xmm0
1996 ; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
1997 ; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4)
1998 ; X86-AVX-NEXT: retl
2000 ; X64-SSE-LABEL: mul_2xi16_varconst4:
2001 ; X64-SSE: # %bb.0: # %entry
2002 ; X64-SSE-NEXT: movq {{.*}}(%rip), %rax
2003 ; X64-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
2004 ; X64-SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7]
2005 ; X64-SSE-NEXT: psrad $16, %xmm0
2006 ; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
2007 ; X64-SSE-NEXT: pmuludq {{.*}}(%rip), %xmm0
2008 ; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
2009 ; X64-SSE-NEXT: movq %xmm0, (%rax,%rsi,4)
2010 ; X64-SSE-NEXT: retq
2012 ; X64-AVX-LABEL: mul_2xi16_varconst4:
2013 ; X64-AVX: # %bb.0: # %entry
2014 ; X64-AVX-NEXT: movq {{.*}}(%rip), %rax
2015 ; X64-AVX-NEXT: vpmovsxwq (%rdi,%rsi), %xmm0
2016 ; X64-AVX-NEXT: vpmuludq {{.*}}(%rip), %xmm0, %xmm0
2017 ; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
2018 ; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4)
2019 ; X64-AVX-NEXT: retq
2020 entry:
2021 %pre = load i32*, i32** @c
2022 %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
2023 %tmp7 = bitcast i8* %tmp6 to <2 x i16>*
2024 %wide.load = load <2 x i16>, <2 x i16>* %tmp7, align 1
2025 %tmp8 = sext <2 x i16> %wide.load to <2 x i32>
2026 %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 0, i32 32768>
2027 %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
2028 %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
2029 store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
2030 ret void
2031 }
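; PR34947: judging from the checks below, this regression test appears to zero-extend <9 x i16> data loaded through %p0, take the unsigned remainder against <9 x i32> values loaded through %p1, and multiply each remainder by 8199 (0x2007), with the ninth lane handled as a scalar divl/imull.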
2037 define void @PR34947(<9 x i16>* %p0, <9 x i32>* %p1) nounwind {
2038 ; X86-SSE-LABEL: PR34947:
2039 ; X86-SSE: # %bb.0:
2040 ; X86-SSE-NEXT: pushl %esi
2041 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
2042 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
2043 ; X86-SSE-NEXT: movdqa (%eax), %xmm5
2044 ; X86-SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
2045 ; X86-SSE-NEXT: movdqa (%ecx), %xmm2
2046 ; X86-SSE-NEXT: movdqa 16(%ecx), %xmm6
2047 ; X86-SSE-NEXT: pxor %xmm0, %xmm0
2048 ; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
2049 ; X86-SSE-NEXT: movdqa %xmm5, %xmm4
2050 ; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3]
2051 ; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7]
2052 ; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[3,1,2,3]
2053 ; X86-SSE-NEXT: movd %xmm0, %eax
2054 ; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[3,1,2,3]
2055 ; X86-SSE-NEXT: movd %xmm0, %esi
2056 ; X86-SSE-NEXT: xorl %edx, %edx
2057 ; X86-SSE-NEXT: divl %esi
2058 ; X86-SSE-NEXT: movd %edx, %xmm0
2059 ; X86-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm5[2,3,0,1]
2060 ; X86-SSE-NEXT: movd %xmm3, %eax
2061 ; X86-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm6[2,3,0,1]
2062 ; X86-SSE-NEXT: movd %xmm3, %esi
2063 ; X86-SSE-NEXT: xorl %edx, %edx
2064 ; X86-SSE-NEXT: divl %esi
2065 ; X86-SSE-NEXT: movd %edx, %xmm7
2066 ; X86-SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1]
2067 ; X86-SSE-NEXT: movd %xmm5, %eax
2068 ; X86-SSE-NEXT: movd %xmm6, %esi
2069 ; X86-SSE-NEXT: xorl %edx, %edx
2070 ; X86-SSE-NEXT: divl %esi
2071 ; X86-SSE-NEXT: movd %edx, %xmm3
2072 ; X86-SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,2,3]
2073 ; X86-SSE-NEXT: movd %xmm5, %eax
2074 ; X86-SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,2,3]
2075 ; X86-SSE-NEXT: movd %xmm5, %esi
2076 ; X86-SSE-NEXT: xorl %edx, %edx
2077 ; X86-SSE-NEXT: divl %esi
2078 ; X86-SSE-NEXT: movd %edx, %xmm5
2079 ; X86-SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1]
2080 ; X86-SSE-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm7[0]
2081 ; X86-SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm4[3,1,2,3]
2082 ; X86-SSE-NEXT: movd %xmm6, %eax
2083 ; X86-SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm2[3,1,2,3]
2084 ; X86-SSE-NEXT: movd %xmm6, %esi
2085 ; X86-SSE-NEXT: xorl %edx, %edx
2086 ; X86-SSE-NEXT: divl %esi
2087 ; X86-SSE-NEXT: movd %edx, %xmm6
2088 ; X86-SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm4[2,3,0,1]
2089 ; X86-SSE-NEXT: movd %xmm7, %eax
2090 ; X86-SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm2[2,3,0,1]
2091 ; X86-SSE-NEXT: movd %xmm7, %esi
2092 ; X86-SSE-NEXT: xorl %edx, %edx
2093 ; X86-SSE-NEXT: divl %esi
2094 ; X86-SSE-NEXT: movd %edx, %xmm7
2095 ; X86-SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1]
2096 ; X86-SSE-NEXT: movd %xmm4, %eax
2097 ; X86-SSE-NEXT: movd %xmm2, %esi
2098 ; X86-SSE-NEXT: xorl %edx, %edx
2099 ; X86-SSE-NEXT: divl %esi
2100 ; X86-SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,2,3]
2101 ; X86-SSE-NEXT: movd %xmm4, %eax
2102 ; X86-SSE-NEXT: movd %edx, %xmm4
2103 ; X86-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,2,3]
2104 ; X86-SSE-NEXT: movd %xmm2, %esi
2105 ; X86-SSE-NEXT: xorl %edx, %edx
2106 ; X86-SSE-NEXT: divl %esi
2107 ; X86-SSE-NEXT: movd %edx, %xmm2
2108 ; X86-SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
2109 ; X86-SSE-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm7[0]
2110 ; X86-SSE-NEXT: movd %xmm1, %eax
2111 ; X86-SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm6[0,0]
2112 ; X86-SSE-NEXT: movdqa {{.*#+}} xmm1 = [8199,8199,8199,8199]
2113 ; X86-SSE-NEXT: pmuludq %xmm1, %xmm4
2114 ; X86-SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
2115 ; X86-SSE-NEXT: pmuludq %xmm1, %xmm2
2116 ; X86-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
2117 ; X86-SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
2118 ; X86-SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,0],xmm0[0,0]
2119 ; X86-SSE-NEXT: pmuludq %xmm1, %xmm3
2120 ; X86-SSE-NEXT: pmuludq %xmm1, %xmm5
2121 ; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
2122 ; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[0,2,2,3]
2123 ; X86-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
2124 ; X86-SSE-NEXT: xorl %edx, %edx
2125 ; X86-SSE-NEXT: divl 32(%ecx)
2126 ; X86-SSE-NEXT: movdqa %xmm0, (%eax)
2127 ; X86-SSE-NEXT: movdqa %xmm4, (%eax)
2128 ; X86-SSE-NEXT: imull $8199, %edx, %eax # imm = 0x2007
2129 ; X86-SSE-NEXT: movl %eax, (%eax)
2130 ; X86-SSE-NEXT: popl %esi
2131 ; X86-SSE-NEXT: retl
2133 ; X86-AVX1-LABEL: PR34947:
2134 ; X86-AVX1: # %bb.0:
2135 ; X86-AVX1-NEXT: pushl %ebp
2136 ; X86-AVX1-NEXT: pushl %ebx
2137 ; X86-AVX1-NEXT: pushl %edi
2138 ; X86-AVX1-NEXT: pushl %esi
2139 ; X86-AVX1-NEXT: subl $16, %esp
2140 ; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %ecx
2141 ; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax
2142 ; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
2143 ; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
2144 ; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
2145 ; X86-AVX1-NEXT: vmovd %xmm1, %eax
2146 ; X86-AVX1-NEXT: xorl %edx, %edx
2147 ; X86-AVX1-NEXT: divl 32(%ecx)
2148 ; X86-AVX1-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
2149 ; X86-AVX1-NEXT: vpextrd $3, %xmm2, %eax
2150 ; X86-AVX1-NEXT: vmovdqa (%ecx), %xmm1
2151 ; X86-AVX1-NEXT: vmovdqa 16(%ecx), %xmm3
2152 ; X86-AVX1-NEXT: vpextrd $3, %xmm3, %ecx
2153 ; X86-AVX1-NEXT: xorl %edx, %edx
2154 ; X86-AVX1-NEXT: divl %ecx
2155 ; X86-AVX1-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
2156 ; X86-AVX1-NEXT: vpextrd $2, %xmm2, %eax
2157 ; X86-AVX1-NEXT: vpextrd $2, %xmm3, %ecx
2158 ; X86-AVX1-NEXT: xorl %edx, %edx
2159 ; X86-AVX1-NEXT: divl %ecx
2160 ; X86-AVX1-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
2161 ; X86-AVX1-NEXT: vpextrd $1, %xmm2, %eax
2162 ; X86-AVX1-NEXT: vpextrd $1, %xmm3, %ecx
2163 ; X86-AVX1-NEXT: xorl %edx, %edx
2164 ; X86-AVX1-NEXT: divl %ecx
2165 ; X86-AVX1-NEXT: movl %edx, (%esp) # 4-byte Spill
2166 ; X86-AVX1-NEXT: vmovd %xmm2, %eax
2167 ; X86-AVX1-NEXT: vmovd %xmm3, %ecx
2168 ; X86-AVX1-NEXT: xorl %edx, %edx
2169 ; X86-AVX1-NEXT: divl %ecx
2170 ; X86-AVX1-NEXT: movl %edx, %ebp
2171 ; X86-AVX1-NEXT: vpextrd $3, %xmm0, %eax
2172 ; X86-AVX1-NEXT: vpextrd $3, %xmm1, %ecx
2173 ; X86-AVX1-NEXT: xorl %edx, %edx
2174 ; X86-AVX1-NEXT: divl %ecx
2175 ; X86-AVX1-NEXT: movl %edx, %ebx
2176 ; X86-AVX1-NEXT: vpextrd $2, %xmm0, %eax
2177 ; X86-AVX1-NEXT: vpextrd $2, %xmm1, %esi
2178 ; X86-AVX1-NEXT: xorl %edx, %edx
2179 ; X86-AVX1-NEXT: divl %esi
2180 ; X86-AVX1-NEXT: movl %edx, %esi
2181 ; X86-AVX1-NEXT: vpextrd $1, %xmm0, %eax
2182 ; X86-AVX1-NEXT: vpextrd $1, %xmm1, %edi
2183 ; X86-AVX1-NEXT: xorl %edx, %edx
2184 ; X86-AVX1-NEXT: divl %edi
2185 ; X86-AVX1-NEXT: movl %edx, %edi
2186 ; X86-AVX1-NEXT: vmovd %xmm0, %eax
2187 ; X86-AVX1-NEXT: vmovd %xmm1, %ecx
2188 ; X86-AVX1-NEXT: xorl %edx, %edx
2189 ; X86-AVX1-NEXT: divl %ecx
2190 ; X86-AVX1-NEXT: vmovd %edx, %xmm0
2191 ; X86-AVX1-NEXT: vpinsrd $1, %edi, %xmm0, %xmm0
2192 ; X86-AVX1-NEXT: vpinsrd $2, %esi, %xmm0, %xmm0
2193 ; X86-AVX1-NEXT: vpinsrd $3, %ebx, %xmm0, %xmm0
2194 ; X86-AVX1-NEXT: vmovd %ebp, %xmm1
2195 ; X86-AVX1-NEXT: vpinsrd $1, (%esp), %xmm1, %xmm1 # 4-byte Folded Reload
2196 ; X86-AVX1-NEXT: vpinsrd $2, {{[-0-9]+}}(%e{{[sb]}}p), %xmm1, %xmm1 # 4-byte Folded Reload
2197 ; X86-AVX1-NEXT: vpinsrd $3, {{[-0-9]+}}(%e{{[sb]}}p), %xmm1, %xmm1 # 4-byte Folded Reload
2198 ; X86-AVX1-NEXT: imull $8199, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
2199 ; X86-AVX1-NEXT: # imm = 0x2007
2200 ; X86-AVX1-NEXT: movl %eax, (%eax)
2201 ; X86-AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [8199,8199,8199,8199]
2202 ; X86-AVX1-NEXT: vpmulld %xmm2, %xmm0, %xmm0
2203 ; X86-AVX1-NEXT: vpmulld %xmm2, %xmm1, %xmm1
2204 ; X86-AVX1-NEXT: vmovdqa %xmm1, (%eax)
2205 ; X86-AVX1-NEXT: vmovdqa %xmm0, (%eax)
2206 ; X86-AVX1-NEXT: addl $16, %esp
2207 ; X86-AVX1-NEXT: popl %esi
2208 ; X86-AVX1-NEXT: popl %edi
2209 ; X86-AVX1-NEXT: popl %ebx
2210 ; X86-AVX1-NEXT: popl %ebp
2211 ; X86-AVX1-NEXT: retl
2213 ; X86-AVX2-LABEL: PR34947:
2214 ; X86-AVX2: # %bb.0:
2215 ; X86-AVX2-NEXT: pushl %edi
2216 ; X86-AVX2-NEXT: pushl %esi
2217 ; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %esi
2218 ; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax
2219 ; X86-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
2220 ; X86-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
2221 ; X86-AVX2-NEXT: vmovdqa (%esi), %xmm2
2222 ; X86-AVX2-NEXT: vmovdqa 16(%esi), %xmm3
2223 ; X86-AVX2-NEXT: vpextrd $1, %xmm3, %ecx
2224 ; X86-AVX2-NEXT: vextracti128 $1, %ymm1, %xmm4
2225 ; X86-AVX2-NEXT: vpextrd $1, %xmm4, %eax
2226 ; X86-AVX2-NEXT: xorl %edx, %edx
2227 ; X86-AVX2-NEXT: divl %ecx
2228 ; X86-AVX2-NEXT: movl %edx, %ecx
2229 ; X86-AVX2-NEXT: vmovd %xmm3, %edi
2230 ; X86-AVX2-NEXT: vmovd %xmm4, %eax
2231 ; X86-AVX2-NEXT: xorl %edx, %edx
2232 ; X86-AVX2-NEXT: divl %edi
2233 ; X86-AVX2-NEXT: vmovd %edx, %xmm5
2234 ; X86-AVX2-NEXT: vpinsrd $1, %ecx, %xmm5, %xmm5
2235 ; X86-AVX2-NEXT: vpextrd $2, %xmm3, %ecx
2236 ; X86-AVX2-NEXT: vpextrd $2, %xmm4, %eax
2237 ; X86-AVX2-NEXT: xorl %edx, %edx
2238 ; X86-AVX2-NEXT: divl %ecx
2239 ; X86-AVX2-NEXT: vpinsrd $2, %edx, %xmm5, %xmm5
2240 ; X86-AVX2-NEXT: vpextrd $3, %xmm3, %ecx
2241 ; X86-AVX2-NEXT: vpextrd $3, %xmm4, %eax
2242 ; X86-AVX2-NEXT: xorl %edx, %edx
2243 ; X86-AVX2-NEXT: divl %ecx
2244 ; X86-AVX2-NEXT: vpinsrd $3, %edx, %xmm5, %xmm3
2245 ; X86-AVX2-NEXT: vpextrd $1, %xmm2, %ecx
2246 ; X86-AVX2-NEXT: vpextrd $1, %xmm1, %eax
2247 ; X86-AVX2-NEXT: xorl %edx, %edx
2248 ; X86-AVX2-NEXT: divl %ecx
2249 ; X86-AVX2-NEXT: movl %edx, %ecx
2250 ; X86-AVX2-NEXT: vmovd %xmm2, %edi
2251 ; X86-AVX2-NEXT: vmovd %xmm1, %eax
2252 ; X86-AVX2-NEXT: xorl %edx, %edx
2253 ; X86-AVX2-NEXT: divl %edi
2254 ; X86-AVX2-NEXT: vmovd %edx, %xmm4
2255 ; X86-AVX2-NEXT: vpinsrd $1, %ecx, %xmm4, %xmm4
2256 ; X86-AVX2-NEXT: vpextrd $2, %xmm2, %ecx
2257 ; X86-AVX2-NEXT: vpextrd $2, %xmm1, %eax
2258 ; X86-AVX2-NEXT: xorl %edx, %edx
2259 ; X86-AVX2-NEXT: divl %ecx
2260 ; X86-AVX2-NEXT: vpinsrd $2, %edx, %xmm4, %xmm4
2261 ; X86-AVX2-NEXT: vpextrd $3, %xmm2, %ecx
2262 ; X86-AVX2-NEXT: vpextrd $3, %xmm1, %eax
2263 ; X86-AVX2-NEXT: xorl %edx, %edx
2264 ; X86-AVX2-NEXT: divl %ecx
2265 ; X86-AVX2-NEXT: vpinsrd $3, %edx, %xmm4, %xmm1
2266 ; X86-AVX2-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1
2267 ; X86-AVX2-NEXT: vmovd %xmm0, %eax
2268 ; X86-AVX2-NEXT: xorl %edx, %edx
2269 ; X86-AVX2-NEXT: divl 32(%esi)
2270 ; X86-AVX2-NEXT: vpbroadcastd {{.*#+}} ymm0 = [8199,8199,8199,8199,8199,8199,8199,8199]
2271 ; X86-AVX2-NEXT: vpmulld %ymm0, %ymm1, %ymm0
2272 ; X86-AVX2-NEXT: imull $8199, %edx, %eax # imm = 0x2007
2273 ; X86-AVX2-NEXT: movl %eax, (%eax)
2274 ; X86-AVX2-NEXT: vmovdqa %ymm0, (%eax)
2275 ; X86-AVX2-NEXT: popl %esi
2276 ; X86-AVX2-NEXT: popl %edi
2277 ; X86-AVX2-NEXT: vzeroupper
2278 ; X86-AVX2-NEXT: retl
2280 ; X64-SSE-LABEL: PR34947:
2281 ; X64-SSE: # %bb.0:
2282 ; X64-SSE-NEXT: movdqa (%rdi), %xmm5
2283 ; X64-SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
2284 ; X64-SSE-NEXT: movdqa (%rsi), %xmm2
2285 ; X64-SSE-NEXT: movdqa 16(%rsi), %xmm6
2286 ; X64-SSE-NEXT: pxor %xmm0, %xmm0
; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; X64-SSE-NEXT: movdqa %xmm5, %xmm3
; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3]
; X64-SSE-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7]
; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[3,1,2,3]
; X64-SSE-NEXT: movd %xmm0, %eax
; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[3,1,2,3]
; X64-SSE-NEXT: movd %xmm0, %ecx
; X64-SSE-NEXT: xorl %edx, %edx
; X64-SSE-NEXT: divl %ecx
; X64-SSE-NEXT: movd %edx, %xmm8
; X64-SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[2,3,0,1]
; X64-SSE-NEXT: movd %xmm4, %eax
; X64-SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm6[2,3,0,1]
; X64-SSE-NEXT: movd %xmm4, %ecx
; X64-SSE-NEXT: xorl %edx, %edx
; X64-SSE-NEXT: divl %ecx
; X64-SSE-NEXT: movd %edx, %xmm7
; X64-SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1]
; X64-SSE-NEXT: movd %xmm5, %eax
; X64-SSE-NEXT: movd %xmm6, %ecx
; X64-SSE-NEXT: xorl %edx, %edx
; X64-SSE-NEXT: divl %ecx
; X64-SSE-NEXT: movd %edx, %xmm4
; X64-SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,2,3]
; X64-SSE-NEXT: movd %xmm5, %eax
; X64-SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,2,3]
; X64-SSE-NEXT: movd %xmm5, %ecx
; X64-SSE-NEXT: xorl %edx, %edx
; X64-SSE-NEXT: divl %ecx
; X64-SSE-NEXT: movd %edx, %xmm5
; X64-SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
; X64-SSE-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm7[0]
; X64-SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm3[3,1,2,3]
; X64-SSE-NEXT: movd %xmm6, %eax
; X64-SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm2[3,1,2,3]
; X64-SSE-NEXT: movd %xmm6, %ecx
; X64-SSE-NEXT: xorl %edx, %edx
; X64-SSE-NEXT: divl %ecx
; X64-SSE-NEXT: movd %edx, %xmm6
; X64-SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm3[2,3,0,1]
; X64-SSE-NEXT: movd %xmm7, %eax
; X64-SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm2[2,3,0,1]
; X64-SSE-NEXT: movd %xmm7, %ecx
; X64-SSE-NEXT: xorl %edx, %edx
; X64-SSE-NEXT: divl %ecx
; X64-SSE-NEXT: movd %edx, %xmm7
; X64-SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1]
; X64-SSE-NEXT: movd %xmm3, %eax
; X64-SSE-NEXT: movd %xmm2, %ecx
; X64-SSE-NEXT: xorl %edx, %edx
; X64-SSE-NEXT: divl %ecx
; X64-SSE-NEXT: movd %edx, %xmm0
; X64-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,2,3]
; X64-SSE-NEXT: movd %xmm3, %eax
; X64-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,2,3]
; X64-SSE-NEXT: movd %xmm2, %ecx
; X64-SSE-NEXT: xorl %edx, %edx
; X64-SSE-NEXT: divl %ecx
; X64-SSE-NEXT: movd %edx, %xmm2
; X64-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; X64-SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm7[0]
; X64-SSE-NEXT: movd %xmm1, %eax
; X64-SSE-NEXT: xorl %edx, %edx
; X64-SSE-NEXT: divl 32(%rsi)
; X64-SSE-NEXT: movdqa {{.*#+}} xmm1 = [8199,8199,8199,8199]
; X64-SSE-NEXT: pmuludq %xmm1, %xmm0
; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X64-SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm6[0,0]
; X64-SSE-NEXT: pmuludq %xmm1, %xmm2
; X64-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; X64-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; X64-SSE-NEXT: pmuludq %xmm1, %xmm4
; X64-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,2,2,3]
; X64-SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,0],xmm8[0,0]
; X64-SSE-NEXT: pmuludq %xmm1, %xmm5
; X64-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[0,2,2,3]
; X64-SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; X64-SSE-NEXT: imull $8199, %edx, %eax # imm = 0x2007
; X64-SSE-NEXT: movl %eax, (%rax)
; X64-SSE-NEXT: movdqa %xmm2, (%rax)
; X64-SSE-NEXT: movdqa %xmm0, (%rax)
; X64-SSE-NEXT: retq
;
; X64-AVX1-LABEL: PR34947:
; X64-AVX1: # %bb.0:
; X64-AVX1-NEXT: pushq %rbp
; X64-AVX1-NEXT: pushq %rbx
; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; X64-AVX1-NEXT: vmovd %xmm1, %eax
; X64-AVX1-NEXT: xorl %edx, %edx
; X64-AVX1-NEXT: divl 32(%rsi)
; X64-AVX1-NEXT: movl %edx, %r8d
; X64-AVX1-NEXT: vpextrd $3, %xmm2, %eax
; X64-AVX1-NEXT: vmovdqa (%rsi), %xmm1
; X64-AVX1-NEXT: vmovdqa 16(%rsi), %xmm3
; X64-AVX1-NEXT: vpextrd $3, %xmm3, %ecx
; X64-AVX1-NEXT: xorl %edx, %edx
; X64-AVX1-NEXT: divl %ecx
; X64-AVX1-NEXT: movl %edx, %r9d
; X64-AVX1-NEXT: vpextrd $2, %xmm2, %eax
; X64-AVX1-NEXT: vpextrd $2, %xmm3, %ecx
; X64-AVX1-NEXT: xorl %edx, %edx
; X64-AVX1-NEXT: divl %ecx
; X64-AVX1-NEXT: movl %edx, %r10d
; X64-AVX1-NEXT: vpextrd $1, %xmm2, %eax
; X64-AVX1-NEXT: vpextrd $1, %xmm3, %ecx
; X64-AVX1-NEXT: xorl %edx, %edx
; X64-AVX1-NEXT: divl %ecx
; X64-AVX1-NEXT: movl %edx, %r11d
; X64-AVX1-NEXT: vmovd %xmm2, %eax
; X64-AVX1-NEXT: vmovd %xmm3, %ecx
; X64-AVX1-NEXT: xorl %edx, %edx
; X64-AVX1-NEXT: divl %ecx
; X64-AVX1-NEXT: movl %edx, %esi
; X64-AVX1-NEXT: vpextrd $3, %xmm0, %eax
; X64-AVX1-NEXT: vpextrd $3, %xmm1, %ecx
; X64-AVX1-NEXT: xorl %edx, %edx
; X64-AVX1-NEXT: divl %ecx
; X64-AVX1-NEXT: movl %edx, %edi
; X64-AVX1-NEXT: vpextrd $2, %xmm0, %eax
; X64-AVX1-NEXT: vpextrd $2, %xmm1, %ecx
; X64-AVX1-NEXT: xorl %edx, %edx
; X64-AVX1-NEXT: divl %ecx
; X64-AVX1-NEXT: movl %edx, %ecx
; X64-AVX1-NEXT: vpextrd $1, %xmm0, %eax
; X64-AVX1-NEXT: vpextrd $1, %xmm1, %ebx
; X64-AVX1-NEXT: xorl %edx, %edx
; X64-AVX1-NEXT: divl %ebx
; X64-AVX1-NEXT: movl %edx, %ebx
; X64-AVX1-NEXT: vmovd %xmm0, %eax
; X64-AVX1-NEXT: vmovd %xmm1, %ebp
; X64-AVX1-NEXT: xorl %edx, %edx
; X64-AVX1-NEXT: divl %ebp
; X64-AVX1-NEXT: vmovd %edx, %xmm0
; X64-AVX1-NEXT: vpinsrd $1, %ebx, %xmm0, %xmm0
; X64-AVX1-NEXT: vpinsrd $2, %ecx, %xmm0, %xmm0
; X64-AVX1-NEXT: vpinsrd $3, %edi, %xmm0, %xmm0
; X64-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [8199,8199,8199,8199]
; X64-AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0
; X64-AVX1-NEXT: vmovd %esi, %xmm2
; X64-AVX1-NEXT: vpinsrd $1, %r11d, %xmm2, %xmm2
; X64-AVX1-NEXT: vpinsrd $2, %r10d, %xmm2, %xmm2
; X64-AVX1-NEXT: vpinsrd $3, %r9d, %xmm2, %xmm2
; X64-AVX1-NEXT: vpmulld %xmm1, %xmm2, %xmm1
; X64-AVX1-NEXT: imull $8199, %r8d, %eax # imm = 0x2007
; X64-AVX1-NEXT: movl %eax, (%rax)
; X64-AVX1-NEXT: vmovdqa %xmm1, (%rax)
; X64-AVX1-NEXT: vmovdqa %xmm0, (%rax)
; X64-AVX1-NEXT: popq %rbx
; X64-AVX1-NEXT: popq %rbp
; X64-AVX1-NEXT: retq
;
; X64-AVX2-LABEL: PR34947:
; X64-AVX2: # %bb.0:
; X64-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
; X64-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
; X64-AVX2-NEXT: vmovdqa (%rsi), %xmm2
; X64-AVX2-NEXT: vmovdqa 16(%rsi), %xmm3
; X64-AVX2-NEXT: vpextrd $1, %xmm3, %ecx
; X64-AVX2-NEXT: vextracti128 $1, %ymm1, %xmm4
; X64-AVX2-NEXT: vpextrd $1, %xmm4, %eax
; X64-AVX2-NEXT: xorl %edx, %edx
; X64-AVX2-NEXT: divl %ecx
; X64-AVX2-NEXT: movl %edx, %ecx
; X64-AVX2-NEXT: vmovd %xmm3, %edi
; X64-AVX2-NEXT: vmovd %xmm4, %eax
; X64-AVX2-NEXT: xorl %edx, %edx
; X64-AVX2-NEXT: divl %edi
; X64-AVX2-NEXT: vmovd %edx, %xmm5
; X64-AVX2-NEXT: vpinsrd $1, %ecx, %xmm5, %xmm5
; X64-AVX2-NEXT: vpextrd $2, %xmm3, %ecx
; X64-AVX2-NEXT: vpextrd $2, %xmm4, %eax
; X64-AVX2-NEXT: xorl %edx, %edx
; X64-AVX2-NEXT: divl %ecx
; X64-AVX2-NEXT: vpinsrd $2, %edx, %xmm5, %xmm5
; X64-AVX2-NEXT: vpextrd $3, %xmm3, %ecx
; X64-AVX2-NEXT: vpextrd $3, %xmm4, %eax
; X64-AVX2-NEXT: xorl %edx, %edx
; X64-AVX2-NEXT: divl %ecx
; X64-AVX2-NEXT: vpinsrd $3, %edx, %xmm5, %xmm3
; X64-AVX2-NEXT: vpextrd $1, %xmm2, %ecx
; X64-AVX2-NEXT: vpextrd $1, %xmm1, %eax
; X64-AVX2-NEXT: xorl %edx, %edx
; X64-AVX2-NEXT: divl %ecx
; X64-AVX2-NEXT: movl %edx, %ecx
; X64-AVX2-NEXT: vmovd %xmm2, %edi
; X64-AVX2-NEXT: vmovd %xmm1, %eax
; X64-AVX2-NEXT: xorl %edx, %edx
; X64-AVX2-NEXT: divl %edi
; X64-AVX2-NEXT: vmovd %edx, %xmm4
; X64-AVX2-NEXT: vpinsrd $1, %ecx, %xmm4, %xmm4
; X64-AVX2-NEXT: vpextrd $2, %xmm2, %ecx
; X64-AVX2-NEXT: vpextrd $2, %xmm1, %eax
; X64-AVX2-NEXT: xorl %edx, %edx
; X64-AVX2-NEXT: divl %ecx
; X64-AVX2-NEXT: vpinsrd $2, %edx, %xmm4, %xmm4
; X64-AVX2-NEXT: vpextrd $3, %xmm2, %ecx
; X64-AVX2-NEXT: vpextrd $3, %xmm1, %eax
; X64-AVX2-NEXT: xorl %edx, %edx
; X64-AVX2-NEXT: divl %ecx
; X64-AVX2-NEXT: vpinsrd $3, %edx, %xmm4, %xmm1
; X64-AVX2-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1
; X64-AVX2-NEXT: vmovd %xmm0, %eax
; X64-AVX2-NEXT: xorl %edx, %edx
; X64-AVX2-NEXT: divl 32(%rsi)
; X64-AVX2-NEXT: vpbroadcastd {{.*#+}} ymm0 = [8199,8199,8199,8199,8199,8199,8199,8199]
; X64-AVX2-NEXT: vpmulld %ymm0, %ymm1, %ymm0
; X64-AVX2-NEXT: imull $8199, %edx, %eax # imm = 0x2007
; X64-AVX2-NEXT: movl %eax, (%rax)
; X64-AVX2-NEXT: vmovdqa %ymm0, (%rax)
; X64-AVX2-NEXT: vzeroupper
; X64-AVX2-NEXT: retq
  %a0 = load <9 x i16>, <9 x i16>* %p0, align 64
  %a1 = load <9 x i32>, <9 x i32>* %p1, align 64
  %ext0 = zext <9 x i16> %a0 to <9 x i32>
  %rem = urem <9 x i32> %ext0, %a1
  %mul = mul <9 x i32> <i32 8199, i32 8199, i32 8199, i32 8199, i32 8199, i32 8199, i32 8199, i32 8199, i32 8199>, %rem
  store <9 x i32> %mul, <9 x i32>* undef, align 64
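; The checks above exercise the scalarized lowering of this <9 x i32> urem:
; each lane is extracted (movd/pextrd), divided with an xorl %edx + divl pair,
; and the remainders are reassembled before the multiply by the splatted
; constant 8199 (0x2007); the ninth element is handled as a scalar divl
; followed by imull.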