; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X86 --check-prefix=X86-SSE
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=X86 --check-prefix=X86-AVX --check-prefix=X86-AVX1
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=X86 --check-prefix=X86-AVX --check-prefix=X86-AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X64 --check-prefix=X64-SSE
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX --check-prefix=X64-AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX --check-prefix=X64-AVX2

@c = external global i32*, align 8

; %val1 = load <2 x i8>
; %op1 = zext<2 x i32> %val1
; %val2 = load <2 x i8>
; %op2 = zext<2 x i32> %val2
; %rst = mul <2 x i32> %op1, %op2
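;
; Codegen summary (roughly, per the CHECK lines below): the SSE2 paths load the
; two bytes with movzwl, zero-extend them with punpcklbw against a zeroed
; register, multiply with pmullw and widen the products to i32 with punpcklwd,
; while the AVX paths use vpmovzxbq loads and a single vpmuludq plus vpshufd.
;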
define void @mul_2xi8(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) nounwind {
; X86-SSE-LABEL: mul_2xi8:
; X86-SSE: # %bb.0: # %entry
; X86-SSE-NEXT: pushl %esi
; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-SSE-NEXT: movl c, %esi
; X86-SSE-NEXT: movzwl (%edx,%ecx), %edx
; X86-SSE-NEXT: movd %edx, %xmm0
; X86-SSE-NEXT: movzwl (%eax,%ecx), %eax
; X86-SSE-NEXT: movd %eax, %xmm1
; X86-SSE-NEXT: pxor %xmm2, %xmm2
; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; X86-SSE-NEXT: pmullw %xmm0, %xmm1
; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; X86-SSE-NEXT: movq %xmm1, (%esi,%ecx,4)
; X86-SSE-NEXT: popl %esi
; X86-SSE-NEXT: retl
;
; X86-AVX-LABEL: mul_2xi8:
; X86-AVX: # %bb.0: # %entry
; X86-AVX-NEXT: pushl %esi
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-AVX-NEXT: movl c, %esi
; X86-AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
; X86-AVX-NEXT: vpmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
; X86-AVX-NEXT: vpmuludq %xmm0, %xmm1, %xmm0
; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X86-AVX-NEXT: vmovq %xmm0, (%esi,%ecx,4)
; X86-AVX-NEXT: popl %esi
; X86-AVX-NEXT: retl
;
; X64-SSE-LABEL: mul_2xi8:
; X64-SSE: # %bb.0: # %entry
; X64-SSE-NEXT: movq {{.*}}(%rip), %rax
; X64-SSE-NEXT: movzwl (%rdi,%rdx), %ecx
; X64-SSE-NEXT: movd %ecx, %xmm0
; X64-SSE-NEXT: movzwl (%rsi,%rdx), %ecx
; X64-SSE-NEXT: movd %ecx, %xmm1
; X64-SSE-NEXT: pxor %xmm2, %xmm2
; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; X64-SSE-NEXT: pmullw %xmm0, %xmm1
; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; X64-SSE-NEXT: movq %xmm1, (%rax,%rdx,4)
; X64-SSE-NEXT: retq
;
; X64-AVX-LABEL: mul_2xi8:
; X64-AVX: # %bb.0: # %entry
; X64-AVX-NEXT: movq {{.*}}(%rip), %rax
; X64-AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
; X64-AVX-NEXT: vpmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
; X64-AVX-NEXT: vpmuludq %xmm0, %xmm1, %xmm0
; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rdx,4)
; X64-AVX-NEXT: retq
entry:
  %pre = load i32*, i32** @c
  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
  %tmp7 = bitcast i8* %tmp6 to <2 x i8>*
  %wide.load = load <2 x i8>, <2 x i8>* %tmp7, align 1
  %tmp8 = zext <2 x i8> %wide.load to <2 x i32>
  %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
  %tmp11 = bitcast i8* %tmp10 to <2 x i8>*
  %wide.load17 = load <2 x i8>, <2 x i8>* %tmp11, align 1
  %tmp12 = zext <2 x i8> %wide.load17 to <2 x i32>
  %tmp13 = mul nuw nsw <2 x i32> %tmp12, %tmp8
  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
  %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
  store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
  ret void
}

; %val1 = load <4 x i8>
; %op1 = zext<4 x i32> %val1
; %val2 = load <4 x i8>
; %op2 = zext<4 x i32> %val2
; %rst = mul <4 x i32> %op1, %op2
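;
; Codegen summary: SSE2 again goes through punpcklbw/pmullw/punpcklwd, while
; the AVX paths zero-extend with vpmovzxbd and use vpmaddwd (the odd 16-bit
; lanes are zero, so each 32-bit lane of the pmaddwd result is just the
; product).
;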
define void @mul_4xi8(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) nounwind {
101 ; X86-SSE-LABEL: mul_4xi8:
102 ; X86-SSE: # %bb.0: # %entry
103 ; X86-SSE-NEXT: pushl %esi
104 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
105 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
106 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx
107 ; X86-SSE-NEXT: movl c, %esi
108 ; X86-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
109 ; X86-SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
110 ; X86-SSE-NEXT: pxor %xmm2, %xmm2
111 ; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
112 ; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
113 ; X86-SSE-NEXT: pmullw %xmm0, %xmm1
114 ; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
115 ; X86-SSE-NEXT: movdqu %xmm1, (%esi,%ecx,4)
116 ; X86-SSE-NEXT: popl %esi
119 ; X86-AVX-LABEL: mul_4xi8:
120 ; X86-AVX: # %bb.0: # %entry
121 ; X86-AVX-NEXT: pushl %esi
122 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
123 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
124 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx
125 ; X86-AVX-NEXT: movl c, %esi
126 ; X86-AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
127 ; X86-AVX-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
128 ; X86-AVX-NEXT: vpmaddwd %xmm0, %xmm1, %xmm0
129 ; X86-AVX-NEXT: vmovdqu %xmm0, (%esi,%ecx,4)
130 ; X86-AVX-NEXT: popl %esi
133 ; X64-SSE-LABEL: mul_4xi8:
134 ; X64-SSE: # %bb.0: # %entry
135 ; X64-SSE-NEXT: movq {{.*}}(%rip), %rax
136 ; X64-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
137 ; X64-SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
138 ; X64-SSE-NEXT: pxor %xmm2, %xmm2
139 ; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
140 ; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
141 ; X64-SSE-NEXT: pmullw %xmm0, %xmm1
142 ; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
143 ; X64-SSE-NEXT: movdqu %xmm1, (%rax,%rdx,4)
146 ; X64-AVX-LABEL: mul_4xi8:
147 ; X64-AVX: # %bb.0: # %entry
148 ; X64-AVX-NEXT: movq {{.*}}(%rip), %rax
149 ; X64-AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
150 ; X64-AVX-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
151 ; X64-AVX-NEXT: vpmaddwd %xmm0, %xmm1, %xmm0
152 ; X64-AVX-NEXT: vmovdqu %xmm0, (%rax,%rdx,4)
entry:
  %pre = load i32*, i32** @c
  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
  %tmp7 = bitcast i8* %tmp6 to <4 x i8>*
  %wide.load = load <4 x i8>, <4 x i8>* %tmp7, align 1
  %tmp8 = zext <4 x i8> %wide.load to <4 x i32>
  %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
  %tmp11 = bitcast i8* %tmp10 to <4 x i8>*
  %wide.load17 = load <4 x i8>, <4 x i8>* %tmp11, align 1
  %tmp12 = zext <4 x i8> %wide.load17 to <4 x i32>
  %tmp13 = mul nuw nsw <4 x i32> %tmp12, %tmp8
  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
  %tmp15 = bitcast i32* %tmp14 to <4 x i32>*
  store <4 x i32> %tmp13, <4 x i32>* %tmp15, align 4
  ret void
}

; %val1 = load <8 x i8>
; %op1 = zext<8 x i32> %val1
; %val2 = load <8 x i8>
; %op2 = zext<8 x i32> %val2
; %rst = mul <8 x i32> %op1, %op2
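;
; Codegen summary: SSE2 multiplies the zero-extended bytes with pmullw and
; splits the result across two stores via punpcklwd/punpckhwd; AVX1 does two
; xmm vpmovzxbd/vpmaddwd halves joined with vinsertf128, and AVX2 handles all
; eight elements with a single ymm vpmovzxbd/vpmaddwd pair.
;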
define void @mul_8xi8(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) nounwind {
178 ; X86-SSE-LABEL: mul_8xi8:
179 ; X86-SSE: # %bb.0: # %entry
180 ; X86-SSE-NEXT: pushl %esi
181 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
182 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
183 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx
184 ; X86-SSE-NEXT: movl c, %esi
185 ; X86-SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
186 ; X86-SSE-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
187 ; X86-SSE-NEXT: pxor %xmm2, %xmm2
188 ; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
189 ; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
190 ; X86-SSE-NEXT: pmullw %xmm0, %xmm1
191 ; X86-SSE-NEXT: movdqa %xmm1, %xmm0
192 ; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
193 ; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
194 ; X86-SSE-NEXT: movdqu %xmm1, 16(%esi,%ecx,4)
195 ; X86-SSE-NEXT: movdqu %xmm0, (%esi,%ecx,4)
196 ; X86-SSE-NEXT: popl %esi
199 ; X86-AVX1-LABEL: mul_8xi8:
200 ; X86-AVX1: # %bb.0: # %entry
201 ; X86-AVX1-NEXT: pushl %esi
202 ; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax
203 ; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %ecx
204 ; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %edx
205 ; X86-AVX1-NEXT: movl c, %esi
206 ; X86-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
207 ; X86-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
208 ; X86-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
209 ; X86-AVX1-NEXT: vpmaddwd %xmm0, %xmm2, %xmm0
210 ; X86-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
211 ; X86-AVX1-NEXT: vpmaddwd %xmm1, %xmm2, %xmm1
212 ; X86-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
213 ; X86-AVX1-NEXT: vmovups %ymm0, (%esi,%ecx,4)
214 ; X86-AVX1-NEXT: popl %esi
215 ; X86-AVX1-NEXT: vzeroupper
216 ; X86-AVX1-NEXT: retl
218 ; X86-AVX2-LABEL: mul_8xi8:
219 ; X86-AVX2: # %bb.0: # %entry
220 ; X86-AVX2-NEXT: pushl %esi
221 ; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax
222 ; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %ecx
223 ; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %edx
224 ; X86-AVX2-NEXT: movl c, %esi
225 ; X86-AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
226 ; X86-AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
227 ; X86-AVX2-NEXT: vpmaddwd %ymm0, %ymm1, %ymm0
228 ; X86-AVX2-NEXT: vmovdqu %ymm0, (%esi,%ecx,4)
229 ; X86-AVX2-NEXT: popl %esi
230 ; X86-AVX2-NEXT: vzeroupper
231 ; X86-AVX2-NEXT: retl
233 ; X64-SSE-LABEL: mul_8xi8:
234 ; X64-SSE: # %bb.0: # %entry
235 ; X64-SSE-NEXT: movq {{.*}}(%rip), %rax
236 ; X64-SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
237 ; X64-SSE-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
238 ; X64-SSE-NEXT: pxor %xmm2, %xmm2
239 ; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
240 ; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
241 ; X64-SSE-NEXT: pmullw %xmm0, %xmm1
242 ; X64-SSE-NEXT: movdqa %xmm1, %xmm0
243 ; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
244 ; X64-SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
245 ; X64-SSE-NEXT: movdqu %xmm1, 16(%rax,%rdx,4)
246 ; X64-SSE-NEXT: movdqu %xmm0, (%rax,%rdx,4)
249 ; X64-AVX1-LABEL: mul_8xi8:
250 ; X64-AVX1: # %bb.0: # %entry
251 ; X64-AVX1-NEXT: movq {{.*}}(%rip), %rax
252 ; X64-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
253 ; X64-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
254 ; X64-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
255 ; X64-AVX1-NEXT: vpmaddwd %xmm0, %xmm2, %xmm0
256 ; X64-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
257 ; X64-AVX1-NEXT: vpmaddwd %xmm1, %xmm2, %xmm1
258 ; X64-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
259 ; X64-AVX1-NEXT: vmovups %ymm0, (%rax,%rdx,4)
260 ; X64-AVX1-NEXT: vzeroupper
261 ; X64-AVX1-NEXT: retq
263 ; X64-AVX2-LABEL: mul_8xi8:
264 ; X64-AVX2: # %bb.0: # %entry
265 ; X64-AVX2-NEXT: movq {{.*}}(%rip), %rax
266 ; X64-AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
267 ; X64-AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
268 ; X64-AVX2-NEXT: vpmaddwd %ymm0, %ymm1, %ymm0
269 ; X64-AVX2-NEXT: vmovdqu %ymm0, (%rax,%rdx,4)
270 ; X64-AVX2-NEXT: vzeroupper
271 ; X64-AVX2-NEXT: retq
entry:
  %pre = load i32*, i32** @c
  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
  %tmp7 = bitcast i8* %tmp6 to <8 x i8>*
  %wide.load = load <8 x i8>, <8 x i8>* %tmp7, align 1
  %tmp8 = zext <8 x i8> %wide.load to <8 x i32>
  %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
  %tmp11 = bitcast i8* %tmp10 to <8 x i8>*
  %wide.load17 = load <8 x i8>, <8 x i8>* %tmp11, align 1
  %tmp12 = zext <8 x i8> %wide.load17 to <8 x i32>
  %tmp13 = mul nuw nsw <8 x i32> %tmp12, %tmp8
  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
  %tmp15 = bitcast i32* %tmp14 to <8 x i32>*
  store <8 x i32> %tmp13, <8 x i32>* %tmp15, align 4
  ret void
}

; %val1 = load <16 x i8>
; %op1 = zext<16 x i32> %val1
; %val2 = load <16 x i8>
; %op2 = zext<16 x i32> %val2
; %rst = mul <16 x i32> %op1, %op2
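;
; Codegen summary: SSE2 unpacks the 16-byte loads into low and high halves,
; does two pmullw multiplies and four stores; AVX1 uses four xmm vpmaddwd
; blocks combined with vinsertf128, and AVX2 uses two ymm vpmovzxbd/vpmaddwd
; pairs.
;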
define void @mul_16xi8(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) nounwind {
296 ; X86-SSE-LABEL: mul_16xi8:
297 ; X86-SSE: # %bb.0: # %entry
298 ; X86-SSE-NEXT: pushl %esi
299 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
300 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
301 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx
302 ; X86-SSE-NEXT: movl c, %esi
303 ; X86-SSE-NEXT: movdqu (%edx,%ecx), %xmm0
304 ; X86-SSE-NEXT: movdqu (%eax,%ecx), %xmm1
305 ; X86-SSE-NEXT: pxor %xmm2, %xmm2
306 ; X86-SSE-NEXT: movdqa %xmm0, %xmm3
307 ; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
308 ; X86-SSE-NEXT: movdqa %xmm1, %xmm4
309 ; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
310 ; X86-SSE-NEXT: pmullw %xmm3, %xmm4
311 ; X86-SSE-NEXT: movdqa %xmm4, %xmm3
312 ; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
313 ; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
314 ; X86-SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
315 ; X86-SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
316 ; X86-SSE-NEXT: pmullw %xmm0, %xmm1
317 ; X86-SSE-NEXT: movdqa %xmm1, %xmm0
318 ; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
319 ; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
320 ; X86-SSE-NEXT: movdqu %xmm1, 48(%esi,%ecx,4)
321 ; X86-SSE-NEXT: movdqu %xmm0, 32(%esi,%ecx,4)
322 ; X86-SSE-NEXT: movdqu %xmm4, 16(%esi,%ecx,4)
323 ; X86-SSE-NEXT: movdqu %xmm3, (%esi,%ecx,4)
324 ; X86-SSE-NEXT: popl %esi
327 ; X86-AVX1-LABEL: mul_16xi8:
328 ; X86-AVX1: # %bb.0: # %entry
329 ; X86-AVX1-NEXT: pushl %esi
330 ; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax
331 ; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %ecx
332 ; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %edx
333 ; X86-AVX1-NEXT: movl c, %esi
334 ; X86-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
335 ; X86-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
336 ; X86-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
337 ; X86-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
338 ; X86-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
339 ; X86-AVX1-NEXT: vpmaddwd %xmm0, %xmm4, %xmm0
340 ; X86-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
341 ; X86-AVX1-NEXT: vpmaddwd %xmm1, %xmm4, %xmm1
342 ; X86-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
343 ; X86-AVX1-NEXT: vpmaddwd %xmm2, %xmm4, %xmm2
344 ; X86-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
345 ; X86-AVX1-NEXT: vpmaddwd %xmm3, %xmm4, %xmm3
346 ; X86-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
347 ; X86-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
348 ; X86-AVX1-NEXT: vmovups %ymm0, 32(%esi,%ecx,4)
349 ; X86-AVX1-NEXT: vmovups %ymm2, (%esi,%ecx,4)
350 ; X86-AVX1-NEXT: popl %esi
351 ; X86-AVX1-NEXT: vzeroupper
352 ; X86-AVX1-NEXT: retl
354 ; X86-AVX2-LABEL: mul_16xi8:
355 ; X86-AVX2: # %bb.0: # %entry
356 ; X86-AVX2-NEXT: pushl %esi
357 ; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax
358 ; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %ecx
359 ; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %edx
360 ; X86-AVX2-NEXT: movl c, %esi
361 ; X86-AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
362 ; X86-AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
363 ; X86-AVX2-NEXT: vpmovzxbd {{.*#+}} ymm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
364 ; X86-AVX2-NEXT: vpmaddwd %ymm0, %ymm2, %ymm0
365 ; X86-AVX2-NEXT: vpmovzxbd {{.*#+}} ymm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
366 ; X86-AVX2-NEXT: vpmaddwd %ymm1, %ymm2, %ymm1
367 ; X86-AVX2-NEXT: vmovdqu %ymm0, 32(%esi,%ecx,4)
368 ; X86-AVX2-NEXT: vmovdqu %ymm1, (%esi,%ecx,4)
369 ; X86-AVX2-NEXT: popl %esi
370 ; X86-AVX2-NEXT: vzeroupper
371 ; X86-AVX2-NEXT: retl
373 ; X64-SSE-LABEL: mul_16xi8:
374 ; X64-SSE: # %bb.0: # %entry
375 ; X64-SSE-NEXT: movq {{.*}}(%rip), %rax
376 ; X64-SSE-NEXT: movdqu (%rdi,%rdx), %xmm0
377 ; X64-SSE-NEXT: movdqu (%rsi,%rdx), %xmm1
378 ; X64-SSE-NEXT: pxor %xmm2, %xmm2
379 ; X64-SSE-NEXT: movdqa %xmm0, %xmm3
380 ; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
381 ; X64-SSE-NEXT: movdqa %xmm1, %xmm4
382 ; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
383 ; X64-SSE-NEXT: pmullw %xmm3, %xmm4
384 ; X64-SSE-NEXT: movdqa %xmm4, %xmm3
385 ; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
386 ; X64-SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
387 ; X64-SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
388 ; X64-SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
389 ; X64-SSE-NEXT: pmullw %xmm0, %xmm1
390 ; X64-SSE-NEXT: movdqa %xmm1, %xmm0
391 ; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
392 ; X64-SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
393 ; X64-SSE-NEXT: movdqu %xmm1, 48(%rax,%rdx,4)
394 ; X64-SSE-NEXT: movdqu %xmm0, 32(%rax,%rdx,4)
395 ; X64-SSE-NEXT: movdqu %xmm4, 16(%rax,%rdx,4)
396 ; X64-SSE-NEXT: movdqu %xmm3, (%rax,%rdx,4)
399 ; X64-AVX1-LABEL: mul_16xi8:
400 ; X64-AVX1: # %bb.0: # %entry
401 ; X64-AVX1-NEXT: movq {{.*}}(%rip), %rax
402 ; X64-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
403 ; X64-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
404 ; X64-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
405 ; X64-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
406 ; X64-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
407 ; X64-AVX1-NEXT: vpmaddwd %xmm0, %xmm4, %xmm0
408 ; X64-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
409 ; X64-AVX1-NEXT: vpmaddwd %xmm1, %xmm4, %xmm1
410 ; X64-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
411 ; X64-AVX1-NEXT: vpmaddwd %xmm2, %xmm4, %xmm2
412 ; X64-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
413 ; X64-AVX1-NEXT: vpmaddwd %xmm3, %xmm4, %xmm3
414 ; X64-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
415 ; X64-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
416 ; X64-AVX1-NEXT: vmovups %ymm0, 32(%rax,%rdx,4)
417 ; X64-AVX1-NEXT: vmovups %ymm2, (%rax,%rdx,4)
418 ; X64-AVX1-NEXT: vzeroupper
419 ; X64-AVX1-NEXT: retq
421 ; X64-AVX2-LABEL: mul_16xi8:
422 ; X64-AVX2: # %bb.0: # %entry
423 ; X64-AVX2-NEXT: movq {{.*}}(%rip), %rax
424 ; X64-AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
425 ; X64-AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
426 ; X64-AVX2-NEXT: vpmovzxbd {{.*#+}} ymm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
427 ; X64-AVX2-NEXT: vpmaddwd %ymm0, %ymm2, %ymm0
428 ; X64-AVX2-NEXT: vpmovzxbd {{.*#+}} ymm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
429 ; X64-AVX2-NEXT: vpmaddwd %ymm1, %ymm2, %ymm1
430 ; X64-AVX2-NEXT: vmovdqu %ymm0, 32(%rax,%rdx,4)
431 ; X64-AVX2-NEXT: vmovdqu %ymm1, (%rax,%rdx,4)
432 ; X64-AVX2-NEXT: vzeroupper
433 ; X64-AVX2-NEXT: retq
entry:
  %pre = load i32*, i32** @c
  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
  %tmp7 = bitcast i8* %tmp6 to <16 x i8>*
  %wide.load = load <16 x i8>, <16 x i8>* %tmp7, align 1
  %tmp8 = zext <16 x i8> %wide.load to <16 x i32>
  %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
  %tmp11 = bitcast i8* %tmp10 to <16 x i8>*
  %wide.load17 = load <16 x i8>, <16 x i8>* %tmp11, align 1
  %tmp12 = zext <16 x i8> %wide.load17 to <16 x i32>
  %tmp13 = mul nuw nsw <16 x i32> %tmp12, %tmp8
  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
  %tmp15 = bitcast i32* %tmp14 to <16 x i32>*
  store <16 x i32> %tmp13, <16 x i32>* %tmp15, align 4
  ret void
}

; %val1 = load <2 x i16>
; %op1 = zext<2 x i32> %val1
; %val2 = load <2 x i16>
; %op2 = zext<2 x i32> %val2
; %rst = mul <2 x i32> %op1, %op2
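;
; Codegen summary: SSE2 forms the 32-bit products by combining pmullw (low
; halves) and pmulhuw (high halves) with punpcklwd; the AVX paths zero-extend
; with vpmovzxwd and multiply with vpmulld.
;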
define void @mul_2xi16(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) nounwind {
458 ; X86-SSE-LABEL: mul_2xi16:
459 ; X86-SSE: # %bb.0: # %entry
460 ; X86-SSE-NEXT: pushl %esi
461 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
462 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
463 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx
464 ; X86-SSE-NEXT: movl c, %esi
465 ; X86-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
466 ; X86-SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
467 ; X86-SSE-NEXT: movdqa %xmm1, %xmm2
468 ; X86-SSE-NEXT: pmulhuw %xmm0, %xmm2
469 ; X86-SSE-NEXT: pmullw %xmm0, %xmm1
470 ; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
471 ; X86-SSE-NEXT: movq %xmm1, (%esi,%ecx,4)
472 ; X86-SSE-NEXT: popl %esi
475 ; X86-AVX-LABEL: mul_2xi16:
476 ; X86-AVX: # %bb.0: # %entry
477 ; X86-AVX-NEXT: pushl %esi
478 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
479 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
480 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx
481 ; X86-AVX-NEXT: movl c, %esi
482 ; X86-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
483 ; X86-AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
484 ; X86-AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
485 ; X86-AVX-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
486 ; X86-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0
487 ; X86-AVX-NEXT: vmovq %xmm0, (%esi,%ecx,4)
488 ; X86-AVX-NEXT: popl %esi
491 ; X64-SSE-LABEL: mul_2xi16:
492 ; X64-SSE: # %bb.0: # %entry
493 ; X64-SSE-NEXT: movq {{.*}}(%rip), %rax
494 ; X64-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
495 ; X64-SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
496 ; X64-SSE-NEXT: movdqa %xmm1, %xmm2
497 ; X64-SSE-NEXT: pmulhuw %xmm0, %xmm2
498 ; X64-SSE-NEXT: pmullw %xmm0, %xmm1
499 ; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
500 ; X64-SSE-NEXT: movq %xmm1, (%rax,%rdx,4)
503 ; X64-AVX-LABEL: mul_2xi16:
504 ; X64-AVX: # %bb.0: # %entry
505 ; X64-AVX-NEXT: movq {{.*}}(%rip), %rax
506 ; X64-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
507 ; X64-AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
508 ; X64-AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
509 ; X64-AVX-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
510 ; X64-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0
511 ; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rdx,4)
entry:
  %pre = load i32*, i32** @c
  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
  %tmp7 = bitcast i8* %tmp6 to <2 x i16>*
  %wide.load = load <2 x i16>, <2 x i16>* %tmp7, align 1
  %tmp8 = zext <2 x i16> %wide.load to <2 x i32>
  %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
  %tmp11 = bitcast i8* %tmp10 to <2 x i16>*
  %wide.load17 = load <2 x i16>, <2 x i16>* %tmp11, align 1
  %tmp12 = zext <2 x i16> %wide.load17 to <2 x i32>
  %tmp13 = mul nuw nsw <2 x i32> %tmp12, %tmp8
  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
  %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
  store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
  ret void
}

; %val1 = load <4 x i16>
; %op1 = zext<4 x i32> %val1
; %val2 = load <4 x i16>
; %op2 = zext<4 x i32> %val2
; %rst = mul <4 x i32> %op1, %op2
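;
; Codegen summary: the same pmullw/pmulhuw + punpcklwd pattern as the 2 x i16
; case on SSE2, and vpmovzxwd + vpmulld on AVX, now producing a full xmm store.
;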
define void @mul_4xi16(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) nounwind {
537 ; X86-SSE-LABEL: mul_4xi16:
538 ; X86-SSE: # %bb.0: # %entry
539 ; X86-SSE-NEXT: pushl %esi
540 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
541 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
542 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx
543 ; X86-SSE-NEXT: movl c, %esi
544 ; X86-SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
545 ; X86-SSE-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
546 ; X86-SSE-NEXT: movdqa %xmm1, %xmm2
547 ; X86-SSE-NEXT: pmulhuw %xmm0, %xmm2
548 ; X86-SSE-NEXT: pmullw %xmm0, %xmm1
549 ; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
550 ; X86-SSE-NEXT: movdqu %xmm1, (%esi,%ecx,4)
551 ; X86-SSE-NEXT: popl %esi
554 ; X86-AVX-LABEL: mul_4xi16:
555 ; X86-AVX: # %bb.0: # %entry
556 ; X86-AVX-NEXT: pushl %esi
557 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
558 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
559 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx
560 ; X86-AVX-NEXT: movl c, %esi
561 ; X86-AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
562 ; X86-AVX-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
563 ; X86-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0
564 ; X86-AVX-NEXT: vmovdqu %xmm0, (%esi,%ecx,4)
565 ; X86-AVX-NEXT: popl %esi
568 ; X64-SSE-LABEL: mul_4xi16:
569 ; X64-SSE: # %bb.0: # %entry
570 ; X64-SSE-NEXT: movq {{.*}}(%rip), %rax
571 ; X64-SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
572 ; X64-SSE-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
573 ; X64-SSE-NEXT: movdqa %xmm1, %xmm2
574 ; X64-SSE-NEXT: pmulhuw %xmm0, %xmm2
575 ; X64-SSE-NEXT: pmullw %xmm0, %xmm1
576 ; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
577 ; X64-SSE-NEXT: movdqu %xmm1, (%rax,%rdx,4)
580 ; X64-AVX-LABEL: mul_4xi16:
581 ; X64-AVX: # %bb.0: # %entry
582 ; X64-AVX-NEXT: movq {{.*}}(%rip), %rax
583 ; X64-AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
584 ; X64-AVX-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
585 ; X64-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0
586 ; X64-AVX-NEXT: vmovdqu %xmm0, (%rax,%rdx,4)
entry:
  %pre = load i32*, i32** @c
  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
  %tmp7 = bitcast i8* %tmp6 to <4 x i16>*
  %wide.load = load <4 x i16>, <4 x i16>* %tmp7, align 1
  %tmp8 = zext <4 x i16> %wide.load to <4 x i32>
  %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
  %tmp11 = bitcast i8* %tmp10 to <4 x i16>*
  %wide.load17 = load <4 x i16>, <4 x i16>* %tmp11, align 1
  %tmp12 = zext <4 x i16> %wide.load17 to <4 x i32>
  %tmp13 = mul nuw nsw <4 x i32> %tmp12, %tmp8
  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
  %tmp15 = bitcast i32* %tmp14 to <4 x i32>*
  store <4 x i32> %tmp13, <4 x i32>* %tmp15, align 4
  ret void
}

; %val1 = load <8 x i16>
; %op1 = zext<8 x i32> %val1
; %val2 = load <8 x i16>
; %op2 = zext<8 x i32> %val2
; %rst = mul <8 x i32> %op1, %op2
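;
; Codegen summary: SSE2 interleaves the pmullw/pmulhuw results into low and
; high halves (two stores); AVX1 splits the work into two xmm vpmulld
; operations joined with vinsertf128, and AVX2 uses a single ymm
; vpmovzxwd/vpmulld pair.
;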
define void @mul_8xi16(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) nounwind {
612 ; X86-SSE-LABEL: mul_8xi16:
613 ; X86-SSE: # %bb.0: # %entry
614 ; X86-SSE-NEXT: pushl %esi
615 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
616 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
617 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx
618 ; X86-SSE-NEXT: movl c, %esi
619 ; X86-SSE-NEXT: movdqu (%edx,%ecx), %xmm0
620 ; X86-SSE-NEXT: movdqu (%eax,%ecx), %xmm1
621 ; X86-SSE-NEXT: movdqa %xmm1, %xmm2
622 ; X86-SSE-NEXT: pmulhuw %xmm0, %xmm2
623 ; X86-SSE-NEXT: pmullw %xmm0, %xmm1
624 ; X86-SSE-NEXT: movdqa %xmm1, %xmm0
625 ; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
626 ; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
627 ; X86-SSE-NEXT: movdqu %xmm1, 16(%esi,%ecx,4)
628 ; X86-SSE-NEXT: movdqu %xmm0, (%esi,%ecx,4)
629 ; X86-SSE-NEXT: popl %esi
632 ; X86-AVX1-LABEL: mul_8xi16:
633 ; X86-AVX1: # %bb.0: # %entry
634 ; X86-AVX1-NEXT: pushl %esi
635 ; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax
636 ; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %ecx
637 ; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %edx
638 ; X86-AVX1-NEXT: movl c, %esi
639 ; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
640 ; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
641 ; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
642 ; X86-AVX1-NEXT: vpmulld %xmm0, %xmm2, %xmm0
643 ; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
644 ; X86-AVX1-NEXT: vpmulld %xmm1, %xmm2, %xmm1
645 ; X86-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
646 ; X86-AVX1-NEXT: vmovups %ymm0, (%esi,%ecx,4)
647 ; X86-AVX1-NEXT: popl %esi
648 ; X86-AVX1-NEXT: vzeroupper
649 ; X86-AVX1-NEXT: retl
651 ; X86-AVX2-LABEL: mul_8xi16:
652 ; X86-AVX2: # %bb.0: # %entry
653 ; X86-AVX2-NEXT: pushl %esi
654 ; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax
655 ; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %ecx
656 ; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %edx
657 ; X86-AVX2-NEXT: movl c, %esi
658 ; X86-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
659 ; X86-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
660 ; X86-AVX2-NEXT: vpmulld %ymm0, %ymm1, %ymm0
661 ; X86-AVX2-NEXT: vmovdqu %ymm0, (%esi,%ecx,4)
662 ; X86-AVX2-NEXT: popl %esi
663 ; X86-AVX2-NEXT: vzeroupper
664 ; X86-AVX2-NEXT: retl
666 ; X64-SSE-LABEL: mul_8xi16:
667 ; X64-SSE: # %bb.0: # %entry
668 ; X64-SSE-NEXT: movq {{.*}}(%rip), %rax
669 ; X64-SSE-NEXT: movdqu (%rdi,%rdx), %xmm0
670 ; X64-SSE-NEXT: movdqu (%rsi,%rdx), %xmm1
671 ; X64-SSE-NEXT: movdqa %xmm1, %xmm2
672 ; X64-SSE-NEXT: pmulhuw %xmm0, %xmm2
673 ; X64-SSE-NEXT: pmullw %xmm0, %xmm1
674 ; X64-SSE-NEXT: movdqa %xmm1, %xmm0
675 ; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
676 ; X64-SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
677 ; X64-SSE-NEXT: movdqu %xmm1, 16(%rax,%rdx,4)
678 ; X64-SSE-NEXT: movdqu %xmm0, (%rax,%rdx,4)
681 ; X64-AVX1-LABEL: mul_8xi16:
682 ; X64-AVX1: # %bb.0: # %entry
683 ; X64-AVX1-NEXT: movq {{.*}}(%rip), %rax
684 ; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
685 ; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
686 ; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
687 ; X64-AVX1-NEXT: vpmulld %xmm0, %xmm2, %xmm0
688 ; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
689 ; X64-AVX1-NEXT: vpmulld %xmm1, %xmm2, %xmm1
690 ; X64-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
691 ; X64-AVX1-NEXT: vmovups %ymm0, (%rax,%rdx,4)
692 ; X64-AVX1-NEXT: vzeroupper
693 ; X64-AVX1-NEXT: retq
695 ; X64-AVX2-LABEL: mul_8xi16:
696 ; X64-AVX2: # %bb.0: # %entry
697 ; X64-AVX2-NEXT: movq {{.*}}(%rip), %rax
698 ; X64-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
699 ; X64-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
700 ; X64-AVX2-NEXT: vpmulld %ymm0, %ymm1, %ymm0
701 ; X64-AVX2-NEXT: vmovdqu %ymm0, (%rax,%rdx,4)
702 ; X64-AVX2-NEXT: vzeroupper
703 ; X64-AVX2-NEXT: retq
entry:
  %pre = load i32*, i32** @c
  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
  %tmp7 = bitcast i8* %tmp6 to <8 x i16>*
  %wide.load = load <8 x i16>, <8 x i16>* %tmp7, align 1
  %tmp8 = zext <8 x i16> %wide.load to <8 x i32>
  %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
  %tmp11 = bitcast i8* %tmp10 to <8 x i16>*
  %wide.load17 = load <8 x i16>, <8 x i16>* %tmp11, align 1
  %tmp12 = zext <8 x i16> %wide.load17 to <8 x i32>
  %tmp13 = mul nuw nsw <8 x i32> %tmp12, %tmp8
  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
  %tmp15 = bitcast i32* %tmp14 to <8 x i32>*
  store <8 x i32> %tmp13, <8 x i32>* %tmp15, align 4
  ret void
}

; %val1 = load <16 x i16>
; %op1 = zext<16 x i32> %val1
; %val2 = load <16 x i16>
; %op2 = zext<16 x i32> %val2
; %rst = mul <16 x i32> %op1, %op2
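;
; Codegen summary: SSE2 repeats the pmullw/pmulhuw + unpack pattern for both
; 8-element halves (four stores); AVX1 uses four xmm vpmulld operations and
; AVX2 two ymm vpmulld operations.
;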
define void @mul_16xi16(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) nounwind {
728 ; X86-SSE-LABEL: mul_16xi16:
729 ; X86-SSE: # %bb.0: # %entry
730 ; X86-SSE-NEXT: pushl %esi
731 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
732 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
733 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx
734 ; X86-SSE-NEXT: movl c, %esi
735 ; X86-SSE-NEXT: movdqu (%edx,%ecx), %xmm0
736 ; X86-SSE-NEXT: movdqu 16(%edx,%ecx), %xmm1
737 ; X86-SSE-NEXT: movdqu (%eax,%ecx), %xmm2
738 ; X86-SSE-NEXT: movdqu 16(%eax,%ecx), %xmm3
739 ; X86-SSE-NEXT: movdqa %xmm2, %xmm4
740 ; X86-SSE-NEXT: pmulhuw %xmm0, %xmm4
741 ; X86-SSE-NEXT: pmullw %xmm0, %xmm2
742 ; X86-SSE-NEXT: movdqa %xmm2, %xmm0
743 ; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
744 ; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
745 ; X86-SSE-NEXT: movdqa %xmm3, %xmm4
746 ; X86-SSE-NEXT: pmulhuw %xmm1, %xmm4
747 ; X86-SSE-NEXT: pmullw %xmm1, %xmm3
748 ; X86-SSE-NEXT: movdqa %xmm3, %xmm1
749 ; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
750 ; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
751 ; X86-SSE-NEXT: movdqu %xmm3, 48(%esi,%ecx,4)
752 ; X86-SSE-NEXT: movdqu %xmm1, 32(%esi,%ecx,4)
753 ; X86-SSE-NEXT: movdqu %xmm2, 16(%esi,%ecx,4)
754 ; X86-SSE-NEXT: movdqu %xmm0, (%esi,%ecx,4)
755 ; X86-SSE-NEXT: popl %esi
758 ; X86-AVX1-LABEL: mul_16xi16:
759 ; X86-AVX1: # %bb.0: # %entry
760 ; X86-AVX1-NEXT: pushl %esi
761 ; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax
762 ; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %ecx
763 ; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %edx
764 ; X86-AVX1-NEXT: movl c, %esi
765 ; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
766 ; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
767 ; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
768 ; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
769 ; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
770 ; X86-AVX1-NEXT: vpmulld %xmm0, %xmm4, %xmm0
771 ; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
772 ; X86-AVX1-NEXT: vpmulld %xmm1, %xmm4, %xmm1
773 ; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
774 ; X86-AVX1-NEXT: vpmulld %xmm2, %xmm4, %xmm2
775 ; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
776 ; X86-AVX1-NEXT: vpmulld %xmm3, %xmm4, %xmm3
777 ; X86-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
778 ; X86-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
779 ; X86-AVX1-NEXT: vmovups %ymm0, 32(%esi,%ecx,4)
780 ; X86-AVX1-NEXT: vmovups %ymm2, (%esi,%ecx,4)
781 ; X86-AVX1-NEXT: popl %esi
782 ; X86-AVX1-NEXT: vzeroupper
783 ; X86-AVX1-NEXT: retl
785 ; X86-AVX2-LABEL: mul_16xi16:
786 ; X86-AVX2: # %bb.0: # %entry
787 ; X86-AVX2-NEXT: pushl %esi
788 ; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax
789 ; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %ecx
790 ; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %edx
791 ; X86-AVX2-NEXT: movl c, %esi
792 ; X86-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
793 ; X86-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
794 ; X86-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
795 ; X86-AVX2-NEXT: vpmulld %ymm0, %ymm2, %ymm0
796 ; X86-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
797 ; X86-AVX2-NEXT: vpmulld %ymm1, %ymm2, %ymm1
798 ; X86-AVX2-NEXT: vmovdqu %ymm0, 32(%esi,%ecx,4)
799 ; X86-AVX2-NEXT: vmovdqu %ymm1, (%esi,%ecx,4)
800 ; X86-AVX2-NEXT: popl %esi
801 ; X86-AVX2-NEXT: vzeroupper
802 ; X86-AVX2-NEXT: retl
804 ; X64-SSE-LABEL: mul_16xi16:
805 ; X64-SSE: # %bb.0: # %entry
806 ; X64-SSE-NEXT: movq {{.*}}(%rip), %rax
807 ; X64-SSE-NEXT: movdqu (%rdi,%rdx), %xmm0
808 ; X64-SSE-NEXT: movdqu 16(%rdi,%rdx), %xmm1
809 ; X64-SSE-NEXT: movdqu (%rsi,%rdx), %xmm2
810 ; X64-SSE-NEXT: movdqu 16(%rsi,%rdx), %xmm3
811 ; X64-SSE-NEXT: movdqa %xmm2, %xmm4
812 ; X64-SSE-NEXT: pmulhuw %xmm0, %xmm4
813 ; X64-SSE-NEXT: pmullw %xmm0, %xmm2
814 ; X64-SSE-NEXT: movdqa %xmm2, %xmm0
815 ; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
816 ; X64-SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
817 ; X64-SSE-NEXT: movdqa %xmm3, %xmm4
818 ; X64-SSE-NEXT: pmulhuw %xmm1, %xmm4
819 ; X64-SSE-NEXT: pmullw %xmm1, %xmm3
820 ; X64-SSE-NEXT: movdqa %xmm3, %xmm1
821 ; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
822 ; X64-SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
823 ; X64-SSE-NEXT: movdqu %xmm3, 48(%rax,%rdx,4)
824 ; X64-SSE-NEXT: movdqu %xmm1, 32(%rax,%rdx,4)
825 ; X64-SSE-NEXT: movdqu %xmm2, 16(%rax,%rdx,4)
826 ; X64-SSE-NEXT: movdqu %xmm0, (%rax,%rdx,4)
829 ; X64-AVX1-LABEL: mul_16xi16:
830 ; X64-AVX1: # %bb.0: # %entry
831 ; X64-AVX1-NEXT: movq {{.*}}(%rip), %rax
832 ; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
833 ; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
834 ; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
835 ; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
836 ; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
837 ; X64-AVX1-NEXT: vpmulld %xmm0, %xmm4, %xmm0
838 ; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
839 ; X64-AVX1-NEXT: vpmulld %xmm1, %xmm4, %xmm1
840 ; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
841 ; X64-AVX1-NEXT: vpmulld %xmm2, %xmm4, %xmm2
842 ; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
843 ; X64-AVX1-NEXT: vpmulld %xmm3, %xmm4, %xmm3
844 ; X64-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
845 ; X64-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
846 ; X64-AVX1-NEXT: vmovups %ymm0, 32(%rax,%rdx,4)
847 ; X64-AVX1-NEXT: vmovups %ymm2, (%rax,%rdx,4)
848 ; X64-AVX1-NEXT: vzeroupper
849 ; X64-AVX1-NEXT: retq
851 ; X64-AVX2-LABEL: mul_16xi16:
852 ; X64-AVX2: # %bb.0: # %entry
853 ; X64-AVX2-NEXT: movq {{.*}}(%rip), %rax
854 ; X64-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
855 ; X64-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
856 ; X64-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
857 ; X64-AVX2-NEXT: vpmulld %ymm0, %ymm2, %ymm0
858 ; X64-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
859 ; X64-AVX2-NEXT: vpmulld %ymm1, %ymm2, %ymm1
860 ; X64-AVX2-NEXT: vmovdqu %ymm0, 32(%rax,%rdx,4)
861 ; X64-AVX2-NEXT: vmovdqu %ymm1, (%rax,%rdx,4)
862 ; X64-AVX2-NEXT: vzeroupper
863 ; X64-AVX2-NEXT: retq
entry:
  %pre = load i32*, i32** @c
  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
  %tmp7 = bitcast i8* %tmp6 to <16 x i16>*
  %wide.load = load <16 x i16>, <16 x i16>* %tmp7, align 1
  %tmp8 = zext <16 x i16> %wide.load to <16 x i32>
  %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
  %tmp11 = bitcast i8* %tmp10 to <16 x i16>*
  %wide.load17 = load <16 x i16>, <16 x i16>* %tmp11, align 1
  %tmp12 = zext <16 x i16> %wide.load17 to <16 x i32>
  %tmp13 = mul nuw nsw <16 x i32> %tmp12, %tmp8
  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
  %tmp15 = bitcast i32* %tmp14 to <16 x i32>*
  store <16 x i32> %tmp13, <16 x i32>* %tmp15, align 4
  ret void
}

; %val1 = load <2 x i8>
; %op1 = sext<2 x i32> %val1
; %val2 = load <2 x i8>
; %op2 = sext<2 x i32> %val2
; %rst = mul <2 x i32> %op1, %op2
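;
; Codegen summary: SSE2 sign-extends the bytes with punpcklbw + psraw $8,
; multiplies with pmullw and sign-extends the products with punpcklwd +
; psrad $16; the AVX paths use vpmovsxbq loads and vpmuludq.
;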
define void @mul_2xi8_sext(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) nounwind {
888 ; X86-SSE-LABEL: mul_2xi8_sext:
889 ; X86-SSE: # %bb.0: # %entry
890 ; X86-SSE-NEXT: pushl %esi
891 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
892 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
893 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx
894 ; X86-SSE-NEXT: movl c, %esi
895 ; X86-SSE-NEXT: movzwl (%edx,%ecx), %edx
896 ; X86-SSE-NEXT: movd %edx, %xmm0
897 ; X86-SSE-NEXT: movzwl (%eax,%ecx), %eax
898 ; X86-SSE-NEXT: movd %eax, %xmm1
899 ; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
900 ; X86-SSE-NEXT: psraw $8, %xmm0
901 ; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
902 ; X86-SSE-NEXT: psraw $8, %xmm1
903 ; X86-SSE-NEXT: pmullw %xmm0, %xmm1
904 ; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
905 ; X86-SSE-NEXT: psrad $16, %xmm0
906 ; X86-SSE-NEXT: movq %xmm0, (%esi,%ecx,4)
907 ; X86-SSE-NEXT: popl %esi
910 ; X86-AVX-LABEL: mul_2xi8_sext:
911 ; X86-AVX: # %bb.0: # %entry
912 ; X86-AVX-NEXT: pushl %esi
913 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
914 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
915 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx
916 ; X86-AVX-NEXT: movl c, %esi
917 ; X86-AVX-NEXT: vpmovsxbq (%edx,%ecx), %xmm0
918 ; X86-AVX-NEXT: vpmovsxbq (%eax,%ecx), %xmm1
919 ; X86-AVX-NEXT: vpmuludq %xmm0, %xmm1, %xmm0
920 ; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
921 ; X86-AVX-NEXT: vmovq %xmm0, (%esi,%ecx,4)
922 ; X86-AVX-NEXT: popl %esi
925 ; X64-SSE-LABEL: mul_2xi8_sext:
926 ; X64-SSE: # %bb.0: # %entry
927 ; X64-SSE-NEXT: movq {{.*}}(%rip), %rax
928 ; X64-SSE-NEXT: movzwl (%rdi,%rdx), %ecx
929 ; X64-SSE-NEXT: movd %ecx, %xmm0
930 ; X64-SSE-NEXT: movzwl (%rsi,%rdx), %ecx
931 ; X64-SSE-NEXT: movd %ecx, %xmm1
932 ; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
933 ; X64-SSE-NEXT: psraw $8, %xmm0
934 ; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
935 ; X64-SSE-NEXT: psraw $8, %xmm1
936 ; X64-SSE-NEXT: pmullw %xmm0, %xmm1
937 ; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
938 ; X64-SSE-NEXT: psrad $16, %xmm0
939 ; X64-SSE-NEXT: movq %xmm0, (%rax,%rdx,4)
942 ; X64-AVX-LABEL: mul_2xi8_sext:
943 ; X64-AVX: # %bb.0: # %entry
944 ; X64-AVX-NEXT: movq {{.*}}(%rip), %rax
945 ; X64-AVX-NEXT: vpmovsxbq (%rdi,%rdx), %xmm0
946 ; X64-AVX-NEXT: vpmovsxbq (%rsi,%rdx), %xmm1
947 ; X64-AVX-NEXT: vpmuludq %xmm0, %xmm1, %xmm0
948 ; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
949 ; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rdx,4)
entry:
  %pre = load i32*, i32** @c
  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
  %tmp7 = bitcast i8* %tmp6 to <2 x i8>*
  %wide.load = load <2 x i8>, <2 x i8>* %tmp7, align 1
  %tmp8 = sext <2 x i8> %wide.load to <2 x i32>
  %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
  %tmp11 = bitcast i8* %tmp10 to <2 x i8>*
  %wide.load17 = load <2 x i8>, <2 x i8>* %tmp11, align 1
  %tmp12 = sext <2 x i8> %wide.load17 to <2 x i32>
  %tmp13 = mul nuw nsw <2 x i32> %tmp12, %tmp8
  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
  %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
  store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
  ret void
}

; %val1 = load <2 x i8>
; %op1 = sext<2 x i32> %val1
; %val2 = load <2 x i8>
; %op2 = zext<2 x i32> %val2
; %rst = mul <2 x i32> %op1, %op2
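;
; Codegen summary: SSE2 zero-extends one operand (punpcklbw with zero) and
; sign-extends the other (punpcklbw + psraw $8), then combines pmullw with the
; signed pmulhw via punpcklwd; the AVX paths mix vpmovsxbq and vpmovzxbq loads
; and multiply with vpmuludq.
;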
define void @mul_2xi8_sext_zext(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) nounwind {
975 ; X86-SSE-LABEL: mul_2xi8_sext_zext:
976 ; X86-SSE: # %bb.0: # %entry
977 ; X86-SSE-NEXT: pushl %esi
978 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
979 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
980 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx
981 ; X86-SSE-NEXT: movl c, %esi
982 ; X86-SSE-NEXT: movzwl (%edx,%ecx), %edx
983 ; X86-SSE-NEXT: movd %edx, %xmm0
984 ; X86-SSE-NEXT: movzwl (%eax,%ecx), %eax
985 ; X86-SSE-NEXT: movd %eax, %xmm1
986 ; X86-SSE-NEXT: pxor %xmm2, %xmm2
987 ; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
988 ; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
989 ; X86-SSE-NEXT: psraw $8, %xmm0
990 ; X86-SSE-NEXT: movdqa %xmm1, %xmm2
991 ; X86-SSE-NEXT: pmulhw %xmm0, %xmm2
992 ; X86-SSE-NEXT: pmullw %xmm1, %xmm0
993 ; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
994 ; X86-SSE-NEXT: movq %xmm0, (%esi,%ecx,4)
995 ; X86-SSE-NEXT: popl %esi
998 ; X86-AVX-LABEL: mul_2xi8_sext_zext:
999 ; X86-AVX: # %bb.0: # %entry
1000 ; X86-AVX-NEXT: pushl %esi
1001 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
1002 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
1003 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx
1004 ; X86-AVX-NEXT: movl c, %esi
1005 ; X86-AVX-NEXT: vpmovsxbq (%edx,%ecx), %xmm0
1006 ; X86-AVX-NEXT: vpmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
1007 ; X86-AVX-NEXT: vpmuludq %xmm0, %xmm1, %xmm0
1008 ; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
1009 ; X86-AVX-NEXT: vmovq %xmm0, (%esi,%ecx,4)
1010 ; X86-AVX-NEXT: popl %esi
1011 ; X86-AVX-NEXT: retl
1013 ; X64-SSE-LABEL: mul_2xi8_sext_zext:
1014 ; X64-SSE: # %bb.0: # %entry
1015 ; X64-SSE-NEXT: movq {{.*}}(%rip), %rax
1016 ; X64-SSE-NEXT: movzwl (%rdi,%rdx), %ecx
1017 ; X64-SSE-NEXT: movd %ecx, %xmm0
1018 ; X64-SSE-NEXT: movzwl (%rsi,%rdx), %ecx
1019 ; X64-SSE-NEXT: movd %ecx, %xmm1
1020 ; X64-SSE-NEXT: pxor %xmm2, %xmm2
1021 ; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
1022 ; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1023 ; X64-SSE-NEXT: psraw $8, %xmm0
1024 ; X64-SSE-NEXT: movdqa %xmm1, %xmm2
1025 ; X64-SSE-NEXT: pmulhw %xmm0, %xmm2
1026 ; X64-SSE-NEXT: pmullw %xmm1, %xmm0
1027 ; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
1028 ; X64-SSE-NEXT: movq %xmm0, (%rax,%rdx,4)
1029 ; X64-SSE-NEXT: retq
1031 ; X64-AVX-LABEL: mul_2xi8_sext_zext:
1032 ; X64-AVX: # %bb.0: # %entry
1033 ; X64-AVX-NEXT: movq {{.*}}(%rip), %rax
1034 ; X64-AVX-NEXT: vpmovsxbq (%rdi,%rdx), %xmm0
1035 ; X64-AVX-NEXT: vpmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
1036 ; X64-AVX-NEXT: vpmuludq %xmm0, %xmm1, %xmm0
1037 ; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
1038 ; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rdx,4)
1039 ; X64-AVX-NEXT: retq
entry:
  %pre = load i32*, i32** @c
  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
  %tmp7 = bitcast i8* %tmp6 to <2 x i8>*
  %wide.load = load <2 x i8>, <2 x i8>* %tmp7, align 1
  %tmp8 = sext <2 x i8> %wide.load to <2 x i32>
  %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
  %tmp11 = bitcast i8* %tmp10 to <2 x i8>*
  %wide.load17 = load <2 x i8>, <2 x i8>* %tmp11, align 1
  %tmp12 = zext <2 x i8> %wide.load17 to <2 x i32>
  %tmp13 = mul nuw nsw <2 x i32> %tmp12, %tmp8
  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
  %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
  store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
  ret void
}

; %val1 = load <2 x i16>
; %op1 = sext<2 x i32> %val1
; %val2 = load <2 x i16>
; %op2 = sext<2 x i32> %val2
; %rst = mul <2 x i32> %op1, %op2
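;
; Codegen summary: like the unsigned 2 x i16 case, but SSE2 pairs pmullw with
; the signed pmulhw, and the AVX paths sign-extend with vpmovsxwq before the
; vpmuludq.
;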
define void @mul_2xi16_sext(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) nounwind {
1064 ; X86-SSE-LABEL: mul_2xi16_sext:
1065 ; X86-SSE: # %bb.0: # %entry
1066 ; X86-SSE-NEXT: pushl %esi
1067 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
1068 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
1069 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx
1070 ; X86-SSE-NEXT: movl c, %esi
1071 ; X86-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1072 ; X86-SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
1073 ; X86-SSE-NEXT: movdqa %xmm1, %xmm2
1074 ; X86-SSE-NEXT: pmulhw %xmm0, %xmm2
1075 ; X86-SSE-NEXT: pmullw %xmm0, %xmm1
1076 ; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
1077 ; X86-SSE-NEXT: movq %xmm1, (%esi,%ecx,4)
1078 ; X86-SSE-NEXT: popl %esi
1079 ; X86-SSE-NEXT: retl
1081 ; X86-AVX-LABEL: mul_2xi16_sext:
1082 ; X86-AVX: # %bb.0: # %entry
1083 ; X86-AVX-NEXT: pushl %esi
1084 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
1085 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
1086 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx
1087 ; X86-AVX-NEXT: movl c, %esi
1088 ; X86-AVX-NEXT: vpmovsxwq (%edx,%ecx), %xmm0
1089 ; X86-AVX-NEXT: vpmovsxwq (%eax,%ecx), %xmm1
1090 ; X86-AVX-NEXT: vpmuludq %xmm0, %xmm1, %xmm0
1091 ; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
1092 ; X86-AVX-NEXT: vmovq %xmm0, (%esi,%ecx,4)
1093 ; X86-AVX-NEXT: popl %esi
1094 ; X86-AVX-NEXT: retl
1096 ; X64-SSE-LABEL: mul_2xi16_sext:
1097 ; X64-SSE: # %bb.0: # %entry
1098 ; X64-SSE-NEXT: movq {{.*}}(%rip), %rax
1099 ; X64-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1100 ; X64-SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
1101 ; X64-SSE-NEXT: movdqa %xmm1, %xmm2
1102 ; X64-SSE-NEXT: pmulhw %xmm0, %xmm2
1103 ; X64-SSE-NEXT: pmullw %xmm0, %xmm1
1104 ; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
1105 ; X64-SSE-NEXT: movq %xmm1, (%rax,%rdx,4)
1106 ; X64-SSE-NEXT: retq
1108 ; X64-AVX-LABEL: mul_2xi16_sext:
1109 ; X64-AVX: # %bb.0: # %entry
1110 ; X64-AVX-NEXT: movq {{.*}}(%rip), %rax
1111 ; X64-AVX-NEXT: vpmovsxwq (%rdi,%rdx), %xmm0
1112 ; X64-AVX-NEXT: vpmovsxwq (%rsi,%rdx), %xmm1
1113 ; X64-AVX-NEXT: vpmuludq %xmm0, %xmm1, %xmm0
1114 ; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
1115 ; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rdx,4)
1116 ; X64-AVX-NEXT: retq
entry:
1118 %pre = load i32*, i32** @c
1119 %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
1120 %tmp7 = bitcast i8* %tmp6 to <2 x i16>*
1121 %wide.load = load <2 x i16>, <2 x i16>* %tmp7, align 1
1122 %tmp8 = sext <2 x i16> %wide.load to <2 x i32>
1123 %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
1124 %tmp11 = bitcast i8* %tmp10 to <2 x i16>*
1125 %wide.load17 = load <2 x i16>, <2 x i16>* %tmp11, align 1
1126 %tmp12 = sext <2 x i16> %wide.load17 to <2 x i32>
1127 %tmp13 = mul nuw nsw <2 x i32> %tmp12, %tmp8
1128 %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
1129 %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
1130 store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
  ret void
}
1134 ; %val1 = load <2 x i16>
1135 ; %op1 = sext<2 x i32> %val1
1136 ; %val2 = load <2 x i16>
1137 ; %op2 = zext<2 x i32> %val2
1138 ; %rst = mul <2 x i32> %op1, %op2
1140 define void @mul_2xi16_sext_zext(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) nounwind {
1141 ; X86-SSE-LABEL: mul_2xi16_sext_zext:
1142 ; X86-SSE: # %bb.0: # %entry
1143 ; X86-SSE-NEXT: pushl %esi
1144 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
1145 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
1146 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx
1147 ; X86-SSE-NEXT: movl c, %esi
1148 ; X86-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1149 ; X86-SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7]
1150 ; X86-SSE-NEXT: psrad $16, %xmm0
1151 ; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
1152 ; X86-SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
1153 ; X86-SSE-NEXT: pxor %xmm2, %xmm2
1154 ; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
1155 ; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,1,3]
1156 ; X86-SSE-NEXT: pmuludq %xmm0, %xmm1
1157 ; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
1158 ; X86-SSE-NEXT: movq %xmm0, (%esi,%ecx,4)
1159 ; X86-SSE-NEXT: popl %esi
1160 ; X86-SSE-NEXT: retl
1162 ; X86-AVX-LABEL: mul_2xi16_sext_zext:
1163 ; X86-AVX: # %bb.0: # %entry
1164 ; X86-AVX-NEXT: pushl %esi
1165 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
1166 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
1167 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx
1168 ; X86-AVX-NEXT: movl c, %esi
1169 ; X86-AVX-NEXT: vpmovsxwq (%edx,%ecx), %xmm0
1170 ; X86-AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
1171 ; X86-AVX-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
1172 ; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
1173 ; X86-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0
1174 ; X86-AVX-NEXT: vmovq %xmm0, (%esi,%ecx,4)
1175 ; X86-AVX-NEXT: popl %esi
1176 ; X86-AVX-NEXT: retl
1178 ; X64-SSE-LABEL: mul_2xi16_sext_zext:
1179 ; X64-SSE: # %bb.0: # %entry
1180 ; X64-SSE-NEXT: movq {{.*}}(%rip), %rax
1181 ; X64-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1182 ; X64-SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7]
1183 ; X64-SSE-NEXT: psrad $16, %xmm0
1184 ; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
1185 ; X64-SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
1186 ; X64-SSE-NEXT: pxor %xmm2, %xmm2
1187 ; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
1188 ; X64-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,1,3]
1189 ; X64-SSE-NEXT: pmuludq %xmm0, %xmm1
1190 ; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
1191 ; X64-SSE-NEXT: movq %xmm0, (%rax,%rdx,4)
1192 ; X64-SSE-NEXT: retq
1194 ; X64-AVX-LABEL: mul_2xi16_sext_zext:
1195 ; X64-AVX: # %bb.0: # %entry
1196 ; X64-AVX-NEXT: movq {{.*}}(%rip), %rax
1197 ; X64-AVX-NEXT: vpmovsxwq (%rdi,%rdx), %xmm0
1198 ; X64-AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
1199 ; X64-AVX-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
1200 ; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
1201 ; X64-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0
1202 ; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rdx,4)
1203 ; X64-AVX-NEXT: retq
entry:
1205 %pre = load i32*, i32** @c
1206 %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
1207 %tmp7 = bitcast i8* %tmp6 to <2 x i16>*
1208 %wide.load = load <2 x i16>, <2 x i16>* %tmp7, align 1
1209 %tmp8 = sext <2 x i16> %wide.load to <2 x i32>
1210 %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
1211 %tmp11 = bitcast i8* %tmp10 to <2 x i16>*
1212 %wide.load17 = load <2 x i16>, <2 x i16>* %tmp11, align 1
1213 %tmp12 = zext <2 x i16> %wide.load17 to <2 x i32>
1214 %tmp13 = mul nuw nsw <2 x i32> %tmp12, %tmp8
1215 %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
1216 %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
1217 store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
  ret void
}
1221 ; %val1 = load <16 x i16>
1222 ; %op1 = sext<16 x i32> %val1
1223 ; %val2 = load <16 x i16>
1224 ; %op2 = sext<16 x i32> %val2
1225 ; %rst = mul <16 x i32> %op1, %op2
1227 define void @mul_16xi16_sext(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) nounwind {
1228 ; X86-SSE-LABEL: mul_16xi16_sext:
1229 ; X86-SSE: # %bb.0: # %entry
1230 ; X86-SSE-NEXT: pushl %esi
1231 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
1232 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
1233 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx
1234 ; X86-SSE-NEXT: movl c, %esi
1235 ; X86-SSE-NEXT: movdqu (%edx,%ecx), %xmm0
1236 ; X86-SSE-NEXT: movdqu 16(%edx,%ecx), %xmm1
1237 ; X86-SSE-NEXT: movdqu (%eax,%ecx), %xmm2
1238 ; X86-SSE-NEXT: movdqu 16(%eax,%ecx), %xmm3
1239 ; X86-SSE-NEXT: movdqa %xmm2, %xmm4
1240 ; X86-SSE-NEXT: pmulhw %xmm0, %xmm4
1241 ; X86-SSE-NEXT: pmullw %xmm0, %xmm2
1242 ; X86-SSE-NEXT: movdqa %xmm2, %xmm0
1243 ; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
1244 ; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
1245 ; X86-SSE-NEXT: movdqa %xmm3, %xmm4
1246 ; X86-SSE-NEXT: pmulhw %xmm1, %xmm4
1247 ; X86-SSE-NEXT: pmullw %xmm1, %xmm3
1248 ; X86-SSE-NEXT: movdqa %xmm3, %xmm1
1249 ; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
1250 ; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
1251 ; X86-SSE-NEXT: movdqu %xmm3, 48(%esi,%ecx,4)
1252 ; X86-SSE-NEXT: movdqu %xmm1, 32(%esi,%ecx,4)
1253 ; X86-SSE-NEXT: movdqu %xmm2, 16(%esi,%ecx,4)
1254 ; X86-SSE-NEXT: movdqu %xmm0, (%esi,%ecx,4)
1255 ; X86-SSE-NEXT: popl %esi
1256 ; X86-SSE-NEXT: retl
1258 ; X86-AVX1-LABEL: mul_16xi16_sext:
1259 ; X86-AVX1: # %bb.0: # %entry
1260 ; X86-AVX1-NEXT: pushl %esi
1261 ; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax
1262 ; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %ecx
1263 ; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %edx
1264 ; X86-AVX1-NEXT: movl c, %esi
1265 ; X86-AVX1-NEXT: vpmovsxwd 16(%edx,%ecx), %xmm0
1266 ; X86-AVX1-NEXT: vpmovsxwd 24(%edx,%ecx), %xmm1
1267 ; X86-AVX1-NEXT: vpmovsxwd (%edx,%ecx), %xmm2
1268 ; X86-AVX1-NEXT: vpmovsxwd 8(%edx,%ecx), %xmm3
1269 ; X86-AVX1-NEXT: vpmovsxwd 16(%eax,%ecx), %xmm4
1270 ; X86-AVX1-NEXT: vpmulld %xmm0, %xmm4, %xmm0
1271 ; X86-AVX1-NEXT: vpmovsxwd 24(%eax,%ecx), %xmm4
1272 ; X86-AVX1-NEXT: vpmulld %xmm1, %xmm4, %xmm1
1273 ; X86-AVX1-NEXT: vpmovsxwd (%eax,%ecx), %xmm4
1274 ; X86-AVX1-NEXT: vpmulld %xmm2, %xmm4, %xmm2
1275 ; X86-AVX1-NEXT: vpmovsxwd 8(%eax,%ecx), %xmm4
1276 ; X86-AVX1-NEXT: vpmulld %xmm3, %xmm4, %xmm3
1277 ; X86-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
1278 ; X86-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
1279 ; X86-AVX1-NEXT: vmovups %ymm0, 32(%esi,%ecx,4)
1280 ; X86-AVX1-NEXT: vmovups %ymm2, (%esi,%ecx,4)
1281 ; X86-AVX1-NEXT: popl %esi
1282 ; X86-AVX1-NEXT: vzeroupper
1283 ; X86-AVX1-NEXT: retl
1285 ; X86-AVX2-LABEL: mul_16xi16_sext:
1286 ; X86-AVX2: # %bb.0: # %entry
1287 ; X86-AVX2-NEXT: pushl %esi
1288 ; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax
1289 ; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %ecx
1290 ; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %edx
1291 ; X86-AVX2-NEXT: movl c, %esi
1292 ; X86-AVX2-NEXT: vpmovsxwd 16(%edx,%ecx), %ymm0
1293 ; X86-AVX2-NEXT: vpmovsxwd (%edx,%ecx), %ymm1
1294 ; X86-AVX2-NEXT: vpmovsxwd 16(%eax,%ecx), %ymm2
1295 ; X86-AVX2-NEXT: vpmulld %ymm0, %ymm2, %ymm0
1296 ; X86-AVX2-NEXT: vpmovsxwd (%eax,%ecx), %ymm2
1297 ; X86-AVX2-NEXT: vpmulld %ymm1, %ymm2, %ymm1
1298 ; X86-AVX2-NEXT: vmovdqu %ymm0, 32(%esi,%ecx,4)
1299 ; X86-AVX2-NEXT: vmovdqu %ymm1, (%esi,%ecx,4)
1300 ; X86-AVX2-NEXT: popl %esi
1301 ; X86-AVX2-NEXT: vzeroupper
1302 ; X86-AVX2-NEXT: retl
1304 ; X64-SSE-LABEL: mul_16xi16_sext:
1305 ; X64-SSE: # %bb.0: # %entry
1306 ; X64-SSE-NEXT: movq {{.*}}(%rip), %rax
1307 ; X64-SSE-NEXT: movdqu (%rdi,%rdx), %xmm0
1308 ; X64-SSE-NEXT: movdqu 16(%rdi,%rdx), %xmm1
1309 ; X64-SSE-NEXT: movdqu (%rsi,%rdx), %xmm2
1310 ; X64-SSE-NEXT: movdqu 16(%rsi,%rdx), %xmm3
1311 ; X64-SSE-NEXT: movdqa %xmm2, %xmm4
1312 ; X64-SSE-NEXT: pmulhw %xmm0, %xmm4
1313 ; X64-SSE-NEXT: pmullw %xmm0, %xmm2
1314 ; X64-SSE-NEXT: movdqa %xmm2, %xmm0
1315 ; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
1316 ; X64-SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
1317 ; X64-SSE-NEXT: movdqa %xmm3, %xmm4
1318 ; X64-SSE-NEXT: pmulhw %xmm1, %xmm4
1319 ; X64-SSE-NEXT: pmullw %xmm1, %xmm3
1320 ; X64-SSE-NEXT: movdqa %xmm3, %xmm1
1321 ; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
1322 ; X64-SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
1323 ; X64-SSE-NEXT: movdqu %xmm3, 48(%rax,%rdx,4)
1324 ; X64-SSE-NEXT: movdqu %xmm1, 32(%rax,%rdx,4)
1325 ; X64-SSE-NEXT: movdqu %xmm2, 16(%rax,%rdx,4)
1326 ; X64-SSE-NEXT: movdqu %xmm0, (%rax,%rdx,4)
1327 ; X64-SSE-NEXT: retq
1329 ; X64-AVX1-LABEL: mul_16xi16_sext:
1330 ; X64-AVX1: # %bb.0: # %entry
1331 ; X64-AVX1-NEXT: movq {{.*}}(%rip), %rax
1332 ; X64-AVX1-NEXT: vpmovsxwd 16(%rdi,%rdx), %xmm0
1333 ; X64-AVX1-NEXT: vpmovsxwd 24(%rdi,%rdx), %xmm1
1334 ; X64-AVX1-NEXT: vpmovsxwd (%rdi,%rdx), %xmm2
1335 ; X64-AVX1-NEXT: vpmovsxwd 8(%rdi,%rdx), %xmm3
1336 ; X64-AVX1-NEXT: vpmovsxwd 16(%rsi,%rdx), %xmm4
1337 ; X64-AVX1-NEXT: vpmulld %xmm0, %xmm4, %xmm0
1338 ; X64-AVX1-NEXT: vpmovsxwd 24(%rsi,%rdx), %xmm4
1339 ; X64-AVX1-NEXT: vpmulld %xmm1, %xmm4, %xmm1
1340 ; X64-AVX1-NEXT: vpmovsxwd (%rsi,%rdx), %xmm4
1341 ; X64-AVX1-NEXT: vpmulld %xmm2, %xmm4, %xmm2
1342 ; X64-AVX1-NEXT: vpmovsxwd 8(%rsi,%rdx), %xmm4
1343 ; X64-AVX1-NEXT: vpmulld %xmm3, %xmm4, %xmm3
1344 ; X64-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
1345 ; X64-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
1346 ; X64-AVX1-NEXT: vmovups %ymm0, 32(%rax,%rdx,4)
1347 ; X64-AVX1-NEXT: vmovups %ymm2, (%rax,%rdx,4)
1348 ; X64-AVX1-NEXT: vzeroupper
1349 ; X64-AVX1-NEXT: retq
1351 ; X64-AVX2-LABEL: mul_16xi16_sext:
1352 ; X64-AVX2: # %bb.0: # %entry
1353 ; X64-AVX2-NEXT: movq {{.*}}(%rip), %rax
1354 ; X64-AVX2-NEXT: vpmovsxwd 16(%rdi,%rdx), %ymm0
1355 ; X64-AVX2-NEXT: vpmovsxwd (%rdi,%rdx), %ymm1
1356 ; X64-AVX2-NEXT: vpmovsxwd 16(%rsi,%rdx), %ymm2
1357 ; X64-AVX2-NEXT: vpmulld %ymm0, %ymm2, %ymm0
1358 ; X64-AVX2-NEXT: vpmovsxwd (%rsi,%rdx), %ymm2
1359 ; X64-AVX2-NEXT: vpmulld %ymm1, %ymm2, %ymm1
1360 ; X64-AVX2-NEXT: vmovdqu %ymm0, 32(%rax,%rdx,4)
1361 ; X64-AVX2-NEXT: vmovdqu %ymm1, (%rax,%rdx,4)
1362 ; X64-AVX2-NEXT: vzeroupper
1363 ; X64-AVX2-NEXT: retq
entry:
1365 %pre = load i32*, i32** @c
1366 %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
1367 %tmp7 = bitcast i8* %tmp6 to <16 x i16>*
1368 %wide.load = load <16 x i16>, <16 x i16>* %tmp7, align 1
1369 %tmp8 = sext <16 x i16> %wide.load to <16 x i32>
1370 %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
1371 %tmp11 = bitcast i8* %tmp10 to <16 x i16>*
1372 %wide.load17 = load <16 x i16>, <16 x i16>* %tmp11, align 1
1373 %tmp12 = sext <16 x i16> %wide.load17 to <16 x i32>
1374 %tmp13 = mul nuw nsw <16 x i32> %tmp12, %tmp8
1375 %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
1376 %tmp15 = bitcast i32* %tmp14 to <16 x i32>*
1377 store <16 x i32> %tmp13, <16 x i32>* %tmp15, align 4
  ret void
}
1381 ; %val = load <2 x i8>
1382 ; %op1 = zext<2 x i32> %val
1383 ; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (0 ~ 255)
1384 ; %rst = mul <2 x i32> %op1, %op2
1386 define void @mul_2xi8_varconst1(i8* nocapture readonly %a, i64 %index) {
1387 ; X86-SSE-LABEL: mul_2xi8_varconst1:
1388 ; X86-SSE: # %bb.0: # %entry
1389 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
1390 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
1391 ; X86-SSE-NEXT: movl c, %edx
1392 ; X86-SSE-NEXT: movzwl (%ecx,%eax), %ecx
1393 ; X86-SSE-NEXT: movd %ecx, %xmm0
1394 ; X86-SSE-NEXT: pxor %xmm1, %xmm1
1395 ; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
1396 ; X86-SSE-NEXT: pmullw {{\.LCPI.*}}, %xmm0
1397 ; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
1398 ; X86-SSE-NEXT: movq %xmm0, (%edx,%eax,4)
1399 ; X86-SSE-NEXT: retl
1401 ; X86-AVX-LABEL: mul_2xi8_varconst1:
1402 ; X86-AVX: # %bb.0: # %entry
1403 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
1404 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
1405 ; X86-AVX-NEXT: movl c, %edx
1406 ; X86-AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
1407 ; X86-AVX-NEXT: vpmuludq {{\.LCPI.*}}, %xmm0, %xmm0
1408 ; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
1409 ; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4)
1410 ; X86-AVX-NEXT: retl
1412 ; X64-SSE-LABEL: mul_2xi8_varconst1:
1413 ; X64-SSE: # %bb.0: # %entry
1414 ; X64-SSE-NEXT: movq {{.*}}(%rip), %rax
1415 ; X64-SSE-NEXT: movzwl (%rdi,%rsi), %ecx
1416 ; X64-SSE-NEXT: movd %ecx, %xmm0
1417 ; X64-SSE-NEXT: pxor %xmm1, %xmm1
1418 ; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
1419 ; X64-SSE-NEXT: pmullw {{.*}}(%rip), %xmm0
1420 ; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
1421 ; X64-SSE-NEXT: movq %xmm0, (%rax,%rsi,4)
1422 ; X64-SSE-NEXT: retq
1424 ; X64-AVX-LABEL: mul_2xi8_varconst1:
1425 ; X64-AVX: # %bb.0: # %entry
1426 ; X64-AVX-NEXT: movq {{.*}}(%rip), %rax
1427 ; X64-AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
1428 ; X64-AVX-NEXT: movl $255, %ecx
1429 ; X64-AVX-NEXT: vmovq %rcx, %xmm1
1430 ; X64-AVX-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7]
1431 ; X64-AVX-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
1432 ; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
1433 ; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4)
1434 ; X64-AVX-NEXT: retq
entry:
1436 %pre = load i32*, i32** @c
1437 %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
1438 %tmp7 = bitcast i8* %tmp6 to <2 x i8>*
1439 %wide.load = load <2 x i8>, <2 x i8>* %tmp7, align 1
1440 %tmp8 = zext <2 x i8> %wide.load to <2 x i32>
1441 %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 0, i32 255>
1442 %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
1443 %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
1444 store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
  ret void
}
1448 ; %val = load <2 x i8>
1449 ; %op1 = sext<2 x i32> %val
1450 ; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (-128 ~ 127)
1451 ; %rst = mul <2 x i32> %op1, %op2
1453 define void @mul_2xi8_varconst2(i8* nocapture readonly %a, i64 %index) {
1454 ; X86-SSE-LABEL: mul_2xi8_varconst2:
1455 ; X86-SSE: # %bb.0: # %entry
1456 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
1457 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
1458 ; X86-SSE-NEXT: movl c, %edx
1459 ; X86-SSE-NEXT: movzwl (%ecx,%eax), %ecx
1460 ; X86-SSE-NEXT: movd %ecx, %xmm0
1461 ; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1462 ; X86-SSE-NEXT: psraw $8, %xmm0
1463 ; X86-SSE-NEXT: pmullw {{\.LCPI.*}}, %xmm0
1464 ; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
1465 ; X86-SSE-NEXT: psrad $16, %xmm0
1466 ; X86-SSE-NEXT: movq %xmm0, (%edx,%eax,4)
1467 ; X86-SSE-NEXT: retl
1469 ; X86-AVX-LABEL: mul_2xi8_varconst2:
1470 ; X86-AVX: # %bb.0: # %entry
1471 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
1472 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
1473 ; X86-AVX-NEXT: movl c, %edx
1474 ; X86-AVX-NEXT: vpmovsxbq (%ecx,%eax), %xmm0
1475 ; X86-AVX-NEXT: vpmuludq {{\.LCPI.*}}, %xmm0, %xmm0
1476 ; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
1477 ; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4)
1478 ; X86-AVX-NEXT: retl
1480 ; X64-SSE-LABEL: mul_2xi8_varconst2:
1481 ; X64-SSE: # %bb.0: # %entry
1482 ; X64-SSE-NEXT: movq {{.*}}(%rip), %rax
1483 ; X64-SSE-NEXT: movzwl (%rdi,%rsi), %ecx
1484 ; X64-SSE-NEXT: movd %ecx, %xmm0
1485 ; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1486 ; X64-SSE-NEXT: psraw $8, %xmm0
1487 ; X64-SSE-NEXT: pmullw {{.*}}(%rip), %xmm0
1488 ; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
1489 ; X64-SSE-NEXT: psrad $16, %xmm0
1490 ; X64-SSE-NEXT: movq %xmm0, (%rax,%rsi,4)
1491 ; X64-SSE-NEXT: retq
1493 ; X64-AVX-LABEL: mul_2xi8_varconst2:
1494 ; X64-AVX: # %bb.0: # %entry
1495 ; X64-AVX-NEXT: movq {{.*}}(%rip), %rax
1496 ; X64-AVX-NEXT: vpmovsxbq (%rdi,%rsi), %xmm0
1497 ; X64-AVX-NEXT: vpmuludq {{.*}}(%rip), %xmm0, %xmm0
1498 ; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
1499 ; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4)
1500 ; X64-AVX-NEXT: retq
entry:
1502 %pre = load i32*, i32** @c
1503 %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
1504 %tmp7 = bitcast i8* %tmp6 to <2 x i8>*
1505 %wide.load = load <2 x i8>, <2 x i8>* %tmp7, align 1
1506 %tmp8 = sext <2 x i8> %wide.load to <2 x i32>
1507 %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 -128, i32 127>
1508 %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
1509 %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
1510 store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
  ret void
}
1514 ; %val = load <2 x i8>
1515 ; %op1 = zext<2 x i32> %val
1516 ; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (0 ~ 256)
1517 ; %rst = mul <2 x i32> %op1, %op2
1519 define void @mul_2xi8_varconst3(i8* nocapture readonly %a, i64 %index) {
1520 ; X86-SSE-LABEL: mul_2xi8_varconst3:
1521 ; X86-SSE: # %bb.0: # %entry
1522 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
1523 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
1524 ; X86-SSE-NEXT: movl c, %edx
1525 ; X86-SSE-NEXT: movzwl (%ecx,%eax), %ecx
1526 ; X86-SSE-NEXT: movd %ecx, %xmm0
1527 ; X86-SSE-NEXT: pxor %xmm1, %xmm1
1528 ; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
1529 ; X86-SSE-NEXT: movdqa {{.*#+}} xmm1 = <0,256,u,u,u,u,u,u>
1530 ; X86-SSE-NEXT: movdqa %xmm0, %xmm2
1531 ; X86-SSE-NEXT: pmulhw %xmm1, %xmm2
1532 ; X86-SSE-NEXT: pmullw %xmm1, %xmm0
1533 ; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
1534 ; X86-SSE-NEXT: movq %xmm0, (%edx,%eax,4)
1535 ; X86-SSE-NEXT: retl
1537 ; X86-AVX-LABEL: mul_2xi8_varconst3:
1538 ; X86-AVX: # %bb.0: # %entry
1539 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
1540 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
1541 ; X86-AVX-NEXT: movl c, %edx
1542 ; X86-AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
1543 ; X86-AVX-NEXT: vpmuludq {{\.LCPI.*}}, %xmm0, %xmm0
1544 ; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
1545 ; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4)
1546 ; X86-AVX-NEXT: retl
1548 ; X64-SSE-LABEL: mul_2xi8_varconst3:
1549 ; X64-SSE: # %bb.0: # %entry
1550 ; X64-SSE-NEXT: movq {{.*}}(%rip), %rax
1551 ; X64-SSE-NEXT: movzwl (%rdi,%rsi), %ecx
1552 ; X64-SSE-NEXT: movd %ecx, %xmm0
1553 ; X64-SSE-NEXT: pxor %xmm1, %xmm1
1554 ; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
1555 ; X64-SSE-NEXT: movdqa {{.*#+}} xmm1 = <0,256,u,u,u,u,u,u>
1556 ; X64-SSE-NEXT: movdqa %xmm0, %xmm2
1557 ; X64-SSE-NEXT: pmulhw %xmm1, %xmm2
1558 ; X64-SSE-NEXT: pmullw %xmm1, %xmm0
1559 ; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
1560 ; X64-SSE-NEXT: movq %xmm0, (%rax,%rsi,4)
1561 ; X64-SSE-NEXT: retq
1563 ; X64-AVX-LABEL: mul_2xi8_varconst3:
1564 ; X64-AVX: # %bb.0: # %entry
1565 ; X64-AVX-NEXT: movq {{.*}}(%rip), %rax
1566 ; X64-AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
1567 ; X64-AVX-NEXT: movl $256, %ecx # imm = 0x100
1568 ; X64-AVX-NEXT: vmovq %rcx, %xmm1
1569 ; X64-AVX-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7]
1570 ; X64-AVX-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
1571 ; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
1572 ; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4)
1573 ; X64-AVX-NEXT: retq
entry:
1575 %pre = load i32*, i32** @c
1576 %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
1577 %tmp7 = bitcast i8* %tmp6 to <2 x i8>*
1578 %wide.load = load <2 x i8>, <2 x i8>* %tmp7, align 1
1579 %tmp8 = zext <2 x i8> %wide.load to <2 x i32>
1580 %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 0, i32 256>
1581 %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
1582 %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
1583 store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
  ret void
}
1587 ; %val = load <2 x i8>
1588 ; %op1 = zext<2 x i32> %val
1589 ; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (-1 ~ 255)
1590 ; %rst = mul <2 x i32> %op1, %op2
1592 define void @mul_2xi8_varconst4(i8* nocapture readonly %a, i64 %index) {
1593 ; X86-SSE-LABEL: mul_2xi8_varconst4:
1594 ; X86-SSE: # %bb.0: # %entry
1595 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
1596 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
1597 ; X86-SSE-NEXT: movl c, %edx
1598 ; X86-SSE-NEXT: movzwl (%ecx,%eax), %ecx
1599 ; X86-SSE-NEXT: movd %ecx, %xmm0
1600 ; X86-SSE-NEXT: pxor %xmm1, %xmm1
1601 ; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
1602 ; X86-SSE-NEXT: movdqa {{.*#+}} xmm1 = <65535,255,u,u,u,u,u,u>
1603 ; X86-SSE-NEXT: movdqa %xmm0, %xmm2
1604 ; X86-SSE-NEXT: pmulhw %xmm1, %xmm2
1605 ; X86-SSE-NEXT: pmullw %xmm1, %xmm0
1606 ; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
1607 ; X86-SSE-NEXT: movq %xmm0, (%edx,%eax,4)
1608 ; X86-SSE-NEXT: retl
1610 ; X86-AVX-LABEL: mul_2xi8_varconst4:
1611 ; X86-AVX: # %bb.0: # %entry
1612 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
1613 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
1614 ; X86-AVX-NEXT: movl c, %edx
1615 ; X86-AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
1616 ; X86-AVX-NEXT: vpmuludq {{\.LCPI.*}}, %xmm0, %xmm0
1617 ; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
1618 ; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4)
1619 ; X86-AVX-NEXT: retl
1621 ; X64-SSE-LABEL: mul_2xi8_varconst4:
1622 ; X64-SSE: # %bb.0: # %entry
1623 ; X64-SSE-NEXT: movq {{.*}}(%rip), %rax
1624 ; X64-SSE-NEXT: movzwl (%rdi,%rsi), %ecx
1625 ; X64-SSE-NEXT: movd %ecx, %xmm0
1626 ; X64-SSE-NEXT: pxor %xmm1, %xmm1
1627 ; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
1628 ; X64-SSE-NEXT: movdqa {{.*#+}} xmm1 = <65535,255,u,u,u,u,u,u>
1629 ; X64-SSE-NEXT: movdqa %xmm0, %xmm2
1630 ; X64-SSE-NEXT: pmulhw %xmm1, %xmm2
1631 ; X64-SSE-NEXT: pmullw %xmm1, %xmm0
1632 ; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
1633 ; X64-SSE-NEXT: movq %xmm0, (%rax,%rsi,4)
1634 ; X64-SSE-NEXT: retq
1636 ; X64-AVX-LABEL: mul_2xi8_varconst4:
1637 ; X64-AVX: # %bb.0: # %entry
1638 ; X64-AVX-NEXT: movq {{.*}}(%rip), %rax
1639 ; X64-AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
1640 ; X64-AVX-NEXT: vpmuludq {{.*}}(%rip), %xmm0, %xmm0
1641 ; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
1642 ; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4)
1643 ; X64-AVX-NEXT: retq
entry:
1645 %pre = load i32*, i32** @c
1646 %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
1647 %tmp7 = bitcast i8* %tmp6 to <2 x i8>*
1648 %wide.load = load <2 x i8>, <2 x i8>* %tmp7, align 1
1649 %tmp8 = zext <2 x i8> %wide.load to <2 x i32>
1650 %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 -1, i32 255>
1651 %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
1652 %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
1653 store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
  ret void
}
1657 ; %val = load <2 x i8>
1658 ; %op1 = sext<2 x i32> %val
1659 ; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (-129 ~ 127)
1660 ; %rst = mul <2 x i32> %op1, %op2
1662 define void @mul_2xi8_varconst5(i8* nocapture readonly %a, i64 %index) {
1663 ; X86-SSE-LABEL: mul_2xi8_varconst5:
1664 ; X86-SSE: # %bb.0: # %entry
1665 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
1666 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
1667 ; X86-SSE-NEXT: movl c, %edx
1668 ; X86-SSE-NEXT: movzwl (%ecx,%eax), %ecx
1669 ; X86-SSE-NEXT: movd %ecx, %xmm0
1670 ; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1671 ; X86-SSE-NEXT: psraw $8, %xmm0
1672 ; X86-SSE-NEXT: movdqa {{.*#+}} xmm1 = <65407,127,u,u,u,u,u,u>
1673 ; X86-SSE-NEXT: movdqa %xmm0, %xmm2
1674 ; X86-SSE-NEXT: pmulhw %xmm1, %xmm2
1675 ; X86-SSE-NEXT: pmullw %xmm1, %xmm0
1676 ; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
1677 ; X86-SSE-NEXT: movq %xmm0, (%edx,%eax,4)
1678 ; X86-SSE-NEXT: retl
1680 ; X86-AVX-LABEL: mul_2xi8_varconst5:
1681 ; X86-AVX: # %bb.0: # %entry
1682 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
1683 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
1684 ; X86-AVX-NEXT: movl c, %edx
1685 ; X86-AVX-NEXT: vpmovsxbq (%ecx,%eax), %xmm0
1686 ; X86-AVX-NEXT: vpmuludq {{\.LCPI.*}}, %xmm0, %xmm0
1687 ; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
1688 ; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4)
1689 ; X86-AVX-NEXT: retl
1691 ; X64-SSE-LABEL: mul_2xi8_varconst5:
1692 ; X64-SSE: # %bb.0: # %entry
1693 ; X64-SSE-NEXT: movq {{.*}}(%rip), %rax
1694 ; X64-SSE-NEXT: movzwl (%rdi,%rsi), %ecx
1695 ; X64-SSE-NEXT: movd %ecx, %xmm0
1696 ; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1697 ; X64-SSE-NEXT: psraw $8, %xmm0
1698 ; X64-SSE-NEXT: movdqa {{.*#+}} xmm1 = <65407,127,u,u,u,u,u,u>
1699 ; X64-SSE-NEXT: movdqa %xmm0, %xmm2
1700 ; X64-SSE-NEXT: pmulhw %xmm1, %xmm2
1701 ; X64-SSE-NEXT: pmullw %xmm1, %xmm0
1702 ; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
1703 ; X64-SSE-NEXT: movq %xmm0, (%rax,%rsi,4)
1704 ; X64-SSE-NEXT: retq
1706 ; X64-AVX-LABEL: mul_2xi8_varconst5:
1707 ; X64-AVX: # %bb.0: # %entry
1708 ; X64-AVX-NEXT: movq {{.*}}(%rip), %rax
1709 ; X64-AVX-NEXT: vpmovsxbq (%rdi,%rsi), %xmm0
1710 ; X64-AVX-NEXT: vpmuludq {{.*}}(%rip), %xmm0, %xmm0
1711 ; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
1712 ; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4)
1713 ; X64-AVX-NEXT: retq
entry:
1715 %pre = load i32*, i32** @c
1716 %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
1717 %tmp7 = bitcast i8* %tmp6 to <2 x i8>*
1718 %wide.load = load <2 x i8>, <2 x i8>* %tmp7, align 1
1719 %tmp8 = sext <2 x i8> %wide.load to <2 x i32>
1720 %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 -129, i32 127>
1721 %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
1722 %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
1723 store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
  ret void
}
1727 ; %val = load <2 x i8>
1728 ; %op1 = sext<2 x i32> %val
1729 ; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (-128 ~ 128)
1730 ; %rst = mul <2 x i32> %op1, %op2
1732 define void @mul_2xi8_varconst6(i8* nocapture readonly %a, i64 %index) {
1733 ; X86-SSE-LABEL: mul_2xi8_varconst6:
1734 ; X86-SSE: # %bb.0: # %entry
1735 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
1736 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
1737 ; X86-SSE-NEXT: movl c, %edx
1738 ; X86-SSE-NEXT: movzwl (%ecx,%eax), %ecx
1739 ; X86-SSE-NEXT: movd %ecx, %xmm0
1740 ; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1741 ; X86-SSE-NEXT: psraw $8, %xmm0
1742 ; X86-SSE-NEXT: movdqa {{.*#+}} xmm1 = <65408,128,u,u,u,u,u,u>
1743 ; X86-SSE-NEXT: movdqa %xmm0, %xmm2
1744 ; X86-SSE-NEXT: pmulhw %xmm1, %xmm2
1745 ; X86-SSE-NEXT: pmullw %xmm1, %xmm0
1746 ; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
1747 ; X86-SSE-NEXT: movq %xmm0, (%edx,%eax,4)
1748 ; X86-SSE-NEXT: retl
1750 ; X86-AVX-LABEL: mul_2xi8_varconst6:
1751 ; X86-AVX: # %bb.0: # %entry
1752 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
1753 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
1754 ; X86-AVX-NEXT: movl c, %edx
1755 ; X86-AVX-NEXT: vpmovsxbq (%ecx,%eax), %xmm0
1756 ; X86-AVX-NEXT: vpmuludq {{\.LCPI.*}}, %xmm0, %xmm0
1757 ; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
1758 ; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4)
1759 ; X86-AVX-NEXT: retl
1761 ; X64-SSE-LABEL: mul_2xi8_varconst6:
1762 ; X64-SSE: # %bb.0: # %entry
1763 ; X64-SSE-NEXT: movq {{.*}}(%rip), %rax
1764 ; X64-SSE-NEXT: movzwl (%rdi,%rsi), %ecx
1765 ; X64-SSE-NEXT: movd %ecx, %xmm0
1766 ; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1767 ; X64-SSE-NEXT: psraw $8, %xmm0
1768 ; X64-SSE-NEXT: movdqa {{.*#+}} xmm1 = <65408,128,u,u,u,u,u,u>
1769 ; X64-SSE-NEXT: movdqa %xmm0, %xmm2
1770 ; X64-SSE-NEXT: pmulhw %xmm1, %xmm2
1771 ; X64-SSE-NEXT: pmullw %xmm1, %xmm0
1772 ; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
1773 ; X64-SSE-NEXT: movq %xmm0, (%rax,%rsi,4)
1774 ; X64-SSE-NEXT: retq
1776 ; X64-AVX-LABEL: mul_2xi8_varconst6:
1777 ; X64-AVX: # %bb.0: # %entry
1778 ; X64-AVX-NEXT: movq {{.*}}(%rip), %rax
1779 ; X64-AVX-NEXT: vpmovsxbq (%rdi,%rsi), %xmm0
1780 ; X64-AVX-NEXT: vpmuludq {{.*}}(%rip), %xmm0, %xmm0
1781 ; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
1782 ; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4)
1783 ; X64-AVX-NEXT: retq
entry:
1785 %pre = load i32*, i32** @c
1786 %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
1787 %tmp7 = bitcast i8* %tmp6 to <2 x i8>*
1788 %wide.load = load <2 x i8>, <2 x i8>* %tmp7, align 1
1789 %tmp8 = sext <2 x i8> %wide.load to <2 x i32>
1790 %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 -128, i32 128>
1791 %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
1792 %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
1793 store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
  ret void
}
1797 ; %val = load <2 x i16>
1798 ; %op1 = zext<2 x i32> %val
1799 ; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (0 ~ 65535)
1800 ; %rst = mul <2 x i32> %op1, %op2
1802 define void @mul_2xi16_varconst1(i8* nocapture readonly %a, i64 %index) {
1803 ; X86-SSE-LABEL: mul_2xi16_varconst1:
1804 ; X86-SSE: # %bb.0: # %entry
1805 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
1806 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
1807 ; X86-SSE-NEXT: movl c, %edx
1808 ; X86-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1809 ; X86-SSE-NEXT: movdqa {{.*#+}} xmm1 = <0,65535,u,u,u,u,u,u>
1810 ; X86-SSE-NEXT: movdqa %xmm0, %xmm2
1811 ; X86-SSE-NEXT: pmulhuw %xmm1, %xmm2
1812 ; X86-SSE-NEXT: pmullw %xmm1, %xmm0
1813 ; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
1814 ; X86-SSE-NEXT: movq %xmm0, (%edx,%eax,4)
1815 ; X86-SSE-NEXT: retl
1817 ; X86-AVX-LABEL: mul_2xi16_varconst1:
1818 ; X86-AVX: # %bb.0: # %entry
1819 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
1820 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
1821 ; X86-AVX-NEXT: movl c, %edx
1822 ; X86-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1823 ; X86-AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
1824 ; X86-AVX-NEXT: vpmulld {{\.LCPI.*}}, %xmm0, %xmm0
1825 ; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4)
1826 ; X86-AVX-NEXT: retl
1828 ; X64-SSE-LABEL: mul_2xi16_varconst1:
1829 ; X64-SSE: # %bb.0: # %entry
1830 ; X64-SSE-NEXT: movq {{.*}}(%rip), %rax
1831 ; X64-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1832 ; X64-SSE-NEXT: movdqa {{.*#+}} xmm1 = <0,65535,u,u,u,u,u,u>
1833 ; X64-SSE-NEXT: movdqa %xmm0, %xmm2
1834 ; X64-SSE-NEXT: pmulhuw %xmm1, %xmm2
1835 ; X64-SSE-NEXT: pmullw %xmm1, %xmm0
1836 ; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
1837 ; X64-SSE-NEXT: movq %xmm0, (%rax,%rsi,4)
1838 ; X64-SSE-NEXT: retq
1840 ; X64-AVX-LABEL: mul_2xi16_varconst1:
1841 ; X64-AVX: # %bb.0: # %entry
1842 ; X64-AVX-NEXT: movq {{.*}}(%rip), %rax
1843 ; X64-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1844 ; X64-AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
1845 ; X64-AVX-NEXT: movl $65535, %ecx # imm = 0xFFFF
1846 ; X64-AVX-NEXT: vmovq %rcx, %xmm1
1847 ; X64-AVX-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7]
1848 ; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
1849 ; X64-AVX-NEXT: vpmulld %xmm1, %xmm0, %xmm0
1850 ; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4)
1851 ; X64-AVX-NEXT: retq
entry:
1853 %pre = load i32*, i32** @c
1854 %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
1855 %tmp7 = bitcast i8* %tmp6 to <2 x i16>*
1856 %wide.load = load <2 x i16>, <2 x i16>* %tmp7, align 1
1857 %tmp8 = zext <2 x i16> %wide.load to <2 x i32>
1858 %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 0, i32 65535>
1859 %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
1860 %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
1861 store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
  ret void
}
1865 ; %val = load <2 x i16>
1866 ; %op1 = sext<2 x i32> %val
1867 ; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (-32768 ~ 32767)
1868 ; %rst = mul <2 x i32> %op1, %op2
1870 define void @mul_2xi16_varconst2(i8* nocapture readonly %a, i64 %index) {
1871 ; X86-SSE-LABEL: mul_2xi16_varconst2:
1872 ; X86-SSE: # %bb.0: # %entry
1873 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
1874 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
1875 ; X86-SSE-NEXT: movl c, %edx
1876 ; X86-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1877 ; X86-SSE-NEXT: movdqa {{.*#+}} xmm1 = <32768,32767,u,u,u,u,u,u>
1878 ; X86-SSE-NEXT: movdqa %xmm0, %xmm2
1879 ; X86-SSE-NEXT: pmulhw %xmm1, %xmm2
1880 ; X86-SSE-NEXT: pmullw %xmm1, %xmm0
1881 ; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
1882 ; X86-SSE-NEXT: movq %xmm0, (%edx,%eax,4)
1883 ; X86-SSE-NEXT: retl
1885 ; X86-AVX-LABEL: mul_2xi16_varconst2:
1886 ; X86-AVX: # %bb.0: # %entry
1887 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
1888 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
1889 ; X86-AVX-NEXT: movl c, %edx
1890 ; X86-AVX-NEXT: vpmovsxwq (%ecx,%eax), %xmm0
1891 ; X86-AVX-NEXT: vpmuludq {{\.LCPI.*}}, %xmm0, %xmm0
1892 ; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
1893 ; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4)
1894 ; X86-AVX-NEXT: retl
1896 ; X64-SSE-LABEL: mul_2xi16_varconst2:
1897 ; X64-SSE: # %bb.0: # %entry
1898 ; X64-SSE-NEXT: movq {{.*}}(%rip), %rax
1899 ; X64-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1900 ; X64-SSE-NEXT: movdqa {{.*#+}} xmm1 = <32768,32767,u,u,u,u,u,u>
1901 ; X64-SSE-NEXT: movdqa %xmm0, %xmm2
1902 ; X64-SSE-NEXT: pmulhw %xmm1, %xmm2
1903 ; X64-SSE-NEXT: pmullw %xmm1, %xmm0
1904 ; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
1905 ; X64-SSE-NEXT: movq %xmm0, (%rax,%rsi,4)
1906 ; X64-SSE-NEXT: retq
1908 ; X64-AVX-LABEL: mul_2xi16_varconst2:
1909 ; X64-AVX: # %bb.0: # %entry
1910 ; X64-AVX-NEXT: movq {{.*}}(%rip), %rax
1911 ; X64-AVX-NEXT: vpmovsxwq (%rdi,%rsi), %xmm0
1912 ; X64-AVX-NEXT: vpmuludq {{.*}}(%rip), %xmm0, %xmm0
1913 ; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
1914 ; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4)
1915 ; X64-AVX-NEXT: retq
entry:
1917 %pre = load i32*, i32** @c
1918 %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
1919 %tmp7 = bitcast i8* %tmp6 to <2 x i16>*
1920 %wide.load = load <2 x i16>, <2 x i16>* %tmp7, align 1
1921 %tmp8 = sext <2 x i16> %wide.load to <2 x i32>
1922 %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 -32768, i32 32767>
1923 %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
1924 %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
1925 store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
  ret void
}
1929 ; %val = load <2 x i16>
1930 ; %op1 = zext<2 x i32> %val
1931 ; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (0 ~ 65536)
1932 ; %rst = mul <2 x i32> %op1, %op2
1934 define void @mul_2xi16_varconst3(i8* nocapture readonly %a, i64 %index) {
1935 ; X86-SSE-LABEL: mul_2xi16_varconst3:
1936 ; X86-SSE: # %bb.0: # %entry
1937 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
1938 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
1939 ; X86-SSE-NEXT: movl c, %edx
1940 ; X86-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1941 ; X86-SSE-NEXT: pxor %xmm1, %xmm1
1942 ; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
1943 ; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
1944 ; X86-SSE-NEXT: pmuludq {{\.LCPI.*}}, %xmm0
1945 ; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
1946 ; X86-SSE-NEXT: movq %xmm0, (%edx,%eax,4)
1947 ; X86-SSE-NEXT: retl
1949 ; X86-AVX-LABEL: mul_2xi16_varconst3:
1950 ; X86-AVX: # %bb.0: # %entry
1951 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
1952 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
1953 ; X86-AVX-NEXT: movl c, %edx
1954 ; X86-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1955 ; X86-AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
1956 ; X86-AVX-NEXT: vpmulld {{\.LCPI.*}}, %xmm0, %xmm0
1957 ; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4)
1958 ; X86-AVX-NEXT: retl
1960 ; X64-SSE-LABEL: mul_2xi16_varconst3:
1961 ; X64-SSE: # %bb.0: # %entry
1962 ; X64-SSE-NEXT: movq {{.*}}(%rip), %rax
1963 ; X64-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1964 ; X64-SSE-NEXT: pxor %xmm1, %xmm1
1965 ; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
1966 ; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
1967 ; X64-SSE-NEXT: movl $65536, %ecx # imm = 0x10000
1968 ; X64-SSE-NEXT: movq %rcx, %xmm1
1969 ; X64-SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7]
1970 ; X64-SSE-NEXT: pmuludq %xmm0, %xmm1
1971 ; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
1972 ; X64-SSE-NEXT: movq %xmm0, (%rax,%rsi,4)
1973 ; X64-SSE-NEXT: retq
1975 ; X64-AVX-LABEL: mul_2xi16_varconst3:
1976 ; X64-AVX: # %bb.0: # %entry
1977 ; X64-AVX-NEXT: movq {{.*}}(%rip), %rax
1978 ; X64-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1979 ; X64-AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
1980 ; X64-AVX-NEXT: movl $65536, %ecx # imm = 0x10000
1981 ; X64-AVX-NEXT: vmovq %rcx, %xmm1
1982 ; X64-AVX-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7]
1983 ; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
1984 ; X64-AVX-NEXT: vpmulld %xmm1, %xmm0, %xmm0
1985 ; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4)
1986 ; X64-AVX-NEXT: retq
entry:
1988 %pre = load i32*, i32** @c
1989 %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
1990 %tmp7 = bitcast i8* %tmp6 to <2 x i16>*
1991 %wide.load = load <2 x i16>, <2 x i16>* %tmp7, align 1
1992 %tmp8 = zext <2 x i16> %wide.load to <2 x i32>
1993 %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 0, i32 65536>
1994 %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
1995 %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
1996 store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
  ret void
}
2000 ; %val = load <2 x i16>
2001 ; %op1 = sext<2 x i32> %val
2002 ; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (0 ~ 32768)
2003 ; %rst = mul <2 x i32> %op1, %op2
2005 define void @mul_2xi16_varconst4(i8* nocapture readonly %a, i64 %index) {
2006 ; X86-SSE-LABEL: mul_2xi16_varconst4:
2007 ; X86-SSE: # %bb.0: # %entry
2008 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
2009 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
2010 ; X86-SSE-NEXT: movl c, %edx
2011 ; X86-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
2012 ; X86-SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7]
2013 ; X86-SSE-NEXT: psrad $16, %xmm0
2014 ; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
2015 ; X86-SSE-NEXT: pmuludq {{\.LCPI.*}}, %xmm0
2016 ; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
2017 ; X86-SSE-NEXT: movq %xmm0, (%edx,%eax,4)
2018 ; X86-SSE-NEXT: retl
2020 ; X86-AVX-LABEL: mul_2xi16_varconst4:
2021 ; X86-AVX: # %bb.0: # %entry
2022 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
2023 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
2024 ; X86-AVX-NEXT: movl c, %edx
2025 ; X86-AVX-NEXT: vpmovsxwq (%ecx,%eax), %xmm0
2026 ; X86-AVX-NEXT: vpmuludq {{\.LCPI.*}}, %xmm0, %xmm0
2027 ; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
2028 ; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4)
2029 ; X86-AVX-NEXT: retl
2031 ; X64-SSE-LABEL: mul_2xi16_varconst4:
2032 ; X64-SSE: # %bb.0: # %entry
2033 ; X64-SSE-NEXT: movq {{.*}}(%rip), %rax
2034 ; X64-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
2035 ; X64-SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7]
2036 ; X64-SSE-NEXT: psrad $16, %xmm0
2037 ; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
2038 ; X64-SSE-NEXT: movl $32768, %ecx # imm = 0x8000
2039 ; X64-SSE-NEXT: movq %rcx, %xmm1
2040 ; X64-SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7]
2041 ; X64-SSE-NEXT: pmuludq %xmm0, %xmm1
2042 ; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
2043 ; X64-SSE-NEXT: movq %xmm0, (%rax,%rsi,4)
2044 ; X64-SSE-NEXT: retq
2046 ; X64-AVX-LABEL: mul_2xi16_varconst4:
2047 ; X64-AVX: # %bb.0: # %entry
2048 ; X64-AVX-NEXT: movq {{.*}}(%rip), %rax
2049 ; X64-AVX-NEXT: vpmovsxwq (%rdi,%rsi), %xmm0
2050 ; X64-AVX-NEXT: movl $32768, %ecx # imm = 0x8000
2051 ; X64-AVX-NEXT: vmovq %rcx, %xmm1
2052 ; X64-AVX-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7]
2053 ; X64-AVX-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
2054 ; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
2055 ; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4)
2056 ; X64-AVX-NEXT: retq
entry:
2058 %pre = load i32*, i32** @c
2059 %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
2060 %tmp7 = bitcast i8* %tmp6 to <2 x i16>*
2061 %wide.load = load <2 x i16>, <2 x i16>* %tmp7, align 1
2062 %tmp8 = sext <2 x i16> %wide.load to <2 x i32>
2063 %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 0, i32 32768>
2064 %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
2065 %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
2066 store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
  ret void
}
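; Pattern summary for PR34947 below, inferred from the generated code (the
; scalar remainders feed a multiply by 8199 per lane); names are illustrative,
; matching the comment style of the tests above:
; %val1 = load <9 x i16>
; %op1 = zext<9 x i32> %val1
; %val2 = load <9 x i32>
; %rem = urem <9 x i32> %op1, %val2
; %rst = mul <9 x i32> %rem, 8199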
2074 define void @PR34947(<9 x i16>* %p0, <9 x i32>* %p1) nounwind {
2075 ; X86-SSE-LABEL: PR34947:
; X86-SSE: # %bb.0:
2077 ; X86-SSE-NEXT: pushl %esi
2078 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
2079 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
2080 ; X86-SSE-NEXT: movdqa (%eax), %xmm5
2081 ; X86-SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
2082 ; X86-SSE-NEXT: movdqa (%ecx), %xmm2
2083 ; X86-SSE-NEXT: movdqa 16(%ecx), %xmm6
2084 ; X86-SSE-NEXT: pxor %xmm0, %xmm0
2085 ; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
2086 ; X86-SSE-NEXT: movdqa %xmm5, %xmm4
2087 ; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3]
2088 ; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7]
2089 ; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[3,1,2,3]
2090 ; X86-SSE-NEXT: movd %xmm0, %eax
2091 ; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[3,1,2,3]
2092 ; X86-SSE-NEXT: movd %xmm0, %esi
2093 ; X86-SSE-NEXT: xorl %edx, %edx
2094 ; X86-SSE-NEXT: divl %esi
2095 ; X86-SSE-NEXT: movd %edx, %xmm0
2096 ; X86-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm5[2,3,0,1]
2097 ; X86-SSE-NEXT: movd %xmm3, %eax
2098 ; X86-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm6[2,3,0,1]
2099 ; X86-SSE-NEXT: movd %xmm3, %esi
2100 ; X86-SSE-NEXT: xorl %edx, %edx
2101 ; X86-SSE-NEXT: divl %esi
2102 ; X86-SSE-NEXT: movd %edx, %xmm7
2103 ; X86-SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1]
2104 ; X86-SSE-NEXT: movd %xmm5, %eax
2105 ; X86-SSE-NEXT: movd %xmm6, %esi
2106 ; X86-SSE-NEXT: xorl %edx, %edx
2107 ; X86-SSE-NEXT: divl %esi
2108 ; X86-SSE-NEXT: movd %edx, %xmm3
2109 ; X86-SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,2,3]
2110 ; X86-SSE-NEXT: movd %xmm5, %eax
2111 ; X86-SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,2,3]
2112 ; X86-SSE-NEXT: movd %xmm5, %esi
2113 ; X86-SSE-NEXT: xorl %edx, %edx
2114 ; X86-SSE-NEXT: divl %esi
2115 ; X86-SSE-NEXT: movd %edx, %xmm5
2116 ; X86-SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1]
2117 ; X86-SSE-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm7[0]
2118 ; X86-SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm4[3,1,2,3]
2119 ; X86-SSE-NEXT: movd %xmm6, %eax
2120 ; X86-SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm2[3,1,2,3]
2121 ; X86-SSE-NEXT: movd %xmm6, %esi
2122 ; X86-SSE-NEXT: xorl %edx, %edx
2123 ; X86-SSE-NEXT: divl %esi
2124 ; X86-SSE-NEXT: movd %edx, %xmm6
2125 ; X86-SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm4[2,3,0,1]
2126 ; X86-SSE-NEXT: movd %xmm7, %eax
2127 ; X86-SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm2[2,3,0,1]
2128 ; X86-SSE-NEXT: movd %xmm7, %esi
2129 ; X86-SSE-NEXT: xorl %edx, %edx
2130 ; X86-SSE-NEXT: divl %esi
2131 ; X86-SSE-NEXT: movd %edx, %xmm7
2132 ; X86-SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1]
2133 ; X86-SSE-NEXT: movd %xmm4, %eax
2134 ; X86-SSE-NEXT: movd %xmm2, %esi
2135 ; X86-SSE-NEXT: xorl %edx, %edx
2136 ; X86-SSE-NEXT: divl %esi
2137 ; X86-SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,2,3]
2138 ; X86-SSE-NEXT: movd %xmm4, %eax
2139 ; X86-SSE-NEXT: movd %edx, %xmm4
2140 ; X86-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,2,3]
2141 ; X86-SSE-NEXT: movd %xmm2, %esi
2142 ; X86-SSE-NEXT: xorl %edx, %edx
2143 ; X86-SSE-NEXT: divl %esi
2144 ; X86-SSE-NEXT: movd %edx, %xmm2
2145 ; X86-SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
2146 ; X86-SSE-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm7[0]
2147 ; X86-SSE-NEXT: movd %xmm1, %eax
2148 ; X86-SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm6[0,0]
2149 ; X86-SSE-NEXT: movdqa {{.*#+}} xmm1 = [8199,8199,8199,8199]
2150 ; X86-SSE-NEXT: pmuludq %xmm1, %xmm4
2151 ; X86-SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
2152 ; X86-SSE-NEXT: pmuludq %xmm1, %xmm2
2153 ; X86-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
2154 ; X86-SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
2155 ; X86-SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,0],xmm0[0,0]
2156 ; X86-SSE-NEXT: pmuludq %xmm1, %xmm3
2157 ; X86-SSE-NEXT: pmuludq %xmm1, %xmm5
2158 ; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
2159 ; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[0,2,2,3]
2160 ; X86-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
2161 ; X86-SSE-NEXT: xorl %edx, %edx
2162 ; X86-SSE-NEXT: divl 32(%ecx)
2163 ; X86-SSE-NEXT: movdqa %xmm0, (%eax)
2164 ; X86-SSE-NEXT: movdqa %xmm4, (%eax)
2165 ; X86-SSE-NEXT: imull $8199, %edx, %eax # imm = 0x2007
2166 ; X86-SSE-NEXT: movl %eax, (%eax)
2167 ; X86-SSE-NEXT: popl %esi
2168 ; X86-SSE-NEXT: retl
2170 ; X86-AVX1-LABEL: PR34947:
2171 ; X86-AVX1: # %bb.0:
2172 ; X86-AVX1-NEXT: pushl %ebp
2173 ; X86-AVX1-NEXT: pushl %ebx
2174 ; X86-AVX1-NEXT: pushl %edi
2175 ; X86-AVX1-NEXT: pushl %esi
2176 ; X86-AVX1-NEXT: subl $16, %esp
2177 ; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %ecx
2178 ; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax
2179 ; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
2180 ; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
2181 ; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
2182 ; X86-AVX1-NEXT: vmovd %xmm1, %eax
2183 ; X86-AVX1-NEXT: xorl %edx, %edx
2184 ; X86-AVX1-NEXT: divl 32(%ecx)
2185 ; X86-AVX1-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
2186 ; X86-AVX1-NEXT: vpextrd $3, %xmm2, %eax
2187 ; X86-AVX1-NEXT: vmovdqa (%ecx), %xmm3
2188 ; X86-AVX1-NEXT: vmovdqa 16(%ecx), %xmm1
2189 ; X86-AVX1-NEXT: vpextrd $3, %xmm3, %ecx
2190 ; X86-AVX1-NEXT: xorl %edx, %edx
2191 ; X86-AVX1-NEXT: divl %ecx
2192 ; X86-AVX1-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
2193 ; X86-AVX1-NEXT: vpextrd $2, %xmm2, %eax
2194 ; X86-AVX1-NEXT: vpextrd $2, %xmm3, %ecx
2195 ; X86-AVX1-NEXT: xorl %edx, %edx
2196 ; X86-AVX1-NEXT: divl %ecx
2197 ; X86-AVX1-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
2198 ; X86-AVX1-NEXT: vpextrd $1, %xmm2, %eax
2199 ; X86-AVX1-NEXT: vpextrd $1, %xmm3, %ecx
2200 ; X86-AVX1-NEXT: xorl %edx, %edx
2201 ; X86-AVX1-NEXT: divl %ecx
2202 ; X86-AVX1-NEXT: movl %edx, (%esp) # 4-byte Spill
2203 ; X86-AVX1-NEXT: vmovd %xmm2, %eax
2204 ; X86-AVX1-NEXT: vmovd %xmm3, %ecx
2205 ; X86-AVX1-NEXT: xorl %edx, %edx
2206 ; X86-AVX1-NEXT: divl %ecx
2207 ; X86-AVX1-NEXT: movl %edx, %ebp
2208 ; X86-AVX1-NEXT: vpextrd $3, %xmm0, %eax
2209 ; X86-AVX1-NEXT: vpextrd $3, %xmm1, %ecx
2210 ; X86-AVX1-NEXT: xorl %edx, %edx
2211 ; X86-AVX1-NEXT: divl %ecx
2212 ; X86-AVX1-NEXT: movl %edx, %ebx
2213 ; X86-AVX1-NEXT: vpextrd $2, %xmm0, %eax
2214 ; X86-AVX1-NEXT: vpextrd $2, %xmm1, %esi
2215 ; X86-AVX1-NEXT: xorl %edx, %edx
2216 ; X86-AVX1-NEXT: divl %esi
2217 ; X86-AVX1-NEXT: movl %edx, %esi
2218 ; X86-AVX1-NEXT: vpextrd $1, %xmm0, %eax
2219 ; X86-AVX1-NEXT: vpextrd $1, %xmm1, %edi
2220 ; X86-AVX1-NEXT: xorl %edx, %edx
2221 ; X86-AVX1-NEXT: divl %edi
2222 ; X86-AVX1-NEXT: movl %edx, %edi
2223 ; X86-AVX1-NEXT: vmovd %xmm0, %eax
2224 ; X86-AVX1-NEXT: vmovd %xmm1, %ecx
2225 ; X86-AVX1-NEXT: xorl %edx, %edx
2226 ; X86-AVX1-NEXT: divl %ecx
2227 ; X86-AVX1-NEXT: vmovd %edx, %xmm0
2228 ; X86-AVX1-NEXT: vpinsrd $1, %edi, %xmm0, %xmm0
2229 ; X86-AVX1-NEXT: vpinsrd $2, %esi, %xmm0, %xmm0
2230 ; X86-AVX1-NEXT: vpinsrd $3, %ebx, %xmm0, %xmm0
2231 ; X86-AVX1-NEXT: vmovd %ebp, %xmm1
2232 ; X86-AVX1-NEXT: vpinsrd $1, (%esp), %xmm1, %xmm1 # 4-byte Folded Reload
2233 ; X86-AVX1-NEXT: vpinsrd $2, {{[-0-9]+}}(%e{{[sb]}}p), %xmm1, %xmm1 # 4-byte Folded Reload
2234 ; X86-AVX1-NEXT: vpinsrd $3, {{[-0-9]+}}(%e{{[sb]}}p), %xmm1, %xmm1 # 4-byte Folded Reload
2235 ; X86-AVX1-NEXT: imull $8199, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
2236 ; X86-AVX1-NEXT: # imm = 0x2007
2237 ; X86-AVX1-NEXT: movl %eax, (%eax)
2238 ; X86-AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [8199,8199,8199,8199]
2239 ; X86-AVX1-NEXT: vpmulld %xmm2, %xmm0, %xmm0
2240 ; X86-AVX1-NEXT: vpmulld %xmm2, %xmm1, %xmm1
2241 ; X86-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
2242 ; X86-AVX1-NEXT: vmovaps %ymm0, (%eax)
2243 ; X86-AVX1-NEXT: addl $16, %esp
2244 ; X86-AVX1-NEXT: popl %esi
2245 ; X86-AVX1-NEXT: popl %edi
2246 ; X86-AVX1-NEXT: popl %ebx
2247 ; X86-AVX1-NEXT: popl %ebp
2248 ; X86-AVX1-NEXT: vzeroupper
2249 ; X86-AVX1-NEXT: retl
2251 ; X86-AVX2-LABEL: PR34947:
2252 ; X86-AVX2: # %bb.0:
2253 ; X86-AVX2-NEXT: pushl %edi
2254 ; X86-AVX2-NEXT: pushl %esi
2255 ; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %esi
2256 ; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax
2257 ; X86-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
2258 ; X86-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
2259 ; X86-AVX2-NEXT: vmovdqa (%esi), %xmm2
2260 ; X86-AVX2-NEXT: vmovdqa 16(%esi), %xmm3
2261 ; X86-AVX2-NEXT: vpextrd $1, %xmm3, %ecx
2262 ; X86-AVX2-NEXT: vextracti128 $1, %ymm1, %xmm4
2263 ; X86-AVX2-NEXT: vpextrd $1, %xmm4, %eax
2264 ; X86-AVX2-NEXT: xorl %edx, %edx
2265 ; X86-AVX2-NEXT: divl %ecx
2266 ; X86-AVX2-NEXT: movl %edx, %ecx
2267 ; X86-AVX2-NEXT: vmovd %xmm3, %edi
2268 ; X86-AVX2-NEXT: vmovd %xmm4, %eax
2269 ; X86-AVX2-NEXT: xorl %edx, %edx
2270 ; X86-AVX2-NEXT: divl %edi
2271 ; X86-AVX2-NEXT: vmovd %edx, %xmm5
2272 ; X86-AVX2-NEXT: vpinsrd $1, %ecx, %xmm5, %xmm5
2273 ; X86-AVX2-NEXT: vpextrd $2, %xmm3, %ecx
2274 ; X86-AVX2-NEXT: vpextrd $2, %xmm4, %eax
2275 ; X86-AVX2-NEXT: xorl %edx, %edx
2276 ; X86-AVX2-NEXT: divl %ecx
2277 ; X86-AVX2-NEXT: vpinsrd $2, %edx, %xmm5, %xmm5
2278 ; X86-AVX2-NEXT: vpextrd $3, %xmm3, %ecx
2279 ; X86-AVX2-NEXT: vpextrd $3, %xmm4, %eax
2280 ; X86-AVX2-NEXT: xorl %edx, %edx
2281 ; X86-AVX2-NEXT: divl %ecx
2282 ; X86-AVX2-NEXT: vpinsrd $3, %edx, %xmm5, %xmm3
2283 ; X86-AVX2-NEXT: vpextrd $1, %xmm2, %ecx
2284 ; X86-AVX2-NEXT: vpextrd $1, %xmm1, %eax
2285 ; X86-AVX2-NEXT: xorl %edx, %edx
2286 ; X86-AVX2-NEXT: divl %ecx
2287 ; X86-AVX2-NEXT: movl %edx, %ecx
2288 ; X86-AVX2-NEXT: vmovd %xmm2, %edi
2289 ; X86-AVX2-NEXT: vmovd %xmm1, %eax
2290 ; X86-AVX2-NEXT: xorl %edx, %edx
2291 ; X86-AVX2-NEXT: divl %edi
2292 ; X86-AVX2-NEXT: vmovd %edx, %xmm4
2293 ; X86-AVX2-NEXT: vpinsrd $1, %ecx, %xmm4, %xmm4
2294 ; X86-AVX2-NEXT: vpextrd $2, %xmm2, %ecx
2295 ; X86-AVX2-NEXT: vpextrd $2, %xmm1, %eax
2296 ; X86-AVX2-NEXT: xorl %edx, %edx
2297 ; X86-AVX2-NEXT: divl %ecx
2298 ; X86-AVX2-NEXT: vpinsrd $2, %edx, %xmm4, %xmm4
2299 ; X86-AVX2-NEXT: vpextrd $3, %xmm2, %ecx
2300 ; X86-AVX2-NEXT: vpextrd $3, %xmm1, %eax
2301 ; X86-AVX2-NEXT: xorl %edx, %edx
2302 ; X86-AVX2-NEXT: divl %ecx
2303 ; X86-AVX2-NEXT: vpinsrd $3, %edx, %xmm4, %xmm1
2304 ; X86-AVX2-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1
2305 ; X86-AVX2-NEXT: vmovd %xmm0, %eax
2306 ; X86-AVX2-NEXT: xorl %edx, %edx
2307 ; X86-AVX2-NEXT: divl 32(%esi)
2308 ; X86-AVX2-NEXT: vpbroadcastd {{.*#+}} ymm0 = [8199,8199,8199,8199,8199,8199,8199,8199]
2309 ; X86-AVX2-NEXT: vpmulld %ymm0, %ymm1, %ymm0
2310 ; X86-AVX2-NEXT: imull $8199, %edx, %eax # imm = 0x2007
2311 ; X86-AVX2-NEXT: movl %eax, (%eax)
2312 ; X86-AVX2-NEXT: vmovdqa %ymm0, (%eax)
2313 ; X86-AVX2-NEXT: popl %esi
2314 ; X86-AVX2-NEXT: popl %edi
2315 ; X86-AVX2-NEXT: vzeroupper
2316 ; X86-AVX2-NEXT: retl
2318 ; X64-SSE-LABEL: PR34947:
; X64-SSE: # %bb.0:
; X64-SSE-NEXT: movdqa (%rdi), %xmm5
; X64-SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X64-SSE-NEXT: movdqa (%rsi), %xmm2
; X64-SSE-NEXT: movdqa 16(%rsi), %xmm6
; X64-SSE-NEXT: pxor %xmm0, %xmm0
; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; X64-SSE-NEXT: movdqa %xmm5, %xmm3
; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3]
; X64-SSE-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7]
; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[3,1,2,3]
; X64-SSE-NEXT: movd %xmm0, %eax
; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[3,1,2,3]
; X64-SSE-NEXT: movd %xmm0, %ecx
; X64-SSE-NEXT: xorl %edx, %edx
; X64-SSE-NEXT: divl %ecx
; X64-SSE-NEXT: movd %edx, %xmm8
; X64-SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[2,3,0,1]
; X64-SSE-NEXT: movd %xmm4, %eax
; X64-SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm6[2,3,0,1]
; X64-SSE-NEXT: movd %xmm4, %ecx
; X64-SSE-NEXT: xorl %edx, %edx
; X64-SSE-NEXT: divl %ecx
; X64-SSE-NEXT: movd %edx, %xmm7
; X64-SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1]
; X64-SSE-NEXT: movd %xmm5, %eax
; X64-SSE-NEXT: movd %xmm6, %ecx
; X64-SSE-NEXT: xorl %edx, %edx
; X64-SSE-NEXT: divl %ecx
; X64-SSE-NEXT: movd %edx, %xmm4
; X64-SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,2,3]
; X64-SSE-NEXT: movd %xmm5, %eax
; X64-SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,2,3]
; X64-SSE-NEXT: movd %xmm5, %ecx
; X64-SSE-NEXT: xorl %edx, %edx
; X64-SSE-NEXT: divl %ecx
; X64-SSE-NEXT: movd %edx, %xmm5
; X64-SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
; X64-SSE-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm7[0]
; X64-SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm3[3,1,2,3]
; X64-SSE-NEXT: movd %xmm6, %eax
; X64-SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm2[3,1,2,3]
; X64-SSE-NEXT: movd %xmm6, %ecx
; X64-SSE-NEXT: xorl %edx, %edx
; X64-SSE-NEXT: divl %ecx
; X64-SSE-NEXT: movd %edx, %xmm6
; X64-SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm3[2,3,0,1]
; X64-SSE-NEXT: movd %xmm7, %eax
; X64-SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm2[2,3,0,1]
; X64-SSE-NEXT: movd %xmm7, %ecx
; X64-SSE-NEXT: xorl %edx, %edx
; X64-SSE-NEXT: divl %ecx
; X64-SSE-NEXT: movd %edx, %xmm7
; X64-SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1]
; X64-SSE-NEXT: movd %xmm3, %eax
; X64-SSE-NEXT: movd %xmm2, %ecx
; X64-SSE-NEXT: xorl %edx, %edx
; X64-SSE-NEXT: divl %ecx
; X64-SSE-NEXT: movd %edx, %xmm0
; X64-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,2,3]
; X64-SSE-NEXT: movd %xmm3, %eax
; X64-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,2,3]
; X64-SSE-NEXT: movd %xmm2, %ecx
; X64-SSE-NEXT: xorl %edx, %edx
; X64-SSE-NEXT: divl %ecx
; X64-SSE-NEXT: movd %edx, %xmm2
; X64-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; X64-SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm7[0]
; X64-SSE-NEXT: movd %xmm1, %eax
; X64-SSE-NEXT: xorl %edx, %edx
; X64-SSE-NEXT: divl 32(%rsi)
; X64-SSE-NEXT: movdqa {{.*#+}} xmm1 = [8199,8199,8199,8199]
; X64-SSE-NEXT: pmuludq %xmm1, %xmm0
; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X64-SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm6[0,0]
; X64-SSE-NEXT: pmuludq %xmm1, %xmm2
; X64-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; X64-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; X64-SSE-NEXT: pmuludq %xmm1, %xmm4
; X64-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,2,2,3]
; X64-SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,0],xmm8[0,0]
; X64-SSE-NEXT: pmuludq %xmm1, %xmm5
; X64-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[0,2,2,3]
; X64-SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; X64-SSE-NEXT: imull $8199, %edx, %eax # imm = 0x2007
; X64-SSE-NEXT: movl %eax, (%rax)
; X64-SSE-NEXT: movdqa %xmm2, (%rax)
; X64-SSE-NEXT: movdqa %xmm0, (%rax)
; X64-SSE-NEXT: retq
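; Since x86 has no packed integer divide, the SSE lowering above scalarizes the
; urem: each dword lane is moved to %eax via pshufd/movd, divided with divl, and
; the remainders are repacked with punpckldq/punpcklqdq before the pmuludq
; against the [8199,8199,8199,8199] splat.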
;
; X64-AVX1-LABEL: PR34947:
; X64-AVX1: # %bb.0:
; X64-AVX1-NEXT: pushq %rbp
; X64-AVX1-NEXT: pushq %rbx
; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; X64-AVX1-NEXT: vmovd %xmm1, %eax
; X64-AVX1-NEXT: xorl %edx, %edx
; X64-AVX1-NEXT: divl 32(%rsi)
; X64-AVX1-NEXT: movl %edx, %r8d
; X64-AVX1-NEXT: vpextrd $3, %xmm2, %eax
; X64-AVX1-NEXT: vmovdqa (%rsi), %xmm3
; X64-AVX1-NEXT: vmovdqa 16(%rsi), %xmm1
; X64-AVX1-NEXT: vpextrd $3, %xmm3, %ecx
; X64-AVX1-NEXT: xorl %edx, %edx
; X64-AVX1-NEXT: divl %ecx
; X64-AVX1-NEXT: movl %edx, %r9d
; X64-AVX1-NEXT: vpextrd $2, %xmm2, %eax
; X64-AVX1-NEXT: vpextrd $2, %xmm3, %ecx
; X64-AVX1-NEXT: xorl %edx, %edx
; X64-AVX1-NEXT: divl %ecx
; X64-AVX1-NEXT: movl %edx, %r10d
; X64-AVX1-NEXT: vpextrd $1, %xmm2, %eax
; X64-AVX1-NEXT: vpextrd $1, %xmm3, %ecx
; X64-AVX1-NEXT: xorl %edx, %edx
; X64-AVX1-NEXT: divl %ecx
; X64-AVX1-NEXT: movl %edx, %r11d
; X64-AVX1-NEXT: vmovd %xmm2, %eax
; X64-AVX1-NEXT: vmovd %xmm3, %ecx
; X64-AVX1-NEXT: xorl %edx, %edx
; X64-AVX1-NEXT: divl %ecx
; X64-AVX1-NEXT: movl %edx, %esi
; X64-AVX1-NEXT: vpextrd $3, %xmm0, %eax
; X64-AVX1-NEXT: vpextrd $3, %xmm1, %ecx
; X64-AVX1-NEXT: xorl %edx, %edx
; X64-AVX1-NEXT: divl %ecx
; X64-AVX1-NEXT: movl %edx, %edi
; X64-AVX1-NEXT: vpextrd $2, %xmm0, %eax
; X64-AVX1-NEXT: vpextrd $2, %xmm1, %ecx
; X64-AVX1-NEXT: xorl %edx, %edx
; X64-AVX1-NEXT: divl %ecx
; X64-AVX1-NEXT: movl %edx, %ecx
; X64-AVX1-NEXT: vpextrd $1, %xmm0, %eax
; X64-AVX1-NEXT: vpextrd $1, %xmm1, %ebx
; X64-AVX1-NEXT: xorl %edx, %edx
; X64-AVX1-NEXT: divl %ebx
; X64-AVX1-NEXT: movl %edx, %ebx
; X64-AVX1-NEXT: vmovd %xmm0, %eax
; X64-AVX1-NEXT: vmovd %xmm1, %ebp
; X64-AVX1-NEXT: xorl %edx, %edx
; X64-AVX1-NEXT: divl %ebp
; X64-AVX1-NEXT: vmovd %edx, %xmm0
; X64-AVX1-NEXT: vpinsrd $1, %ebx, %xmm0, %xmm0
; X64-AVX1-NEXT: vpinsrd $2, %ecx, %xmm0, %xmm0
; X64-AVX1-NEXT: vpinsrd $3, %edi, %xmm0, %xmm0
; X64-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [8199,8199,8199,8199]
; X64-AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0
; X64-AVX1-NEXT: vmovd %esi, %xmm2
; X64-AVX1-NEXT: vpinsrd $1, %r11d, %xmm2, %xmm2
; X64-AVX1-NEXT: vpinsrd $2, %r10d, %xmm2, %xmm2
; X64-AVX1-NEXT: vpinsrd $3, %r9d, %xmm2, %xmm2
; X64-AVX1-NEXT: vpmulld %xmm1, %xmm2, %xmm1
; X64-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; X64-AVX1-NEXT: imull $8199, %r8d, %eax # imm = 0x2007
; X64-AVX1-NEXT: movl %eax, (%rax)
; X64-AVX1-NEXT: vmovaps %ymm0, (%rax)
; X64-AVX1-NEXT: popq %rbx
; X64-AVX1-NEXT: popq %rbp
; X64-AVX1-NEXT: vzeroupper
; X64-AVX1-NEXT: retq
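; The AVX1 lowering above performs the same per-lane divl sequence but rebuilds
; two 128-bit halves with vpinsrd and joins them via vinsertf128; the AVX2
; lowering below instead multiplies a single 256-bit vector with vpmulld against
; a vpbroadcastd splat of 8199.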
;
; X64-AVX2-LABEL: PR34947:
; X64-AVX2: # %bb.0:
; X64-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
; X64-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
; X64-AVX2-NEXT: vmovdqa (%rsi), %xmm2
; X64-AVX2-NEXT: vmovdqa 16(%rsi), %xmm3
; X64-AVX2-NEXT: vpextrd $1, %xmm3, %ecx
; X64-AVX2-NEXT: vextracti128 $1, %ymm1, %xmm4
; X64-AVX2-NEXT: vpextrd $1, %xmm4, %eax
; X64-AVX2-NEXT: xorl %edx, %edx
; X64-AVX2-NEXT: divl %ecx
; X64-AVX2-NEXT: movl %edx, %ecx
; X64-AVX2-NEXT: vmovd %xmm3, %edi
; X64-AVX2-NEXT: vmovd %xmm4, %eax
; X64-AVX2-NEXT: xorl %edx, %edx
; X64-AVX2-NEXT: divl %edi
; X64-AVX2-NEXT: vmovd %edx, %xmm5
; X64-AVX2-NEXT: vpinsrd $1, %ecx, %xmm5, %xmm5
; X64-AVX2-NEXT: vpextrd $2, %xmm3, %ecx
; X64-AVX2-NEXT: vpextrd $2, %xmm4, %eax
; X64-AVX2-NEXT: xorl %edx, %edx
; X64-AVX2-NEXT: divl %ecx
; X64-AVX2-NEXT: vpinsrd $2, %edx, %xmm5, %xmm5
; X64-AVX2-NEXT: vpextrd $3, %xmm3, %ecx
; X64-AVX2-NEXT: vpextrd $3, %xmm4, %eax
; X64-AVX2-NEXT: xorl %edx, %edx
; X64-AVX2-NEXT: divl %ecx
; X64-AVX2-NEXT: vpinsrd $3, %edx, %xmm5, %xmm3
; X64-AVX2-NEXT: vpextrd $1, %xmm2, %ecx
; X64-AVX2-NEXT: vpextrd $1, %xmm1, %eax
; X64-AVX2-NEXT: xorl %edx, %edx
; X64-AVX2-NEXT: divl %ecx
; X64-AVX2-NEXT: movl %edx, %ecx
; X64-AVX2-NEXT: vmovd %xmm2, %edi
; X64-AVX2-NEXT: vmovd %xmm1, %eax
; X64-AVX2-NEXT: xorl %edx, %edx
; X64-AVX2-NEXT: divl %edi
; X64-AVX2-NEXT: vmovd %edx, %xmm4
; X64-AVX2-NEXT: vpinsrd $1, %ecx, %xmm4, %xmm4
; X64-AVX2-NEXT: vpextrd $2, %xmm2, %ecx
; X64-AVX2-NEXT: vpextrd $2, %xmm1, %eax
; X64-AVX2-NEXT: xorl %edx, %edx
; X64-AVX2-NEXT: divl %ecx
; X64-AVX2-NEXT: vpinsrd $2, %edx, %xmm4, %xmm4
; X64-AVX2-NEXT: vpextrd $3, %xmm2, %ecx
; X64-AVX2-NEXT: vpextrd $3, %xmm1, %eax
; X64-AVX2-NEXT: xorl %edx, %edx
; X64-AVX2-NEXT: divl %ecx
; X64-AVX2-NEXT: vpinsrd $3, %edx, %xmm4, %xmm1
; X64-AVX2-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1
; X64-AVX2-NEXT: vmovd %xmm0, %eax
; X64-AVX2-NEXT: xorl %edx, %edx
; X64-AVX2-NEXT: divl 32(%rsi)
; X64-AVX2-NEXT: vpbroadcastd {{.*#+}} ymm0 = [8199,8199,8199,8199,8199,8199,8199,8199]
; X64-AVX2-NEXT: vpmulld %ymm0, %ymm1, %ymm0
; X64-AVX2-NEXT: imull $8199, %edx, %eax # imm = 0x2007
; X64-AVX2-NEXT: movl %eax, (%rax)
; X64-AVX2-NEXT: vmovdqa %ymm0, (%rax)
; X64-AVX2-NEXT: vzeroupper
; X64-AVX2-NEXT: retq
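; In every lowering the ninth element is handled as a scalar: it is divided by
; the dword at offset 32 of %p1 (divl 32(%rsi) / divl 32(%esi)), multiplied by
; 8199 with imull, and stored separately from the <8 x i32> vector result.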
  %a0 = load <9 x i16>, <9 x i16>* %p0, align 64
  %a1 = load <9 x i32>, <9 x i32>* %p1, align 64
  %ext0 = zext <9 x i16> %a0 to <9 x i32>
  %rem = urem <9 x i32> %ext0, %a1
  %mul = mul <9 x i32> <i32 8199, i32 8199, i32 8199, i32 8199, i32 8199, i32 8199, i32 8199, i32 8199, i32 8199>, %rem
  store <9 x i32> %mul, <9 x i32>* undef, align 64