; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s

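; The hadd of two values masked to [0, 3] is at most 6, so the "ule 8" compare
; below is always true and the select always returns zero; the whole function
; folds to a single vxorps.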
define <4 x i32> @hadd_select_v4i32(<4 x i32> %x, <4 x i32> %y) {
; CHECK-LABEL: hadd_select_v4i32:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; CHECK-NEXT:    retq
entry:
  %and1 = and <4 x i32> %x, <i32 3, i32 3, i32 3, i32 3>
  %and2 = and <4 x i32> %y, <i32 3, i32 3, i32 3, i32 3>
  %hadd = tail call <4 x i32> @llvm.x86.ssse3.phadd.d.128(<4 x i32> %and1, <4 x i32> %and2)
  %cond = icmp ule <4 x i32> %hadd, <i32 8, i32 8, i32 8, i32 8>
  %ret = select <4 x i1> %cond, <4 x i32> zeroinitializer, <4 x i32> %hadd
  ret <4 x i32> %ret
}

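; Each hadd result lies in [0, 6], so the trunc can be lowered with a plain
; pack; the pack's unsigned saturation never triggers for these values. The
; v8i32 and v16i16 variants below follow the same pattern at 256-bit width.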
define <8 x i8> @hadd_trunc_v8i16(<8 x i16> %x, <8 x i16> %y) {
; CHECK-LABEL: hadd_trunc_v8i16:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vpbroadcastw {{.*#+}} xmm2 = [3,3,3,3,3,3,3,3]
; CHECK-NEXT:    vpand %xmm2, %xmm0, %xmm0
; CHECK-NEXT:    vpand %xmm2, %xmm1, %xmm1
; CHECK-NEXT:    vphaddw %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    vpackuswb %xmm0, %xmm0, %xmm0
; CHECK-NEXT:    retq
entry:
  %and1 = and <8 x i16> %x, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
  %and2 = and <8 x i16> %y, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
  %hadd = tail call <8 x i16> @llvm.x86.ssse3.phadd.w.128(<8 x i16> %and1, <8 x i16> %and2)
  %conv = trunc <8 x i16> %hadd to <8 x i8>
  ret <8 x i8> %conv
}

define <8 x i16> @hadd_trunc_v8i32(<8 x i32> %x, <8 x i32> %y) {
; CHECK-LABEL: hadd_trunc_v8i32:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [3,3,3,3,3,3,3,3]
; CHECK-NEXT:    vpand %ymm2, %ymm0, %ymm0
; CHECK-NEXT:    vpand %ymm2, %ymm1, %ymm1
; CHECK-NEXT:    vphaddd %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    vextracti128 $1, %ymm0, %xmm1
; CHECK-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
entry:
  %and1 = and <8 x i32> %x, <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
  %and2 = and <8 x i32> %y, <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
  %hadd = tail call <8 x i32> @llvm.x86.avx2.phadd.d(<8 x i32> %and1, <8 x i32> %and2)
  %conv = trunc <8 x i32> %hadd to <8 x i16>
  ret <8 x i16> %conv
}

define <16 x i8> @hadd_trunc_v16i16(<16 x i16> %x, <16 x i16> %y) {
; CHECK-LABEL: hadd_trunc_v16i16:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vpbroadcastw {{.*#+}} ymm2 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3]
; CHECK-NEXT:    vpand %ymm2, %ymm0, %ymm0
; CHECK-NEXT:    vpand %ymm2, %ymm1, %ymm1
; CHECK-NEXT:    vphaddw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    vextracti128 $1, %ymm0, %xmm1
; CHECK-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
entry:
  %and1 = and <16 x i16> %x, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
  %and2 = and <16 x i16> %y, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
  %hadd = tail call <16 x i16> @llvm.x86.avx2.phadd.w(<16 x i16> %and1, <16 x i16> %and2)
  %conv = trunc <16 x i16> %hadd to <16 x i8>
  ret <16 x i8> %conv
}

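; Both phsub operands have their low 16 bits forced to all-ones, so the low 16
; bits of each difference are zero and the shl-by-16 result is always zero; the
; compare is always true and the select folds to zero.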
define <4 x i32> @hsub_select_shl_v4i32(<4 x i32> %x, <4 x i32> %y) {
; CHECK-LABEL: hsub_select_shl_v4i32:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; CHECK-NEXT:    retq
entry:
  %or1 = or <4 x i32> %x, <i32 65535, i32 65535, i32 65535, i32 65535>
  %or2 = or <4 x i32> %y, <i32 65535, i32 65535, i32 65535, i32 65535>
  %hsub = tail call <4 x i32> @llvm.x86.ssse3.phsub.d.128(<4 x i32> %or1, <4 x i32> %or2)
  %shl = shl <4 x i32> %hsub, <i32 16, i32 16, i32 16, i32 16>
  %cond = icmp ule <4 x i32> %shl, <i32 8, i32 8, i32 8, i32 8>
  %ret = select <4 x i1> %cond, <4 x i32> zeroinitializer, <4 x i32> %hsub
  ret <4 x i32> %ret
}

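; (x | 255) - (y | 255) always has zero in its low 8 bits, so the trunc to i8
; yields zero. The v8i32 and v16i16 variants below fold the same way.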
define <8 x i8> @hsub_trunc_v8i16(<8 x i16> %x, <8 x i16> %y) {
; CHECK-LABEL: hsub_trunc_v8i16:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; CHECK-NEXT:    retq
entry:
  %or1 = or <8 x i16> %x, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
  %or2 = or <8 x i16> %y, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
  %hsub = tail call <8 x i16> @llvm.x86.ssse3.phsub.w.128(<8 x i16> %or1, <8 x i16> %or2)
  %conv = trunc <8 x i16> %hsub to <8 x i8>
  ret <8 x i8> %conv
}

define <8 x i16> @hsub_trunc_v8i32(<8 x i32> %x, <8 x i32> %y) {
; CHECK-LABEL: hsub_trunc_v8i32:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; CHECK-NEXT:    retq
entry:
  %or1 = or <8 x i32> %x, <i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535>
  %or2 = or <8 x i32> %y, <i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535>
  %hsub = tail call <8 x i32> @llvm.x86.avx2.phsub.d(<8 x i32> %or1, <8 x i32> %or2)
  %conv = trunc <8 x i32> %hsub to <8 x i16>
  ret <8 x i16> %conv
}

define <16 x i8> @hsub_trunc_v16i16(<16 x i16> %x, <16 x i16> %y) {
; CHECK-LABEL: hsub_trunc_v16i16:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; CHECK-NEXT:    retq
entry:
  %or1 = or <16 x i16> %x, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
  %or2 = or <16 x i16> %y, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
  %hsub = tail call <16 x i16> @llvm.x86.avx2.phsub.w(<16 x i16> %or1, <16 x i16> %or2)
  %conv = trunc <16 x i16> %hsub to <16 x i8>
  ret <16 x i8> %conv
}

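; Only one lane of the hadd survives the final mask-and-trunc. Here that lane
; is fed entirely by the low half of %x, so the computation narrows to 128 bits.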
define <8 x i16> @hadd_extract_2st_trunc_v8i32(<8 x i32> %x, <8 x i32> %y) {
; CHECK-LABEL: hadd_extract_2st_trunc_v8i32:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
; CHECK-NEXT:    vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[4,5],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
entry:
  %and1 = and <8 x i32> %x, <i32 -1, i32 -1, i32 3, i32 3, i32 -1, i32 -1, i32 -1, i32 -1>
  %and2 = and <8 x i32> %y, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
  %hadd = tail call <8 x i32> @llvm.x86.avx2.phadd.d(<8 x i32> %and1, <8 x i32> %and2)
  %andr = and <8 x i32> %hadd, <i32 0, i32 -1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
  %conv = trunc <8 x i32> %andr to <8 x i16>
  ret <8 x i16> %conv
}

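; Same extraction pattern, but the surviving lane is fed by the high half of
; %y, so the hadd still operates on full 256-bit vectors.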
define <8 x i16> @hadd_extract_8th_trunc_v8i32(<8 x i32> %x, <8 x i32> %y) {
; CHECK-LABEL: hadd_extract_8th_trunc_v8i32:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; CHECK-NEXT:    vphaddd %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
; CHECK-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; CHECK-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5,6],xmm0[7]
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
entry:
  %and1 = and <8 x i32> %x, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
  %and2 = and <8 x i32> %y, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 3, i32 3>
  %hadd = tail call <8 x i32> @llvm.x86.avx2.phadd.d(<8 x i32> %and1, <8 x i32> %and2)
  %andr = and <8 x i32> %hadd, <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 -1>
  %conv = trunc <8 x i32> %andr to <8 x i16>
  ret <8 x i16> %conv
}

define <8 x i16> @hadd_extract_2st_trunc_redundant_and_v4i32(<8 x i32> %x, <8 x i32> %y) {
; CHECK-LABEL: hadd_extract_2st_trunc_redundant_and_v4i32:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
; CHECK-NEXT:    vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[4,5],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
entry:
  %and1 = and <8 x i32> %x, <i32 3, i32 3, i32 -1, i32 -1, i32 3, i32 3, i32 3, i32 3>
  %and2 = and <8 x i32> %y, <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
  %hadd = tail call <8 x i32> @llvm.x86.avx2.phadd.d(<8 x i32> %and1, <8 x i32> %and2)
  %andr = and <8 x i32> %hadd, <i32 0, i32 -1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
  %conv = trunc <8 x i32> %andr to <8 x i16>
  ret <8 x i16> %conv
}

define <8 x i16> @hadd_extract_4th_trunc_redundant_and_v4i32(<8 x i32> %x, <8 x i32> %y) {
; CHECK-LABEL: hadd_extract_4th_trunc_redundant_and_v4i32:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [3,3,3,3,3,3,3,3]
; CHECK-NEXT:    vpand %ymm2, %ymm0, %ymm0
; CHECK-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; CHECK-NEXT:    vphaddd %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
; CHECK-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; CHECK-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5,6],xmm0[7]
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
entry:
  %and1 = and <8 x i32> %x, <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
  %and2 = and <8 x i32> %y, <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 -1, i32 -1>
  %hadd = tail call <8 x i32> @llvm.x86.avx2.phadd.d(<8 x i32> %and1, <8 x i32> %and2)
  %andr = and <8 x i32> %hadd, <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 -1>
  %conv = trunc <8 x i32> %andr to <8 x i16>
  ret <8 x i16> %conv
}

declare <4 x i32> @llvm.x86.ssse3.phadd.d.128(<4 x i32>, <4 x i32>)
declare <8 x i16> @llvm.x86.ssse3.phadd.w.128(<8 x i16>, <8 x i16>)
declare <8 x i32> @llvm.x86.avx2.phadd.d(<8 x i32>, <8 x i32>)
declare <16 x i16> @llvm.x86.avx2.phadd.w(<16 x i16>, <16 x i16>)
declare <4 x i32> @llvm.x86.ssse3.phsub.d.128(<4 x i32>, <4 x i32>)
declare <8 x i16> @llvm.x86.ssse3.phsub.w.128(<8 x i16>, <8 x i16>)
declare <8 x i32> @llvm.x86.avx2.phsub.d(<8 x i32>, <8 x i32>)
declare <16 x i16> @llvm.x86.avx2.phsub.w(<16 x i16>, <16 x i16>)