1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -global-isel -amdgpu-codegenprepare-widen-16-bit-ops=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,GFX6 %s
3 ; RUN: llc -global-isel -amdgpu-codegenprepare-widen-16-bit-ops=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s
4 ; RUN: llc -global-isel -amdgpu-codegenprepare-widen-16-bit-ops=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10 %s
6 define amdgpu_ps i32 @s_andn2_i32(i32 inreg %src0, i32 inreg %src1) {
7 ; GCN-LABEL: s_andn2_i32:
9 ; GCN-NEXT: s_andn2_b32 s0, s2, s3
10 ; GCN-NEXT: ; return to shader part epilog
12 ; GFX10-LABEL: s_andn2_i32:
14 ; GFX10-NEXT: s_andn2_b32 s0, s2, s3
15 ; GFX10-NEXT: ; return to shader part epilog
16 %not.src1 = xor i32 %src1, -1
17 %and = and i32 %src0, %not.src1
21 define amdgpu_ps i32 @s_andn2_i32_commute(i32 inreg %src0, i32 inreg %src1) {
22 ; GCN-LABEL: s_andn2_i32_commute:
24 ; GCN-NEXT: s_andn2_b32 s0, s2, s3
25 ; GCN-NEXT: ; return to shader part epilog
27 ; GFX10-LABEL: s_andn2_i32_commute:
29 ; GFX10-NEXT: s_andn2_b32 s0, s2, s3
30 ; GFX10-NEXT: ; return to shader part epilog
31 %not.src1 = xor i32 %src1, -1
32 %and = and i32 %not.src1, %src0
36 define amdgpu_ps { i32, i32 } @s_andn2_i32_multi_use(i32 inreg %src0, i32 inreg %src1) {
37 ; GCN-LABEL: s_andn2_i32_multi_use:
39 ; GCN-NEXT: s_not_b32 s1, s3
40 ; GCN-NEXT: s_andn2_b32 s0, s2, s3
41 ; GCN-NEXT: ; return to shader part epilog
43 ; GFX10-LABEL: s_andn2_i32_multi_use:
45 ; GFX10-NEXT: s_andn2_b32 s0, s2, s3
46 ; GFX10-NEXT: s_not_b32 s1, s3
47 ; GFX10-NEXT: ; return to shader part epilog
48 %not.src1 = xor i32 %src1, -1
49 %and = and i32 %src0, %not.src1
50 %insert.0 = insertvalue { i32, i32 } undef, i32 %and, 0
51 %insert.1 = insertvalue { i32, i32 } %insert.0, i32 %not.src1, 1
52 ret { i32, i32 } %insert.1
55 define amdgpu_ps { i32, i32 } @s_andn2_i32_multi_foldable_use(i32 inreg %src0, i32 inreg %src1, i32 inreg %src2) {
56 ; GCN-LABEL: s_andn2_i32_multi_foldable_use:
58 ; GCN-NEXT: s_andn2_b32 s0, s2, s4
59 ; GCN-NEXT: s_andn2_b32 s1, s3, s4
60 ; GCN-NEXT: ; return to shader part epilog
62 ; GFX10-LABEL: s_andn2_i32_multi_foldable_use:
64 ; GFX10-NEXT: s_andn2_b32 s0, s2, s4
65 ; GFX10-NEXT: s_andn2_b32 s1, s3, s4
66 ; GFX10-NEXT: ; return to shader part epilog
67 %not.src2 = xor i32 %src2, -1
68 %and0 = and i32 %src0, %not.src2
69 %and1 = and i32 %src1, %not.src2
70 %insert.0 = insertvalue { i32, i32 } undef, i32 %and0, 0
71 %insert.1 = insertvalue { i32, i32 } %insert.0, i32 %and1, 1
72 ret { i32, i32 } %insert.1
75 define i32 @v_andn2_i32(i32 %src0, i32 %src1) {
76 ; GCN-LABEL: v_andn2_i32:
78 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
79 ; GCN-NEXT: v_xor_b32_e32 v1, -1, v1
80 ; GCN-NEXT: v_and_b32_e32 v0, v0, v1
81 ; GCN-NEXT: s_setpc_b64 s[30:31]
83 ; GFX10-LABEL: v_andn2_i32:
85 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
86 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
87 ; GFX10-NEXT: v_xor_b32_e32 v1, -1, v1
88 ; GFX10-NEXT: v_and_b32_e32 v0, v0, v1
89 ; GFX10-NEXT: s_setpc_b64 s[30:31]
90 %not.src1 = xor i32 %src1, -1
91 %and = and i32 %src0, %not.src1
95 define amdgpu_ps float @v_andn2_i32_sv(i32 inreg %src0, i32 %src1) {
96 ; GCN-LABEL: v_andn2_i32_sv:
98 ; GCN-NEXT: v_xor_b32_e32 v0, -1, v0
99 ; GCN-NEXT: v_and_b32_e32 v0, s2, v0
100 ; GCN-NEXT: ; return to shader part epilog
102 ; GFX10-LABEL: v_andn2_i32_sv:
104 ; GFX10-NEXT: v_xor_b32_e32 v0, -1, v0
105 ; GFX10-NEXT: v_and_b32_e32 v0, s2, v0
106 ; GFX10-NEXT: ; return to shader part epilog
107 %not.src1 = xor i32 %src1, -1
108 %and = and i32 %src0, %not.src1
109 %cast = bitcast i32 %and to float
113 define amdgpu_ps float @v_andn2_i32_vs(i32 %src0, i32 inreg %src1) {
114 ; GCN-LABEL: v_andn2_i32_vs:
116 ; GCN-NEXT: s_not_b32 s0, s2
117 ; GCN-NEXT: v_and_b32_e32 v0, s0, v0
118 ; GCN-NEXT: ; return to shader part epilog
120 ; GFX10-LABEL: v_andn2_i32_vs:
122 ; GFX10-NEXT: s_not_b32 s0, s2
123 ; GFX10-NEXT: v_and_b32_e32 v0, s0, v0
124 ; GFX10-NEXT: ; return to shader part epilog
125 %not.src1 = xor i32 %src1, -1
126 %and = and i32 %src0, %not.src1
127 %cast = bitcast i32 %and to float
131 define amdgpu_ps i64 @s_andn2_i64(i64 inreg %src0, i64 inreg %src1) {
132 ; GCN-LABEL: s_andn2_i64:
134 ; GCN-NEXT: s_andn2_b64 s[0:1], s[2:3], s[4:5]
135 ; GCN-NEXT: ; return to shader part epilog
137 ; GFX10-LABEL: s_andn2_i64:
139 ; GFX10-NEXT: s_andn2_b64 s[0:1], s[2:3], s[4:5]
140 ; GFX10-NEXT: ; return to shader part epilog
141 %not.src1 = xor i64 %src1, -1
142 %and = and i64 %src0, %not.src1
146 define amdgpu_ps i64 @s_andn2_i64_commute(i64 inreg %src0, i64 inreg %src1) {
147 ; GCN-LABEL: s_andn2_i64_commute:
149 ; GCN-NEXT: s_andn2_b64 s[0:1], s[2:3], s[4:5]
150 ; GCN-NEXT: ; return to shader part epilog
152 ; GFX10-LABEL: s_andn2_i64_commute:
154 ; GFX10-NEXT: s_andn2_b64 s[0:1], s[2:3], s[4:5]
155 ; GFX10-NEXT: ; return to shader part epilog
156 %not.src1 = xor i64 %src1, -1
157 %and = and i64 %not.src1, %src0
161 define amdgpu_ps { i64, i64 } @s_andn2_i64_multi_foldable_use(i64 inreg %src0, i64 inreg %src1, i64 inreg %src2) {
162 ; GCN-LABEL: s_andn2_i64_multi_foldable_use:
164 ; GCN-NEXT: s_andn2_b64 s[0:1], s[2:3], s[6:7]
165 ; GCN-NEXT: s_andn2_b64 s[2:3], s[4:5], s[6:7]
166 ; GCN-NEXT: ; return to shader part epilog
168 ; GFX10-LABEL: s_andn2_i64_multi_foldable_use:
170 ; GFX10-NEXT: s_andn2_b64 s[0:1], s[2:3], s[6:7]
171 ; GFX10-NEXT: s_andn2_b64 s[2:3], s[4:5], s[6:7]
172 ; GFX10-NEXT: ; return to shader part epilog
173 %not.src2 = xor i64 %src2, -1
174 %and0 = and i64 %src0, %not.src2
175 %and1 = and i64 %src1, %not.src2
176 %insert.0 = insertvalue { i64, i64 } undef, i64 %and0, 0
177 %insert.1 = insertvalue { i64, i64 } %insert.0, i64 %and1, 1
178 ret { i64, i64 } %insert.1
181 define amdgpu_ps { i64, i64 } @s_andn2_i64_multi_use(i64 inreg %src0, i64 inreg %src1) {
182 ; GCN-LABEL: s_andn2_i64_multi_use:
184 ; GCN-NEXT: s_not_b64 s[6:7], s[4:5]
185 ; GCN-NEXT: s_andn2_b64 s[0:1], s[2:3], s[4:5]
186 ; GCN-NEXT: s_mov_b32 s2, s6
187 ; GCN-NEXT: s_mov_b32 s3, s7
188 ; GCN-NEXT: ; return to shader part epilog
190 ; GFX10-LABEL: s_andn2_i64_multi_use:
192 ; GFX10-NEXT: s_andn2_b64 s[0:1], s[2:3], s[4:5]
193 ; GFX10-NEXT: s_not_b64 s[2:3], s[4:5]
194 ; GFX10-NEXT: ; return to shader part epilog
195 %not.src1 = xor i64 %src1, -1
196 %and = and i64 %src0, %not.src1
197 %insert.0 = insertvalue { i64, i64 } undef, i64 %and, 0
198 %insert.1 = insertvalue { i64, i64 } %insert.0, i64 %not.src1, 1
199 ret { i64, i64 } %insert.1
202 define i64 @v_andn2_i64(i64 %src0, i64 %src1) {
203 ; GCN-LABEL: v_andn2_i64:
205 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
206 ; GCN-NEXT: v_xor_b32_e32 v2, -1, v2
207 ; GCN-NEXT: v_xor_b32_e32 v3, -1, v3
208 ; GCN-NEXT: v_and_b32_e32 v0, v0, v2
209 ; GCN-NEXT: v_and_b32_e32 v1, v1, v3
210 ; GCN-NEXT: s_setpc_b64 s[30:31]
212 ; GFX10-LABEL: v_andn2_i64:
214 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
215 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
216 ; GFX10-NEXT: v_xor_b32_e32 v2, -1, v2
217 ; GFX10-NEXT: v_xor_b32_e32 v3, -1, v3
218 ; GFX10-NEXT: v_and_b32_e32 v0, v0, v2
219 ; GFX10-NEXT: v_and_b32_e32 v1, v1, v3
220 ; GFX10-NEXT: s_setpc_b64 s[30:31]
221 %not.src1 = xor i64 %src1, -1
222 %and = and i64 %src0, %not.src1
226 define amdgpu_ps <2 x float> @v_andn2_i64_sv(i64 inreg %src0, i64 %src1) {
227 ; GCN-LABEL: v_andn2_i64_sv:
229 ; GCN-NEXT: v_xor_b32_e32 v0, -1, v0
230 ; GCN-NEXT: v_xor_b32_e32 v1, -1, v1
231 ; GCN-NEXT: v_and_b32_e32 v0, s2, v0
232 ; GCN-NEXT: v_and_b32_e32 v1, s3, v1
233 ; GCN-NEXT: ; return to shader part epilog
235 ; GFX10-LABEL: v_andn2_i64_sv:
237 ; GFX10-NEXT: v_xor_b32_e32 v0, -1, v0
238 ; GFX10-NEXT: v_xor_b32_e32 v1, -1, v1
239 ; GFX10-NEXT: v_and_b32_e32 v0, s2, v0
240 ; GFX10-NEXT: v_and_b32_e32 v1, s3, v1
241 ; GFX10-NEXT: ; return to shader part epilog
242 %not.src1 = xor i64 %src1, -1
243 %and = and i64 %src0, %not.src1
244 %cast = bitcast i64 %and to <2 x float>
245 ret <2 x float> %cast
248 define amdgpu_ps <2 x float> @v_andn2_i64_vs(i64 %src0, i64 inreg %src1) {
249 ; GCN-LABEL: v_andn2_i64_vs:
251 ; GCN-NEXT: s_not_b64 s[0:1], s[2:3]
252 ; GCN-NEXT: v_and_b32_e32 v0, s0, v0
253 ; GCN-NEXT: v_and_b32_e32 v1, s1, v1
254 ; GCN-NEXT: ; return to shader part epilog
256 ; GFX10-LABEL: v_andn2_i64_vs:
258 ; GFX10-NEXT: s_not_b64 s[0:1], s[2:3]
259 ; GFX10-NEXT: v_and_b32_e32 v0, s0, v0
260 ; GFX10-NEXT: v_and_b32_e32 v1, s1, v1
261 ; GFX10-NEXT: ; return to shader part epilog
262 %not.src1 = xor i64 %src1, -1
263 %and = and i64 %src0, %not.src1
264 %cast = bitcast i64 %and to <2 x float>
265 ret <2 x float> %cast
268 define amdgpu_ps <2 x i32> @s_andn2_v2i32(<2 x i32> inreg %src0, <2 x i32> inreg %src1) {
269 ; GCN-LABEL: s_andn2_v2i32:
271 ; GCN-NEXT: s_andn2_b64 s[0:1], s[2:3], s[4:5]
272 ; GCN-NEXT: ; return to shader part epilog
274 ; GFX10-LABEL: s_andn2_v2i32:
276 ; GFX10-NEXT: s_andn2_b64 s[0:1], s[2:3], s[4:5]
277 ; GFX10-NEXT: ; return to shader part epilog
278 %not.src1 = xor <2 x i32> %src1, <i32 -1, i32 -1>
279 %and = and <2 x i32> %src0, %not.src1
283 define amdgpu_ps <2 x i32> @s_andn2_v2i32_commute(<2 x i32> inreg %src0, <2 x i32> inreg %src1) {
284 ; GCN-LABEL: s_andn2_v2i32_commute:
286 ; GCN-NEXT: s_andn2_b64 s[0:1], s[2:3], s[4:5]
287 ; GCN-NEXT: ; return to shader part epilog
289 ; GFX10-LABEL: s_andn2_v2i32_commute:
291 ; GFX10-NEXT: s_andn2_b64 s[0:1], s[2:3], s[4:5]
292 ; GFX10-NEXT: ; return to shader part epilog
293 %not.src1 = xor <2 x i32> %src1, <i32 -1, i32 -1>
294 %and = and <2 x i32> %not.src1, %src0
298 define amdgpu_ps i16 @s_andn2_i16(i16 inreg %src0, i16 inreg %src1) {
299 ; GCN-LABEL: s_andn2_i16:
301 ; GCN-NEXT: s_andn2_b32 s0, s2, s3
302 ; GCN-NEXT: ; return to shader part epilog
304 ; GFX10-LABEL: s_andn2_i16:
306 ; GFX10-NEXT: s_andn2_b32 s0, s2, s3
307 ; GFX10-NEXT: ; return to shader part epilog
308 %not.src1 = xor i16 %src1, -1
309 %and = and i16 %src0, %not.src1
313 define amdgpu_ps i16 @s_andn2_i16_commute(i16 inreg %src0, i16 inreg %src1) {
314 ; GCN-LABEL: s_andn2_i16_commute:
316 ; GCN-NEXT: s_andn2_b32 s0, s2, s3
317 ; GCN-NEXT: ; return to shader part epilog
319 ; GFX10-LABEL: s_andn2_i16_commute:
321 ; GFX10-NEXT: s_andn2_b32 s0, s2, s3
322 ; GFX10-NEXT: ; return to shader part epilog
323 %not.src1 = xor i16 %src1, -1
324 %and = and i16 %not.src1, %src0
328 define amdgpu_ps { i16, i16 } @s_andn2_i16_multi_use(i16 inreg %src0, i16 inreg %src1) {
329 ; GCN-LABEL: s_andn2_i16_multi_use:
331 ; GCN-NEXT: s_xor_b32 s1, s3, -1
332 ; GCN-NEXT: s_andn2_b32 s0, s2, s3
333 ; GCN-NEXT: ; return to shader part epilog
335 ; GFX10-LABEL: s_andn2_i16_multi_use:
337 ; GFX10-NEXT: s_andn2_b32 s0, s2, s3
338 ; GFX10-NEXT: s_xor_b32 s1, s3, -1
339 ; GFX10-NEXT: ; return to shader part epilog
340 %not.src1 = xor i16 %src1, -1
341 %and = and i16 %src0, %not.src1
342 %insert.0 = insertvalue { i16, i16 } undef, i16 %and, 0
343 %insert.1 = insertvalue { i16, i16 } %insert.0, i16 %not.src1, 1
344 ret { i16, i16 } %insert.1
347 define amdgpu_ps { i16, i16 } @s_andn2_i16_multi_foldable_use(i16 inreg %src0, i16 inreg %src1, i16 inreg %src2) {
348 ; GCN-LABEL: s_andn2_i16_multi_foldable_use:
350 ; GCN-NEXT: s_andn2_b32 s0, s2, s4
351 ; GCN-NEXT: s_andn2_b32 s1, s3, s4
352 ; GCN-NEXT: ; return to shader part epilog
354 ; GFX10-LABEL: s_andn2_i16_multi_foldable_use:
356 ; GFX10-NEXT: s_andn2_b32 s0, s2, s4
357 ; GFX10-NEXT: s_andn2_b32 s1, s3, s4
358 ; GFX10-NEXT: ; return to shader part epilog
359 %not.src2 = xor i16 %src2, -1
360 %and0 = and i16 %src0, %not.src2
361 %and1 = and i16 %src1, %not.src2
362 %insert.0 = insertvalue { i16, i16 } undef, i16 %and0, 0
363 %insert.1 = insertvalue { i16, i16 } %insert.0, i16 %and1, 1
364 ret { i16, i16 } %insert.1
367 define i16 @v_andn2_i16(i16 %src0, i16 %src1) {
368 ; GCN-LABEL: v_andn2_i16:
370 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
371 ; GCN-NEXT: v_xor_b32_e32 v1, -1, v1
372 ; GCN-NEXT: v_and_b32_e32 v0, v0, v1
373 ; GCN-NEXT: s_setpc_b64 s[30:31]
375 ; GFX10-LABEL: v_andn2_i16:
377 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
378 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
379 ; GFX10-NEXT: v_xor_b32_e32 v1, -1, v1
380 ; GFX10-NEXT: v_and_b32_e32 v0, v0, v1
381 ; GFX10-NEXT: s_setpc_b64 s[30:31]
382 %not.src1 = xor i16 %src1, -1
383 %and = and i16 %src0, %not.src1
387 define amdgpu_ps float @v_andn2_i16_sv(i16 inreg %src0, i16 %src1) {
388 ; GCN-LABEL: v_andn2_i16_sv:
390 ; GCN-NEXT: v_xor_b32_e32 v0, -1, v0
391 ; GCN-NEXT: v_and_b32_e32 v0, s2, v0
392 ; GCN-NEXT: v_bfe_u32 v0, v0, 0, 16
393 ; GCN-NEXT: ; return to shader part epilog
395 ; GFX10-LABEL: v_andn2_i16_sv:
397 ; GFX10-NEXT: v_xor_b32_e32 v0, -1, v0
398 ; GFX10-NEXT: v_and_b32_e32 v0, s2, v0
399 ; GFX10-NEXT: v_bfe_u32 v0, v0, 0, 16
400 ; GFX10-NEXT: ; return to shader part epilog
401 %not.src1 = xor i16 %src1, -1
402 %and = and i16 %src0, %not.src1
403 %zext = zext i16 %and to i32
404 %cast.zext = bitcast i32 %zext to float
408 define amdgpu_ps float @v_andn2_i16_vs(i16 %src0, i16 inreg %src1) {
409 ; GCN-LABEL: v_andn2_i16_vs:
411 ; GCN-NEXT: s_xor_b32 s0, s2, -1
412 ; GCN-NEXT: v_and_b32_e32 v0, s0, v0
413 ; GCN-NEXT: v_bfe_u32 v0, v0, 0, 16
414 ; GCN-NEXT: ; return to shader part epilog
416 ; GFX10-LABEL: v_andn2_i16_vs:
418 ; GFX10-NEXT: s_xor_b32 s0, s2, -1
419 ; GFX10-NEXT: v_and_b32_e32 v0, s0, v0
420 ; GFX10-NEXT: v_bfe_u32 v0, v0, 0, 16
421 ; GFX10-NEXT: ; return to shader part epilog
422 %not.src1 = xor i16 %src1, -1
423 %and = and i16 %src0, %not.src1
424 %zext = zext i16 %and to i32
425 %cast.zext = bitcast i32 %zext to float
429 define amdgpu_ps i32 @s_andn2_v2i16(<2 x i16> inreg %src0, <2 x i16> inreg %src1) {
430 ; GFX6-LABEL: s_andn2_v2i16:
432 ; GFX6-NEXT: s_mov_b32 s1, 0xffff
433 ; GFX6-NEXT: s_lshl_b32 s0, s3, 16
434 ; GFX6-NEXT: s_and_b32 s2, s2, s1
435 ; GFX6-NEXT: s_or_b32 s0, s0, s2
436 ; GFX6-NEXT: s_lshl_b32 s2, s5, 16
437 ; GFX6-NEXT: s_and_b32 s1, s4, s1
438 ; GFX6-NEXT: s_or_b32 s1, s2, s1
439 ; GFX6-NEXT: s_xor_b32 s1, s1, -1
440 ; GFX6-NEXT: s_and_b32 s0, s0, s1
441 ; GFX6-NEXT: ; return to shader part epilog
443 ; GFX9-LABEL: s_andn2_v2i16:
445 ; GFX9-NEXT: s_andn2_b32 s0, s2, s3
446 ; GFX9-NEXT: ; return to shader part epilog
448 ; GFX10-LABEL: s_andn2_v2i16:
450 ; GFX10-NEXT: s_andn2_b32 s0, s2, s3
451 ; GFX10-NEXT: ; return to shader part epilog
452 %not.src1 = xor <2 x i16> %src1, <i16 -1, i16 -1>
453 %and = and <2 x i16> %src0, %not.src1
454 %cast = bitcast <2 x i16> %and to i32
458 define amdgpu_ps i32 @s_andn2_v2i16_commute(<2 x i16> inreg %src0, <2 x i16> inreg %src1) {
459 ; GFX6-LABEL: s_andn2_v2i16_commute:
461 ; GFX6-NEXT: s_mov_b32 s1, 0xffff
462 ; GFX6-NEXT: s_lshl_b32 s0, s3, 16
463 ; GFX6-NEXT: s_and_b32 s2, s2, s1
464 ; GFX6-NEXT: s_or_b32 s0, s0, s2
465 ; GFX6-NEXT: s_lshl_b32 s2, s5, 16
466 ; GFX6-NEXT: s_and_b32 s1, s4, s1
467 ; GFX6-NEXT: s_or_b32 s1, s2, s1
468 ; GFX6-NEXT: s_xor_b32 s1, s1, -1
469 ; GFX6-NEXT: s_and_b32 s0, s1, s0
470 ; GFX6-NEXT: ; return to shader part epilog
472 ; GFX9-LABEL: s_andn2_v2i16_commute:
474 ; GFX9-NEXT: s_andn2_b32 s0, s2, s3
475 ; GFX9-NEXT: ; return to shader part epilog
477 ; GFX10-LABEL: s_andn2_v2i16_commute:
479 ; GFX10-NEXT: s_andn2_b32 s0, s2, s3
480 ; GFX10-NEXT: ; return to shader part epilog
481 %not.src1 = xor <2 x i16> %src1, <i16 -1, i16 -1>
482 %and = and <2 x i16> %not.src1, %src0
483 %cast = bitcast <2 x i16> %and to i32
487 define amdgpu_ps { i32, i32 } @s_andn2_v2i16_multi_use(<2 x i16> inreg %src0, <2 x i16> inreg %src1) {
488 ; GFX6-LABEL: s_andn2_v2i16_multi_use:
490 ; GFX6-NEXT: s_mov_b32 s1, 0xffff
491 ; GFX6-NEXT: s_lshl_b32 s0, s3, 16
492 ; GFX6-NEXT: s_and_b32 s2, s2, s1
493 ; GFX6-NEXT: s_or_b32 s0, s0, s2
494 ; GFX6-NEXT: s_lshl_b32 s2, s5, 16
495 ; GFX6-NEXT: s_and_b32 s1, s4, s1
496 ; GFX6-NEXT: s_or_b32 s1, s2, s1
497 ; GFX6-NEXT: s_xor_b32 s1, s1, -1
498 ; GFX6-NEXT: s_and_b32 s0, s0, s1
499 ; GFX6-NEXT: ; return to shader part epilog
501 ; GFX9-LABEL: s_andn2_v2i16_multi_use:
503 ; GFX9-NEXT: s_xor_b32 s1, s3, -1
504 ; GFX9-NEXT: s_andn2_b32 s0, s2, s3
505 ; GFX9-NEXT: ; return to shader part epilog
507 ; GFX10-LABEL: s_andn2_v2i16_multi_use:
509 ; GFX10-NEXT: s_andn2_b32 s0, s2, s3
510 ; GFX10-NEXT: s_xor_b32 s1, s3, -1
511 ; GFX10-NEXT: ; return to shader part epilog
512 %not.src1 = xor <2 x i16> %src1, <i16 -1, i16 -1>
513 %and = and <2 x i16> %src0, %not.src1
515 %cast.0 = bitcast <2 x i16> %and to i32
516 %cast.1 = bitcast <2 x i16> %not.src1 to i32
517 %insert.0 = insertvalue { i32, i32 } undef, i32 %cast.0, 0
518 %insert.1 = insertvalue { i32, i32 } %insert.0, i32 %cast.1, 1
519 ret { i32, i32 } %insert.1
522 define amdgpu_ps { i32, i32 } @s_andn2_v2i16_multi_foldable_use(<2 x i16> inreg %src0, <2 x i16> inreg %src1, <2 x i16> inreg %src2) {
523 ; GFX6-LABEL: s_andn2_v2i16_multi_foldable_use:
525 ; GFX6-NEXT: s_mov_b32 s1, 0xffff
526 ; GFX6-NEXT: s_lshl_b32 s0, s3, 16
527 ; GFX6-NEXT: s_and_b32 s2, s2, s1
528 ; GFX6-NEXT: s_or_b32 s0, s0, s2
529 ; GFX6-NEXT: s_lshl_b32 s2, s5, 16
530 ; GFX6-NEXT: s_and_b32 s3, s4, s1
531 ; GFX6-NEXT: s_or_b32 s2, s2, s3
532 ; GFX6-NEXT: s_lshl_b32 s3, s7, 16
533 ; GFX6-NEXT: s_and_b32 s1, s6, s1
534 ; GFX6-NEXT: s_or_b32 s1, s3, s1
535 ; GFX6-NEXT: s_xor_b32 s1, s1, -1
536 ; GFX6-NEXT: s_and_b32 s0, s0, s1
537 ; GFX6-NEXT: s_and_b32 s1, s2, s1
538 ; GFX6-NEXT: ; return to shader part epilog
540 ; GFX9-LABEL: s_andn2_v2i16_multi_foldable_use:
542 ; GFX9-NEXT: s_andn2_b32 s0, s2, s4
543 ; GFX9-NEXT: s_andn2_b32 s1, s3, s4
544 ; GFX9-NEXT: ; return to shader part epilog
546 ; GFX10-LABEL: s_andn2_v2i16_multi_foldable_use:
548 ; GFX10-NEXT: s_andn2_b32 s0, s2, s4
549 ; GFX10-NEXT: s_andn2_b32 s1, s3, s4
550 ; GFX10-NEXT: ; return to shader part epilog
551 %not.src2 = xor <2 x i16> %src2, <i16 -1, i16 -1>
552 %and0 = and <2 x i16> %src0, %not.src2
553 %and1 = and <2 x i16> %src1, %not.src2
555 %cast.0 = bitcast <2 x i16> %and0 to i32
556 %cast.1 = bitcast <2 x i16> %and1 to i32
557 %insert.0 = insertvalue { i32, i32 } undef, i32 %cast.0, 0
558 %insert.1 = insertvalue { i32, i32 } %insert.0, i32 %cast.1, 1
559 ret { i32, i32 } %insert.1
562 define <2 x i16> @v_andn2_v2i16(<2 x i16> %src0, <2 x i16> %src1) {
563 ; GFX6-LABEL: v_andn2_v2i16:
565 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
566 ; GFX6-NEXT: v_mov_b32_e32 v4, 0xffff
567 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
568 ; GFX6-NEXT: v_and_b32_e32 v0, v0, v4
569 ; GFX6-NEXT: v_or_b32_e32 v0, v1, v0
570 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v3
571 ; GFX6-NEXT: v_and_b32_e32 v2, v2, v4
572 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v2
573 ; GFX6-NEXT: v_xor_b32_e32 v1, -1, v1
574 ; GFX6-NEXT: v_and_b32_e32 v0, v0, v1
575 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0
576 ; GFX6-NEXT: s_setpc_b64 s[30:31]
578 ; GFX9-LABEL: v_andn2_v2i16:
580 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
581 ; GFX9-NEXT: v_xor_b32_e32 v1, -1, v1
582 ; GFX9-NEXT: v_and_b32_e32 v0, v0, v1
583 ; GFX9-NEXT: s_setpc_b64 s[30:31]
585 ; GFX10-LABEL: v_andn2_v2i16:
587 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
588 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
589 ; GFX10-NEXT: v_xor_b32_e32 v1, -1, v1
590 ; GFX10-NEXT: v_and_b32_e32 v0, v0, v1
591 ; GFX10-NEXT: s_setpc_b64 s[30:31]
592 %not.src1 = xor <2 x i16> %src1, <i16 -1, i16 -1>
593 %and = and <2 x i16> %src0, %not.src1
598 ; define amdgpu_ps i48 @s_andn2_v3i16(<3 x i16> inreg %src0, <3 x i16> inreg %src1) {
599 ; %not.src1 = xor <3 x i16> %src1, <i16 -1, i16 -1, i16 -1>
600 ; %and = and <3 x i16> %src0, %not.src1
601 ; %cast = bitcast <3 x i16> %and to i48
605 ; define amdgpu_ps i48 @s_andn2_v3i16_commute(<3 x i16> inreg %src0, <3 x i16> inreg %src1) {
606 ; %not.src1 = xor <3 x i16> %src1, <i16 -1, i16 -1, i16 -1>
607 ; %and = and <3 x i16> %not.src1, %src0
608 ; %cast = bitcast <3 x i16> %and to i48
612 ; define amdgpu_ps { i48, i48 } @s_andn2_v3i16_multi_use(<3 x i16> inreg %src0, <3 x i16> inreg %src1) {
613 ; %not.src1 = xor <3 x i16> %src1, <i16 -1, i16 -1, i16 -1>
614 ; %and = and <3 x i16> %src0, %not.src1
616 ; %cast.0 = bitcast <3 x i16> %and to i48
617 ; %cast.1 = bitcast <3 x i16> %not.src1 to i48
618 ; %insert.0 = insertvalue { i48, i48 } undef, i48 %cast.0, 0
619 ; %insert.1 = insertvalue { i48, i48 } %insert.0, i48 %cast.1, 1
620 ; ret { i48, i48 } %insert.1
623 ; define <3 x i16> @v_andn2_v3i16(<3 x i16> %src0, <3 x i16> %src1) {
624 ; %not.src1 = xor <3 x i16> %src1, <i16 -1, i16 -1, i16 -1>
625 ; %and = and <3 x i16> %src0, %not.src1
629 define amdgpu_ps i64 @s_andn2_v4i16(<4 x i16> inreg %src0, <4 x i16> inreg %src1) {
630 ; GFX6-LABEL: s_andn2_v4i16:
632 ; GFX6-NEXT: s_lshl_b32 s0, s3, 16
633 ; GFX6-NEXT: s_mov_b32 s3, 0xffff
634 ; GFX6-NEXT: s_and_b32 s1, s2, s3
635 ; GFX6-NEXT: s_or_b32 s0, s0, s1
636 ; GFX6-NEXT: s_lshl_b32 s1, s5, 16
637 ; GFX6-NEXT: s_and_b32 s2, s4, s3
638 ; GFX6-NEXT: s_or_b32 s1, s1, s2
639 ; GFX6-NEXT: s_lshl_b32 s2, s7, 16
640 ; GFX6-NEXT: s_and_b32 s4, s6, s3
641 ; GFX6-NEXT: s_or_b32 s2, s2, s4
642 ; GFX6-NEXT: s_lshl_b32 s4, s9, 16
643 ; GFX6-NEXT: s_and_b32 s3, s8, s3
644 ; GFX6-NEXT: s_or_b32 s3, s4, s3
645 ; GFX6-NEXT: s_mov_b32 s4, -1
646 ; GFX6-NEXT: s_mov_b32 s5, s4
647 ; GFX6-NEXT: s_xor_b64 s[2:3], s[2:3], s[4:5]
648 ; GFX6-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3]
649 ; GFX6-NEXT: ; return to shader part epilog
651 ; GFX9-LABEL: s_andn2_v4i16:
653 ; GFX9-NEXT: s_mov_b32 s0, -1
654 ; GFX9-NEXT: s_mov_b32 s1, s0
655 ; GFX9-NEXT: s_xor_b64 s[0:1], s[4:5], s[0:1]
656 ; GFX9-NEXT: s_and_b64 s[0:1], s[2:3], s[0:1]
657 ; GFX9-NEXT: ; return to shader part epilog
659 ; GFX10-LABEL: s_andn2_v4i16:
661 ; GFX10-NEXT: s_mov_b32 s0, -1
662 ; GFX10-NEXT: s_mov_b32 s1, s0
663 ; GFX10-NEXT: s_xor_b64 s[0:1], s[4:5], s[0:1]
664 ; GFX10-NEXT: s_and_b64 s[0:1], s[2:3], s[0:1]
665 ; GFX10-NEXT: ; return to shader part epilog
666 %not.src1 = xor <4 x i16> %src1, <i16 -1, i16 -1, i16 -1, i16 -1>
667 %and = and <4 x i16> %src0, %not.src1
668 %cast = bitcast <4 x i16> %and to i64
672 define amdgpu_ps i64 @s_andn2_v4i16_commute(<4 x i16> inreg %src0, <4 x i16> inreg %src1) {
673 ; GFX6-LABEL: s_andn2_v4i16_commute:
675 ; GFX6-NEXT: s_lshl_b32 s0, s3, 16
676 ; GFX6-NEXT: s_mov_b32 s3, 0xffff
677 ; GFX6-NEXT: s_and_b32 s1, s2, s3
678 ; GFX6-NEXT: s_or_b32 s0, s0, s1
679 ; GFX6-NEXT: s_lshl_b32 s1, s5, 16
680 ; GFX6-NEXT: s_and_b32 s2, s4, s3
681 ; GFX6-NEXT: s_or_b32 s1, s1, s2
682 ; GFX6-NEXT: s_lshl_b32 s2, s7, 16
683 ; GFX6-NEXT: s_and_b32 s4, s6, s3
684 ; GFX6-NEXT: s_or_b32 s2, s2, s4
685 ; GFX6-NEXT: s_lshl_b32 s4, s9, 16
686 ; GFX6-NEXT: s_and_b32 s3, s8, s3
687 ; GFX6-NEXT: s_or_b32 s3, s4, s3
688 ; GFX6-NEXT: s_mov_b32 s4, -1
689 ; GFX6-NEXT: s_mov_b32 s5, s4
690 ; GFX6-NEXT: s_xor_b64 s[2:3], s[2:3], s[4:5]
691 ; GFX6-NEXT: s_and_b64 s[0:1], s[2:3], s[0:1]
692 ; GFX6-NEXT: ; return to shader part epilog
694 ; GFX9-LABEL: s_andn2_v4i16_commute:
696 ; GFX9-NEXT: s_mov_b32 s0, -1
697 ; GFX9-NEXT: s_mov_b32 s1, s0
698 ; GFX9-NEXT: s_xor_b64 s[0:1], s[4:5], s[0:1]
699 ; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3]
700 ; GFX9-NEXT: ; return to shader part epilog
702 ; GFX10-LABEL: s_andn2_v4i16_commute:
704 ; GFX10-NEXT: s_mov_b32 s0, -1
705 ; GFX10-NEXT: s_mov_b32 s1, s0
706 ; GFX10-NEXT: s_xor_b64 s[0:1], s[4:5], s[0:1]
707 ; GFX10-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3]
708 ; GFX10-NEXT: ; return to shader part epilog
709 %not.src1 = xor <4 x i16> %src1, <i16 -1, i16 -1, i16 -1, i16 -1>
710 %and = and <4 x i16> %not.src1, %src0
711 %cast = bitcast <4 x i16> %and to i64
715 define amdgpu_ps { i64, i64 } @s_andn2_v4i16_multi_use(<4 x i16> inreg %src0, <4 x i16> inreg %src1) {
716 ; GFX6-LABEL: s_andn2_v4i16_multi_use:
718 ; GFX6-NEXT: s_lshl_b32 s0, s3, 16
719 ; GFX6-NEXT: s_mov_b32 s3, 0xffff
720 ; GFX6-NEXT: s_and_b32 s1, s2, s3
721 ; GFX6-NEXT: s_or_b32 s0, s0, s1
722 ; GFX6-NEXT: s_lshl_b32 s1, s5, 16
723 ; GFX6-NEXT: s_and_b32 s2, s4, s3
724 ; GFX6-NEXT: s_or_b32 s1, s1, s2
725 ; GFX6-NEXT: s_lshl_b32 s2, s7, 16
726 ; GFX6-NEXT: s_and_b32 s4, s6, s3
727 ; GFX6-NEXT: s_or_b32 s2, s2, s4
728 ; GFX6-NEXT: s_lshl_b32 s4, s9, 16
729 ; GFX6-NEXT: s_and_b32 s3, s8, s3
730 ; GFX6-NEXT: s_or_b32 s3, s4, s3
731 ; GFX6-NEXT: s_mov_b32 s4, -1
732 ; GFX6-NEXT: s_mov_b32 s5, s4
733 ; GFX6-NEXT: s_xor_b64 s[2:3], s[2:3], s[4:5]
734 ; GFX6-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3]
735 ; GFX6-NEXT: ; return to shader part epilog
737 ; GFX9-LABEL: s_andn2_v4i16_multi_use:
739 ; GFX9-NEXT: s_mov_b32 s0, -1
740 ; GFX9-NEXT: s_mov_b32 s1, s0
741 ; GFX9-NEXT: s_xor_b64 s[4:5], s[4:5], s[0:1]
742 ; GFX9-NEXT: s_and_b64 s[0:1], s[2:3], s[4:5]
743 ; GFX9-NEXT: s_mov_b32 s2, s4
744 ; GFX9-NEXT: s_mov_b32 s3, s5
745 ; GFX9-NEXT: ; return to shader part epilog
747 ; GFX10-LABEL: s_andn2_v4i16_multi_use:
749 ; GFX10-NEXT: s_mov_b32 s0, -1
750 ; GFX10-NEXT: s_mov_b32 s1, s0
751 ; GFX10-NEXT: s_xor_b64 s[4:5], s[4:5], s[0:1]
752 ; GFX10-NEXT: s_and_b64 s[0:1], s[2:3], s[4:5]
753 ; GFX10-NEXT: s_mov_b32 s2, s4
754 ; GFX10-NEXT: s_mov_b32 s3, s5
755 ; GFX10-NEXT: ; return to shader part epilog
756 %not.src1 = xor <4 x i16> %src1, <i16 -1, i16 -1, i16 -1, i16 -1>
757 %and = and <4 x i16> %src0, %not.src1
759 %cast.0 = bitcast <4 x i16> %and to i64
760 %cast.1 = bitcast <4 x i16> %not.src1 to i64
761 %insert.0 = insertvalue { i64, i64 } undef, i64 %cast.0, 0
762 %insert.1 = insertvalue { i64, i64 } %insert.0, i64 %cast.1, 1
763 ret { i64, i64 } %insert.1
766 define amdgpu_ps { i64, i64 } @s_andn2_v4i16_multi_foldable_use(<4 x i16> inreg %src0, <4 x i16> inreg %src1, <4 x i16> inreg %src2) {
767 ; GFX6-LABEL: s_andn2_v4i16_multi_foldable_use:
769 ; GFX6-NEXT: s_mov_b32 s14, 0xffff
770 ; GFX6-NEXT: s_lshl_b32 s0, s3, 16
771 ; GFX6-NEXT: s_and_b32 s1, s2, s14
772 ; GFX6-NEXT: s_or_b32 s0, s0, s1
773 ; GFX6-NEXT: s_lshl_b32 s1, s5, 16
774 ; GFX6-NEXT: s_and_b32 s2, s4, s14
775 ; GFX6-NEXT: s_or_b32 s1, s1, s2
776 ; GFX6-NEXT: s_lshl_b32 s2, s7, 16
777 ; GFX6-NEXT: s_and_b32 s3, s6, s14
778 ; GFX6-NEXT: s_or_b32 s2, s2, s3
779 ; GFX6-NEXT: s_lshl_b32 s3, s9, 16
780 ; GFX6-NEXT: s_and_b32 s4, s8, s14
781 ; GFX6-NEXT: s_or_b32 s3, s3, s4
782 ; GFX6-NEXT: s_lshl_b32 s4, s11, 16
783 ; GFX6-NEXT: s_and_b32 s5, s10, s14
784 ; GFX6-NEXT: s_or_b32 s4, s4, s5
785 ; GFX6-NEXT: s_lshl_b32 s5, s13, 16
786 ; GFX6-NEXT: s_and_b32 s6, s12, s14
787 ; GFX6-NEXT: s_or_b32 s5, s5, s6
788 ; GFX6-NEXT: s_mov_b32 s6, -1
789 ; GFX6-NEXT: s_mov_b32 s7, s6
790 ; GFX6-NEXT: s_xor_b64 s[4:5], s[4:5], s[6:7]
791 ; GFX6-NEXT: s_and_b64 s[0:1], s[0:1], s[4:5]
792 ; GFX6-NEXT: s_and_b64 s[2:3], s[2:3], s[4:5]
793 ; GFX6-NEXT: ; return to shader part epilog
795 ; GFX9-LABEL: s_andn2_v4i16_multi_foldable_use:
797 ; GFX9-NEXT: s_mov_b32 s0, -1
798 ; GFX9-NEXT: s_mov_b32 s1, s0
799 ; GFX9-NEXT: s_xor_b64 s[6:7], s[6:7], s[0:1]
800 ; GFX9-NEXT: s_and_b64 s[0:1], s[2:3], s[6:7]
801 ; GFX9-NEXT: s_and_b64 s[2:3], s[4:5], s[6:7]
802 ; GFX9-NEXT: ; return to shader part epilog
804 ; GFX10-LABEL: s_andn2_v4i16_multi_foldable_use:
806 ; GFX10-NEXT: s_mov_b32 s0, -1
807 ; GFX10-NEXT: s_mov_b32 s1, s0
808 ; GFX10-NEXT: s_xor_b64 s[6:7], s[6:7], s[0:1]
809 ; GFX10-NEXT: s_and_b64 s[0:1], s[2:3], s[6:7]
810 ; GFX10-NEXT: s_and_b64 s[2:3], s[4:5], s[6:7]
811 ; GFX10-NEXT: ; return to shader part epilog
812 %not.src2 = xor <4 x i16> %src2, <i16 -1, i16 -1, i16 -1, i16 -1>
813 %and0 = and <4 x i16> %src0, %not.src2
814 %and1 = and <4 x i16> %src1, %not.src2
816 %cast.0 = bitcast <4 x i16> %and0 to i64
817 %cast.1 = bitcast <4 x i16> %and1 to i64
818 %insert.0 = insertvalue { i64, i64 } undef, i64 %cast.0, 0
819 %insert.1 = insertvalue { i64, i64 } %insert.0, i64 %cast.1, 1
820 ret { i64, i64 } %insert.1
823 define <4 x i16> @v_andn2_v4i16(<4 x i16> %src0, <4 x i16> %src1) {
824 ; GFX6-LABEL: v_andn2_v4i16:
826 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
827 ; GFX6-NEXT: v_mov_b32_e32 v8, 0xffff
828 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
829 ; GFX6-NEXT: v_and_b32_e32 v0, v0, v8
830 ; GFX6-NEXT: v_or_b32_e32 v0, v1, v0
831 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v3
832 ; GFX6-NEXT: v_and_b32_e32 v2, v2, v8
833 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v2
834 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v5
835 ; GFX6-NEXT: v_and_b32_e32 v3, v4, v8
836 ; GFX6-NEXT: v_or_b32_e32 v2, v2, v3
837 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v7
838 ; GFX6-NEXT: v_and_b32_e32 v4, v6, v8
839 ; GFX6-NEXT: v_or_b32_e32 v3, v3, v4
840 ; GFX6-NEXT: v_xor_b32_e32 v2, -1, v2
841 ; GFX6-NEXT: v_xor_b32_e32 v3, -1, v3
842 ; GFX6-NEXT: v_and_b32_e32 v0, v0, v2
843 ; GFX6-NEXT: v_and_b32_e32 v2, v1, v3
844 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0
845 ; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v2
846 ; GFX6-NEXT: s_setpc_b64 s[30:31]
848 ; GFX9-LABEL: v_andn2_v4i16:
850 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
851 ; GFX9-NEXT: v_xor_b32_e32 v2, -1, v2
852 ; GFX9-NEXT: v_xor_b32_e32 v3, -1, v3
853 ; GFX9-NEXT: v_and_b32_e32 v0, v0, v2
854 ; GFX9-NEXT: v_and_b32_e32 v1, v1, v3
855 ; GFX9-NEXT: s_setpc_b64 s[30:31]
857 ; GFX10-LABEL: v_andn2_v4i16:
859 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
860 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
861 ; GFX10-NEXT: v_xor_b32_e32 v2, -1, v2
862 ; GFX10-NEXT: v_xor_b32_e32 v3, -1, v3
863 ; GFX10-NEXT: v_and_b32_e32 v0, v0, v2
864 ; GFX10-NEXT: v_and_b32_e32 v1, v1, v3
865 ; GFX10-NEXT: s_setpc_b64 s[30:31]
866 %not.src1 = xor <4 x i16> %src1, <i16 -1, i16 -1, i16 -1, i16 -1>
867 %and = and <4 x i16> %src0, %not.src1