; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX

; fold (mul x, 0) -> 0
define <4 x i32> @combine_vec_mul_zero(<4 x i32> %x) {
; SSE-LABEL: combine_vec_mul_zero:
; SSE:       # %bb.0:
; SSE-NEXT:    xorps %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_mul_zero:
; AVX:       # %bb.0:
; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = mul <4 x i32> %x, zeroinitializer
  ret <4 x i32> %1
}

; fold (mul x, 1) -> x
define <4 x i32> @combine_vec_mul_one(<4 x i32> %x) {
; SSE-LABEL: combine_vec_mul_one:
; SSE:       # %bb.0:
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_mul_one:
; AVX:       # %bb.0:
; AVX-NEXT:    retq
  %1 = mul <4 x i32> %x, <i32 1, i32 1, i32 1, i32 1>
  ret <4 x i32> %1
}

; fold (mul x, -1) -> 0-x
define <4 x i32> @combine_vec_mul_negone(<4 x i32> %x) {
; SSE-LABEL: combine_vec_mul_negone:
; SSE:       # %bb.0:
; SSE-NEXT:    pxor %xmm1, %xmm1
; SSE-NEXT:    psubd %xmm0, %xmm1
; SSE-NEXT:    movdqa %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_mul_negone:
; AVX:       # %bb.0:
; AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX-NEXT:    vpsubd %xmm0, %xmm1, %xmm0
; AVX-NEXT:    retq
  %1 = mul <4 x i32> %x, <i32 -1, i32 -1, i32 -1, i32 -1>
  ret <4 x i32> %1
}

; fold (mul x, (1 << c)) -> x << c
define <4 x i32> @combine_vec_mul_pow2a(<4 x i32> %x) {
; SSE-LABEL: combine_vec_mul_pow2a:
; SSE:       # %bb.0:
; SSE-NEXT:    paddd %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_mul_pow2a:
; AVX:       # %bb.0:
; AVX-NEXT:    vpaddd %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = mul <4 x i32> %x, <i32 2, i32 2, i32 2, i32 2>
  ret <4 x i32> %1
}

define <4 x i32> @combine_vec_mul_pow2b(<4 x i32> %x) {
; SSE-LABEL: combine_vec_mul_pow2b:
; SSE:       # %bb.0:
; SSE-NEXT:    pmulld {{.*}}(%rip), %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_mul_pow2b:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsllvd {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = mul <4 x i32> %x, <i32 1, i32 2, i32 4, i32 16>
  ret <4 x i32> %1
}

define <4 x i64> @combine_vec_mul_pow2c(<4 x i64> %x) {
; SSE-LABEL: combine_vec_mul_pow2c:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa %xmm0, %xmm2
; SSE-NEXT:    psllq $1, %xmm2
; SSE-NEXT:    pblendw {{.*#+}} xmm2 = xmm0[0,1,2,3],xmm2[4,5,6,7]
; SSE-NEXT:    movdqa %xmm1, %xmm0
; SSE-NEXT:    psllq $4, %xmm0
; SSE-NEXT:    psllq $2, %xmm1
; SSE-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm0[4,5,6,7]
; SSE-NEXT:    movdqa %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_mul_pow2c:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsllvq {{.*}}(%rip), %ymm0, %ymm0
; AVX-NEXT:    retq
  %1 = mul <4 x i64> %x, <i64 1, i64 2, i64 4, i64 16>
  ret <4 x i64> %1
}

; fold (mul x, -(1 << c)) -> -(x << c) or (-x) << c
define <4 x i32> @combine_vec_mul_negpow2a(<4 x i32> %x) {
; SSE-LABEL: combine_vec_mul_negpow2a:
; SSE:       # %bb.0:
; SSE-NEXT:    paddd %xmm0, %xmm0
; SSE-NEXT:    pxor %xmm1, %xmm1
; SSE-NEXT:    psubd %xmm0, %xmm1
; SSE-NEXT:    movdqa %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_mul_negpow2a:
; AVX:       # %bb.0:
; AVX-NEXT:    vpaddd %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX-NEXT:    vpsubd %xmm0, %xmm1, %xmm0
; AVX-NEXT:    retq
  %1 = mul <4 x i32> %x, <i32 -2, i32 -2, i32 -2, i32 -2>
  ret <4 x i32> %1
}

define <4 x i32> @combine_vec_mul_negpow2b(<4 x i32> %x) {
; SSE-LABEL: combine_vec_mul_negpow2b:
; SSE:       # %bb.0:
; SSE-NEXT:    pmulld {{.*}}(%rip), %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_mul_negpow2b:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmulld {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = mul <4 x i32> %x, <i32 -1, i32 -2, i32 -4, i32 -16>
  ret <4 x i32> %1
}

define <4 x i64> @combine_vec_mul_negpow2c(<4 x i64> %x) {
; SSE-LABEL: combine_vec_mul_negpow2c:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [4294967295,4294967295]
; SSE-NEXT:    movdqa %xmm0, %xmm3
; SSE-NEXT:    pmuludq %xmm2, %xmm3
; SSE-NEXT:    movdqa %xmm0, %xmm4
; SSE-NEXT:    psrlq $32, %xmm4
; SSE-NEXT:    movdqa {{.*#+}} xmm5 = [18446744073709551615,18446744073709551614]
; SSE-NEXT:    pmuludq %xmm5, %xmm4
; SSE-NEXT:    paddq %xmm3, %xmm4
; SSE-NEXT:    psllq $32, %xmm4
; SSE-NEXT:    pmuludq %xmm5, %xmm0
; SSE-NEXT:    paddq %xmm4, %xmm0
; SSE-NEXT:    pmuludq %xmm1, %xmm2
; SSE-NEXT:    movdqa %xmm1, %xmm3
; SSE-NEXT:    psrlq $32, %xmm3
; SSE-NEXT:    movdqa {{.*#+}} xmm4 = [18446744073709551612,18446744073709551600]
; SSE-NEXT:    pmuludq %xmm4, %xmm3
; SSE-NEXT:    paddq %xmm2, %xmm3
; SSE-NEXT:    psllq $32, %xmm3
; SSE-NEXT:    pmuludq %xmm4, %xmm1
; SSE-NEXT:    paddq %xmm3, %xmm1
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_mul_negpow2c:
; AVX:       # %bb.0:
; AVX-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [4294967295,4294967295,4294967295,4294967295]
; AVX-NEXT:    vpmuludq %ymm1, %ymm0, %ymm1
; AVX-NEXT:    vpsrlq $32, %ymm0, %ymm2
; AVX-NEXT:    vmovdqa {{.*#+}} ymm3 = [18446744073709551615,18446744073709551614,18446744073709551612,18446744073709551600]
; AVX-NEXT:    vpmuludq %ymm3, %ymm2, %ymm2
; AVX-NEXT:    vpaddq %ymm2, %ymm1, %ymm1
; AVX-NEXT:    vpsllq $32, %ymm1, %ymm1
; AVX-NEXT:    vpmuludq %ymm3, %ymm0, %ymm0
; AVX-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
; AVX-NEXT:    retq
  %1 = mul <4 x i64> %x, <i64 -1, i64 -2, i64 -4, i64 -16>
  ret <4 x i64> %1
}

; (mul (shl X, c1), c2) -> (mul X, c2 << c1)
define <4 x i32> @combine_vec_mul_shl_const(<4 x i32> %x) {
; SSE-LABEL: combine_vec_mul_shl_const:
; SSE:       # %bb.0:
; SSE-NEXT:    pmulld {{.*}}(%rip), %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_mul_shl_const:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmulld {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = shl <4 x i32> %x, <i32 1, i32 2, i32 8, i32 16>
  %2 = mul <4 x i32> %1, <i32 1, i32 3, i32 5, i32 7>
  ret <4 x i32> %2
}

; (mul (shl X, C), Y) -> (shl (mul X, Y), C) when the shift has one use.
define <4 x i32> @combine_vec_mul_shl_oneuse0(<4 x i32> %x, <4 x i32> %y) {
; SSE-LABEL: combine_vec_mul_shl_oneuse0:
; SSE:       # %bb.0:
; SSE-NEXT:    pmulld %xmm1, %xmm0
; SSE-NEXT:    pmulld {{.*}}(%rip), %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_mul_shl_oneuse0:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpsllvd {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = shl <4 x i32> %x, <i32 1, i32 2, i32 8, i32 16>
  %2 = mul <4 x i32> %1, %y
  ret <4 x i32> %2
}

define <4 x i32> @combine_vec_mul_shl_oneuse1(<4 x i32> %x, <4 x i32> %y) {
; SSE-LABEL: combine_vec_mul_shl_oneuse1:
; SSE:       # %bb.0:
; SSE-NEXT:    pmulld %xmm1, %xmm0
; SSE-NEXT:    pmulld {{.*}}(%rip), %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_mul_shl_oneuse1:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpsllvd {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = shl <4 x i32> %x, <i32 1, i32 2, i32 8, i32 16>
  %2 = mul <4 x i32> %y, %1
  ret <4 x i32> %2
}

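; In the multiuse tests below, the shift result has a second use (the add), so the
; (mul (shl X, C), Y) -> (shl (mul X, Y), C) fold above should not apply.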
define <4 x i32> @combine_vec_mul_shl_multiuse0(<4 x i32> %x, <4 x i32> %y) {
; SSE-LABEL: combine_vec_mul_shl_multiuse0:
; SSE:       # %bb.0:
; SSE-NEXT:    pmulld {{.*}}(%rip), %xmm0
; SSE-NEXT:    pmulld %xmm0, %xmm1
; SSE-NEXT:    paddd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_mul_shl_multiuse0:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsllvd {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    vpmulld %xmm1, %xmm0, %xmm1
; AVX-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = shl <4 x i32> %x, <i32 1, i32 2, i32 8, i32 16>
  %2 = mul <4 x i32> %1, %y
  %3 = add <4 x i32> %1, %2
  ret <4 x i32> %3
}

define <4 x i32> @combine_vec_mul_shl_multiuse1(<4 x i32> %x, <4 x i32> %y) {
; SSE-LABEL: combine_vec_mul_shl_multiuse1:
; SSE:       # %bb.0:
; SSE-NEXT:    pmulld {{.*}}(%rip), %xmm0
; SSE-NEXT:    pmulld %xmm0, %xmm1
; SSE-NEXT:    paddd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_mul_shl_multiuse1:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsllvd {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    vpmulld %xmm0, %xmm1, %xmm1
; AVX-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = shl <4 x i32> %x, <i32 1, i32 2, i32 8, i32 16>
  %2 = mul <4 x i32> %y, %1
  %3 = add <4 x i32> %1, %2
  ret <4 x i32> %3
}

; fold (mul (add x, c1), c2) -> (add (mul x, c2), c1*c2)

define <4 x i32> @combine_vec_mul_add(<4 x i32> %x) {
; SSE-LABEL: combine_vec_mul_add:
; SSE:       # %bb.0:
; SSE-NEXT:    pmulld {{.*}}(%rip), %xmm0
; SSE-NEXT:    paddd {{.*}}(%rip), %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_mul_add:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmulld {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    vpaddd {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = add <4 x i32> %x, <i32 1, i32 2, i32 8, i32 16>
  %2 = mul <4 x i32> %1, <i32 4, i32 6, i32 2, i32 0>
  ret <4 x i32> %2
}

; This would infinitely loop because DAGCombiner wants to turn this into a shift,
; but x86 lowering wants to avoid non-uniform vector shift amounts.

define <16 x i8> @PR35579(<16 x i8> %x) {
; SSE-LABEL: PR35579:
; SSE:       # %bb.0:
; SSE-NEXT:    pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; SSE-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE-NEXT:    pmullw {{.*}}(%rip), %xmm0
; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
; SSE-NEXT:    pand %xmm2, %xmm0
; SSE-NEXT:    pmullw {{.*}}(%rip), %xmm1
; SSE-NEXT:    pand %xmm2, %xmm1
; SSE-NEXT:    packuswb %xmm0, %xmm1
; SSE-NEXT:    movdqa %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: PR35579:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; AVX-NEXT:    vpmullw {{.*}}(%rip), %ymm0, %ymm0
; AVX-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
; AVX-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
  %r = mul <16 x i8> %x, <i8 0, i8 1, i8 2, i8 1, i8 4, i8 1, i8 2, i8 1, i8 8, i8 1, i8 2, i8 1, i8 4, i8 1, i8 2, i8 1>
  ret <16 x i8> %r
}

; OSS Fuzz: https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=15429
define <4 x i64> @fuzz15429(<4 x i64> %InVec) {
; SSE-LABEL: fuzz15429:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa %xmm1, %xmm2
; SSE-NEXT:    psllq $3, %xmm2
; SSE-NEXT:    psllq $2, %xmm1
; SSE-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7]
; SSE-NEXT:    paddq %xmm0, %xmm0
; SSE-NEXT:    movabsq $9223372036854775807, %rax # imm = 0x7FFFFFFFFFFFFFFF
; SSE-NEXT:    pinsrq $0, %rax, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: fuzz15429:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsllvq {{.*}}(%rip), %ymm0, %ymm0
; AVX-NEXT:    movabsq $9223372036854775807, %rax # imm = 0x7FFFFFFFFFFFFFFF
; AVX-NEXT:    vpinsrq $0, %rax, %xmm0, %xmm1
; AVX-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX-NEXT:    retq
  %mul = mul <4 x i64> %InVec, <i64 1, i64 2, i64 4, i64 8>
  %I = insertelement <4 x i64> %mul, i64 9223372036854775807, i64 0
  ret <4 x i64> %I
}