llvm/test/CodeGen/X86/combine-mul.ll

   1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
   2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE
   3 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX
   4
   5 ; fold (mul x, 0) -> 0
   6 define <4 x i32> @combine_vec_mul_zero(<4 x i32> %x) {
   7 ; SSE-LABEL: combine_vec_mul_zero:
   8 ; SSE:       # %bb.0:
   9 ; SSE-NEXT:    xorps %xmm0, %xmm0
  10 ; SSE-NEXT:    retq
  11 ;
  12 ; AVX-LABEL: combine_vec_mul_zero:
  13 ; AVX:       # %bb.0:
  14 ; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
  15 ; AVX-NEXT:    retq
       ; Multiply by the all-zero vector. Both runs expect only a register-zeroing
       ; xor (xorps/vxorps) followed by the return; no multiply is emitted.
  16   %1 = mul <4 x i32> %x, zeroinitializer
  17   ret <4 x i32> %1
  18 }
  19
  20 ; fold (mul x, 1) -> x
  21 define <4 x i32> @combine_vec_mul_one(<4 x i32> %x) {
  22 ; SSE-LABEL: combine_vec_mul_one:
  23 ; SSE:       # %bb.0:
  24 ; SSE-NEXT:    retq
  25 ;
  26 ; AVX-LABEL: combine_vec_mul_one:
  27 ; AVX:       # %bb.0:
  28 ; AVX-NEXT:    retq
       ; Multiply by a splat of 1. The expected code for both runs is a bare retq,
       ; i.e. %x is returned in place without any arithmetic.
  29   %1 = mul <4 x i32> %x, <i32 1, i32 1, i32 1, i32 1>
  30   ret <4 x i32> %1
  31 }
  32
  33 ; fold (mul x, -1) -> 0-x
  34 define <4 x i32> @combine_vec_mul_negone(<4 x i32> %x) {
  35 ; SSE-LABEL: combine_vec_mul_negone:
  36 ; SSE:       # %bb.0:
  37 ; SSE-NEXT:    pxor %xmm1, %xmm1
  38 ; SSE-NEXT:    psubd %xmm0, %xmm1
  39 ; SSE-NEXT:    movdqa %xmm1, %xmm0
  40 ; SSE-NEXT:    retq
  41 ;
  42 ; AVX-LABEL: combine_vec_mul_negone:
  43 ; AVX:       # %bb.0:
  44 ; AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
  45 ; AVX-NEXT:    vpsubd %xmm0, %xmm1, %xmm0
  46 ; AVX-NEXT:    retq
       ; Multiply by a splat of -1. The expected code zeroes a scratch register and
       ; subtracts %x from it (psubd/vpsubd) instead of multiplying.
  47   %1 = mul <4 x i32> %x, <i32 -1, i32 -1, i32 -1, i32 -1>
  48   ret <4 x i32> %1
  49 }
  50
  51 ; fold (mul x, (1 << c)) -> x << c
  52 define <4 x i32> @combine_vec_mul_pow2a(<4 x i32> %x) {
  53 ; SSE-LABEL: combine_vec_mul_pow2a:
  54 ; SSE:       # %bb.0:
  55 ; SSE-NEXT:    paddd %xmm0, %xmm0
  56 ; SSE-NEXT:    retq
  57 ;
  58 ; AVX-LABEL: combine_vec_mul_pow2a:
  59 ; AVX:       # %bb.0:
  60 ; AVX-NEXT:    vpaddd %xmm0, %xmm0, %xmm0
  61 ; AVX-NEXT:    retq
       ; Splat multiplier of 2. The expected code adds the input to itself
       ; (paddd/vpaddd) rather than emitting a multiply.
  62   %1 = mul <4 x i32> %x, <i32 2, i32 2, i32 2, i32 2>
  63   ret <4 x i32> %1
  64 }
  65
  66 define <4 x i32> @combine_vec_mul_pow2b(<4 x i32> %x) {
  67 ; SSE-LABEL: combine_vec_mul_pow2b:
  68 ; SSE:       # %bb.0:
  69 ; SSE-NEXT:    pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
  70 ; SSE-NEXT:    retq
  71 ;
  72 ; AVX-LABEL: combine_vec_mul_pow2b:
  73 ; AVX:       # %bb.0:
  74 ; AVX-NEXT:    vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
  75 ; AVX-NEXT:    retq
       ; Non-splat power-of-two multipliers <1,2,4,16>. The +sse4.1 run keeps a
       ; pmulld by a constant-pool operand; the +avx2 run expects a per-lane
       ; variable shift (vpsllvd) by a constant-pool operand instead.
  76   %1 = mul <4 x i32> %x, <i32 1, i32 2, i32 4, i32 16>
  77   ret <4 x i32> %1
  78 }
  79
  80 define <4 x i64> @combine_vec_mul_pow2c(<4 x i64> %x) {
  81 ; SSE-LABEL: combine_vec_mul_pow2c:
  82 ; SSE:       # %bb.0:
  83 ; SSE-NEXT:    movdqa %xmm0, %xmm2
  84 ; SSE-NEXT:    paddq %xmm0, %xmm2
  85 ; SSE-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
  86 ; SSE-NEXT:    movdqa %xmm1, %xmm2
  87 ; SSE-NEXT:    psllq $4, %xmm2
  88 ; SSE-NEXT:    psllq $2, %xmm1
  89 ; SSE-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7]
  90 ; SSE-NEXT:    retq
  91 ;
  92 ; AVX-LABEL: combine_vec_mul_pow2c:
  93 ; AVX:       # %bb.0:
  94 ; AVX-NEXT:    vpsllvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
  95 ; AVX-NEXT:    retq
       ; Multipliers <1,2,4,16> on <4 x i64>. The +sse4.1 run expects each 128-bit
       ; half to be built from an add or immediate shifts merged with pblendw (no
       ; multiply); the +avx2 run expects a single variable shift (vpsllvq).
  96   %1 = mul <4 x i64> %x, <i64 1, i64 2, i64 4, i64 16>
  97   ret <4 x i64> %1
  98 }
  99
 100 ; fold (mul x, -(1 << c)) -> -(x << c) or (-x) << c
 101 define <4 x i32> @combine_vec_mul_negpow2a(<4 x i32> %x) {
 102 ; SSE-LABEL: combine_vec_mul_negpow2a:
 103 ; SSE:       # %bb.0:
 104 ; SSE-NEXT:    paddd %xmm0, %xmm0
 105 ; SSE-NEXT:    pxor %xmm1, %xmm1
 106 ; SSE-NEXT:    psubd %xmm0, %xmm1
 107 ; SSE-NEXT:    movdqa %xmm1, %xmm0
 108 ; SSE-NEXT:    retq
 109 ;
 110 ; AVX-LABEL: combine_vec_mul_negpow2a:
 111 ; AVX:       # %bb.0:
 112 ; AVX-NEXT:    vpaddd %xmm0, %xmm0, %xmm0
 113 ; AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 114 ; AVX-NEXT:    vpsubd %xmm0, %xmm1, %xmm0
 115 ; AVX-NEXT:    retq
       ; Splat multiplier of -2. The expected code doubles %x with an add and then
       ; subtracts that result from zero; no multiply is emitted.
 116   %1 = mul <4 x i32> %x, <i32 -2, i32 -2, i32 -2, i32 -2>
 117   ret <4 x i32> %1
 118 }
 119
 120 define <4 x i32> @combine_vec_mul_negpow2b(<4 x i32> %x) {
 121 ; SSE-LABEL: combine_vec_mul_negpow2b:
 122 ; SSE:       # %bb.0:
 123 ; SSE-NEXT:    pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 124 ; SSE-NEXT:    retq
 125 ;
 126 ; AVX-LABEL: combine_vec_mul_negpow2b:
 127 ; AVX:       # %bb.0:
 128 ; AVX-NEXT:    vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
 129 ; AVX-NEXT:    retq
       ; Non-splat negated powers of two <-1,-2,-4,-16>. Both runs are expected to
       ; keep a real multiply (pmulld/vpmulld) by a constant-pool operand.
 130   %1 = mul <4 x i32> %x, <i32 -1, i32 -2, i32 -4, i32 -16>
 131   ret <4 x i32> %1
 132 }
 133
 134 define <4 x i64> @combine_vec_mul_negpow2c(<4 x i64> %x) {
 135 ; SSE-LABEL: combine_vec_mul_negpow2c:
 136 ; SSE:       # %bb.0:
 137 ; SSE-NEXT:    pmovsxbd {{.*#+}} xmm2 = [4294967295,0,4294967295,0]
 138 ; SSE-NEXT:    movdqa %xmm0, %xmm3
 139 ; SSE-NEXT:    pmuludq %xmm2, %xmm3
 140 ; SSE-NEXT:    movdqa %xmm0, %xmm4
 141 ; SSE-NEXT:    psrlq $32, %xmm4
 142 ; SSE-NEXT:    pmovsxbq {{.*#+}} xmm5 = [18446744073709551615,18446744073709551614]
 143 ; SSE-NEXT:    pmuludq %xmm5, %xmm4
 144 ; SSE-NEXT:    paddq %xmm3, %xmm4
 145 ; SSE-NEXT:    psllq $32, %xmm4
 146 ; SSE-NEXT:    pmuludq %xmm5, %xmm0
 147 ; SSE-NEXT:    paddq %xmm4, %xmm0
 148 ; SSE-NEXT:    pmuludq %xmm1, %xmm2
 149 ; SSE-NEXT:    movdqa %xmm1, %xmm3
 150 ; SSE-NEXT:    psrlq $32, %xmm3
 151 ; SSE-NEXT:    pmovsxbq {{.*#+}} xmm4 = [18446744073709551612,18446744073709551600]
 152 ; SSE-NEXT:    pmuludq %xmm4, %xmm3
 153 ; SSE-NEXT:    paddq %xmm2, %xmm3
 154 ; SSE-NEXT:    psllq $32, %xmm3
 155 ; SSE-NEXT:    pmuludq %xmm4, %xmm1
 156 ; SSE-NEXT:    paddq %xmm3, %xmm1
 157 ; SSE-NEXT:    retq
 158 ;
 159 ; AVX-LABEL: combine_vec_mul_negpow2c:
 160 ; AVX:       # %bb.0:
 161 ; AVX-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [4294967295,4294967295,4294967295,4294967295]
 162 ; AVX-NEXT:    vpmuludq %ymm1, %ymm0, %ymm1
 163 ; AVX-NEXT:    vpsrlq $32, %ymm0, %ymm2
 164 ; AVX-NEXT:    vpmovsxbq {{.*#+}} ymm3 = [18446744073709551615,18446744073709551614,18446744073709551612,18446744073709551600]
 165 ; AVX-NEXT:    vpmuludq %ymm3, %ymm2, %ymm2
 166 ; AVX-NEXT:    vpaddq %ymm2, %ymm1, %ymm1
 167 ; AVX-NEXT:    vpsllq $32, %ymm1, %ymm1
 168 ; AVX-NEXT:    vpmuludq %ymm3, %ymm0, %ymm0
 169 ; AVX-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
 170 ; AVX-NEXT:    retq
       ; Negated power-of-two multipliers <-1,-2,-4,-16> on <4 x i64>. Neither run
       ; is expected to use a shift-and-negate form: the expected output is a
       ; sequence of pmuludq/vpmuludq multiplies combined with 32-bit shifts
       ; (psrlq/psllq $32) and 64-bit adds.
 171   %1 = mul <4 x i64> %x, <i64 -1, i64 -2, i64 -4, i64 -16>
 172   ret <4 x i64> %1
 173 }
 174
 175 ; (mul (shl X, c1), c2) -> (mul X, c2 << c1)
 176 define <4 x i32> @combine_vec_mul_shl_const(<4 x i32> %x) {
 177 ; SSE-LABEL: combine_vec_mul_shl_const:
 178 ; SSE:       # %bb.0:
 179 ; SSE-NEXT:    pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 180 ; SSE-NEXT:    retq
 181 ;
 182 ; AVX-LABEL: combine_vec_mul_shl_const:
 183 ; AVX:       # %bb.0:
 184 ; AVX-NEXT:    vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
 185 ; AVX-NEXT:    retq
       ; Shift left by <1,2,8,16> followed by a multiply by <1,3,5,7>. The expected
       ; code is a single multiply by one constant-pool operand, with no separate
       ; shift instruction.
 186   %1 = shl <4 x i32> %x, <i32 1, i32 2, i32 8, i32 16>
 187   %2 = mul <4 x i32> %1, <i32 1, i32 3, i32 5, i32 7>
 188   ret <4 x i32> %2
 189 }
 190
 191 ; (mul (shl X, C), Y) -> (shl (mul X, Y), C) when the shift has one use.
 192 define <4 x i32> @combine_vec_mul_shl_oneuse0(<4 x i32> %x, <4 x i32> %y) {
 193 ; SSE-LABEL: combine_vec_mul_shl_oneuse0:
 194 ; SSE:       # %bb.0:
 195 ; SSE-NEXT:    pmulld %xmm1, %xmm0
 196 ; SSE-NEXT:    pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 197 ; SSE-NEXT:    retq
 198 ;
 199 ; AVX-LABEL: combine_vec_mul_shl_oneuse0:
 200 ; AVX:       # %bb.0:
 201 ; AVX-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
 202 ; AVX-NEXT:    vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
 203 ; AVX-NEXT:    retq
       ; The shl result (%1) has a single use, as the left operand of the mul.
       ; The expected code multiplies %x by %y first and applies the constant
       ; scaling afterwards (pmulld by a constant-pool operand on +sse4.1,
       ; vpsllvd on +avx2).
 204   %1 = shl <4 x i32> %x, <i32 1, i32 2, i32 8, i32 16>
 205   %2 = mul <4 x i32> %1, %y
 206   ret <4 x i32> %2
 207 }
 208
 209 define <4 x i32> @combine_vec_mul_shl_oneuse1(<4 x i32> %x, <4 x i32> %y) {
 210 ; SSE-LABEL: combine_vec_mul_shl_oneuse1:
 211 ; SSE:       # %bb.0:
 212 ; SSE-NEXT:    pmulld %xmm1, %xmm0
 213 ; SSE-NEXT:    pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 214 ; SSE-NEXT:    retq
 215 ;
 216 ; AVX-LABEL: combine_vec_mul_shl_oneuse1:
 217 ; AVX:       # %bb.0:
 218 ; AVX-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
 219 ; AVX-NEXT:    vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
 220 ; AVX-NEXT:    retq
       ; Commuted form of combine_vec_mul_shl_oneuse0: the single-use shl result
       ; (%1) is the right operand of the mul. The expected instruction sequence
       ; is the same as for the uncommuted test.
 221   %1 = shl <4 x i32> %x, <i32 1, i32 2, i32 8, i32 16>
 222   %2 = mul <4 x i32> %y, %1
 223   ret <4 x i32> %2
 224 }
 225
 226 define <4 x i32> @combine_vec_mul_shl_multiuse0(<4 x i32> %x, <4 x i32> %y) {
 227 ; SSE-LABEL: combine_vec_mul_shl_multiuse0:
 228 ; SSE:       # %bb.0:
 229 ; SSE-NEXT:    pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 230 ; SSE-NEXT:    pmulld %xmm0, %xmm1
 231 ; SSE-NEXT:    paddd %xmm1, %xmm0
 232 ; SSE-NEXT:    retq
 233 ;
 234 ; AVX-LABEL: combine_vec_mul_shl_multiuse0:
 235 ; AVX:       # %bb.0:
 236 ; AVX-NEXT:    vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
 237 ; AVX-NEXT:    vpmulld %xmm1, %xmm0, %xmm1
 238 ; AVX-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
 239 ; AVX-NEXT:    retq
       ; The shl result (%1) is used by both the mul and the add. The expected
       ; code therefore scales %x by the constant first and reuses that value for
       ; the multiply by %y and for the final add.
 240   %1 = shl <4 x i32> %x, <i32 1, i32 2, i32 8, i32 16>
 241   %2 = mul <4 x i32> %1, %y
 242   %3 = add <4 x i32> %1, %2
 243   ret <4 x i32> %3
 244 }
 245
 246 define <4 x i32> @combine_vec_mul_shl_multiuse1(<4 x i32> %x, <4 x i32> %y) {
 247 ; SSE-LABEL: combine_vec_mul_shl_multiuse1:
 248 ; SSE:       # %bb.0:
 249 ; SSE-NEXT:    pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 250 ; SSE-NEXT:    pmulld %xmm0, %xmm1
 251 ; SSE-NEXT:    paddd %xmm1, %xmm0
 252 ; SSE-NEXT:    retq
 253 ;
 254 ; AVX-LABEL: combine_vec_mul_shl_multiuse1:
 255 ; AVX:       # %bb.0:
 256 ; AVX-NEXT:    vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
 257 ; AVX-NEXT:    vpmulld %xmm0, %xmm1, %xmm1
 258 ; AVX-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
 259 ; AVX-NEXT:    retq
       ; Commuted form of combine_vec_mul_shl_multiuse0: the multi-use shl result
       ; (%1) is the right operand of the mul. The scaled value is still computed
       ; once and reused by both the multiply and the add.
 260   %1 = shl <4 x i32> %x, <i32 1, i32 2, i32 8, i32 16>
 261   %2 = mul <4 x i32> %y, %1
 262   %3 = add <4 x i32> %1, %2
 263   ret <4 x i32> %3
 264 }
 265
 266 ; fold (mul (add x, c1), c2) -> (add (mul x, c2), c1*c2)
 267
 268 define <4 x i32> @combine_vec_mul_add(<4 x i32> %x) {
 269 ; SSE-LABEL: combine_vec_mul_add:
 270 ; SSE:       # %bb.0:
 271 ; SSE-NEXT:    pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 272 ; SSE-NEXT:    paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 273 ; SSE-NEXT:    retq
 274 ;
 275 ; AVX-LABEL: combine_vec_mul_add:
 276 ; AVX:       # %bb.0:
 277 ; AVX-NEXT:    vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
 278 ; AVX-NEXT:    vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
 279 ; AVX-NEXT:    retq
       ; Add of <1,2,8,16> followed by a multiply by <4,6,2,0>. The expected code
       ; has the opposite order: a multiply by a constant-pool operand and then an
       ; add of a constant-pool operand.
 280   %1 = add <4 x i32> %x, <i32 1, i32 2, i32 8, i32 16>
 281   %2 = mul <4 x i32> %1, <i32 4, i32 6, i32 2, i32 0>
 282   ret <4 x i32> %2
 283 }
 284
 285 ; fold Y = sra (X, size(X)-1); mul (or (Y, 1), X) -> (abs X)
 286
 287 define <16 x i8> @combine_mul_to_abs_v16i8(<16 x i8> %x) {
 288 ; SSE-LABEL: combine_mul_to_abs_v16i8:
 289 ; SSE:       # %bb.0:
 290 ; SSE-NEXT:    pabsb %xmm0, %xmm0
 291 ; SSE-NEXT:    retq
 292 ;
 293 ; AVX-LABEL: combine_mul_to_abs_v16i8:
 294 ; AVX:       # %bb.0:
 295 ; AVX-NEXT:    vpabsb %xmm0, %xmm0
 296 ; AVX-NEXT:    retq
       ; %s is each byte arithmetically shifted right by 7 (0 or -1), so
       ; %o = %s | 1 is +1 or -1 per lane. The product %o * %x is expected to be
       ; selected as a single pabsb/vpabsb.
 297   %s = ashr <16 x i8> %x, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
 298   %o = or <16 x i8> %s, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
 299   %m = mul <16 x i8> %o, %x
 300   ret <16 x i8> %m
 301 }
 302
 303 define <2 x i64> @combine_mul_to_abs_v2i64(<2 x i64> %x) {
 304 ; SSE-LABEL: combine_mul_to_abs_v2i64:
 305 ; SSE:       # %bb.0:
 306 ; SSE-NEXT:    pxor %xmm1, %xmm1
 307 ; SSE-NEXT:    psubq %xmm0, %xmm1
 308 ; SSE-NEXT:    blendvpd %xmm0, %xmm1, %xmm0
 309 ; SSE-NEXT:    retq
 310 ;
 311 ; AVX-LABEL: combine_mul_to_abs_v2i64:
 312 ; AVX:       # %bb.0:
 313 ; AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 314 ; AVX-NEXT:    vpsubq %xmm0, %xmm1, %xmm1
 315 ; AVX-NEXT:    vblendvpd %xmm0, %xmm1, %xmm0, %xmm0
 316 ; AVX-NEXT:    retq
       ; Same sign-splat | 1 pattern on <2 x i64> (shift amount 63), with the mul
       ; operands commuted (%x first). The expected code contains no multiply: it
       ; computes 0 - %x and picks between the two values with
       ; blendvpd/vblendvpd.
 317   %s = ashr <2 x i64> %x, <i64 63, i64 63>
 318   %o = or <2 x i64> %s, <i64 1, i64 1>
 319   %m = mul <2 x i64> %x, %o
 320   ret <2 x i64> %m
 321 }
 322
 323 ; 'Quadratic Reciprocity' - and(mul(x,x),2) -> 0
 324
 325 define i64 @combine_mul_self_knownbits(i64 %x) {
 326 ; SSE-LABEL: combine_mul_self_knownbits:
 327 ; SSE:       # %bb.0:
 328 ; SSE-NEXT:    xorl %eax, %eax
 329 ; SSE-NEXT:    retq
 330 ;
 331 ; AVX-LABEL: combine_mul_self_knownbits:
 332 ; AVX:       # %bb.0:
 333 ; AVX-NEXT:    xorl %eax, %eax
 334 ; AVX-NEXT:    retq
       ; Only bit 1 of %x * %x is kept (mask 2). The expected code just zeroes the
       ; return register; no multiply or and is emitted.
 335   %1 = mul i64 %x, %x
 336   %2 = and i64 %1, 2
 337   ret i64 %2
 338 }
 339
 340 define <4 x i32> @combine_mul_self_knownbits_vector(<4 x i32> %x) {
 341 ; SSE-LABEL: combine_mul_self_knownbits_vector:
 342 ; SSE:       # %bb.0:
 343 ; SSE-NEXT:    xorps %xmm0, %xmm0
 344 ; SSE-NEXT:    retq
 345 ;
 346 ; AVX-LABEL: combine_mul_self_knownbits_vector:
 347 ; AVX:       # %bb.0:
 348 ; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
 349 ; AVX-NEXT:    retq
       ; Vector form of the bit-1 test: %x * %x masked with a splat of 2. The
       ; expected code is only a register-zeroing xor.
 350   %1 = mul <4 x i32> %x, %x
 351   %2 = and <4 x i32> %1, <i32 2, i32 2, i32 2, i32 2>
 352   ret <4 x i32> %2
 353 }
 354
 355 ; mul(x,x) - bit[1] is 0, but if demanding the other bits the source must not be undef
 356
 357 define i64 @combine_mul_self_demandedbits(i64 %x) {
 358 ; SSE-LABEL: combine_mul_self_demandedbits:
 359 ; SSE:       # %bb.0:
 360 ; SSE-NEXT:    movq %rdi, %rax
 361 ; SSE-NEXT:    imulq %rdi, %rax
 362 ; SSE-NEXT:    retq
 363 ;
 364 ; AVX-LABEL: combine_mul_self_demandedbits:
 365 ; AVX:       # %bb.0:
 366 ; AVX-NEXT:    movq %rdi, %rax
 367 ; AVX-NEXT:    imulq %rdi, %rax
 368 ; AVX-NEXT:    retq
       ; Mask -3 keeps every bit of %x * %x except bit 1. The expected code is the
       ; multiply alone, with no separate and instruction.
       ; NOTE(review): unlike combine_mul_self_demandedbits_vector, %x is not
       ; frozen here - confirm that dropping the and is still the intended
       ; expectation for this scalar case.
 369   %1 = mul i64 %x, %x
 370   %2 = and i64 %1, -3
 371   ret i64 %2
 372 }
 373
 374 define <4 x i32> @combine_mul_self_demandedbits_vector(<4 x i32> %x) {
 375 ; SSE-LABEL: combine_mul_self_demandedbits_vector:
 376 ; SSE:       # %bb.0:
 377 ; SSE-NEXT:    pmulld %xmm0, %xmm0
 378 ; SSE-NEXT:    retq
 379 ;
 380 ; AVX-LABEL: combine_mul_self_demandedbits_vector:
 381 ; AVX:       # %bb.0:
 382 ; AVX-NEXT:    vpmulld %xmm0, %xmm0, %xmm0
 383 ; AVX-NEXT:    retq
       ; %x is frozen before being squared, and the square is masked with a splat
       ; of -3 (all bits except bit 1). The expected code is the multiply alone,
       ; with no separate and instruction.
 384   %1 = freeze <4 x i32> %x
 385   %2 = mul <4 x i32> %1, %1
 386   %3 = and <4 x i32> %2, <i32 -3, i32 -3, i32 -3, i32 -3>
 387   ret <4 x i32> %3
 388 }
 389
 390 ; PR59217 - Reuse umul_lohi/smul_lohi node
 391
 392 define i64 @combine_mul_umul_lohi_i64(i64 %a, i64 %b) {
 393 ; SSE-LABEL: combine_mul_umul_lohi_i64:
 394 ; SSE:       # %bb.0:
 395 ; SSE-NEXT:    movq %rdi, %rax
 396 ; SSE-NEXT:    mulq %rsi
 397 ; SSE-NEXT:    xorq %rdx, %rax
 398 ; SSE-NEXT:    retq
 399 ;
 400 ; AVX-LABEL: combine_mul_umul_lohi_i64:
 401 ; AVX:       # %bb.0:
 402 ; AVX-NEXT:    movq %rdi, %rax
 403 ; AVX-NEXT:    mulq %rsi
 404 ; AVX-NEXT:    xorq %rdx, %rax
 405 ; AVX-NEXT:    retq
       ; %hi is the upper 64 bits of the zero-extended 128-bit product and %lo is
       ; the plain 64-bit product of the same operands. The expected code issues
       ; one mulq and xors %rdx into %rax, with no second multiply for %lo.
 406   %a128 = zext i64 %a to i128
 407   %b128 = zext i64 %b to i128
 408   %m128 = mul nuw i128 %a128, %b128
 409   %hi128 = lshr i128 %m128, 64
 410   %hi = trunc i128 %hi128 to i64
 411   %lo = mul i64 %a, %b
 412   %r = xor i64 %lo, %hi
 413   ret i64 %r
 414 }
 415
 416 define i64 @combine_mul_smul_lohi_commute_i64(i64 %a, i64 %b) {
 417 ; SSE-LABEL: combine_mul_smul_lohi_commute_i64:
 418 ; SSE:       # %bb.0:
 419 ; SSE-NEXT:    movq %rdi, %rax
 420 ; SSE-NEXT:    imulq %rsi
 421 ; SSE-NEXT:    xorq %rdx, %rax
 422 ; SSE-NEXT:    retq
 423 ;
 424 ; AVX-LABEL: combine_mul_smul_lohi_commute_i64:
 425 ; AVX:       # %bb.0:
 426 ; AVX-NEXT:    movq %rdi, %rax
 427 ; AVX-NEXT:    imulq %rsi
 428 ; AVX-NEXT:    xorq %rdx, %rax
 429 ; AVX-NEXT:    retq
       ; Signed variant (sext / mul nsw) in which the 64-bit mul takes its
       ; operands in the opposite order (%b, %a) from the 128-bit mul. The
       ; expected code still issues a single one-operand imulq and xors %rdx
       ; into %rax.
 430   %a128 = sext i64 %a to i128
 431   %b128 = sext i64 %b to i128
 432   %m128 = mul nsw i128 %a128, %b128
 433   %hi128 = lshr i128 %m128, 64
 434   %hi = trunc i128 %hi128 to i64
 435   %lo = mul i64 %b, %a
 436   %r = xor i64 %lo, %hi
 437   ret i64 %r
 438 }
 439
 440 define i64 @combine_mul_umul_lohi_const_i64(i64 %h) {
 441 ; SSE-LABEL: combine_mul_umul_lohi_const_i64:
 442 ; SSE:       # %bb.0:
 443 ; SSE-NEXT:    movq %rdi, %rax
 444 ; SSE-NEXT:    movabsq $-4265267296055464877, %rcx # imm = 0xC4CEB9FE1A85EC53
 445 ; SSE-NEXT:    mulq %rcx
 446 ; SSE-NEXT:    xorq %rdx, %rax
 447 ; SSE-NEXT:    retq
 448 ;
 449 ; AVX-LABEL: combine_mul_umul_lohi_const_i64:
 450 ; AVX:       # %bb.0:
 451 ; AVX-NEXT:    movq %rdi, %rax
 452 ; AVX-NEXT:    movabsq $-4265267296055464877, %rcx # imm = 0xC4CEB9FE1A85EC53
 453 ; AVX-NEXT:    mulq %rcx
 454 ; AVX-NEXT:    xorq %rdx, %rax
 455 ; AVX-NEXT:    retq
       ; Unsigned high/low product pair with a constant multiplier. The expected
       ; code materialises the constant once with movabsq and uses a single mulq
       ; for both the high half (%hi) and the low half (%lo).
 456   %h128 = zext i64 %h to i128
 457   %m128 = mul nuw i128 %h128, 14181476777654086739
 458   %hi128 = lshr i128 %m128, 64
 459   %hi = trunc i128 %hi128 to i64
 460   %lo = mul i64 %h, 14181476777654086739
 461   %r = xor i64 %lo, %hi
 462   ret i64 %r
 463 }
 464
 465 define i64 @combine_mul_smul_lohi_const_i64(i64 %h) {
 466 ; SSE-LABEL: combine_mul_smul_lohi_const_i64:
 467 ; SSE:       # %bb.0:
 468 ; SSE-NEXT:    movq %rdi, %rax
 469 ; SSE-NEXT:    movq %rdi, %rcx
 470 ; SSE-NEXT:    sarq $63, %rcx
 471 ; SSE-NEXT:    movabsq $-4265267296055464877, %rsi # imm = 0xC4CEB9FE1A85EC53
 472 ; SSE-NEXT:    mulq %rsi
 473 ; SSE-NEXT:    imulq %rsi, %rcx
 474 ; SSE-NEXT:    addq %rdx, %rcx
 475 ; SSE-NEXT:    xorq %rcx, %rax
 476 ; SSE-NEXT:    retq
 477 ;
 478 ; AVX-LABEL: combine_mul_smul_lohi_const_i64:
 479 ; AVX:       # %bb.0:
 480 ; AVX-NEXT:    movq %rdi, %rax
 481 ; AVX-NEXT:    movq %rdi, %rcx
 482 ; AVX-NEXT:    sarq $63, %rcx
 483 ; AVX-NEXT:    movabsq $-4265267296055464877, %rsi # imm = 0xC4CEB9FE1A85EC53
 484 ; AVX-NEXT:    mulq %rsi
 485 ; AVX-NEXT:    imulq %rsi, %rcx
 486 ; AVX-NEXT:    addq %rdx, %rcx
 487 ; AVX-NEXT:    xorq %rcx, %rax
 488 ; AVX-NEXT:    retq
       ; Signed (sext / mul nsw) high/low pair with the same constant as the
       ; unsigned-constant test. Here the expected code is not a single multiply:
       ; it is a mulq plus a sarq $63 / imulq / addq sequence that adjusts the
       ; high half before the final xor.
       ; NOTE(review): presumably this is because the i128 constant is positive
       ; while the same bit pattern as an i64 is negative (see the movabsq
       ; immediate), so it is not the sign-extension of the i64 multiplier -
       ; confirm.
 489   %h128 = sext i64 %h to i128
 490   %m128 = mul nsw i128 %h128, 14181476777654086739
 491   %hi128 = lshr i128 %m128, 64
 492   %hi = trunc i128 %hi128 to i64
 493   %lo = mul i64 %h, 14181476777654086739
 494   %r = xor i64 %lo, %hi
 495   ret i64 %r
 496 }
 497
 498 ; This would loop infinitely because DAGCombiner wants to turn this multiply into
 499 ; a shift, but x86 lowering wants to avoid non-uniform vector shift amounts.
 500
 501 define <16 x i8> @PR35579(<16 x i8> %x) {
 502 ; SSE-LABEL: PR35579:
 503 ; SSE:       # %bb.0:
 504 ; SSE-NEXT:    movdqa %xmm0, %xmm1
 505 ; SSE-NEXT:    pmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
 506 ; SSE-NEXT:    psllw $8, %xmm1
 507 ; SSE-NEXT:    pmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [0,0,2,0,4,0,2,0,8,0,2,0,4,0,2,0]
 508 ; SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 509 ; SSE-NEXT:    por %xmm1, %xmm0
 510 ; SSE-NEXT:    retq
 511 ;
 512 ; AVX-LABEL: PR35579:
 513 ; AVX:       # %bb.0:
 514 ; AVX-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
 515 ; AVX-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [0,1,2,1,4,1,2,1,8,1,2,1,4,1,2,1]
 516 ; AVX-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
 517 ; AVX-NEXT:    vextracti128 $1, %ymm0, %xmm1
 518 ; AVX-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
 519 ; AVX-NEXT:    vzeroupper
 520 ; AVX-NEXT:    retq
       ; Non-uniform i8 multipliers, each either 0 or a power of two. The expected
       ; code keeps multiply instructions (two pmaddubsw on +sse4.1; vpmullw on
       ; zero-extended 16-bit lanes on +avx2) rather than per-lane shifts.
 521   %r = mul <16 x i8> %x, <i8 0, i8 1, i8 2, i8 1, i8 4, i8 1, i8 2, i8 1, i8 8, i8 1, i8 2, i8 1, i8 4, i8 1, i8 2, i8 1>
 522   ret <16 x i8> %r
 523 }
 524
 525 ; OSS Fuzz: https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=15429
 526 define <4 x i64> @fuzz15429(<4 x i64> %InVec) {
 527 ; SSE-LABEL: fuzz15429:
 528 ; SSE:       # %bb.0:
 529 ; SSE-NEXT:    movdqa %xmm1, %xmm2
 530 ; SSE-NEXT:    psllq $3, %xmm2
 531 ; SSE-NEXT:    psllq $2, %xmm1
 532 ; SSE-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7]
 533 ; SSE-NEXT:    paddq %xmm0, %xmm0
 534 ; SSE-NEXT:    movabsq $9223372036854775807, %rax # imm = 0x7FFFFFFFFFFFFFFF
 535 ; SSE-NEXT:    pinsrq $0, %rax, %xmm0
 536 ; SSE-NEXT:    retq
 537 ;
 538 ; AVX-LABEL: fuzz15429:
 539 ; AVX:       # %bb.0:
 540 ; AVX-NEXT:    vpsllvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
 541 ; AVX-NEXT:    movabsq $9223372036854775807, %rax # imm = 0x7FFFFFFFFFFFFFFF
 542 ; AVX-NEXT:    vmovq %rax, %xmm1
 543 ; AVX-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
 544 ; AVX-NEXT:    retq
       ; Multiply by <1,2,4,8> and then overwrite element 0 with
       ; 0x7FFFFFFFFFFFFFFF. The expected code scales the vector with shifts/adds
       ; and then places the constant in lane 0 (pinsrq $0 on +sse4.1; vmovq plus
       ; vpblendd on +avx2).
 545   %mul = mul <4 x i64> %InVec, <i64 1, i64 2, i64 4, i64 8>
 546   %I = insertelement <4 x i64> %mul, i64 9223372036854775807, i64 0
 547   ret <4 x i64> %I
 548 }