llvm/test/CodeGen/X86/combine-or-shuffle.ll

   1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
   2 ; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+sse    | FileCheck %s -check-prefixes=SSE,SSE2
   3 ; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+sse4.1 | FileCheck %s -check-prefixes=SSE,SSE4
   4 ; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx  | FileCheck %s -check-prefixes=AVX,AVX1
   5 ; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx2 | FileCheck %s -check-prefixes=AVX,AVX2
   6 ; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=x86-64-v4 | FileCheck %s -check-prefixes=AVX,AVX512
   7
   8 ; Verify that each of the following test cases is folded into a single
   9 ; instruction which performs a blend operation.
  10
  11 define <2 x i64> @test1(<2 x i64> %a, <2 x i64> %b) {
  12 ; SSE2-LABEL: test1:
  13 ; SSE2:       # %bb.0:
  14 ; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
  15 ; SSE2-NEXT:    retq
  16 ;
  17 ; SSE4-LABEL: test1:
  18 ; SSE4:       # %bb.0:
  19 ; SSE4-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
  20 ; SSE4-NEXT:    retq
  21 ;
  22 ; AVX-LABEL: test1:
  23 ; AVX:       # %bb.0:
  24 ; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
  25 ; AVX-NEXT:    retq
  26   %shuf1 = shufflevector <2 x i64> %a, <2 x i64> zeroinitializer, <2 x i32><i32 0, i32 2>
  27   %shuf2 = shufflevector <2 x i64> %b, <2 x i64> zeroinitializer, <2 x i32><i32 2, i32 1>
  28   %or = or <2 x i64> %shuf1, %shuf2
  29   ret <2 x i64> %or
  30 }
  31
  32
  33 define <4 x i32> @test2(<4 x i32> %a, <4 x i32> %b) {
  34 ; SSE2-LABEL: test2:
  35 ; SSE2:       # %bb.0:
  36 ; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
  37 ; SSE2-NEXT:    retq
  38 ;
  39 ; SSE4-LABEL: test2:
  40 ; SSE4:       # %bb.0:
  41 ; SSE4-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
  42 ; SSE4-NEXT:    retq
  43 ;
  44 ; AVX-LABEL: test2:
  45 ; AVX:       # %bb.0:
  46 ; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
  47 ; AVX-NEXT:    retq
  48   %shuf1 = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32><i32 4, i32 4, i32 2, i32 3>
  49   %shuf2 = shufflevector <4 x i32> %b, <4 x i32> zeroinitializer, <4 x i32><i32 0, i32 1, i32 4, i32 4>
  50   %or = or <4 x i32> %shuf1, %shuf2
  51   ret <4 x i32> %or
  52 }
  53
  54
  55 define <2 x i64> @test3(<2 x i64> %a, <2 x i64> %b) {
  56 ; SSE2-LABEL: test3:
  57 ; SSE2:       # %bb.0:
  58 ; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
  59 ; SSE2-NEXT:    retq
  60 ;
  61 ; SSE4-LABEL: test3:
  62 ; SSE4:       # %bb.0:
  63 ; SSE4-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
  64 ; SSE4-NEXT:    retq
  65 ;
  66 ; AVX-LABEL: test3:
  67 ; AVX:       # %bb.0:
  68 ; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
  69 ; AVX-NEXT:    retq
  70   %shuf1 = shufflevector <2 x i64> %a, <2 x i64> zeroinitializer, <2 x i32><i32 2, i32 1>
  71   %shuf2 = shufflevector <2 x i64> %b, <2 x i64> zeroinitializer, <2 x i32><i32 0, i32 2>
  72   %or = or <2 x i64> %shuf1, %shuf2
  73   ret <2 x i64> %or
  74 }
  75
  76
  77 define <4 x i32> @test4(<4 x i32> %a, <4 x i32> %b) {
  78 ; SSE2-LABEL: test4:
  79 ; SSE2:       # %bb.0:
  80 ; SSE2-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
  81 ; SSE2-NEXT:    movaps %xmm1, %xmm0
  82 ; SSE2-NEXT:    retq
  83 ;
  84 ; SSE4-LABEL: test4:
  85 ; SSE4:       # %bb.0:
  86 ; SSE4-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
  87 ; SSE4-NEXT:    retq
  88 ;
  89 ; AVX-LABEL: test4:
  90 ; AVX:       # %bb.0:
  91 ; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
  92 ; AVX-NEXT:    retq
  93   %shuf1 = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32><i32 0, i32 4, i32 4, i32 4>
  94   %shuf2 = shufflevector <4 x i32> %b, <4 x i32> zeroinitializer, <4 x i32><i32 4, i32 1, i32 2, i32 3>
  95   %or = or <4 x i32> %shuf1, %shuf2
  96   ret <4 x i32> %or
  97 }
  98
  99
 100 define <4 x i32> @test5(<4 x i32> %a, <4 x i32> %b) {
 101 ; SSE2-LABEL: test5:
 102 ; SSE2:       # %bb.0:
 103 ; SSE2-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
 104 ; SSE2-NEXT:    retq
 105 ;
 106 ; SSE4-LABEL: test5:
 107 ; SSE4:       # %bb.0:
 108 ; SSE4-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
 109 ; SSE4-NEXT:    retq
 110 ;
 111 ; AVX-LABEL: test5:
 112 ; AVX:       # %bb.0:
 113 ; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
 114 ; AVX-NEXT:    retq
 115   %shuf1 = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32><i32 4, i32 1, i32 2, i32 3>
 116   %shuf2 = shufflevector <4 x i32> %b, <4 x i32> zeroinitializer, <4 x i32><i32 0, i32 4, i32 4, i32 4>
 117   %or = or <4 x i32> %shuf1, %shuf2
 118   ret <4 x i32> %or
 119 }
 120
 121
 122 define <4 x i32> @test6(<4 x i32> %a, <4 x i32> %b) {
 123 ; SSE2-LABEL: test6:
 124 ; SSE2:       # %bb.0:
 125 ; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
 126 ; SSE2-NEXT:    retq
 127 ;
 128 ; SSE4-LABEL: test6:
 129 ; SSE4:       # %bb.0:
 130 ; SSE4-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
 131 ; SSE4-NEXT:    retq
 132 ;
 133 ; AVX-LABEL: test6:
 134 ; AVX:       # %bb.0:
 135 ; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
 136 ; AVX-NEXT:    retq
 137   %shuf1 = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32><i32 0, i32 1, i32 4, i32 4>
 138   %shuf2 = shufflevector <4 x i32> %b, <4 x i32> zeroinitializer, <4 x i32><i32 4, i32 4, i32 2, i32 3>
 139   %or = or <4 x i32> %shuf1, %shuf2
 140   ret <4 x i32> %or
 141 }
 142
 143
 144 define <4 x i32> @test7(<4 x i32> %a, <4 x i32> %b) {
 145 ; SSE2-LABEL: test7:
 146 ; SSE2:       # %bb.0:
 147 ; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
 148 ; SSE2-NEXT:    retq
 149 ;
 150 ; SSE4-LABEL: test7:
 151 ; SSE4:       # %bb.0:
 152 ; SSE4-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
 153 ; SSE4-NEXT:    retq
 154 ;
 155 ; AVX-LABEL: test7:
 156 ; AVX:       # %bb.0:
 157 ; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
 158 ; AVX-NEXT:    retq
 159   %and1 = and <4 x i32> %a, <i32 -1, i32 -1, i32 0, i32 0>
 160   %and2 = and <4 x i32> %b, <i32 0, i32 0, i32 -1, i32 -1>
 161   %or = or <4 x i32> %and1, %and2
 162   ret <4 x i32> %or
 163 }
 164
 165
 166 define <2 x i64> @test8(<2 x i64> %a, <2 x i64> %b) {
 167 ; SSE2-LABEL: test8:
 168 ; SSE2:       # %bb.0:
 169 ; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
 170 ; SSE2-NEXT:    retq
 171 ;
 172 ; SSE4-LABEL: test8:
 173 ; SSE4:       # %bb.0:
 174 ; SSE4-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
 175 ; SSE4-NEXT:    retq
 176 ;
 177 ; AVX-LABEL: test8:
 178 ; AVX:       # %bb.0:
 179 ; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
 180 ; AVX-NEXT:    retq
 181   %and1 = and <2 x i64> %a, <i64 -1, i64 0>
 182   %and2 = and <2 x i64> %b, <i64 0, i64 -1>
 183   %or = or <2 x i64> %and1, %and2
 184   ret <2 x i64> %or
 185 }
 186
 187
 188 define <4 x i32> @test9(<4 x i32> %a, <4 x i32> %b) {
 189 ; SSE2-LABEL: test9:
 190 ; SSE2:       # %bb.0:
 191 ; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
 192 ; SSE2-NEXT:    retq
 193 ;
 194 ; SSE4-LABEL: test9:
 195 ; SSE4:       # %bb.0:
 196 ; SSE4-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
 197 ; SSE4-NEXT:    retq
 198 ;
 199 ; AVX-LABEL: test9:
 200 ; AVX:       # %bb.0:
 201 ; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
 202 ; AVX-NEXT:    retq
 203   %and1 = and <4 x i32> %a, <i32 0, i32 0, i32 -1, i32 -1>
 204   %and2 = and <4 x i32> %b, <i32 -1, i32 -1, i32 0, i32 0>
 205   %or = or <4 x i32> %and1, %and2
 206   ret <4 x i32> %or
 207 }
 208
 209
 210 define <2 x i64> @test10(<2 x i64> %a, <2 x i64> %b) {
 211 ; SSE2-LABEL: test10:
 212 ; SSE2:       # %bb.0:
 213 ; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
 214 ; SSE2-NEXT:    retq
 215 ;
 216 ; SSE4-LABEL: test10:
 217 ; SSE4:       # %bb.0:
 218 ; SSE4-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
 219 ; SSE4-NEXT:    retq
 220 ;
 221 ; AVX-LABEL: test10:
 222 ; AVX:       # %bb.0:
 223 ; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
 224 ; AVX-NEXT:    retq
 225   %and1 = and <2 x i64> %a, <i64 0, i64 -1>
 226   %and2 = and <2 x i64> %b, <i64 -1, i64 0>
 227   %or = or <2 x i64> %and1, %and2
 228   ret <2 x i64> %or
 229 }
 230
 231
 232 define <4 x i32> @test11(<4 x i32> %a, <4 x i32> %b) {
 233 ; SSE2-LABEL: test11:
 234 ; SSE2:       # %bb.0:
 235 ; SSE2-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
 236 ; SSE2-NEXT:    movaps %xmm1, %xmm0
 237 ; SSE2-NEXT:    retq
 238 ;
 239 ; SSE4-LABEL: test11:
 240 ; SSE4:       # %bb.0:
 241 ; SSE4-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 242 ; SSE4-NEXT:    retq
 243 ;
 244 ; AVX-LABEL: test11:
 245 ; AVX:       # %bb.0:
 246 ; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 247 ; AVX-NEXT:    retq
 248   %and1 = and <4 x i32> %a, <i32 -1, i32 0, i32 0, i32 0>
 249   %and2 = and <4 x i32> %b, <i32 0, i32 -1, i32 -1, i32 -1>
 250   %or = or <4 x i32> %and1, %and2
 251   ret <4 x i32> %or
 252 }
 253
 254
 255 define <4 x i32> @test12(<4 x i32> %a, <4 x i32> %b) {
 256 ; SSE2-LABEL: test12:
 257 ; SSE2:       # %bb.0:
 258 ; SSE2-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
 259 ; SSE2-NEXT:    retq
 260 ;
 261 ; SSE4-LABEL: test12:
 262 ; SSE4:       # %bb.0:
 263 ; SSE4-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
 264 ; SSE4-NEXT:    retq
 265 ;
 266 ; AVX-LABEL: test12:
 267 ; AVX:       # %bb.0:
 268 ; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
 269 ; AVX-NEXT:    retq
 270   %and1 = and <4 x i32> %a, <i32 0, i32 -1, i32 -1, i32 -1>
 271   %and2 = and <4 x i32> %b, <i32 -1, i32 0, i32 0, i32 0>
 272   %or = or <4 x i32> %and1, %and2
 273   ret <4 x i32> %or
 274 }
 275
 276
 277 ; Verify that the following test cases are folded into single shuffles.
 278
 279 define <4 x i32> @test13(<4 x i32> %a, <4 x i32> %b) {
 280 ; SSE-LABEL: test13:
 281 ; SSE:       # %bb.0:
 282 ; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[2,3]
 283 ; SSE-NEXT:    retq
 284 ;
 285 ; AVX-LABEL: test13:
 286 ; AVX:       # %bb.0:
 287 ; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[2,3]
 288 ; AVX-NEXT:    retq
 289   %shuf1 = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32><i32 1, i32 1, i32 4, i32 4>
 290   %shuf2 = shufflevector <4 x i32> %b, <4 x i32> zeroinitializer, <4 x i32><i32 4, i32 4, i32 2, i32 3>
 291   %or = or <4 x i32> %shuf1, %shuf2
 292   ret <4 x i32> %or
 293 }
 294
 295
 296 define <2 x i64> @test14(<2 x i64> %a, <2 x i64> %b) {
 297 ; SSE-LABEL: test14:
 298 ; SSE:       # %bb.0:
 299 ; SSE-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 300 ; SSE-NEXT:    retq
 301 ;
 302 ; AVX-LABEL: test14:
 303 ; AVX:       # %bb.0:
 304 ; AVX-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 305 ; AVX-NEXT:    retq
 306   %shuf1 = shufflevector <2 x i64> %a, <2 x i64> zeroinitializer, <2 x i32><i32 0, i32 2>
 307   %shuf2 = shufflevector <2 x i64> %b, <2 x i64> zeroinitializer, <2 x i32><i32 2, i32 0>
 308   %or = or <2 x i64> %shuf1, %shuf2
 309   ret <2 x i64> %or
 310 }
 311
 312
 313 define <4 x i32> @test15(<4 x i32> %a, <4 x i32> %b) {
 314 ; SSE-LABEL: test15:
 315 ; SSE:       # %bb.0:
 316 ; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,1],xmm0[2,1]
 317 ; SSE-NEXT:    movaps %xmm1, %xmm0
 318 ; SSE-NEXT:    retq
 319 ;
 320 ; AVX-LABEL: test15:
 321 ; AVX:       # %bb.0:
 322 ; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm1[2,1],xmm0[2,1]
 323 ; AVX-NEXT:    retq
 324   %shuf1 = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32><i32 4, i32 4, i32 2, i32 1>
 325   %shuf2 = shufflevector <4 x i32> %b, <4 x i32> zeroinitializer, <4 x i32><i32 2, i32 1, i32 4, i32 4>
 326   %or = or <4 x i32> %shuf1, %shuf2
 327   ret <4 x i32> %or
 328 }
 329
 330
 331 define <2 x i64> @test16(<2 x i64> %a, <2 x i64> %b) {
 332 ; SSE-LABEL: test16:
 333 ; SSE:       # %bb.0:
 334 ; SSE-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
 335 ; SSE-NEXT:    movaps %xmm1, %xmm0
 336 ; SSE-NEXT:    retq
 337 ;
 338 ; AVX-LABEL: test16:
 339 ; AVX:       # %bb.0:
 340 ; AVX-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
 341 ; AVX-NEXT:    retq
 342   %shuf1 = shufflevector <2 x i64> %a, <2 x i64> zeroinitializer, <2 x i32><i32 2, i32 0>
 343   %shuf2 = shufflevector <2 x i64> %b, <2 x i64> zeroinitializer, <2 x i32><i32 0, i32 2>
 344   %or = or <2 x i64> %shuf1, %shuf2
 345   ret <2 x i64> %or
 346 }
 347
 348
 349 ; Verify that the dag-combiner does not fold a OR of two shuffles into a single
 350 ; shuffle instruction when the shuffle indexes are not compatible.
 351
 352 define <4 x i32> @test17(<4 x i32> %a, <4 x i32> %b) {
 353 ; SSE-LABEL: test17:
 354 ; SSE:       # %bb.0:
 355 ; SSE-NEXT:    psllq $32, %xmm0
 356 ; SSE-NEXT:    movq {{.*#+}} xmm1 = xmm1[0],zero
 357 ; SSE-NEXT:    por %xmm1, %xmm0
 358 ; SSE-NEXT:    retq
 359 ;
 360 ; AVX-LABEL: test17:
 361 ; AVX:       # %bb.0:
 362 ; AVX-NEXT:    vpsllq $32, %xmm0, %xmm0
 363 ; AVX-NEXT:    vmovq {{.*#+}} xmm1 = xmm1[0],zero
 364 ; AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
 365 ; AVX-NEXT:    retq
 366   %shuf1 = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32><i32 4, i32 0, i32 4, i32 2>
 367   %shuf2 = shufflevector <4 x i32> %b, <4 x i32> zeroinitializer, <4 x i32><i32 0, i32 1, i32 4, i32 4>
 368   %or = or <4 x i32> %shuf1, %shuf2
 369   ret <4 x i32> %or
 370 }
 371
 372
 373 define <4 x i32> @test18(<4 x i32> %a, <4 x i32> %b) {
 374 ; SSE2-LABEL: test18:
 375 ; SSE2:       # %bb.0:
 376 ; SSE2-NEXT:    xorps %xmm2, %xmm2
 377 ; SSE2-NEXT:    xorps %xmm3, %xmm3
 378 ; SSE2-NEXT:    movss {{.*#+}} xmm3 = xmm0[0],xmm3[1,2,3]
 379 ; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,0,1,1]
 380 ; SSE2-NEXT:    movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
 381 ; SSE2-NEXT:    orps %xmm0, %xmm2
 382 ; SSE2-NEXT:    movaps %xmm2, %xmm0
 383 ; SSE2-NEXT:    retq
 384 ;
 385 ; SSE4-LABEL: test18:
 386 ; SSE4:       # %bb.0:
 387 ; SSE4-NEXT:    pxor %xmm2, %xmm2
 388 ; SSE4-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3,4,5,6,7]
 389 ; SSE4-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,0,1,1]
 390 ; SSE4-NEXT:    pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3,4,5,6,7]
 391 ; SSE4-NEXT:    por %xmm0, %xmm2
 392 ; SSE4-NEXT:    movdqa %xmm2, %xmm0
 393 ; SSE4-NEXT:    retq
 394 ;
 395 ; AVX1-LABEL: test18:
 396 ; AVX1:       # %bb.0:
 397 ; AVX1-NEXT:    vxorps %xmm2, %xmm2, %xmm2
 398 ; AVX1-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3]
 399 ; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,0,1,1]
 400 ; AVX1-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3]
 401 ; AVX1-NEXT:    vorps %xmm1, %xmm0, %xmm0
 402 ; AVX1-NEXT:    retq
 403 ;
 404 ; AVX2-LABEL: test18:
 405 ; AVX2:       # %bb.0:
 406 ; AVX2-NEXT:    vxorps %xmm2, %xmm2, %xmm2
 407 ; AVX2-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3]
 408 ; AVX2-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,0,1,1]
 409 ; AVX2-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3]
 410 ; AVX2-NEXT:    vorps %xmm1, %xmm0, %xmm0
 411 ; AVX2-NEXT:    retq
 412 ;
 413 ; AVX512-LABEL: test18:
 414 ; AVX512:       # %bb.0:
 415 ; AVX512-NEXT:    vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0,1,2,3],zero,zero,zero,zero,zero,zero,zero,zero
 416 ; AVX512-NEXT:    vpxor %xmm2, %xmm2, %xmm2
 417 ; AVX512-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3,4,5,6,7]
 418 ; AVX512-NEXT:    vpor %xmm1, %xmm0, %xmm0
 419 ; AVX512-NEXT:    retq
 420   %shuf1 = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32><i32 4, i32 0, i32 4, i32 4>
 421   %shuf2 = shufflevector <4 x i32> %b, <4 x i32> zeroinitializer, <4 x i32><i32 0, i32 4, i32 4, i32 4>
 422   %or = or <4 x i32> %shuf1, %shuf2
 423   ret <4 x i32> %or
 424 }
 425
 426
 427 define <4 x i32> @test19(<4 x i32> %a, <4 x i32> %b) {
 428 ; SSE2-LABEL: test19:
 429 ; SSE2:       # %bb.0:
 430 ; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3]
 431 ; SSE2-NEXT:    pxor %xmm2, %xmm2
 432 ; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
 433 ; SSE2-NEXT:    movdqa %xmm1, %xmm0
 434 ; SSE2-NEXT:    pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7]
 435 ; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,2]
 436 ; SSE2-NEXT:    orps %xmm2, %xmm0
 437 ; SSE2-NEXT:    retq
 438 ;
 439 ; SSE4-LABEL: test19:
 440 ; SSE4:       # %bb.0:
 441 ; SSE4-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,0,2,3]
 442 ; SSE4-NEXT:    pxor %xmm3, %xmm3
 443 ; SSE4-NEXT:    pblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7]
 444 ; SSE4-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,1,2,2]
 445 ; SSE4-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3],xmm0[4,5,6,7]
 446 ; SSE4-NEXT:    por %xmm2, %xmm0
 447 ; SSE4-NEXT:    retq
 448 ;
 449 ; AVX1-LABEL: test19:
 450 ; AVX1:       # %bb.0:
 451 ; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,0,2,3]
 452 ; AVX1-NEXT:    vxorps %xmm2, %xmm2, %xmm2
 453 ; AVX1-NEXT:    vblendps {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3]
 454 ; AVX1-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[0,1,2,2]
 455 ; AVX1-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3]
 456 ; AVX1-NEXT:    vorps %xmm1, %xmm0, %xmm0
 457 ; AVX1-NEXT:    retq
 458 ;
 459 ; AVX2-LABEL: test19:
 460 ; AVX2:       # %bb.0:
 461 ; AVX2-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,0,2,3]
 462 ; AVX2-NEXT:    vxorps %xmm2, %xmm2, %xmm2
 463 ; AVX2-NEXT:    vblendps {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3]
 464 ; AVX2-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[0,1,2,2]
 465 ; AVX2-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3]
 466 ; AVX2-NEXT:    vorps %xmm1, %xmm0, %xmm0
 467 ; AVX2-NEXT:    retq
 468 ;
 469 ; AVX512-LABEL: test19:
 470 ; AVX512:       # %bb.0:
 471 ; AVX512-NEXT:    vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0,1,2,3],zero,zero,zero,zero,xmm0[12,13,14,15]
 472 ; AVX512-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[0,1,2,3],zero,zero,zero,zero,xmm1[8,9,10,11,8,9,10,11]
 473 ; AVX512-NEXT:    vpor %xmm1, %xmm0, %xmm0
 474 ; AVX512-NEXT:    retq
 475   %shuf1 = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32><i32 4, i32 0, i32 4, i32 3>
 476   %shuf2 = shufflevector <4 x i32> %b, <4 x i32> zeroinitializer, <4 x i32><i32 0, i32 4, i32 2, i32 2>
 477   %or = or <4 x i32> %shuf1, %shuf2
 478   ret <4 x i32> %or
 479 }
 480
 481
 482 define <2 x i64> @test20(<2 x i64> %a, <2 x i64> %b) {
 483 ; SSE-LABEL: test20:
 484 ; SSE:       # %bb.0:
 485 ; SSE-NEXT:    por %xmm1, %xmm0
 486 ; SSE-NEXT:    movq {{.*#+}} xmm0 = xmm0[0],zero
 487 ; SSE-NEXT:    retq
 488 ;
 489 ; AVX-LABEL: test20:
 490 ; AVX:       # %bb.0:
 491 ; AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
 492 ; AVX-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
 493 ; AVX-NEXT:    retq
 494   %shuf1 = shufflevector <2 x i64> %a, <2 x i64> zeroinitializer, <2 x i32><i32 0, i32 2>
 495   %shuf2 = shufflevector <2 x i64> %b, <2 x i64> zeroinitializer, <2 x i32><i32 0, i32 2>
 496   %or = or <2 x i64> %shuf1, %shuf2
 497   ret <2 x i64> %or
 498 }
 499
 500
 501 define <2 x i64> @test21(<2 x i64> %a, <2 x i64> %b) {
 502 ; SSE-LABEL: test21:
 503 ; SSE:       # %bb.0:
 504 ; SSE-NEXT:    por %xmm1, %xmm0
 505 ; SSE-NEXT:    pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7]
 506 ; SSE-NEXT:    retq
 507 ;
 508 ; AVX1-LABEL: test21:
 509 ; AVX1:       # %bb.0:
 510 ; AVX1-NEXT:    vpor %xmm1, %xmm0, %xmm0
 511 ; AVX1-NEXT:    vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7]
 512 ; AVX1-NEXT:    retq
 513 ;
 514 ; AVX2-LABEL: test21:
 515 ; AVX2:       # %bb.0:
 516 ; AVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
 517 ; AVX2-NEXT:    vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7]
 518 ; AVX2-NEXT:    retq
 519 ;
 520 ; AVX512-LABEL: test21:
 521 ; AVX512:       # %bb.0:
 522 ; AVX512-NEXT:    vorpd %xmm1, %xmm0, %xmm0
 523 ; AVX512-NEXT:    vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7]
 524 ; AVX512-NEXT:    retq
 525   %shuf1 = shufflevector <2 x i64> %a, <2 x i64> zeroinitializer, <2 x i32><i32 2, i32 0>
 526   %shuf2 = shufflevector <2 x i64> %b, <2 x i64> zeroinitializer, <2 x i32><i32 2, i32 0>
 527   %or = or <2 x i64> %shuf1, %shuf2
 528   ret <2 x i64> %or
 529 }
 530
 531
 532 ; Verify that the dag-combiner keeps the correct domain for float/double vectors
 533 ; bitcast to use the mask-or blend combine.
 534
 535 define <2 x double> @test22(<2 x double> %a0, <2 x double> %a1) {
 536 ; SSE2-LABEL: test22:
 537 ; SSE2:       # %bb.0:
 538 ; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
 539 ; SSE2-NEXT:    retq
 540 ;
 541 ; SSE4-LABEL: test22:
 542 ; SSE4:       # %bb.0:
 543 ; SSE4-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
 544 ; SSE4-NEXT:    retq
 545 ;
 546 ; AVX-LABEL: test22:
 547 ; AVX:       # %bb.0:
 548 ; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
 549 ; AVX-NEXT:    retq
 550   %bc1 = bitcast <2 x double> %a0 to <2 x i64>
 551   %bc2 = bitcast <2 x double> %a1 to <2 x i64>
 552   %and1 = and <2 x i64> %bc1, <i64 0, i64 -1>
 553   %and2 = and <2 x i64> %bc2, <i64 -1, i64 0>
 554   %or = or <2 x i64> %and1, %and2
 555   %bc3 = bitcast <2 x i64> %or to <2 x double>
 556   ret <2 x double> %bc3
 557 }
 558
 559
 560 define <4 x float> @test23(<4 x float> %a0, <4 x float> %a1) {
 561 ; SSE2-LABEL: test23:
 562 ; SSE2:       # %bb.0:
 563 ; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,2],xmm1[0,3]
 564 ; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0,1,3]
 565 ; SSE2-NEXT:    retq
 566 ;
 567 ; SSE4-LABEL: test23:
 568 ; SSE4:       # %bb.0:
 569 ; SSE4-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3]
 570 ; SSE4-NEXT:    retq
 571 ;
 572 ; AVX-LABEL: test23:
 573 ; AVX:       # %bb.0:
 574 ; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3]
 575 ; AVX-NEXT:    retq
 576   %bc1 = bitcast <4 x float> %a0 to <4 x i32>
 577   %bc2 = bitcast <4 x float> %a1 to <4 x i32>
 578   %and1 = and <4 x i32> %bc1, <i32 0, i32 -1, i32 -1, i32 0>
 579   %and2 = and <4 x i32> %bc2, <i32 -1, i32 0, i32 0, i32 -1>
 580   %or = or <4 x i32> %and1, %and2
 581   %bc3 = bitcast <4 x i32> %or to <4 x float>
 582   ret <4 x float> %bc3
 583 }
 584
 585
 586 define <4 x float> @test24(<4 x float> %a0, <4 x float> %a1) {
 587 ; SSE2-LABEL: test24:
 588 ; SSE2:       # %bb.0:
 589 ; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
 590 ; SSE2-NEXT:    retq
 591 ;
 592 ; SSE4-LABEL: test24:
 593 ; SSE4:       # %bb.0:
 594 ; SSE4-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
 595 ; SSE4-NEXT:    retq
 596 ;
 597 ; AVX-LABEL: test24:
 598 ; AVX:       # %bb.0:
 599 ; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
 600 ; AVX-NEXT:    retq
 601   %bc1 = bitcast <4 x float> %a0 to <2 x i64>
 602   %bc2 = bitcast <4 x float> %a1 to <2 x i64>
 603   %and1 = and <2 x i64> %bc1, <i64 0, i64 -1>
 604   %and2 = and <2 x i64> %bc2, <i64 -1, i64 0>
 605   %or = or <2 x i64> %and1, %and2
 606   %bc3 = bitcast <2 x i64> %or to <4 x float>
 607   ret <4 x float> %bc3
 608 }
 609
 610
 611 define <4 x float> @test25(<4 x float> %a0) {
 612 ; SSE2-LABEL: test25:
 613 ; SSE2:       # %bb.0:
 614 ; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,2],mem[0,3]
 615 ; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0,1,3]
 616 ; SSE2-NEXT:    retq
 617 ;
 618 ; SSE4-LABEL: test25:
 619 ; SSE4:       # %bb.0:
 620 ; SSE4-NEXT:    blendps {{.*#+}} xmm0 = mem[0],xmm0[1,2],mem[3]
 621 ; SSE4-NEXT:    retq
 622 ;
 623 ; AVX1-LABEL: test25:
 624 ; AVX1:       # %bb.0:
 625 ; AVX1-NEXT:    vblendps {{.*#+}} xmm0 = mem[0],xmm0[1,2],mem[3]
 626 ; AVX1-NEXT:    retq
 627 ;
 628 ; AVX2-LABEL: test25:
 629 ; AVX2:       # %bb.0:
 630 ; AVX2-NEXT:    vbroadcastss {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 631 ; AVX2-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3]
 632 ; AVX2-NEXT:    retq
 633 ;
 634 ; AVX512-LABEL: test25:
 635 ; AVX512:       # %bb.0:
 636 ; AVX512-NEXT:    vbroadcastss {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 637 ; AVX512-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3]
 638 ; AVX512-NEXT:    retq
 639   %bc1 = bitcast <4 x float> %a0 to <4 x i32>
 640   %bc2 = bitcast <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0> to <4 x i32>
 641   %and1 = and <4 x i32> %bc1, <i32 0, i32 -1, i32 -1, i32 0>
 642   %and2 = and <4 x i32> %bc2, <i32 -1, i32 0, i32 0, i32 -1>
 643   %or = or <4 x i32> %and1, %and2
 644   %bc3 = bitcast <4 x i32> %or to <4 x float>
 645   ret <4 x float> %bc3
 646 }
 647
 648
 649 ; Verify that the DAGCombiner doesn't crash in the attempt to check if a shuffle
 650 ; with illegal type has a legal mask. Method 'isShuffleMaskLegal' only knows how to
 651 ; handle legal vector value types.
 652 define <4 x i8> @test_crash(<4 x i8> %a, <4 x i8> %b) {
 653 ; SSE2-LABEL: test_crash:
 654 ; SSE2:       # %bb.0:
 655 ; SSE2-NEXT:    movaps {{.*#+}} xmm2 = [65535,0,65535,65535,65535,65535,65535,65535]
 656 ; SSE2-NEXT:    andps %xmm2, %xmm1
 657 ; SSE2-NEXT:    andnps %xmm0, %xmm2
 658 ; SSE2-NEXT:    orps %xmm1, %xmm2
 659 ; SSE2-NEXT:    movaps %xmm2, %xmm0
 660 ; SSE2-NEXT:    retq
 661 ;
 662 ; SSE4-LABEL: test_crash:
 663 ; SSE4:       # %bb.0:
 664 ; SSE4-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3,4,5,6,7]
 665 ; SSE4-NEXT:    retq
 666 ;
 667 ; AVX-LABEL: test_crash:
 668 ; AVX:       # %bb.0:
 669 ; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3,4,5,6,7]
 670 ; AVX-NEXT:    retq
 671   %shuf1 = shufflevector <4 x i8> %a, <4 x i8> zeroinitializer, <4 x i32><i32 4, i32 4, i32 2, i32 3>
 672   %shuf2 = shufflevector <4 x i8> %b, <4 x i8> zeroinitializer, <4 x i32><i32 0, i32 1, i32 4, i32 4>
 673   %or = or <4 x i8> %shuf1, %shuf2
 674   ret <4 x i8> %or
 675 }
 676
 677 ; Verify that we can fold regardless of which operand is the zeroinitializer
 678
 679 define <4 x i32> @test2b(<4 x i32> %a, <4 x i32> %b) {
 680 ; SSE2-LABEL: test2b:
 681 ; SSE2:       # %bb.0:
 682 ; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
 683 ; SSE2-NEXT:    retq
 684 ;
 685 ; SSE4-LABEL: test2b:
 686 ; SSE4:       # %bb.0:
 687 ; SSE4-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
 688 ; SSE4-NEXT:    retq
 689 ;
 690 ; AVX-LABEL: test2b:
 691 ; AVX:       # %bb.0:
 692 ; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
 693 ; AVX-NEXT:    retq
 694   %shuf1 = shufflevector <4 x i32> zeroinitializer, <4 x i32> %a, <4 x i32><i32 0, i32 0, i32 6, i32 7>
 695   %shuf2 = shufflevector <4 x i32> %b, <4 x i32> zeroinitializer, <4 x i32><i32 0, i32 1, i32 4, i32 4>
 696   %or = or <4 x i32> %shuf1, %shuf2
 697   ret <4 x i32> %or
 698 }
 699
 700 define <4 x i32> @test2c(<4 x i32> %a, <4 x i32> %b) {
 701 ; SSE2-LABEL: test2c:
 702 ; SSE2:       # %bb.0:
 703 ; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
 704 ; SSE2-NEXT:    retq
 705 ;
 706 ; SSE4-LABEL: test2c:
 707 ; SSE4:       # %bb.0:
 708 ; SSE4-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
 709 ; SSE4-NEXT:    retq
 710 ;
 711 ; AVX-LABEL: test2c:
 712 ; AVX:       # %bb.0:
 713 ; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
 714 ; AVX-NEXT:    retq
 715   %shuf1 = shufflevector <4 x i32> zeroinitializer, <4 x i32> %a, <4 x i32><i32 0, i32 0, i32 6, i32 7>
 716   %shuf2 = shufflevector <4 x i32> zeroinitializer, <4 x i32> %b, <4 x i32><i32 4, i32 5, i32 0, i32 0>
 717   %or = or <4 x i32> %shuf1, %shuf2
 718   ret <4 x i32> %or
 719 }
 720
 721
 722 define <4 x i32> @test2d(<4 x i32> %a, <4 x i32> %b) {
 723 ; SSE2-LABEL: test2d:
 724 ; SSE2:       # %bb.0:
 725 ; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
 726 ; SSE2-NEXT:    retq
 727 ;
 728 ; SSE4-LABEL: test2d:
 729 ; SSE4:       # %bb.0:
 730 ; SSE4-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
 731 ; SSE4-NEXT:    retq
 732 ;
 733 ; AVX-LABEL: test2d:
 734 ; AVX:       # %bb.0:
 735 ; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
 736 ; AVX-NEXT:    retq
 737   %shuf1 = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32><i32 4, i32 4, i32 2, i32 3>
 738   %shuf2 = shufflevector <4 x i32> zeroinitializer, <4 x i32> %b, <4 x i32><i32 4, i32 5, i32 0, i32 0>
 739   %or = or <4 x i32> %shuf1, %shuf2
 740   ret <4 x i32> %or
 741 }
 742
 743 ; Make sure we can have an undef where an index pointing to the zero vector should be
 744
 745 define <4 x i32> @test2e(<4 x i32> %a, <4 x i32> %b) {
 746 ; SSE2-LABEL: test2e:
 747 ; SSE2:       # %bb.0:
 748 ; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
 749 ; SSE2-NEXT:    retq
 750 ;
 751 ; SSE4-LABEL: test2e:
 752 ; SSE4:       # %bb.0:
 753 ; SSE4-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
 754 ; SSE4-NEXT:    retq
 755 ;
 756 ; AVX-LABEL: test2e:
 757 ; AVX:       # %bb.0:
 758 ; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
 759 ; AVX-NEXT:    retq
 760   %shuf1 = shufflevector <4 x i32> %a, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>, <4 x i32><i32 undef, i32 4, i32 2, i32 3>
 761   %shuf2 = shufflevector <4 x i32> %b, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>, <4 x i32><i32 0, i32 1, i32 4, i32 4>
 762   %or = or <4 x i32> %shuf1, %shuf2
 763   ret <4 x i32> %or
 764 }
 765
 766 define <4 x i32> @test2f(<4 x i32> %a, <4 x i32> %b) {
 767 ; SSE2-LABEL: test2f:
 768 ; SSE2:       # %bb.0:
 769 ; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
 770 ; SSE2-NEXT:    retq
 771 ;
 772 ; SSE4-LABEL: test2f:
 773 ; SSE4:       # %bb.0:
 774 ; SSE4-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
 775 ; SSE4-NEXT:    retq
 776 ;
 777 ; AVX-LABEL: test2f:
 778 ; AVX:       # %bb.0:
 779 ; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
 780 ; AVX-NEXT:    retq
 781   %shuf1 = shufflevector <4 x i32> %a, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>, <4 x i32><i32 4, i32 4, i32 2, i32 3>
 782   %shuf2 = shufflevector <4 x i32> %b, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>, <4 x i32><i32 undef, i32 1, i32 4, i32 4>
 783   %or = or <4 x i32> %shuf1, %shuf2
 784   ret <4 x i32> %or
 785 }
 786
 787 ; (or (and X, c1), c2) -> (and (or X, c2), c1|c2) iff (c1 & c2) != 0
 788
 789 define <2 x i64> @or_and_v2i64(<2 x i64> %a0) {
 790 ; SSE-LABEL: or_and_v2i64:
 791 ; SSE:       # %bb.0:
 792 ; SSE-NEXT:    orps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 793 ; SSE-NEXT:    andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 794 ; SSE-NEXT:    retq
 795 ;
 796 ; AVX1-LABEL: or_and_v2i64:
 797 ; AVX1:       # %bb.0:
 798 ; AVX1-NEXT:    vorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
 799 ; AVX1-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
 800 ; AVX1-NEXT:    retq
 801 ;
 802 ; AVX2-LABEL: or_and_v2i64:
 803 ; AVX2:       # %bb.0:
 804 ; AVX2-NEXT:    vorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
 805 ; AVX2-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
 806 ; AVX2-NEXT:    retq
 807 ;
 808 ; AVX512-LABEL: or_and_v2i64:
 809 ; AVX512:       # %bb.0:
 810 ; AVX512-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [7,7]
 811 ; AVX512-NEXT:    vpternlogq $200, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm1, %xmm0
 812 ; AVX512-NEXT:    retq
 813   %1 = and <2 x i64> %a0, <i64 7, i64 7>
 814   %2 = or <2 x i64> %1, <i64 3, i64 3>
 815   ret <2 x i64> %2
 816 }
 817
 818 define <4 x i32> @or_and_v4i32(<4 x i32> %a0) {
 819 ; SSE-LABEL: or_and_v4i32:
 820 ; SSE:       # %bb.0:
 821 ; SSE-NEXT:    orps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 822 ; SSE-NEXT:    andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 823 ; SSE-NEXT:    retq
 824 ;
 825 ; AVX1-LABEL: or_and_v4i32:
 826 ; AVX1:       # %bb.0:
 827 ; AVX1-NEXT:    vorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
 828 ; AVX1-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
 829 ; AVX1-NEXT:    retq
 830 ;
 831 ; AVX2-LABEL: or_and_v4i32:
 832 ; AVX2:       # %bb.0:
 833 ; AVX2-NEXT:    vorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
 834 ; AVX2-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
 835 ; AVX2-NEXT:    retq
 836 ;
 837 ; AVX512-LABEL: or_and_v4i32:
 838 ; AVX512:       # %bb.0:
 839 ; AVX512-NEXT:    vpmovsxbd {{.*#+}} xmm1 = [3,3,15,7]
 840 ; AVX512-NEXT:    vpternlogd $200, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm0
 841 ; AVX512-NEXT:    retq
 842   %1 = and <4 x i32> %a0, <i32 1, i32 3, i32 5, i32 7>
 843   %2 = or <4 x i32> %1, <i32 3, i32 2, i32 15, i32 2>
 844   ret <4 x i32> %2
 845 }
 846
 847 ; If all masked bits are going to be set, that's a constant fold.
 848
 849 define <4 x i32> @or_and_v4i32_fold(<4 x i32> %a0) {
 850 ; SSE-LABEL: or_and_v4i32_fold:
 851 ; SSE:       # %bb.0:
 852 ; SSE-NEXT:    movaps {{.*#+}} xmm0 = [3,3,3,3]
 853 ; SSE-NEXT:    retq
 854 ;
 855 ; AVX-LABEL: or_and_v4i32_fold:
 856 ; AVX:       # %bb.0:
 857 ; AVX-NEXT:    vbroadcastss {{.*#+}} xmm0 = [3,3,3,3]
 858 ; AVX-NEXT:    retq
 859   %1 = and <4 x i32> %a0, <i32 1, i32 1, i32 1, i32 1>
 860   %2 = or <4 x i32> %1, <i32 3, i32 3, i32 3, i32 3>
 861   ret <4 x i32> %2
 862 }