test/Transforms/SLPVectorizer/X86/ctpop.ll

   1 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
   2 ; RUN: opt < %s -mtriple=x86_64-unknown -mattr=+sse2 -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=SSE --check-prefix=SSE2
   3 ; RUN: opt < %s -mtriple=x86_64-unknown -mattr=+sse4.2,+popcnt -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=SSE --check-prefix=SSE42
   4 ; RUN: opt < %s -mtriple=x86_64-unknown -mattr=+avx,+popcnt -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX1
   5 ; RUN: opt < %s -mtriple=x86_64-unknown -mattr=+avx2,+popcnt -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX2
   6
   7 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
   8
   9 @src64 = common global [4 x i64] zeroinitializer, align 32
  10 @dst64 = common global [4 x i64] zeroinitializer, align 32
  11 @src32 = common global [8 x i32] zeroinitializer, align 32
  12 @dst32 = common global [8 x i32] zeroinitializer, align 32
  13 @src16 = common global [16 x i16] zeroinitializer, align 32
  14 @dst16 = common global [16 x i16] zeroinitializer, align 32
  15 @src8  = common global [32 x i8] zeroinitializer, align 32
  16 @dst8  = common global [32 x i8] zeroinitializer, align 32
  17
  18 declare i64 @llvm.ctpop.i64(i64)
  19 declare i32 @llvm.ctpop.i32(i32)
  20 declare i16 @llvm.ctpop.i16(i16)
  21 declare  i8 @llvm.ctpop.i8(i8)
  22
  23 define void @ctpop_2i64() #0 {
  24 ; CHECK-LABEL: @ctpop_2i64(
  25 ; CHECK-NEXT:    [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i32 0, i64 0), align 8
  26 ; CHECK-NEXT:    [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i32 0, i64 1), align 8
  27 ; CHECK-NEXT:    [[CTPOP0:%.*]] = call i64 @llvm.ctpop.i64(i64 [[LD0]])
  28 ; CHECK-NEXT:    [[CTPOP1:%.*]] = call i64 @llvm.ctpop.i64(i64 [[LD1]])
  29 ; CHECK-NEXT:    store i64 [[CTPOP0]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i32 0, i64 0), align 8
  30 ; CHECK-NEXT:    store i64 [[CTPOP1]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i32 0, i64 1), align 8
  31 ; CHECK-NEXT:    ret void
  32 ;
  33   %ld0 = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i32 0, i64 0), align 8
  34   %ld1 = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i32 0, i64 1), align 8
  35   %ctpop0 = call i64 @llvm.ctpop.i64(i64 %ld0)
  36   %ctpop1 = call i64 @llvm.ctpop.i64(i64 %ld1)
  37   store i64 %ctpop0, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i32 0, i64 0), align 8
  38   store i64 %ctpop1, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i32 0, i64 1), align 8
  39   ret void
  40 }
  41
  42 define void @ctpop_4i64() #0 {
  43 ; SSE-LABEL: @ctpop_4i64(
  44 ; SSE-NEXT:    [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 0), align 4
  45 ; SSE-NEXT:    [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 1), align 4
  46 ; SSE-NEXT:    [[LD2:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 2), align 4
  47 ; SSE-NEXT:    [[LD3:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 3), align 4
  48 ; SSE-NEXT:    [[CTPOP0:%.*]] = call i64 @llvm.ctpop.i64(i64 [[LD0]])
  49 ; SSE-NEXT:    [[CTPOP1:%.*]] = call i64 @llvm.ctpop.i64(i64 [[LD1]])
  50 ; SSE-NEXT:    [[CTPOP2:%.*]] = call i64 @llvm.ctpop.i64(i64 [[LD2]])
  51 ; SSE-NEXT:    [[CTPOP3:%.*]] = call i64 @llvm.ctpop.i64(i64 [[LD3]])
  52 ; SSE-NEXT:    store i64 [[CTPOP0]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 0), align 4
  53 ; SSE-NEXT:    store i64 [[CTPOP1]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 1), align 4
  54 ; SSE-NEXT:    store i64 [[CTPOP2]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 2), align 4
  55 ; SSE-NEXT:    store i64 [[CTPOP3]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 3), align 4
  56 ; SSE-NEXT:    ret void
  57 ;
  58 ; AVX1-LABEL: @ctpop_4i64(
  59 ; AVX1-NEXT:    [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 0), align 4
  60 ; AVX1-NEXT:    [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 1), align 4
  61 ; AVX1-NEXT:    [[LD2:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 2), align 4
  62 ; AVX1-NEXT:    [[LD3:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 3), align 4
  63 ; AVX1-NEXT:    [[CTPOP0:%.*]] = call i64 @llvm.ctpop.i64(i64 [[LD0]])
  64 ; AVX1-NEXT:    [[CTPOP1:%.*]] = call i64 @llvm.ctpop.i64(i64 [[LD1]])
  65 ; AVX1-NEXT:    [[CTPOP2:%.*]] = call i64 @llvm.ctpop.i64(i64 [[LD2]])
  66 ; AVX1-NEXT:    [[CTPOP3:%.*]] = call i64 @llvm.ctpop.i64(i64 [[LD3]])
  67 ; AVX1-NEXT:    store i64 [[CTPOP0]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 0), align 4
  68 ; AVX1-NEXT:    store i64 [[CTPOP1]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 1), align 4
  69 ; AVX1-NEXT:    store i64 [[CTPOP2]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 2), align 4
  70 ; AVX1-NEXT:    store i64 [[CTPOP3]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 3), align 4
  71 ; AVX1-NEXT:    ret void
  72 ;
  73 ; AVX2-LABEL: @ctpop_4i64(
  74 ; AVX2-NEXT:    [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([4 x i64]* @src64 to <4 x i64>*), align 4
  75 ; AVX2-NEXT:    [[TMP2:%.*]] = call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> [[TMP1]])
  76 ; AVX2-NEXT:    store <4 x i64> [[TMP2]], <4 x i64>* bitcast ([4 x i64]* @dst64 to <4 x i64>*), align 4
  77 ; AVX2-NEXT:    ret void
  78 ;
  79   %ld0 = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 0), align 4
  80   %ld1 = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 1), align 4
  81   %ld2 = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 2), align 4
  82   %ld3 = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 3), align 4
  83   %ctpop0 = call i64 @llvm.ctpop.i64(i64 %ld0)
  84   %ctpop1 = call i64 @llvm.ctpop.i64(i64 %ld1)
  85   %ctpop2 = call i64 @llvm.ctpop.i64(i64 %ld2)
  86   %ctpop3 = call i64 @llvm.ctpop.i64(i64 %ld3)
  87   store i64 %ctpop0, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 0), align 4
  88   store i64 %ctpop1, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 1), align 4
  89   store i64 %ctpop2, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 2), align 4
  90   store i64 %ctpop3, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 3), align 4
  91   ret void
  92 }
  93
  94 define void @ctpop_4i32() #0 {
  95 ; SSE2-LABEL: @ctpop_4i32(
  96 ; SSE2-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([8 x i32]* @src32 to <4 x i32>*), align 4
  97 ; SSE2-NEXT:    [[TMP2:%.*]] = call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> [[TMP1]])
  98 ; SSE2-NEXT:    store <4 x i32> [[TMP2]], <4 x i32>* bitcast ([8 x i32]* @dst32 to <4 x i32>*), align 4
  99 ; SSE2-NEXT:    ret void
 100 ;
 101 ; SSE42-LABEL: @ctpop_4i32(
 102 ; SSE42-NEXT:    [[LD0:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 0), align 4
 103 ; SSE42-NEXT:    [[LD1:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 1), align 4
 104 ; SSE42-NEXT:    [[LD2:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 2), align 4
 105 ; SSE42-NEXT:    [[LD3:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 3), align 4
 106 ; SSE42-NEXT:    [[CTPOP0:%.*]] = call i32 @llvm.ctpop.i32(i32 [[LD0]])
 107 ; SSE42-NEXT:    [[CTPOP1:%.*]] = call i32 @llvm.ctpop.i32(i32 [[LD1]])
 108 ; SSE42-NEXT:    [[CTPOP2:%.*]] = call i32 @llvm.ctpop.i32(i32 [[LD2]])
 109 ; SSE42-NEXT:    [[CTPOP3:%.*]] = call i32 @llvm.ctpop.i32(i32 [[LD3]])
 110 ; SSE42-NEXT:    store i32 [[CTPOP0]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 0), align 4
 111 ; SSE42-NEXT:    store i32 [[CTPOP1]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 1), align 4
 112 ; SSE42-NEXT:    store i32 [[CTPOP2]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 2), align 4
 113 ; SSE42-NEXT:    store i32 [[CTPOP3]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 3), align 4
 114 ; SSE42-NEXT:    ret void
 115 ;
 116 ; AVX-LABEL: @ctpop_4i32(
 117 ; AVX-NEXT:    [[LD0:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 0), align 4
 118 ; AVX-NEXT:    [[LD1:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 1), align 4
 119 ; AVX-NEXT:    [[LD2:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 2), align 4
 120 ; AVX-NEXT:    [[LD3:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 3), align 4
 121 ; AVX-NEXT:    [[CTPOP0:%.*]] = call i32 @llvm.ctpop.i32(i32 [[LD0]])
 122 ; AVX-NEXT:    [[CTPOP1:%.*]] = call i32 @llvm.ctpop.i32(i32 [[LD1]])
 123 ; AVX-NEXT:    [[CTPOP2:%.*]] = call i32 @llvm.ctpop.i32(i32 [[LD2]])
 124 ; AVX-NEXT:    [[CTPOP3:%.*]] = call i32 @llvm.ctpop.i32(i32 [[LD3]])
 125 ; AVX-NEXT:    store i32 [[CTPOP0]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 0), align 4
 126 ; AVX-NEXT:    store i32 [[CTPOP1]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 1), align 4
 127 ; AVX-NEXT:    store i32 [[CTPOP2]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 2), align 4
 128 ; AVX-NEXT:    store i32 [[CTPOP3]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 3), align 4
 129 ; AVX-NEXT:    ret void
 130 ;
 131   %ld0 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 0), align 4
 132   %ld1 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 1), align 4
 133   %ld2 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 2), align 4
 134   %ld3 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 3), align 4
 135   %ctpop0 = call i32 @llvm.ctpop.i32(i32 %ld0)
 136   %ctpop1 = call i32 @llvm.ctpop.i32(i32 %ld1)
 137   %ctpop2 = call i32 @llvm.ctpop.i32(i32 %ld2)
 138   %ctpop3 = call i32 @llvm.ctpop.i32(i32 %ld3)
 139   store i32 %ctpop0, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 0), align 4
 140   store i32 %ctpop1, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 1), align 4
 141   store i32 %ctpop2, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 2), align 4
 142   store i32 %ctpop3, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 3), align 4
 143   ret void
 144 }
 145
 146 define void @ctpop_8i32() #0 {
 147 ; SSE2-LABEL: @ctpop_8i32(
 148 ; SSE2-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([8 x i32]* @src32 to <4 x i32>*), align 2
 149 ; SSE2-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 4) to <4 x i32>*), align 2
 150 ; SSE2-NEXT:    [[TMP3:%.*]] = call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> [[TMP1]])
 151 ; SSE2-NEXT:    [[TMP4:%.*]] = call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> [[TMP2]])
 152 ; SSE2-NEXT:    store <4 x i32> [[TMP3]], <4 x i32>* bitcast ([8 x i32]* @dst32 to <4 x i32>*), align 2
 153 ; SSE2-NEXT:    store <4 x i32> [[TMP4]], <4 x i32>* bitcast (i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 4) to <4 x i32>*), align 2
 154 ; SSE2-NEXT:    ret void
 155 ;
 156 ; SSE42-LABEL: @ctpop_8i32(
 157 ; SSE42-NEXT:    [[LD0:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 0), align 2
 158 ; SSE42-NEXT:    [[LD1:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 1), align 2
 159 ; SSE42-NEXT:    [[LD2:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 2), align 2
 160 ; SSE42-NEXT:    [[LD3:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 3), align 2
 161 ; SSE42-NEXT:    [[LD4:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 4), align 2
 162 ; SSE42-NEXT:    [[LD5:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 5), align 2
 163 ; SSE42-NEXT:    [[LD6:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 6), align 2
 164 ; SSE42-NEXT:    [[LD7:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 7), align 2
 165 ; SSE42-NEXT:    [[CTPOP0:%.*]] = call i32 @llvm.ctpop.i32(i32 [[LD0]])
 166 ; SSE42-NEXT:    [[CTPOP1:%.*]] = call i32 @llvm.ctpop.i32(i32 [[LD1]])
 167 ; SSE42-NEXT:    [[CTPOP2:%.*]] = call i32 @llvm.ctpop.i32(i32 [[LD2]])
 168 ; SSE42-NEXT:    [[CTPOP3:%.*]] = call i32 @llvm.ctpop.i32(i32 [[LD3]])
 169 ; SSE42-NEXT:    [[CTPOP4:%.*]] = call i32 @llvm.ctpop.i32(i32 [[LD4]])
 170 ; SSE42-NEXT:    [[CTPOP5:%.*]] = call i32 @llvm.ctpop.i32(i32 [[LD5]])
 171 ; SSE42-NEXT:    [[CTPOP6:%.*]] = call i32 @llvm.ctpop.i32(i32 [[LD6]])
 172 ; SSE42-NEXT:    [[CTPOP7:%.*]] = call i32 @llvm.ctpop.i32(i32 [[LD7]])
 173 ; SSE42-NEXT:    store i32 [[CTPOP0]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 0), align 2
 174 ; SSE42-NEXT:    store i32 [[CTPOP1]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 1), align 2
 175 ; SSE42-NEXT:    store i32 [[CTPOP2]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 2), align 2
 176 ; SSE42-NEXT:    store i32 [[CTPOP3]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 3), align 2
 177 ; SSE42-NEXT:    store i32 [[CTPOP4]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 4), align 2
 178 ; SSE42-NEXT:    store i32 [[CTPOP5]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 5), align 2
 179 ; SSE42-NEXT:    store i32 [[CTPOP6]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 6), align 2
 180 ; SSE42-NEXT:    store i32 [[CTPOP7]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 7), align 2
 181 ; SSE42-NEXT:    ret void
 182 ;
 183 ; AVX1-LABEL: @ctpop_8i32(
 184 ; AVX1-NEXT:    [[LD0:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 0), align 2
 185 ; AVX1-NEXT:    [[LD1:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 1), align 2
 186 ; AVX1-NEXT:    [[LD2:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 2), align 2
 187 ; AVX1-NEXT:    [[LD3:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 3), align 2
 188 ; AVX1-NEXT:    [[LD4:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 4), align 2
 189 ; AVX1-NEXT:    [[LD5:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 5), align 2
 190 ; AVX1-NEXT:    [[LD6:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 6), align 2
 191 ; AVX1-NEXT:    [[LD7:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 7), align 2
 192 ; AVX1-NEXT:    [[CTPOP0:%.*]] = call i32 @llvm.ctpop.i32(i32 [[LD0]])
 193 ; AVX1-NEXT:    [[CTPOP1:%.*]] = call i32 @llvm.ctpop.i32(i32 [[LD1]])
 194 ; AVX1-NEXT:    [[CTPOP2:%.*]] = call i32 @llvm.ctpop.i32(i32 [[LD2]])
 195 ; AVX1-NEXT:    [[CTPOP3:%.*]] = call i32 @llvm.ctpop.i32(i32 [[LD3]])
 196 ; AVX1-NEXT:    [[CTPOP4:%.*]] = call i32 @llvm.ctpop.i32(i32 [[LD4]])
 197 ; AVX1-NEXT:    [[CTPOP5:%.*]] = call i32 @llvm.ctpop.i32(i32 [[LD5]])
 198 ; AVX1-NEXT:    [[CTPOP6:%.*]] = call i32 @llvm.ctpop.i32(i32 [[LD6]])
 199 ; AVX1-NEXT:    [[CTPOP7:%.*]] = call i32 @llvm.ctpop.i32(i32 [[LD7]])
 200 ; AVX1-NEXT:    store i32 [[CTPOP0]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 0), align 2
 201 ; AVX1-NEXT:    store i32 [[CTPOP1]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 1), align 2
 202 ; AVX1-NEXT:    store i32 [[CTPOP2]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 2), align 2
 203 ; AVX1-NEXT:    store i32 [[CTPOP3]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 3), align 2
 204 ; AVX1-NEXT:    store i32 [[CTPOP4]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 4), align 2
 205 ; AVX1-NEXT:    store i32 [[CTPOP5]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 5), align 2
 206 ; AVX1-NEXT:    store i32 [[CTPOP6]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 6), align 2
 207 ; AVX1-NEXT:    store i32 [[CTPOP7]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 7), align 2
 208 ; AVX1-NEXT:    ret void
 209 ;
 210 ; AVX2-LABEL: @ctpop_8i32(
 211 ; AVX2-NEXT:    [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([8 x i32]* @src32 to <8 x i32>*), align 2
 212 ; AVX2-NEXT:    [[TMP2:%.*]] = call <8 x i32> @llvm.ctpop.v8i32(<8 x i32> [[TMP1]])
 213 ; AVX2-NEXT:    store <8 x i32> [[TMP2]], <8 x i32>* bitcast ([8 x i32]* @dst32 to <8 x i32>*), align 2
 214 ; AVX2-NEXT:    ret void
 215 ;
 216   %ld0 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 0), align 2
 217   %ld1 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 1), align 2
 218   %ld2 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 2), align 2
 219   %ld3 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 3), align 2
 220   %ld4 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 4), align 2
 221   %ld5 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 5), align 2
 222   %ld6 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 6), align 2
 223   %ld7 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 7), align 2
 224   %ctpop0 = call i32 @llvm.ctpop.i32(i32 %ld0)
 225   %ctpop1 = call i32 @llvm.ctpop.i32(i32 %ld1)
 226   %ctpop2 = call i32 @llvm.ctpop.i32(i32 %ld2)
 227   %ctpop3 = call i32 @llvm.ctpop.i32(i32 %ld3)
 228   %ctpop4 = call i32 @llvm.ctpop.i32(i32 %ld4)
 229   %ctpop5 = call i32 @llvm.ctpop.i32(i32 %ld5)
 230   %ctpop6 = call i32 @llvm.ctpop.i32(i32 %ld6)
 231   %ctpop7 = call i32 @llvm.ctpop.i32(i32 %ld7)
 232   store i32 %ctpop0, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 0), align 2
 233   store i32 %ctpop1, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 1), align 2
 234   store i32 %ctpop2, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 2), align 2
 235   store i32 %ctpop3, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 3), align 2
 236   store i32 %ctpop4, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 4), align 2
 237   store i32 %ctpop5, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 5), align 2
 238   store i32 %ctpop6, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 6), align 2
 239   store i32 %ctpop7, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 7), align 2
 240   ret void
 241 }
 242
 243 define void @ctpop_8i16() #0 {
 244 ; CHECK-LABEL: @ctpop_8i16(
 245 ; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([16 x i16]* @src16 to <8 x i16>*), align 2
 246 ; CHECK-NEXT:    [[TMP2:%.*]] = call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> [[TMP1]])
 247 ; CHECK-NEXT:    store <8 x i16> [[TMP2]], <8 x i16>* bitcast ([16 x i16]* @dst16 to <8 x i16>*), align 2
 248 ; CHECK-NEXT:    ret void
 249 ;
 250   %ld0 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 0), align 2
 251   %ld1 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 1), align 2
 252   %ld2 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 2), align 2
 253   %ld3 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 3), align 2
 254   %ld4 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 4), align 2
 255   %ld5 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 5), align 2
 256   %ld6 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 6), align 2
 257   %ld7 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 7), align 2
 258   %ctpop0 = call i16 @llvm.ctpop.i16(i16 %ld0)
 259   %ctpop1 = call i16 @llvm.ctpop.i16(i16 %ld1)
 260   %ctpop2 = call i16 @llvm.ctpop.i16(i16 %ld2)
 261   %ctpop3 = call i16 @llvm.ctpop.i16(i16 %ld3)
 262   %ctpop4 = call i16 @llvm.ctpop.i16(i16 %ld4)
 263   %ctpop5 = call i16 @llvm.ctpop.i16(i16 %ld5)
 264   %ctpop6 = call i16 @llvm.ctpop.i16(i16 %ld6)
 265   %ctpop7 = call i16 @llvm.ctpop.i16(i16 %ld7)
 266   store i16 %ctpop0, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 0), align 2
 267   store i16 %ctpop1, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 1), align 2
 268   store i16 %ctpop2, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 2), align 2
 269   store i16 %ctpop3, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 3), align 2
 270   store i16 %ctpop4, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 4), align 2
 271   store i16 %ctpop5, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 5), align 2
 272   store i16 %ctpop6, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 6), align 2
 273   store i16 %ctpop7, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 7), align 2
 274   ret void
 275 }
 276
 277 define void @ctpop_16i16() #0 {
 278 ; SSE-LABEL: @ctpop_16i16(
 279 ; SSE-NEXT:    [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([16 x i16]* @src16 to <8 x i16>*), align 2
 280 ; SSE-NEXT:    [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 8) to <8 x i16>*), align 2
 281 ; SSE-NEXT:    [[TMP3:%.*]] = call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> [[TMP1]])
 282 ; SSE-NEXT:    [[TMP4:%.*]] = call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> [[TMP2]])
 283 ; SSE-NEXT:    store <8 x i16> [[TMP3]], <8 x i16>* bitcast ([16 x i16]* @dst16 to <8 x i16>*), align 2
 284 ; SSE-NEXT:    store <8 x i16> [[TMP4]], <8 x i16>* bitcast (i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 8) to <8 x i16>*), align 2
 285 ; SSE-NEXT:    ret void
 286 ;
 287 ; AVX-LABEL: @ctpop_16i16(
 288 ; AVX-NEXT:    [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([16 x i16]* @src16 to <16 x i16>*), align 2
 289 ; AVX-NEXT:    [[TMP2:%.*]] = call <16 x i16> @llvm.ctpop.v16i16(<16 x i16> [[TMP1]])
 290 ; AVX-NEXT:    store <16 x i16> [[TMP2]], <16 x i16>* bitcast ([16 x i16]* @dst16 to <16 x i16>*), align 2
 291 ; AVX-NEXT:    ret void
 292 ;
 293   %ld0  = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64  0), align 2
 294   %ld1  = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64  1), align 2
 295   %ld2  = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64  2), align 2
 296   %ld3  = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64  3), align 2
 297   %ld4  = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64  4), align 2
 298   %ld5  = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64  5), align 2
 299   %ld6  = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64  6), align 2
 300   %ld7  = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64  7), align 2
 301   %ld8  = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64  8), align 2
 302   %ld9  = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64  9), align 2
 303   %ld10 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 10), align 2
 304   %ld11 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 11), align 2
 305   %ld12 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 12), align 2
 306   %ld13 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 13), align 2
 307   %ld14 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 14), align 2
 308   %ld15 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 15), align 2
 309   %ctpop0  = call i16 @llvm.ctpop.i16(i16 %ld0)
 310   %ctpop1  = call i16 @llvm.ctpop.i16(i16 %ld1)
 311   %ctpop2  = call i16 @llvm.ctpop.i16(i16 %ld2)
 312   %ctpop3  = call i16 @llvm.ctpop.i16(i16 %ld3)
 313   %ctpop4  = call i16 @llvm.ctpop.i16(i16 %ld4)
 314   %ctpop5  = call i16 @llvm.ctpop.i16(i16 %ld5)
 315   %ctpop6  = call i16 @llvm.ctpop.i16(i16 %ld6)
 316   %ctpop7  = call i16 @llvm.ctpop.i16(i16 %ld7)
 317   %ctpop8  = call i16 @llvm.ctpop.i16(i16 %ld8)
 318   %ctpop9  = call i16 @llvm.ctpop.i16(i16 %ld9)
 319   %ctpop10 = call i16 @llvm.ctpop.i16(i16 %ld10)
 320   %ctpop11 = call i16 @llvm.ctpop.i16(i16 %ld11)
 321   %ctpop12 = call i16 @llvm.ctpop.i16(i16 %ld12)
 322   %ctpop13 = call i16 @llvm.ctpop.i16(i16 %ld13)
 323   %ctpop14 = call i16 @llvm.ctpop.i16(i16 %ld14)
 324   %ctpop15 = call i16 @llvm.ctpop.i16(i16 %ld15)
 325   store i16 %ctpop0 , i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64  0), align 2
 326   store i16 %ctpop1 , i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64  1), align 2
 327   store i16 %ctpop2 , i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64  2), align 2
 328   store i16 %ctpop3 , i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64  3), align 2
 329   store i16 %ctpop4 , i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64  4), align 2
 330   store i16 %ctpop5 , i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64  5), align 2
 331   store i16 %ctpop6 , i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64  6), align 2
 332   store i16 %ctpop7 , i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64  7), align 2
 333   store i16 %ctpop8 , i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64  8), align 2
 334   store i16 %ctpop9 , i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64  9), align 2
 335   store i16 %ctpop10, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 10), align 2
 336   store i16 %ctpop11, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 11), align 2
 337   store i16 %ctpop12, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 12), align 2
 338   store i16 %ctpop13, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 13), align 2
 339   store i16 %ctpop14, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 14), align 2
 340   store i16 %ctpop15, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 15), align 2
 341   ret void
 342 }
 343
 344 define void @ctpop_16i8() #0 {
 345 ; CHECK-LABEL: @ctpop_16i8(
 346 ; CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([32 x i8]* @src8 to <16 x i8>*), align 1
 347 ; CHECK-NEXT:    [[TMP2:%.*]] = call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> [[TMP1]])
 348 ; CHECK-NEXT:    store <16 x i8> [[TMP2]], <16 x i8>* bitcast ([32 x i8]* @dst8 to <16 x i8>*), align 1
 349 ; CHECK-NEXT:    ret void
 350 ;
 351   %ld0  = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64  0), align 1
 352   %ld1  = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64  1), align 1
 353   %ld2  = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64  2), align 1
 354   %ld3  = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64  3), align 1
 355   %ld4  = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64  4), align 1
 356   %ld5  = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64  5), align 1
 357   %ld6  = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64  6), align 1
 358   %ld7  = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64  7), align 1
 359   %ld8  = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64  8), align 1
 360   %ld9  = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64  9), align 1
 361   %ld10 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 10), align 1
 362   %ld11 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 11), align 1
 363   %ld12 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 12), align 1
 364   %ld13 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 13), align 1
 365   %ld14 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 14), align 1
 366   %ld15 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 15), align 1
 367   %ctpop0  = call i8 @llvm.ctpop.i8(i8 %ld0)
 368   %ctpop1  = call i8 @llvm.ctpop.i8(i8 %ld1)
 369   %ctpop2  = call i8 @llvm.ctpop.i8(i8 %ld2)
 370   %ctpop3  = call i8 @llvm.ctpop.i8(i8 %ld3)
 371   %ctpop4  = call i8 @llvm.ctpop.i8(i8 %ld4)
 372   %ctpop5  = call i8 @llvm.ctpop.i8(i8 %ld5)
 373   %ctpop6  = call i8 @llvm.ctpop.i8(i8 %ld6)
 374   %ctpop7  = call i8 @llvm.ctpop.i8(i8 %ld7)
 375   %ctpop8  = call i8 @llvm.ctpop.i8(i8 %ld8)
 376   %ctpop9  = call i8 @llvm.ctpop.i8(i8 %ld9)
 377   %ctpop10 = call i8 @llvm.ctpop.i8(i8 %ld10)
 378   %ctpop11 = call i8 @llvm.ctpop.i8(i8 %ld11)
 379   %ctpop12 = call i8 @llvm.ctpop.i8(i8 %ld12)
 380   %ctpop13 = call i8 @llvm.ctpop.i8(i8 %ld13)
 381   %ctpop14 = call i8 @llvm.ctpop.i8(i8 %ld14)
 382   %ctpop15 = call i8 @llvm.ctpop.i8(i8 %ld15)
 383   store i8 %ctpop0 , i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64  0), align 1
 384   store i8 %ctpop1 , i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64  1), align 1
 385   store i8 %ctpop2 , i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64  2), align 1
 386   store i8 %ctpop3 , i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64  3), align 1
 387   store i8 %ctpop4 , i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64  4), align 1
 388   store i8 %ctpop5 , i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64  5), align 1
 389   store i8 %ctpop6 , i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64  6), align 1
 390   store i8 %ctpop7 , i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64  7), align 1
 391   store i8 %ctpop8 , i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64  8), align 1
 392   store i8 %ctpop9 , i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64  9), align 1
 393   store i8 %ctpop10, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 10), align 1
 394   store i8 %ctpop11, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 11), align 1
 395   store i8 %ctpop12, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 12), align 1
 396   store i8 %ctpop13, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 13), align 1
 397   store i8 %ctpop14, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 14), align 1
 398   store i8 %ctpop15, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 15), align 1
 399   ret void
 400 }
 401
 402 define void @ctpop_32i8() #0 {
 403 ; CHECK-LABEL: @ctpop_32i8(
 404 ; CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([32 x i8]* @src8 to <16 x i8>*), align 1
 405 ; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 16) to <16 x i8>*), align 1
 406 ; CHECK-NEXT:    [[TMP3:%.*]] = call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> [[TMP1]])
 407 ; CHECK-NEXT:    [[TMP4:%.*]] = call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> [[TMP2]])
 408 ; CHECK-NEXT:    store <16 x i8> [[TMP3]], <16 x i8>* bitcast ([32 x i8]* @dst8 to <16 x i8>*), align 1
 409 ; CHECK-NEXT:    store <16 x i8> [[TMP4]], <16 x i8>* bitcast (i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 16) to <16 x i8>*), align 1
 410 ; CHECK-NEXT:    ret void
 411 ;
 412   %ld0  = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64  0), align 1
 413   %ld1  = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64  1), align 1
 414   %ld2  = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64  2), align 1
 415   %ld3  = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64  3), align 1
 416   %ld4  = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64  4), align 1
 417   %ld5  = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64  5), align 1
 418   %ld6  = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64  6), align 1
 419   %ld7  = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64  7), align 1
 420   %ld8  = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64  8), align 1
 421   %ld9  = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64  9), align 1
 422   %ld10 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 10), align 1
 423   %ld11 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 11), align 1
 424   %ld12 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 12), align 1
 425   %ld13 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 13), align 1
 426   %ld14 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 14), align 1
 427   %ld15 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 15), align 1
 428   %ld16 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 16), align 1
 429   %ld17 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 17), align 1
 430   %ld18 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 18), align 1
 431   %ld19 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 19), align 1
 432   %ld20 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 20), align 1
 433   %ld21 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 21), align 1
 434   %ld22 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 22), align 1
 435   %ld23 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 23), align 1
 436   %ld24 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 24), align 1
 437   %ld25 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 25), align 1
 438   %ld26 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 26), align 1
 439   %ld27 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 27), align 1
 440   %ld28 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 28), align 1
 441   %ld29 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 29), align 1
 442   %ld30 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 30), align 1
 443   %ld31 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 31), align 1
 444   %ctpop0  = call i8 @llvm.ctpop.i8(i8 %ld0)
 445   %ctpop1  = call i8 @llvm.ctpop.i8(i8 %ld1)
 446   %ctpop2  = call i8 @llvm.ctpop.i8(i8 %ld2)
 447   %ctpop3  = call i8 @llvm.ctpop.i8(i8 %ld3)
 448   %ctpop4  = call i8 @llvm.ctpop.i8(i8 %ld4)
 449   %ctpop5  = call i8 @llvm.ctpop.i8(i8 %ld5)
 450   %ctpop6  = call i8 @llvm.ctpop.i8(i8 %ld6)
 451   %ctpop7  = call i8 @llvm.ctpop.i8(i8 %ld7)
 452   %ctpop8  = call i8 @llvm.ctpop.i8(i8 %ld8)
 453   %ctpop9  = call i8 @llvm.ctpop.i8(i8 %ld9)
 454   %ctpop10 = call i8 @llvm.ctpop.i8(i8 %ld10)
 455   %ctpop11 = call i8 @llvm.ctpop.i8(i8 %ld11)
 456   %ctpop12 = call i8 @llvm.ctpop.i8(i8 %ld12)
 457   %ctpop13 = call i8 @llvm.ctpop.i8(i8 %ld13)
 458   %ctpop14 = call i8 @llvm.ctpop.i8(i8 %ld14)
 459   %ctpop15 = call i8 @llvm.ctpop.i8(i8 %ld15)
 460   %ctpop16 = call i8 @llvm.ctpop.i8(i8 %ld16)
 461   %ctpop17 = call i8 @llvm.ctpop.i8(i8 %ld17)
 462   %ctpop18 = call i8 @llvm.ctpop.i8(i8 %ld18)
 463   %ctpop19 = call i8 @llvm.ctpop.i8(i8 %ld19)
 464   %ctpop20 = call i8 @llvm.ctpop.i8(i8 %ld20)
 465   %ctpop21 = call i8 @llvm.ctpop.i8(i8 %ld21)
 466   %ctpop22 = call i8 @llvm.ctpop.i8(i8 %ld22)
 467   %ctpop23 = call i8 @llvm.ctpop.i8(i8 %ld23)
 468   %ctpop24 = call i8 @llvm.ctpop.i8(i8 %ld24)
 469   %ctpop25 = call i8 @llvm.ctpop.i8(i8 %ld25)
 470   %ctpop26 = call i8 @llvm.ctpop.i8(i8 %ld26)
 471   %ctpop27 = call i8 @llvm.ctpop.i8(i8 %ld27)
 472   %ctpop28 = call i8 @llvm.ctpop.i8(i8 %ld28)
 473   %ctpop29 = call i8 @llvm.ctpop.i8(i8 %ld29)
 474   %ctpop30 = call i8 @llvm.ctpop.i8(i8 %ld30)
 475   %ctpop31 = call i8 @llvm.ctpop.i8(i8 %ld31)
 476   store i8 %ctpop0 , i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64  0), align 1
 477   store i8 %ctpop1 , i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64  1), align 1
 478   store i8 %ctpop2 , i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64  2), align 1
 479   store i8 %ctpop3 , i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64  3), align 1
 480   store i8 %ctpop4 , i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64  4), align 1
 481   store i8 %ctpop5 , i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64  5), align 1
 482   store i8 %ctpop6 , i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64  6), align 1
 483   store i8 %ctpop7 , i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64  7), align 1
 484   store i8 %ctpop8 , i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64  8), align 1
 485   store i8 %ctpop9 , i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64  9), align 1
 486   store i8 %ctpop10, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 10), align 1
 487   store i8 %ctpop11, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 11), align 1
 488   store i8 %ctpop12, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 12), align 1
 489   store i8 %ctpop13, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 13), align 1
 490   store i8 %ctpop14, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 14), align 1
 491   store i8 %ctpop15, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 15), align 1
 492   store i8 %ctpop16, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 16), align 1
 493   store i8 %ctpop17, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 17), align 1
 494   store i8 %ctpop18, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 18), align 1
 495   store i8 %ctpop19, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 19), align 1
 496   store i8 %ctpop20, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 20), align 1
 497   store i8 %ctpop21, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 21), align 1
 498   store i8 %ctpop22, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 22), align 1
 499   store i8 %ctpop23, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 23), align 1
 500   store i8 %ctpop24, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 24), align 1
 501   store i8 %ctpop25, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 25), align 1
 502   store i8 %ctpop26, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 26), align 1
 503   store i8 %ctpop27, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 27), align 1
 504   store i8 %ctpop28, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 28), align 1
 505   store i8 %ctpop29, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 29), align 1
 506   store i8 %ctpop30, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 30), align 1
 507   store i8 %ctpop31, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 31), align 1
 508   ret void
 509 }
 510
 511 attributes #0 = { nounwind }
 512