test/Transforms/SLPVectorizer/X86/uitofp.ll

   1 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
   2 ; RUN: opt < %s -mtriple=x86_64-unknown -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=SSE
   3 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX256 --check-prefix=AVX256NODQ
   4 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=bdver1 -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX256 --check-prefix=AVX256NODQ
   5 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX256 --check-prefix=AVX256NODQ
   6 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skylake-avx512 -mattr=-prefer-256-bit -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX512
   7 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skylake-avx512 -mattr=+prefer-256-bit -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX256 --check-prefix=AVX256DQ
   8
   9 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
  10
  11 @src64 = common global [8 x i64] zeroinitializer, align 64
  12 @src32 = common global [16 x i32] zeroinitializer, align 64
  13 @src16 = common global [32 x i16] zeroinitializer, align 64
  14 @src8  = common global [64 x i8] zeroinitializer, align 64
  15
  16 @dst64 = common global [8 x double] zeroinitializer, align 64
  17 @dst32 = common global [16 x float] zeroinitializer, align 64
  18
  19 ;
  20 ; UITOFP to vXf64
  21 ;
  22
  23 define void @uitofp_2i64_2f64() #0 {
  24 ; CHECK-LABEL: @uitofp_2i64_2f64(
  25 ; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @src64 to <2 x i64>*), align 64
  26 ; CHECK-NEXT:    [[TMP2:%.*]] = uitofp <2 x i64> [[TMP1]] to <2 x double>
  27 ; CHECK-NEXT:    store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64
  28 ; CHECK-NEXT:    ret void
  29 ;
  30   %ld0 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64
  31   %ld1 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8
  32   %cvt0 = uitofp i64 %ld0 to double
  33   %cvt1 = uitofp i64 %ld1 to double
  34   store double %cvt0, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64
  35   store double %cvt1, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
  36   ret void
  37 }
  38
  39 define void @uitofp_4i64_4f64() #0 {
  40 ; SSE-LABEL: @uitofp_4i64_4f64(
  41 ; SSE-NEXT:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @src64 to <2 x i64>*), align 64
  42 ; SSE-NEXT:    [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 2) to <2 x i64>*), align 16
  43 ; SSE-NEXT:    [[TMP3:%.*]] = uitofp <2 x i64> [[TMP1]] to <2 x double>
  44 ; SSE-NEXT:    [[TMP4:%.*]] = uitofp <2 x i64> [[TMP2]] to <2 x double>
  45 ; SSE-NEXT:    store <2 x double> [[TMP3]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64
  46 ; SSE-NEXT:    store <2 x double> [[TMP4]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 16
  47 ; SSE-NEXT:    ret void
  48 ;
  49 ; AVX-LABEL: @uitofp_4i64_4f64(
  50 ; AVX-NEXT:    [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @src64 to <4 x i64>*), align 64
  51 ; AVX-NEXT:    [[TMP2:%.*]] = uitofp <4 x i64> [[TMP1]] to <4 x double>
  52 ; AVX-NEXT:    store <4 x double> [[TMP2]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 64
  53 ; AVX-NEXT:    ret void
  54 ;
  55   %ld0 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64
  56   %ld1 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8
  57   %ld2 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 2), align 16
  58   %ld3 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 3), align 8
  59   %cvt0 = uitofp i64 %ld0 to double
  60   %cvt1 = uitofp i64 %ld1 to double
  61   %cvt2 = uitofp i64 %ld2 to double
  62   %cvt3 = uitofp i64 %ld3 to double
  63   store double %cvt0, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64
  64   store double %cvt1, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
  65   store double %cvt2, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 16
  66   store double %cvt3, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8
  67   ret void
  68 }
  69
  70 define void @uitofp_8i64_8f64() #0 {
  71 ; SSE-LABEL: @uitofp_8i64_8f64(
  72 ; SSE-NEXT:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @src64 to <2 x i64>*), align 64
  73 ; SSE-NEXT:    [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 2) to <2 x i64>*), align 16
  74 ; SSE-NEXT:    [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 4) to <2 x i64>*), align 32
  75 ; SSE-NEXT:    [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 6) to <2 x i64>*), align 16
  76 ; SSE-NEXT:    [[TMP5:%.*]] = uitofp <2 x i64> [[TMP1]] to <2 x double>
  77 ; SSE-NEXT:    [[TMP6:%.*]] = uitofp <2 x i64> [[TMP2]] to <2 x double>
  78 ; SSE-NEXT:    [[TMP7:%.*]] = uitofp <2 x i64> [[TMP3]] to <2 x double>
  79 ; SSE-NEXT:    [[TMP8:%.*]] = uitofp <2 x i64> [[TMP4]] to <2 x double>
  80 ; SSE-NEXT:    store <2 x double> [[TMP5]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64
  81 ; SSE-NEXT:    store <2 x double> [[TMP6]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 16
  82 ; SSE-NEXT:    store <2 x double> [[TMP7]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <2 x double>*), align 32
  83 ; SSE-NEXT:    store <2 x double> [[TMP8]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6) to <2 x double>*), align 16
  84 ; SSE-NEXT:    ret void
  85 ;
  86 ; AVX256-LABEL: @uitofp_8i64_8f64(
  87 ; AVX256-NEXT:    [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @src64 to <4 x i64>*), align 64
  88 ; AVX256-NEXT:    [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 4) to <4 x i64>*), align 32
  89 ; AVX256-NEXT:    [[TMP3:%.*]] = uitofp <4 x i64> [[TMP1]] to <4 x double>
  90 ; AVX256-NEXT:    [[TMP4:%.*]] = uitofp <4 x i64> [[TMP2]] to <4 x double>
  91 ; AVX256-NEXT:    store <4 x double> [[TMP3]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 64
  92 ; AVX256-NEXT:    store <4 x double> [[TMP4]], <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <4 x double>*), align 32
  93 ; AVX256-NEXT:    ret void
  94 ;
  95 ; AVX512-LABEL: @uitofp_8i64_8f64(
  96 ; AVX512-NEXT:    [[TMP1:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @src64 to <8 x i64>*), align 64
  97 ; AVX512-NEXT:    [[TMP2:%.*]] = uitofp <8 x i64> [[TMP1]] to <8 x double>
  98 ; AVX512-NEXT:    store <8 x double> [[TMP2]], <8 x double>* bitcast ([8 x double]* @dst64 to <8 x double>*), align 64
  99 ; AVX512-NEXT:    ret void
 100 ;
 101   %ld0 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64
 102   %ld1 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8
 103   %ld2 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 2), align 16
 104   %ld3 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 3), align 8
 105   %ld4 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 4), align 32
 106   %ld5 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 5), align 8
 107   %ld6 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 6), align 16
 108   %ld7 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 7), align 8
 109   %cvt0 = uitofp i64 %ld0 to double
 110   %cvt1 = uitofp i64 %ld1 to double
 111   %cvt2 = uitofp i64 %ld2 to double
 112   %cvt3 = uitofp i64 %ld3 to double
 113   %cvt4 = uitofp i64 %ld4 to double
 114   %cvt5 = uitofp i64 %ld5 to double
 115   %cvt6 = uitofp i64 %ld6 to double
 116   %cvt7 = uitofp i64 %ld7 to double
 117   store double %cvt0, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64
 118   store double %cvt1, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
 119   store double %cvt2, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 16
 120   store double %cvt3, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8
 121   store double %cvt4, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4), align 32
 122   store double %cvt5, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 5), align 8
 123   store double %cvt6, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6), align 16
 124   store double %cvt7, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 7), align 8
 125   ret void
 126 }
 127
 128 define void @uitofp_2i32_2f64() #0 {
 129 ; SSE-LABEL: @uitofp_2i32_2f64(
 130 ; SSE-NEXT:    [[LD0:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 0), align 64
 131 ; SSE-NEXT:    [[LD1:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 1), align 4
 132 ; SSE-NEXT:    [[CVT0:%.*]] = uitofp i32 [[LD0]] to double
 133 ; SSE-NEXT:    [[CVT1:%.*]] = uitofp i32 [[LD1]] to double
 134 ; SSE-NEXT:    store double [[CVT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64
 135 ; SSE-NEXT:    store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
 136 ; SSE-NEXT:    ret void
 137 ;
 138 ; AVX256NODQ-LABEL: @uitofp_2i32_2f64(
 139 ; AVX256NODQ-NEXT:    [[LD0:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 0), align 64
 140 ; AVX256NODQ-NEXT:    [[LD1:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 1), align 4
 141 ; AVX256NODQ-NEXT:    [[CVT0:%.*]] = uitofp i32 [[LD0]] to double
 142 ; AVX256NODQ-NEXT:    [[CVT1:%.*]] = uitofp i32 [[LD1]] to double
 143 ; AVX256NODQ-NEXT:    store double [[CVT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64
 144 ; AVX256NODQ-NEXT:    store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
 145 ; AVX256NODQ-NEXT:    ret void
 146 ;
 147 ; AVX512-LABEL: @uitofp_2i32_2f64(
 148 ; AVX512-NEXT:    [[TMP1:%.*]] = load <2 x i32>, <2 x i32>* bitcast ([16 x i32]* @src32 to <2 x i32>*), align 64
 149 ; AVX512-NEXT:    [[TMP2:%.*]] = uitofp <2 x i32> [[TMP1]] to <2 x double>
 150 ; AVX512-NEXT:    store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64
 151 ; AVX512-NEXT:    ret void
 152 ;
 153 ; AVX256DQ-LABEL: @uitofp_2i32_2f64(
 154 ; AVX256DQ-NEXT:    [[TMP1:%.*]] = load <2 x i32>, <2 x i32>* bitcast ([16 x i32]* @src32 to <2 x i32>*), align 64
 155 ; AVX256DQ-NEXT:    [[TMP2:%.*]] = uitofp <2 x i32> [[TMP1]] to <2 x double>
 156 ; AVX256DQ-NEXT:    store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64
 157 ; AVX256DQ-NEXT:    ret void
 158 ;
 159   %ld0 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 0), align 64
 160   %ld1 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 1), align 4
 161   %cvt0 = uitofp i32 %ld0 to double
 162   %cvt1 = uitofp i32 %ld1 to double
 163   store double %cvt0, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64
 164   store double %cvt1, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
 165   ret void
 166 }
 167
 168 define void @uitofp_4i32_4f64() #0 {
 169 ; SSE-LABEL: @uitofp_4i32_4f64(
 170 ; SSE-NEXT:    [[LD0:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 0), align 64
 171 ; SSE-NEXT:    [[LD1:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 1), align 4
 172 ; SSE-NEXT:    [[LD2:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 2), align 8
 173 ; SSE-NEXT:    [[LD3:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 3), align 4
 174 ; SSE-NEXT:    [[CVT0:%.*]] = uitofp i32 [[LD0]] to double
 175 ; SSE-NEXT:    [[CVT1:%.*]] = uitofp i32 [[LD1]] to double
 176 ; SSE-NEXT:    [[CVT2:%.*]] = uitofp i32 [[LD2]] to double
 177 ; SSE-NEXT:    [[CVT3:%.*]] = uitofp i32 [[LD3]] to double
 178 ; SSE-NEXT:    store double [[CVT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64
 179 ; SSE-NEXT:    store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
 180 ; SSE-NEXT:    store double [[CVT2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 16
 181 ; SSE-NEXT:    store double [[CVT3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8
 182 ; SSE-NEXT:    ret void
 183 ;
 184 ; AVX-LABEL: @uitofp_4i32_4f64(
 185 ; AVX-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @src32 to <4 x i32>*), align 64
 186 ; AVX-NEXT:    [[TMP2:%.*]] = uitofp <4 x i32> [[TMP1]] to <4 x double>
 187 ; AVX-NEXT:    store <4 x double> [[TMP2]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 64
 188 ; AVX-NEXT:    ret void
 189 ;
 190   %ld0 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 0), align 64
 191   %ld1 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 1), align 4
 192   %ld2 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 2), align 8
 193   %ld3 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 3), align 4
 194   %cvt0 = uitofp i32 %ld0 to double
 195   %cvt1 = uitofp i32 %ld1 to double
 196   %cvt2 = uitofp i32 %ld2 to double
 197   %cvt3 = uitofp i32 %ld3 to double
 198   store double %cvt0, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64
 199   store double %cvt1, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
 200   store double %cvt2, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 16
 201   store double %cvt3, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8
 202   ret void
 203 }
 204
 205 define void @uitofp_8i32_8f64() #0 {
 206 ; SSE-LABEL: @uitofp_8i32_8f64(
 207 ; SSE-NEXT:    [[LD0:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 0), align 64
 208 ; SSE-NEXT:    [[LD1:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 1), align 4
 209 ; SSE-NEXT:    [[LD2:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 2), align 8
 210 ; SSE-NEXT:    [[LD3:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 3), align 4
 211 ; SSE-NEXT:    [[LD4:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 4), align 16
 212 ; SSE-NEXT:    [[LD5:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 5), align 4
 213 ; SSE-NEXT:    [[LD6:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 6), align 8
 214 ; SSE-NEXT:    [[LD7:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 7), align 4
 215 ; SSE-NEXT:    [[CVT0:%.*]] = uitofp i32 [[LD0]] to double
 216 ; SSE-NEXT:    [[CVT1:%.*]] = uitofp i32 [[LD1]] to double
 217 ; SSE-NEXT:    [[CVT2:%.*]] = uitofp i32 [[LD2]] to double
 218 ; SSE-NEXT:    [[CVT3:%.*]] = uitofp i32 [[LD3]] to double
 219 ; SSE-NEXT:    [[CVT4:%.*]] = uitofp i32 [[LD4]] to double
 220 ; SSE-NEXT:    [[CVT5:%.*]] = uitofp i32 [[LD5]] to double
 221 ; SSE-NEXT:    [[CVT6:%.*]] = uitofp i32 [[LD6]] to double
 222 ; SSE-NEXT:    [[CVT7:%.*]] = uitofp i32 [[LD7]] to double
 223 ; SSE-NEXT:    store double [[CVT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64
 224 ; SSE-NEXT:    store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
 225 ; SSE-NEXT:    store double [[CVT2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 16
 226 ; SSE-NEXT:    store double [[CVT3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8
 227 ; SSE-NEXT:    store double [[CVT4]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4), align 32
 228 ; SSE-NEXT:    store double [[CVT5]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 5), align 8
 229 ; SSE-NEXT:    store double [[CVT6]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6), align 16
 230 ; SSE-NEXT:    store double [[CVT7]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 7), align 8
 231 ; SSE-NEXT:    ret void
 232 ;
 233 ; AVX256-LABEL: @uitofp_8i32_8f64(
 234 ; AVX256-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @src32 to <4 x i32>*), align 64
 235 ; AVX256-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 4) to <4 x i32>*), align 16
 236 ; AVX256-NEXT:    [[TMP3:%.*]] = uitofp <4 x i32> [[TMP1]] to <4 x double>
 237 ; AVX256-NEXT:    [[TMP4:%.*]] = uitofp <4 x i32> [[TMP2]] to <4 x double>
 238 ; AVX256-NEXT:    store <4 x double> [[TMP3]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 64
 239 ; AVX256-NEXT:    store <4 x double> [[TMP4]], <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <4 x double>*), align 32
 240 ; AVX256-NEXT:    ret void
 241 ;
 242 ; AVX512-LABEL: @uitofp_8i32_8f64(
 243 ; AVX512-NEXT:    [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @src32 to <8 x i32>*), align 64
 244 ; AVX512-NEXT:    [[TMP2:%.*]] = uitofp <8 x i32> [[TMP1]] to <8 x double>
 245 ; AVX512-NEXT:    store <8 x double> [[TMP2]], <8 x double>* bitcast ([8 x double]* @dst64 to <8 x double>*), align 64
 246 ; AVX512-NEXT:    ret void
 247 ;
 248   %ld0 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 0), align 64
 249   %ld1 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 1), align 4
 250   %ld2 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 2), align 8
 251   %ld3 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 3), align 4
 252   %ld4 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 4), align 16
 253   %ld5 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 5), align 4
 254   %ld6 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 6), align 8
 255   %ld7 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 7), align 4
 256   %cvt0 = uitofp i32 %ld0 to double
 257   %cvt1 = uitofp i32 %ld1 to double
 258   %cvt2 = uitofp i32 %ld2 to double
 259   %cvt3 = uitofp i32 %ld3 to double
 260   %cvt4 = uitofp i32 %ld4 to double
 261   %cvt5 = uitofp i32 %ld5 to double
 262   %cvt6 = uitofp i32 %ld6 to double
 263   %cvt7 = uitofp i32 %ld7 to double
 264   store double %cvt0, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64
 265   store double %cvt1, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
 266   store double %cvt2, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 16
 267   store double %cvt3, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8
 268   store double %cvt4, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4), align 32
 269   store double %cvt5, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 5), align 8
 270   store double %cvt6, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6), align 16
 271   store double %cvt7, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 7), align 8
 272   ret void
 273 }
 274
 275 define void @uitofp_2i16_2f64() #0 {
 276 ; CHECK-LABEL: @uitofp_2i16_2f64(
 277 ; CHECK-NEXT:    [[LD0:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 0), align 64
 278 ; CHECK-NEXT:    [[LD1:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 1), align 2
 279 ; CHECK-NEXT:    [[CVT0:%.*]] = uitofp i16 [[LD0]] to double
 280 ; CHECK-NEXT:    [[CVT1:%.*]] = uitofp i16 [[LD1]] to double
 281 ; CHECK-NEXT:    store double [[CVT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64
 282 ; CHECK-NEXT:    store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
 283 ; CHECK-NEXT:    ret void
 284 ;
 285   %ld0 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 0), align 64
 286   %ld1 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 1), align 2
 287   %cvt0 = uitofp i16 %ld0 to double
 288   %cvt1 = uitofp i16 %ld1 to double
 289   store double %cvt0, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64
 290   store double %cvt1, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
 291   ret void
 292 }
 293
 294 define void @uitofp_4i16_4f64() #0 {
 295 ; SSE-LABEL: @uitofp_4i16_4f64(
 296 ; SSE-NEXT:    [[LD0:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 0), align 64
 297 ; SSE-NEXT:    [[LD1:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 1), align 2
 298 ; SSE-NEXT:    [[LD2:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 2), align 4
 299 ; SSE-NEXT:    [[LD3:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 3), align 2
 300 ; SSE-NEXT:    [[CVT0:%.*]] = uitofp i16 [[LD0]] to double
 301 ; SSE-NEXT:    [[CVT1:%.*]] = uitofp i16 [[LD1]] to double
 302 ; SSE-NEXT:    [[CVT2:%.*]] = uitofp i16 [[LD2]] to double
 303 ; SSE-NEXT:    [[CVT3:%.*]] = uitofp i16 [[LD3]] to double
 304 ; SSE-NEXT:    store double [[CVT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64
 305 ; SSE-NEXT:    store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
 306 ; SSE-NEXT:    store double [[CVT2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 16
 307 ; SSE-NEXT:    store double [[CVT3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8
 308 ; SSE-NEXT:    ret void
 309 ;
 310 ; AVX-LABEL: @uitofp_4i16_4f64(
 311 ; AVX-NEXT:    [[TMP1:%.*]] = load <4 x i16>, <4 x i16>* bitcast ([32 x i16]* @src16 to <4 x i16>*), align 64
 312 ; AVX-NEXT:    [[TMP2:%.*]] = uitofp <4 x i16> [[TMP1]] to <4 x double>
 313 ; AVX-NEXT:    store <4 x double> [[TMP2]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 64
 314 ; AVX-NEXT:    ret void
 315 ;
 316   %ld0 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 0), align 64
 317   %ld1 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 1), align 2
 318   %ld2 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 2), align 4
 319   %ld3 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 3), align 2
 320   %cvt0 = uitofp i16 %ld0 to double
 321   %cvt1 = uitofp i16 %ld1 to double
 322   %cvt2 = uitofp i16 %ld2 to double
 323   %cvt3 = uitofp i16 %ld3 to double
 324   store double %cvt0, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64
 325   store double %cvt1, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
 326   store double %cvt2, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 16
 327   store double %cvt3, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8
 328   ret void
 329 }
 330
 331 define void @uitofp_8i16_8f64() #0 {
 332 ; SSE-LABEL: @uitofp_8i16_8f64(
 333 ; SSE-NEXT:    [[LD0:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 0), align 64
 334 ; SSE-NEXT:    [[LD1:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 1), align 2
 335 ; SSE-NEXT:    [[LD2:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 2), align 4
 336 ; SSE-NEXT:    [[LD3:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 3), align 2
 337 ; SSE-NEXT:    [[LD4:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 4), align 8
 338 ; SSE-NEXT:    [[LD5:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 5), align 2
 339 ; SSE-NEXT:    [[LD6:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 6), align 4
 340 ; SSE-NEXT:    [[LD7:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 7), align 2
 341 ; SSE-NEXT:    [[CVT0:%.*]] = uitofp i16 [[LD0]] to double
 342 ; SSE-NEXT:    [[CVT1:%.*]] = uitofp i16 [[LD1]] to double
 343 ; SSE-NEXT:    [[CVT2:%.*]] = uitofp i16 [[LD2]] to double
 344 ; SSE-NEXT:    [[CVT3:%.*]] = uitofp i16 [[LD3]] to double
 345 ; SSE-NEXT:    [[CVT4:%.*]] = uitofp i16 [[LD4]] to double
 346 ; SSE-NEXT:    [[CVT5:%.*]] = uitofp i16 [[LD5]] to double
 347 ; SSE-NEXT:    [[CVT6:%.*]] = uitofp i16 [[LD6]] to double
 348 ; SSE-NEXT:    [[CVT7:%.*]] = uitofp i16 [[LD7]] to double
 349 ; SSE-NEXT:    store double [[CVT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64
 350 ; SSE-NEXT:    store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
 351 ; SSE-NEXT:    store double [[CVT2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 16
 352 ; SSE-NEXT:    store double [[CVT3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8
 353 ; SSE-NEXT:    store double [[CVT4]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4), align 32
 354 ; SSE-NEXT:    store double [[CVT5]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 5), align 8
 355 ; SSE-NEXT:    store double [[CVT6]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6), align 16
 356 ; SSE-NEXT:    store double [[CVT7]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 7), align 8
 357 ; SSE-NEXT:    ret void
 358 ;
 359 ; AVX256-LABEL: @uitofp_8i16_8f64(
 360 ; AVX256-NEXT:    [[TMP1:%.*]] = load <4 x i16>, <4 x i16>* bitcast ([32 x i16]* @src16 to <4 x i16>*), align 64
 361 ; AVX256-NEXT:    [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 4) to <4 x i16>*), align 8
 362 ; AVX256-NEXT:    [[TMP3:%.*]] = uitofp <4 x i16> [[TMP1]] to <4 x double>
 363 ; AVX256-NEXT:    [[TMP4:%.*]] = uitofp <4 x i16> [[TMP2]] to <4 x double>
 364 ; AVX256-NEXT:    store <4 x double> [[TMP3]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 64
 365 ; AVX256-NEXT:    store <4 x double> [[TMP4]], <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <4 x double>*), align 32
 366 ; AVX256-NEXT:    ret void
 367 ;
 368 ; AVX512-LABEL: @uitofp_8i16_8f64(
 369 ; AVX512-NEXT:    [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @src16 to <8 x i16>*), align 64
 370 ; AVX512-NEXT:    [[TMP2:%.*]] = uitofp <8 x i16> [[TMP1]] to <8 x double>
 371 ; AVX512-NEXT:    store <8 x double> [[TMP2]], <8 x double>* bitcast ([8 x double]* @dst64 to <8 x double>*), align 64
 372 ; AVX512-NEXT:    ret void
 373 ;
 374   %ld0 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 0), align 64
 375   %ld1 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 1), align 2
 376   %ld2 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 2), align 4
 377   %ld3 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 3), align 2
 378   %ld4 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 4), align 8
 379   %ld5 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 5), align 2
 380   %ld6 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 6), align 4
 381   %ld7 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 7), align 2
 382   %cvt0 = uitofp i16 %ld0 to double
 383   %cvt1 = uitofp i16 %ld1 to double
 384   %cvt2 = uitofp i16 %ld2 to double
 385   %cvt3 = uitofp i16 %ld3 to double
 386   %cvt4 = uitofp i16 %ld4 to double
 387   %cvt5 = uitofp i16 %ld5 to double
 388   %cvt6 = uitofp i16 %ld6 to double
 389   %cvt7 = uitofp i16 %ld7 to double
 390   store double %cvt0, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64
 391   store double %cvt1, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
 392   store double %cvt2, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 16
 393   store double %cvt3, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8
 394   store double %cvt4, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4), align 32
 395   store double %cvt5, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 5), align 8
 396   store double %cvt6, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6), align 16
 397   store double %cvt7, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 7), align 8
 398   ret void
 399 }
 400
 401 define void @uitofp_2i8_2f64() #0 {
 402 ; SSE-LABEL: @uitofp_2i8_2f64(
 403 ; SSE-NEXT:    [[LD0:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 0), align 64
 404 ; SSE-NEXT:    [[LD1:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 1), align 1
 405 ; SSE-NEXT:    [[CVT0:%.*]] = uitofp i8 [[LD0]] to double
 406 ; SSE-NEXT:    [[CVT1:%.*]] = uitofp i8 [[LD1]] to double
 407 ; SSE-NEXT:    store double [[CVT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64
 408 ; SSE-NEXT:    store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
 409 ; SSE-NEXT:    ret void
 410 ;
 411 ; AVX256NODQ-LABEL: @uitofp_2i8_2f64(
 412 ; AVX256NODQ-NEXT:    [[LD0:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 0), align 64
 413 ; AVX256NODQ-NEXT:    [[LD1:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 1), align 1
 414 ; AVX256NODQ-NEXT:    [[CVT0:%.*]] = uitofp i8 [[LD0]] to double
 415 ; AVX256NODQ-NEXT:    [[CVT1:%.*]] = uitofp i8 [[LD1]] to double
 416 ; AVX256NODQ-NEXT:    store double [[CVT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64
 417 ; AVX256NODQ-NEXT:    store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
 418 ; AVX256NODQ-NEXT:    ret void
 419 ;
 420 ; AVX512-LABEL: @uitofp_2i8_2f64(
 421 ; AVX512-NEXT:    [[TMP1:%.*]] = load <2 x i8>, <2 x i8>* bitcast ([64 x i8]* @src8 to <2 x i8>*), align 64
 422 ; AVX512-NEXT:    [[TMP2:%.*]] = uitofp <2 x i8> [[TMP1]] to <2 x double>
 423 ; AVX512-NEXT:    store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64
 424 ; AVX512-NEXT:    ret void
 425 ;
 426 ; AVX256DQ-LABEL: @uitofp_2i8_2f64(
 427 ; AVX256DQ-NEXT:    [[TMP1:%.*]] = load <2 x i8>, <2 x i8>* bitcast ([64 x i8]* @src8 to <2 x i8>*), align 64
 428 ; AVX256DQ-NEXT:    [[TMP2:%.*]] = uitofp <2 x i8> [[TMP1]] to <2 x double>
 429 ; AVX256DQ-NEXT:    store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64
 430 ; AVX256DQ-NEXT:    ret void
 431 ;
 432   %ld0 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 0), align 64
 433   %ld1 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 1), align 1
 434   %cvt0 = uitofp i8 %ld0 to double
 435   %cvt1 = uitofp i8 %ld1 to double
 436   store double %cvt0, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64
 437   store double %cvt1, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
 438   ret void
 439 }
 440
 441 define void @uitofp_4i8_4f64() #0 {
 442 ; SSE-LABEL: @uitofp_4i8_4f64(
 443 ; SSE-NEXT:    [[LD0:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 0), align 64
 444 ; SSE-NEXT:    [[LD1:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 1), align 1
 445 ; SSE-NEXT:    [[LD2:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 2), align 2
 446 ; SSE-NEXT:    [[LD3:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 3), align 1
 447 ; SSE-NEXT:    [[CVT0:%.*]] = uitofp i8 [[LD0]] to double
 448 ; SSE-NEXT:    [[CVT1:%.*]] = uitofp i8 [[LD1]] to double
 449 ; SSE-NEXT:    [[CVT2:%.*]] = uitofp i8 [[LD2]] to double
 450 ; SSE-NEXT:    [[CVT3:%.*]] = uitofp i8 [[LD3]] to double
 451 ; SSE-NEXT:    store double [[CVT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64
 452 ; SSE-NEXT:    store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
 453 ; SSE-NEXT:    store double [[CVT2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 16
 454 ; SSE-NEXT:    store double [[CVT3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8
 455 ; SSE-NEXT:    ret void
 456 ;
 457 ; AVX-LABEL: @uitofp_4i8_4f64(
 458 ; AVX-NEXT:    [[TMP1:%.*]] = load <4 x i8>, <4 x i8>* bitcast ([64 x i8]* @src8 to <4 x i8>*), align 64
 459 ; AVX-NEXT:    [[TMP2:%.*]] = uitofp <4 x i8> [[TMP1]] to <4 x double>
 460 ; AVX-NEXT:    store <4 x double> [[TMP2]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 64
 461 ; AVX-NEXT:    ret void
 462 ;
 463   %ld0 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 0), align 64
 464   %ld1 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 1), align 1
 465   %ld2 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 2), align 2
 466   %ld3 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 3), align 1
 467   %cvt0 = uitofp i8 %ld0 to double
 468   %cvt1 = uitofp i8 %ld1 to double
 469   %cvt2 = uitofp i8 %ld2 to double
 470   %cvt3 = uitofp i8 %ld3 to double
 471   store double %cvt0, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64
 472   store double %cvt1, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
 473   store double %cvt2, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 16
 474   store double %cvt3, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8
 475   ret void
 476 }
 477
 478 define void @uitofp_8i8_8f64() #0 {
 479 ; SSE-LABEL: @uitofp_8i8_8f64(
 480 ; SSE-NEXT:    [[LD0:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 0), align 64
 481 ; SSE-NEXT:    [[LD1:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 1), align 1
 482 ; SSE-NEXT:    [[LD2:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 2), align 2
 483 ; SSE-NEXT:    [[LD3:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 3), align 1
 484 ; SSE-NEXT:    [[LD4:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 4), align 4
 485 ; SSE-NEXT:    [[LD5:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 5), align 1
 486 ; SSE-NEXT:    [[LD6:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 6), align 2
 487 ; SSE-NEXT:    [[LD7:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 7), align 1
 488 ; SSE-NEXT:    [[CVT0:%.*]] = uitofp i8 [[LD0]] to double
 489 ; SSE-NEXT:    [[CVT1:%.*]] = uitofp i8 [[LD1]] to double
 490 ; SSE-NEXT:    [[CVT2:%.*]] = uitofp i8 [[LD2]] to double
 491 ; SSE-NEXT:    [[CVT3:%.*]] = uitofp i8 [[LD3]] to double
 492 ; SSE-NEXT:    [[CVT4:%.*]] = uitofp i8 [[LD4]] to double
 493 ; SSE-NEXT:    [[CVT5:%.*]] = uitofp i8 [[LD5]] to double
 494 ; SSE-NEXT:    [[CVT6:%.*]] = uitofp i8 [[LD6]] to double
 495 ; SSE-NEXT:    [[CVT7:%.*]] = uitofp i8 [[LD7]] to double
 496 ; SSE-NEXT:    store double [[CVT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64
 497 ; SSE-NEXT:    store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
 498 ; SSE-NEXT:    store double [[CVT2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 16
 499 ; SSE-NEXT:    store double [[CVT3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8
 500 ; SSE-NEXT:    store double [[CVT4]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4), align 32
 501 ; SSE-NEXT:    store double [[CVT5]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 5), align 8
 502 ; SSE-NEXT:    store double [[CVT6]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6), align 16
 503 ; SSE-NEXT:    store double [[CVT7]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 7), align 8
 504 ; SSE-NEXT:    ret void
 505 ;
 506 ; AVX256-LABEL: @uitofp_8i8_8f64(
 507 ; AVX256-NEXT:    [[TMP1:%.*]] = load <4 x i8>, <4 x i8>* bitcast ([64 x i8]* @src8 to <4 x i8>*), align 64
 508 ; AVX256-NEXT:    [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 4) to <4 x i8>*), align 4
 509 ; AVX256-NEXT:    [[TMP3:%.*]] = uitofp <4 x i8> [[TMP1]] to <4 x double>
 510 ; AVX256-NEXT:    [[TMP4:%.*]] = uitofp <4 x i8> [[TMP2]] to <4 x double>
 511 ; AVX256-NEXT:    store <4 x double> [[TMP3]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 64
 512 ; AVX256-NEXT:    store <4 x double> [[TMP4]], <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <4 x double>*), align 32
 513 ; AVX256-NEXT:    ret void
 514 ;
 515 ; AVX512-LABEL: @uitofp_8i8_8f64(
 516 ; AVX512-NEXT:    [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* bitcast ([64 x i8]* @src8 to <8 x i8>*), align 64
 517 ; AVX512-NEXT:    [[TMP2:%.*]] = uitofp <8 x i8> [[TMP1]] to <8 x double>
 518 ; AVX512-NEXT:    store <8 x double> [[TMP2]], <8 x double>* bitcast ([8 x double]* @dst64 to <8 x double>*), align 64
 519 ; AVX512-NEXT:    ret void
 520 ;
 521   %ld0 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 0), align 64
 522   %ld1 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 1), align 1
 523   %ld2 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 2), align 2
 524   %ld3 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 3), align 1
 525   %ld4 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 4), align 4
 526   %ld5 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 5), align 1
 527   %ld6 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 6), align 2
 528   %ld7 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 7), align 1
 529   %cvt0 = uitofp i8 %ld0 to double
 530   %cvt1 = uitofp i8 %ld1 to double
 531   %cvt2 = uitofp i8 %ld2 to double
 532   %cvt3 = uitofp i8 %ld3 to double
 533   %cvt4 = uitofp i8 %ld4 to double
 534   %cvt5 = uitofp i8 %ld5 to double
 535   %cvt6 = uitofp i8 %ld6 to double
 536   %cvt7 = uitofp i8 %ld7 to double
 537   store double %cvt0, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64
 538   store double %cvt1, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
 539   store double %cvt2, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 16
 540   store double %cvt3, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8
 541   store double %cvt4, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4), align 32
 542   store double %cvt5, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 5), align 8
 543   store double %cvt6, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6), align 16
 544   store double %cvt7, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 7), align 8
 545   ret void
 546 }
 547
 548 ;
 549 ; UITOFP to vXf32
 550 ;
 551
 552 define void @uitofp_2i64_2f32() #0 {
 553 ; SSE-LABEL: @uitofp_2i64_2f32(
 554 ; SSE-NEXT:    [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64
 555 ; SSE-NEXT:    [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8
 556 ; SSE-NEXT:    [[CVT0:%.*]] = uitofp i64 [[LD0]] to float
 557 ; SSE-NEXT:    [[CVT1:%.*]] = uitofp i64 [[LD1]] to float
 558 ; SSE-NEXT:    store float [[CVT0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64
 559 ; SSE-NEXT:    store float [[CVT1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
 560 ; SSE-NEXT:    ret void
 561 ;
 562 ; AVX256NODQ-LABEL: @uitofp_2i64_2f32(
 563 ; AVX256NODQ-NEXT:    [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64
 564 ; AVX256NODQ-NEXT:    [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8
 565 ; AVX256NODQ-NEXT:    [[CVT0:%.*]] = uitofp i64 [[LD0]] to float
 566 ; AVX256NODQ-NEXT:    [[CVT1:%.*]] = uitofp i64 [[LD1]] to float
 567 ; AVX256NODQ-NEXT:    store float [[CVT0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64
 568 ; AVX256NODQ-NEXT:    store float [[CVT1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
 569 ; AVX256NODQ-NEXT:    ret void
 570 ;
 571 ; AVX512-LABEL: @uitofp_2i64_2f32(
 572 ; AVX512-NEXT:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @src64 to <2 x i64>*), align 64
 573 ; AVX512-NEXT:    [[TMP2:%.*]] = uitofp <2 x i64> [[TMP1]] to <2 x float>
 574 ; AVX512-NEXT:    store <2 x float> [[TMP2]], <2 x float>* bitcast ([16 x float]* @dst32 to <2 x float>*), align 64
 575 ; AVX512-NEXT:    ret void
 576 ;
 577 ; AVX256DQ-LABEL: @uitofp_2i64_2f32(
 578 ; AVX256DQ-NEXT:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @src64 to <2 x i64>*), align 64
 579 ; AVX256DQ-NEXT:    [[TMP2:%.*]] = uitofp <2 x i64> [[TMP1]] to <2 x float>
 580 ; AVX256DQ-NEXT:    store <2 x float> [[TMP2]], <2 x float>* bitcast ([16 x float]* @dst32 to <2 x float>*), align 64
 581 ; AVX256DQ-NEXT:    ret void
 582 ;
 583   %ld0 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64
 584   %ld1 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8
 585   %cvt0 = uitofp i64 %ld0 to float
 586   %cvt1 = uitofp i64 %ld1 to float
 587   store float %cvt0, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64
 588   store float %cvt1, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
 589   ret void
 590 }
 591
 592 define void @uitofp_4i64_4f32() #0 {
 593 ; SSE-LABEL: @uitofp_4i64_4f32(
 594 ; SSE-NEXT:    [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64
 595 ; SSE-NEXT:    [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8
 596 ; SSE-NEXT:    [[LD2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 2), align 16
 597 ; SSE-NEXT:    [[LD3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 3), align 8
 598 ; SSE-NEXT:    [[CVT0:%.*]] = uitofp i64 [[LD0]] to float
 599 ; SSE-NEXT:    [[CVT1:%.*]] = uitofp i64 [[LD1]] to float
 600 ; SSE-NEXT:    [[CVT2:%.*]] = uitofp i64 [[LD2]] to float
 601 ; SSE-NEXT:    [[CVT3:%.*]] = uitofp i64 [[LD3]] to float
 602 ; SSE-NEXT:    store float [[CVT0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64
 603 ; SSE-NEXT:    store float [[CVT1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
 604 ; SSE-NEXT:    store float [[CVT2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 8
 605 ; SSE-NEXT:    store float [[CVT3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4
 606 ; SSE-NEXT:    ret void
 607 ;
 608 ; AVX256NODQ-LABEL: @uitofp_4i64_4f32(
 609 ; AVX256NODQ-NEXT:    [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64
 610 ; AVX256NODQ-NEXT:    [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8
 611 ; AVX256NODQ-NEXT:    [[LD2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 2), align 16
 612 ; AVX256NODQ-NEXT:    [[LD3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 3), align 8
 613 ; AVX256NODQ-NEXT:    [[CVT0:%.*]] = uitofp i64 [[LD0]] to float
 614 ; AVX256NODQ-NEXT:    [[CVT1:%.*]] = uitofp i64 [[LD1]] to float
 615 ; AVX256NODQ-NEXT:    [[CVT2:%.*]] = uitofp i64 [[LD2]] to float
 616 ; AVX256NODQ-NEXT:    [[CVT3:%.*]] = uitofp i64 [[LD3]] to float
 617 ; AVX256NODQ-NEXT:    store float [[CVT0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64
 618 ; AVX256NODQ-NEXT:    store float [[CVT1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
 619 ; AVX256NODQ-NEXT:    store float [[CVT2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 8
 620 ; AVX256NODQ-NEXT:    store float [[CVT3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4
 621 ; AVX256NODQ-NEXT:    ret void
 622 ;
 623 ; AVX512-LABEL: @uitofp_4i64_4f32(
 624 ; AVX512-NEXT:    [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @src64 to <4 x i64>*), align 64
 625 ; AVX512-NEXT:    [[TMP2:%.*]] = uitofp <4 x i64> [[TMP1]] to <4 x float>
 626 ; AVX512-NEXT:    store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64
 627 ; AVX512-NEXT:    ret void
 628 ;
 629 ; AVX256DQ-LABEL: @uitofp_4i64_4f32(
 630 ; AVX256DQ-NEXT:    [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @src64 to <4 x i64>*), align 64
 631 ; AVX256DQ-NEXT:    [[TMP2:%.*]] = uitofp <4 x i64> [[TMP1]] to <4 x float>
 632 ; AVX256DQ-NEXT:    store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64
 633 ; AVX256DQ-NEXT:    ret void
 634 ;
 635   %ld0 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64
 636   %ld1 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8
 637   %ld2 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 2), align 16
 638   %ld3 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 3), align 8
 639   %cvt0 = uitofp i64 %ld0 to float
 640   %cvt1 = uitofp i64 %ld1 to float
 641   %cvt2 = uitofp i64 %ld2 to float
 642   %cvt3 = uitofp i64 %ld3 to float
 643   store float %cvt0, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64
 644   store float %cvt1, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
 645   store float %cvt2, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 8
 646   store float %cvt3, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4
 647   ret void
 648 }
 649
 650 define void @uitofp_8i64_8f32() #0 {
 651 ; SSE-LABEL: @uitofp_8i64_8f32(
 652 ; SSE-NEXT:    [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64
 653 ; SSE-NEXT:    [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8
 654 ; SSE-NEXT:    [[LD2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 2), align 16
 655 ; SSE-NEXT:    [[LD3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 3), align 8
 656 ; SSE-NEXT:    [[LD4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 4), align 32
 657 ; SSE-NEXT:    [[LD5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 5), align 8
 658 ; SSE-NEXT:    [[LD6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 6), align 16
 659 ; SSE-NEXT:    [[LD7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 7), align 8
 660 ; SSE-NEXT:    [[CVT0:%.*]] = uitofp i64 [[LD0]] to float
 661 ; SSE-NEXT:    [[CVT1:%.*]] = uitofp i64 [[LD1]] to float
 662 ; SSE-NEXT:    [[CVT2:%.*]] = uitofp i64 [[LD2]] to float
 663 ; SSE-NEXT:    [[CVT3:%.*]] = uitofp i64 [[LD3]] to float
 664 ; SSE-NEXT:    [[CVT4:%.*]] = uitofp i64 [[LD4]] to float
 665 ; SSE-NEXT:    [[CVT5:%.*]] = uitofp i64 [[LD5]] to float
 666 ; SSE-NEXT:    [[CVT6:%.*]] = uitofp i64 [[LD6]] to float
 667 ; SSE-NEXT:    [[CVT7:%.*]] = uitofp i64 [[LD7]] to float
 668 ; SSE-NEXT:    store float [[CVT0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64
 669 ; SSE-NEXT:    store float [[CVT1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
 670 ; SSE-NEXT:    store float [[CVT2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 8
 671 ; SSE-NEXT:    store float [[CVT3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4
 672 ; SSE-NEXT:    store float [[CVT4]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 16
 673 ; SSE-NEXT:    store float [[CVT5]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4
 674 ; SSE-NEXT:    store float [[CVT6]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 8
 675 ; SSE-NEXT:    store float [[CVT7]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4
 676 ; SSE-NEXT:    ret void
 677 ;
 678 ; AVX256NODQ-LABEL: @uitofp_8i64_8f32(
 679 ; AVX256NODQ-NEXT:    [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64
 680 ; AVX256NODQ-NEXT:    [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8
 681 ; AVX256NODQ-NEXT:    [[LD2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 2), align 16
 682 ; AVX256NODQ-NEXT:    [[LD3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 3), align 8
 683 ; AVX256NODQ-NEXT:    [[LD4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 4), align 32
 684 ; AVX256NODQ-NEXT:    [[LD5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 5), align 8
 685 ; AVX256NODQ-NEXT:    [[LD6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 6), align 16
 686 ; AVX256NODQ-NEXT:    [[LD7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 7), align 8
 687 ; AVX256NODQ-NEXT:    [[CVT0:%.*]] = uitofp i64 [[LD0]] to float
 688 ; AVX256NODQ-NEXT:    [[CVT1:%.*]] = uitofp i64 [[LD1]] to float
 689 ; AVX256NODQ-NEXT:    [[CVT2:%.*]] = uitofp i64 [[LD2]] to float
 690 ; AVX256NODQ-NEXT:    [[CVT3:%.*]] = uitofp i64 [[LD3]] to float
 691 ; AVX256NODQ-NEXT:    [[CVT4:%.*]] = uitofp i64 [[LD4]] to float
 692 ; AVX256NODQ-NEXT:    [[CVT5:%.*]] = uitofp i64 [[LD5]] to float
 693 ; AVX256NODQ-NEXT:    [[CVT6:%.*]] = uitofp i64 [[LD6]] to float
 694 ; AVX256NODQ-NEXT:    [[CVT7:%.*]] = uitofp i64 [[LD7]] to float
 695 ; AVX256NODQ-NEXT:    store float [[CVT0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64
 696 ; AVX256NODQ-NEXT:    store float [[CVT1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
 697 ; AVX256NODQ-NEXT:    store float [[CVT2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 8
 698 ; AVX256NODQ-NEXT:    store float [[CVT3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4
 699 ; AVX256NODQ-NEXT:    store float [[CVT4]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 16
 700 ; AVX256NODQ-NEXT:    store float [[CVT5]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4
 701 ; AVX256NODQ-NEXT:    store float [[CVT6]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 8
 702 ; AVX256NODQ-NEXT:    store float [[CVT7]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4
 703 ; AVX256NODQ-NEXT:    ret void
 704 ;
 705 ; AVX512-LABEL: @uitofp_8i64_8f32(
 706 ; AVX512-NEXT:    [[TMP1:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @src64 to <8 x i64>*), align 64
 707 ; AVX512-NEXT:    [[TMP2:%.*]] = uitofp <8 x i64> [[TMP1]] to <8 x float>
 708 ; AVX512-NEXT:    store <8 x float> [[TMP2]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 64
 709 ; AVX512-NEXT:    ret void
 710 ;
 711 ; AVX256DQ-LABEL: @uitofp_8i64_8f32(
 712 ; AVX256DQ-NEXT:    [[TMP1:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @src64 to <8 x i64>*), align 64
 713 ; AVX256DQ-NEXT:    [[TMP2:%.*]] = uitofp <8 x i64> [[TMP1]] to <8 x float>
 714 ; AVX256DQ-NEXT:    store <8 x float> [[TMP2]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 64
 715 ; AVX256DQ-NEXT:    ret void
 716 ;
 717   %ld0 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64
 718   %ld1 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8
 719   %ld2 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 2), align 16
 720   %ld3 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 3), align 8
 721   %ld4 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 4), align 32
 722   %ld5 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 5), align 8
 723   %ld6 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 6), align 16
 724   %ld7 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 7), align 8
 725   %cvt0 = uitofp i64 %ld0 to float
 726   %cvt1 = uitofp i64 %ld1 to float
 727   %cvt2 = uitofp i64 %ld2 to float
 728   %cvt3 = uitofp i64 %ld3 to float
 729   %cvt4 = uitofp i64 %ld4 to float
 730   %cvt5 = uitofp i64 %ld5 to float
 731   %cvt6 = uitofp i64 %ld6 to float
 732   %cvt7 = uitofp i64 %ld7 to float
 733   store float %cvt0, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64
 734   store float %cvt1, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
 735   store float %cvt2, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 8
 736   store float %cvt3, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4
 737   store float %cvt4, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 16
 738   store float %cvt5, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4
 739   store float %cvt6, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 8
 740   store float %cvt7, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4
 741   ret void
 742 }
 743
 744 define void @uitofp_4i32_4f32() #0 {
 745 ; CHECK-LABEL: @uitofp_4i32_4f32(
 746 ; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @src32 to <4 x i32>*), align 64
 747 ; CHECK-NEXT:    [[TMP2:%.*]] = uitofp <4 x i32> [[TMP1]] to <4 x float>
 748 ; CHECK-NEXT:    store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64
 749 ; CHECK-NEXT:    ret void
 750 ;
 751   %ld0 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 0), align 64
 752   %ld1 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 1), align 4
 753   %ld2 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 2), align 8
 754   %ld3 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 3), align 4
 755   %cvt0 = uitofp i32 %ld0 to float
 756   %cvt1 = uitofp i32 %ld1 to float
 757   %cvt2 = uitofp i32 %ld2 to float
 758   %cvt3 = uitofp i32 %ld3 to float
 759   store float %cvt0, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64
 760   store float %cvt1, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
 761   store float %cvt2, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 8
 762   store float %cvt3, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4
 763   ret void
 764 }
 765
 766 define void @uitofp_8i32_8f32() #0 {
 767 ; SSE-LABEL: @uitofp_8i32_8f32(
 768 ; SSE-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @src32 to <4 x i32>*), align 64
 769 ; SSE-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 4) to <4 x i32>*), align 16
 770 ; SSE-NEXT:    [[TMP3:%.*]] = uitofp <4 x i32> [[TMP1]] to <4 x float>
 771 ; SSE-NEXT:    [[TMP4:%.*]] = uitofp <4 x i32> [[TMP2]] to <4 x float>
 772 ; SSE-NEXT:    store <4 x float> [[TMP3]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64
 773 ; SSE-NEXT:    store <4 x float> [[TMP4]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 16
 774 ; SSE-NEXT:    ret void
 775 ;
 776 ; AVX-LABEL: @uitofp_8i32_8f32(
 777 ; AVX-NEXT:    [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @src32 to <8 x i32>*), align 64
 778 ; AVX-NEXT:    [[TMP2:%.*]] = uitofp <8 x i32> [[TMP1]] to <8 x float>
 779 ; AVX-NEXT:    store <8 x float> [[TMP2]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 64
 780 ; AVX-NEXT:    ret void
 781 ;
 782   %ld0 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 0), align 64
 783   %ld1 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 1), align 4
 784   %ld2 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 2), align 8
 785   %ld3 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 3), align 4
 786   %ld4 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 4), align 16
 787   %ld5 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 5), align 4
 788   %ld6 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 6), align 8
 789   %ld7 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 7), align 4
 790   %cvt0 = uitofp i32 %ld0 to float
 791   %cvt1 = uitofp i32 %ld1 to float
 792   %cvt2 = uitofp i32 %ld2 to float
 793   %cvt3 = uitofp i32 %ld3 to float
 794   %cvt4 = uitofp i32 %ld4 to float
 795   %cvt5 = uitofp i32 %ld5 to float
 796   %cvt6 = uitofp i32 %ld6 to float
 797   %cvt7 = uitofp i32 %ld7 to float
 798   store float %cvt0, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64
 799   store float %cvt1, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
 800   store float %cvt2, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 8
 801   store float %cvt3, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4
 802   store float %cvt4, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 16
 803   store float %cvt5, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4
 804   store float %cvt6, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 8
 805   store float %cvt7, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4
 806   ret void
 807 }
 808
 809 define void @uitofp_16i32_16f32() #0 {
 810 ; SSE-LABEL: @uitofp_16i32_16f32(
 811 ; SSE-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @src32 to <4 x i32>*), align 64
 812 ; SSE-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 4) to <4 x i32>*), align 16
 813 ; SSE-NEXT:    [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 8) to <4 x i32>*), align 32
 814 ; SSE-NEXT:    [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 12) to <4 x i32>*), align 16
 815 ; SSE-NEXT:    [[TMP5:%.*]] = uitofp <4 x i32> [[TMP1]] to <4 x float>
 816 ; SSE-NEXT:    [[TMP6:%.*]] = uitofp <4 x i32> [[TMP2]] to <4 x float>
 817 ; SSE-NEXT:    [[TMP7:%.*]] = uitofp <4 x i32> [[TMP3]] to <4 x float>
 818 ; SSE-NEXT:    [[TMP8:%.*]] = uitofp <4 x i32> [[TMP4]] to <4 x float>
 819 ; SSE-NEXT:    store <4 x float> [[TMP5]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64
 820 ; SSE-NEXT:    store <4 x float> [[TMP6]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 16
 821 ; SSE-NEXT:    store <4 x float> [[TMP7]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <4 x float>*), align 32
 822 ; SSE-NEXT:    store <4 x float> [[TMP8]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12) to <4 x float>*), align 16
 823 ; SSE-NEXT:    ret void
 824 ;
 825 ; AVX256-LABEL: @uitofp_16i32_16f32(
 826 ; AVX256-NEXT:    [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @src32 to <8 x i32>*), align 64
 827 ; AVX256-NEXT:    [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 8) to <8 x i32>*), align 32
 828 ; AVX256-NEXT:    [[TMP3:%.*]] = uitofp <8 x i32> [[TMP1]] to <8 x float>
 829 ; AVX256-NEXT:    [[TMP4:%.*]] = uitofp <8 x i32> [[TMP2]] to <8 x float>
 830 ; AVX256-NEXT:    store <8 x float> [[TMP3]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 64
 831 ; AVX256-NEXT:    store <8 x float> [[TMP4]], <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <8 x float>*), align 32
 832 ; AVX256-NEXT:    ret void
 833 ;
 834 ; AVX512-LABEL: @uitofp_16i32_16f32(
 835 ; AVX512-NEXT:    [[TMP1:%.*]] = load <16 x i32>, <16 x i32>* bitcast ([16 x i32]* @src32 to <16 x i32>*), align 64
 836 ; AVX512-NEXT:    [[TMP2:%.*]] = uitofp <16 x i32> [[TMP1]] to <16 x float>
 837 ; AVX512-NEXT:    store <16 x float> [[TMP2]], <16 x float>* bitcast ([16 x float]* @dst32 to <16 x float>*), align 64
 838 ; AVX512-NEXT:    ret void
 839 ;
 840   %ld0  = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 0 ), align 64
 841   %ld1  = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 1 ), align 4
 842   %ld2  = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 2 ), align 8
 843   %ld3  = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 3 ), align 4
 844   %ld4  = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 4 ), align 16
 845   %ld5  = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 5 ), align 4
 846   %ld6  = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 6 ), align 8
 847   %ld7  = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 7 ), align 4
 848   %ld8  = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 8 ), align 32
 849   %ld9  = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 9 ), align 4
 850   %ld10 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 10), align 8
 851   %ld11 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 11), align 4
 852   %ld12 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 12), align 16
 853   %ld13 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 13), align 4
 854   %ld14 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 14), align 8
 855   %ld15 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 15), align 4
 856   %cvt0  = uitofp i32 %ld0  to float
 857   %cvt1  = uitofp i32 %ld1  to float
 858   %cvt2  = uitofp i32 %ld2  to float
 859   %cvt3  = uitofp i32 %ld3  to float
 860   %cvt4  = uitofp i32 %ld4  to float
 861   %cvt5  = uitofp i32 %ld5  to float
 862   %cvt6  = uitofp i32 %ld6  to float
 863   %cvt7  = uitofp i32 %ld7  to float
 864   %cvt8  = uitofp i32 %ld8  to float
 865   %cvt9  = uitofp i32 %ld9  to float
 866   %cvt10 = uitofp i32 %ld10 to float
 867   %cvt11 = uitofp i32 %ld11 to float
 868   %cvt12 = uitofp i32 %ld12 to float
 869   %cvt13 = uitofp i32 %ld13 to float
 870   %cvt14 = uitofp i32 %ld14 to float
 871   %cvt15 = uitofp i32 %ld15 to float
 872   store float %cvt0 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0 ), align 64
 873   store float %cvt1 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1 ), align 4
 874   store float %cvt2 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2 ), align 8
 875   store float %cvt3 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3 ), align 4
 876   store float %cvt4 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4 ), align 16
 877   store float %cvt5 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5 ), align 4
 878   store float %cvt6 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6 ), align 8
 879   store float %cvt7 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7 ), align 4
 880   store float %cvt8 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8 ), align 32
 881   store float %cvt9 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 9 ), align 4
 882   store float %cvt10, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 10), align 8
 883   store float %cvt11, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 11), align 4
 884   store float %cvt12, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12), align 16
 885   store float %cvt13, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 13), align 4
 886   store float %cvt14, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 14), align 8
 887   store float %cvt15, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 15), align 4
 888   ret void
 889 }
 890
 891 define void @uitofp_4i16_4f32() #0 {
 892 ; CHECK-LABEL: @uitofp_4i16_4f32(
 893 ; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i16>, <4 x i16>* bitcast ([32 x i16]* @src16 to <4 x i16>*), align 64
 894 ; CHECK-NEXT:    [[TMP2:%.*]] = uitofp <4 x i16> [[TMP1]] to <4 x float>
 895 ; CHECK-NEXT:    store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64
 896 ; CHECK-NEXT:    ret void
 897 ;
 898   %ld0 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 0), align 64
 899   %ld1 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 1), align 2
 900   %ld2 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 2), align 4
 901   %ld3 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 3), align 2
 902   %cvt0 = uitofp i16 %ld0 to float
 903   %cvt1 = uitofp i16 %ld1 to float
 904   %cvt2 = uitofp i16 %ld2 to float
 905   %cvt3 = uitofp i16 %ld3 to float
 906   store float %cvt0, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64
 907   store float %cvt1, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
 908   store float %cvt2, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 8
 909   store float %cvt3, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4
 910   ret void
 911 }
 912
 913 define void @uitofp_8i16_8f32() #0 {
 914 ; SSE-LABEL: @uitofp_8i16_8f32(
 915 ; SSE-NEXT:    [[TMP1:%.*]] = load <4 x i16>, <4 x i16>* bitcast ([32 x i16]* @src16 to <4 x i16>*), align 64
 916 ; SSE-NEXT:    [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 4) to <4 x i16>*), align 8
 917 ; SSE-NEXT:    [[TMP3:%.*]] = uitofp <4 x i16> [[TMP1]] to <4 x float>
 918 ; SSE-NEXT:    [[TMP4:%.*]] = uitofp <4 x i16> [[TMP2]] to <4 x float>
 919 ; SSE-NEXT:    store <4 x float> [[TMP3]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64
 920 ; SSE-NEXT:    store <4 x float> [[TMP4]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 16
 921 ; SSE-NEXT:    ret void
 922 ;
 923 ; AVX-LABEL: @uitofp_8i16_8f32(
 924 ; AVX-NEXT:    [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @src16 to <8 x i16>*), align 64
 925 ; AVX-NEXT:    [[TMP2:%.*]] = uitofp <8 x i16> [[TMP1]] to <8 x float>
 926 ; AVX-NEXT:    store <8 x float> [[TMP2]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 64
 927 ; AVX-NEXT:    ret void
 928 ;
 929   %ld0 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 0), align 64
 930   %ld1 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 1), align 2
 931   %ld2 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 2), align 4
 932   %ld3 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 3), align 2
 933   %ld4 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 4), align 8
 934   %ld5 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 5), align 2
 935   %ld6 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 6), align 4
 936   %ld7 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 7), align 2
 937   %cvt0 = uitofp i16 %ld0 to float
 938   %cvt1 = uitofp i16 %ld1 to float
 939   %cvt2 = uitofp i16 %ld2 to float
 940   %cvt3 = uitofp i16 %ld3 to float
 941   %cvt4 = uitofp i16 %ld4 to float
 942   %cvt5 = uitofp i16 %ld5 to float
 943   %cvt6 = uitofp i16 %ld6 to float
 944   %cvt7 = uitofp i16 %ld7 to float
 945   store float %cvt0, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64
 946   store float %cvt1, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
 947   store float %cvt2, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 8
 948   store float %cvt3, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4
 949   store float %cvt4, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 16
 950   store float %cvt5, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4
 951   store float %cvt6, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 8
 952   store float %cvt7, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4
 953   ret void
 954 }
 955
 956 define void @uitofp_16i16_16f32() #0 {
 957 ; SSE-LABEL: @uitofp_16i16_16f32(
 958 ; SSE-NEXT:    [[TMP1:%.*]] = load <4 x i16>, <4 x i16>* bitcast ([32 x i16]* @src16 to <4 x i16>*), align 64
 959 ; SSE-NEXT:    [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 4) to <4 x i16>*), align 8
 960 ; SSE-NEXT:    [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 8) to <4 x i16>*), align 16
 961 ; SSE-NEXT:    [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 12) to <4 x i16>*), align 8
 962 ; SSE-NEXT:    [[TMP5:%.*]] = uitofp <4 x i16> [[TMP1]] to <4 x float>
 963 ; SSE-NEXT:    [[TMP6:%.*]] = uitofp <4 x i16> [[TMP2]] to <4 x float>
 964 ; SSE-NEXT:    [[TMP7:%.*]] = uitofp <4 x i16> [[TMP3]] to <4 x float>
 965 ; SSE-NEXT:    [[TMP8:%.*]] = uitofp <4 x i16> [[TMP4]] to <4 x float>
 966 ; SSE-NEXT:    store <4 x float> [[TMP5]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64
 967 ; SSE-NEXT:    store <4 x float> [[TMP6]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 16
 968 ; SSE-NEXT:    store <4 x float> [[TMP7]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <4 x float>*), align 32
 969 ; SSE-NEXT:    store <4 x float> [[TMP8]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12) to <4 x float>*), align 16
 970 ; SSE-NEXT:    ret void
 971 ;
 972 ; AVX256-LABEL: @uitofp_16i16_16f32(
 973 ; AVX256-NEXT:    [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @src16 to <8 x i16>*), align 64
 974 ; AVX256-NEXT:    [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 8) to <8 x i16>*), align 16
 975 ; AVX256-NEXT:    [[TMP3:%.*]] = uitofp <8 x i16> [[TMP1]] to <8 x float>
 976 ; AVX256-NEXT:    [[TMP4:%.*]] = uitofp <8 x i16> [[TMP2]] to <8 x float>
 977 ; AVX256-NEXT:    store <8 x float> [[TMP3]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 64
 978 ; AVX256-NEXT:    store <8 x float> [[TMP4]], <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <8 x float>*), align 32
 979 ; AVX256-NEXT:    ret void
 980 ;
 981 ; AVX512-LABEL: @uitofp_16i16_16f32(
 982 ; AVX512-NEXT:    [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @src16 to <16 x i16>*), align 64
 983 ; AVX512-NEXT:    [[TMP2:%.*]] = uitofp <16 x i16> [[TMP1]] to <16 x float>
 984 ; AVX512-NEXT:    store <16 x float> [[TMP2]], <16 x float>* bitcast ([16 x float]* @dst32 to <16 x float>*), align 64
 985 ; AVX512-NEXT:    ret void
 986 ;
 987   %ld0  = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 0 ), align 64
 988   %ld1  = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 1 ), align 2
 989   %ld2  = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 2 ), align 4
 990   %ld3  = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 3 ), align 2
 991   %ld4  = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 4 ), align 8
 992   %ld5  = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 5 ), align 2
 993   %ld6  = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 6 ), align 4
 994   %ld7  = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 7 ), align 2
 995   %ld8  = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 8 ), align 16
 996   %ld9  = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 9 ), align 2
 997   %ld10 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 10), align 4
 998   %ld11 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 11), align 2
 999   %ld12 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 12), align 8
1000   %ld13 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 13), align 2
1001   %ld14 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 14), align 4
1002   %ld15 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 15), align 2
1003   %cvt0  = uitofp i16 %ld0  to float
1004   %cvt1  = uitofp i16 %ld1  to float
1005   %cvt2  = uitofp i16 %ld2  to float
1006   %cvt3  = uitofp i16 %ld3  to float
1007   %cvt4  = uitofp i16 %ld4  to float
1008   %cvt5  = uitofp i16 %ld5  to float
1009   %cvt6  = uitofp i16 %ld6  to float
1010   %cvt7  = uitofp i16 %ld7  to float
1011   %cvt8  = uitofp i16 %ld8  to float
1012   %cvt9  = uitofp i16 %ld9  to float
1013   %cvt10 = uitofp i16 %ld10 to float
1014   %cvt11 = uitofp i16 %ld11 to float
1015   %cvt12 = uitofp i16 %ld12 to float
1016   %cvt13 = uitofp i16 %ld13 to float
1017   %cvt14 = uitofp i16 %ld14 to float
1018   %cvt15 = uitofp i16 %ld15 to float
1019   store float %cvt0 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0 ), align 64
1020   store float %cvt1 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1 ), align 4
1021   store float %cvt2 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2 ), align 8
1022   store float %cvt3 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3 ), align 4
1023   store float %cvt4 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4 ), align 16
1024   store float %cvt5 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5 ), align 4
1025   store float %cvt6 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6 ), align 8
1026   store float %cvt7 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7 ), align 4
1027   store float %cvt8 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8 ), align 32
1028   store float %cvt9 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 9 ), align 4
1029   store float %cvt10, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 10), align 8
1030   store float %cvt11, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 11), align 4
1031   store float %cvt12, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12), align 16
1032   store float %cvt13, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 13), align 4
1033   store float %cvt14, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 14), align 8
1034   store float %cvt15, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 15), align 4
1035   ret void
1036 }
1037
1038 define void @uitofp_4i8_4f32() #0 {
1039 ; CHECK-LABEL: @uitofp_4i8_4f32(
1040 ; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i8>, <4 x i8>* bitcast ([64 x i8]* @src8 to <4 x i8>*), align 64
1041 ; CHECK-NEXT:    [[TMP2:%.*]] = uitofp <4 x i8> [[TMP1]] to <4 x float>
1042 ; CHECK-NEXT:    store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64
1043 ; CHECK-NEXT:    ret void
1044 ;
1045   %ld0 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 0), align 64
1046   %ld1 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 1), align 1
1047   %ld2 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 2), align 2
1048   %ld3 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 3), align 1
1049   %cvt0 = uitofp i8 %ld0 to float
1050   %cvt1 = uitofp i8 %ld1 to float
1051   %cvt2 = uitofp i8 %ld2 to float
1052   %cvt3 = uitofp i8 %ld3 to float
1053   store float %cvt0, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64
1054   store float %cvt1, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
1055   store float %cvt2, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 8
1056   store float %cvt3, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4
1057   ret void
1058 }
1059
1060 define void @uitofp_8i8_8f32() #0 {
1061 ; SSE-LABEL: @uitofp_8i8_8f32(
1062 ; SSE-NEXT:    [[TMP1:%.*]] = load <4 x i8>, <4 x i8>* bitcast ([64 x i8]* @src8 to <4 x i8>*), align 64
1063 ; SSE-NEXT:    [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 4) to <4 x i8>*), align 4
1064 ; SSE-NEXT:    [[TMP3:%.*]] = uitofp <4 x i8> [[TMP1]] to <4 x float>
1065 ; SSE-NEXT:    [[TMP4:%.*]] = uitofp <4 x i8> [[TMP2]] to <4 x float>
1066 ; SSE-NEXT:    store <4 x float> [[TMP3]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64
1067 ; SSE-NEXT:    store <4 x float> [[TMP4]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 16
1068 ; SSE-NEXT:    ret void
1069 ;
1070 ; AVX-LABEL: @uitofp_8i8_8f32(
1071 ; AVX-NEXT:    [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* bitcast ([64 x i8]* @src8 to <8 x i8>*), align 64
1072 ; AVX-NEXT:    [[TMP2:%.*]] = uitofp <8 x i8> [[TMP1]] to <8 x float>
1073 ; AVX-NEXT:    store <8 x float> [[TMP2]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 64
1074 ; AVX-NEXT:    ret void
1075 ;
1076   %ld0 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 0), align 64
1077   %ld1 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 1), align 1
1078   %ld2 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 2), align 2
1079   %ld3 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 3), align 1
1080   %ld4 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 4), align 4
1081   %ld5 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 5), align 1
1082   %ld6 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 6), align 2
1083   %ld7 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 7), align 1
1084   %cvt0 = uitofp i8 %ld0 to float
1085   %cvt1 = uitofp i8 %ld1 to float
1086   %cvt2 = uitofp i8 %ld2 to float
1087   %cvt3 = uitofp i8 %ld3 to float
1088   %cvt4 = uitofp i8 %ld4 to float
1089   %cvt5 = uitofp i8 %ld5 to float
1090   %cvt6 = uitofp i8 %ld6 to float
1091   %cvt7 = uitofp i8 %ld7 to float
1092   store float %cvt0, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64
1093   store float %cvt1, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
1094   store float %cvt2, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 8
1095   store float %cvt3, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4
1096   store float %cvt4, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 16
1097   store float %cvt5, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4
1098   store float %cvt6, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 8
1099   store float %cvt7, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4
1100   ret void
1101 }
1102
1103 define void @uitofp_16i8_16f32() #0 {
1104 ; SSE-LABEL: @uitofp_16i8_16f32(
1105 ; SSE-NEXT:    [[TMP1:%.*]] = load <4 x i8>, <4 x i8>* bitcast ([64 x i8]* @src8 to <4 x i8>*), align 64
1106 ; SSE-NEXT:    [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 4) to <4 x i8>*), align 4
1107 ; SSE-NEXT:    [[TMP3:%.*]] = load <4 x i8>, <4 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 8) to <4 x i8>*), align 8
1108 ; SSE-NEXT:    [[TMP4:%.*]] = load <4 x i8>, <4 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 12) to <4 x i8>*), align 4
1109 ; SSE-NEXT:    [[TMP5:%.*]] = uitofp <4 x i8> [[TMP1]] to <4 x float>
1110 ; SSE-NEXT:    [[TMP6:%.*]] = uitofp <4 x i8> [[TMP2]] to <4 x float>
1111 ; SSE-NEXT:    [[TMP7:%.*]] = uitofp <4 x i8> [[TMP3]] to <4 x float>
1112 ; SSE-NEXT:    [[TMP8:%.*]] = uitofp <4 x i8> [[TMP4]] to <4 x float>
1113 ; SSE-NEXT:    store <4 x float> [[TMP5]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64
1114 ; SSE-NEXT:    store <4 x float> [[TMP6]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 16
1115 ; SSE-NEXT:    store <4 x float> [[TMP7]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <4 x float>*), align 32
1116 ; SSE-NEXT:    store <4 x float> [[TMP8]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12) to <4 x float>*), align 16
1117 ; SSE-NEXT:    ret void
1118 ;
1119 ; AVX256-LABEL: @uitofp_16i8_16f32(
1120 ; AVX256-NEXT:    [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* bitcast ([64 x i8]* @src8 to <8 x i8>*), align 64
1121 ; AVX256-NEXT:    [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 8) to <8 x i8>*), align 8
1122 ; AVX256-NEXT:    [[TMP3:%.*]] = uitofp <8 x i8> [[TMP1]] to <8 x float>
1123 ; AVX256-NEXT:    [[TMP4:%.*]] = uitofp <8 x i8> [[TMP2]] to <8 x float>
1124 ; AVX256-NEXT:    store <8 x float> [[TMP3]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 64
1125 ; AVX256-NEXT:    store <8 x float> [[TMP4]], <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <8 x float>*), align 32
1126 ; AVX256-NEXT:    ret void
1127 ;
1128 ; AVX512-LABEL: @uitofp_16i8_16f32(
1129 ; AVX512-NEXT:    [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @src8 to <16 x i8>*), align 64
1130 ; AVX512-NEXT:    [[TMP2:%.*]] = uitofp <16 x i8> [[TMP1]] to <16 x float>
1131 ; AVX512-NEXT:    store <16 x float> [[TMP2]], <16 x float>* bitcast ([16 x float]* @dst32 to <16 x float>*), align 64
1132 ; AVX512-NEXT:    ret void
1133 ;
1134   %ld0  = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 0 ), align 64
1135   %ld1  = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 1 ), align 1
1136   %ld2  = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 2 ), align 2
1137   %ld3  = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 3 ), align 1
1138   %ld4  = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 4 ), align 4
1139   %ld5  = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 5 ), align 1
1140   %ld6  = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 6 ), align 2
1141   %ld7  = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 7 ), align 1
1142   %ld8  = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 8 ), align 8
1143   %ld9  = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 9 ), align 1
1144   %ld10 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 10), align 2
1145   %ld11 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 11), align 1
1146   %ld12 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 12), align 4
1147   %ld13 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 13), align 1
1148   %ld14 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 14), align 2
1149   %ld15 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 15), align 1
1150   %cvt0  = uitofp i8 %ld0  to float
1151   %cvt1  = uitofp i8 %ld1  to float
1152   %cvt2  = uitofp i8 %ld2  to float
1153   %cvt3  = uitofp i8 %ld3  to float
1154   %cvt4  = uitofp i8 %ld4  to float
1155   %cvt5  = uitofp i8 %ld5  to float
1156   %cvt6  = uitofp i8 %ld6  to float
1157   %cvt7  = uitofp i8 %ld7  to float
1158   %cvt8  = uitofp i8 %ld8  to float
1159   %cvt9  = uitofp i8 %ld9  to float
1160   %cvt10 = uitofp i8 %ld10 to float
1161   %cvt11 = uitofp i8 %ld11 to float
1162   %cvt12 = uitofp i8 %ld12 to float
1163   %cvt13 = uitofp i8 %ld13 to float
1164   %cvt14 = uitofp i8 %ld14 to float
1165   %cvt15 = uitofp i8 %ld15 to float
1166   store float %cvt0 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0 ), align 64
1167   store float %cvt1 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1 ), align 4
1168   store float %cvt2 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2 ), align 8
1169   store float %cvt3 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3 ), align 4
1170   store float %cvt4 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4 ), align 16
1171   store float %cvt5 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5 ), align 4
1172   store float %cvt6 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6 ), align 8
1173   store float %cvt7 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7 ), align 4
1174   store float %cvt8 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8 ), align 32
1175   store float %cvt9 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 9 ), align 4
1176   store float %cvt10, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 10), align 8
1177   store float %cvt11, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 11), align 4
1178   store float %cvt12, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12), align 16
1179   store float %cvt13, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 13), align 4
1180   store float %cvt14, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 14), align 8
1181   store float %cvt15, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 15), align 4
1182   ret void
1183 }
1184
1185 attributes #0 = { nounwind }