1 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
2 ; RUN: opt -lower-matrix-intrinsics -S < %s | FileCheck %s
3 ; RUN: opt -passes='lower-matrix-intrinsics' -S < %s | FileCheck %s
5 define void @transpose_multiply(<9 x double>* %A.Ptr, <9 x double>* %B.Ptr, <9 x double>* %C.Ptr) {
6 ; CHECK-LABEL: @transpose_multiply(
8 ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <9 x double>* [[A_PTR:%.*]] to double*
9 ; CHECK-NEXT: [[VEC_CAST:%.*]] = bitcast double* [[TMP0]] to <3 x double>*
10 ; CHECK-NEXT: [[COL_LOAD:%.*]] = load <3 x double>, <3 x double>* [[VEC_CAST]], align 8
11 ; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr double, double* [[TMP0]], i64 3
12 ; CHECK-NEXT: [[VEC_CAST1:%.*]] = bitcast double* [[VEC_GEP]] to <3 x double>*
13 ; CHECK-NEXT: [[COL_LOAD2:%.*]] = load <3 x double>, <3 x double>* [[VEC_CAST1]], align 8
14 ; CHECK-NEXT: [[VEC_GEP3:%.*]] = getelementptr double, double* [[TMP0]], i64 6
15 ; CHECK-NEXT: [[VEC_CAST4:%.*]] = bitcast double* [[VEC_GEP3]] to <3 x double>*
16 ; CHECK-NEXT: [[COL_LOAD5:%.*]] = load <3 x double>, <3 x double>* [[VEC_CAST4]], align 8
17 ; CHECK-NEXT: [[TMP1:%.*]] = bitcast <9 x double>* [[B_PTR:%.*]] to double*
18 ; CHECK-NEXT: [[VEC_CAST6:%.*]] = bitcast double* [[TMP1]] to <3 x double>*
19 ; CHECK-NEXT: [[COL_LOAD7:%.*]] = load <3 x double>, <3 x double>* [[VEC_CAST6]], align 8
20 ; CHECK-NEXT: [[VEC_GEP8:%.*]] = getelementptr double, double* [[TMP1]], i64 3
21 ; CHECK-NEXT: [[VEC_CAST9:%.*]] = bitcast double* [[VEC_GEP8]] to <3 x double>*
22 ; CHECK-NEXT: [[COL_LOAD10:%.*]] = load <3 x double>, <3 x double>* [[VEC_CAST9]], align 8
23 ; CHECK-NEXT: [[VEC_GEP11:%.*]] = getelementptr double, double* [[TMP1]], i64 6
24 ; CHECK-NEXT: [[VEC_CAST12:%.*]] = bitcast double* [[VEC_GEP11]] to <3 x double>*
25 ; CHECK-NEXT: [[COL_LOAD13:%.*]] = load <3 x double>, <3 x double>* [[VEC_CAST12]], align 8
26 ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <3 x double> [[COL_LOAD]], i64 0
27 ; CHECK-NEXT: [[TMP3:%.*]] = insertelement <3 x double> undef, double [[TMP2]], i64 0
28 ; CHECK-NEXT: [[TMP4:%.*]] = extractelement <3 x double> [[COL_LOAD2]], i64 0
29 ; CHECK-NEXT: [[TMP5:%.*]] = insertelement <3 x double> [[TMP3]], double [[TMP4]], i64 1
30 ; CHECK-NEXT: [[TMP6:%.*]] = extractelement <3 x double> [[COL_LOAD5]], i64 0
31 ; CHECK-NEXT: [[TMP7:%.*]] = insertelement <3 x double> [[TMP5]], double [[TMP6]], i64 2
32 ; CHECK-NEXT: [[TMP8:%.*]] = extractelement <3 x double> [[COL_LOAD]], i64 1
33 ; CHECK-NEXT: [[TMP9:%.*]] = insertelement <3 x double> undef, double [[TMP8]], i64 0
34 ; CHECK-NEXT: [[TMP10:%.*]] = extractelement <3 x double> [[COL_LOAD2]], i64 1
35 ; CHECK-NEXT: [[TMP11:%.*]] = insertelement <3 x double> [[TMP9]], double [[TMP10]], i64 1
36 ; CHECK-NEXT: [[TMP12:%.*]] = extractelement <3 x double> [[COL_LOAD5]], i64 1
37 ; CHECK-NEXT: [[TMP13:%.*]] = insertelement <3 x double> [[TMP11]], double [[TMP12]], i64 2
38 ; CHECK-NEXT: [[TMP14:%.*]] = extractelement <3 x double> [[COL_LOAD]], i64 2
39 ; CHECK-NEXT: [[TMP15:%.*]] = insertelement <3 x double> undef, double [[TMP14]], i64 0
40 ; CHECK-NEXT: [[TMP16:%.*]] = extractelement <3 x double> [[COL_LOAD2]], i64 2
41 ; CHECK-NEXT: [[TMP17:%.*]] = insertelement <3 x double> [[TMP15]], double [[TMP16]], i64 1
42 ; CHECK-NEXT: [[TMP18:%.*]] = extractelement <3 x double> [[COL_LOAD5]], i64 2
43 ; CHECK-NEXT: [[TMP19:%.*]] = insertelement <3 x double> [[TMP17]], double [[TMP18]], i64 2
44 ; CHECK-NEXT: [[BLOCK:%.*]] = shufflevector <3 x double> [[TMP7]], <3 x double> poison, <1 x i32> zeroinitializer
45 ; CHECK-NEXT: [[TMP20:%.*]] = extractelement <3 x double> [[COL_LOAD7]], i64 0
46 ; CHECK-NEXT: [[SPLAT_SPLATINSERT:%.*]] = insertelement <1 x double> poison, double [[TMP20]], i32 0
47 ; CHECK-NEXT: [[SPLAT_SPLAT:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT]], <1 x double> poison, <1 x i32> zeroinitializer
48 ; CHECK-NEXT: [[TMP21:%.*]] = fmul <1 x double> [[BLOCK]], [[SPLAT_SPLAT]]
49 ; CHECK-NEXT: [[BLOCK14:%.*]] = shufflevector <3 x double> [[TMP13]], <3 x double> poison, <1 x i32> zeroinitializer
50 ; CHECK-NEXT: [[TMP22:%.*]] = extractelement <3 x double> [[COL_LOAD7]], i64 1
51 ; CHECK-NEXT: [[SPLAT_SPLATINSERT15:%.*]] = insertelement <1 x double> poison, double [[TMP22]], i32 0
52 ; CHECK-NEXT: [[SPLAT_SPLAT16:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT15]], <1 x double> poison, <1 x i32> zeroinitializer
53 ; CHECK-NEXT: [[TMP23:%.*]] = fmul <1 x double> [[BLOCK14]], [[SPLAT_SPLAT16]]
54 ; CHECK-NEXT: [[TMP24:%.*]] = fadd <1 x double> [[TMP21]], [[TMP23]]
55 ; CHECK-NEXT: [[BLOCK17:%.*]] = shufflevector <3 x double> [[TMP19]], <3 x double> poison, <1 x i32> zeroinitializer
56 ; CHECK-NEXT: [[TMP25:%.*]] = extractelement <3 x double> [[COL_LOAD7]], i64 2
57 ; CHECK-NEXT: [[SPLAT_SPLATINSERT18:%.*]] = insertelement <1 x double> poison, double [[TMP25]], i32 0
58 ; CHECK-NEXT: [[SPLAT_SPLAT19:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT18]], <1 x double> poison, <1 x i32> zeroinitializer
59 ; CHECK-NEXT: [[TMP26:%.*]] = fmul <1 x double> [[BLOCK17]], [[SPLAT_SPLAT19]]
60 ; CHECK-NEXT: [[TMP27:%.*]] = fadd <1 x double> [[TMP24]], [[TMP26]]
61 ; CHECK-NEXT: [[TMP28:%.*]] = shufflevector <1 x double> [[TMP27]], <1 x double> poison, <3 x i32> <i32 0, i32 undef, i32 undef>
62 ; CHECK-NEXT: [[TMP29:%.*]] = shufflevector <3 x double> undef, <3 x double> [[TMP28]], <3 x i32> <i32 3, i32 1, i32 2>
63 ; CHECK-NEXT: [[BLOCK20:%.*]] = shufflevector <3 x double> [[TMP7]], <3 x double> poison, <1 x i32> <i32 1>
64 ; CHECK-NEXT: [[TMP30:%.*]] = extractelement <3 x double> [[COL_LOAD7]], i64 0
65 ; CHECK-NEXT: [[SPLAT_SPLATINSERT21:%.*]] = insertelement <1 x double> poison, double [[TMP30]], i32 0
66 ; CHECK-NEXT: [[SPLAT_SPLAT22:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT21]], <1 x double> poison, <1 x i32> zeroinitializer
67 ; CHECK-NEXT: [[TMP31:%.*]] = fmul <1 x double> [[BLOCK20]], [[SPLAT_SPLAT22]]
68 ; CHECK-NEXT: [[BLOCK23:%.*]] = shufflevector <3 x double> [[TMP13]], <3 x double> poison, <1 x i32> <i32 1>
69 ; CHECK-NEXT: [[TMP32:%.*]] = extractelement <3 x double> [[COL_LOAD7]], i64 1
70 ; CHECK-NEXT: [[SPLAT_SPLATINSERT24:%.*]] = insertelement <1 x double> poison, double [[TMP32]], i32 0
71 ; CHECK-NEXT: [[SPLAT_SPLAT25:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT24]], <1 x double> poison, <1 x i32> zeroinitializer
72 ; CHECK-NEXT: [[TMP33:%.*]] = fmul <1 x double> [[BLOCK23]], [[SPLAT_SPLAT25]]
73 ; CHECK-NEXT: [[TMP34:%.*]] = fadd <1 x double> [[TMP31]], [[TMP33]]
74 ; CHECK-NEXT: [[BLOCK26:%.*]] = shufflevector <3 x double> [[TMP19]], <3 x double> poison, <1 x i32> <i32 1>
75 ; CHECK-NEXT: [[TMP35:%.*]] = extractelement <3 x double> [[COL_LOAD7]], i64 2
76 ; CHECK-NEXT: [[SPLAT_SPLATINSERT27:%.*]] = insertelement <1 x double> poison, double [[TMP35]], i32 0
77 ; CHECK-NEXT: [[SPLAT_SPLAT28:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT27]], <1 x double> poison, <1 x i32> zeroinitializer
78 ; CHECK-NEXT: [[TMP36:%.*]] = fmul <1 x double> [[BLOCK26]], [[SPLAT_SPLAT28]]
79 ; CHECK-NEXT: [[TMP37:%.*]] = fadd <1 x double> [[TMP34]], [[TMP36]]
80 ; CHECK-NEXT: [[TMP38:%.*]] = shufflevector <1 x double> [[TMP37]], <1 x double> poison, <3 x i32> <i32 0, i32 undef, i32 undef>
81 ; CHECK-NEXT: [[TMP39:%.*]] = shufflevector <3 x double> [[TMP29]], <3 x double> [[TMP38]], <3 x i32> <i32 0, i32 3, i32 2>
82 ; CHECK-NEXT: [[BLOCK29:%.*]] = shufflevector <3 x double> [[TMP7]], <3 x double> poison, <1 x i32> <i32 2>
83 ; CHECK-NEXT: [[TMP40:%.*]] = extractelement <3 x double> [[COL_LOAD7]], i64 0
84 ; CHECK-NEXT: [[SPLAT_SPLATINSERT30:%.*]] = insertelement <1 x double> poison, double [[TMP40]], i32 0
85 ; CHECK-NEXT: [[SPLAT_SPLAT31:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT30]], <1 x double> poison, <1 x i32> zeroinitializer
86 ; CHECK-NEXT: [[TMP41:%.*]] = fmul <1 x double> [[BLOCK29]], [[SPLAT_SPLAT31]]
87 ; CHECK-NEXT: [[BLOCK32:%.*]] = shufflevector <3 x double> [[TMP13]], <3 x double> poison, <1 x i32> <i32 2>
88 ; CHECK-NEXT: [[TMP42:%.*]] = extractelement <3 x double> [[COL_LOAD7]], i64 1
89 ; CHECK-NEXT: [[SPLAT_SPLATINSERT33:%.*]] = insertelement <1 x double> poison, double [[TMP42]], i32 0
90 ; CHECK-NEXT: [[SPLAT_SPLAT34:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT33]], <1 x double> poison, <1 x i32> zeroinitializer
91 ; CHECK-NEXT: [[TMP43:%.*]] = fmul <1 x double> [[BLOCK32]], [[SPLAT_SPLAT34]]
92 ; CHECK-NEXT: [[TMP44:%.*]] = fadd <1 x double> [[TMP41]], [[TMP43]]
93 ; CHECK-NEXT: [[BLOCK35:%.*]] = shufflevector <3 x double> [[TMP19]], <3 x double> poison, <1 x i32> <i32 2>
94 ; CHECK-NEXT: [[TMP45:%.*]] = extractelement <3 x double> [[COL_LOAD7]], i64 2
95 ; CHECK-NEXT: [[SPLAT_SPLATINSERT36:%.*]] = insertelement <1 x double> poison, double [[TMP45]], i32 0
96 ; CHECK-NEXT: [[SPLAT_SPLAT37:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT36]], <1 x double> poison, <1 x i32> zeroinitializer
97 ; CHECK-NEXT: [[TMP46:%.*]] = fmul <1 x double> [[BLOCK35]], [[SPLAT_SPLAT37]]
98 ; CHECK-NEXT: [[TMP47:%.*]] = fadd <1 x double> [[TMP44]], [[TMP46]]
99 ; CHECK-NEXT: [[TMP48:%.*]] = shufflevector <1 x double> [[TMP47]], <1 x double> poison, <3 x i32> <i32 0, i32 undef, i32 undef>
100 ; CHECK-NEXT: [[TMP49:%.*]] = shufflevector <3 x double> [[TMP39]], <3 x double> [[TMP48]], <3 x i32> <i32 0, i32 1, i32 3>
101 ; CHECK-NEXT: [[BLOCK38:%.*]] = shufflevector <3 x double> [[TMP7]], <3 x double> poison, <1 x i32> zeroinitializer
102 ; CHECK-NEXT: [[TMP50:%.*]] = extractelement <3 x double> [[COL_LOAD10]], i64 0
103 ; CHECK-NEXT: [[SPLAT_SPLATINSERT39:%.*]] = insertelement <1 x double> poison, double [[TMP50]], i32 0
104 ; CHECK-NEXT: [[SPLAT_SPLAT40:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT39]], <1 x double> poison, <1 x i32> zeroinitializer
105 ; CHECK-NEXT: [[TMP51:%.*]] = fmul <1 x double> [[BLOCK38]], [[SPLAT_SPLAT40]]
106 ; CHECK-NEXT: [[BLOCK41:%.*]] = shufflevector <3 x double> [[TMP13]], <3 x double> poison, <1 x i32> zeroinitializer
107 ; CHECK-NEXT: [[TMP52:%.*]] = extractelement <3 x double> [[COL_LOAD10]], i64 1
108 ; CHECK-NEXT: [[SPLAT_SPLATINSERT42:%.*]] = insertelement <1 x double> poison, double [[TMP52]], i32 0
109 ; CHECK-NEXT: [[SPLAT_SPLAT43:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT42]], <1 x double> poison, <1 x i32> zeroinitializer
110 ; CHECK-NEXT: [[TMP53:%.*]] = fmul <1 x double> [[BLOCK41]], [[SPLAT_SPLAT43]]
111 ; CHECK-NEXT: [[TMP54:%.*]] = fadd <1 x double> [[TMP51]], [[TMP53]]
112 ; CHECK-NEXT: [[BLOCK44:%.*]] = shufflevector <3 x double> [[TMP19]], <3 x double> poison, <1 x i32> zeroinitializer
113 ; CHECK-NEXT: [[TMP55:%.*]] = extractelement <3 x double> [[COL_LOAD10]], i64 2
114 ; CHECK-NEXT: [[SPLAT_SPLATINSERT45:%.*]] = insertelement <1 x double> poison, double [[TMP55]], i32 0
115 ; CHECK-NEXT: [[SPLAT_SPLAT46:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT45]], <1 x double> poison, <1 x i32> zeroinitializer
116 ; CHECK-NEXT: [[TMP56:%.*]] = fmul <1 x double> [[BLOCK44]], [[SPLAT_SPLAT46]]
117 ; CHECK-NEXT: [[TMP57:%.*]] = fadd <1 x double> [[TMP54]], [[TMP56]]
118 ; CHECK-NEXT: [[TMP58:%.*]] = shufflevector <1 x double> [[TMP57]], <1 x double> poison, <3 x i32> <i32 0, i32 undef, i32 undef>
119 ; CHECK-NEXT: [[TMP59:%.*]] = shufflevector <3 x double> undef, <3 x double> [[TMP58]], <3 x i32> <i32 3, i32 1, i32 2>
120 ; CHECK-NEXT: [[BLOCK47:%.*]] = shufflevector <3 x double> [[TMP7]], <3 x double> poison, <1 x i32> <i32 1>
121 ; CHECK-NEXT: [[TMP60:%.*]] = extractelement <3 x double> [[COL_LOAD10]], i64 0
122 ; CHECK-NEXT: [[SPLAT_SPLATINSERT48:%.*]] = insertelement <1 x double> poison, double [[TMP60]], i32 0
123 ; CHECK-NEXT: [[SPLAT_SPLAT49:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT48]], <1 x double> poison, <1 x i32> zeroinitializer
124 ; CHECK-NEXT: [[TMP61:%.*]] = fmul <1 x double> [[BLOCK47]], [[SPLAT_SPLAT49]]
125 ; CHECK-NEXT: [[BLOCK50:%.*]] = shufflevector <3 x double> [[TMP13]], <3 x double> poison, <1 x i32> <i32 1>
126 ; CHECK-NEXT: [[TMP62:%.*]] = extractelement <3 x double> [[COL_LOAD10]], i64 1
127 ; CHECK-NEXT: [[SPLAT_SPLATINSERT51:%.*]] = insertelement <1 x double> poison, double [[TMP62]], i32 0
128 ; CHECK-NEXT: [[SPLAT_SPLAT52:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT51]], <1 x double> poison, <1 x i32> zeroinitializer
129 ; CHECK-NEXT: [[TMP63:%.*]] = fmul <1 x double> [[BLOCK50]], [[SPLAT_SPLAT52]]
130 ; CHECK-NEXT: [[TMP64:%.*]] = fadd <1 x double> [[TMP61]], [[TMP63]]
131 ; CHECK-NEXT: [[BLOCK53:%.*]] = shufflevector <3 x double> [[TMP19]], <3 x double> poison, <1 x i32> <i32 1>
132 ; CHECK-NEXT: [[TMP65:%.*]] = extractelement <3 x double> [[COL_LOAD10]], i64 2
133 ; CHECK-NEXT: [[SPLAT_SPLATINSERT54:%.*]] = insertelement <1 x double> poison, double [[TMP65]], i32 0
134 ; CHECK-NEXT: [[SPLAT_SPLAT55:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT54]], <1 x double> poison, <1 x i32> zeroinitializer
135 ; CHECK-NEXT: [[TMP66:%.*]] = fmul <1 x double> [[BLOCK53]], [[SPLAT_SPLAT55]]
136 ; CHECK-NEXT: [[TMP67:%.*]] = fadd <1 x double> [[TMP64]], [[TMP66]]
137 ; CHECK-NEXT: [[TMP68:%.*]] = shufflevector <1 x double> [[TMP67]], <1 x double> poison, <3 x i32> <i32 0, i32 undef, i32 undef>
138 ; CHECK-NEXT: [[TMP69:%.*]] = shufflevector <3 x double> [[TMP59]], <3 x double> [[TMP68]], <3 x i32> <i32 0, i32 3, i32 2>
139 ; CHECK-NEXT: [[BLOCK56:%.*]] = shufflevector <3 x double> [[TMP7]], <3 x double> poison, <1 x i32> <i32 2>
140 ; CHECK-NEXT: [[TMP70:%.*]] = extractelement <3 x double> [[COL_LOAD10]], i64 0
141 ; CHECK-NEXT: [[SPLAT_SPLATINSERT57:%.*]] = insertelement <1 x double> poison, double [[TMP70]], i32 0
142 ; CHECK-NEXT: [[SPLAT_SPLAT58:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT57]], <1 x double> poison, <1 x i32> zeroinitializer
143 ; CHECK-NEXT: [[TMP71:%.*]] = fmul <1 x double> [[BLOCK56]], [[SPLAT_SPLAT58]]
144 ; CHECK-NEXT: [[BLOCK59:%.*]] = shufflevector <3 x double> [[TMP13]], <3 x double> poison, <1 x i32> <i32 2>
145 ; CHECK-NEXT: [[TMP72:%.*]] = extractelement <3 x double> [[COL_LOAD10]], i64 1
146 ; CHECK-NEXT: [[SPLAT_SPLATINSERT60:%.*]] = insertelement <1 x double> poison, double [[TMP72]], i32 0
147 ; CHECK-NEXT: [[SPLAT_SPLAT61:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT60]], <1 x double> poison, <1 x i32> zeroinitializer
148 ; CHECK-NEXT: [[TMP73:%.*]] = fmul <1 x double> [[BLOCK59]], [[SPLAT_SPLAT61]]
149 ; CHECK-NEXT: [[TMP74:%.*]] = fadd <1 x double> [[TMP71]], [[TMP73]]
150 ; CHECK-NEXT: [[BLOCK62:%.*]] = shufflevector <3 x double> [[TMP19]], <3 x double> poison, <1 x i32> <i32 2>
151 ; CHECK-NEXT: [[TMP75:%.*]] = extractelement <3 x double> [[COL_LOAD10]], i64 2
152 ; CHECK-NEXT: [[SPLAT_SPLATINSERT63:%.*]] = insertelement <1 x double> poison, double [[TMP75]], i32 0
153 ; CHECK-NEXT: [[SPLAT_SPLAT64:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT63]], <1 x double> poison, <1 x i32> zeroinitializer
154 ; CHECK-NEXT: [[TMP76:%.*]] = fmul <1 x double> [[BLOCK62]], [[SPLAT_SPLAT64]]
155 ; CHECK-NEXT: [[TMP77:%.*]] = fadd <1 x double> [[TMP74]], [[TMP76]]
156 ; CHECK-NEXT: [[TMP78:%.*]] = shufflevector <1 x double> [[TMP77]], <1 x double> poison, <3 x i32> <i32 0, i32 undef, i32 undef>
157 ; CHECK-NEXT: [[TMP79:%.*]] = shufflevector <3 x double> [[TMP69]], <3 x double> [[TMP78]], <3 x i32> <i32 0, i32 1, i32 3>
158 ; CHECK-NEXT: [[BLOCK65:%.*]] = shufflevector <3 x double> [[TMP7]], <3 x double> poison, <1 x i32> zeroinitializer
159 ; CHECK-NEXT: [[TMP80:%.*]] = extractelement <3 x double> [[COL_LOAD13]], i64 0
160 ; CHECK-NEXT: [[SPLAT_SPLATINSERT66:%.*]] = insertelement <1 x double> poison, double [[TMP80]], i32 0
161 ; CHECK-NEXT: [[SPLAT_SPLAT67:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT66]], <1 x double> poison, <1 x i32> zeroinitializer
162 ; CHECK-NEXT: [[TMP81:%.*]] = fmul <1 x double> [[BLOCK65]], [[SPLAT_SPLAT67]]
163 ; CHECK-NEXT: [[BLOCK68:%.*]] = shufflevector <3 x double> [[TMP13]], <3 x double> poison, <1 x i32> zeroinitializer
164 ; CHECK-NEXT: [[TMP82:%.*]] = extractelement <3 x double> [[COL_LOAD13]], i64 1
165 ; CHECK-NEXT: [[SPLAT_SPLATINSERT69:%.*]] = insertelement <1 x double> poison, double [[TMP82]], i32 0
166 ; CHECK-NEXT: [[SPLAT_SPLAT70:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT69]], <1 x double> poison, <1 x i32> zeroinitializer
167 ; CHECK-NEXT: [[TMP83:%.*]] = fmul <1 x double> [[BLOCK68]], [[SPLAT_SPLAT70]]
168 ; CHECK-NEXT: [[TMP84:%.*]] = fadd <1 x double> [[TMP81]], [[TMP83]]
169 ; CHECK-NEXT: [[BLOCK71:%.*]] = shufflevector <3 x double> [[TMP19]], <3 x double> poison, <1 x i32> zeroinitializer
170 ; CHECK-NEXT: [[TMP85:%.*]] = extractelement <3 x double> [[COL_LOAD13]], i64 2
171 ; CHECK-NEXT: [[SPLAT_SPLATINSERT72:%.*]] = insertelement <1 x double> poison, double [[TMP85]], i32 0
172 ; CHECK-NEXT: [[SPLAT_SPLAT73:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT72]], <1 x double> poison, <1 x i32> zeroinitializer
173 ; CHECK-NEXT: [[TMP86:%.*]] = fmul <1 x double> [[BLOCK71]], [[SPLAT_SPLAT73]]
174 ; CHECK-NEXT: [[TMP87:%.*]] = fadd <1 x double> [[TMP84]], [[TMP86]]
175 ; CHECK-NEXT: [[TMP88:%.*]] = shufflevector <1 x double> [[TMP87]], <1 x double> poison, <3 x i32> <i32 0, i32 undef, i32 undef>
176 ; CHECK-NEXT: [[TMP89:%.*]] = shufflevector <3 x double> undef, <3 x double> [[TMP88]], <3 x i32> <i32 3, i32 1, i32 2>
177 ; CHECK-NEXT: [[BLOCK74:%.*]] = shufflevector <3 x double> [[TMP7]], <3 x double> poison, <1 x i32> <i32 1>
178 ; CHECK-NEXT: [[TMP90:%.*]] = extractelement <3 x double> [[COL_LOAD13]], i64 0
179 ; CHECK-NEXT: [[SPLAT_SPLATINSERT75:%.*]] = insertelement <1 x double> poison, double [[TMP90]], i32 0
180 ; CHECK-NEXT: [[SPLAT_SPLAT76:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT75]], <1 x double> poison, <1 x i32> zeroinitializer
181 ; CHECK-NEXT: [[TMP91:%.*]] = fmul <1 x double> [[BLOCK74]], [[SPLAT_SPLAT76]]
182 ; CHECK-NEXT: [[BLOCK77:%.*]] = shufflevector <3 x double> [[TMP13]], <3 x double> poison, <1 x i32> <i32 1>
183 ; CHECK-NEXT: [[TMP92:%.*]] = extractelement <3 x double> [[COL_LOAD13]], i64 1
184 ; CHECK-NEXT: [[SPLAT_SPLATINSERT78:%.*]] = insertelement <1 x double> poison, double [[TMP92]], i32 0
185 ; CHECK-NEXT: [[SPLAT_SPLAT79:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT78]], <1 x double> poison, <1 x i32> zeroinitializer
186 ; CHECK-NEXT: [[TMP93:%.*]] = fmul <1 x double> [[BLOCK77]], [[SPLAT_SPLAT79]]
187 ; CHECK-NEXT: [[TMP94:%.*]] = fadd <1 x double> [[TMP91]], [[TMP93]]
188 ; CHECK-NEXT: [[BLOCK80:%.*]] = shufflevector <3 x double> [[TMP19]], <3 x double> poison, <1 x i32> <i32 1>
189 ; CHECK-NEXT: [[TMP95:%.*]] = extractelement <3 x double> [[COL_LOAD13]], i64 2
190 ; CHECK-NEXT: [[SPLAT_SPLATINSERT81:%.*]] = insertelement <1 x double> poison, double [[TMP95]], i32 0
191 ; CHECK-NEXT: [[SPLAT_SPLAT82:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT81]], <1 x double> poison, <1 x i32> zeroinitializer
192 ; CHECK-NEXT: [[TMP96:%.*]] = fmul <1 x double> [[BLOCK80]], [[SPLAT_SPLAT82]]
193 ; CHECK-NEXT: [[TMP97:%.*]] = fadd <1 x double> [[TMP94]], [[TMP96]]
194 ; CHECK-NEXT: [[TMP98:%.*]] = shufflevector <1 x double> [[TMP97]], <1 x double> poison, <3 x i32> <i32 0, i32 undef, i32 undef>
195 ; CHECK-NEXT: [[TMP99:%.*]] = shufflevector <3 x double> [[TMP89]], <3 x double> [[TMP98]], <3 x i32> <i32 0, i32 3, i32 2>
196 ; CHECK-NEXT: [[BLOCK83:%.*]] = shufflevector <3 x double> [[TMP7]], <3 x double> poison, <1 x i32> <i32 2>
197 ; CHECK-NEXT: [[TMP100:%.*]] = extractelement <3 x double> [[COL_LOAD13]], i64 0
198 ; CHECK-NEXT: [[SPLAT_SPLATINSERT84:%.*]] = insertelement <1 x double> poison, double [[TMP100]], i32 0
199 ; CHECK-NEXT: [[SPLAT_SPLAT85:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT84]], <1 x double> poison, <1 x i32> zeroinitializer
200 ; CHECK-NEXT: [[TMP101:%.*]] = fmul <1 x double> [[BLOCK83]], [[SPLAT_SPLAT85]]
201 ; CHECK-NEXT: [[BLOCK86:%.*]] = shufflevector <3 x double> [[TMP13]], <3 x double> poison, <1 x i32> <i32 2>
202 ; CHECK-NEXT: [[TMP102:%.*]] = extractelement <3 x double> [[COL_LOAD13]], i64 1
203 ; CHECK-NEXT: [[SPLAT_SPLATINSERT87:%.*]] = insertelement <1 x double> poison, double [[TMP102]], i32 0
204 ; CHECK-NEXT: [[SPLAT_SPLAT88:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT87]], <1 x double> poison, <1 x i32> zeroinitializer
205 ; CHECK-NEXT: [[TMP103:%.*]] = fmul <1 x double> [[BLOCK86]], [[SPLAT_SPLAT88]]
206 ; CHECK-NEXT: [[TMP104:%.*]] = fadd <1 x double> [[TMP101]], [[TMP103]]
207 ; CHECK-NEXT: [[BLOCK89:%.*]] = shufflevector <3 x double> [[TMP19]], <3 x double> poison, <1 x i32> <i32 2>
208 ; CHECK-NEXT: [[TMP105:%.*]] = extractelement <3 x double> [[COL_LOAD13]], i64 2
209 ; CHECK-NEXT: [[SPLAT_SPLATINSERT90:%.*]] = insertelement <1 x double> poison, double [[TMP105]], i32 0
210 ; CHECK-NEXT: [[SPLAT_SPLAT91:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT90]], <1 x double> poison, <1 x i32> zeroinitializer
211 ; CHECK-NEXT: [[TMP106:%.*]] = fmul <1 x double> [[BLOCK89]], [[SPLAT_SPLAT91]]
212 ; CHECK-NEXT: [[TMP107:%.*]] = fadd <1 x double> [[TMP104]], [[TMP106]]
213 ; CHECK-NEXT: [[TMP108:%.*]] = shufflevector <1 x double> [[TMP107]], <1 x double> poison, <3 x i32> <i32 0, i32 undef, i32 undef>
214 ; CHECK-NEXT: [[TMP109:%.*]] = shufflevector <3 x double> [[TMP99]], <3 x double> [[TMP108]], <3 x i32> <i32 0, i32 1, i32 3>
215 ; CHECK-NEXT: [[TMP110:%.*]] = bitcast <9 x double>* [[C_PTR:%.*]] to double*
216 ; CHECK-NEXT: [[VEC_CAST92:%.*]] = bitcast double* [[TMP110]] to <3 x double>*
217 ; CHECK-NEXT: store <3 x double> [[TMP49]], <3 x double>* [[VEC_CAST92]], align 8
218 ; CHECK-NEXT: [[VEC_GEP93:%.*]] = getelementptr double, double* [[TMP110]], i64 3
219 ; CHECK-NEXT: [[VEC_CAST94:%.*]] = bitcast double* [[VEC_GEP93]] to <3 x double>*
220 ; CHECK-NEXT: store <3 x double> [[TMP79]], <3 x double>* [[VEC_CAST94]], align 8
221 ; CHECK-NEXT: [[VEC_GEP95:%.*]] = getelementptr double, double* [[TMP110]], i64 6
222 ; CHECK-NEXT: [[VEC_CAST96:%.*]] = bitcast double* [[VEC_GEP95]] to <3 x double>*
223 ; CHECK-NEXT: store <3 x double> [[TMP109]], <3 x double>* [[VEC_CAST96]], align 8
224 ; CHECK-NEXT: ret void
227 ; Load columns of input matrixes %A and %B.
233 ; Lower multiply(transpose(%A), %B)
236 ; Store result columns.
240 %a = load <9 x double>, <9 x double>* %A.Ptr, align 8
241 %b = load <9 x double>, <9 x double>* %B.Ptr, align 8
242 %a.trans = call <9 x double> @llvm.matrix.transpose(<9 x double> %a, i32 3, i32 3)
243 %c = call <9 x double> @llvm.matrix.multiply.v9f64.v9f64.v9f64(<9 x double> %a.trans, <9 x double> %b, i32 3, i32 3, i32 3)
244 store <9 x double> %c, <9 x double>* %C.Ptr, align 8
248 declare <9 x double> @llvm.matrix.transpose(<9 x double>, i32, i32)
249 declare <9 x double> @llvm.matrix.multiply.v9f64.v9f64.v9f64(<9 x double>, <9 x double>, i32, i32, i32)
251 define void @transpose_multiply_add(<9 x double>* %A.Ptr, <9 x double>* %B.Ptr, <9 x double>* %C.Ptr) {
252 ; CHECK-LABEL: @transpose_multiply_add(
254 ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <9 x double>* [[A_PTR:%.*]] to double*
255 ; CHECK-NEXT: [[VEC_CAST:%.*]] = bitcast double* [[TMP0]] to <3 x double>*
256 ; CHECK-NEXT: [[COL_LOAD:%.*]] = load <3 x double>, <3 x double>* [[VEC_CAST]], align 8
257 ; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr double, double* [[TMP0]], i64 3
258 ; CHECK-NEXT: [[VEC_CAST1:%.*]] = bitcast double* [[VEC_GEP]] to <3 x double>*
259 ; CHECK-NEXT: [[COL_LOAD2:%.*]] = load <3 x double>, <3 x double>* [[VEC_CAST1]], align 8
260 ; CHECK-NEXT: [[VEC_GEP3:%.*]] = getelementptr double, double* [[TMP0]], i64 6
261 ; CHECK-NEXT: [[VEC_CAST4:%.*]] = bitcast double* [[VEC_GEP3]] to <3 x double>*
262 ; CHECK-NEXT: [[COL_LOAD5:%.*]] = load <3 x double>, <3 x double>* [[VEC_CAST4]], align 8
263 ; CHECK-NEXT: [[TMP1:%.*]] = bitcast <9 x double>* [[B_PTR:%.*]] to double*
264 ; CHECK-NEXT: [[VEC_CAST6:%.*]] = bitcast double* [[TMP1]] to <3 x double>*
265 ; CHECK-NEXT: [[COL_LOAD7:%.*]] = load <3 x double>, <3 x double>* [[VEC_CAST6]], align 8
266 ; CHECK-NEXT: [[VEC_GEP8:%.*]] = getelementptr double, double* [[TMP1]], i64 3
267 ; CHECK-NEXT: [[VEC_CAST9:%.*]] = bitcast double* [[VEC_GEP8]] to <3 x double>*
268 ; CHECK-NEXT: [[COL_LOAD10:%.*]] = load <3 x double>, <3 x double>* [[VEC_CAST9]], align 8
269 ; CHECK-NEXT: [[VEC_GEP11:%.*]] = getelementptr double, double* [[TMP1]], i64 6
270 ; CHECK-NEXT: [[VEC_CAST12:%.*]] = bitcast double* [[VEC_GEP11]] to <3 x double>*
271 ; CHECK-NEXT: [[COL_LOAD13:%.*]] = load <3 x double>, <3 x double>* [[VEC_CAST12]], align 8
272 ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <3 x double> [[COL_LOAD]], i64 0
273 ; CHECK-NEXT: [[TMP3:%.*]] = insertelement <3 x double> undef, double [[TMP2]], i64 0
274 ; CHECK-NEXT: [[TMP4:%.*]] = extractelement <3 x double> [[COL_LOAD2]], i64 0
275 ; CHECK-NEXT: [[TMP5:%.*]] = insertelement <3 x double> [[TMP3]], double [[TMP4]], i64 1
276 ; CHECK-NEXT: [[TMP6:%.*]] = extractelement <3 x double> [[COL_LOAD5]], i64 0
277 ; CHECK-NEXT: [[TMP7:%.*]] = insertelement <3 x double> [[TMP5]], double [[TMP6]], i64 2
278 ; CHECK-NEXT: [[TMP8:%.*]] = extractelement <3 x double> [[COL_LOAD]], i64 1
279 ; CHECK-NEXT: [[TMP9:%.*]] = insertelement <3 x double> undef, double [[TMP8]], i64 0
280 ; CHECK-NEXT: [[TMP10:%.*]] = extractelement <3 x double> [[COL_LOAD2]], i64 1
281 ; CHECK-NEXT: [[TMP11:%.*]] = insertelement <3 x double> [[TMP9]], double [[TMP10]], i64 1
282 ; CHECK-NEXT: [[TMP12:%.*]] = extractelement <3 x double> [[COL_LOAD5]], i64 1
283 ; CHECK-NEXT: [[TMP13:%.*]] = insertelement <3 x double> [[TMP11]], double [[TMP12]], i64 2
284 ; CHECK-NEXT: [[TMP14:%.*]] = extractelement <3 x double> [[COL_LOAD]], i64 2
285 ; CHECK-NEXT: [[TMP15:%.*]] = insertelement <3 x double> undef, double [[TMP14]], i64 0
286 ; CHECK-NEXT: [[TMP16:%.*]] = extractelement <3 x double> [[COL_LOAD2]], i64 2
287 ; CHECK-NEXT: [[TMP17:%.*]] = insertelement <3 x double> [[TMP15]], double [[TMP16]], i64 1
288 ; CHECK-NEXT: [[TMP18:%.*]] = extractelement <3 x double> [[COL_LOAD5]], i64 2
289 ; CHECK-NEXT: [[TMP19:%.*]] = insertelement <3 x double> [[TMP17]], double [[TMP18]], i64 2
290 ; CHECK-NEXT: [[BLOCK:%.*]] = shufflevector <3 x double> [[TMP7]], <3 x double> poison, <1 x i32> zeroinitializer
291 ; CHECK-NEXT: [[TMP20:%.*]] = extractelement <3 x double> [[COL_LOAD7]], i64 0
292 ; CHECK-NEXT: [[SPLAT_SPLATINSERT:%.*]] = insertelement <1 x double> poison, double [[TMP20]], i32 0
293 ; CHECK-NEXT: [[SPLAT_SPLAT:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT]], <1 x double> poison, <1 x i32> zeroinitializer
294 ; CHECK-NEXT: [[TMP21:%.*]] = fmul <1 x double> [[BLOCK]], [[SPLAT_SPLAT]]
295 ; CHECK-NEXT: [[BLOCK14:%.*]] = shufflevector <3 x double> [[TMP13]], <3 x double> poison, <1 x i32> zeroinitializer
296 ; CHECK-NEXT: [[TMP22:%.*]] = extractelement <3 x double> [[COL_LOAD7]], i64 1
297 ; CHECK-NEXT: [[SPLAT_SPLATINSERT15:%.*]] = insertelement <1 x double> poison, double [[TMP22]], i32 0
298 ; CHECK-NEXT: [[SPLAT_SPLAT16:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT15]], <1 x double> poison, <1 x i32> zeroinitializer
299 ; CHECK-NEXT: [[TMP23:%.*]] = fmul <1 x double> [[BLOCK14]], [[SPLAT_SPLAT16]]
300 ; CHECK-NEXT: [[TMP24:%.*]] = fadd <1 x double> [[TMP21]], [[TMP23]]
301 ; CHECK-NEXT: [[BLOCK17:%.*]] = shufflevector <3 x double> [[TMP19]], <3 x double> poison, <1 x i32> zeroinitializer
302 ; CHECK-NEXT: [[TMP25:%.*]] = extractelement <3 x double> [[COL_LOAD7]], i64 2
303 ; CHECK-NEXT: [[SPLAT_SPLATINSERT18:%.*]] = insertelement <1 x double> poison, double [[TMP25]], i32 0
304 ; CHECK-NEXT: [[SPLAT_SPLAT19:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT18]], <1 x double> poison, <1 x i32> zeroinitializer
305 ; CHECK-NEXT: [[TMP26:%.*]] = fmul <1 x double> [[BLOCK17]], [[SPLAT_SPLAT19]]
306 ; CHECK-NEXT: [[TMP27:%.*]] = fadd <1 x double> [[TMP24]], [[TMP26]]
307 ; CHECK-NEXT: [[TMP28:%.*]] = shufflevector <1 x double> [[TMP27]], <1 x double> poison, <3 x i32> <i32 0, i32 undef, i32 undef>
308 ; CHECK-NEXT: [[TMP29:%.*]] = shufflevector <3 x double> undef, <3 x double> [[TMP28]], <3 x i32> <i32 3, i32 1, i32 2>
309 ; CHECK-NEXT: [[BLOCK20:%.*]] = shufflevector <3 x double> [[TMP7]], <3 x double> poison, <1 x i32> <i32 1>
310 ; CHECK-NEXT: [[TMP30:%.*]] = extractelement <3 x double> [[COL_LOAD7]], i64 0
311 ; CHECK-NEXT: [[SPLAT_SPLATINSERT21:%.*]] = insertelement <1 x double> poison, double [[TMP30]], i32 0
312 ; CHECK-NEXT: [[SPLAT_SPLAT22:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT21]], <1 x double> poison, <1 x i32> zeroinitializer
313 ; CHECK-NEXT: [[TMP31:%.*]] = fmul <1 x double> [[BLOCK20]], [[SPLAT_SPLAT22]]
314 ; CHECK-NEXT: [[BLOCK23:%.*]] = shufflevector <3 x double> [[TMP13]], <3 x double> poison, <1 x i32> <i32 1>
315 ; CHECK-NEXT: [[TMP32:%.*]] = extractelement <3 x double> [[COL_LOAD7]], i64 1
316 ; CHECK-NEXT: [[SPLAT_SPLATINSERT24:%.*]] = insertelement <1 x double> poison, double [[TMP32]], i32 0
317 ; CHECK-NEXT: [[SPLAT_SPLAT25:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT24]], <1 x double> poison, <1 x i32> zeroinitializer
318 ; CHECK-NEXT: [[TMP33:%.*]] = fmul <1 x double> [[BLOCK23]], [[SPLAT_SPLAT25]]
319 ; CHECK-NEXT: [[TMP34:%.*]] = fadd <1 x double> [[TMP31]], [[TMP33]]
320 ; CHECK-NEXT: [[BLOCK26:%.*]] = shufflevector <3 x double> [[TMP19]], <3 x double> poison, <1 x i32> <i32 1>
321 ; CHECK-NEXT: [[TMP35:%.*]] = extractelement <3 x double> [[COL_LOAD7]], i64 2
322 ; CHECK-NEXT: [[SPLAT_SPLATINSERT27:%.*]] = insertelement <1 x double> poison, double [[TMP35]], i32 0
323 ; CHECK-NEXT: [[SPLAT_SPLAT28:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT27]], <1 x double> poison, <1 x i32> zeroinitializer
324 ; CHECK-NEXT: [[TMP36:%.*]] = fmul <1 x double> [[BLOCK26]], [[SPLAT_SPLAT28]]
325 ; CHECK-NEXT: [[TMP37:%.*]] = fadd <1 x double> [[TMP34]], [[TMP36]]
326 ; CHECK-NEXT: [[TMP38:%.*]] = shufflevector <1 x double> [[TMP37]], <1 x double> poison, <3 x i32> <i32 0, i32 undef, i32 undef>
327 ; CHECK-NEXT: [[TMP39:%.*]] = shufflevector <3 x double> [[TMP29]], <3 x double> [[TMP38]], <3 x i32> <i32 0, i32 3, i32 2>
328 ; CHECK-NEXT: [[BLOCK29:%.*]] = shufflevector <3 x double> [[TMP7]], <3 x double> poison, <1 x i32> <i32 2>
329 ; CHECK-NEXT: [[TMP40:%.*]] = extractelement <3 x double> [[COL_LOAD7]], i64 0
330 ; CHECK-NEXT: [[SPLAT_SPLATINSERT30:%.*]] = insertelement <1 x double> poison, double [[TMP40]], i32 0
331 ; CHECK-NEXT: [[SPLAT_SPLAT31:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT30]], <1 x double> poison, <1 x i32> zeroinitializer
332 ; CHECK-NEXT: [[TMP41:%.*]] = fmul <1 x double> [[BLOCK29]], [[SPLAT_SPLAT31]]
333 ; CHECK-NEXT: [[BLOCK32:%.*]] = shufflevector <3 x double> [[TMP13]], <3 x double> poison, <1 x i32> <i32 2>
334 ; CHECK-NEXT: [[TMP42:%.*]] = extractelement <3 x double> [[COL_LOAD7]], i64 1
335 ; CHECK-NEXT: [[SPLAT_SPLATINSERT33:%.*]] = insertelement <1 x double> poison, double [[TMP42]], i32 0
336 ; CHECK-NEXT: [[SPLAT_SPLAT34:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT33]], <1 x double> poison, <1 x i32> zeroinitializer
337 ; CHECK-NEXT: [[TMP43:%.*]] = fmul <1 x double> [[BLOCK32]], [[SPLAT_SPLAT34]]
338 ; CHECK-NEXT: [[TMP44:%.*]] = fadd <1 x double> [[TMP41]], [[TMP43]]
339 ; CHECK-NEXT: [[BLOCK35:%.*]] = shufflevector <3 x double> [[TMP19]], <3 x double> poison, <1 x i32> <i32 2>
340 ; CHECK-NEXT: [[TMP45:%.*]] = extractelement <3 x double> [[COL_LOAD7]], i64 2
341 ; CHECK-NEXT: [[SPLAT_SPLATINSERT36:%.*]] = insertelement <1 x double> poison, double [[TMP45]], i32 0
342 ; CHECK-NEXT: [[SPLAT_SPLAT37:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT36]], <1 x double> poison, <1 x i32> zeroinitializer
343 ; CHECK-NEXT: [[TMP46:%.*]] = fmul <1 x double> [[BLOCK35]], [[SPLAT_SPLAT37]]
344 ; CHECK-NEXT: [[TMP47:%.*]] = fadd <1 x double> [[TMP44]], [[TMP46]]
345 ; CHECK-NEXT: [[TMP48:%.*]] = shufflevector <1 x double> [[TMP47]], <1 x double> poison, <3 x i32> <i32 0, i32 undef, i32 undef>
346 ; CHECK-NEXT: [[TMP49:%.*]] = shufflevector <3 x double> [[TMP39]], <3 x double> [[TMP48]], <3 x i32> <i32 0, i32 1, i32 3>
347 ; CHECK-NEXT: [[BLOCK38:%.*]] = shufflevector <3 x double> [[TMP7]], <3 x double> poison, <1 x i32> zeroinitializer
348 ; CHECK-NEXT: [[TMP50:%.*]] = extractelement <3 x double> [[COL_LOAD10]], i64 0
349 ; CHECK-NEXT: [[SPLAT_SPLATINSERT39:%.*]] = insertelement <1 x double> poison, double [[TMP50]], i32 0
350 ; CHECK-NEXT: [[SPLAT_SPLAT40:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT39]], <1 x double> poison, <1 x i32> zeroinitializer
351 ; CHECK-NEXT: [[TMP51:%.*]] = fmul <1 x double> [[BLOCK38]], [[SPLAT_SPLAT40]]
352 ; CHECK-NEXT: [[BLOCK41:%.*]] = shufflevector <3 x double> [[TMP13]], <3 x double> poison, <1 x i32> zeroinitializer
353 ; CHECK-NEXT: [[TMP52:%.*]] = extractelement <3 x double> [[COL_LOAD10]], i64 1
354 ; CHECK-NEXT: [[SPLAT_SPLATINSERT42:%.*]] = insertelement <1 x double> poison, double [[TMP52]], i32 0
355 ; CHECK-NEXT: [[SPLAT_SPLAT43:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT42]], <1 x double> poison, <1 x i32> zeroinitializer
356 ; CHECK-NEXT: [[TMP53:%.*]] = fmul <1 x double> [[BLOCK41]], [[SPLAT_SPLAT43]]
357 ; CHECK-NEXT: [[TMP54:%.*]] = fadd <1 x double> [[TMP51]], [[TMP53]]
358 ; CHECK-NEXT: [[BLOCK44:%.*]] = shufflevector <3 x double> [[TMP19]], <3 x double> poison, <1 x i32> zeroinitializer
359 ; CHECK-NEXT: [[TMP55:%.*]] = extractelement <3 x double> [[COL_LOAD10]], i64 2
360 ; CHECK-NEXT: [[SPLAT_SPLATINSERT45:%.*]] = insertelement <1 x double> poison, double [[TMP55]], i32 0
361 ; CHECK-NEXT: [[SPLAT_SPLAT46:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT45]], <1 x double> poison, <1 x i32> zeroinitializer
362 ; CHECK-NEXT: [[TMP56:%.*]] = fmul <1 x double> [[BLOCK44]], [[SPLAT_SPLAT46]]
363 ; CHECK-NEXT: [[TMP57:%.*]] = fadd <1 x double> [[TMP54]], [[TMP56]]
364 ; CHECK-NEXT: [[TMP58:%.*]] = shufflevector <1 x double> [[TMP57]], <1 x double> poison, <3 x i32> <i32 0, i32 undef, i32 undef>
365 ; CHECK-NEXT: [[TMP59:%.*]] = shufflevector <3 x double> undef, <3 x double> [[TMP58]], <3 x i32> <i32 3, i32 1, i32 2>
366 ; CHECK-NEXT: [[BLOCK47:%.*]] = shufflevector <3 x double> [[TMP7]], <3 x double> poison, <1 x i32> <i32 1>
367 ; CHECK-NEXT: [[TMP60:%.*]] = extractelement <3 x double> [[COL_LOAD10]], i64 0
368 ; CHECK-NEXT: [[SPLAT_SPLATINSERT48:%.*]] = insertelement <1 x double> poison, double [[TMP60]], i32 0
369 ; CHECK-NEXT: [[SPLAT_SPLAT49:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT48]], <1 x double> poison, <1 x i32> zeroinitializer
370 ; CHECK-NEXT: [[TMP61:%.*]] = fmul <1 x double> [[BLOCK47]], [[SPLAT_SPLAT49]]
371 ; CHECK-NEXT: [[BLOCK50:%.*]] = shufflevector <3 x double> [[TMP13]], <3 x double> poison, <1 x i32> <i32 1>
372 ; CHECK-NEXT: [[TMP62:%.*]] = extractelement <3 x double> [[COL_LOAD10]], i64 1
373 ; CHECK-NEXT: [[SPLAT_SPLATINSERT51:%.*]] = insertelement <1 x double> poison, double [[TMP62]], i32 0
374 ; CHECK-NEXT: [[SPLAT_SPLAT52:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT51]], <1 x double> poison, <1 x i32> zeroinitializer
375 ; CHECK-NEXT: [[TMP63:%.*]] = fmul <1 x double> [[BLOCK50]], [[SPLAT_SPLAT52]]
376 ; CHECK-NEXT: [[TMP64:%.*]] = fadd <1 x double> [[TMP61]], [[TMP63]]
377 ; CHECK-NEXT: [[BLOCK53:%.*]] = shufflevector <3 x double> [[TMP19]], <3 x double> poison, <1 x i32> <i32 1>
378 ; CHECK-NEXT: [[TMP65:%.*]] = extractelement <3 x double> [[COL_LOAD10]], i64 2
379 ; CHECK-NEXT: [[SPLAT_SPLATINSERT54:%.*]] = insertelement <1 x double> poison, double [[TMP65]], i32 0
380 ; CHECK-NEXT: [[SPLAT_SPLAT55:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT54]], <1 x double> poison, <1 x i32> zeroinitializer
381 ; CHECK-NEXT: [[TMP66:%.*]] = fmul <1 x double> [[BLOCK53]], [[SPLAT_SPLAT55]]
382 ; CHECK-NEXT: [[TMP67:%.*]] = fadd <1 x double> [[TMP64]], [[TMP66]]
383 ; CHECK-NEXT: [[TMP68:%.*]] = shufflevector <1 x double> [[TMP67]], <1 x double> poison, <3 x i32> <i32 0, i32 undef, i32 undef>
384 ; CHECK-NEXT: [[TMP69:%.*]] = shufflevector <3 x double> [[TMP59]], <3 x double> [[TMP68]], <3 x i32> <i32 0, i32 3, i32 2>
385 ; CHECK-NEXT: [[BLOCK56:%.*]] = shufflevector <3 x double> [[TMP7]], <3 x double> poison, <1 x i32> <i32 2>
386 ; CHECK-NEXT: [[TMP70:%.*]] = extractelement <3 x double> [[COL_LOAD10]], i64 0
387 ; CHECK-NEXT: [[SPLAT_SPLATINSERT57:%.*]] = insertelement <1 x double> poison, double [[TMP70]], i32 0
388 ; CHECK-NEXT: [[SPLAT_SPLAT58:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT57]], <1 x double> poison, <1 x i32> zeroinitializer
389 ; CHECK-NEXT: [[TMP71:%.*]] = fmul <1 x double> [[BLOCK56]], [[SPLAT_SPLAT58]]
390 ; CHECK-NEXT: [[BLOCK59:%.*]] = shufflevector <3 x double> [[TMP13]], <3 x double> poison, <1 x i32> <i32 2>
391 ; CHECK-NEXT: [[TMP72:%.*]] = extractelement <3 x double> [[COL_LOAD10]], i64 1
392 ; CHECK-NEXT: [[SPLAT_SPLATINSERT60:%.*]] = insertelement <1 x double> poison, double [[TMP72]], i32 0
393 ; CHECK-NEXT: [[SPLAT_SPLAT61:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT60]], <1 x double> poison, <1 x i32> zeroinitializer
394 ; CHECK-NEXT: [[TMP73:%.*]] = fmul <1 x double> [[BLOCK59]], [[SPLAT_SPLAT61]]
395 ; CHECK-NEXT: [[TMP74:%.*]] = fadd <1 x double> [[TMP71]], [[TMP73]]
396 ; CHECK-NEXT: [[BLOCK62:%.*]] = shufflevector <3 x double> [[TMP19]], <3 x double> poison, <1 x i32> <i32 2>
397 ; CHECK-NEXT: [[TMP75:%.*]] = extractelement <3 x double> [[COL_LOAD10]], i64 2
398 ; CHECK-NEXT: [[SPLAT_SPLATINSERT63:%.*]] = insertelement <1 x double> poison, double [[TMP75]], i32 0
399 ; CHECK-NEXT: [[SPLAT_SPLAT64:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT63]], <1 x double> poison, <1 x i32> zeroinitializer
400 ; CHECK-NEXT: [[TMP76:%.*]] = fmul <1 x double> [[BLOCK62]], [[SPLAT_SPLAT64]]
401 ; CHECK-NEXT: [[TMP77:%.*]] = fadd <1 x double> [[TMP74]], [[TMP76]]
402 ; CHECK-NEXT: [[TMP78:%.*]] = shufflevector <1 x double> [[TMP77]], <1 x double> poison, <3 x i32> <i32 0, i32 undef, i32 undef>
403 ; CHECK-NEXT: [[TMP79:%.*]] = shufflevector <3 x double> [[TMP69]], <3 x double> [[TMP78]], <3 x i32> <i32 0, i32 1, i32 3>
404 ; CHECK-NEXT: [[BLOCK65:%.*]] = shufflevector <3 x double> [[TMP7]], <3 x double> poison, <1 x i32> zeroinitializer
405 ; CHECK-NEXT: [[TMP80:%.*]] = extractelement <3 x double> [[COL_LOAD13]], i64 0
406 ; CHECK-NEXT: [[SPLAT_SPLATINSERT66:%.*]] = insertelement <1 x double> poison, double [[TMP80]], i32 0
407 ; CHECK-NEXT: [[SPLAT_SPLAT67:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT66]], <1 x double> poison, <1 x i32> zeroinitializer
408 ; CHECK-NEXT: [[TMP81:%.*]] = fmul <1 x double> [[BLOCK65]], [[SPLAT_SPLAT67]]
409 ; CHECK-NEXT: [[BLOCK68:%.*]] = shufflevector <3 x double> [[TMP13]], <3 x double> poison, <1 x i32> zeroinitializer
410 ; CHECK-NEXT: [[TMP82:%.*]] = extractelement <3 x double> [[COL_LOAD13]], i64 1
411 ; CHECK-NEXT: [[SPLAT_SPLATINSERT69:%.*]] = insertelement <1 x double> poison, double [[TMP82]], i32 0
412 ; CHECK-NEXT: [[SPLAT_SPLAT70:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT69]], <1 x double> poison, <1 x i32> zeroinitializer
413 ; CHECK-NEXT: [[TMP83:%.*]] = fmul <1 x double> [[BLOCK68]], [[SPLAT_SPLAT70]]
414 ; CHECK-NEXT: [[TMP84:%.*]] = fadd <1 x double> [[TMP81]], [[TMP83]]
415 ; CHECK-NEXT: [[BLOCK71:%.*]] = shufflevector <3 x double> [[TMP19]], <3 x double> poison, <1 x i32> zeroinitializer
416 ; CHECK-NEXT: [[TMP85:%.*]] = extractelement <3 x double> [[COL_LOAD13]], i64 2
417 ; CHECK-NEXT: [[SPLAT_SPLATINSERT72:%.*]] = insertelement <1 x double> poison, double [[TMP85]], i32 0
418 ; CHECK-NEXT: [[SPLAT_SPLAT73:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT72]], <1 x double> poison, <1 x i32> zeroinitializer
419 ; CHECK-NEXT: [[TMP86:%.*]] = fmul <1 x double> [[BLOCK71]], [[SPLAT_SPLAT73]]
420 ; CHECK-NEXT: [[TMP87:%.*]] = fadd <1 x double> [[TMP84]], [[TMP86]]
421 ; CHECK-NEXT: [[TMP88:%.*]] = shufflevector <1 x double> [[TMP87]], <1 x double> poison, <3 x i32> <i32 0, i32 undef, i32 undef>
422 ; CHECK-NEXT: [[TMP89:%.*]] = shufflevector <3 x double> undef, <3 x double> [[TMP88]], <3 x i32> <i32 3, i32 1, i32 2>
423 ; CHECK-NEXT: [[BLOCK74:%.*]] = shufflevector <3 x double> [[TMP7]], <3 x double> poison, <1 x i32> <i32 1>
424 ; CHECK-NEXT: [[TMP90:%.*]] = extractelement <3 x double> [[COL_LOAD13]], i64 0
425 ; CHECK-NEXT: [[SPLAT_SPLATINSERT75:%.*]] = insertelement <1 x double> poison, double [[TMP90]], i32 0
426 ; CHECK-NEXT: [[SPLAT_SPLAT76:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT75]], <1 x double> poison, <1 x i32> zeroinitializer
427 ; CHECK-NEXT: [[TMP91:%.*]] = fmul <1 x double> [[BLOCK74]], [[SPLAT_SPLAT76]]
428 ; CHECK-NEXT: [[BLOCK77:%.*]] = shufflevector <3 x double> [[TMP13]], <3 x double> poison, <1 x i32> <i32 1>
429 ; CHECK-NEXT: [[TMP92:%.*]] = extractelement <3 x double> [[COL_LOAD13]], i64 1
430 ; CHECK-NEXT: [[SPLAT_SPLATINSERT78:%.*]] = insertelement <1 x double> poison, double [[TMP92]], i32 0
431 ; CHECK-NEXT: [[SPLAT_SPLAT79:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT78]], <1 x double> poison, <1 x i32> zeroinitializer
432 ; CHECK-NEXT: [[TMP93:%.*]] = fmul <1 x double> [[BLOCK77]], [[SPLAT_SPLAT79]]
433 ; CHECK-NEXT: [[TMP94:%.*]] = fadd <1 x double> [[TMP91]], [[TMP93]]
434 ; CHECK-NEXT: [[BLOCK80:%.*]] = shufflevector <3 x double> [[TMP19]], <3 x double> poison, <1 x i32> <i32 1>
435 ; CHECK-NEXT: [[TMP95:%.*]] = extractelement <3 x double> [[COL_LOAD13]], i64 2
436 ; CHECK-NEXT: [[SPLAT_SPLATINSERT81:%.*]] = insertelement <1 x double> poison, double [[TMP95]], i32 0
437 ; CHECK-NEXT: [[SPLAT_SPLAT82:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT81]], <1 x double> poison, <1 x i32> zeroinitializer
438 ; CHECK-NEXT: [[TMP96:%.*]] = fmul <1 x double> [[BLOCK80]], [[SPLAT_SPLAT82]]
439 ; CHECK-NEXT: [[TMP97:%.*]] = fadd <1 x double> [[TMP94]], [[TMP96]]
440 ; CHECK-NEXT: [[TMP98:%.*]] = shufflevector <1 x double> [[TMP97]], <1 x double> poison, <3 x i32> <i32 0, i32 undef, i32 undef>
441 ; CHECK-NEXT: [[TMP99:%.*]] = shufflevector <3 x double> [[TMP89]], <3 x double> [[TMP98]], <3 x i32> <i32 0, i32 3, i32 2>
442 ; CHECK-NEXT: [[BLOCK83:%.*]] = shufflevector <3 x double> [[TMP7]], <3 x double> poison, <1 x i32> <i32 2>
443 ; CHECK-NEXT: [[TMP100:%.*]] = extractelement <3 x double> [[COL_LOAD13]], i64 0
444 ; CHECK-NEXT: [[SPLAT_SPLATINSERT84:%.*]] = insertelement <1 x double> poison, double [[TMP100]], i32 0
445 ; CHECK-NEXT: [[SPLAT_SPLAT85:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT84]], <1 x double> poison, <1 x i32> zeroinitializer
446 ; CHECK-NEXT: [[TMP101:%.*]] = fmul <1 x double> [[BLOCK83]], [[SPLAT_SPLAT85]]
447 ; CHECK-NEXT: [[BLOCK86:%.*]] = shufflevector <3 x double> [[TMP13]], <3 x double> poison, <1 x i32> <i32 2>
448 ; CHECK-NEXT: [[TMP102:%.*]] = extractelement <3 x double> [[COL_LOAD13]], i64 1
449 ; CHECK-NEXT: [[SPLAT_SPLATINSERT87:%.*]] = insertelement <1 x double> poison, double [[TMP102]], i32 0
450 ; CHECK-NEXT: [[SPLAT_SPLAT88:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT87]], <1 x double> poison, <1 x i32> zeroinitializer
451 ; CHECK-NEXT: [[TMP103:%.*]] = fmul <1 x double> [[BLOCK86]], [[SPLAT_SPLAT88]]
452 ; CHECK-NEXT: [[TMP104:%.*]] = fadd <1 x double> [[TMP101]], [[TMP103]]
453 ; CHECK-NEXT: [[BLOCK89:%.*]] = shufflevector <3 x double> [[TMP19]], <3 x double> poison, <1 x i32> <i32 2>
454 ; CHECK-NEXT: [[TMP105:%.*]] = extractelement <3 x double> [[COL_LOAD13]], i64 2
455 ; CHECK-NEXT: [[SPLAT_SPLATINSERT90:%.*]] = insertelement <1 x double> poison, double [[TMP105]], i32 0
456 ; CHECK-NEXT: [[SPLAT_SPLAT91:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT90]], <1 x double> poison, <1 x i32> zeroinitializer
457 ; CHECK-NEXT: [[TMP106:%.*]] = fmul <1 x double> [[BLOCK89]], [[SPLAT_SPLAT91]]
458 ; CHECK-NEXT: [[TMP107:%.*]] = fadd <1 x double> [[TMP104]], [[TMP106]]
459 ; CHECK-NEXT: [[TMP108:%.*]] = shufflevector <1 x double> [[TMP107]], <1 x double> poison, <3 x i32> <i32 0, i32 undef, i32 undef>
460 ; CHECK-NEXT: [[TMP109:%.*]] = shufflevector <3 x double> [[TMP99]], <3 x double> [[TMP108]], <3 x i32> <i32 0, i32 1, i32 3>
461 ; CHECK-NEXT: [[TMP110:%.*]] = bitcast <9 x double>* [[C_PTR:%.*]] to double*
462 ; CHECK-NEXT: [[VEC_CAST92:%.*]] = bitcast double* [[TMP110]] to <3 x double>*
463 ; CHECK-NEXT: [[COL_LOAD93:%.*]] = load <3 x double>, <3 x double>* [[VEC_CAST92]], align 8
464 ; CHECK-NEXT: [[VEC_GEP94:%.*]] = getelementptr double, double* [[TMP110]], i64 3
465 ; CHECK-NEXT: [[VEC_CAST95:%.*]] = bitcast double* [[VEC_GEP94]] to <3 x double>*
466 ; CHECK-NEXT: [[COL_LOAD96:%.*]] = load <3 x double>, <3 x double>* [[VEC_CAST95]], align 8
467 ; CHECK-NEXT: [[VEC_GEP97:%.*]] = getelementptr double, double* [[TMP110]], i64 6
468 ; CHECK-NEXT: [[VEC_CAST98:%.*]] = bitcast double* [[VEC_GEP97]] to <3 x double>*
469 ; CHECK-NEXT: [[COL_LOAD99:%.*]] = load <3 x double>, <3 x double>* [[VEC_CAST98]], align 8
470 ; CHECK-NEXT: [[TMP111:%.*]] = fadd <3 x double> [[COL_LOAD93]], [[TMP49]]
471 ; CHECK-NEXT: [[TMP112:%.*]] = fadd <3 x double> [[COL_LOAD96]], [[TMP79]]
472 ; CHECK-NEXT: [[TMP113:%.*]] = fadd <3 x double> [[COL_LOAD99]], [[TMP109]]
473 ; CHECK-NEXT: [[TMP114:%.*]] = bitcast <9 x double>* [[C_PTR]] to double*
474 ; CHECK-NEXT: [[VEC_CAST100:%.*]] = bitcast double* [[TMP114]] to <3 x double>*
475 ; CHECK-NEXT: store <3 x double> [[TMP111]], <3 x double>* [[VEC_CAST100]], align 8
476 ; CHECK-NEXT: [[VEC_GEP101:%.*]] = getelementptr double, double* [[TMP114]], i64 3
477 ; CHECK-NEXT: [[VEC_CAST102:%.*]] = bitcast double* [[VEC_GEP101]] to <3 x double>*
478 ; CHECK-NEXT: store <3 x double> [[TMP112]], <3 x double>* [[VEC_CAST102]], align 8
479 ; CHECK-NEXT: [[VEC_GEP103:%.*]] = getelementptr double, double* [[TMP114]], i64 6
480 ; CHECK-NEXT: [[VEC_CAST104:%.*]] = bitcast double* [[VEC_GEP103]] to <3 x double>*
481 ; CHECK-NEXT: store <3 x double> [[TMP113]], <3 x double>* [[VEC_CAST104]], align 8
482 ; CHECK-NEXT: ret void
489 ; Lower multiply(transpose(%A), %B)
492 ; Embed result of multiply into flat vector.
498 ; Add column vectors.
501 ; Store result columns.
505 %a = load <9 x double>, <9 x double>* %A.Ptr, align 8
506 %b = load <9 x double>, <9 x double>* %B.Ptr, align 8
507 %a.trans = call <9 x double> @llvm.matrix.transpose(<9 x double> %a, i32 3, i32 3)
508 %mult = call <9 x double> @llvm.matrix.multiply.v9f64.v9f64.v9f64(<9 x double> %a.trans, <9 x double> %b, i32 3, i32 3, i32 3)
509 %c = load <9 x double>, <9 x double>* %C.Ptr, align 8
510 %res = fadd <9 x double> %c, %mult
512 store <9 x double> %res, <9 x double>* %C.Ptr, align 8