Run DCE after a LoopFlatten test to reduce spurious output [nfc]
[llvm-project.git] / clang / test / CodeGen / matrix-type-builtins.c
blob0320eb34ce41ca126a90209d4167ae0d93bafcba
1 // RUN: %clang_cc1 -no-enable-noundef-analysis -fenable-matrix -triple x86_64-apple-darwin %s -emit-llvm -disable-llvm-passes -o - | FileCheck --check-prefixes=COMMON,CHECK64 %s
2 // RUN: %clang_cc1 -no-enable-noundef-analysis -fenable-matrix -triple i386-apple-darwin %s -emit-llvm -disable-llvm-passes -o - | FileCheck --check-prefixes=COMMON,CHECK32 %s
4 // Also check we do not crash when running some middle-end passes. Most
5 // importantly this includes the IR verifier, to ensure we emit valid IR.
6 // RUN: %clang_cc1 -fenable-matrix -emit-llvm -triple x86_64-apple-darwin %s -o %t
8 // Tests for the matrix type builtins.
// Matrix typedefs shared by the tests below. Each name spells out the element
// type and the two matrix_type arguments, e.g. fx2x3_t is float with
// matrix_type(2, 3). The types come in transposed pairs (2x3/3x2, 20x4/4x20,
// 1x6/6x1) plus the square 5x5 double.
10 typedef double dx5x5_t __attribute__((matrix_type(5, 5)));
11 typedef float fx2x3_t __attribute__((matrix_type(2, 3)));
12 typedef float fx3x2_t __attribute__((matrix_type(3, 2)));
13 typedef int ix20x4_t __attribute__((matrix_type(20, 4)));
14 typedef int ix4x20_t __attribute__((matrix_type(4, 20)));
15 typedef unsigned ux1x6_t __attribute__((matrix_type(1, 6)));
16 typedef unsigned ux6x1_t __attribute__((matrix_type(6, 1)));
// Transposes the 5x5 double matrix *a into the local a_t. Expected IR: a
// <25 x double> load, a call to llvm.matrix.transpose.v25f64 with i32 5, i32 5,
// and a store to %a_t. Load/store alignment is 4 on the i386 run and 8 on the
// x86_64 run.
18 void transpose_double_5x5(dx5x5_t *a) {
19 // COMMON-LABEL: define{{.*}} void @transpose_double_5x5(
20 // CHECK32: [[A:%.*]] = load <25 x double>, ptr {{.*}}, align 4
21 // CHECK64: [[A:%.*]] = load <25 x double>, ptr {{.*}}, align 8
22 // COMMON-NEXT: [[TRANS:%.*]] = call <25 x double> @llvm.matrix.transpose.v25f64(<25 x double> [[A]], i32 5, i32 5)
23 // CHECK32-NEXT: store <25 x double> [[TRANS]], ptr %a_t, align 4
24 // CHECK64-NEXT: store <25 x double> [[TRANS]], ptr %a_t, align 8
26 dx5x5_t a_t = __builtin_matrix_transpose(*a);
// Transposes the fx3x2_t matrix *a into an fx2x3_t local. Both runs expect the
// same IR: an align-4 <6 x float> load, llvm.matrix.transpose.v6f32 with
// i32 3, i32 2, and an align-4 store to %a_t.
29 void transpose_float_3x2(fx3x2_t *a) {
30 // COMMON-LABEL: define{{.*}} void @transpose_float_3x2(
31 // COMMON: [[A:%.*]] = load <6 x float>, ptr {{.*}}, align 4
32 // COMMON-NEXT: [[TRANS:%.*]] = call <6 x float> @llvm.matrix.transpose.v6f32(<6 x float> [[A]], i32 3, i32 2)
33 // COMMON-NEXT: store <6 x float> [[TRANS]], ptr %a_t, align 4
35 fx2x3_t a_t = __builtin_matrix_transpose(*a);
// Transposes the ix20x4_t matrix *a into an ix4x20_t local. Both runs expect
// the same IR: an align-4 <80 x i32> load, llvm.matrix.transpose.v80i32 with
// i32 20, i32 4, and an align-4 store to %a_t.
38 void transpose_int_20x4(ix20x4_t *a) {
39 // COMMON-LABEL: define{{.*}} void @transpose_int_20x4(
40 // COMMON: [[A:%.*]] = load <80 x i32>, ptr {{.*}}, align 4
41 // COMMON-NEXT: [[TRANS:%.*]] = call <80 x i32> @llvm.matrix.transpose.v80i32(<80 x i32> [[A]], i32 20, i32 4)
42 // COMMON-NEXT: store <80 x i32> [[TRANS]], ptr %a_t, align 4
44 ix4x20_t a_t = __builtin_matrix_transpose(*a);
// Struct with two matrix members: `in` (ux1x6_t, field index 0) and `out`
// (ux6x1_t, field index 1). The struct-member transpose tests read and write
// these fields through a struct Foo pointer.
47 struct Foo {
48 ux1x6_t in;
49 ux6x1_t out;
// Transposes F->in and stores the result to F->out. The checks expect the
// transpose call (i32 1, i32 6) directly after the matrix load, then a reload
// of %F.addr and a GEP to field index 1 of %struct.Foo as the store address.
52 void transpose_struct_member(struct Foo *F) {
53 // COMMON-LABEL: define{{.*}} void @transpose_struct_member(
54 // COMMON: [[M:%.*]] = load <6 x i32>, ptr {{.*}}, align 4
55 // COMMON-NEXT: [[M_T:%.*]] = call <6 x i32> @llvm.matrix.transpose.v6i32(<6 x i32> [[M]], i32 1, i32 6)
56 // CHECK32-NEXT: [[F_ADDR:%.*]] = load ptr, ptr %F.addr, align 4
57 // CHECK64-NEXT: [[F_ADDR:%.*]] = load ptr, ptr %F.addr, align 8
58 // COMMON-NEXT: [[OUT_PTR:%.*]] = getelementptr inbounds %struct.Foo, ptr [[F_ADDR]], i32 0, i32 1
59 // COMMON-NEXT: store <6 x i32> [[M_T]], ptr [[OUT_PTR]], align 4
61 F->out = __builtin_matrix_transpose(F->in);
// Applies the transpose builtin twice to F->in and stores the result back to
// F->in. The checks expect two chained transpose calls, the first with
// (i32 1, i32 6) and the second with (i32 6, i32 1), followed by a store
// through a GEP to field index 0 of %struct.Foo.
64 void transpose_transpose_struct_member(struct Foo *F) {
65 // COMMON-LABEL: define{{.*}} void @transpose_transpose_struct_member(
66 // COMMON: [[M:%.*]] = load <6 x i32>, ptr {{.*}}, align 4
67 // COMMON-NEXT: [[M_T:%.*]] = call <6 x i32> @llvm.matrix.transpose.v6i32(<6 x i32> [[M]], i32 1, i32 6)
68 // COMMON-NEXT: [[M_T2:%.*]] = call <6 x i32> @llvm.matrix.transpose.v6i32(<6 x i32> [[M_T]], i32 6, i32 1)
69 // CHECK32-NEXT: [[F_ADDR:%.*]] = load ptr, ptr %F.addr, align 4
70 // CHECK64-NEXT: [[F_ADDR:%.*]] = load ptr, ptr %F.addr, align 8
71 // COMMON-NEXT: [[IN_PTR:%.*]] = getelementptr inbounds %struct.Foo, ptr [[F_ADDR]], i32 0, i32 0
72 // COMMON-NEXT: store <6 x i32> [[M_T2]], ptr [[IN_PTR]], align 4
74 F->in = __builtin_matrix_transpose(__builtin_matrix_transpose(F->in));
// get_matrix is only declared here; it supplies a matrix value returned from a
// call for the test below.
77 dx5x5_t get_matrix(void);
// Transposes the value returned by get_matrix() into the local m_t. The checks
// expect, right after the entry label: the alloca for m_t, the call to
// @get_matrix, the transpose of the call result, and the store to the alloca
// (align 4 on the i386 run, align 8 on the x86_64 run).
79 void transpose_rvalue(void) {
80 // COMMON-LABEL: define{{.*}} void @transpose_rvalue()
81 // COMMON-NEXT: entry:
82 // CHECK32-NEXT: [[M_T_ADDR:%.*]] = alloca [25 x double], align 4
83 // CHECK64-NEXT: [[M_T_ADDR:%.*]] = alloca [25 x double], align 8
84 // COMMON-NEXT: [[CALL:%.*]] = call <25 x double> @get_matrix()
85 // COMMON-NEXT: [[M_T:%.*]] = call <25 x double> @llvm.matrix.transpose.v25f64(<25 x double> [[CALL]], i32 5, i32 5)
86 // CHECK32-NEXT: store <25 x double> [[M_T]], ptr [[M_T_ADDR]], align 4
87 // CHECK64-NEXT: store <25 x double> [[M_T]], ptr [[M_T_ADDR]], align 8
89 dx5x5_t m_t = __builtin_matrix_transpose(get_matrix());
// Const-qualified global matrix with no initializer; operand of the test below.
92 const dx5x5_t global_matrix;
// Transposes global_matrix into the local m_t. The checks expect the alloca for
// m_t, a <25 x double> load directly from @global_matrix, the transpose call,
// and the store to the alloca (align 4 on the i386 run, align 8 on x86_64).
94 void transpose_global(void) {
95 // COMMON-LABEL: define{{.*}} void @transpose_global()
96 // COMMON-NEXT: entry:
97 // CHECK32-NEXT: [[M_T_ADDR:%.*]] = alloca [25 x double], align 4
98 // CHECK32-NEXT: [[GLOBAL_MATRIX:%.*]] = load <25 x double>, ptr @global_matrix, align 4
99 // CHECK64-NEXT: [[M_T_ADDR:%.*]] = alloca [25 x double], align 8
100 // CHECK64-NEXT: [[GLOBAL_MATRIX:%.*]] = load <25 x double>, ptr @global_matrix, align 8
101 // COMMON-NEXT: [[M_T:%.*]] = call <25 x double> @llvm.matrix.transpose.v25f64(<25 x double> [[GLOBAL_MATRIX]], i32 5, i32 5)
102 // CHECK32-NEXT: store <25 x double> [[M_T]], ptr [[M_T_ADDR]], align 4
103 // CHECK64-NEXT: store <25 x double> [[M_T]], ptr [[M_T_ADDR]], align 8
105 dx5x5_t m_t = __builtin_matrix_transpose(global_matrix);
// Column-major load of a 5x5 double matrix from Ptr with the literal stride 5.
// The stride operand and the intrinsic name suffix are i32 on the i386 run and
// i64 on the x86_64 run; the pointer alignment is 4 and 8 respectively.
108 void column_major_load_with_const_stride_double(double *Ptr) {
109 // COMMON-LABEL: define{{.*}} void @column_major_load_with_const_stride_double(ptr %Ptr)
110 // CHECK32: [[PTR:%.*]] = load ptr, ptr %Ptr.addr, align 4
111 // CHECK32-NEXT: call <25 x double> @llvm.matrix.column.major.load.v25f64.i32(ptr align 4 [[PTR]], i32 5, i1 false, i32 5, i32 5)
112 // CHECK64: [[PTR:%.*]] = load ptr, ptr %Ptr.addr, align 8
113 // CHECK64-NEXT: call <25 x double> @llvm.matrix.column.major.load.v25f64.i64(ptr align 8 [[PTR]], i64 5, i1 false, i32 5, i32 5)
115 dx5x5_t m_a1 = __builtin_matrix_column_major_load(Ptr, 5, 5, 5);
// Column-major load where the stride is the constant expression 2 * 3 + 9.
// The checks expect the stride operand to appear as the constant 15 (i32 on
// the i386 run, i64 on the x86_64 run).
118 void column_major_load_with_const_stride2_double(double *Ptr) {
119 // COMMON-LABEL: define{{.*}} void @column_major_load_with_const_stride2_double(ptr %Ptr)
120 // CHECK32: [[PTR:%.*]] = load ptr, ptr %Ptr.addr, align 4
121 // CHECK32-NEXT: call <25 x double> @llvm.matrix.column.major.load.v25f64.i32(ptr align 4 [[PTR]], i32 15, i1 false, i32 5, i32 5)
122 // CHECK64: [[PTR:%.*]] = load ptr, ptr %Ptr.addr, align 8
123 // CHECK64-NEXT: call <25 x double> @llvm.matrix.column.major.load.v25f64.i64(ptr align 8 [[PTR]], i64 15, i1 false, i32 5, i32 5)
125 dx5x5_t m_a2 = __builtin_matrix_column_major_load(Ptr, 5, 5, 2 * 3 + 9);
// Column-major load of a 2x3 float matrix with a runtime unsigned long long
// stride. The i386 run expects the i64 stride to be truncated to i32 before
// the call; the x86_64 run expects the loaded i64 to be passed unchanged.
128 void column_major_load_with_variable_stride_ull_float(float *Ptr, unsigned long long S) {
129 // COMMON-LABEL: define{{.*}} void @column_major_load_with_variable_stride_ull_float(ptr %Ptr, i64 %S)
130 // CHECK32: [[S:%.*]] = load i64, ptr %S.addr, align 8
131 // CHECK32-NEXT: [[STRIDE_TRUNC:%.*]] = trunc i64 [[S]] to i32
132 // CHECK32-NEXT: [[PTR:%.*]] = load ptr, ptr %Ptr.addr, align 4
133 // CHECK32-NEXT: call <6 x float> @llvm.matrix.column.major.load.v6f32.i32(ptr align 4 [[PTR]], i32 [[STRIDE_TRUNC]], i1 false, i32 2, i32 3)
135 // CHECK64: [[S:%.*]] = load i64, ptr %S.addr, align 8
136 // CHECK64-NEXT: [[PTR:%.*]] = load ptr, ptr %Ptr.addr, align 8
137 // CHECK64-NEXT: call <6 x float> @llvm.matrix.column.major.load.v6f32.i64(ptr align 4 [[PTR]], i64 [[S]], i1 false, i32 2, i32 3)
139 fx2x3_t m_b = __builtin_matrix_column_major_load(Ptr, 2, 3, S);
// Column-major load of a 4x20 int matrix whose stride is the int expression
// S + 32. Both runs expect an `add nsw i32` of the loaded S and 32; the x86_64
// run additionally expects the sum to be sign-extended to i64 before the call.
142 void column_major_load_with_stride_math_int(int *Ptr, int S) {
143 // COMMON-LABEL: define{{.*}} void @column_major_load_with_stride_math_int(ptr %Ptr, i32 %S)
144 // COMMON: [[S:%.*]] = load i32, ptr %S.addr, align 4
145 // COMMON-NEXT: [[STRIDE:%.*]] = add nsw i32 [[S]], 32
146 // CHECK32-NEXT: [[PTR:%.*]] = load ptr, ptr %Ptr.addr, align 4
147 // CHECK32-NEXT: call <80 x i32> @llvm.matrix.column.major.load.v80i32.i32(ptr align 4 [[PTR]], i32 [[STRIDE]], i1 false, i32 4, i32 20)
149 // CHECK64-NEXT: [[STRIDE_EXT:%.*]] = sext i32 [[STRIDE]] to i64
150 // CHECK64-NEXT: [[PTR:%.*]] = load ptr, ptr %Ptr.addr, align 8
151 // CHECK64-NEXT: call <80 x i32> @llvm.matrix.column.major.load.v80i32.i64(ptr align 4 [[PTR]], i64 [[STRIDE_EXT]], i1 false, i32 4, i32 20)
153 ix4x20_t m_c = __builtin_matrix_column_major_load(Ptr, 4, 20, S + 32);
// Same as the int stride-math load test, but S is a short. The checks expect
// S to be sign-extended from i16 to i32 before the add; the x86_64 run then
// sign-extends the sum to i64. Unlike the neighbouring tests, the call checks
// here match the result name %matrix literally.
156 void column_major_load_with_stride_math_s_int(int *Ptr, short S) {
157 // COMMON-LABEL: define{{.*}} void @column_major_load_with_stride_math_s_int(ptr %Ptr, i16 signext %S)
158 // COMMON: [[S:%.*]] = load i16, ptr %S.addr, align 2
159 // COMMON-NEXT: [[S_EXT:%.*]] = sext i16 [[S]] to i32
160 // COMMON-NEXT: [[STRIDE:%.*]] = add nsw i32 [[S_EXT]], 32
161 // CHECK32-NEXT: [[PTR:%.*]] = load ptr, ptr %Ptr.addr, align 4
162 // CHECK32-NEXT: %matrix = call <80 x i32> @llvm.matrix.column.major.load.v80i32.i32(ptr align 4 [[PTR]], i32 [[STRIDE]], i1 false, i32 4, i32 20)
164 // CHECK64-NEXT: [[STRIDE_EXT:%.*]] = sext i32 [[STRIDE]] to i64
165 // CHECK64-NEXT: [[PTR:%.*]] = load ptr, ptr %Ptr.addr, align 8
166 // CHECK64-NEXT: %matrix = call <80 x i32> @llvm.matrix.column.major.load.v80i32.i64(ptr align 4 [[PTR]], i64 [[STRIDE_EXT]], i1 false, i32 4, i32 20)
168 ix4x20_t m_c = __builtin_matrix_column_major_load(Ptr, 4, 20, S + 32);
// Column-major load where the parameter is declared as double Ptr[25]. The
// define line is checked to take a plain `ptr %Ptr`, and the rest of the
// expected IR has the same shape as the double * test with stride 5.
171 void column_major_load_array1(double Ptr[25]) {
172 // COMMON-LABEL: define{{.*}} void @column_major_load_array1(ptr %Ptr)
173 // CHECK32: [[ADDR:%.*]] = load ptr, ptr %Ptr.addr, align 4
174 // CHECK32-NEXT: call <25 x double> @llvm.matrix.column.major.load.v25f64.i32(ptr align 4 [[ADDR]], i32 5, i1 false, i32 5, i32 5)
176 // CHECK64: [[ADDR:%.*]] = load ptr, ptr %Ptr.addr, align 8
177 // CHECK64-NEXT: call <25 x double> @llvm.matrix.column.major.load.v25f64.i64(ptr align 8 [[ADDR]], i64 5, i1 false, i32 5, i32 5)
179 dx5x5_t m = __builtin_matrix_column_major_load(Ptr, 5, 5, 5);
// Column-major load from a local double[25] array. The checks expect the
// pointer operand to be a GEP to element 0 of the array's alloca, carrying the
// alloca's alignment (8 on the i386 run, 16 on the x86_64 run). Note that the
// define line is matched including the literal attribute group `#0`.
182 void column_major_load_array2(void) {
183 // COMMON-LABEL: define{{.*}} void @column_major_load_array2() #0 {
184 // COMMON-NEXT: entry:
185 // CHECK32-NEXT: [[PTR:%.*]] = alloca [25 x double], align 8
186 // CHECK32: [[ARRAY_DEC:%.*]] = getelementptr inbounds [25 x double], ptr [[PTR]], i32 0, i32 0
187 // CHECK32-NEXT: call <25 x double> @llvm.matrix.column.major.load.v25f64.i32(ptr align 8 [[ARRAY_DEC]], i32 5, i1 false, i32 5, i32 5)
189 // CHECK64-NEXT: [[PTR:%.*]] = alloca [25 x double], align 16
190 // CHECK64: [[ARRAY_DEC:%.*]] = getelementptr inbounds [25 x double], ptr [[PTR]], i64 0, i64 0
191 // CHECK64-NEXT: call <25 x double> @llvm.matrix.column.major.load.v25f64.i64(ptr align 16 [[ARRAY_DEC]], i64 5, i1 false, i32 5, i32 5)
193 double Ptr[25];
194 dx5x5_t m = __builtin_matrix_column_major_load(Ptr, 5, 5, 5);
// Column-major load through a const double *. Apart from the function name,
// the expected IR is the same as for the non-const double * test with stride 5.
197 void column_major_load_const(const double *Ptr) {
198 // COMMON-LABEL: define{{.*}} void @column_major_load_const(ptr %Ptr)
199 // CHECK32: [[PTR:%.*]] = load ptr, ptr %Ptr.addr, align 4
200 // CHECK32-NEXT: call <25 x double> @llvm.matrix.column.major.load.v25f64.i32(ptr align 4 [[PTR]], i32 5, i1 false, i32 5, i32 5)
202 // CHECK64: [[PTR:%.*]] = load ptr, ptr %Ptr.addr, align 8
203 // CHECK64-NEXT: call <25 x double> @llvm.matrix.column.major.load.v25f64.i64(ptr align 8 [[PTR]], i64 5, i1 false, i32 5, i32 5)
205 dx5x5_t m_a1 = __builtin_matrix_column_major_load(Ptr, 5, 5, 5);
// Column-major load through a volatile double *. The i1 operand of the load
// intrinsic is expected to be `true` here, whereas the other load tests in
// this file expect `false`.
208 void column_major_load_volatile(volatile double *Ptr) {
209 // COMMON-LABEL: define{{.*}} void @column_major_load_volatile(ptr %Ptr)
210 // CHECK32: [[PTR:%.*]] = load ptr, ptr %Ptr.addr, align 4
211 // CHECK32-NEXT: call <25 x double> @llvm.matrix.column.major.load.v25f64.i32(ptr align 4 [[PTR]], i32 5, i1 true, i32 5, i32 5)
213 // CHECK64: [[PTR:%.*]] = load ptr, ptr %Ptr.addr, align 8
214 // CHECK64-NEXT: call <25 x double> @llvm.matrix.column.major.load.v25f64.i64(ptr align 8 [[PTR]], i64 5, i1 true, i32 5, i32 5)
216 dx5x5_t m_a1 = __builtin_matrix_column_major_load(Ptr, 5, 5, 5);
// Column-major store of the local 5x5 double matrix m to Ptr with the literal
// stride 5. m is declared without an initializer; the test only checks the
// emitted IR: matrix load, pointer load, then the store intrinsic (i32 stride
// on the i386 run, i64 on the x86_64 run).
219 void column_major_store_with_const_stride_double(double *Ptr) {
220 // COMMON-LABEL: define{{.*}} void @column_major_store_with_const_stride_double(ptr %Ptr)
221 // CHECK32: [[M:%.*]] = load <25 x double>, ptr {{.*}}, align 4
222 // CHECK32-NEXT: [[PTR:%.*]] = load ptr, ptr %Ptr.addr, align 4
223 // CHECK32-NEXT: call void @llvm.matrix.column.major.store.v25f64.i32(<25 x double> [[M]], ptr align 4 [[PTR]], i32 5, i1 false, i32 5, i32 5)
225 // CHECK64: [[M:%.*]] = load <25 x double>, ptr {{.*}}, align 8
226 // CHECK64-NEXT: [[PTR:%.*]] = load ptr, ptr %Ptr.addr, align 8
227 // CHECK64-NEXT: call void @llvm.matrix.column.major.store.v25f64.i64(<25 x double> [[M]], ptr align 8 [[PTR]], i64 5, i1 false, i32 5, i32 5)
229 dx5x5_t m;
230 __builtin_matrix_column_major_store(m, Ptr, 5);
// Column-major store where the stride is the constant expression 2 * 3 + 9.
// The checks expect the stride operand to appear as the constant 15 (i32 on
// the i386 run, i64 on the x86_64 run).
233 void column_major_store_with_const_stride2_double(double *Ptr) {
234 // COMMON-LABEL: define{{.*}} void @column_major_store_with_const_stride2_double(ptr %Ptr)
235 // CHECK32: [[M:%.*]] = load <25 x double>, ptr {{.*}}, align 4
236 // CHECK32-NEXT: [[PTR:%.*]] = load ptr, ptr %Ptr.addr, align 4
237 // CHECK32-NEXT: call void @llvm.matrix.column.major.store.v25f64.i32(<25 x double> [[M]], ptr align 4 [[PTR]], i32 15, i1 false, i32 5, i32 5)
239 // CHECK64: [[M:%.*]] = load <25 x double>, ptr {{.*}}, align 8
240 // CHECK64-NEXT: [[PTR:%.*]] = load ptr, ptr %Ptr.addr, align 8
241 // CHECK64-NEXT: call void @llvm.matrix.column.major.store.v25f64.i64(<25 x double> [[M]], ptr align 8 [[PTR]], i64 15, i1 false, i32 5, i32 5)
243 dx5x5_t m;
244 __builtin_matrix_column_major_store(m, Ptr, 2 * 3 + 9);
// Column-major store of a 4x20 int matrix whose stride is the int expression
// S + 32. The expected order is: matrix load, pointer load, load of S, then
// `add nsw i32`. The x86_64 run additionally sign-extends the sum to i64
// before the store intrinsic.
247 void column_major_store_with_stride_math_int(int *Ptr, int S) {
248 // COMMON-LABEL: define{{.*}} void @column_major_store_with_stride_math_int(ptr %Ptr, i32 %S)
249 // COMMON: [[M:%.*]] = load <80 x i32>, ptr {{.*}}, align 4
250 // CHECK32-NEXT: [[PTR:%.*]] = load ptr, ptr %Ptr.addr, align 4
251 // CHECK64-NEXT: [[PTR:%.*]] = load ptr, ptr %Ptr.addr, align 8
252 // COMMON-NEXT: [[S:%.*]] = load i32, ptr %S.addr, align 4
253 // COMMON-NEXT: [[ADD:%.*]] = add nsw i32 [[S]], 32
254 // CHECK32-NEXT: call void @llvm.matrix.column.major.store.v80i32.i32(<80 x i32> [[M]], ptr align 4 [[PTR]], i32 [[ADD]], i1 false, i32 4, i32 20)
256 // CHECK64-NEXT: [[IDX:%.*]] = sext i32 [[ADD]] to i64
257 // CHECK64-NEXT: call void @llvm.matrix.column.major.store.v80i32.i64(<80 x i32> [[M]], ptr align 4 [[PTR]], i64 [[IDX]], i1 false, i32 4, i32 20)
259 ix4x20_t m;
260 __builtin_matrix_column_major_store(m, Ptr, S + 32);
// Column-major store of a 4x20 int matrix whose stride is S + 2 with S a
// short. The checks expect S to be sign-extended from i16 to i32 before the
// `add nsw i32 ..., 2`; the x86_64 run then sign-extends the sum to i64.
263 void column_major_store_with_stride_math_s_int(int *Ptr, short S) {
264 // COMMON-LABEL: define{{.*}} void @column_major_store_with_stride_math_s_int(ptr %Ptr, i16 signext %S)
265 // COMMON: [[M:%.*]] = load <80 x i32>, ptr {{.*}}, align 4
266 // CHECK32-NEXT: [[PTR:%.*]] = load ptr, ptr %Ptr.addr, align 4
267 // CHECK64-NEXT: [[PTR:%.*]] = load ptr, ptr %Ptr.addr, align 8
268 // COMMON-NEXT: [[S:%.*]] = load i16, ptr %S.addr, align 2
269 // COMMON-NEXT: [[EXT:%.*]] = sext i16 [[S]] to i32
270 // COMMON-NEXT: [[ADD:%.*]] = add nsw i32 [[EXT]], 2
271 // CHECK32-NEXT: call void @llvm.matrix.column.major.store.v80i32.i32(<80 x i32> [[M]], ptr align 4 [[PTR]], i32 [[ADD]], i1 false, i32 4, i32 20)
273 // CHECK64-NEXT: [[IDX:%.*]] = sext i32 [[ADD]] to i64
274 // CHECK64-NEXT: call void @llvm.matrix.column.major.store.v80i32.i64(<80 x i32> [[M]], ptr align 4 [[PTR]], i64 [[IDX]], i1 false, i32 4, i32 20)
276 ix4x20_t m;
277 __builtin_matrix_column_major_store(m, Ptr, S + 2);
// Column-major store where the destination parameter is declared as
// double Ptr[25]. The define line is checked to take a plain `ptr %Ptr`, and
// the rest of the expected IR has the same shape as the double * store test
// with stride 5.
280 void column_major_store_array1(double Ptr[25]) {
281 // COMMON-LABEL: define{{.*}} void @column_major_store_array1(ptr %Ptr)
282 // CHECK32: [[M:%.*]] = load <25 x double>, ptr {{.*}}, align 4
283 // CHECK32-NEXT: [[PTR:%.*]] = load ptr, ptr %Ptr.addr, align 4
284 // CHECK32-NEXT: call void @llvm.matrix.column.major.store.v25f64.i32(<25 x double> [[M]], ptr align 4 [[PTR]], i32 5, i1 false, i32 5, i32 5)
286 // CHECK64: [[M:%.*]] = load <25 x double>, ptr {{.*}}, align 8
287 // CHECK64-NEXT: [[PTR:%.*]] = load ptr, ptr %Ptr.addr, align 8
288 // CHECK64-NEXT: call void @llvm.matrix.column.major.store.v25f64.i64(<25 x double> [[M]], ptr align 8 [[PTR]], i64 5, i1 false, i32 5, i32 5)
290 dx5x5_t m;
291 __builtin_matrix_column_major_store(m, Ptr, 5);
// Column-major store into a local double[25] array. The checks expect the
// destination to be a GEP to element 0 of %Ptr (matched by its literal name),
// with pointer alignment 8 on the i386 run and 16 on the x86_64 run.
294 void column_major_store_array2(void) {
295 // COMMON-LABEL: define{{.*}} void @column_major_store_array2()
296 // CHECK32: [[M:%.*]] = load <25 x double>, ptr {{.*}}, align 4
297 // CHECK32-NEXT: [[PTR:%.*]] = getelementptr inbounds [25 x double], ptr %Ptr, i32 0, i32 0
298 // CHECK32-NEXT: call void @llvm.matrix.column.major.store.v25f64.i32(<25 x double> [[M]], ptr align 8 [[PTR]], i32 5, i1 false, i32 5, i32 5)
300 // CHECK64: [[M:%.*]] = load <25 x double>, ptr {{.*}}, align 8
301 // CHECK64-NEXT: [[PTR:%.*]] = getelementptr inbounds [25 x double], ptr %Ptr, i64 0, i64 0
302 // CHECK64-NEXT: call void @llvm.matrix.column.major.store.v25f64.i64(<25 x double> [[M]], ptr align 16 [[PTR]], i64 5, i1 false, i32 5, i32 5)
304 double Ptr[25];
305 dx5x5_t m;
306 __builtin_matrix_column_major_store(m, Ptr, 5);
// Column-major store through a volatile double *. The i1 operand of the store
// intrinsic is expected to be `true` here, whereas the other store tests in
// this file expect `false`. The define line is matched including the literal
// attribute group `#0`.
309 void column_major_store_volatile(volatile double *Ptr) {
310 // COMMON-LABEL: define{{.*}} void @column_major_store_volatile(ptr %Ptr) #0 {
311 // CHECK32: [[M:%.*]] = load <25 x double>, ptr {{.*}}, align 4
312 // CHECK32-NEXT: [[PTR:%.*]] = load ptr, ptr %Ptr.addr, align 4
313 // CHECK32-NEXT: call void @llvm.matrix.column.major.store.v25f64.i32(<25 x double> [[M]], ptr align 4 [[PTR]], i32 5, i1 true, i32 5, i32 5)
315 // CHECK64: [[M:%.*]] = load <25 x double>, ptr {{.*}}, align 8
316 // CHECK64-NEXT: [[PTR:%.*]] = load ptr, ptr %Ptr.addr, align 8
317 // CHECK64-NEXT: call void @llvm.matrix.column.major.store.v25f64.i64(<25 x double> [[M]], ptr align 8 [[PTR]], i64 5, i1 true, i32 5, i32 5)
319 dx5x5_t m;
320 __builtin_matrix_column_major_store(m, Ptr, 5);