1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -mtriple=aarch64 < %s | FileCheck %s
4 @A = external dso_local local_unnamed_addr global [8 x [8 x i64]], align 8 ; loop loads one i64 from @A[2][%indvars.iv40]
5 @B = external dso_local local_unnamed_addr global [8 x [8 x i64]], align 8 ; loop loads a <2 x i64> starting at @B[%indvars.iv40][6]
6 @C = external dso_local local_unnamed_addr global [8 x [8 x i64]], align 8 ; %for.cond.cleanup stores the accumulated <2 x i64> values here
8 define dso_local void @run_test() local_unnamed_addr uwtable {
9 ; CHECK-LABEL: run_test:
10 ; CHECK: // %bb.0: // %entry
11 ; CHECK-NEXT: sub sp, sp, #192
12 ; CHECK-NEXT: .cfi_def_cfa_offset 192
13 ; CHECK-NEXT: stp d15, d14, [sp, #96] // 16-byte Folded Spill
14 ; CHECK-NEXT: stp d13, d12, [sp, #112] // 16-byte Folded Spill
15 ; CHECK-NEXT: stp d11, d10, [sp, #128] // 16-byte Folded Spill
16 ; CHECK-NEXT: stp d9, d8, [sp, #144] // 16-byte Folded Spill
17 ; CHECK-NEXT: stp x22, x21, [sp, #160] // 16-byte Folded Spill
18 ; CHECK-NEXT: stp x20, x19, [sp, #176] // 16-byte Folded Spill
19 ; CHECK-NEXT: .cfi_offset w19, -8
20 ; CHECK-NEXT: .cfi_offset w20, -16
21 ; CHECK-NEXT: .cfi_offset w21, -24
22 ; CHECK-NEXT: .cfi_offset w22, -32
23 ; CHECK-NEXT: .cfi_offset b8, -40
24 ; CHECK-NEXT: .cfi_offset b9, -48
25 ; CHECK-NEXT: .cfi_offset b10, -56
26 ; CHECK-NEXT: .cfi_offset b11, -64
27 ; CHECK-NEXT: .cfi_offset b12, -72
28 ; CHECK-NEXT: .cfi_offset b13, -80
29 ; CHECK-NEXT: .cfi_offset b14, -88
30 ; CHECK-NEXT: .cfi_offset b15, -96
31 ; CHECK-NEXT: movi v1.2d, #0000000000000000
32 ; CHECK-NEXT: mov x8, xzr
33 ; CHECK-NEXT: mov x9, xzr
34 ; CHECK-NEXT: adrp x10, B+48
35 ; CHECK-NEXT: add x10, x10, :lo12:B+48
36 ; CHECK-NEXT: adrp x11, A
37 ; CHECK-NEXT: add x11, x11, :lo12:A
38 ; CHECK-NEXT: // implicit-def: $q6
39 ; CHECK-NEXT: // implicit-def: $q7
40 ; CHECK-NEXT: // implicit-def: $q10
41 ; CHECK-NEXT: // implicit-def: $q3
42 ; CHECK-NEXT: // implicit-def: $q4
43 ; CHECK-NEXT: // implicit-def: $q5
44 ; CHECK-NEXT: // implicit-def: $q2
45 ; CHECK-NEXT: // implicit-def: $q16
46 ; CHECK-NEXT: // implicit-def: $q17
47 ; CHECK-NEXT: // implicit-def: $q18
48 ; CHECK-NEXT: // implicit-def: $q19
49 ; CHECK-NEXT: // implicit-def: $q20
50 ; CHECK-NEXT: // implicit-def: $q21
51 ; CHECK-NEXT: // implicit-def: $q22
52 ; CHECK-NEXT: // implicit-def: $q23
53 ; CHECK-NEXT: // implicit-def: $q24
54 ; CHECK-NEXT: // implicit-def: $q25
55 ; CHECK-NEXT: // implicit-def: $q27
56 ; CHECK-NEXT: // implicit-def: $q26
57 ; CHECK-NEXT: // implicit-def: $q28
58 ; CHECK-NEXT: // implicit-def: $q30
59 ; CHECK-NEXT: // implicit-def: $q15
60 ; CHECK-NEXT: // implicit-def: $q29
61 ; CHECK-NEXT: // implicit-def: $q31
62 ; CHECK-NEXT: // implicit-def: $q11
63 ; CHECK-NEXT: // implicit-def: $q9
64 ; CHECK-NEXT: // kill: killed $q6
65 ; CHECK-NEXT: // implicit-def: $q12
66 ; CHECK-NEXT: // implicit-def: $q13
67 ; CHECK-NEXT: // implicit-def: $q6
68 ; CHECK-NEXT: // kill: killed $q6
69 ; CHECK-NEXT: .LBB0_1: // %for.cond1.preheader
70 ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
71 ; CHECK-NEXT: ldr q14, [x8]
72 ; CHECK-NEXT: mov x12, xzr
73 ; CHECK-NEXT: add x7, x11, x8
74 ; CHECK-NEXT: ldr x13, [x12]
75 ; CHECK-NEXT: ldr x5, [x8]
76 ; CHECK-NEXT: ldr x7, [x7, #128]
77 ; CHECK-NEXT: mov x14, v14.d[1]
78 ; CHECK-NEXT: stp q22, q26, [sp] // 32-byte Folded Spill
79 ; CHECK-NEXT: mov v22.16b, v9.16b
80 ; CHECK-NEXT: stp q31, q15, [sp, #32] // 32-byte Folded Spill
81 ; CHECK-NEXT: ldr q15, [x12]
82 ; CHECK-NEXT: fmov x12, d14
83 ; CHECK-NEXT: ldr q14, [x10], #64
84 ; CHECK-NEXT: mov v9.16b, v30.16b
85 ; CHECK-NEXT: fmov x17, d15
86 ; CHECK-NEXT: mov x16, v15.d[1]
87 ; CHECK-NEXT: mov v30.16b, v27.16b
88 ; CHECK-NEXT: mul x15, x12, x13
89 ; CHECK-NEXT: mov x0, v14.d[1]
90 ; CHECK-NEXT: fmov x4, d14
91 ; CHECK-NEXT: mov v27.16b, v23.16b
92 ; CHECK-NEXT: mov v23.16b, v19.16b
93 ; CHECK-NEXT: mov v19.16b, v2.16b
94 ; CHECK-NEXT: mul x1, x14, x13
95 ; CHECK-NEXT: mov v8.16b, v28.16b
96 ; CHECK-NEXT: mov v28.16b, v24.16b
97 ; CHECK-NEXT: mov v24.16b, v20.16b
98 ; CHECK-NEXT: mov v20.16b, v16.16b
99 ; CHECK-NEXT: mov v16.16b, v3.16b
100 ; CHECK-NEXT: mul x18, x17, x13
101 ; CHECK-NEXT: mov v31.16b, v18.16b
102 ; CHECK-NEXT: mov v26.16b, v5.16b
103 ; CHECK-NEXT: fmov d15, x15
104 ; CHECK-NEXT: mov v5.16b, v1.16b
105 ; CHECK-NEXT: mov v18.16b, v10.16b
106 ; CHECK-NEXT: mul x2, x16, x13
107 ; CHECK-NEXT: mov v10.16b, v29.16b
108 ; CHECK-NEXT: mov v29.16b, v25.16b
109 ; CHECK-NEXT: mov v25.16b, v21.16b
110 ; CHECK-NEXT: mov v21.16b, v17.16b
111 ; CHECK-NEXT: mov v17.16b, v4.16b
112 ; CHECK-NEXT: mov v15.d[1], x1
113 ; CHECK-NEXT: mul x19, x12, x5
114 ; CHECK-NEXT: add x8, x8, #8
115 ; CHECK-NEXT: fmov d14, x18
116 ; CHECK-NEXT: cmp x8, #64
117 ; CHECK-NEXT: add x9, x9, #1
118 ; CHECK-NEXT: mul x12, x12, x7
119 ; CHECK-NEXT: mov v14.d[1], x2
120 ; CHECK-NEXT: add v12.2d, v12.2d, v15.2d
121 ; CHECK-NEXT: mul x3, x0, x13
122 ; CHECK-NEXT: fmov d1, x19
123 ; CHECK-NEXT: mul x13, x4, x13
124 ; CHECK-NEXT: fmov d2, x12
125 ; CHECK-NEXT: mul x6, x14, x5
126 ; CHECK-NEXT: add v6.2d, v13.2d, v14.2d
127 ; CHECK-NEXT: mov v13.16b, v12.16b
128 ; CHECK-NEXT: ldr q12, [sp, #80] // 16-byte Folded Reload
129 ; CHECK-NEXT: mul x14, x14, x7
130 ; CHECK-NEXT: fmov d0, x13
131 ; CHECK-NEXT: add v12.2d, v12.2d, v14.2d
132 ; CHECK-NEXT: mul x21, x17, x7
133 ; CHECK-NEXT: mov v1.d[1], x6
134 ; CHECK-NEXT: mul x18, x4, x7
135 ; CHECK-NEXT: mov v0.d[1], x3
136 ; CHECK-NEXT: mov v2.d[1], x14
137 ; CHECK-NEXT: str q12, [sp, #80] // 16-byte Folded Spill
138 ; CHECK-NEXT: mov v12.16b, v13.16b
139 ; CHECK-NEXT: mul x13, x17, x5
140 ; CHECK-NEXT: mov v13.16b, v6.16b
141 ; CHECK-NEXT: fmov d3, x21
142 ; CHECK-NEXT: ldp q15, q6, [sp, #48] // 32-byte Folded Reload
143 ; CHECK-NEXT: mul x20, x16, x7
144 ; CHECK-NEXT: add v11.2d, v11.2d, v1.2d
145 ; CHECK-NEXT: fmov d4, x18
146 ; CHECK-NEXT: mul x22, x0, x7
147 ; CHECK-NEXT: add v6.2d, v6.2d, v0.2d
148 ; CHECK-NEXT: add v15.2d, v15.2d, v2.2d
149 ; CHECK-NEXT: fmov d14, x13
150 ; CHECK-NEXT: mov v2.16b, v19.16b
151 ; CHECK-NEXT: mov v19.16b, v23.16b
152 ; CHECK-NEXT: mul x14, x4, x5
153 ; CHECK-NEXT: mov v23.16b, v27.16b
154 ; CHECK-NEXT: mov v27.16b, v30.16b
155 ; CHECK-NEXT: mov v3.d[1], x20
156 ; CHECK-NEXT: mov v30.16b, v9.16b
157 ; CHECK-NEXT: mov v9.16b, v22.16b
158 ; CHECK-NEXT: mul x12, x16, x5
159 ; CHECK-NEXT: str q6, [sp, #64] // 16-byte Folded Spill
160 ; CHECK-NEXT: mov v6.16b, v18.16b
161 ; CHECK-NEXT: mov v4.d[1], x22
162 ; CHECK-NEXT: add v27.2d, v27.2d, v1.2d
163 ; CHECK-NEXT: add v23.2d, v23.2d, v1.2d
164 ; CHECK-NEXT: mul x13, x0, x5
165 ; CHECK-NEXT: add v19.2d, v19.2d, v1.2d
166 ; CHECK-NEXT: add v2.2d, v2.2d, v1.2d
167 ; CHECK-NEXT: fmov d0, x14
168 ; CHECK-NEXT: add v30.2d, v30.2d, v3.2d
169 ; CHECK-NEXT: mov v3.16b, v16.16b
170 ; CHECK-NEXT: mov v16.16b, v20.16b
171 ; CHECK-NEXT: mov v20.16b, v24.16b
172 ; CHECK-NEXT: mov v24.16b, v28.16b
173 ; CHECK-NEXT: mov v14.d[1], x12
174 ; CHECK-NEXT: mov v28.16b, v8.16b
175 ; CHECK-NEXT: add v1.2d, v5.2d, v1.2d
176 ; CHECK-NEXT: add v28.2d, v8.2d, v4.2d
177 ; CHECK-NEXT: mov v4.16b, v17.16b
178 ; CHECK-NEXT: mov v17.16b, v21.16b
179 ; CHECK-NEXT: mov v0.d[1], x13
180 ; CHECK-NEXT: mov v21.16b, v25.16b
181 ; CHECK-NEXT: mov v25.16b, v29.16b
182 ; CHECK-NEXT: mov v29.16b, v10.16b
183 ; CHECK-NEXT: mov v5.16b, v26.16b
184 ; CHECK-NEXT: mov v18.16b, v31.16b
185 ; CHECK-NEXT: ldp q22, q26, [sp] // 32-byte Folded Reload
186 ; CHECK-NEXT: ldr q31, [sp, #32] // 16-byte Folded Reload
187 ; CHECK-NEXT: add v9.2d, v9.2d, v14.2d
188 ; CHECK-NEXT: add v24.2d, v24.2d, v14.2d
189 ; CHECK-NEXT: add v20.2d, v20.2d, v14.2d
190 ; CHECK-NEXT: add v31.2d, v31.2d, v14.2d
191 ; CHECK-NEXT: add v18.2d, v18.2d, v14.2d
192 ; CHECK-NEXT: add v16.2d, v16.2d, v14.2d
193 ; CHECK-NEXT: add v26.2d, v26.2d, v14.2d
194 ; CHECK-NEXT: add v22.2d, v22.2d, v14.2d
195 ; CHECK-NEXT: add v5.2d, v5.2d, v14.2d
196 ; CHECK-NEXT: add v3.2d, v3.2d, v14.2d
197 ; CHECK-NEXT: add v10.2d, v6.2d, v14.2d
198 ; CHECK-NEXT: add v29.2d, v29.2d, v0.2d
199 ; CHECK-NEXT: add v25.2d, v25.2d, v0.2d
200 ; CHECK-NEXT: add v21.2d, v21.2d, v0.2d
201 ; CHECK-NEXT: add v17.2d, v17.2d, v0.2d
202 ; CHECK-NEXT: add v4.2d, v4.2d, v0.2d
203 ; CHECK-NEXT: add v7.2d, v7.2d, v0.2d
204 ; CHECK-NEXT: b.ne .LBB0_1
205 ; CHECK-NEXT: // %bb.2: // %for.cond.cleanup
206 ; CHECK-NEXT: adrp x8, C
207 ; CHECK-NEXT: add x8, x8, :lo12:C
208 ; CHECK-NEXT: ldr q0, [sp, #80] // 16-byte Folded Reload
209 ; CHECK-NEXT: stp q13, q12, [x8]
210 ; CHECK-NEXT: ldr q6, [sp, #64] // 16-byte Folded Reload
211 ; CHECK-NEXT: stp q9, q11, [x8, #64]
212 ; CHECK-NEXT: ldp x20, x19, [sp, #176] // 16-byte Folded Reload
213 ; CHECK-NEXT: stp q15, q30, [x8, #144]
214 ; CHECK-NEXT: ldp x22, x21, [sp, #160] // 16-byte Folded Reload
215 ; CHECK-NEXT: stp q4, q3, [x8, #432]
216 ; CHECK-NEXT: ldp d9, d8, [sp, #144] // 16-byte Folded Reload
217 ; CHECK-NEXT: stp q0, q6, [x8, #32]
218 ; CHECK-NEXT: ldp d13, d12, [sp, #112] // 16-byte Folded Reload
219 ; CHECK-NEXT: stp q31, q29, [x8, #96]
220 ; CHECK-NEXT: ldp d15, d14, [sp, #96] // 16-byte Folded Reload
221 ; CHECK-NEXT: stp q28, q26, [x8, #176]
222 ; CHECK-NEXT: str q27, [x8, #208]
223 ; CHECK-NEXT: stp q25, q24, [x8, #240]
224 ; CHECK-NEXT: stp q23, q22, [x8, #272]
225 ; CHECK-NEXT: stp q21, q20, [x8, #304]
226 ; CHECK-NEXT: stp q19, q18, [x8, #336]
227 ; CHECK-NEXT: stp q17, q16, [x8, #368]
228 ; CHECK-NEXT: stp q2, q5, [x8, #400]
229 ; CHECK-NEXT: stp q1, q10, [x8, #464]
230 ; CHECK-NEXT: ldp d11, d10, [sp, #128] // 16-byte Folded Reload
231 ; CHECK-NEXT: str q7, [x8, #496]
232 ; CHECK-NEXT: add sp, sp, #192
233 ; CHECK-NEXT: .cfi_def_cfa_offset 0
234 ; CHECK-NEXT: .cfi_restore w19
235 ; CHECK-NEXT: .cfi_restore w20
236 ; CHECK-NEXT: .cfi_restore w21
237 ; CHECK-NEXT: .cfi_restore w22
238 ; CHECK-NEXT: .cfi_restore b8
239 ; CHECK-NEXT: .cfi_restore b9
240 ; CHECK-NEXT: .cfi_restore b10
241 ; CHECK-NEXT: .cfi_restore b11
242 ; CHECK-NEXT: .cfi_restore b12
243 ; CHECK-NEXT: .cfi_restore b13
244 ; CHECK-NEXT: .cfi_restore b14
245 ; CHECK-NEXT: .cfi_restore b15
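247 ; NOTE(review): disabled leftover check (`.cfi_offset b9, -16`); it conflicts with `.cfi_offset b9, -48` in the prologue above, so it is kept inert rather than re-enabled.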
249 br label %for.cond1.preheader
251 for.cond1.preheader: ; preds = %for.cond1.preheader, %entry
252 %0 = phi <2 x i64> [ undef, %entry ], [ %118, %for.cond1.preheader ]
253 %1 = phi <2 x i64> [ undef, %entry ], [ %116, %for.cond1.preheader ]
254 %2 = phi <2 x i64> [ zeroinitializer, %entry ], [ %114, %for.cond1.preheader ]
255 %3 = phi <2 x i64> [ undef, %entry ], [ %112, %for.cond1.preheader ]
256 %4 = phi <2 x i64> [ undef, %entry ], [ %107, %for.cond1.preheader ]
257 %5 = phi <2 x i64> [ undef, %entry ], [ %105, %for.cond1.preheader ]
258 %6 = phi <2 x i64> [ undef, %entry ], [ %103, %for.cond1.preheader ]
259 %7 = phi <2 x i64> [ undef, %entry ], [ %101, %for.cond1.preheader ]
260 %8 = phi <2 x i64> [ undef, %entry ], [ %96, %for.cond1.preheader ]
261 %9 = phi <2 x i64> [ undef, %entry ], [ %94, %for.cond1.preheader ]
262 %10 = phi <2 x i64> [ undef, %entry ], [ %92, %for.cond1.preheader ]
263 %11 = phi <2 x i64> [ undef, %entry ], [ %90, %for.cond1.preheader ]
264 %12 = phi <2 x i64> [ undef, %entry ], [ %85, %for.cond1.preheader ]
265 %13 = phi <2 x i64> [ undef, %entry ], [ %83, %for.cond1.preheader ]
266 %14 = phi <2 x i64> [ undef, %entry ], [ %81, %for.cond1.preheader ]
267 %15 = phi <2 x i64> [ undef, %entry ], [ %79, %for.cond1.preheader ]
268 %16 = phi <2 x i64> [ undef, %entry ], [ %74, %for.cond1.preheader ]
269 %17 = phi <2 x i64> [ undef, %entry ], [ %72, %for.cond1.preheader ]
270 %18 = phi <2 x i64> [ undef, %entry ], [ %70, %for.cond1.preheader ]
271 %19 = phi <2 x i64> [ undef, %entry ], [ %65, %for.cond1.preheader ]
272 %20 = phi <2 x i64> [ undef, %entry ], [ %63, %for.cond1.preheader ]
273 %21 = phi <2 x i64> [ undef, %entry ], [ %61, %for.cond1.preheader ]
274 %22 = phi <2 x i64> [ undef, %entry ], [ %56, %for.cond1.preheader ]
275 %23 = phi <2 x i64> [ undef, %entry ], [ %54, %for.cond1.preheader ]
276 %24 = phi <2 x i64> [ undef, %entry ], [ %52, %for.cond1.preheader ]
277 %25 = phi <2 x i64> [ undef, %entry ], [ %50, %for.cond1.preheader ]
278 %26 = phi <2 x i64> [ undef, %entry ], [ %45, %for.cond1.preheader ]
279 %27 = phi <2 x i64> [ undef, %entry ], [ %43, %for.cond1.preheader ]
280 %28 = phi <2 x i64> [ undef, %entry ], [ %41, %for.cond1.preheader ]
281 %29 = phi <2 x i64> [ undef, %entry ], [ %39, %for.cond1.preheader ]
282 %indvars.iv40 = phi i64 [ 0, %entry ], [ %indvars.iv.next41, %for.cond1.preheader ]
283 %30 = load <2 x i64>, <2 x i64>* null, align 8
284 %31 = load <2 x i64>, <2 x i64>* undef, align 8
285 %arrayidx14.4.phi.trans.insert = getelementptr inbounds [8 x [8 x i64]], [8 x [8 x i64]]* @B, i64 0, i64 %indvars.iv40, i64 4
286 %32 = load <2 x i64>, <2 x i64>* null, align 8
287 %arrayidx14.6.phi.trans.insert = getelementptr inbounds [8 x [8 x i64]], [8 x [8 x i64]]* @B, i64 0, i64 %indvars.iv40, i64 6
288 %33 = bitcast i64* %arrayidx14.6.phi.trans.insert to <2 x i64>*
289 %34 = load <2 x i64>, <2 x i64>* %33, align 8
290 %35 = load i64, i64* null, align 8
291 %36 = insertelement <2 x i64> undef, i64 %35, i32 0
292 %37 = shufflevector <2 x i64> %36, <2 x i64> undef, <2 x i32> zeroinitializer
293 %38 = mul nsw <2 x i64> %30, %37
294 %39 = add nsw <2 x i64> %29, %38
295 %40 = mul nsw <2 x i64> %31, %37
296 %41 = add nsw <2 x i64> %28, %40
297 %42 = mul nsw <2 x i64> %32, %37
298 %43 = add nsw <2 x i64> %27, %42
299 %44 = mul nsw <2 x i64> %34, %37
300 %45 = add nsw <2 x i64> %26, %44
301 %46 = load i64, i64* undef, align 8
302 %47 = insertelement <2 x i64> undef, i64 %46, i32 0
303 %48 = shufflevector <2 x i64> %47, <2 x i64> undef, <2 x i32> zeroinitializer
304 %49 = mul nsw <2 x i64> %30, %48
305 %50 = add nsw <2 x i64> %25, %49
306 %51 = mul nsw <2 x i64> %31, %48
307 %52 = add nsw <2 x i64> %24, %51
308 %53 = mul nsw <2 x i64> %32, %48
309 %54 = add nsw <2 x i64> %23, %53
310 %55 = mul nsw <2 x i64> %34, %48
311 %56 = add nsw <2 x i64> %22, %55
312 %arrayidx10.2 = getelementptr inbounds [8 x [8 x i64]], [8 x [8 x i64]]* @A, i64 0, i64 2, i64 %indvars.iv40
313 %57 = load i64, i64* %arrayidx10.2, align 8
314 %58 = insertelement <2 x i64> undef, i64 %57, i32 0
315 %59 = shufflevector <2 x i64> %58, <2 x i64> undef, <2 x i32> zeroinitializer
316 %60 = mul nsw <2 x i64> %31, %59
317 %61 = add nsw <2 x i64> %21, %60
318 %62 = mul nsw <2 x i64> %32, %59
319 %63 = add nsw <2 x i64> %20, %62
320 %64 = mul nsw <2 x i64> %34, %59
321 %65 = add nsw <2 x i64> %19, %64
322 %66 = load i64, i64* undef, align 8
323 %67 = insertelement <2 x i64> undef, i64 %66, i32 0
324 %68 = shufflevector <2 x i64> %67, <2 x i64> undef, <2 x i32> zeroinitializer
325 %69 = mul nsw <2 x i64> %30, %68
326 %70 = add nsw <2 x i64> %18, %69
327 %71 = mul nsw <2 x i64> %31, %68
328 %72 = add nsw <2 x i64> %17, %71
329 %73 = mul nsw <2 x i64> %34, %68
330 %74 = add nsw <2 x i64> %16, %73
331 %75 = load i64, i64* undef, align 8
332 %76 = insertelement <2 x i64> undef, i64 %75, i32 0
333 %77 = shufflevector <2 x i64> %76, <2 x i64> undef, <2 x i32> zeroinitializer
334 %78 = mul nsw <2 x i64> %30, %77
335 %79 = add nsw <2 x i64> %15, %78
336 %80 = mul nsw <2 x i64> %31, %77
337 %81 = add nsw <2 x i64> %14, %80
338 %82 = mul nsw <2 x i64> %32, %77
339 %83 = add nsw <2 x i64> %13, %82
340 %84 = mul nsw <2 x i64> %34, %77
341 %85 = add nsw <2 x i64> %12, %84
342 %86 = load i64, i64* undef, align 8
343 %87 = insertelement <2 x i64> undef, i64 %86, i32 0
344 %88 = shufflevector <2 x i64> %87, <2 x i64> undef, <2 x i32> zeroinitializer
345 %89 = mul nsw <2 x i64> %30, %88
346 %90 = add nsw <2 x i64> %11, %89
347 %91 = mul nsw <2 x i64> %31, %88
348 %92 = add nsw <2 x i64> %10, %91
349 %93 = mul nsw <2 x i64> %32, %88
350 %94 = add nsw <2 x i64> %9, %93
351 %95 = mul nsw <2 x i64> %34, %88
352 %96 = add nsw <2 x i64> %8, %95
353 %97 = load i64, i64* undef, align 8
354 %98 = insertelement <2 x i64> undef, i64 %97, i32 0
355 %99 = shufflevector <2 x i64> %98, <2 x i64> undef, <2 x i32> zeroinitializer
356 %100 = mul nsw <2 x i64> %30, %99
357 %101 = add nsw <2 x i64> %7, %100
358 %102 = mul nsw <2 x i64> %31, %99
359 %103 = add nsw <2 x i64> %6, %102
360 %104 = mul nsw <2 x i64> %32, %99
361 %105 = add nsw <2 x i64> %5, %104
362 %106 = mul nsw <2 x i64> %34, %99
363 %107 = add nsw <2 x i64> %4, %106
364 %108 = load i64, i64* undef, align 8
365 %109 = insertelement <2 x i64> undef, i64 %108, i32 0
366 %110 = shufflevector <2 x i64> %109, <2 x i64> undef, <2 x i32> zeroinitializer
367 %111 = mul nsw <2 x i64> %30, %110
368 %112 = add nsw <2 x i64> %3, %111
369 %113 = mul nsw <2 x i64> %31, %110
370 %114 = add nsw <2 x i64> %2, %113
371 %115 = mul nsw <2 x i64> %32, %110
372 %116 = add nsw <2 x i64> %1, %115
373 %117 = mul nsw <2 x i64> %34, %110
374 %118 = add nsw <2 x i64> %0, %117
375 %indvars.iv.next41 = add nuw nsw i64 %indvars.iv40, 1
376 %exitcond42 = icmp eq i64 %indvars.iv.next41, 8
377 br i1 %exitcond42, label %for.cond.cleanup, label %for.cond1.preheader
379 for.cond.cleanup: ; preds = %for.cond1.preheader
380 store <2 x i64> %39, <2 x i64>* bitcast ([8 x [8 x i64]]* @C to <2 x i64>*), align 8
381 store <2 x i64> %41, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x [8 x i64]], [8 x [8 x i64]]* @C, i64 0, i64 0, i64 2) to <2 x i64>*), align 8
382 store <2 x i64> %43, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x [8 x i64]], [8 x [8 x i64]]* @C, i64 0, i64 0, i64 4) to <2 x i64>*), align 8
383 store <2 x i64> %45, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x [8 x i64]], [8 x [8 x i64]]* @C, i64 0, i64 0, i64 6) to <2 x i64>*), align 8
384 store <2 x i64> %50, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x [8 x i64]], [8 x [8 x i64]]* @C, i64 0, i64 1, i64 0) to <2 x i64>*), align 8
385 store <2 x i64> %52, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x [8 x i64]], [8 x [8 x i64]]* @C, i64 0, i64 1, i64 2) to <2 x i64>*), align 8
386 store <2 x i64> %54, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x [8 x i64]], [8 x [8 x i64]]* @C, i64 0, i64 1, i64 4) to <2 x i64>*), align 8
387 store <2 x i64> %56, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x [8 x i64]], [8 x [8 x i64]]* @C, i64 0, i64 1, i64 6) to <2 x i64>*), align 8
388 store <2 x i64> %61, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x [8 x i64]], [8 x [8 x i64]]* @C, i64 0, i64 2, i64 2) to <2 x i64>*), align 8
389 store <2 x i64> %63, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x [8 x i64]], [8 x [8 x i64]]* @C, i64 0, i64 2, i64 4) to <2 x i64>*), align 8
390 store <2 x i64> %65, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x [8 x i64]], [8 x [8 x i64]]* @C, i64 0, i64 2, i64 6) to <2 x i64>*), align 8
391 store <2 x i64> %70, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x [8 x i64]], [8 x [8 x i64]]* @C, i64 0, i64 3, i64 0) to <2 x i64>*), align 8
392 store <2 x i64> %72, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x [8 x i64]], [8 x [8 x i64]]* @C, i64 0, i64 3, i64 2) to <2 x i64>*), align 8
393 store <2 x i64> %74, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x [8 x i64]], [8 x [8 x i64]]* @C, i64 0, i64 3, i64 6) to <2 x i64>*), align 8
394 store <2 x i64> %79, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x [8 x i64]], [8 x [8 x i64]]* @C, i64 0, i64 4, i64 0) to <2 x i64>*), align 8
395 store <2 x i64> %81, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x [8 x i64]], [8 x [8 x i64]]* @C, i64 0, i64 4, i64 2) to <2 x i64>*), align 8
396 store <2 x i64> %83, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x [8 x i64]], [8 x [8 x i64]]* @C, i64 0, i64 4, i64 4) to <2 x i64>*), align 8
397 store <2 x i64> %85, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x [8 x i64]], [8 x [8 x i64]]* @C, i64 0, i64 4, i64 6) to <2 x i64>*), align 8
398 store <2 x i64> %90, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x [8 x i64]], [8 x [8 x i64]]* @C, i64 0, i64 5, i64 0) to <2 x i64>*), align 8
399 store <2 x i64> %92, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x [8 x i64]], [8 x [8 x i64]]* @C, i64 0, i64 5, i64 2) to <2 x i64>*), align 8
400 store <2 x i64> %94, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x [8 x i64]], [8 x [8 x i64]]* @C, i64 0, i64 5, i64 4) to <2 x i64>*), align 8
401 store <2 x i64> %96, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x [8 x i64]], [8 x [8 x i64]]* @C, i64 0, i64 5, i64 6) to <2 x i64>*), align 8
402 store <2 x i64> %101, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x [8 x i64]], [8 x [8 x i64]]* @C, i64 0, i64 6, i64 0) to <2 x i64>*), align 8
403 store <2 x i64> %103, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x [8 x i64]], [8 x [8 x i64]]* @C, i64 0, i64 6, i64 2) to <2 x i64>*), align 8
404 store <2 x i64> %105, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x [8 x i64]], [8 x [8 x i64]]* @C, i64 0, i64 6, i64 4) to <2 x i64>*), align 8
405 store <2 x i64> %107, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x [8 x i64]], [8 x [8 x i64]]* @C, i64 0, i64 6, i64 6) to <2 x i64>*), align 8
406 store <2 x i64> %112, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x [8 x i64]], [8 x [8 x i64]]* @C, i64 0, i64 7, i64 0) to <2 x i64>*), align 8
407 store <2 x i64> %114, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x [8 x i64]], [8 x [8 x i64]]* @C, i64 0, i64 7, i64 2) to <2 x i64>*), align 8
408 store <2 x i64> %116, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x [8 x i64]], [8 x [8 x i64]]* @C, i64 0, i64 7, i64 4) to <2 x i64>*), align 8
409 store <2 x i64> %118, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x [8 x i64]], [8 x [8 x i64]]* @C, i64 0, i64 7, i64 6) to <2 x i64>*), align 8