1 ; RUN: opt -passes=loop-vectorize -mtriple=arm64-apple-darwin -S %s | FileCheck %s
3 ; Test cases for extending the vectorization factor in loops that contain small memory operations.
6 ; Test with a loop that contains memory accesses of i8 and i32 types. The
7 ; maximum VF for NEON is calculated by 128/size of smallest type in loop.
8 ; And while we don't have an instruction to load 4 x i8, vectorization
9 ; might still be profitable.
; Loop under test: each iteration reads one i8 from %src, zero-extends it to
; i32, adds %off, and writes the i32 result to %dst at the same index %iv.
; Both pointer arguments are declared noalias.
10 define void @test_load_i8_store_i32(ptr noalias %src, ptr noalias %dst, i32 %off, i64 %N) {
11 ; CHECK-LABEL: @test_load_i8_store_i32(
18 %iv = phi i64 [ %iv.next, %loop ], [ 0, %entry ] ; induction variable: 0 from %entry, %iv.next on the back edge
19 %gep.src = getelementptr inbounds i8, ptr %src, i64 %iv ; address of the i8 element at index %iv
20 %lv = load i8, ptr %gep.src, align 1 ; narrow (i8) memory access
21 %lv.ext = zext i8 %lv to i32 ; widen to the store type
22 %add = add i32 %lv.ext, %off
23 %gep.dst = getelementptr inbounds i32, ptr %dst, i64 %iv ; address of the i32 element at index %iv
24 store i32 %add, ptr %gep.dst ; wide (i32) memory access; no explicit alignment given
25 %iv.next = add nuw nsw i64 %iv, 1 ; step the index by one element
26 %exitcond.not = icmp eq i64 %iv.next, %N ; true once the incremented index equals %N
27 br i1 %exitcond.not, label %exit, label %loop ; leave to %exit when true, otherwise iterate again
33 ; Same as test_load_i8_store_i32, but with types flipped for load and store.
; Loop under test: each iteration reads one i32 from %src, adds %off,
; truncates the sum to i8, and writes that byte to %dst at the same index %iv.
; Both pointer arguments are declared noalias.
34 define void @test_load_i32_store_i8(ptr noalias %src, ptr noalias %dst, i32 %off, i64 %N) {
35 ; CHECK-LABEL: @test_load_i32_store_i8(
42 %iv = phi i64 [ %iv.next, %loop ], [ 0, %entry ] ; induction variable: 0 from %entry, %iv.next on the back edge
43 %gep.src = getelementptr inbounds i32, ptr %src, i64 %iv ; address of the i32 element at index %iv
44 %lv = load i32, ptr %gep.src, align 1 ; wide (i32) load; only byte alignment is stated
45 %add = add i32 %lv, %off ; arithmetic is done at i32 width
46 %add.trunc = trunc i32 %add to i8 ; narrow to the store type
47 %gep.dst = getelementptr inbounds i8, ptr %dst, i64 %iv ; address of the i8 element at index %iv
48 store i8 %add.trunc, ptr %gep.dst ; narrow (i8) memory access; no explicit alignment given
49 %iv.next = add nuw nsw i64 %iv, 1 ; step the index by one element
50 %exitcond.not = icmp eq i64 %iv.next, %N ; true once the incremented index equals %N
51 br i1 %exitcond.not, label %exit, label %loop ; leave to %exit when true, otherwise iterate again
57 ; All memory operations use i32, all memory operations are profitable with VF 4.
; Loop under test: both memory accesses are i32, but the arithmetic in between
; is done at i8 width: the loaded value is truncated to i8, %off (an i8
; argument) is added, and the sum is zero-extended back to i32 before the store.
; Both pointer arguments are declared noalias.
58 define void @test_load_i32_store_i32(ptr noalias %src, ptr noalias %dst, i8 %off, i64 %N) {
59 ; CHECK-LABEL: @test_load_i32_store_i32(
67 %iv = phi i64 [ %iv.next, %loop ], [ 0, %entry ] ; induction variable: 0 from %entry, %iv.next on the back edge
68 %gep.src = getelementptr inbounds i32, ptr %src, i64 %iv ; address of the i32 element at index %iv
69 %lv = load i32, ptr %gep.src, align 1 ; i32 load; only byte alignment is stated
70 %lv.trunc = trunc i32 %lv to i8 ; drop to i8 for the add
71 %add = add i8 %lv.trunc, %off ; the only i8-typed computation in the loop
72 %add.ext = zext i8 %add to i32 ; widen back to the store type
73 %gep.dst = getelementptr inbounds i32, ptr %dst, i64 %iv ; address of the i32 element at index %iv
74 store i32 %add.ext, ptr %gep.dst ; i32 store; no explicit alignment given
75 %iv.next = add nuw nsw i64 %iv, 1 ; step the index by one element
76 %exitcond.not = icmp eq i64 %iv.next, %N ; true once the incremented index equals %N
77 br i1 %exitcond.not, label %exit, label %loop ; leave to %exit when true, otherwise iterate again
83 ; Test with loop body that requires a large number of vector registers if the
84 ; vectorization factor is large. Make sure the register estimates limit the
85 ; vectorization factor.
; Loop under test: one i8 load from %src is widened to i64 and combined with
; %off, %off.2 and i64 values loaded from memory into five different i64
; results, each stored to its own noalias destination (%dst, %dst.2, %dst.3,
; %dst.4, %dst.5) at index %iv.
86 define void @test_load_i8_store_i64_large(ptr noalias %src, ptr noalias %dst, ptr noalias %dst.2, ptr noalias %dst.3, ptr noalias %dst.4, ptr noalias %dst.5, i64%off, i64 %off.2, i64 %N) {
87 ; CHECK-LABEL: @test_load_i8_store_i64_large
; NOTE(review): unlike the label patterns of the other three tests visible in
; this file, the pattern above omits the trailing '('.
94 %iv = phi i64 [ %iv.next, %loop ], [ 0, %entry ] ; induction variable: 0 from %entry, %iv.next on the back edge
95 %gep.src = getelementptr inbounds i8, ptr %src, i64 %iv ; address of the i8 element at index %iv
96 %gep.dst.3 = getelementptr inbounds i64, ptr %dst.3, i64 %iv
97 %lv.dst.3 = load i64, ptr %gep.dst.3, align 1 ; previous contents of dst.3[iv]; overwritten by the store below
98 %gep.dst.5 = getelementptr inbounds i64, ptr %dst.5, i64 %iv ; only used by the final store in this loop
; NOTE(review): the next load reads through %gep.dst.3, not the %gep.dst.5
; computed on the line above, so %lv.dst.5 re-reads the same address as
; %lv.dst.3. Possibly a copy-paste slip; confirm before changing, because the
; expected output for this test is not visible here.
99 %lv.dst.5 = load i64, ptr %gep.dst.3, align 1
101 %lv = load i8, ptr %gep.src, align 1 ; the only narrow (i8) memory access in the loop
102 %lv.ext = zext i8 %lv to i64 ; widen to the i64 width used by everything else
103 %add = add i64 %lv.ext, %off ; add   = zext(src[iv]) + off
104 %add.2 = add i64 %add, %off.2 ; add.2 = add + off.2
105 %gep.dst = getelementptr inbounds i64, ptr %dst, i64 %iv
106 %gep.dst.2 = getelementptr inbounds i64, ptr %dst.2, i64 %iv
108 %add.3 = add i64 %add.2, %lv.dst.3 ; add.3 = add.2 + lv.dst.3
109 %add.4 = add i64 %add.3, %add ; add.4 = add.3 + add
110 %gep.dst.4 = getelementptr inbounds i64, ptr %dst.4, i64 %iv
111 %add.5 = add i64 %add.2, %lv.dst.5 ; add.5 = add.2 + lv.dst.5
112 store i64 %add.2, ptr %gep.dst.2 ; dst.2[iv] = add.2
113 store i64 %add, ptr %gep.dst ; dst[iv]   = add
114 store i64 %add.3, ptr %gep.dst.3 ; dst.3[iv] = add.3
115 store i64 %add.4, ptr %gep.dst.4 ; dst.4[iv] = add.4
116 store i64 %add.5, ptr %gep.dst.5 ; dst.5[iv] = add.5
118 %iv.next = add nuw nsw i64 %iv, 1 ; step the index by one element
119 %exitcond.not = icmp eq i64 %iv.next, %N ; true once the incremented index equals %N
120 br i1 %exitcond.not, label %exit, label %loop ; leave to %exit when true, otherwise iterate again