1 ; RUN: opt -loop-vectorize -mtriple=arm64-apple-darwin -S %s | FileCheck %s
3 ; Test cases for extending the vectorization factor, if small memory operations
; are not profitable.
6 ; Test with a loop that contains memory accesses of i8 and i32 types. The
7 ; default maximum VF for NEON is 4. And while we don't have an instruction to
8 ; load 4 x i8, vectorization might still be profitable.
;
; Each iteration computes dst[iv] = zext(src[iv]) + off, i.e. a narrow i8 load
; feeds a wide i32 store. Both pointer arguments are declared noalias.
9 define void @test_load_i8_store_i32(i8* noalias %src, i32* noalias %dst, i32 %off, i64 %N) {
10 ; CHECK-LABEL: @test_load_i8_store_i32(
17 %iv = phi i64 [ %iv.next, %loop ], [ 0, %entry ] ; induction variable, 0 when coming from %entry
18 %gep.src = getelementptr inbounds i8, i8* %src, i64 %iv ; &src[iv]
19 %lv = load i8, i8* %gep.src, align 1 ; narrow (i8) memory access
20 %lv.ext = zext i8 %lv to i32 ; widen to the store type
21 %add = add i32 %lv.ext, %off
22 %gep.dst = getelementptr inbounds i32, i32* %dst, i64 %iv ; &dst[iv]
23 store i32 %add, i32* %gep.dst ; wide (i32) memory access
24 %iv.next = add nuw nsw i64 %iv, 1
25 %exitcond.not = icmp eq i64 %iv.next, %N ; leave the loop once iv.next == N
26 br i1 %exitcond.not, label %exit, label %loop
32 ; Same as test_load_i8_store_i32, but with types flipped for load and store.
;
; Each iteration computes dst[iv] = trunc(src[iv] + off): a wide i32 load feeds
; a narrow i8 store. Both pointer arguments are declared noalias.
33 define void @test_load_i32_store_i8(i32* noalias %src, i8* noalias %dst, i32 %off, i64 %N) {
34 ; CHECK-LABEL: @test_load_i32_store_i8(
41 %iv = phi i64 [ %iv.next, %loop ], [ 0, %entry ] ; induction variable, 0 when coming from %entry
42 %gep.src = getelementptr inbounds i32, i32* %src, i64 %iv ; &src[iv]
43 %lv = load i32, i32* %gep.src, align 1 ; wide (i32) access, declared with align 1
44 %add = add i32 %lv, %off ; arithmetic is done at i32 width
45 %add.trunc = trunc i32 %add to i8 ; narrow to the store type
46 %gep.dst = getelementptr inbounds i8, i8* %dst, i64 %iv ; &dst[iv]
47 store i8 %add.trunc, i8* %gep.dst ; narrow (i8) memory access
48 %iv.next = add nuw nsw i64 %iv, 1
49 %exitcond.not = icmp eq i64 %iv.next, %N ; leave the loop once iv.next == N
50 br i1 %exitcond.not, label %exit, label %loop
56 ; All memory operations use i32, so all of them are profitable with VF 4.
;
; Only the arithmetic in between is narrow: the loaded i32 is truncated to i8,
; added to the i8 argument %off, and zero-extended back to i32 before the store.
; Both pointer arguments are declared noalias.
57 define void @test_load_i32_store_i32(i32* noalias %src, i32* noalias %dst, i8 %off, i64 %N) {
58 ; CHECK-LABEL: @test_load_i32_store_i32(
66 %iv = phi i64 [ %iv.next, %loop ], [ 0, %entry ] ; induction variable, 0 when coming from %entry
67 %gep.src = getelementptr inbounds i32, i32* %src, i64 %iv ; &src[iv]
68 %lv = load i32, i32* %gep.src, align 1 ; i32 access, declared with align 1
69 %lv.trunc = trunc i32 %lv to i8 ; drop to i8 for the add
70 %add = add i8 %lv.trunc, %off ; the only i8-typed computation in the loop
71 %add.ext = zext i8 %add to i32 ; widen back to the store type
72 %gep.dst = getelementptr inbounds i32, i32* %dst, i64 %iv ; &dst[iv]
73 store i32 %add.ext, i32* %gep.dst ; i32 access
74 %iv.next = add nuw nsw i64 %iv, 1
75 %exitcond.not = icmp eq i64 %iv.next, %N ; leave the loop once iv.next == N
76 br i1 %exitcond.not, label %exit, label %loop
82 ; Test with loop body that requires a large number of vector registers if the
83 ; vectorization factor is large. Make sure the register estimates limit the
84 ; vectorization factor.
;
; The loop body performs one i8 load (from %src), two i64 loads and five i64
; stores (one to each of %dst, %dst.2, %dst.3, %dst.4 and %dst.5), all indexed
; by %iv. Every pointer argument is declared noalias.
85 define void @test_load_i8_store_i64_large(i8* noalias %src, i64* noalias %dst, i64* noalias %dst.2, i64* noalias %dst.3, i64* noalias %dst.4, i64* noalias %dst.5, i64%off, i64 %off.2, i64 %N) {
86 ; CHECK-LABEL: @test_load_i8_store_i64_large
93 %iv = phi i64 [ %iv.next, %loop ], [ 0, %entry ] ; induction variable, 0 when coming from %entry
94 %gep.src = getelementptr inbounds i8, i8* %src, i64 %iv ; &src[iv]
95 %gep.dst.3 = getelementptr inbounds i64, i64* %dst.3, i64 %iv ; &dst.3[iv], both loaded from and stored to
96 %lv.dst.3 = load i64, i64* %gep.dst.3, align 1
97 %gep.dst.5 = getelementptr inbounds i64, i64* %dst.5, i64 %iv ; &dst.5[iv], only used by the store below
; NOTE(review): despite its name, %lv.dst.5 is loaded from %gep.dst.3, not from
; %gep.dst.5, so it reads the same address as %lv.dst.3. This may be a
; copy-paste slip; confirm the intent before changing it, because the expected
; vectorizer output for this test may depend on the current form.
98 %lv.dst.5 = load i64, i64* %gep.dst.3, align 1
100 %lv = load i8, i8* %gep.src, align 1 ; the single narrow (i8) memory access
101 %lv.ext = zext i8 %lv to i64 ; widen to the i64 used by all stores
102 %add = add i64 %lv.ext, %off ; add   = zext(src[iv]) + off
103 %add.2 = add i64 %add, %off.2 ; add.2 = add + off.2
104 %gep.dst = getelementptr inbounds i64, i64* %dst, i64 %iv ; &dst[iv]
105 %gep.dst.2 = getelementptr inbounds i64, i64* %dst.2, i64 %iv ; &dst.2[iv]
107 %add.3 = add i64 %add.2, %lv.dst.3 ; add.3 = add.2 + value loaded from dst.3[iv]
108 %add.4 = add i64 %add.3, %add ; add.4 = add.3 + add
109 %gep.dst.4 = getelementptr inbounds i64, i64* %dst.4, i64 %iv ; &dst.4[iv]
110 %add.5 = add i64 %add.2, %lv.dst.5 ; add.5 = add.2 + %lv.dst.5 (see the review note above)
111 store i64 %add.2, i64* %gep.dst.2
112 store i64 %add, i64* %gep.dst
113 store i64 %add.3, i64* %gep.dst.3
114 store i64 %add.4, i64* %gep.dst.4
115 store i64 %add.5, i64* %gep.dst.5
117 %iv.next = add nuw nsw i64 %iv, 1
118 %exitcond.not = icmp eq i64 %iv.next, %N ; leave the loop once iv.next == N
119 br i1 %exitcond.not, label %exit, label %loop