1 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
2 ; RUN: opt -passes='require<profile-summary>,function(codegenprepare)' -S %s | FileCheck %s
4 target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
5 target triple = "arm64-apple-ios"
7 ; It's profitable to convert the zext to a shuffle, which in turn will be
8 ; lowered to 4 tbl instructions. The masks are materialized outside the loop.
; Test shape: a counted loop (iv starts at 0, advances by 16, exits once
; iv.next == 128) that loads <16 x i8> from %src + iv, zero-extends it to
; <16 x i32> and stores the result through %dst.
; Expected output: the zext is replaced by a shufflevector that interleaves
; each loaded byte with three copies of lane 0 of a second vector (an i8 0,
; selected by mask index 16), followed by a bitcast of the <64 x i8> result to
; <16 x i32>. With the little-endian datalayout above, the loaded byte lands in
; the low byte of each i32 lane and the zeros fill the upper three bytes.
9 define void @zext_v16i8_to_v16i32_in_loop(ptr %src, ptr %dst) {
10 ; CHECK-LABEL: @zext_v16i8_to_v16i32_in_loop(
12 ; CHECK-NEXT: br label [[LOOP:%.*]]
14 ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
15 ; CHECK-NEXT: [[SRC_GEP:%.*]] = getelementptr i8, ptr [[SRC:%.*]], i64 [[IV]]
16 ; CHECK-NEXT: [[LOAD:%.*]] = load <16 x i8>, ptr [[SRC_GEP]], align 16
17 ; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <16 x i8> [[LOAD]], <16 x i8> <i8 0, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison>, <64 x i32> <i32 0, i32 16, i32 16, i32 16, i32 1, i32 16, i32 16, i32 16, i32 2, i32 16, i32 16, i32 16, i32 3, i32 16, i32 16, i32 16, i32 4, i32 16, i32 16, i32 16, i32 5, i32 16, i32 16, i32 16, i32 6, i32 16, i32 16, i32 16, i32 7, i32 16, i32 16, i32 16, i32 8, i32 16, i32 16, i32 16, i32 9, i32 16, i32 16, i32 16, i32 10, i32 16, i32 16, i32 16, i32 11, i32 16, i32 16, i32 16, i32 12, i32 16, i32 16, i32 16, i32 13, i32 16, i32 16, i32 16, i32 14, i32 16, i32 16, i32 16, i32 15, i32 16, i32 16, i32 16>
18 ; CHECK-NEXT: [[TMP1:%.*]] = bitcast <64 x i8> [[TMP0]] to <16 x i32>
19 ; CHECK-NEXT: [[DST_GEP:%.*]] = getelementptr i32, ptr [[DST:%.*]], i64 [[IV]]
20 ; CHECK-NEXT: store <16 x i32> [[TMP1]], ptr [[DST_GEP]], align 64
21 ; CHECK-NEXT: [[IV_NEXT]] = add nuw i64 [[IV]], 16
22 ; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 128
23 ; CHECK-NEXT: br i1 [[EC]], label [[EXIT:%.*]], label [[LOOP]]
25 ; CHECK-NEXT: ret void
31 %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
32 %src.gep = getelementptr i8, ptr %src, i64 %iv
33 %load = load <16 x i8>, ptr %src.gep
; Instruction under test: the i8 -> i32 vector zext inside the loop.
34 %ext = zext <16 x i8> %load to <16 x i32>
35 %dst.gep = getelementptr i32, ptr %dst, i64 %iv
36 store <16 x i32> %ext, ptr %dst.gep
; Loop control: step the induction variable by 16 and leave at 128.
37 %iv.next = add nuw i64 %iv, 16
38 %ec = icmp eq i64 %iv.next, 128
39 br i1 %ec, label %exit, label %loop
45 ; Not profitable to use shuffle/tbl, as 4 tbls + materializing the masks
46 ; require more instructions than lowering zext directly.
; Test shape: the same <16 x i8> -> <16 x i32> load/zext/store sequence, but in
; straight-line code with no loop around it.
; Expected output: the zext is left as-is (no shufflevector/bitcast pair).
47 define void @zext_v16i8_to_v16i32_no_loop(ptr %src, ptr %dst) {
48 ; CHECK-LABEL: @zext_v16i8_to_v16i32_no_loop(
50 ; CHECK-NEXT: [[LOAD:%.*]] = load <16 x i8>, ptr [[SRC:%.*]], align 16
51 ; CHECK-NEXT: [[EXT:%.*]] = zext <16 x i8> [[LOAD]] to <16 x i32>
52 ; CHECK-NEXT: store <16 x i32> [[EXT]], ptr [[DST:%.*]], align 64
53 ; CHECK-NEXT: ret void
56 %load = load <16 x i8>, ptr %src
; Instruction under test: identical zext to the in-loop variant, outside a loop.
57 %ext = zext <16 x i8> %load to <16 x i32>
58 store <16 x i32> %ext, ptr %dst
; Test shape: a counted loop (iv from 0 in steps of 16, exit at iv.next == 128)
; that zero-extends <16 x i8> to <16 x i16>, i.e. the elements only double in
; width.
; Expected output: the zext is kept unchanged inside the loop.
; NOTE(review): presumably widening by a single step is cheap enough to lower
; directly so the shuffle form is not used - confirm against the AArch64
; lowering code.
62 define void @zext_v16i8_to_v16i16_in_loop(ptr %src, ptr %dst) {
63 ; CHECK-LABEL: @zext_v16i8_to_v16i16_in_loop(
65 ; CHECK-NEXT: br label [[LOOP:%.*]]
67 ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
68 ; CHECK-NEXT: [[SRC_GEP:%.*]] = getelementptr i8, ptr [[SRC:%.*]], i64 [[IV]]
69 ; CHECK-NEXT: [[LOAD:%.*]] = load <16 x i8>, ptr [[SRC_GEP]], align 16
70 ; CHECK-NEXT: [[EXT:%.*]] = zext <16 x i8> [[LOAD]] to <16 x i16>
71 ; CHECK-NEXT: [[DST_GEP:%.*]] = getelementptr i16, ptr [[DST:%.*]], i64 [[IV]]
72 ; CHECK-NEXT: store <16 x i16> [[EXT]], ptr [[DST_GEP]], align 32
73 ; CHECK-NEXT: [[IV_NEXT]] = add nuw i64 [[IV]], 16
74 ; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 128
75 ; CHECK-NEXT: br i1 [[EC]], label [[EXIT:%.*]], label [[LOOP]]
77 ; CHECK-NEXT: ret void
83 %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
84 %src.gep = getelementptr i8, ptr %src, i64 %iv
85 %load = load <16 x i8>, ptr %src.gep
; Instruction under test: the i8 -> i16 vector zext inside the loop.
86 %ext = zext <16 x i8> %load to <16 x i16>
87 %dst.gep = getelementptr i16, ptr %dst, i64 %iv
88 store <16 x i16> %ext, ptr %dst.gep
; Loop control: step the induction variable by 16 and leave at 128.
89 %iv.next = add nuw i64 %iv, 16
90 %ec = icmp eq i64 %iv.next, 128
91 br i1 %ec, label %exit, label %loop
; Test shape: a counted loop that zero-extends a 64-bit <8 x i8> vector to
; <8 x i32>.
; Expected output: the zext is replaced by a shufflevector producing <32 x i8>
; (each loaded byte followed by three copies of the i8 0 in lane 0 of the
; second operand, selected by mask index 8) and a bitcast to <8 x i32>.
; NOTE(review): the induction variable advances by 16 although only 8 elements
; are loaded and stored per iteration; this does not appear to matter for what
; the test checks, but confirm it is intentional.
97 define void @zext_v8i8_to_v8i32_in_loop(ptr %src, ptr %dst) {
98 ; CHECK-LABEL: @zext_v8i8_to_v8i32_in_loop(
100 ; CHECK-NEXT: br label [[LOOP:%.*]]
102 ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
103 ; CHECK-NEXT: [[SRC_GEP:%.*]] = getelementptr i8, ptr [[SRC:%.*]], i64 [[IV]]
104 ; CHECK-NEXT: [[LOAD:%.*]] = load <8 x i8>, ptr [[SRC_GEP]], align 8
105 ; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <8 x i8> [[LOAD]], <8 x i8> <i8 0, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison>, <32 x i32> <i32 0, i32 8, i32 8, i32 8, i32 1, i32 8, i32 8, i32 8, i32 2, i32 8, i32 8, i32 8, i32 3, i32 8, i32 8, i32 8, i32 4, i32 8, i32 8, i32 8, i32 5, i32 8, i32 8, i32 8, i32 6, i32 8, i32 8, i32 8, i32 7, i32 8, i32 8, i32 8>
106 ; CHECK-NEXT: [[TMP1:%.*]] = bitcast <32 x i8> [[TMP0]] to <8 x i32>
107 ; CHECK-NEXT: [[DST_GEP:%.*]] = getelementptr i32, ptr [[DST:%.*]], i64 [[IV]]
108 ; CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[DST_GEP]], align 32
109 ; CHECK-NEXT: [[IV_NEXT]] = add nuw i64 [[IV]], 16
110 ; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 128
111 ; CHECK-NEXT: br i1 [[EC]], label [[EXIT:%.*]], label [[LOOP]]
113 ; CHECK-NEXT: ret void
119 %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
120 %src.gep = getelementptr i8, ptr %src, i64 %iv
121 %load = load <8 x i8>, ptr %src.gep
; Instruction under test: the i8 -> i32 zext of an 8-element vector.
122 %ext = zext <8 x i8> %load to <8 x i32>
123 %dst.gep = getelementptr i32, ptr %dst, i64 %iv
124 store <8 x i32> %ext, ptr %dst.gep
; Loop control: step the induction variable by 16 and leave at 128.
125 %iv.next = add nuw i64 %iv, 16
126 %ec = icmp eq i64 %iv.next, 128
127 br i1 %ec, label %exit, label %loop
; Test shape: a counted loop (iv from 0 in steps of 16, exit at iv.next == 128)
; that zero-extends <16 x i8> to <16 x i64>.
; Expected output: the zext is kept unchanged inside the loop.
; NOTE(review): presumably the shuffle form is not considered profitable (or
; not supported) for an i64 destination element type - confirm against the
; AArch64 lowering code.
133 define void @zext_v16i8_to_v16i64_in_loop(ptr %src, ptr %dst) {
134 ; CHECK-LABEL: @zext_v16i8_to_v16i64_in_loop(
136 ; CHECK-NEXT: br label [[LOOP:%.*]]
138 ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
139 ; CHECK-NEXT: [[SRC_GEP:%.*]] = getelementptr i8, ptr [[SRC:%.*]], i64 [[IV]]
140 ; CHECK-NEXT: [[LOAD:%.*]] = load <16 x i8>, ptr [[SRC_GEP]], align 16
141 ; CHECK-NEXT: [[EXT:%.*]] = zext <16 x i8> [[LOAD]] to <16 x i64>
142 ; CHECK-NEXT: [[DST_GEP:%.*]] = getelementptr i64, ptr [[DST:%.*]], i64 [[IV]]
143 ; CHECK-NEXT: store <16 x i64> [[EXT]], ptr [[DST_GEP]], align 128
144 ; CHECK-NEXT: [[IV_NEXT]] = add nuw i64 [[IV]], 16
145 ; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 128
146 ; CHECK-NEXT: br i1 [[EC]], label [[EXIT:%.*]], label [[LOOP]]
148 ; CHECK-NEXT: ret void
154 %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
155 %src.gep = getelementptr i8, ptr %src, i64 %iv
156 %load = load <16 x i8>, ptr %src.gep
; Instruction under test: the i8 -> i64 vector zext inside the loop.
157 %ext = zext <16 x i8> %load to <16 x i64>
158 %dst.gep = getelementptr i64, ptr %dst, i64 %iv
159 store <16 x i64> %ext, ptr %dst.gep
; Loop control: step the induction variable by 16 and leave at 128.
160 %iv.next = add nuw i64 %iv, 16
161 %ec = icmp eq i64 %iv.next, 128
162 br i1 %ec, label %exit, label %loop