; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=riscv32 -mattr=+v -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32
; RUN: llc -mtriple=riscv64 -mattr=+v -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64
; RUN: llc -mtriple=riscv32 -mattr=+v,+zvkb -verify-machineinstrs < %s | FileCheck %s --check-prefixes=ZVKB
; RUN: llc -mtriple=riscv64 -mattr=+v,+zvkb -verify-machineinstrs < %s | FileCheck %s --check-prefixes=ZVKB
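; Lowering of the llvm.bswap intrinsic on fixed-length vectors: with only +v
; the byte swap is expanded into shift/and/or sequences, while +zvkb selects a
; single vrev8.v.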
define void @bswap_v8i16(ptr %x, ptr %y) {
; CHECK-LABEL: bswap_v8i16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
; CHECK-NEXT:    vle16.v v8, (a0)
; CHECK-NEXT:    vsrl.vi v9, v8, 8
; CHECK-NEXT:    vsll.vi v8, v8, 8
; CHECK-NEXT:    vor.vv v8, v8, v9
; CHECK-NEXT:    vse16.v v8, (a0)
; CHECK-NEXT:    ret
;
; ZVKB-LABEL: bswap_v8i16:
; ZVKB:       # %bb.0:
; ZVKB-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
; ZVKB-NEXT:    vle16.v v8, (a0)
; ZVKB-NEXT:    vrev8.v v8, v8
; ZVKB-NEXT:    vse16.v v8, (a0)
; ZVKB-NEXT:    ret
  %a = load <8 x i16>, ptr %x
  %b = load <8 x i16>, ptr %y
  %c = call <8 x i16> @llvm.bswap.v8i16(<8 x i16> %a)
  store <8 x i16> %c, ptr %x
  ret void
}
declare <8 x i16> @llvm.bswap.v8i16(<8 x i16>)

define void @bswap_v4i32(ptr %x, ptr %y) {
; CHECK-LABEL: bswap_v4i32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
; CHECK-NEXT:    vle32.v v8, (a0)
; CHECK-NEXT:    lui a1, 16
; CHECK-NEXT:    addi a1, a1, -256
; CHECK-NEXT:    vsrl.vi v9, v8, 8
; CHECK-NEXT:    vsrl.vi v10, v8, 24
; CHECK-NEXT:    vand.vx v9, v9, a1
; CHECK-NEXT:    vor.vv v9, v9, v10
; CHECK-NEXT:    vand.vx v10, v8, a1
; CHECK-NEXT:    vsll.vi v8, v8, 24
; CHECK-NEXT:    vsll.vi v10, v10, 8
; CHECK-NEXT:    vor.vv v8, v8, v10
; CHECK-NEXT:    vor.vv v8, v8, v9
; CHECK-NEXT:    vse32.v v8, (a0)
; CHECK-NEXT:    ret
;
; ZVKB-LABEL: bswap_v4i32:
; ZVKB:       # %bb.0:
; ZVKB-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
; ZVKB-NEXT:    vle32.v v8, (a0)
; ZVKB-NEXT:    vrev8.v v8, v8
; ZVKB-NEXT:    vse32.v v8, (a0)
; ZVKB-NEXT:    ret
  %a = load <4 x i32>, ptr %x
  %b = load <4 x i32>, ptr %y
  %c = call <4 x i32> @llvm.bswap.v4i32(<4 x i32> %a)
  store <4 x i32> %c, ptr %x
  ret void
}
declare <4 x i32> @llvm.bswap.v4i32(<4 x i32>)

define void @bswap_v2i64(ptr %x, ptr %y) {
; RV32-LABEL: bswap_v2i64:
; RV32:       # %bb.0:
; RV32-NEXT:    addi sp, sp, -16
; RV32-NEXT:    .cfi_def_cfa_offset 16
; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
; RV32-NEXT:    vle64.v v8, (a0)
; RV32-NEXT:    lui a1, 1044480
; RV32-NEXT:    li a2, 56
; RV32-NEXT:    li a3, 40
; RV32-NEXT:    lui a4, 16
; RV32-NEXT:    lui a5, 4080
; RV32-NEXT:    addi a6, sp, 8
; RV32-NEXT:    sw a1, 8(sp)
; RV32-NEXT:    sw zero, 12(sp)
; RV32-NEXT:    addi a1, a4, -256
; RV32-NEXT:    vlse64.v v9, (a6), zero
; RV32-NEXT:    vsrl.vx v10, v8, a2
; RV32-NEXT:    vsrl.vx v11, v8, a3
; RV32-NEXT:    vsrl.vi v12, v8, 24
; RV32-NEXT:    vsll.vx v13, v8, a2
; RV32-NEXT:    vand.vx v11, v11, a1
; RV32-NEXT:    vor.vv v10, v11, v10
; RV32-NEXT:    vand.vx v11, v8, a1
; RV32-NEXT:    vsll.vx v11, v11, a3
; RV32-NEXT:    vor.vv v11, v13, v11
; RV32-NEXT:    vsrl.vi v13, v8, 8
; RV32-NEXT:    vand.vx v12, v12, a5
; RV32-NEXT:    vand.vv v13, v13, v9
; RV32-NEXT:    vor.vv v12, v13, v12
; RV32-NEXT:    vand.vv v9, v8, v9
; RV32-NEXT:    vand.vx v8, v8, a5
; RV32-NEXT:    vsll.vi v8, v8, 24
; RV32-NEXT:    vsll.vi v9, v9, 8
; RV32-NEXT:    vor.vv v10, v12, v10
; RV32-NEXT:    vor.vv v8, v8, v9
; RV32-NEXT:    vor.vv v8, v11, v8
; RV32-NEXT:    vor.vv v8, v8, v10
; RV32-NEXT:    vse64.v v8, (a0)
; RV32-NEXT:    addi sp, sp, 16
; RV32-NEXT:    .cfi_def_cfa_offset 0
; RV32-NEXT:    ret
;
; RV64-LABEL: bswap_v2i64:
; RV64:       # %bb.0:
; RV64-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
; RV64-NEXT:    vle64.v v8, (a0)
; RV64-NEXT:    li a1, 56
; RV64-NEXT:    li a2, 40
; RV64-NEXT:    lui a3, 16
; RV64-NEXT:    lui a4, 4080
; RV64-NEXT:    li a5, 255
; RV64-NEXT:    addiw a3, a3, -256
; RV64-NEXT:    slli a5, a5, 24
; RV64-NEXT:    vsrl.vx v9, v8, a1
; RV64-NEXT:    vsrl.vx v10, v8, a2
; RV64-NEXT:    vsrl.vi v11, v8, 24
; RV64-NEXT:    vsrl.vi v12, v8, 8
; RV64-NEXT:    vand.vx v10, v10, a3
; RV64-NEXT:    vor.vv v9, v10, v9
; RV64-NEXT:    vand.vx v10, v8, a5
; RV64-NEXT:    vand.vx v11, v11, a4
; RV64-NEXT:    vand.vx v12, v12, a5
; RV64-NEXT:    vor.vv v11, v12, v11
; RV64-NEXT:    vand.vx v12, v8, a4
; RV64-NEXT:    vsll.vi v10, v10, 8
; RV64-NEXT:    vsll.vi v12, v12, 24
; RV64-NEXT:    vor.vv v10, v12, v10
; RV64-NEXT:    vsll.vx v12, v8, a1
; RV64-NEXT:    vand.vx v8, v8, a3
; RV64-NEXT:    vsll.vx v8, v8, a2
; RV64-NEXT:    vor.vv v8, v12, v8
; RV64-NEXT:    vor.vv v9, v11, v9
; RV64-NEXT:    vor.vv v8, v8, v10
; RV64-NEXT:    vor.vv v8, v8, v9
; RV64-NEXT:    vse64.v v8, (a0)
; RV64-NEXT:    ret
;
; ZVKB-LABEL: bswap_v2i64:
; ZVKB:       # %bb.0:
; ZVKB-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
; ZVKB-NEXT:    vle64.v v8, (a0)
; ZVKB-NEXT:    vrev8.v v8, v8
; ZVKB-NEXT:    vse64.v v8, (a0)
; ZVKB-NEXT:    ret
  %a = load <2 x i64>, ptr %x
  %b = load <2 x i64>, ptr %y
  %c = call <2 x i64> @llvm.bswap.v2i64(<2 x i64> %a)
  store <2 x i64> %c, ptr %x
  ret void
}
declare <2 x i64> @llvm.bswap.v2i64(<2 x i64>)

define void @bswap_v16i16(ptr %x, ptr %y) {
; CHECK-LABEL: bswap_v16i16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
; CHECK-NEXT:    vle16.v v8, (a0)
; CHECK-NEXT:    vsrl.vi v10, v8, 8
; CHECK-NEXT:    vsll.vi v8, v8, 8
; CHECK-NEXT:    vor.vv v8, v8, v10
; CHECK-NEXT:    vse16.v v8, (a0)
; CHECK-NEXT:    ret
;
; ZVKB-LABEL: bswap_v16i16:
; ZVKB:       # %bb.0:
; ZVKB-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
; ZVKB-NEXT:    vle16.v v8, (a0)
; ZVKB-NEXT:    vrev8.v v8, v8
; ZVKB-NEXT:    vse16.v v8, (a0)
; ZVKB-NEXT:    ret
  %a = load <16 x i16>, ptr %x
  %b = load <16 x i16>, ptr %y
  %c = call <16 x i16> @llvm.bswap.v16i16(<16 x i16> %a)
  store <16 x i16> %c, ptr %x
  ret void
}
declare <16 x i16> @llvm.bswap.v16i16(<16 x i16>)

define void @bswap_v8i32(ptr %x, ptr %y) {
; CHECK-LABEL: bswap_v8i32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
; CHECK-NEXT:    vle32.v v8, (a0)
; CHECK-NEXT:    lui a1, 16
; CHECK-NEXT:    addi a1, a1, -256
; CHECK-NEXT:    vsrl.vi v10, v8, 8
; CHECK-NEXT:    vsrl.vi v12, v8, 24
; CHECK-NEXT:    vand.vx v10, v10, a1
; CHECK-NEXT:    vor.vv v10, v10, v12
; CHECK-NEXT:    vand.vx v12, v8, a1
; CHECK-NEXT:    vsll.vi v8, v8, 24
; CHECK-NEXT:    vsll.vi v12, v12, 8
; CHECK-NEXT:    vor.vv v8, v8, v12
; CHECK-NEXT:    vor.vv v8, v8, v10
; CHECK-NEXT:    vse32.v v8, (a0)
; CHECK-NEXT:    ret
;
; ZVKB-LABEL: bswap_v8i32:
; ZVKB:       # %bb.0:
; ZVKB-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
; ZVKB-NEXT:    vle32.v v8, (a0)
; ZVKB-NEXT:    vrev8.v v8, v8
; ZVKB-NEXT:    vse32.v v8, (a0)
; ZVKB-NEXT:    ret
  %a = load <8 x i32>, ptr %x
  %b = load <8 x i32>, ptr %y
  %c = call <8 x i32> @llvm.bswap.v8i32(<8 x i32> %a)
  store <8 x i32> %c, ptr %x
  ret void
}
declare <8 x i32> @llvm.bswap.v8i32(<8 x i32>)

define void @bswap_v4i64(ptr %x, ptr %y) {
; RV32-LABEL: bswap_v4i64:
; RV32:       # %bb.0:
; RV32-NEXT:    addi sp, sp, -16
; RV32-NEXT:    .cfi_def_cfa_offset 16
; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
; RV32-NEXT:    vle64.v v8, (a0)
; RV32-NEXT:    lui a1, 1044480
; RV32-NEXT:    li a2, 56
; RV32-NEXT:    li a3, 40
; RV32-NEXT:    lui a4, 16
; RV32-NEXT:    lui a5, 4080
; RV32-NEXT:    addi a6, sp, 8
; RV32-NEXT:    sw a1, 8(sp)
; RV32-NEXT:    sw zero, 12(sp)
; RV32-NEXT:    addi a1, a4, -256
; RV32-NEXT:    vlse64.v v10, (a6), zero
; RV32-NEXT:    vsrl.vx v12, v8, a2
; RV32-NEXT:    vsrl.vx v14, v8, a3
; RV32-NEXT:    vsrl.vi v16, v8, 24
; RV32-NEXT:    vsll.vx v18, v8, a2
; RV32-NEXT:    vand.vx v14, v14, a1
; RV32-NEXT:    vor.vv v12, v14, v12
; RV32-NEXT:    vand.vx v14, v8, a1
; RV32-NEXT:    vsll.vx v14, v14, a3
; RV32-NEXT:    vor.vv v14, v18, v14
; RV32-NEXT:    vsrl.vi v18, v8, 8
; RV32-NEXT:    vand.vx v16, v16, a5
; RV32-NEXT:    vand.vv v18, v18, v10
; RV32-NEXT:    vor.vv v16, v18, v16
; RV32-NEXT:    vand.vv v10, v8, v10
; RV32-NEXT:    vand.vx v8, v8, a5
; RV32-NEXT:    vsll.vi v8, v8, 24
; RV32-NEXT:    vsll.vi v10, v10, 8
; RV32-NEXT:    vor.vv v12, v16, v12
; RV32-NEXT:    vor.vv v8, v8, v10
; RV32-NEXT:    vor.vv v8, v14, v8
; RV32-NEXT:    vor.vv v8, v8, v12
; RV32-NEXT:    vse64.v v8, (a0)
; RV32-NEXT:    addi sp, sp, 16
; RV32-NEXT:    .cfi_def_cfa_offset 0
; RV32-NEXT:    ret
;
; RV64-LABEL: bswap_v4i64:
; RV64:       # %bb.0:
; RV64-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
; RV64-NEXT:    vle64.v v8, (a0)
; RV64-NEXT:    li a1, 56
; RV64-NEXT:    li a2, 40
; RV64-NEXT:    lui a3, 16
; RV64-NEXT:    lui a4, 4080
; RV64-NEXT:    li a5, 255
; RV64-NEXT:    addiw a3, a3, -256
; RV64-NEXT:    slli a5, a5, 24
; RV64-NEXT:    vsrl.vx v10, v8, a1
; RV64-NEXT:    vsrl.vx v12, v8, a2
; RV64-NEXT:    vsrl.vi v14, v8, 24
; RV64-NEXT:    vsrl.vi v16, v8, 8
; RV64-NEXT:    vand.vx v12, v12, a3
; RV64-NEXT:    vor.vv v10, v12, v10
; RV64-NEXT:    vand.vx v12, v8, a5
; RV64-NEXT:    vand.vx v14, v14, a4
; RV64-NEXT:    vand.vx v16, v16, a5
; RV64-NEXT:    vor.vv v14, v16, v14
; RV64-NEXT:    vand.vx v16, v8, a4
; RV64-NEXT:    vsll.vi v12, v12, 8
; RV64-NEXT:    vsll.vi v16, v16, 24
; RV64-NEXT:    vor.vv v12, v16, v12
; RV64-NEXT:    vsll.vx v16, v8, a1
; RV64-NEXT:    vand.vx v8, v8, a3
; RV64-NEXT:    vsll.vx v8, v8, a2
; RV64-NEXT:    vor.vv v8, v16, v8
; RV64-NEXT:    vor.vv v10, v14, v10
; RV64-NEXT:    vor.vv v8, v8, v12
; RV64-NEXT:    vor.vv v8, v8, v10
; RV64-NEXT:    vse64.v v8, (a0)
; RV64-NEXT:    ret
;
; ZVKB-LABEL: bswap_v4i64:
; ZVKB:       # %bb.0:
; ZVKB-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
; ZVKB-NEXT:    vle64.v v8, (a0)
; ZVKB-NEXT:    vrev8.v v8, v8
; ZVKB-NEXT:    vse64.v v8, (a0)
; ZVKB-NEXT:    ret
  %a = load <4 x i64>, ptr %x
  %b = load <4 x i64>, ptr %y
  %c = call <4 x i64> @llvm.bswap.v4i64(<4 x i64> %a)
  store <4 x i64> %c, ptr %x
  ret void
}
declare <4 x i64> @llvm.bswap.v4i64(<4 x i64>)