Run DCE after a LoopFlatten test to reduce spurious output [nfc]
[llvm-project.git] / clang / test / CodeGen / PowerPC / builtins-ppc-ld-st-rmb.c
blob3616ee6b694872c66d857a39f957173e7b166509
1 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
2 // REQUIRES: powerpc-registered-target
3 // RUN: %clang_cc1 -flax-vector-conversions=none -triple powerpc64-unknown-unknown -emit-llvm %s \
4 // RUN: -target-cpu pwr8 -o - | FileCheck %s -check-prefix=BE-PWR8
5 // RUN: %clang_cc1 -flax-vector-conversions=none -triple powerpc64le-unknown-unknown -emit-llvm %s \
6 // RUN: -target-cpu pwr8 -o - | FileCheck %s -check-prefix=LE-PWR8
8 // RUN: %clang_cc1 -flax-vector-conversions=none -triple powerpc64-unknown-unknown -emit-llvm %s \
9 // RUN: -target-cpu pwr9 -o - | FileCheck %s -check-prefix=BE-PWR9
10 // RUN: %clang_cc1 -flax-vector-conversions=none -triple powerpc64le-unknown-unknown -emit-llvm %s \
11 // RUN: -target-cpu pwr9 -o - | FileCheck %s -check-prefix=LE-PWR9
12 // RUN: %clang_cc1 -flax-vector-conversions=none -triple powerpc-unknown-unknown -emit-llvm %s \
13 // RUN: -target-cpu pwr9 -o - | FileCheck %s -check-prefix=BE32-PWR9
15 #include <altivec.h>
16 // BE-PWR8-LABEL: @test_ldrmb1(
17 // BE-PWR8-NEXT: entry:
18 // BE-PWR8-NEXT: [[PTR_ADDR:%.*]] = alloca ptr, align 8
19 // BE-PWR8-NEXT: store ptr [[PTR:%.*]], ptr [[PTR_ADDR]], align 8
20 // BE-PWR8-NEXT: [[TMP0:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8
21 // BE-PWR8-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[TMP0]], i32 0
22 // BE-PWR8-NEXT: [[LD_LO:%.*]] = call <4 x i32> @llvm.ppc.altivec.lvx(ptr [[TMP0]])
23 // BE-PWR8-NEXT: [[LD_HI:%.*]] = call <4 x i32> @llvm.ppc.altivec.lvx(ptr [[TMP1]])
24 // BE-PWR8-NEXT: [[MASK1:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(ptr [[TMP0]])
25 // BE-PWR8-NEXT: [[SHUFFLE1:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[LD_LO]], <4 x i32> [[LD_HI]], <16 x i8> [[MASK1]])
26 // BE-PWR8-NEXT: [[SHUFFLE2:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> zeroinitializer, <4 x i32> [[SHUFFLE1]], <16 x i8> <i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16>)
27 // BE-PWR8-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[SHUFFLE2]] to <16 x i8>
28 // BE-PWR8-NEXT: ret <16 x i8> [[TMP2]]
30 // LE-PWR8-LABEL: @test_ldrmb1(
31 // LE-PWR8-NEXT: entry:
32 // LE-PWR8-NEXT: [[PTR_ADDR:%.*]] = alloca ptr, align 8
33 // LE-PWR8-NEXT: store ptr [[PTR:%.*]], ptr [[PTR_ADDR]], align 8
34 // LE-PWR8-NEXT: [[TMP0:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8
35 // LE-PWR8-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[TMP0]], i32 0
36 // LE-PWR8-NEXT: [[LD_LO:%.*]] = call <4 x i32> @llvm.ppc.altivec.lvx(ptr [[TMP0]])
37 // LE-PWR8-NEXT: [[LD_HI:%.*]] = call <4 x i32> @llvm.ppc.altivec.lvx(ptr [[TMP1]])
38 // LE-PWR8-NEXT: [[MASK1:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsr(ptr [[TMP0]])
39 // LE-PWR8-NEXT: [[SHUFFLE1:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[LD_HI]], <4 x i32> [[LD_LO]], <16 x i8> [[MASK1]])
40 // LE-PWR8-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[SHUFFLE1]] to <16 x i8>
41 // LE-PWR8-NEXT: [[TMP3:%.*]] = shufflevector <16 x i8> [[TMP2]], <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30>
42 // LE-PWR8-NEXT: ret <16 x i8> [[TMP3]]
44 // BE-PWR9-LABEL: @test_ldrmb1(
45 // BE-PWR9-NEXT: entry:
46 // BE-PWR9-NEXT: [[__A_ADDR_I:%.*]] = alloca ptr, align 8
47 // BE-PWR9-NEXT: [[__B_ADDR_I:%.*]] = alloca i64, align 8
48 // BE-PWR9-NEXT: [[__RES_I:%.*]] = alloca <16 x i8>, align 16
49 // BE-PWR9-NEXT: [[__MASK_I:%.*]] = alloca <16 x i8>, align 16
50 // BE-PWR9-NEXT: [[PTR_ADDR:%.*]] = alloca ptr, align 8
51 // BE-PWR9-NEXT: store ptr [[PTR:%.*]], ptr [[PTR_ADDR]], align 8
52 // BE-PWR9-NEXT: [[TMP0:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8
53 // BE-PWR9-NEXT: store ptr [[TMP0]], ptr [[__A_ADDR_I]], align 8
54 // BE-PWR9-NEXT: store i64 1, ptr [[__B_ADDR_I]], align 8
55 // BE-PWR9-NEXT: [[TMP1:%.*]] = load ptr, ptr [[__A_ADDR_I]], align 8
56 // BE-PWR9-NEXT: [[TMP2:%.*]] = load i64, ptr [[__B_ADDR_I]], align 8
57 // BE-PWR9-NEXT: [[SHL_I:%.*]] = shl i64 [[TMP2]], 56
58 // BE-PWR9-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.ppc.vsx.lxvll(ptr [[TMP1]], i64 [[SHL_I]])
59 // BE-PWR9-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
60 // BE-PWR9-NEXT: store <16 x i8> [[TMP4]], ptr [[__RES_I]], align 16
61 // BE-PWR9-NEXT: [[TMP5:%.*]] = load i64, ptr [[__B_ADDR_I]], align 8
62 // BE-PWR9-NEXT: [[SUB_I:%.*]] = sub i64 16, [[TMP5]]
63 // BE-PWR9-NEXT: [[CONV_I:%.*]] = trunc i64 [[SUB_I]] to i8
64 // BE-PWR9-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr null, i8 [[CONV_I]]
65 // BE-PWR9-NEXT: [[TMP7:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsr(ptr [[TMP6]])
66 // BE-PWR9-NEXT: store <16 x i8> [[TMP7]], ptr [[__MASK_I]], align 16
67 // BE-PWR9-NEXT: [[TMP8:%.*]] = load <16 x i8>, ptr [[__RES_I]], align 16
68 // BE-PWR9-NEXT: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32>
69 // BE-PWR9-NEXT: [[TMP10:%.*]] = load <16 x i8>, ptr [[__RES_I]], align 16
70 // BE-PWR9-NEXT: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP10]] to <4 x i32>
71 // BE-PWR9-NEXT: [[TMP12:%.*]] = load <16 x i8>, ptr [[__MASK_I]], align 16
72 // BE-PWR9-NEXT: [[TMP13:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP9]], <4 x i32> [[TMP11]], <16 x i8> [[TMP12]])
73 // BE-PWR9-NEXT: [[TMP14:%.*]] = bitcast <4 x i32> [[TMP13]] to <16 x i8>
74 // BE-PWR9-NEXT: ret <16 x i8> [[TMP14]]
76 // LE-PWR9-LABEL: @test_ldrmb1(
77 // LE-PWR9-NEXT: entry:
78 // LE-PWR9-NEXT: [[__A_ADDR_I:%.*]] = alloca ptr, align 8
79 // LE-PWR9-NEXT: [[__B_ADDR_I:%.*]] = alloca i64, align 8
80 // LE-PWR9-NEXT: [[__RES_I:%.*]] = alloca <16 x i8>, align 16
81 // LE-PWR9-NEXT: [[__MASK_I:%.*]] = alloca <16 x i8>, align 16
82 // LE-PWR9-NEXT: [[PTR_ADDR:%.*]] = alloca ptr, align 8
83 // LE-PWR9-NEXT: store ptr [[PTR:%.*]], ptr [[PTR_ADDR]], align 8
84 // LE-PWR9-NEXT: [[TMP0:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8
85 // LE-PWR9-NEXT: store ptr [[TMP0]], ptr [[__A_ADDR_I]], align 8
86 // LE-PWR9-NEXT: store i64 1, ptr [[__B_ADDR_I]], align 8
87 // LE-PWR9-NEXT: [[TMP1:%.*]] = load ptr, ptr [[__A_ADDR_I]], align 8
88 // LE-PWR9-NEXT: [[TMP2:%.*]] = load i64, ptr [[__B_ADDR_I]], align 8
89 // LE-PWR9-NEXT: [[SHL_I:%.*]] = shl i64 [[TMP2]], 56
90 // LE-PWR9-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.ppc.vsx.lxvll(ptr [[TMP1]], i64 [[SHL_I]])
91 // LE-PWR9-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
92 // LE-PWR9-NEXT: store <16 x i8> [[TMP4]], ptr [[__RES_I]], align 16
93 // LE-PWR9-NEXT: [[TMP5:%.*]] = load i64, ptr [[__B_ADDR_I]], align 8
94 // LE-PWR9-NEXT: [[SUB_I:%.*]] = sub i64 16, [[TMP5]]
95 // LE-PWR9-NEXT: [[CONV_I:%.*]] = trunc i64 [[SUB_I]] to i8
96 // LE-PWR9-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr null, i8 [[CONV_I]]
97 // LE-PWR9-NEXT: [[TMP7:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsr(ptr [[TMP6]])
98 // LE-PWR9-NEXT: store <16 x i8> [[TMP7]], ptr [[__MASK_I]], align 16
99 // LE-PWR9-NEXT: [[TMP8:%.*]] = load <16 x i8>, ptr [[__RES_I]], align 16
100 // LE-PWR9-NEXT: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32>
101 // LE-PWR9-NEXT: [[TMP10:%.*]] = load <16 x i8>, ptr [[__RES_I]], align 16
102 // LE-PWR9-NEXT: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP10]] to <4 x i32>
103 // LE-PWR9-NEXT: [[TMP12:%.*]] = load <16 x i8>, ptr [[__MASK_I]], align 16
104 // LE-PWR9-NEXT: [[TMP13:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP9]], <4 x i32> [[TMP11]], <16 x i8> [[TMP12]])
105 // LE-PWR9-NEXT: [[TMP14:%.*]] = bitcast <4 x i32> [[TMP13]] to <16 x i8>
106 // LE-PWR9-NEXT: ret <16 x i8> [[TMP14]]
108 // BE32-PWR9-LABEL: @test_ldrmb1(
109 // BE32-PWR9-NEXT: entry:
110 // BE32-PWR9-NEXT: [[PTR_ADDR:%.*]] = alloca ptr, align 4
111 // BE32-PWR9-NEXT: store ptr [[PTR:%.*]], ptr [[PTR_ADDR]], align 4
112 // BE32-PWR9-NEXT: [[TMP0:%.*]] = load ptr, ptr [[PTR_ADDR]], align 4
113 // BE32-PWR9-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[TMP0]], i32 0
114 // BE32-PWR9-NEXT: [[LD_LO:%.*]] = call <4 x i32> @llvm.ppc.altivec.lvx(ptr [[TMP0]])
115 // BE32-PWR9-NEXT: [[LD_HI:%.*]] = call <4 x i32> @llvm.ppc.altivec.lvx(ptr [[TMP1]])
116 // BE32-PWR9-NEXT: [[MASK1:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(ptr [[TMP0]])
117 // BE32-PWR9-NEXT: [[SHUFFLE1:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[LD_LO]], <4 x i32> [[LD_HI]], <16 x i8> [[MASK1]])
118 // BE32-PWR9-NEXT: [[SHUFFLE2:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> zeroinitializer, <4 x i32> [[SHUFFLE1]], <16 x i8> <i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16>)
119 // BE32-PWR9-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[SHUFFLE2]] to <16 x i8>
120 // BE32-PWR9-NEXT: ret <16 x i8> [[TMP2]]
// Exercises __vec_ldrmb with a length argument of 1. Per the expected IR
// above, the pwr8 and 32-bit pwr9 configurations expand the builtin into
// lvx/lvsl-or-lvsr/vperm sequences, while the 64-bit pwr9 configurations
// go through the llvm.ppc.vsx.lxvll intrinsic. Keep the body unchanged: the
// autogenerated assertions match the unoptimized IR instruction by instruction.
122 vector unsigned char test_ldrmb1(char *ptr) { return __vec_ldrmb(ptr, 1); }
124 // BE-PWR8-LABEL: @test_strmb1(
125 // BE-PWR8-NEXT: entry:
126 // BE-PWR8-NEXT: [[PTR_ADDR:%.*]] = alloca ptr, align 8
127 // BE-PWR8-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16
128 // BE-PWR8-NEXT: store ptr [[PTR:%.*]], ptr [[PTR_ADDR]], align 8
129 // BE-PWR8-NEXT: store <16 x i8> [[DATA:%.*]], ptr [[DATA_ADDR]], align 16
130 // BE-PWR8-NEXT: [[TMP0:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8
131 // BE-PWR8-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr [[DATA_ADDR]], align 16
132 // BE-PWR8-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP0]], i64 0
133 // BE-PWR8-NEXT: [[TMP3:%.*]] = extractelement <16 x i8> [[TMP1]], i64 15
134 // BE-PWR8-NEXT: store i8 [[TMP3]], ptr [[TMP2]], align 1
135 // BE-PWR8-NEXT: ret void
137 // LE-PWR8-LABEL: @test_strmb1(
138 // LE-PWR8-NEXT: entry:
139 // LE-PWR8-NEXT: [[PTR_ADDR:%.*]] = alloca ptr, align 8
140 // LE-PWR8-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16
141 // LE-PWR8-NEXT: store ptr [[PTR:%.*]], ptr [[PTR_ADDR]], align 8
142 // LE-PWR8-NEXT: store <16 x i8> [[DATA:%.*]], ptr [[DATA_ADDR]], align 16
143 // LE-PWR8-NEXT: [[TMP0:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8
144 // LE-PWR8-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr [[DATA_ADDR]], align 16
145 // LE-PWR8-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP0]], i64 0
146 // LE-PWR8-NEXT: [[TMP3:%.*]] = extractelement <16 x i8> [[TMP1]], i64 0
147 // LE-PWR8-NEXT: store i8 [[TMP3]], ptr [[TMP2]], align 1
148 // LE-PWR8-NEXT: ret void
150 // BE-PWR9-LABEL: @test_strmb1(
151 // BE-PWR9-NEXT: entry:
152 // BE-PWR9-NEXT: [[__A_ADDR_I:%.*]] = alloca <16 x i8>, align 16
153 // BE-PWR9-NEXT: [[__B_ADDR_I:%.*]] = alloca ptr, align 8
154 // BE-PWR9-NEXT: [[__C_ADDR_I:%.*]] = alloca i64, align 8
155 // BE-PWR9-NEXT: [[__MASK_I:%.*]] = alloca <16 x i8>, align 16
156 // BE-PWR9-NEXT: [[__RES_I:%.*]] = alloca <16 x i8>, align 16
157 // BE-PWR9-NEXT: [[PTR_ADDR:%.*]] = alloca ptr, align 8
158 // BE-PWR9-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16
159 // BE-PWR9-NEXT: store ptr [[PTR:%.*]], ptr [[PTR_ADDR]], align 8
160 // BE-PWR9-NEXT: store <16 x i8> [[DATA:%.*]], ptr [[DATA_ADDR]], align 16
161 // BE-PWR9-NEXT: [[TMP0:%.*]] = load <16 x i8>, ptr [[DATA_ADDR]], align 16
162 // BE-PWR9-NEXT: [[TMP1:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8
163 // BE-PWR9-NEXT: store <16 x i8> [[TMP0]], ptr [[__A_ADDR_I]], align 16
164 // BE-PWR9-NEXT: store ptr [[TMP1]], ptr [[__B_ADDR_I]], align 8
165 // BE-PWR9-NEXT: store i64 1, ptr [[__C_ADDR_I]], align 8
166 // BE-PWR9-NEXT: [[TMP2:%.*]] = load i64, ptr [[__C_ADDR_I]], align 8
167 // BE-PWR9-NEXT: [[SUB_I:%.*]] = sub i64 16, [[TMP2]]
168 // BE-PWR9-NEXT: [[CONV_I:%.*]] = trunc i64 [[SUB_I]] to i8
169 // BE-PWR9-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr null, i8 [[CONV_I]]
170 // BE-PWR9-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(ptr [[TMP3]])
171 // BE-PWR9-NEXT: store <16 x i8> [[TMP4]], ptr [[__MASK_I]], align 16
172 // BE-PWR9-NEXT: [[TMP5:%.*]] = load <16 x i8>, ptr [[__A_ADDR_I]], align 16
173 // BE-PWR9-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
174 // BE-PWR9-NEXT: [[TMP7:%.*]] = load <16 x i8>, ptr [[__A_ADDR_I]], align 16
175 // BE-PWR9-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
176 // BE-PWR9-NEXT: [[TMP9:%.*]] = load <16 x i8>, ptr [[__MASK_I]], align 16
177 // BE-PWR9-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]])
178 // BE-PWR9-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8>
179 // BE-PWR9-NEXT: store <16 x i8> [[TMP11]], ptr [[__RES_I]], align 16
180 // BE-PWR9-NEXT: [[TMP12:%.*]] = load <16 x i8>, ptr [[__RES_I]], align 16
181 // BE-PWR9-NEXT: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP12]] to <4 x i32>
182 // BE-PWR9-NEXT: [[TMP14:%.*]] = load ptr, ptr [[__B_ADDR_I]], align 8
183 // BE-PWR9-NEXT: [[TMP15:%.*]] = load i64, ptr [[__C_ADDR_I]], align 8
184 // BE-PWR9-NEXT: [[SHL_I:%.*]] = shl i64 [[TMP15]], 56
185 // BE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], ptr [[TMP14]], i64 [[SHL_I]])
186 // BE-PWR9-NEXT: ret void
188 // LE-PWR9-LABEL: @test_strmb1(
189 // LE-PWR9-NEXT: entry:
190 // LE-PWR9-NEXT: [[__A_ADDR_I:%.*]] = alloca <16 x i8>, align 16
191 // LE-PWR9-NEXT: [[__B_ADDR_I:%.*]] = alloca ptr, align 8
192 // LE-PWR9-NEXT: [[__C_ADDR_I:%.*]] = alloca i64, align 8
193 // LE-PWR9-NEXT: [[__MASK_I:%.*]] = alloca <16 x i8>, align 16
194 // LE-PWR9-NEXT: [[__RES_I:%.*]] = alloca <16 x i8>, align 16
195 // LE-PWR9-NEXT: [[PTR_ADDR:%.*]] = alloca ptr, align 8
196 // LE-PWR9-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16
197 // LE-PWR9-NEXT: store ptr [[PTR:%.*]], ptr [[PTR_ADDR]], align 8
198 // LE-PWR9-NEXT: store <16 x i8> [[DATA:%.*]], ptr [[DATA_ADDR]], align 16
199 // LE-PWR9-NEXT: [[TMP0:%.*]] = load <16 x i8>, ptr [[DATA_ADDR]], align 16
200 // LE-PWR9-NEXT: [[TMP1:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8
201 // LE-PWR9-NEXT: store <16 x i8> [[TMP0]], ptr [[__A_ADDR_I]], align 16
202 // LE-PWR9-NEXT: store ptr [[TMP1]], ptr [[__B_ADDR_I]], align 8
203 // LE-PWR9-NEXT: store i64 1, ptr [[__C_ADDR_I]], align 8
204 // LE-PWR9-NEXT: [[TMP2:%.*]] = load i64, ptr [[__C_ADDR_I]], align 8
205 // LE-PWR9-NEXT: [[SUB_I:%.*]] = sub i64 16, [[TMP2]]
206 // LE-PWR9-NEXT: [[CONV_I:%.*]] = trunc i64 [[SUB_I]] to i8
207 // LE-PWR9-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr null, i8 [[CONV_I]]
208 // LE-PWR9-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(ptr [[TMP3]])
209 // LE-PWR9-NEXT: store <16 x i8> [[TMP4]], ptr [[__MASK_I]], align 16
210 // LE-PWR9-NEXT: [[TMP5:%.*]] = load <16 x i8>, ptr [[__A_ADDR_I]], align 16
211 // LE-PWR9-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
212 // LE-PWR9-NEXT: [[TMP7:%.*]] = load <16 x i8>, ptr [[__A_ADDR_I]], align 16
213 // LE-PWR9-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
214 // LE-PWR9-NEXT: [[TMP9:%.*]] = load <16 x i8>, ptr [[__MASK_I]], align 16
215 // LE-PWR9-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]])
216 // LE-PWR9-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8>
217 // LE-PWR9-NEXT: store <16 x i8> [[TMP11]], ptr [[__RES_I]], align 16
218 // LE-PWR9-NEXT: [[TMP12:%.*]] = load <16 x i8>, ptr [[__RES_I]], align 16
219 // LE-PWR9-NEXT: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP12]] to <4 x i32>
220 // LE-PWR9-NEXT: [[TMP14:%.*]] = load ptr, ptr [[__B_ADDR_I]], align 8
221 // LE-PWR9-NEXT: [[TMP15:%.*]] = load i64, ptr [[__C_ADDR_I]], align 8
222 // LE-PWR9-NEXT: [[SHL_I:%.*]] = shl i64 [[TMP15]], 56
223 // LE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], ptr [[TMP14]], i64 [[SHL_I]])
224 // LE-PWR9-NEXT: ret void
226 // BE32-PWR9-LABEL: @test_strmb1(
227 // BE32-PWR9-NEXT: entry:
228 // BE32-PWR9-NEXT: [[PTR_ADDR:%.*]] = alloca ptr, align 4
229 // BE32-PWR9-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16
230 // BE32-PWR9-NEXT: store ptr [[PTR:%.*]], ptr [[PTR_ADDR]], align 4
231 // BE32-PWR9-NEXT: store <16 x i8> [[DATA:%.*]], ptr [[DATA_ADDR]], align 16
232 // BE32-PWR9-NEXT: [[TMP0:%.*]] = load ptr, ptr [[PTR_ADDR]], align 4
233 // BE32-PWR9-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr [[DATA_ADDR]], align 16
234 // BE32-PWR9-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP0]], i64 0
235 // BE32-PWR9-NEXT: [[TMP3:%.*]] = extractelement <16 x i8> [[TMP1]], i64 15
236 // BE32-PWR9-NEXT: store i8 [[TMP3]], ptr [[TMP2]], align 1
237 // BE32-PWR9-NEXT: ret void
// Exercises __vec_strmb with a byte count of 1. Per the expected IR above,
// the pwr8 and 32-bit pwr9 configurations emit a single i8 store of one
// extracted vector element (element 15 on big-endian, element 0 on
// little-endian), while the 64-bit pwr9 configurations call
// llvm.ppc.vsx.stxvll after a vperm of the data.
239 void test_strmb1(char *ptr, vector unsigned char data) {
240 __vec_strmb(ptr, 1, data);
243 // BE-PWR8-LABEL: @test_strmb2(
244 // BE-PWR8-NEXT: entry:
245 // BE-PWR8-NEXT: [[PTR_ADDR:%.*]] = alloca ptr, align 8
246 // BE-PWR8-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16
247 // BE-PWR8-NEXT: store ptr [[PTR:%.*]], ptr [[PTR_ADDR]], align 8
248 // BE-PWR8-NEXT: store <16 x i8> [[DATA:%.*]], ptr [[DATA_ADDR]], align 16
249 // BE-PWR8-NEXT: [[TMP0:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8
250 // BE-PWR8-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr [[DATA_ADDR]], align 16
251 // BE-PWR8-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
252 // BE-PWR8-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[TMP0]], i64 0
253 // BE-PWR8-NEXT: [[TMP5:%.*]] = extractelement <8 x i16> [[TMP2]], i64 7
254 // BE-PWR8-NEXT: store i16 [[TMP5]], ptr [[TMP3]], align 1
255 // BE-PWR8-NEXT: ret void
257 // LE-PWR8-LABEL: @test_strmb2(
258 // LE-PWR8-NEXT: entry:
259 // LE-PWR8-NEXT: [[PTR_ADDR:%.*]] = alloca ptr, align 8
260 // LE-PWR8-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16
261 // LE-PWR8-NEXT: store ptr [[PTR:%.*]], ptr [[PTR_ADDR]], align 8
262 // LE-PWR8-NEXT: store <16 x i8> [[DATA:%.*]], ptr [[DATA_ADDR]], align 16
263 // LE-PWR8-NEXT: [[TMP0:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8
264 // LE-PWR8-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr [[DATA_ADDR]], align 16
265 // LE-PWR8-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
266 // LE-PWR8-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[TMP0]], i64 0
267 // LE-PWR8-NEXT: [[TMP5:%.*]] = extractelement <8 x i16> [[TMP2]], i64 0
268 // LE-PWR8-NEXT: [[TMP6:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP5]])
269 // LE-PWR8-NEXT: store i16 [[TMP6]], ptr [[TMP3]], align 1
270 // LE-PWR8-NEXT: ret void
272 // BE-PWR9-LABEL: @test_strmb2(
273 // BE-PWR9-NEXT: entry:
274 // BE-PWR9-NEXT: [[__A_ADDR_I:%.*]] = alloca <16 x i8>, align 16
275 // BE-PWR9-NEXT: [[__B_ADDR_I:%.*]] = alloca ptr, align 8
276 // BE-PWR9-NEXT: [[__C_ADDR_I:%.*]] = alloca i64, align 8
277 // BE-PWR9-NEXT: [[__MASK_I:%.*]] = alloca <16 x i8>, align 16
278 // BE-PWR9-NEXT: [[__RES_I:%.*]] = alloca <16 x i8>, align 16
279 // BE-PWR9-NEXT: [[PTR_ADDR:%.*]] = alloca ptr, align 8
280 // BE-PWR9-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16
281 // BE-PWR9-NEXT: store ptr [[PTR:%.*]], ptr [[PTR_ADDR]], align 8
282 // BE-PWR9-NEXT: store <16 x i8> [[DATA:%.*]], ptr [[DATA_ADDR]], align 16
283 // BE-PWR9-NEXT: [[TMP0:%.*]] = load <16 x i8>, ptr [[DATA_ADDR]], align 16
284 // BE-PWR9-NEXT: [[TMP1:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8
285 // BE-PWR9-NEXT: store <16 x i8> [[TMP0]], ptr [[__A_ADDR_I]], align 16
286 // BE-PWR9-NEXT: store ptr [[TMP1]], ptr [[__B_ADDR_I]], align 8
287 // BE-PWR9-NEXT: store i64 2, ptr [[__C_ADDR_I]], align 8
288 // BE-PWR9-NEXT: [[TMP2:%.*]] = load i64, ptr [[__C_ADDR_I]], align 8
289 // BE-PWR9-NEXT: [[SUB_I:%.*]] = sub i64 16, [[TMP2]]
290 // BE-PWR9-NEXT: [[CONV_I:%.*]] = trunc i64 [[SUB_I]] to i8
291 // BE-PWR9-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr null, i8 [[CONV_I]]
292 // BE-PWR9-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(ptr [[TMP3]])
293 // BE-PWR9-NEXT: store <16 x i8> [[TMP4]], ptr [[__MASK_I]], align 16
294 // BE-PWR9-NEXT: [[TMP5:%.*]] = load <16 x i8>, ptr [[__A_ADDR_I]], align 16
295 // BE-PWR9-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
296 // BE-PWR9-NEXT: [[TMP7:%.*]] = load <16 x i8>, ptr [[__A_ADDR_I]], align 16
297 // BE-PWR9-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
298 // BE-PWR9-NEXT: [[TMP9:%.*]] = load <16 x i8>, ptr [[__MASK_I]], align 16
299 // BE-PWR9-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]])
300 // BE-PWR9-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8>
301 // BE-PWR9-NEXT: store <16 x i8> [[TMP11]], ptr [[__RES_I]], align 16
302 // BE-PWR9-NEXT: [[TMP12:%.*]] = load <16 x i8>, ptr [[__RES_I]], align 16
303 // BE-PWR9-NEXT: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP12]] to <4 x i32>
304 // BE-PWR9-NEXT: [[TMP14:%.*]] = load ptr, ptr [[__B_ADDR_I]], align 8
305 // BE-PWR9-NEXT: [[TMP15:%.*]] = load i64, ptr [[__C_ADDR_I]], align 8
306 // BE-PWR9-NEXT: [[SHL_I:%.*]] = shl i64 [[TMP15]], 56
307 // BE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], ptr [[TMP14]], i64 [[SHL_I]])
308 // BE-PWR9-NEXT: ret void
310 // LE-PWR9-LABEL: @test_strmb2(
311 // LE-PWR9-NEXT: entry:
312 // LE-PWR9-NEXT: [[__A_ADDR_I:%.*]] = alloca <16 x i8>, align 16
313 // LE-PWR9-NEXT: [[__B_ADDR_I:%.*]] = alloca ptr, align 8
314 // LE-PWR9-NEXT: [[__C_ADDR_I:%.*]] = alloca i64, align 8
315 // LE-PWR9-NEXT: [[__MASK_I:%.*]] = alloca <16 x i8>, align 16
316 // LE-PWR9-NEXT: [[__RES_I:%.*]] = alloca <16 x i8>, align 16
317 // LE-PWR9-NEXT: [[PTR_ADDR:%.*]] = alloca ptr, align 8
318 // LE-PWR9-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16
319 // LE-PWR9-NEXT: store ptr [[PTR:%.*]], ptr [[PTR_ADDR]], align 8
320 // LE-PWR9-NEXT: store <16 x i8> [[DATA:%.*]], ptr [[DATA_ADDR]], align 16
321 // LE-PWR9-NEXT: [[TMP0:%.*]] = load <16 x i8>, ptr [[DATA_ADDR]], align 16
322 // LE-PWR9-NEXT: [[TMP1:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8
323 // LE-PWR9-NEXT: store <16 x i8> [[TMP0]], ptr [[__A_ADDR_I]], align 16
324 // LE-PWR9-NEXT: store ptr [[TMP1]], ptr [[__B_ADDR_I]], align 8
325 // LE-PWR9-NEXT: store i64 2, ptr [[__C_ADDR_I]], align 8
326 // LE-PWR9-NEXT: [[TMP2:%.*]] = load i64, ptr [[__C_ADDR_I]], align 8
327 // LE-PWR9-NEXT: [[SUB_I:%.*]] = sub i64 16, [[TMP2]]
328 // LE-PWR9-NEXT: [[CONV_I:%.*]] = trunc i64 [[SUB_I]] to i8
329 // LE-PWR9-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr null, i8 [[CONV_I]]
330 // LE-PWR9-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(ptr [[TMP3]])
331 // LE-PWR9-NEXT: store <16 x i8> [[TMP4]], ptr [[__MASK_I]], align 16
332 // LE-PWR9-NEXT: [[TMP5:%.*]] = load <16 x i8>, ptr [[__A_ADDR_I]], align 16
333 // LE-PWR9-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
334 // LE-PWR9-NEXT: [[TMP7:%.*]] = load <16 x i8>, ptr [[__A_ADDR_I]], align 16
335 // LE-PWR9-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
336 // LE-PWR9-NEXT: [[TMP9:%.*]] = load <16 x i8>, ptr [[__MASK_I]], align 16
337 // LE-PWR9-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]])
338 // LE-PWR9-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8>
339 // LE-PWR9-NEXT: store <16 x i8> [[TMP11]], ptr [[__RES_I]], align 16
340 // LE-PWR9-NEXT: [[TMP12:%.*]] = load <16 x i8>, ptr [[__RES_I]], align 16
341 // LE-PWR9-NEXT: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP12]] to <4 x i32>
342 // LE-PWR9-NEXT: [[TMP14:%.*]] = load ptr, ptr [[__B_ADDR_I]], align 8
343 // LE-PWR9-NEXT: [[TMP15:%.*]] = load i64, ptr [[__C_ADDR_I]], align 8
344 // LE-PWR9-NEXT: [[SHL_I:%.*]] = shl i64 [[TMP15]], 56
345 // LE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], ptr [[TMP14]], i64 [[SHL_I]])
346 // LE-PWR9-NEXT: ret void
348 // BE32-PWR9-LABEL: @test_strmb2(
349 // BE32-PWR9-NEXT: entry:
350 // BE32-PWR9-NEXT: [[PTR_ADDR:%.*]] = alloca ptr, align 4
351 // BE32-PWR9-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16
352 // BE32-PWR9-NEXT: store ptr [[PTR:%.*]], ptr [[PTR_ADDR]], align 4
353 // BE32-PWR9-NEXT: store <16 x i8> [[DATA:%.*]], ptr [[DATA_ADDR]], align 16
354 // BE32-PWR9-NEXT: [[TMP0:%.*]] = load ptr, ptr [[PTR_ADDR]], align 4
355 // BE32-PWR9-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr [[DATA_ADDR]], align 16
356 // BE32-PWR9-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
357 // BE32-PWR9-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[TMP0]], i64 0
358 // BE32-PWR9-NEXT: [[TMP5:%.*]] = extractelement <8 x i16> [[TMP2]], i64 7
359 // BE32-PWR9-NEXT: store i16 [[TMP5]], ptr [[TMP3]], align 1
360 // BE32-PWR9-NEXT: ret void
// Exercises __vec_strmb with a byte count of 2. Per the expected IR above,
// the pwr8 and 32-bit pwr9 configurations emit a single i16 store (with an
// llvm.bswap.i16 first on little-endian pwr8), while the 64-bit pwr9
// configurations call llvm.ppc.vsx.stxvll.
362 void test_strmb2(char *ptr, vector unsigned char data) {
363 __vec_strmb(ptr, 2, data);
366 // BE-PWR8-LABEL: @test_strmb3(
367 // BE-PWR8-NEXT: entry:
368 // BE-PWR8-NEXT: [[PTR_ADDR:%.*]] = alloca ptr, align 8
369 // BE-PWR8-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16
370 // BE-PWR8-NEXT: store ptr [[PTR:%.*]], ptr [[PTR_ADDR]], align 8
371 // BE-PWR8-NEXT: store <16 x i8> [[DATA:%.*]], ptr [[DATA_ADDR]], align 16
372 // BE-PWR8-NEXT: [[TMP0:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8
373 // BE-PWR8-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr [[DATA_ADDR]], align 16
374 // BE-PWR8-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
375 // BE-PWR8-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[TMP0]], i64 1
376 // BE-PWR8-NEXT: [[TMP5:%.*]] = extractelement <8 x i16> [[TMP2]], i64 7
377 // BE-PWR8-NEXT: store i16 [[TMP5]], ptr [[TMP3]], align 1
378 // BE-PWR8-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[TMP0]], i64 0
379 // BE-PWR8-NEXT: [[TMP7:%.*]] = extractelement <16 x i8> [[TMP1]], i64 13
380 // BE-PWR8-NEXT: store i8 [[TMP7]], ptr [[TMP6]], align 1
381 // BE-PWR8-NEXT: ret void
383 // LE-PWR8-LABEL: @test_strmb3(
384 // LE-PWR8-NEXT: entry:
385 // LE-PWR8-NEXT: [[PTR_ADDR:%.*]] = alloca ptr, align 8
386 // LE-PWR8-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16
387 // LE-PWR8-NEXT: store ptr [[PTR:%.*]], ptr [[PTR_ADDR]], align 8
388 // LE-PWR8-NEXT: store <16 x i8> [[DATA:%.*]], ptr [[DATA_ADDR]], align 16
389 // LE-PWR8-NEXT: [[TMP0:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8
390 // LE-PWR8-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr [[DATA_ADDR]], align 16
391 // LE-PWR8-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
392 // LE-PWR8-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[TMP0]], i64 1
393 // LE-PWR8-NEXT: [[TMP5:%.*]] = extractelement <8 x i16> [[TMP2]], i64 0
394 // LE-PWR8-NEXT: [[TMP6:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP5]])
395 // LE-PWR8-NEXT: store i16 [[TMP6]], ptr [[TMP3]], align 1
396 // LE-PWR8-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP0]], i64 0
397 // LE-PWR8-NEXT: [[TMP8:%.*]] = extractelement <16 x i8> [[TMP1]], i64 2
398 // LE-PWR8-NEXT: store i8 [[TMP8]], ptr [[TMP7]], align 1
399 // LE-PWR8-NEXT: ret void
401 // BE-PWR9-LABEL: @test_strmb3(
402 // BE-PWR9-NEXT: entry:
403 // BE-PWR9-NEXT: [[__A_ADDR_I:%.*]] = alloca <16 x i8>, align 16
404 // BE-PWR9-NEXT: [[__B_ADDR_I:%.*]] = alloca ptr, align 8
405 // BE-PWR9-NEXT: [[__C_ADDR_I:%.*]] = alloca i64, align 8
406 // BE-PWR9-NEXT: [[__MASK_I:%.*]] = alloca <16 x i8>, align 16
407 // BE-PWR9-NEXT: [[__RES_I:%.*]] = alloca <16 x i8>, align 16
408 // BE-PWR9-NEXT: [[PTR_ADDR:%.*]] = alloca ptr, align 8
409 // BE-PWR9-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16
410 // BE-PWR9-NEXT: store ptr [[PTR:%.*]], ptr [[PTR_ADDR]], align 8
411 // BE-PWR9-NEXT: store <16 x i8> [[DATA:%.*]], ptr [[DATA_ADDR]], align 16
412 // BE-PWR9-NEXT: [[TMP0:%.*]] = load <16 x i8>, ptr [[DATA_ADDR]], align 16
413 // BE-PWR9-NEXT: [[TMP1:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8
414 // BE-PWR9-NEXT: store <16 x i8> [[TMP0]], ptr [[__A_ADDR_I]], align 16
415 // BE-PWR9-NEXT: store ptr [[TMP1]], ptr [[__B_ADDR_I]], align 8
416 // BE-PWR9-NEXT: store i64 3, ptr [[__C_ADDR_I]], align 8
417 // BE-PWR9-NEXT: [[TMP2:%.*]] = load i64, ptr [[__C_ADDR_I]], align 8
418 // BE-PWR9-NEXT: [[SUB_I:%.*]] = sub i64 16, [[TMP2]]
419 // BE-PWR9-NEXT: [[CONV_I:%.*]] = trunc i64 [[SUB_I]] to i8
420 // BE-PWR9-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr null, i8 [[CONV_I]]
421 // BE-PWR9-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(ptr [[TMP3]])
422 // BE-PWR9-NEXT: store <16 x i8> [[TMP4]], ptr [[__MASK_I]], align 16
423 // BE-PWR9-NEXT: [[TMP5:%.*]] = load <16 x i8>, ptr [[__A_ADDR_I]], align 16
424 // BE-PWR9-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
425 // BE-PWR9-NEXT: [[TMP7:%.*]] = load <16 x i8>, ptr [[__A_ADDR_I]], align 16
426 // BE-PWR9-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
427 // BE-PWR9-NEXT: [[TMP9:%.*]] = load <16 x i8>, ptr [[__MASK_I]], align 16
428 // BE-PWR9-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]])
429 // BE-PWR9-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8>
430 // BE-PWR9-NEXT: store <16 x i8> [[TMP11]], ptr [[__RES_I]], align 16
431 // BE-PWR9-NEXT: [[TMP12:%.*]] = load <16 x i8>, ptr [[__RES_I]], align 16
432 // BE-PWR9-NEXT: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP12]] to <4 x i32>
433 // BE-PWR9-NEXT: [[TMP14:%.*]] = load ptr, ptr [[__B_ADDR_I]], align 8
434 // BE-PWR9-NEXT: [[TMP15:%.*]] = load i64, ptr [[__C_ADDR_I]], align 8
435 // BE-PWR9-NEXT: [[SHL_I:%.*]] = shl i64 [[TMP15]], 56
436 // BE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], ptr [[TMP14]], i64 [[SHL_I]])
437 // BE-PWR9-NEXT: ret void
439 // LE-PWR9-LABEL: @test_strmb3(
440 // LE-PWR9-NEXT: entry:
441 // LE-PWR9-NEXT: [[__A_ADDR_I:%.*]] = alloca <16 x i8>, align 16
442 // LE-PWR9-NEXT: [[__B_ADDR_I:%.*]] = alloca ptr, align 8
443 // LE-PWR9-NEXT: [[__C_ADDR_I:%.*]] = alloca i64, align 8
444 // LE-PWR9-NEXT: [[__MASK_I:%.*]] = alloca <16 x i8>, align 16
445 // LE-PWR9-NEXT: [[__RES_I:%.*]] = alloca <16 x i8>, align 16
446 // LE-PWR9-NEXT: [[PTR_ADDR:%.*]] = alloca ptr, align 8
447 // LE-PWR9-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16
448 // LE-PWR9-NEXT: store ptr [[PTR:%.*]], ptr [[PTR_ADDR]], align 8
449 // LE-PWR9-NEXT: store <16 x i8> [[DATA:%.*]], ptr [[DATA_ADDR]], align 16
450 // LE-PWR9-NEXT: [[TMP0:%.*]] = load <16 x i8>, ptr [[DATA_ADDR]], align 16
451 // LE-PWR9-NEXT: [[TMP1:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8
452 // LE-PWR9-NEXT: store <16 x i8> [[TMP0]], ptr [[__A_ADDR_I]], align 16
453 // LE-PWR9-NEXT: store ptr [[TMP1]], ptr [[__B_ADDR_I]], align 8
454 // LE-PWR9-NEXT: store i64 3, ptr [[__C_ADDR_I]], align 8
455 // LE-PWR9-NEXT: [[TMP2:%.*]] = load i64, ptr [[__C_ADDR_I]], align 8
456 // LE-PWR9-NEXT: [[SUB_I:%.*]] = sub i64 16, [[TMP2]]
457 // LE-PWR9-NEXT: [[CONV_I:%.*]] = trunc i64 [[SUB_I]] to i8
458 // LE-PWR9-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr null, i8 [[CONV_I]]
459 // LE-PWR9-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(ptr [[TMP3]])
460 // LE-PWR9-NEXT: store <16 x i8> [[TMP4]], ptr [[__MASK_I]], align 16
461 // LE-PWR9-NEXT: [[TMP5:%.*]] = load <16 x i8>, ptr [[__A_ADDR_I]], align 16
462 // LE-PWR9-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
463 // LE-PWR9-NEXT: [[TMP7:%.*]] = load <16 x i8>, ptr [[__A_ADDR_I]], align 16
464 // LE-PWR9-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
465 // LE-PWR9-NEXT: [[TMP9:%.*]] = load <16 x i8>, ptr [[__MASK_I]], align 16
466 // LE-PWR9-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]])
467 // LE-PWR9-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8>
468 // LE-PWR9-NEXT: store <16 x i8> [[TMP11]], ptr [[__RES_I]], align 16
469 // LE-PWR9-NEXT: [[TMP12:%.*]] = load <16 x i8>, ptr [[__RES_I]], align 16
470 // LE-PWR9-NEXT: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP12]] to <4 x i32>
471 // LE-PWR9-NEXT: [[TMP14:%.*]] = load ptr, ptr [[__B_ADDR_I]], align 8
472 // LE-PWR9-NEXT: [[TMP15:%.*]] = load i64, ptr [[__C_ADDR_I]], align 8
473 // LE-PWR9-NEXT: [[SHL_I:%.*]] = shl i64 [[TMP15]], 56
474 // LE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], ptr [[TMP14]], i64 [[SHL_I]])
475 // LE-PWR9-NEXT: ret void
477 // BE32-PWR9-LABEL: @test_strmb3(
478 // BE32-PWR9-NEXT: entry:
479 // BE32-PWR9-NEXT: [[PTR_ADDR:%.*]] = alloca ptr, align 4
480 // BE32-PWR9-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16
481 // BE32-PWR9-NEXT: store ptr [[PTR:%.*]], ptr [[PTR_ADDR]], align 4
482 // BE32-PWR9-NEXT: store <16 x i8> [[DATA:%.*]], ptr [[DATA_ADDR]], align 16
483 // BE32-PWR9-NEXT: [[TMP0:%.*]] = load ptr, ptr [[PTR_ADDR]], align 4
484 // BE32-PWR9-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr [[DATA_ADDR]], align 16
485 // BE32-PWR9-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
486 // BE32-PWR9-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[TMP0]], i64 1
487 // BE32-PWR9-NEXT: [[TMP5:%.*]] = extractelement <8 x i16> [[TMP2]], i64 7
488 // BE32-PWR9-NEXT: store i16 [[TMP5]], ptr [[TMP3]], align 1
489 // BE32-PWR9-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[TMP0]], i64 0
490 // BE32-PWR9-NEXT: [[TMP7:%.*]] = extractelement <16 x i8> [[TMP1]], i64 13
491 // BE32-PWR9-NEXT: store i8 [[TMP7]], ptr [[TMP6]], align 1
492 // BE32-PWR9-NEXT: ret void
494 void test_strmb3(char *ptr, vector unsigned char data) {
// Exercises __vec_strmb with a constant byte count of 3. Per the autogenerated
// checks visible above, 32-bit pwr9 expects an i16 store at byte offset 1
// (halfword element 7) followed by an i8 store at byte offset 0 (byte
// element 13), while little-endian 64-bit pwr9 expects an lvsl/vperm shuffle
// feeding llvm.ppc.vsx.stxvll with the count shifted left by 56.
495 __vec_strmb(ptr, 3, data);
498 // BE-PWR8-LABEL: @test_strmb4(
499 // BE-PWR8-NEXT: entry:
500 // BE-PWR8-NEXT: [[PTR_ADDR:%.*]] = alloca ptr, align 8
501 // BE-PWR8-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16
502 // BE-PWR8-NEXT: store ptr [[PTR:%.*]], ptr [[PTR_ADDR]], align 8
503 // BE-PWR8-NEXT: store <16 x i8> [[DATA:%.*]], ptr [[DATA_ADDR]], align 16
504 // BE-PWR8-NEXT: [[TMP0:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8
505 // BE-PWR8-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr [[DATA_ADDR]], align 16
506 // BE-PWR8-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
507 // BE-PWR8-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[TMP0]], i64 0
508 // BE-PWR8-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[TMP2]], i64 3
509 // BE-PWR8-NEXT: store i32 [[TMP5]], ptr [[TMP3]], align 1
510 // BE-PWR8-NEXT: ret void
512 // LE-PWR8-LABEL: @test_strmb4(
513 // LE-PWR8-NEXT: entry:
514 // LE-PWR8-NEXT: [[PTR_ADDR:%.*]] = alloca ptr, align 8
515 // LE-PWR8-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16
516 // LE-PWR8-NEXT: store ptr [[PTR:%.*]], ptr [[PTR_ADDR]], align 8
517 // LE-PWR8-NEXT: store <16 x i8> [[DATA:%.*]], ptr [[DATA_ADDR]], align 16
518 // LE-PWR8-NEXT: [[TMP0:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8
519 // LE-PWR8-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr [[DATA_ADDR]], align 16
520 // LE-PWR8-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
521 // LE-PWR8-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[TMP0]], i64 0
522 // LE-PWR8-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[TMP2]], i64 0
523 // LE-PWR8-NEXT: [[TMP6:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP5]])
524 // LE-PWR8-NEXT: store i32 [[TMP6]], ptr [[TMP3]], align 1
525 // LE-PWR8-NEXT: ret void
527 // BE-PWR9-LABEL: @test_strmb4(
528 // BE-PWR9-NEXT: entry:
529 // BE-PWR9-NEXT: [[__A_ADDR_I:%.*]] = alloca <16 x i8>, align 16
530 // BE-PWR9-NEXT: [[__B_ADDR_I:%.*]] = alloca ptr, align 8
531 // BE-PWR9-NEXT: [[__C_ADDR_I:%.*]] = alloca i64, align 8
532 // BE-PWR9-NEXT: [[__MASK_I:%.*]] = alloca <16 x i8>, align 16
533 // BE-PWR9-NEXT: [[__RES_I:%.*]] = alloca <16 x i8>, align 16
534 // BE-PWR9-NEXT: [[PTR_ADDR:%.*]] = alloca ptr, align 8
535 // BE-PWR9-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16
536 // BE-PWR9-NEXT: store ptr [[PTR:%.*]], ptr [[PTR_ADDR]], align 8
537 // BE-PWR9-NEXT: store <16 x i8> [[DATA:%.*]], ptr [[DATA_ADDR]], align 16
538 // BE-PWR9-NEXT: [[TMP0:%.*]] = load <16 x i8>, ptr [[DATA_ADDR]], align 16
539 // BE-PWR9-NEXT: [[TMP1:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8
540 // BE-PWR9-NEXT: store <16 x i8> [[TMP0]], ptr [[__A_ADDR_I]], align 16
541 // BE-PWR9-NEXT: store ptr [[TMP1]], ptr [[__B_ADDR_I]], align 8
542 // BE-PWR9-NEXT: store i64 4, ptr [[__C_ADDR_I]], align 8
543 // BE-PWR9-NEXT: [[TMP2:%.*]] = load i64, ptr [[__C_ADDR_I]], align 8
544 // BE-PWR9-NEXT: [[SUB_I:%.*]] = sub i64 16, [[TMP2]]
545 // BE-PWR9-NEXT: [[CONV_I:%.*]] = trunc i64 [[SUB_I]] to i8
546 // BE-PWR9-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr null, i8 [[CONV_I]]
547 // BE-PWR9-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(ptr [[TMP3]])
548 // BE-PWR9-NEXT: store <16 x i8> [[TMP4]], ptr [[__MASK_I]], align 16
549 // BE-PWR9-NEXT: [[TMP5:%.*]] = load <16 x i8>, ptr [[__A_ADDR_I]], align 16
550 // BE-PWR9-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
551 // BE-PWR9-NEXT: [[TMP7:%.*]] = load <16 x i8>, ptr [[__A_ADDR_I]], align 16
552 // BE-PWR9-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
553 // BE-PWR9-NEXT: [[TMP9:%.*]] = load <16 x i8>, ptr [[__MASK_I]], align 16
554 // BE-PWR9-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]])
555 // BE-PWR9-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8>
556 // BE-PWR9-NEXT: store <16 x i8> [[TMP11]], ptr [[__RES_I]], align 16
557 // BE-PWR9-NEXT: [[TMP12:%.*]] = load <16 x i8>, ptr [[__RES_I]], align 16
558 // BE-PWR9-NEXT: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP12]] to <4 x i32>
559 // BE-PWR9-NEXT: [[TMP14:%.*]] = load ptr, ptr [[__B_ADDR_I]], align 8
560 // BE-PWR9-NEXT: [[TMP15:%.*]] = load i64, ptr [[__C_ADDR_I]], align 8
561 // BE-PWR9-NEXT: [[SHL_I:%.*]] = shl i64 [[TMP15]], 56
562 // BE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], ptr [[TMP14]], i64 [[SHL_I]])
563 // BE-PWR9-NEXT: ret void
565 // LE-PWR9-LABEL: @test_strmb4(
566 // LE-PWR9-NEXT: entry:
567 // LE-PWR9-NEXT: [[__A_ADDR_I:%.*]] = alloca <16 x i8>, align 16
568 // LE-PWR9-NEXT: [[__B_ADDR_I:%.*]] = alloca ptr, align 8
569 // LE-PWR9-NEXT: [[__C_ADDR_I:%.*]] = alloca i64, align 8
570 // LE-PWR9-NEXT: [[__MASK_I:%.*]] = alloca <16 x i8>, align 16
571 // LE-PWR9-NEXT: [[__RES_I:%.*]] = alloca <16 x i8>, align 16
572 // LE-PWR9-NEXT: [[PTR_ADDR:%.*]] = alloca ptr, align 8
573 // LE-PWR9-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16
574 // LE-PWR9-NEXT: store ptr [[PTR:%.*]], ptr [[PTR_ADDR]], align 8
575 // LE-PWR9-NEXT: store <16 x i8> [[DATA:%.*]], ptr [[DATA_ADDR]], align 16
576 // LE-PWR9-NEXT: [[TMP0:%.*]] = load <16 x i8>, ptr [[DATA_ADDR]], align 16
577 // LE-PWR9-NEXT: [[TMP1:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8
578 // LE-PWR9-NEXT: store <16 x i8> [[TMP0]], ptr [[__A_ADDR_I]], align 16
579 // LE-PWR9-NEXT: store ptr [[TMP1]], ptr [[__B_ADDR_I]], align 8
580 // LE-PWR9-NEXT: store i64 4, ptr [[__C_ADDR_I]], align 8
581 // LE-PWR9-NEXT: [[TMP2:%.*]] = load i64, ptr [[__C_ADDR_I]], align 8
582 // LE-PWR9-NEXT: [[SUB_I:%.*]] = sub i64 16, [[TMP2]]
583 // LE-PWR9-NEXT: [[CONV_I:%.*]] = trunc i64 [[SUB_I]] to i8
584 // LE-PWR9-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr null, i8 [[CONV_I]]
585 // LE-PWR9-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(ptr [[TMP3]])
586 // LE-PWR9-NEXT: store <16 x i8> [[TMP4]], ptr [[__MASK_I]], align 16
587 // LE-PWR9-NEXT: [[TMP5:%.*]] = load <16 x i8>, ptr [[__A_ADDR_I]], align 16
588 // LE-PWR9-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
589 // LE-PWR9-NEXT: [[TMP7:%.*]] = load <16 x i8>, ptr [[__A_ADDR_I]], align 16
590 // LE-PWR9-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
591 // LE-PWR9-NEXT: [[TMP9:%.*]] = load <16 x i8>, ptr [[__MASK_I]], align 16
592 // LE-PWR9-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]])
593 // LE-PWR9-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8>
594 // LE-PWR9-NEXT: store <16 x i8> [[TMP11]], ptr [[__RES_I]], align 16
595 // LE-PWR9-NEXT: [[TMP12:%.*]] = load <16 x i8>, ptr [[__RES_I]], align 16
596 // LE-PWR9-NEXT: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP12]] to <4 x i32>
597 // LE-PWR9-NEXT: [[TMP14:%.*]] = load ptr, ptr [[__B_ADDR_I]], align 8
598 // LE-PWR9-NEXT: [[TMP15:%.*]] = load i64, ptr [[__C_ADDR_I]], align 8
599 // LE-PWR9-NEXT: [[SHL_I:%.*]] = shl i64 [[TMP15]], 56
600 // LE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], ptr [[TMP14]], i64 [[SHL_I]])
601 // LE-PWR9-NEXT: ret void
603 // BE32-PWR9-LABEL: @test_strmb4(
604 // BE32-PWR9-NEXT: entry:
605 // BE32-PWR9-NEXT: [[PTR_ADDR:%.*]] = alloca ptr, align 4
606 // BE32-PWR9-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16
607 // BE32-PWR9-NEXT: store ptr [[PTR:%.*]], ptr [[PTR_ADDR]], align 4
608 // BE32-PWR9-NEXT: store <16 x i8> [[DATA:%.*]], ptr [[DATA_ADDR]], align 16
609 // BE32-PWR9-NEXT: [[TMP0:%.*]] = load ptr, ptr [[PTR_ADDR]], align 4
610 // BE32-PWR9-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr [[DATA_ADDR]], align 16
611 // BE32-PWR9-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
612 // BE32-PWR9-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[TMP0]], i64 0
613 // BE32-PWR9-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[TMP2]], i64 3
614 // BE32-PWR9-NEXT: store i32 [[TMP5]], ptr [[TMP3]], align 1
615 // BE32-PWR9-NEXT: ret void
617 void test_strmb4(char *ptr, vector unsigned char data) {
// Exercises __vec_strmb with a constant byte count of 4. Per the autogenerated
// checks above, pwr8 and 32-bit pwr9 expect a single i32 store at byte
// offset 0 (word element 3 on big-endian; word element 0 passed through
// llvm.bswap.i32 on little-endian), while 64-bit pwr9 expects an lvsl/vperm
// shuffle feeding llvm.ppc.vsx.stxvll with the count shifted left by 56.
618 __vec_strmb(ptr, 4, data);
621 // BE-PWR8-LABEL: @test_strmb5(
622 // BE-PWR8-NEXT: entry:
623 // BE-PWR8-NEXT: [[PTR_ADDR:%.*]] = alloca ptr, align 8
624 // BE-PWR8-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16
625 // BE-PWR8-NEXT: store ptr [[PTR:%.*]], ptr [[PTR_ADDR]], align 8
626 // BE-PWR8-NEXT: store <16 x i8> [[DATA:%.*]], ptr [[DATA_ADDR]], align 16
627 // BE-PWR8-NEXT: [[TMP0:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8
628 // BE-PWR8-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr [[DATA_ADDR]], align 16
629 // BE-PWR8-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
630 // BE-PWR8-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[TMP0]], i64 1
631 // BE-PWR8-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[TMP2]], i64 3
632 // BE-PWR8-NEXT: store i32 [[TMP5]], ptr [[TMP3]], align 1
633 // BE-PWR8-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[TMP0]], i64 0
634 // BE-PWR8-NEXT: [[TMP7:%.*]] = extractelement <16 x i8> [[TMP1]], i64 11
635 // BE-PWR8-NEXT: store i8 [[TMP7]], ptr [[TMP6]], align 1
636 // BE-PWR8-NEXT: ret void
638 // LE-PWR8-LABEL: @test_strmb5(
639 // LE-PWR8-NEXT: entry:
640 // LE-PWR8-NEXT: [[PTR_ADDR:%.*]] = alloca ptr, align 8
641 // LE-PWR8-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16
642 // LE-PWR8-NEXT: store ptr [[PTR:%.*]], ptr [[PTR_ADDR]], align 8
643 // LE-PWR8-NEXT: store <16 x i8> [[DATA:%.*]], ptr [[DATA_ADDR]], align 16
644 // LE-PWR8-NEXT: [[TMP0:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8
645 // LE-PWR8-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr [[DATA_ADDR]], align 16
646 // LE-PWR8-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
647 // LE-PWR8-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[TMP0]], i64 1
648 // LE-PWR8-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[TMP2]], i64 0
649 // LE-PWR8-NEXT: [[TMP6:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP5]])
650 // LE-PWR8-NEXT: store i32 [[TMP6]], ptr [[TMP3]], align 1
651 // LE-PWR8-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP0]], i64 0
652 // LE-PWR8-NEXT: [[TMP8:%.*]] = extractelement <16 x i8> [[TMP1]], i64 4
653 // LE-PWR8-NEXT: store i8 [[TMP8]], ptr [[TMP7]], align 1
654 // LE-PWR8-NEXT: ret void
656 // BE-PWR9-LABEL: @test_strmb5(
657 // BE-PWR9-NEXT: entry:
658 // BE-PWR9-NEXT: [[__A_ADDR_I:%.*]] = alloca <16 x i8>, align 16
659 // BE-PWR9-NEXT: [[__B_ADDR_I:%.*]] = alloca ptr, align 8
660 // BE-PWR9-NEXT: [[__C_ADDR_I:%.*]] = alloca i64, align 8
661 // BE-PWR9-NEXT: [[__MASK_I:%.*]] = alloca <16 x i8>, align 16
662 // BE-PWR9-NEXT: [[__RES_I:%.*]] = alloca <16 x i8>, align 16
663 // BE-PWR9-NEXT: [[PTR_ADDR:%.*]] = alloca ptr, align 8
664 // BE-PWR9-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16
665 // BE-PWR9-NEXT: store ptr [[PTR:%.*]], ptr [[PTR_ADDR]], align 8
666 // BE-PWR9-NEXT: store <16 x i8> [[DATA:%.*]], ptr [[DATA_ADDR]], align 16
667 // BE-PWR9-NEXT: [[TMP0:%.*]] = load <16 x i8>, ptr [[DATA_ADDR]], align 16
668 // BE-PWR9-NEXT: [[TMP1:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8
669 // BE-PWR9-NEXT: store <16 x i8> [[TMP0]], ptr [[__A_ADDR_I]], align 16
670 // BE-PWR9-NEXT: store ptr [[TMP1]], ptr [[__B_ADDR_I]], align 8
671 // BE-PWR9-NEXT: store i64 5, ptr [[__C_ADDR_I]], align 8
672 // BE-PWR9-NEXT: [[TMP2:%.*]] = load i64, ptr [[__C_ADDR_I]], align 8
673 // BE-PWR9-NEXT: [[SUB_I:%.*]] = sub i64 16, [[TMP2]]
674 // BE-PWR9-NEXT: [[CONV_I:%.*]] = trunc i64 [[SUB_I]] to i8
675 // BE-PWR9-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr null, i8 [[CONV_I]]
676 // BE-PWR9-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(ptr [[TMP3]])
677 // BE-PWR9-NEXT: store <16 x i8> [[TMP4]], ptr [[__MASK_I]], align 16
678 // BE-PWR9-NEXT: [[TMP5:%.*]] = load <16 x i8>, ptr [[__A_ADDR_I]], align 16
679 // BE-PWR9-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
680 // BE-PWR9-NEXT: [[TMP7:%.*]] = load <16 x i8>, ptr [[__A_ADDR_I]], align 16
681 // BE-PWR9-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
682 // BE-PWR9-NEXT: [[TMP9:%.*]] = load <16 x i8>, ptr [[__MASK_I]], align 16
683 // BE-PWR9-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]])
684 // BE-PWR9-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8>
685 // BE-PWR9-NEXT: store <16 x i8> [[TMP11]], ptr [[__RES_I]], align 16
686 // BE-PWR9-NEXT: [[TMP12:%.*]] = load <16 x i8>, ptr [[__RES_I]], align 16
687 // BE-PWR9-NEXT: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP12]] to <4 x i32>
688 // BE-PWR9-NEXT: [[TMP14:%.*]] = load ptr, ptr [[__B_ADDR_I]], align 8
689 // BE-PWR9-NEXT: [[TMP15:%.*]] = load i64, ptr [[__C_ADDR_I]], align 8
690 // BE-PWR9-NEXT: [[SHL_I:%.*]] = shl i64 [[TMP15]], 56
691 // BE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], ptr [[TMP14]], i64 [[SHL_I]])
692 // BE-PWR9-NEXT: ret void
694 // LE-PWR9-LABEL: @test_strmb5(
695 // LE-PWR9-NEXT: entry:
696 // LE-PWR9-NEXT: [[__A_ADDR_I:%.*]] = alloca <16 x i8>, align 16
697 // LE-PWR9-NEXT: [[__B_ADDR_I:%.*]] = alloca ptr, align 8
698 // LE-PWR9-NEXT: [[__C_ADDR_I:%.*]] = alloca i64, align 8
699 // LE-PWR9-NEXT: [[__MASK_I:%.*]] = alloca <16 x i8>, align 16
700 // LE-PWR9-NEXT: [[__RES_I:%.*]] = alloca <16 x i8>, align 16
701 // LE-PWR9-NEXT: [[PTR_ADDR:%.*]] = alloca ptr, align 8
702 // LE-PWR9-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16
703 // LE-PWR9-NEXT: store ptr [[PTR:%.*]], ptr [[PTR_ADDR]], align 8
704 // LE-PWR9-NEXT: store <16 x i8> [[DATA:%.*]], ptr [[DATA_ADDR]], align 16
705 // LE-PWR9-NEXT: [[TMP0:%.*]] = load <16 x i8>, ptr [[DATA_ADDR]], align 16
706 // LE-PWR9-NEXT: [[TMP1:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8
707 // LE-PWR9-NEXT: store <16 x i8> [[TMP0]], ptr [[__A_ADDR_I]], align 16
708 // LE-PWR9-NEXT: store ptr [[TMP1]], ptr [[__B_ADDR_I]], align 8
709 // LE-PWR9-NEXT: store i64 5, ptr [[__C_ADDR_I]], align 8
710 // LE-PWR9-NEXT: [[TMP2:%.*]] = load i64, ptr [[__C_ADDR_I]], align 8
711 // LE-PWR9-NEXT: [[SUB_I:%.*]] = sub i64 16, [[TMP2]]
712 // LE-PWR9-NEXT: [[CONV_I:%.*]] = trunc i64 [[SUB_I]] to i8
713 // LE-PWR9-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr null, i8 [[CONV_I]]
714 // LE-PWR9-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(ptr [[TMP3]])
715 // LE-PWR9-NEXT: store <16 x i8> [[TMP4]], ptr [[__MASK_I]], align 16
716 // LE-PWR9-NEXT: [[TMP5:%.*]] = load <16 x i8>, ptr [[__A_ADDR_I]], align 16
717 // LE-PWR9-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
718 // LE-PWR9-NEXT: [[TMP7:%.*]] = load <16 x i8>, ptr [[__A_ADDR_I]], align 16
719 // LE-PWR9-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
720 // LE-PWR9-NEXT: [[TMP9:%.*]] = load <16 x i8>, ptr [[__MASK_I]], align 16
721 // LE-PWR9-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]])
722 // LE-PWR9-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8>
723 // LE-PWR9-NEXT: store <16 x i8> [[TMP11]], ptr [[__RES_I]], align 16
724 // LE-PWR9-NEXT: [[TMP12:%.*]] = load <16 x i8>, ptr [[__RES_I]], align 16
725 // LE-PWR9-NEXT: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP12]] to <4 x i32>
726 // LE-PWR9-NEXT: [[TMP14:%.*]] = load ptr, ptr [[__B_ADDR_I]], align 8
727 // LE-PWR9-NEXT: [[TMP15:%.*]] = load i64, ptr [[__C_ADDR_I]], align 8
728 // LE-PWR9-NEXT: [[SHL_I:%.*]] = shl i64 [[TMP15]], 56
729 // LE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], ptr [[TMP14]], i64 [[SHL_I]])
730 // LE-PWR9-NEXT: ret void
732 // BE32-PWR9-LABEL: @test_strmb5(
733 // BE32-PWR9-NEXT: entry:
734 // BE32-PWR9-NEXT: [[PTR_ADDR:%.*]] = alloca ptr, align 4
735 // BE32-PWR9-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16
736 // BE32-PWR9-NEXT: store ptr [[PTR:%.*]], ptr [[PTR_ADDR]], align 4
737 // BE32-PWR9-NEXT: store <16 x i8> [[DATA:%.*]], ptr [[DATA_ADDR]], align 16
738 // BE32-PWR9-NEXT: [[TMP0:%.*]] = load ptr, ptr [[PTR_ADDR]], align 4
739 // BE32-PWR9-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr [[DATA_ADDR]], align 16
740 // BE32-PWR9-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
741 // BE32-PWR9-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[TMP0]], i64 1
742 // BE32-PWR9-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[TMP2]], i64 3
743 // BE32-PWR9-NEXT: store i32 [[TMP5]], ptr [[TMP3]], align 1
744 // BE32-PWR9-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[TMP0]], i64 0
745 // BE32-PWR9-NEXT: [[TMP7:%.*]] = extractelement <16 x i8> [[TMP1]], i64 11
746 // BE32-PWR9-NEXT: store i8 [[TMP7]], ptr [[TMP6]], align 1
747 // BE32-PWR9-NEXT: ret void
749 void test_strmb5(char *ptr, vector unsigned char data) {
// Exercises __vec_strmb with a constant byte count of 5. Per the autogenerated
// checks above, pwr8 and 32-bit pwr9 expect an i32 store at byte offset 1
// followed by an i8 store at byte offset 0 (word element 3 and byte element 11
// on big-endian; word element 0 with llvm.bswap.i32 and byte element 4 on
// little-endian), while 64-bit pwr9 expects an lvsl/vperm shuffle feeding
// llvm.ppc.vsx.stxvll with the count shifted left by 56.
750 __vec_strmb(ptr, 5, data);
753 // BE-PWR8-LABEL: @test_strmb6(
754 // BE-PWR8-NEXT: entry:
755 // BE-PWR8-NEXT: [[PTR_ADDR:%.*]] = alloca ptr, align 8
756 // BE-PWR8-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16
757 // BE-PWR8-NEXT: store ptr [[PTR:%.*]], ptr [[PTR_ADDR]], align 8
758 // BE-PWR8-NEXT: store <16 x i8> [[DATA:%.*]], ptr [[DATA_ADDR]], align 16
759 // BE-PWR8-NEXT: [[TMP0:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8
760 // BE-PWR8-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr [[DATA_ADDR]], align 16
761 // BE-PWR8-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
762 // BE-PWR8-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[TMP0]], i64 2
763 // BE-PWR8-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[TMP2]], i64 3
764 // BE-PWR8-NEXT: store i32 [[TMP5]], ptr [[TMP3]], align 1
765 // BE-PWR8-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
766 // BE-PWR8-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP0]], i64 0
767 // BE-PWR8-NEXT: [[TMP9:%.*]] = extractelement <8 x i16> [[TMP6]], i64 5
768 // BE-PWR8-NEXT: store i16 [[TMP9]], ptr [[TMP7]], align 1
769 // BE-PWR8-NEXT: ret void
771 // LE-PWR8-LABEL: @test_strmb6(
772 // LE-PWR8-NEXT: entry:
773 // LE-PWR8-NEXT: [[PTR_ADDR:%.*]] = alloca ptr, align 8
774 // LE-PWR8-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16
775 // LE-PWR8-NEXT: store ptr [[PTR:%.*]], ptr [[PTR_ADDR]], align 8
776 // LE-PWR8-NEXT: store <16 x i8> [[DATA:%.*]], ptr [[DATA_ADDR]], align 16
777 // LE-PWR8-NEXT: [[TMP0:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8
778 // LE-PWR8-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr [[DATA_ADDR]], align 16
779 // LE-PWR8-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
780 // LE-PWR8-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[TMP0]], i64 2
781 // LE-PWR8-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[TMP2]], i64 0
782 // LE-PWR8-NEXT: [[TMP6:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP5]])
783 // LE-PWR8-NEXT: store i32 [[TMP6]], ptr [[TMP3]], align 1
784 // LE-PWR8-NEXT: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
785 // LE-PWR8-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[TMP0]], i64 0
786 // LE-PWR8-NEXT: [[TMP10:%.*]] = extractelement <8 x i16> [[TMP7]], i64 2
787 // LE-PWR8-NEXT: [[TMP11:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP10]])
788 // LE-PWR8-NEXT: store i16 [[TMP11]], ptr [[TMP8]], align 1
789 // LE-PWR8-NEXT: ret void
791 // BE-PWR9-LABEL: @test_strmb6(
792 // BE-PWR9-NEXT: entry:
793 // BE-PWR9-NEXT: [[__A_ADDR_I:%.*]] = alloca <16 x i8>, align 16
794 // BE-PWR9-NEXT: [[__B_ADDR_I:%.*]] = alloca ptr, align 8
795 // BE-PWR9-NEXT: [[__C_ADDR_I:%.*]] = alloca i64, align 8
796 // BE-PWR9-NEXT: [[__MASK_I:%.*]] = alloca <16 x i8>, align 16
797 // BE-PWR9-NEXT: [[__RES_I:%.*]] = alloca <16 x i8>, align 16
798 // BE-PWR9-NEXT: [[PTR_ADDR:%.*]] = alloca ptr, align 8
799 // BE-PWR9-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16
800 // BE-PWR9-NEXT: store ptr [[PTR:%.*]], ptr [[PTR_ADDR]], align 8
801 // BE-PWR9-NEXT: store <16 x i8> [[DATA:%.*]], ptr [[DATA_ADDR]], align 16
802 // BE-PWR9-NEXT: [[TMP0:%.*]] = load <16 x i8>, ptr [[DATA_ADDR]], align 16
803 // BE-PWR9-NEXT: [[TMP1:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8
804 // BE-PWR9-NEXT: store <16 x i8> [[TMP0]], ptr [[__A_ADDR_I]], align 16
805 // BE-PWR9-NEXT: store ptr [[TMP1]], ptr [[__B_ADDR_I]], align 8
806 // BE-PWR9-NEXT: store i64 6, ptr [[__C_ADDR_I]], align 8
807 // BE-PWR9-NEXT: [[TMP2:%.*]] = load i64, ptr [[__C_ADDR_I]], align 8
808 // BE-PWR9-NEXT: [[SUB_I:%.*]] = sub i64 16, [[TMP2]]
809 // BE-PWR9-NEXT: [[CONV_I:%.*]] = trunc i64 [[SUB_I]] to i8
810 // BE-PWR9-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr null, i8 [[CONV_I]]
811 // BE-PWR9-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(ptr [[TMP3]])
812 // BE-PWR9-NEXT: store <16 x i8> [[TMP4]], ptr [[__MASK_I]], align 16
813 // BE-PWR9-NEXT: [[TMP5:%.*]] = load <16 x i8>, ptr [[__A_ADDR_I]], align 16
814 // BE-PWR9-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
815 // BE-PWR9-NEXT: [[TMP7:%.*]] = load <16 x i8>, ptr [[__A_ADDR_I]], align 16
816 // BE-PWR9-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
817 // BE-PWR9-NEXT: [[TMP9:%.*]] = load <16 x i8>, ptr [[__MASK_I]], align 16
818 // BE-PWR9-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]])
819 // BE-PWR9-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8>
820 // BE-PWR9-NEXT: store <16 x i8> [[TMP11]], ptr [[__RES_I]], align 16
821 // BE-PWR9-NEXT: [[TMP12:%.*]] = load <16 x i8>, ptr [[__RES_I]], align 16
822 // BE-PWR9-NEXT: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP12]] to <4 x i32>
823 // BE-PWR9-NEXT: [[TMP14:%.*]] = load ptr, ptr [[__B_ADDR_I]], align 8
824 // BE-PWR9-NEXT: [[TMP15:%.*]] = load i64, ptr [[__C_ADDR_I]], align 8
825 // BE-PWR9-NEXT: [[SHL_I:%.*]] = shl i64 [[TMP15]], 56
826 // BE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], ptr [[TMP14]], i64 [[SHL_I]])
827 // BE-PWR9-NEXT: ret void
829 // LE-PWR9-LABEL: @test_strmb6(
830 // LE-PWR9-NEXT: entry:
831 // LE-PWR9-NEXT: [[__A_ADDR_I:%.*]] = alloca <16 x i8>, align 16
832 // LE-PWR9-NEXT: [[__B_ADDR_I:%.*]] = alloca ptr, align 8
833 // LE-PWR9-NEXT: [[__C_ADDR_I:%.*]] = alloca i64, align 8
834 // LE-PWR9-NEXT: [[__MASK_I:%.*]] = alloca <16 x i8>, align 16
835 // LE-PWR9-NEXT: [[__RES_I:%.*]] = alloca <16 x i8>, align 16
836 // LE-PWR9-NEXT: [[PTR_ADDR:%.*]] = alloca ptr, align 8
837 // LE-PWR9-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16
838 // LE-PWR9-NEXT: store ptr [[PTR:%.*]], ptr [[PTR_ADDR]], align 8
839 // LE-PWR9-NEXT: store <16 x i8> [[DATA:%.*]], ptr [[DATA_ADDR]], align 16
840 // LE-PWR9-NEXT: [[TMP0:%.*]] = load <16 x i8>, ptr [[DATA_ADDR]], align 16
841 // LE-PWR9-NEXT: [[TMP1:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8
842 // LE-PWR9-NEXT: store <16 x i8> [[TMP0]], ptr [[__A_ADDR_I]], align 16
843 // LE-PWR9-NEXT: store ptr [[TMP1]], ptr [[__B_ADDR_I]], align 8
844 // LE-PWR9-NEXT: store i64 6, ptr [[__C_ADDR_I]], align 8
845 // LE-PWR9-NEXT: [[TMP2:%.*]] = load i64, ptr [[__C_ADDR_I]], align 8
846 // LE-PWR9-NEXT: [[SUB_I:%.*]] = sub i64 16, [[TMP2]]
847 // LE-PWR9-NEXT: [[CONV_I:%.*]] = trunc i64 [[SUB_I]] to i8
848 // LE-PWR9-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr null, i8 [[CONV_I]]
849 // LE-PWR9-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(ptr [[TMP3]])
850 // LE-PWR9-NEXT: store <16 x i8> [[TMP4]], ptr [[__MASK_I]], align 16
851 // LE-PWR9-NEXT: [[TMP5:%.*]] = load <16 x i8>, ptr [[__A_ADDR_I]], align 16
852 // LE-PWR9-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
853 // LE-PWR9-NEXT: [[TMP7:%.*]] = load <16 x i8>, ptr [[__A_ADDR_I]], align 16
854 // LE-PWR9-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
855 // LE-PWR9-NEXT: [[TMP9:%.*]] = load <16 x i8>, ptr [[__MASK_I]], align 16
856 // LE-PWR9-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]])
857 // LE-PWR9-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8>
858 // LE-PWR9-NEXT: store <16 x i8> [[TMP11]], ptr [[__RES_I]], align 16
859 // LE-PWR9-NEXT: [[TMP12:%.*]] = load <16 x i8>, ptr [[__RES_I]], align 16
860 // LE-PWR9-NEXT: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP12]] to <4 x i32>
861 // LE-PWR9-NEXT: [[TMP14:%.*]] = load ptr, ptr [[__B_ADDR_I]], align 8
862 // LE-PWR9-NEXT: [[TMP15:%.*]] = load i64, ptr [[__C_ADDR_I]], align 8
863 // LE-PWR9-NEXT: [[SHL_I:%.*]] = shl i64 [[TMP15]], 56
864 // LE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], ptr [[TMP14]], i64 [[SHL_I]])
865 // LE-PWR9-NEXT: ret void
867 // BE32-PWR9-LABEL: @test_strmb6(
868 // BE32-PWR9-NEXT: entry:
869 // BE32-PWR9-NEXT: [[PTR_ADDR:%.*]] = alloca ptr, align 4
870 // BE32-PWR9-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16
871 // BE32-PWR9-NEXT: store ptr [[PTR:%.*]], ptr [[PTR_ADDR]], align 4
872 // BE32-PWR9-NEXT: store <16 x i8> [[DATA:%.*]], ptr [[DATA_ADDR]], align 16
873 // BE32-PWR9-NEXT: [[TMP0:%.*]] = load ptr, ptr [[PTR_ADDR]], align 4
874 // BE32-PWR9-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr [[DATA_ADDR]], align 16
875 // BE32-PWR9-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
876 // BE32-PWR9-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[TMP0]], i64 2
877 // BE32-PWR9-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[TMP2]], i64 3
878 // BE32-PWR9-NEXT: store i32 [[TMP5]], ptr [[TMP3]], align 1
879 // BE32-PWR9-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
880 // BE32-PWR9-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP0]], i64 0
881 // BE32-PWR9-NEXT: [[TMP9:%.*]] = extractelement <8 x i16> [[TMP6]], i64 5
882 // BE32-PWR9-NEXT: store i16 [[TMP9]], ptr [[TMP7]], align 1
883 // BE32-PWR9-NEXT: ret void
885 void test_strmb6(char *ptr, vector unsigned char data) {
// Exercises __vec_strmb with a constant byte count of 6. Per the autogenerated
// checks above, pwr8 and 32-bit pwr9 expect an i32 store at byte offset 2
// followed by an i16 store at byte offset 0 (word element 3 and halfword
// element 5 on big-endian; word element 0 with llvm.bswap.i32 and halfword
// element 2 with llvm.bswap.i16 on little-endian), while 64-bit pwr9 expects
// an lvsl/vperm shuffle feeding llvm.ppc.vsx.stxvll with the count shifted
// left by 56.
886 __vec_strmb(ptr, 6, data);
889 // BE-PWR8-LABEL: @test_strmb7(
890 // BE-PWR8-NEXT: entry:
891 // BE-PWR8-NEXT: [[PTR_ADDR:%.*]] = alloca ptr, align 8
892 // BE-PWR8-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16
893 // BE-PWR8-NEXT: store ptr [[PTR:%.*]], ptr [[PTR_ADDR]], align 8
894 // BE-PWR8-NEXT: store <16 x i8> [[DATA:%.*]], ptr [[DATA_ADDR]], align 16
895 // BE-PWR8-NEXT: [[TMP0:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8
896 // BE-PWR8-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr [[DATA_ADDR]], align 16
897 // BE-PWR8-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
898 // BE-PWR8-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[TMP0]], i64 3
899 // BE-PWR8-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[TMP2]], i64 3
900 // BE-PWR8-NEXT: store i32 [[TMP5]], ptr [[TMP3]], align 1
901 // BE-PWR8-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
902 // BE-PWR8-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP0]], i64 1
903 // BE-PWR8-NEXT: [[TMP9:%.*]] = extractelement <8 x i16> [[TMP6]], i64 5
904 // BE-PWR8-NEXT: store i16 [[TMP9]], ptr [[TMP7]], align 1
905 // BE-PWR8-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[TMP0]], i64 0
906 // BE-PWR8-NEXT: [[TMP11:%.*]] = extractelement <16 x i8> [[TMP1]], i64 9
907 // BE-PWR8-NEXT: store i8 [[TMP11]], ptr [[TMP10]], align 1
908 // BE-PWR8-NEXT: ret void
910 // LE-PWR8-LABEL: @test_strmb7(
911 // LE-PWR8-NEXT: entry:
912 // LE-PWR8-NEXT: [[PTR_ADDR:%.*]] = alloca ptr, align 8
913 // LE-PWR8-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16
914 // LE-PWR8-NEXT: store ptr [[PTR:%.*]], ptr [[PTR_ADDR]], align 8
915 // LE-PWR8-NEXT: store <16 x i8> [[DATA:%.*]], ptr [[DATA_ADDR]], align 16
916 // LE-PWR8-NEXT: [[TMP0:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8
917 // LE-PWR8-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr [[DATA_ADDR]], align 16
918 // LE-PWR8-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
919 // LE-PWR8-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[TMP0]], i64 3
920 // LE-PWR8-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[TMP2]], i64 0
921 // LE-PWR8-NEXT: [[TMP6:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP5]])
922 // LE-PWR8-NEXT: store i32 [[TMP6]], ptr [[TMP3]], align 1
923 // LE-PWR8-NEXT: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
924 // LE-PWR8-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[TMP0]], i64 1
925 // LE-PWR8-NEXT: [[TMP10:%.*]] = extractelement <8 x i16> [[TMP7]], i64 2
926 // LE-PWR8-NEXT: [[TMP11:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP10]])
927 // LE-PWR8-NEXT: store i16 [[TMP11]], ptr [[TMP8]], align 1
928 // LE-PWR8-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr [[TMP0]], i64 0
929 // LE-PWR8-NEXT: [[TMP13:%.*]] = extractelement <16 x i8> [[TMP1]], i64 6
930 // LE-PWR8-NEXT: store i8 [[TMP13]], ptr [[TMP12]], align 1
931 // LE-PWR8-NEXT: ret void
933 // BE-PWR9-LABEL: @test_strmb7(
934 // BE-PWR9-NEXT: entry:
935 // BE-PWR9-NEXT: [[__A_ADDR_I:%.*]] = alloca <16 x i8>, align 16
936 // BE-PWR9-NEXT: [[__B_ADDR_I:%.*]] = alloca ptr, align 8
937 // BE-PWR9-NEXT: [[__C_ADDR_I:%.*]] = alloca i64, align 8
938 // BE-PWR9-NEXT: [[__MASK_I:%.*]] = alloca <16 x i8>, align 16
939 // BE-PWR9-NEXT: [[__RES_I:%.*]] = alloca <16 x i8>, align 16
940 // BE-PWR9-NEXT: [[PTR_ADDR:%.*]] = alloca ptr, align 8
941 // BE-PWR9-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16
942 // BE-PWR9-NEXT: store ptr [[PTR:%.*]], ptr [[PTR_ADDR]], align 8
943 // BE-PWR9-NEXT: store <16 x i8> [[DATA:%.*]], ptr [[DATA_ADDR]], align 16
944 // BE-PWR9-NEXT: [[TMP0:%.*]] = load <16 x i8>, ptr [[DATA_ADDR]], align 16
945 // BE-PWR9-NEXT: [[TMP1:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8
946 // BE-PWR9-NEXT: store <16 x i8> [[TMP0]], ptr [[__A_ADDR_I]], align 16
947 // BE-PWR9-NEXT: store ptr [[TMP1]], ptr [[__B_ADDR_I]], align 8
948 // BE-PWR9-NEXT: store i64 7, ptr [[__C_ADDR_I]], align 8
949 // BE-PWR9-NEXT: [[TMP2:%.*]] = load i64, ptr [[__C_ADDR_I]], align 8
950 // BE-PWR9-NEXT: [[SUB_I:%.*]] = sub i64 16, [[TMP2]]
951 // BE-PWR9-NEXT: [[CONV_I:%.*]] = trunc i64 [[SUB_I]] to i8
952 // BE-PWR9-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr null, i8 [[CONV_I]]
953 // BE-PWR9-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(ptr [[TMP3]])
954 // BE-PWR9-NEXT: store <16 x i8> [[TMP4]], ptr [[__MASK_I]], align 16
955 // BE-PWR9-NEXT: [[TMP5:%.*]] = load <16 x i8>, ptr [[__A_ADDR_I]], align 16
956 // BE-PWR9-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
957 // BE-PWR9-NEXT: [[TMP7:%.*]] = load <16 x i8>, ptr [[__A_ADDR_I]], align 16
958 // BE-PWR9-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
959 // BE-PWR9-NEXT: [[TMP9:%.*]] = load <16 x i8>, ptr [[__MASK_I]], align 16
960 // BE-PWR9-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]])
961 // BE-PWR9-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8>
962 // BE-PWR9-NEXT: store <16 x i8> [[TMP11]], ptr [[__RES_I]], align 16
963 // BE-PWR9-NEXT: [[TMP12:%.*]] = load <16 x i8>, ptr [[__RES_I]], align 16
964 // BE-PWR9-NEXT: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP12]] to <4 x i32>
965 // BE-PWR9-NEXT: [[TMP14:%.*]] = load ptr, ptr [[__B_ADDR_I]], align 8
966 // BE-PWR9-NEXT: [[TMP15:%.*]] = load i64, ptr [[__C_ADDR_I]], align 8
967 // BE-PWR9-NEXT: [[SHL_I:%.*]] = shl i64 [[TMP15]], 56
968 // BE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], ptr [[TMP14]], i64 [[SHL_I]])
969 // BE-PWR9-NEXT: ret void
971 // LE-PWR9-LABEL: @test_strmb7(
972 // LE-PWR9-NEXT: entry:
973 // LE-PWR9-NEXT: [[__A_ADDR_I:%.*]] = alloca <16 x i8>, align 16
974 // LE-PWR9-NEXT: [[__B_ADDR_I:%.*]] = alloca ptr, align 8
975 // LE-PWR9-NEXT: [[__C_ADDR_I:%.*]] = alloca i64, align 8
976 // LE-PWR9-NEXT: [[__MASK_I:%.*]] = alloca <16 x i8>, align 16
977 // LE-PWR9-NEXT: [[__RES_I:%.*]] = alloca <16 x i8>, align 16
978 // LE-PWR9-NEXT: [[PTR_ADDR:%.*]] = alloca ptr, align 8
979 // LE-PWR9-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16
980 // LE-PWR9-NEXT: store ptr [[PTR:%.*]], ptr [[PTR_ADDR]], align 8
981 // LE-PWR9-NEXT: store <16 x i8> [[DATA:%.*]], ptr [[DATA_ADDR]], align 16
982 // LE-PWR9-NEXT: [[TMP0:%.*]] = load <16 x i8>, ptr [[DATA_ADDR]], align 16
983 // LE-PWR9-NEXT: [[TMP1:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8
984 // LE-PWR9-NEXT: store <16 x i8> [[TMP0]], ptr [[__A_ADDR_I]], align 16
985 // LE-PWR9-NEXT: store ptr [[TMP1]], ptr [[__B_ADDR_I]], align 8
986 // LE-PWR9-NEXT: store i64 7, ptr [[__C_ADDR_I]], align 8
987 // LE-PWR9-NEXT: [[TMP2:%.*]] = load i64, ptr [[__C_ADDR_I]], align 8
988 // LE-PWR9-NEXT: [[SUB_I:%.*]] = sub i64 16, [[TMP2]]
989 // LE-PWR9-NEXT: [[CONV_I:%.*]] = trunc i64 [[SUB_I]] to i8
990 // LE-PWR9-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr null, i8 [[CONV_I]]
991 // LE-PWR9-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(ptr [[TMP3]])
992 // LE-PWR9-NEXT: store <16 x i8> [[TMP4]], ptr [[__MASK_I]], align 16
993 // LE-PWR9-NEXT: [[TMP5:%.*]] = load <16 x i8>, ptr [[__A_ADDR_I]], align 16
994 // LE-PWR9-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
995 // LE-PWR9-NEXT: [[TMP7:%.*]] = load <16 x i8>, ptr [[__A_ADDR_I]], align 16
996 // LE-PWR9-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
997 // LE-PWR9-NEXT: [[TMP9:%.*]] = load <16 x i8>, ptr [[__MASK_I]], align 16
998 // LE-PWR9-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]])
999 // LE-PWR9-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8>
1000 // LE-PWR9-NEXT: store <16 x i8> [[TMP11]], ptr [[__RES_I]], align 16
1001 // LE-PWR9-NEXT: [[TMP12:%.*]] = load <16 x i8>, ptr [[__RES_I]], align 16
1002 // LE-PWR9-NEXT: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP12]] to <4 x i32>
1003 // LE-PWR9-NEXT: [[TMP14:%.*]] = load ptr, ptr [[__B_ADDR_I]], align 8
1004 // LE-PWR9-NEXT: [[TMP15:%.*]] = load i64, ptr [[__C_ADDR_I]], align 8
1005 // LE-PWR9-NEXT: [[SHL_I:%.*]] = shl i64 [[TMP15]], 56
1006 // LE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], ptr [[TMP14]], i64 [[SHL_I]])
1007 // LE-PWR9-NEXT: ret void
1009 // BE32-PWR9-LABEL: @test_strmb7(
1010 // BE32-PWR9-NEXT: entry:
1011 // BE32-PWR9-NEXT: [[PTR_ADDR:%.*]] = alloca ptr, align 4
1012 // BE32-PWR9-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16
1013 // BE32-PWR9-NEXT: store ptr [[PTR:%.*]], ptr [[PTR_ADDR]], align 4
1014 // BE32-PWR9-NEXT: store <16 x i8> [[DATA:%.*]], ptr [[DATA_ADDR]], align 16
1015 // BE32-PWR9-NEXT: [[TMP0:%.*]] = load ptr, ptr [[PTR_ADDR]], align 4
1016 // BE32-PWR9-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr [[DATA_ADDR]], align 16
1017 // BE32-PWR9-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
1018 // BE32-PWR9-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[TMP0]], i64 3
1019 // BE32-PWR9-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[TMP2]], i64 3
1020 // BE32-PWR9-NEXT: store i32 [[TMP5]], ptr [[TMP3]], align 1
1021 // BE32-PWR9-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
1022 // BE32-PWR9-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP0]], i64 1
1023 // BE32-PWR9-NEXT: [[TMP9:%.*]] = extractelement <8 x i16> [[TMP6]], i64 5
1024 // BE32-PWR9-NEXT: store i16 [[TMP9]], ptr [[TMP7]], align 1
1025 // BE32-PWR9-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[TMP0]], i64 0
1026 // BE32-PWR9-NEXT: [[TMP11:%.*]] = extractelement <16 x i8> [[TMP1]], i64 9
1027 // BE32-PWR9-NEXT: store i8 [[TMP11]], ptr [[TMP10]], align 1
1028 // BE32-PWR9-NEXT: ret void
// Calls __vec_strmb(ptr, 7, data). The assertions above are autogenerated
// (see the NOTE at the top of the file) and pin the exact IR for each RUN
// configuration, so any change to this body requires regenerating them.
// As asserted above: the 64-bit pwr9 runs end in a call to
// @llvm.ppc.vsx.stxvll whose length operand is the constant 7 shifted left
// by 56, while the 32-bit pwr9 run expects three scalar stores (i32 at
// offset 3, i16 at offset 1, i8 at offset 0), i.e. 4 + 2 + 1 = 7 bytes.
1030 void test_strmb7(char *ptr, vector unsigned char data) {
1031 __vec_strmb(ptr, 7, data);
1034 // BE-PWR8-LABEL: @test_strmb8(
1035 // BE-PWR8-NEXT: entry:
1036 // BE-PWR8-NEXT: [[PTR_ADDR:%.*]] = alloca ptr, align 8
1037 // BE-PWR8-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16
1038 // BE-PWR8-NEXT: store ptr [[PTR:%.*]], ptr [[PTR_ADDR]], align 8
1039 // BE-PWR8-NEXT: store <16 x i8> [[DATA:%.*]], ptr [[DATA_ADDR]], align 16
1040 // BE-PWR8-NEXT: [[TMP0:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8
1041 // BE-PWR8-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr [[DATA_ADDR]], align 16
1042 // BE-PWR8-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
1043 // BE-PWR8-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[TMP0]], i64 0
1044 // BE-PWR8-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP2]], i64 1
1045 // BE-PWR8-NEXT: store i64 [[TMP5]], ptr [[TMP3]], align 1
1046 // BE-PWR8-NEXT: ret void
1048 // LE-PWR8-LABEL: @test_strmb8(
1049 // LE-PWR8-NEXT: entry:
1050 // LE-PWR8-NEXT: [[PTR_ADDR:%.*]] = alloca ptr, align 8
1051 // LE-PWR8-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16
1052 // LE-PWR8-NEXT: store ptr [[PTR:%.*]], ptr [[PTR_ADDR]], align 8
1053 // LE-PWR8-NEXT: store <16 x i8> [[DATA:%.*]], ptr [[DATA_ADDR]], align 16
1054 // LE-PWR8-NEXT: [[TMP0:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8
1055 // LE-PWR8-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr [[DATA_ADDR]], align 16
1056 // LE-PWR8-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
1057 // LE-PWR8-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[TMP0]], i64 0
1058 // LE-PWR8-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP2]], i64 0
1059 // LE-PWR8-NEXT: [[TMP6:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP5]])
1060 // LE-PWR8-NEXT: store i64 [[TMP6]], ptr [[TMP3]], align 1
1061 // LE-PWR8-NEXT: ret void
1063 // BE-PWR9-LABEL: @test_strmb8(
1064 // BE-PWR9-NEXT: entry:
1065 // BE-PWR9-NEXT: [[__A_ADDR_I:%.*]] = alloca <16 x i8>, align 16
1066 // BE-PWR9-NEXT: [[__B_ADDR_I:%.*]] = alloca ptr, align 8
1067 // BE-PWR9-NEXT: [[__C_ADDR_I:%.*]] = alloca i64, align 8
1068 // BE-PWR9-NEXT: [[__MASK_I:%.*]] = alloca <16 x i8>, align 16
1069 // BE-PWR9-NEXT: [[__RES_I:%.*]] = alloca <16 x i8>, align 16
1070 // BE-PWR9-NEXT: [[PTR_ADDR:%.*]] = alloca ptr, align 8
1071 // BE-PWR9-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16
1072 // BE-PWR9-NEXT: store ptr [[PTR:%.*]], ptr [[PTR_ADDR]], align 8
1073 // BE-PWR9-NEXT: store <16 x i8> [[DATA:%.*]], ptr [[DATA_ADDR]], align 16
1074 // BE-PWR9-NEXT: [[TMP0:%.*]] = load <16 x i8>, ptr [[DATA_ADDR]], align 16
1075 // BE-PWR9-NEXT: [[TMP1:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8
1076 // BE-PWR9-NEXT: store <16 x i8> [[TMP0]], ptr [[__A_ADDR_I]], align 16
1077 // BE-PWR9-NEXT: store ptr [[TMP1]], ptr [[__B_ADDR_I]], align 8
1078 // BE-PWR9-NEXT: store i64 8, ptr [[__C_ADDR_I]], align 8
1079 // BE-PWR9-NEXT: [[TMP2:%.*]] = load i64, ptr [[__C_ADDR_I]], align 8
1080 // BE-PWR9-NEXT: [[SUB_I:%.*]] = sub i64 16, [[TMP2]]
1081 // BE-PWR9-NEXT: [[CONV_I:%.*]] = trunc i64 [[SUB_I]] to i8
1082 // BE-PWR9-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr null, i8 [[CONV_I]]
1083 // BE-PWR9-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(ptr [[TMP3]])
1084 // BE-PWR9-NEXT: store <16 x i8> [[TMP4]], ptr [[__MASK_I]], align 16
1085 // BE-PWR9-NEXT: [[TMP5:%.*]] = load <16 x i8>, ptr [[__A_ADDR_I]], align 16
1086 // BE-PWR9-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
1087 // BE-PWR9-NEXT: [[TMP7:%.*]] = load <16 x i8>, ptr [[__A_ADDR_I]], align 16
1088 // BE-PWR9-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
1089 // BE-PWR9-NEXT: [[TMP9:%.*]] = load <16 x i8>, ptr [[__MASK_I]], align 16
1090 // BE-PWR9-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]])
1091 // BE-PWR9-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8>
1092 // BE-PWR9-NEXT: store <16 x i8> [[TMP11]], ptr [[__RES_I]], align 16
1093 // BE-PWR9-NEXT: [[TMP12:%.*]] = load <16 x i8>, ptr [[__RES_I]], align 16
1094 // BE-PWR9-NEXT: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP12]] to <4 x i32>
1095 // BE-PWR9-NEXT: [[TMP14:%.*]] = load ptr, ptr [[__B_ADDR_I]], align 8
1096 // BE-PWR9-NEXT: [[TMP15:%.*]] = load i64, ptr [[__C_ADDR_I]], align 8
1097 // BE-PWR9-NEXT: [[SHL_I:%.*]] = shl i64 [[TMP15]], 56
1098 // BE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], ptr [[TMP14]], i64 [[SHL_I]])
1099 // BE-PWR9-NEXT: ret void
1101 // LE-PWR9-LABEL: @test_strmb8(
1102 // LE-PWR9-NEXT: entry:
1103 // LE-PWR9-NEXT: [[__A_ADDR_I:%.*]] = alloca <16 x i8>, align 16
1104 // LE-PWR9-NEXT: [[__B_ADDR_I:%.*]] = alloca ptr, align 8
1105 // LE-PWR9-NEXT: [[__C_ADDR_I:%.*]] = alloca i64, align 8
1106 // LE-PWR9-NEXT: [[__MASK_I:%.*]] = alloca <16 x i8>, align 16
1107 // LE-PWR9-NEXT: [[__RES_I:%.*]] = alloca <16 x i8>, align 16
1108 // LE-PWR9-NEXT: [[PTR_ADDR:%.*]] = alloca ptr, align 8
1109 // LE-PWR9-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16
1110 // LE-PWR9-NEXT: store ptr [[PTR:%.*]], ptr [[PTR_ADDR]], align 8
1111 // LE-PWR9-NEXT: store <16 x i8> [[DATA:%.*]], ptr [[DATA_ADDR]], align 16
1112 // LE-PWR9-NEXT: [[TMP0:%.*]] = load <16 x i8>, ptr [[DATA_ADDR]], align 16
1113 // LE-PWR9-NEXT: [[TMP1:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8
1114 // LE-PWR9-NEXT: store <16 x i8> [[TMP0]], ptr [[__A_ADDR_I]], align 16
1115 // LE-PWR9-NEXT: store ptr [[TMP1]], ptr [[__B_ADDR_I]], align 8
1116 // LE-PWR9-NEXT: store i64 8, ptr [[__C_ADDR_I]], align 8
1117 // LE-PWR9-NEXT: [[TMP2:%.*]] = load i64, ptr [[__C_ADDR_I]], align 8
1118 // LE-PWR9-NEXT: [[SUB_I:%.*]] = sub i64 16, [[TMP2]]
1119 // LE-PWR9-NEXT: [[CONV_I:%.*]] = trunc i64 [[SUB_I]] to i8
1120 // LE-PWR9-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr null, i8 [[CONV_I]]
1121 // LE-PWR9-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(ptr [[TMP3]])
1122 // LE-PWR9-NEXT: store <16 x i8> [[TMP4]], ptr [[__MASK_I]], align 16
1123 // LE-PWR9-NEXT: [[TMP5:%.*]] = load <16 x i8>, ptr [[__A_ADDR_I]], align 16
1124 // LE-PWR9-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
1125 // LE-PWR9-NEXT: [[TMP7:%.*]] = load <16 x i8>, ptr [[__A_ADDR_I]], align 16
1126 // LE-PWR9-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
1127 // LE-PWR9-NEXT: [[TMP9:%.*]] = load <16 x i8>, ptr [[__MASK_I]], align 16
1128 // LE-PWR9-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]])
1129 // LE-PWR9-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8>
1130 // LE-PWR9-NEXT: store <16 x i8> [[TMP11]], ptr [[__RES_I]], align 16
1131 // LE-PWR9-NEXT: [[TMP12:%.*]] = load <16 x i8>, ptr [[__RES_I]], align 16
1132 // LE-PWR9-NEXT: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP12]] to <4 x i32>
1133 // LE-PWR9-NEXT: [[TMP14:%.*]] = load ptr, ptr [[__B_ADDR_I]], align 8
1134 // LE-PWR9-NEXT: [[TMP15:%.*]] = load i64, ptr [[__C_ADDR_I]], align 8
1135 // LE-PWR9-NEXT: [[SHL_I:%.*]] = shl i64 [[TMP15]], 56
1136 // LE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], ptr [[TMP14]], i64 [[SHL_I]])
1137 // LE-PWR9-NEXT: ret void
1139 // BE32-PWR9-LABEL: @test_strmb8(
1140 // BE32-PWR9-NEXT: entry:
1141 // BE32-PWR9-NEXT: [[PTR_ADDR:%.*]] = alloca ptr, align 4
1142 // BE32-PWR9-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16
1143 // BE32-PWR9-NEXT: store ptr [[PTR:%.*]], ptr [[PTR_ADDR]], align 4
1144 // BE32-PWR9-NEXT: store <16 x i8> [[DATA:%.*]], ptr [[DATA_ADDR]], align 16
1145 // BE32-PWR9-NEXT: [[TMP0:%.*]] = load ptr, ptr [[PTR_ADDR]], align 4
1146 // BE32-PWR9-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr [[DATA_ADDR]], align 16
1147 // BE32-PWR9-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
1148 // BE32-PWR9-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[TMP0]], i64 0
1149 // BE32-PWR9-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP2]], i64 1
1150 // BE32-PWR9-NEXT: store i64 [[TMP5]], ptr [[TMP3]], align 1
1151 // BE32-PWR9-NEXT: ret void
// Calls __vec_strmb(ptr, 8, data). The assertions above are autogenerated
// and pin the exact IR for each RUN configuration; regenerate them if this
// body changes.
// As asserted above: the big-endian pwr8 run and the 32-bit pwr9 run expect
// a single i64 store of element 1 of the data vector at offset 0; the
// little-endian pwr8 run extracts element 0 and passes it through
// @llvm.bswap.i64 before the store; the 64-bit pwr9 runs end in a call to
// @llvm.ppc.vsx.stxvll with the constant 8 shifted left by 56.
1153 void test_strmb8(char *ptr, vector unsigned char data) {
1154 __vec_strmb(ptr, 8, data);
1156 // BE-PWR8-LABEL: @test_ldrmb9(
1157 // BE-PWR8-NEXT: entry:
1158 // BE-PWR8-NEXT: [[PTR_ADDR:%.*]] = alloca ptr, align 8
1159 // BE-PWR8-NEXT: store ptr [[PTR:%.*]], ptr [[PTR_ADDR]], align 8
1160 // BE-PWR8-NEXT: [[TMP0:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8
1161 // BE-PWR8-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[TMP0]], i32 8
1162 // BE-PWR8-NEXT: [[LD_LO:%.*]] = call <4 x i32> @llvm.ppc.altivec.lvx(ptr [[TMP0]])
1163 // BE-PWR8-NEXT: [[LD_HI:%.*]] = call <4 x i32> @llvm.ppc.altivec.lvx(ptr [[TMP1]])
1164 // BE-PWR8-NEXT: [[MASK1:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(ptr [[TMP0]])
1165 // BE-PWR8-NEXT: [[SHUFFLE1:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[LD_LO]], <4 x i32> [[LD_HI]], <16 x i8> [[MASK1]])
1166 // BE-PWR8-NEXT: [[SHUFFLE2:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> zeroinitializer, <4 x i32> [[SHUFFLE1]], <16 x i8> <i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24>)
1167 // BE-PWR8-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[SHUFFLE2]] to <16 x i8>
1168 // BE-PWR8-NEXT: ret <16 x i8> [[TMP2]]
1170 // LE-PWR8-LABEL: @test_ldrmb9(
1171 // LE-PWR8-NEXT: entry:
1172 // LE-PWR8-NEXT: [[PTR_ADDR:%.*]] = alloca ptr, align 8
1173 // LE-PWR8-NEXT: store ptr [[PTR:%.*]], ptr [[PTR_ADDR]], align 8
1174 // LE-PWR8-NEXT: [[TMP0:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8
1175 // LE-PWR8-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[TMP0]], i32 8
1176 // LE-PWR8-NEXT: [[LD_LO:%.*]] = call <4 x i32> @llvm.ppc.altivec.lvx(ptr [[TMP0]])
1177 // LE-PWR8-NEXT: [[LD_HI:%.*]] = call <4 x i32> @llvm.ppc.altivec.lvx(ptr [[TMP1]])
1178 // LE-PWR8-NEXT: [[MASK1:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsr(ptr [[TMP0]])
1179 // LE-PWR8-NEXT: [[SHUFFLE1:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[LD_HI]], <4 x i32> [[LD_LO]], <16 x i8> [[MASK1]])
1180 // LE-PWR8-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[SHUFFLE1]] to <16 x i8>
1181 // LE-PWR8-NEXT: [[TMP3:%.*]] = shufflevector <16 x i8> [[TMP2]], <16 x i8> zeroinitializer, <16 x i32> <i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22>
1182 // LE-PWR8-NEXT: ret <16 x i8> [[TMP3]]
1184 // BE-PWR9-LABEL: @test_ldrmb9(
1185 // BE-PWR9-NEXT: entry:
1186 // BE-PWR9-NEXT: [[__A_ADDR_I:%.*]] = alloca ptr, align 8
1187 // BE-PWR9-NEXT: [[__B_ADDR_I:%.*]] = alloca i64, align 8
1188 // BE-PWR9-NEXT: [[__RES_I:%.*]] = alloca <16 x i8>, align 16
1189 // BE-PWR9-NEXT: [[__MASK_I:%.*]] = alloca <16 x i8>, align 16
1190 // BE-PWR9-NEXT: [[PTR_ADDR:%.*]] = alloca ptr, align 8
1191 // BE-PWR9-NEXT: store ptr [[PTR:%.*]], ptr [[PTR_ADDR]], align 8
1192 // BE-PWR9-NEXT: [[TMP0:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8
1193 // BE-PWR9-NEXT: store ptr [[TMP0]], ptr [[__A_ADDR_I]], align 8
1194 // BE-PWR9-NEXT: store i64 9, ptr [[__B_ADDR_I]], align 8
1195 // BE-PWR9-NEXT: [[TMP1:%.*]] = load ptr, ptr [[__A_ADDR_I]], align 8
1196 // BE-PWR9-NEXT: [[TMP2:%.*]] = load i64, ptr [[__B_ADDR_I]], align 8
1197 // BE-PWR9-NEXT: [[SHL_I:%.*]] = shl i64 [[TMP2]], 56
1198 // BE-PWR9-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.ppc.vsx.lxvll(ptr [[TMP1]], i64 [[SHL_I]])
1199 // BE-PWR9-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
1200 // BE-PWR9-NEXT: store <16 x i8> [[TMP4]], ptr [[__RES_I]], align 16
1201 // BE-PWR9-NEXT: [[TMP5:%.*]] = load i64, ptr [[__B_ADDR_I]], align 8
1202 // BE-PWR9-NEXT: [[SUB_I:%.*]] = sub i64 16, [[TMP5]]
1203 // BE-PWR9-NEXT: [[CONV_I:%.*]] = trunc i64 [[SUB_I]] to i8
1204 // BE-PWR9-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr null, i8 [[CONV_I]]
1205 // BE-PWR9-NEXT: [[TMP7:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsr(ptr [[TMP6]])
1206 // BE-PWR9-NEXT: store <16 x i8> [[TMP7]], ptr [[__MASK_I]], align 16
1207 // BE-PWR9-NEXT: [[TMP8:%.*]] = load <16 x i8>, ptr [[__RES_I]], align 16
1208 // BE-PWR9-NEXT: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32>
1209 // BE-PWR9-NEXT: [[TMP10:%.*]] = load <16 x i8>, ptr [[__RES_I]], align 16
1210 // BE-PWR9-NEXT: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP10]] to <4 x i32>
1211 // BE-PWR9-NEXT: [[TMP12:%.*]] = load <16 x i8>, ptr [[__MASK_I]], align 16
1212 // BE-PWR9-NEXT: [[TMP13:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP9]], <4 x i32> [[TMP11]], <16 x i8> [[TMP12]])
1213 // BE-PWR9-NEXT: [[TMP14:%.*]] = bitcast <4 x i32> [[TMP13]] to <16 x i8>
1214 // BE-PWR9-NEXT: ret <16 x i8> [[TMP14]]
1216 // LE-PWR9-LABEL: @test_ldrmb9(
1217 // LE-PWR9-NEXT: entry:
1218 // LE-PWR9-NEXT: [[__A_ADDR_I:%.*]] = alloca ptr, align 8
1219 // LE-PWR9-NEXT: [[__B_ADDR_I:%.*]] = alloca i64, align 8
1220 // LE-PWR9-NEXT: [[__RES_I:%.*]] = alloca <16 x i8>, align 16
1221 // LE-PWR9-NEXT: [[__MASK_I:%.*]] = alloca <16 x i8>, align 16
1222 // LE-PWR9-NEXT: [[PTR_ADDR:%.*]] = alloca ptr, align 8
1223 // LE-PWR9-NEXT: store ptr [[PTR:%.*]], ptr [[PTR_ADDR]], align 8
1224 // LE-PWR9-NEXT: [[TMP0:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8
1225 // LE-PWR9-NEXT: store ptr [[TMP0]], ptr [[__A_ADDR_I]], align 8
1226 // LE-PWR9-NEXT: store i64 9, ptr [[__B_ADDR_I]], align 8
1227 // LE-PWR9-NEXT: [[TMP1:%.*]] = load ptr, ptr [[__A_ADDR_I]], align 8
1228 // LE-PWR9-NEXT: [[TMP2:%.*]] = load i64, ptr [[__B_ADDR_I]], align 8
1229 // LE-PWR9-NEXT: [[SHL_I:%.*]] = shl i64 [[TMP2]], 56
1230 // LE-PWR9-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.ppc.vsx.lxvll(ptr [[TMP1]], i64 [[SHL_I]])
1231 // LE-PWR9-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
1232 // LE-PWR9-NEXT: store <16 x i8> [[TMP4]], ptr [[__RES_I]], align 16
1233 // LE-PWR9-NEXT: [[TMP5:%.*]] = load i64, ptr [[__B_ADDR_I]], align 8
1234 // LE-PWR9-NEXT: [[SUB_I:%.*]] = sub i64 16, [[TMP5]]
1235 // LE-PWR9-NEXT: [[CONV_I:%.*]] = trunc i64 [[SUB_I]] to i8
1236 // LE-PWR9-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr null, i8 [[CONV_I]]
1237 // LE-PWR9-NEXT: [[TMP7:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsr(ptr [[TMP6]])
1238 // LE-PWR9-NEXT: store <16 x i8> [[TMP7]], ptr [[__MASK_I]], align 16
1239 // LE-PWR9-NEXT: [[TMP8:%.*]] = load <16 x i8>, ptr [[__RES_I]], align 16
1240 // LE-PWR9-NEXT: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32>
1241 // LE-PWR9-NEXT: [[TMP10:%.*]] = load <16 x i8>, ptr [[__RES_I]], align 16
1242 // LE-PWR9-NEXT: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP10]] to <4 x i32>
1243 // LE-PWR9-NEXT: [[TMP12:%.*]] = load <16 x i8>, ptr [[__MASK_I]], align 16
1244 // LE-PWR9-NEXT: [[TMP13:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP9]], <4 x i32> [[TMP11]], <16 x i8> [[TMP12]])
1245 // LE-PWR9-NEXT: [[TMP14:%.*]] = bitcast <4 x i32> [[TMP13]] to <16 x i8>
1246 // LE-PWR9-NEXT: ret <16 x i8> [[TMP14]]
1248 // BE32-PWR9-LABEL: @test_ldrmb9(
1249 // BE32-PWR9-NEXT: entry:
1250 // BE32-PWR9-NEXT: [[PTR_ADDR:%.*]] = alloca ptr, align 4
1251 // BE32-PWR9-NEXT: store ptr [[PTR:%.*]], ptr [[PTR_ADDR]], align 4
1252 // BE32-PWR9-NEXT: [[TMP0:%.*]] = load ptr, ptr [[PTR_ADDR]], align 4
1253 // BE32-PWR9-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[TMP0]], i32 8
1254 // BE32-PWR9-NEXT: [[LD_LO:%.*]] = call <4 x i32> @llvm.ppc.altivec.lvx(ptr [[TMP0]])
1255 // BE32-PWR9-NEXT: [[LD_HI:%.*]] = call <4 x i32> @llvm.ppc.altivec.lvx(ptr [[TMP1]])
1256 // BE32-PWR9-NEXT: [[MASK1:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(ptr [[TMP0]])
1257 // BE32-PWR9-NEXT: [[SHUFFLE1:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[LD_LO]], <4 x i32> [[LD_HI]], <16 x i8> [[MASK1]])
1258 // BE32-PWR9-NEXT: [[SHUFFLE2:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> zeroinitializer, <4 x i32> [[SHUFFLE1]], <16 x i8> <i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24>)
1259 // BE32-PWR9-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[SHUFFLE2]] to <16 x i8>
1260 // BE32-PWR9-NEXT: ret <16 x i8> [[TMP2]]
// Returns __vec_ldrmb(ptr, 9). The assertions above are autogenerated and
// pin the exact IR for each RUN configuration; regenerate them if this body
// changes.
// As asserted above: the big-endian pwr8 run and the 32-bit pwr9 run expect
// two lvx loads (at ptr and ptr + 8) combined through lvsl/vperm, then a
// second vperm against zeroinitializer with a constant mask; the
// little-endian pwr8 run expects lvx/lvsr/vperm followed by a shufflevector
// against zeroinitializer; the 64-bit pwr9 runs expect
// @llvm.ppc.vsx.lxvll with the constant 9 shifted left by 56, followed by an
// lvsr/vperm pair.
1262 vector unsigned char test_ldrmb9(char *ptr) { return __vec_ldrmb(ptr, 9); }
1264 // BE-PWR8-LABEL: @test_strmb9(
1265 // BE-PWR8-NEXT: entry:
1266 // BE-PWR8-NEXT: [[PTR_ADDR:%.*]] = alloca ptr, align 8
1267 // BE-PWR8-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16
1268 // BE-PWR8-NEXT: store ptr [[PTR:%.*]], ptr [[PTR_ADDR]], align 8
1269 // BE-PWR8-NEXT: store <16 x i8> [[DATA:%.*]], ptr [[DATA_ADDR]], align 16
1270 // BE-PWR8-NEXT: [[TMP0:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8
1271 // BE-PWR8-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr [[DATA_ADDR]], align 16
1272 // BE-PWR8-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
1273 // BE-PWR8-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[TMP0]], i64 1
1274 // BE-PWR8-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP2]], i64 1
1275 // BE-PWR8-NEXT: store i64 [[TMP5]], ptr [[TMP3]], align 1
1276 // BE-PWR8-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[TMP0]], i64 0
1277 // BE-PWR8-NEXT: [[TMP7:%.*]] = extractelement <16 x i8> [[TMP1]], i64 7
1278 // BE-PWR8-NEXT: store i8 [[TMP7]], ptr [[TMP6]], align 1
1279 // BE-PWR8-NEXT: ret void
1281 // LE-PWR8-LABEL: @test_strmb9(
1282 // LE-PWR8-NEXT: entry:
1283 // LE-PWR8-NEXT: [[PTR_ADDR:%.*]] = alloca ptr, align 8
1284 // LE-PWR8-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16
1285 // LE-PWR8-NEXT: store ptr [[PTR:%.*]], ptr [[PTR_ADDR]], align 8
1286 // LE-PWR8-NEXT: store <16 x i8> [[DATA:%.*]], ptr [[DATA_ADDR]], align 16
1287 // LE-PWR8-NEXT: [[TMP0:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8
1288 // LE-PWR8-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr [[DATA_ADDR]], align 16
1289 // LE-PWR8-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
1290 // LE-PWR8-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[TMP0]], i64 1
1291 // LE-PWR8-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP2]], i64 0
1292 // LE-PWR8-NEXT: [[TMP6:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP5]])
1293 // LE-PWR8-NEXT: store i64 [[TMP6]], ptr [[TMP3]], align 1
1294 // LE-PWR8-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP0]], i64 0
1295 // LE-PWR8-NEXT: [[TMP8:%.*]] = extractelement <16 x i8> [[TMP1]], i64 8
1296 // LE-PWR8-NEXT: store i8 [[TMP8]], ptr [[TMP7]], align 1
1297 // LE-PWR8-NEXT: ret void
1299 // BE-PWR9-LABEL: @test_strmb9(
1300 // BE-PWR9-NEXT: entry:
1301 // BE-PWR9-NEXT: [[__A_ADDR_I:%.*]] = alloca <16 x i8>, align 16
1302 // BE-PWR9-NEXT: [[__B_ADDR_I:%.*]] = alloca ptr, align 8
1303 // BE-PWR9-NEXT: [[__C_ADDR_I:%.*]] = alloca i64, align 8
1304 // BE-PWR9-NEXT: [[__MASK_I:%.*]] = alloca <16 x i8>, align 16
1305 // BE-PWR9-NEXT: [[__RES_I:%.*]] = alloca <16 x i8>, align 16
1306 // BE-PWR9-NEXT: [[PTR_ADDR:%.*]] = alloca ptr, align 8
1307 // BE-PWR9-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16
1308 // BE-PWR9-NEXT: store ptr [[PTR:%.*]], ptr [[PTR_ADDR]], align 8
1309 // BE-PWR9-NEXT: store <16 x i8> [[DATA:%.*]], ptr [[DATA_ADDR]], align 16
1310 // BE-PWR9-NEXT: [[TMP0:%.*]] = load <16 x i8>, ptr [[DATA_ADDR]], align 16
1311 // BE-PWR9-NEXT: [[TMP1:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8
1312 // BE-PWR9-NEXT: store <16 x i8> [[TMP0]], ptr [[__A_ADDR_I]], align 16
1313 // BE-PWR9-NEXT: store ptr [[TMP1]], ptr [[__B_ADDR_I]], align 8
1314 // BE-PWR9-NEXT: store i64 9, ptr [[__C_ADDR_I]], align 8
1315 // BE-PWR9-NEXT: [[TMP2:%.*]] = load i64, ptr [[__C_ADDR_I]], align 8
1316 // BE-PWR9-NEXT: [[SUB_I:%.*]] = sub i64 16, [[TMP2]]
1317 // BE-PWR9-NEXT: [[CONV_I:%.*]] = trunc i64 [[SUB_I]] to i8
1318 // BE-PWR9-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr null, i8 [[CONV_I]]
1319 // BE-PWR9-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(ptr [[TMP3]])
1320 // BE-PWR9-NEXT: store <16 x i8> [[TMP4]], ptr [[__MASK_I]], align 16
1321 // BE-PWR9-NEXT: [[TMP5:%.*]] = load <16 x i8>, ptr [[__A_ADDR_I]], align 16
1322 // BE-PWR9-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
1323 // BE-PWR9-NEXT: [[TMP7:%.*]] = load <16 x i8>, ptr [[__A_ADDR_I]], align 16
1324 // BE-PWR9-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
1325 // BE-PWR9-NEXT: [[TMP9:%.*]] = load <16 x i8>, ptr [[__MASK_I]], align 16
1326 // BE-PWR9-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]])
1327 // BE-PWR9-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8>
1328 // BE-PWR9-NEXT: store <16 x i8> [[TMP11]], ptr [[__RES_I]], align 16
1329 // BE-PWR9-NEXT: [[TMP12:%.*]] = load <16 x i8>, ptr [[__RES_I]], align 16
1330 // BE-PWR9-NEXT: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP12]] to <4 x i32>
1331 // BE-PWR9-NEXT: [[TMP14:%.*]] = load ptr, ptr [[__B_ADDR_I]], align 8
1332 // BE-PWR9-NEXT: [[TMP15:%.*]] = load i64, ptr [[__C_ADDR_I]], align 8
1333 // BE-PWR9-NEXT: [[SHL_I:%.*]] = shl i64 [[TMP15]], 56
1334 // BE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], ptr [[TMP14]], i64 [[SHL_I]])
1335 // BE-PWR9-NEXT: ret void
1337 // LE-PWR9-LABEL: @test_strmb9(
1338 // LE-PWR9-NEXT: entry:
1339 // LE-PWR9-NEXT: [[__A_ADDR_I:%.*]] = alloca <16 x i8>, align 16
1340 // LE-PWR9-NEXT: [[__B_ADDR_I:%.*]] = alloca ptr, align 8
1341 // LE-PWR9-NEXT: [[__C_ADDR_I:%.*]] = alloca i64, align 8
1342 // LE-PWR9-NEXT: [[__MASK_I:%.*]] = alloca <16 x i8>, align 16
1343 // LE-PWR9-NEXT: [[__RES_I:%.*]] = alloca <16 x i8>, align 16
1344 // LE-PWR9-NEXT: [[PTR_ADDR:%.*]] = alloca ptr, align 8
1345 // LE-PWR9-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16
1346 // LE-PWR9-NEXT: store ptr [[PTR:%.*]], ptr [[PTR_ADDR]], align 8
1347 // LE-PWR9-NEXT: store <16 x i8> [[DATA:%.*]], ptr [[DATA_ADDR]], align 16
1348 // LE-PWR9-NEXT: [[TMP0:%.*]] = load <16 x i8>, ptr [[DATA_ADDR]], align 16
1349 // LE-PWR9-NEXT: [[TMP1:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8
1350 // LE-PWR9-NEXT: store <16 x i8> [[TMP0]], ptr [[__A_ADDR_I]], align 16
1351 // LE-PWR9-NEXT: store ptr [[TMP1]], ptr [[__B_ADDR_I]], align 8
1352 // LE-PWR9-NEXT: store i64 9, ptr [[__C_ADDR_I]], align 8
1353 // LE-PWR9-NEXT: [[TMP2:%.*]] = load i64, ptr [[__C_ADDR_I]], align 8
1354 // LE-PWR9-NEXT: [[SUB_I:%.*]] = sub i64 16, [[TMP2]]
1355 // LE-PWR9-NEXT: [[CONV_I:%.*]] = trunc i64 [[SUB_I]] to i8
1356 // LE-PWR9-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr null, i8 [[CONV_I]]
1357 // LE-PWR9-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(ptr [[TMP3]])
1358 // LE-PWR9-NEXT: store <16 x i8> [[TMP4]], ptr [[__MASK_I]], align 16
1359 // LE-PWR9-NEXT: [[TMP5:%.*]] = load <16 x i8>, ptr [[__A_ADDR_I]], align 16
1360 // LE-PWR9-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
1361 // LE-PWR9-NEXT: [[TMP7:%.*]] = load <16 x i8>, ptr [[__A_ADDR_I]], align 16
1362 // LE-PWR9-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
1363 // LE-PWR9-NEXT: [[TMP9:%.*]] = load <16 x i8>, ptr [[__MASK_I]], align 16
1364 // LE-PWR9-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]])
1365 // LE-PWR9-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8>
1366 // LE-PWR9-NEXT: store <16 x i8> [[TMP11]], ptr [[__RES_I]], align 16
1367 // LE-PWR9-NEXT: [[TMP12:%.*]] = load <16 x i8>, ptr [[__RES_I]], align 16
1368 // LE-PWR9-NEXT: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP12]] to <4 x i32>
1369 // LE-PWR9-NEXT: [[TMP14:%.*]] = load ptr, ptr [[__B_ADDR_I]], align 8
1370 // LE-PWR9-NEXT: [[TMP15:%.*]] = load i64, ptr [[__C_ADDR_I]], align 8
1371 // LE-PWR9-NEXT: [[SHL_I:%.*]] = shl i64 [[TMP15]], 56
1372 // LE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], ptr [[TMP14]], i64 [[SHL_I]])
1373 // LE-PWR9-NEXT: ret void
1375 // BE32-PWR9-LABEL: @test_strmb9(
1376 // BE32-PWR9-NEXT: entry:
1377 // BE32-PWR9-NEXT: [[PTR_ADDR:%.*]] = alloca ptr, align 4
1378 // BE32-PWR9-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16
1379 // BE32-PWR9-NEXT: store ptr [[PTR:%.*]], ptr [[PTR_ADDR]], align 4
1380 // BE32-PWR9-NEXT: store <16 x i8> [[DATA:%.*]], ptr [[DATA_ADDR]], align 16
1381 // BE32-PWR9-NEXT: [[TMP0:%.*]] = load ptr, ptr [[PTR_ADDR]], align 4
1382 // BE32-PWR9-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr [[DATA_ADDR]], align 16
1383 // BE32-PWR9-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
1384 // BE32-PWR9-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[TMP0]], i64 1
1385 // BE32-PWR9-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP2]], i64 1
1386 // BE32-PWR9-NEXT: store i64 [[TMP5]], ptr [[TMP3]], align 1
1387 // BE32-PWR9-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[TMP0]], i64 0
1388 // BE32-PWR9-NEXT: [[TMP7:%.*]] = extractelement <16 x i8> [[TMP1]], i64 7
1389 // BE32-PWR9-NEXT: store i8 [[TMP7]], ptr [[TMP6]], align 1
1390 // BE32-PWR9-NEXT: ret void
// Calls __vec_strmb(ptr, 9, data). The assertions above are autogenerated
// and pin the exact IR for each RUN configuration; regenerate them if this
// body changes.
// As asserted above: the big-endian pwr8 run and the 32-bit pwr9 run expect
// an i64 store at offset 1 plus an i8 store (vector element 7) at offset 0,
// i.e. 8 + 1 = 9 bytes; the little-endian pwr8 run byte-swaps the i64 via
// @llvm.bswap.i64 and stores vector element 8 as the i8; the 64-bit pwr9
// runs end in a call to @llvm.ppc.vsx.stxvll with the constant 9 shifted
// left by 56.
1392 void test_strmb9(char *ptr, vector unsigned char data) {
1393 __vec_strmb(ptr, 9, data);
1396 // BE-PWR8-LABEL: @test_strmb10(
1397 // BE-PWR8-NEXT: entry:
1398 // BE-PWR8-NEXT: [[PTR_ADDR:%.*]] = alloca ptr, align 8
1399 // BE-PWR8-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16
1400 // BE-PWR8-NEXT: store ptr [[PTR:%.*]], ptr [[PTR_ADDR]], align 8
1401 // BE-PWR8-NEXT: store <16 x i8> [[DATA:%.*]], ptr [[DATA_ADDR]], align 16
1402 // BE-PWR8-NEXT: [[TMP0:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8
1403 // BE-PWR8-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr [[DATA_ADDR]], align 16
1404 // BE-PWR8-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
1405 // BE-PWR8-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[TMP0]], i64 2
1406 // BE-PWR8-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP2]], i64 1
1407 // BE-PWR8-NEXT: store i64 [[TMP5]], ptr [[TMP3]], align 1
1408 // BE-PWR8-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
1409 // BE-PWR8-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP0]], i64 0
1410 // BE-PWR8-NEXT: [[TMP9:%.*]] = extractelement <8 x i16> [[TMP6]], i64 3
1411 // BE-PWR8-NEXT: store i16 [[TMP9]], ptr [[TMP7]], align 1
1412 // BE-PWR8-NEXT: ret void
1414 // LE-PWR8-LABEL: @test_strmb10(
1415 // LE-PWR8-NEXT: entry:
1416 // LE-PWR8-NEXT: [[PTR_ADDR:%.*]] = alloca ptr, align 8
1417 // LE-PWR8-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16
1418 // LE-PWR8-NEXT: store ptr [[PTR:%.*]], ptr [[PTR_ADDR]], align 8
1419 // LE-PWR8-NEXT: store <16 x i8> [[DATA:%.*]], ptr [[DATA_ADDR]], align 16
1420 // LE-PWR8-NEXT: [[TMP0:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8
1421 // LE-PWR8-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr [[DATA_ADDR]], align 16
1422 // LE-PWR8-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
1423 // LE-PWR8-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[TMP0]], i64 2
1424 // LE-PWR8-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP2]], i64 0
1425 // LE-PWR8-NEXT: [[TMP6:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP5]])
1426 // LE-PWR8-NEXT: store i64 [[TMP6]], ptr [[TMP3]], align 1
1427 // LE-PWR8-NEXT: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
1428 // LE-PWR8-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[TMP0]], i64 0
1429 // LE-PWR8-NEXT: [[TMP10:%.*]] = extractelement <8 x i16> [[TMP7]], i64 4
1430 // LE-PWR8-NEXT: [[TMP11:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP10]])
1431 // LE-PWR8-NEXT: store i16 [[TMP11]], ptr [[TMP8]], align 1
1432 // LE-PWR8-NEXT: ret void
1434 // BE-PWR9-LABEL: @test_strmb10(
1435 // BE-PWR9-NEXT: entry:
1436 // BE-PWR9-NEXT: [[__A_ADDR_I:%.*]] = alloca <16 x i8>, align 16
1437 // BE-PWR9-NEXT: [[__B_ADDR_I:%.*]] = alloca ptr, align 8
1438 // BE-PWR9-NEXT: [[__C_ADDR_I:%.*]] = alloca i64, align 8
1439 // BE-PWR9-NEXT: [[__MASK_I:%.*]] = alloca <16 x i8>, align 16
1440 // BE-PWR9-NEXT: [[__RES_I:%.*]] = alloca <16 x i8>, align 16
1441 // BE-PWR9-NEXT: [[PTR_ADDR:%.*]] = alloca ptr, align 8
1442 // BE-PWR9-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16
1443 // BE-PWR9-NEXT: store ptr [[PTR:%.*]], ptr [[PTR_ADDR]], align 8
1444 // BE-PWR9-NEXT: store <16 x i8> [[DATA:%.*]], ptr [[DATA_ADDR]], align 16
1445 // BE-PWR9-NEXT: [[TMP0:%.*]] = load <16 x i8>, ptr [[DATA_ADDR]], align 16
1446 // BE-PWR9-NEXT: [[TMP1:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8
1447 // BE-PWR9-NEXT: store <16 x i8> [[TMP0]], ptr [[__A_ADDR_I]], align 16
1448 // BE-PWR9-NEXT: store ptr [[TMP1]], ptr [[__B_ADDR_I]], align 8
1449 // BE-PWR9-NEXT: store i64 10, ptr [[__C_ADDR_I]], align 8
1450 // BE-PWR9-NEXT: [[TMP2:%.*]] = load i64, ptr [[__C_ADDR_I]], align 8
1451 // BE-PWR9-NEXT: [[SUB_I:%.*]] = sub i64 16, [[TMP2]]
1452 // BE-PWR9-NEXT: [[CONV_I:%.*]] = trunc i64 [[SUB_I]] to i8
1453 // BE-PWR9-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr null, i8 [[CONV_I]]
1454 // BE-PWR9-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(ptr [[TMP3]])
1455 // BE-PWR9-NEXT: store <16 x i8> [[TMP4]], ptr [[__MASK_I]], align 16
1456 // BE-PWR9-NEXT: [[TMP5:%.*]] = load <16 x i8>, ptr [[__A_ADDR_I]], align 16
1457 // BE-PWR9-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
1458 // BE-PWR9-NEXT: [[TMP7:%.*]] = load <16 x i8>, ptr [[__A_ADDR_I]], align 16
1459 // BE-PWR9-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
1460 // BE-PWR9-NEXT: [[TMP9:%.*]] = load <16 x i8>, ptr [[__MASK_I]], align 16
1461 // BE-PWR9-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]])
1462 // BE-PWR9-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8>
1463 // BE-PWR9-NEXT: store <16 x i8> [[TMP11]], ptr [[__RES_I]], align 16
1464 // BE-PWR9-NEXT: [[TMP12:%.*]] = load <16 x i8>, ptr [[__RES_I]], align 16
1465 // BE-PWR9-NEXT: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP12]] to <4 x i32>
1466 // BE-PWR9-NEXT: [[TMP14:%.*]] = load ptr, ptr [[__B_ADDR_I]], align 8
1467 // BE-PWR9-NEXT: [[TMP15:%.*]] = load i64, ptr [[__C_ADDR_I]], align 8
1468 // BE-PWR9-NEXT: [[SHL_I:%.*]] = shl i64 [[TMP15]], 56
1469 // BE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], ptr [[TMP14]], i64 [[SHL_I]])
1470 // BE-PWR9-NEXT: ret void
1472 // LE-PWR9-LABEL: @test_strmb10(
1473 // LE-PWR9-NEXT: entry:
1474 // LE-PWR9-NEXT: [[__A_ADDR_I:%.*]] = alloca <16 x i8>, align 16
1475 // LE-PWR9-NEXT: [[__B_ADDR_I:%.*]] = alloca ptr, align 8
1476 // LE-PWR9-NEXT: [[__C_ADDR_I:%.*]] = alloca i64, align 8
1477 // LE-PWR9-NEXT: [[__MASK_I:%.*]] = alloca <16 x i8>, align 16
1478 // LE-PWR9-NEXT: [[__RES_I:%.*]] = alloca <16 x i8>, align 16
1479 // LE-PWR9-NEXT: [[PTR_ADDR:%.*]] = alloca ptr, align 8
1480 // LE-PWR9-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16
1481 // LE-PWR9-NEXT: store ptr [[PTR:%.*]], ptr [[PTR_ADDR]], align 8
1482 // LE-PWR9-NEXT: store <16 x i8> [[DATA:%.*]], ptr [[DATA_ADDR]], align 16
1483 // LE-PWR9-NEXT: [[TMP0:%.*]] = load <16 x i8>, ptr [[DATA_ADDR]], align 16
1484 // LE-PWR9-NEXT: [[TMP1:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8
1485 // LE-PWR9-NEXT: store <16 x i8> [[TMP0]], ptr [[__A_ADDR_I]], align 16
1486 // LE-PWR9-NEXT: store ptr [[TMP1]], ptr [[__B_ADDR_I]], align 8
1487 // LE-PWR9-NEXT: store i64 10, ptr [[__C_ADDR_I]], align 8
1488 // LE-PWR9-NEXT: [[TMP2:%.*]] = load i64, ptr [[__C_ADDR_I]], align 8
1489 // LE-PWR9-NEXT: [[SUB_I:%.*]] = sub i64 16, [[TMP2]]
1490 // LE-PWR9-NEXT: [[CONV_I:%.*]] = trunc i64 [[SUB_I]] to i8
1491 // LE-PWR9-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr null, i8 [[CONV_I]]
1492 // LE-PWR9-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(ptr [[TMP3]])
1493 // LE-PWR9-NEXT: store <16 x i8> [[TMP4]], ptr [[__MASK_I]], align 16
1494 // LE-PWR9-NEXT: [[TMP5:%.*]] = load <16 x i8>, ptr [[__A_ADDR_I]], align 16
1495 // LE-PWR9-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
1496 // LE-PWR9-NEXT: [[TMP7:%.*]] = load <16 x i8>, ptr [[__A_ADDR_I]], align 16
1497 // LE-PWR9-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
1498 // LE-PWR9-NEXT: [[TMP9:%.*]] = load <16 x i8>, ptr [[__MASK_I]], align 16
1499 // LE-PWR9-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]])
1500 // LE-PWR9-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8>
1501 // LE-PWR9-NEXT: store <16 x i8> [[TMP11]], ptr [[__RES_I]], align 16
1502 // LE-PWR9-NEXT: [[TMP12:%.*]] = load <16 x i8>, ptr [[__RES_I]], align 16
1503 // LE-PWR9-NEXT: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP12]] to <4 x i32>
1504 // LE-PWR9-NEXT: [[TMP14:%.*]] = load ptr, ptr [[__B_ADDR_I]], align 8
1505 // LE-PWR9-NEXT: [[TMP15:%.*]] = load i64, ptr [[__C_ADDR_I]], align 8
1506 // LE-PWR9-NEXT: [[SHL_I:%.*]] = shl i64 [[TMP15]], 56
1507 // LE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], ptr [[TMP14]], i64 [[SHL_I]])
1508 // LE-PWR9-NEXT: ret void
1510 // BE32-PWR9-LABEL: @test_strmb10(
1511 // BE32-PWR9-NEXT: entry:
1512 // BE32-PWR9-NEXT: [[PTR_ADDR:%.*]] = alloca ptr, align 4
1513 // BE32-PWR9-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16
1514 // BE32-PWR9-NEXT: store ptr [[PTR:%.*]], ptr [[PTR_ADDR]], align 4
1515 // BE32-PWR9-NEXT: store <16 x i8> [[DATA:%.*]], ptr [[DATA_ADDR]], align 16
1516 // BE32-PWR9-NEXT: [[TMP0:%.*]] = load ptr, ptr [[PTR_ADDR]], align 4
1517 // BE32-PWR9-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr [[DATA_ADDR]], align 16
1518 // BE32-PWR9-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
1519 // BE32-PWR9-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[TMP0]], i64 2
1520 // BE32-PWR9-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP2]], i64 1
1521 // BE32-PWR9-NEXT: store i64 [[TMP5]], ptr [[TMP3]], align 1
1522 // BE32-PWR9-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
1523 // BE32-PWR9-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP0]], i64 0
1524 // BE32-PWR9-NEXT: [[TMP9:%.*]] = extractelement <8 x i16> [[TMP6]], i64 3
1525 // BE32-PWR9-NEXT: store i16 [[TMP9]], ptr [[TMP7]], align 1
1526 // BE32-PWR9-NEXT: ret void
// Codegen test for __vec_strmb with a constant byte count of 10.
// The autogenerated checks above pin the emitted IR per RUN configuration:
// the 64-bit pwr9 runs (BE-PWR9, LE-PWR9) end in a call to
// @llvm.ppc.vsx.stxvll with the count shifted left by 56, while the
// LE-PWR8 and BE32-PWR9 runs expand to an i64 store at byte offset 2 plus an
// i16 store at offset 0 (each value passed through llvm.bswap on LE-PWR8).
// Changing this body changes the IR, so regenerate the checks with
// utils/update_cc_test_checks.py instead of editing them by hand.
1528 void test_strmb10(char *ptr, vector unsigned char data) {
1529 __vec_strmb(ptr, 10, data);
1532 // BE-PWR8-LABEL: @test_strmb11(
1533 // BE-PWR8-NEXT: entry:
1534 // BE-PWR8-NEXT: [[PTR_ADDR:%.*]] = alloca ptr, align 8
1535 // BE-PWR8-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16
1536 // BE-PWR8-NEXT: store ptr [[PTR:%.*]], ptr [[PTR_ADDR]], align 8
1537 // BE-PWR8-NEXT: store <16 x i8> [[DATA:%.*]], ptr [[DATA_ADDR]], align 16
1538 // BE-PWR8-NEXT: [[TMP0:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8
1539 // BE-PWR8-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr [[DATA_ADDR]], align 16
1540 // BE-PWR8-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
1541 // BE-PWR8-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[TMP0]], i64 3
1542 // BE-PWR8-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP2]], i64 1
1543 // BE-PWR8-NEXT: store i64 [[TMP5]], ptr [[TMP3]], align 1
1544 // BE-PWR8-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
1545 // BE-PWR8-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP0]], i64 1
1546 // BE-PWR8-NEXT: [[TMP9:%.*]] = extractelement <8 x i16> [[TMP6]], i64 3
1547 // BE-PWR8-NEXT: store i16 [[TMP9]], ptr [[TMP7]], align 1
1548 // BE-PWR8-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[TMP0]], i64 0
1549 // BE-PWR8-NEXT: [[TMP11:%.*]] = extractelement <16 x i8> [[TMP1]], i64 5
1550 // BE-PWR8-NEXT: store i8 [[TMP11]], ptr [[TMP10]], align 1
1551 // BE-PWR8-NEXT: ret void
1553 // LE-PWR8-LABEL: @test_strmb11(
1554 // LE-PWR8-NEXT: entry:
1555 // LE-PWR8-NEXT: [[PTR_ADDR:%.*]] = alloca ptr, align 8
1556 // LE-PWR8-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16
1557 // LE-PWR8-NEXT: store ptr [[PTR:%.*]], ptr [[PTR_ADDR]], align 8
1558 // LE-PWR8-NEXT: store <16 x i8> [[DATA:%.*]], ptr [[DATA_ADDR]], align 16
1559 // LE-PWR8-NEXT: [[TMP0:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8
1560 // LE-PWR8-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr [[DATA_ADDR]], align 16
1561 // LE-PWR8-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
1562 // LE-PWR8-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[TMP0]], i64 3
1563 // LE-PWR8-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP2]], i64 0
1564 // LE-PWR8-NEXT: [[TMP6:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP5]])
1565 // LE-PWR8-NEXT: store i64 [[TMP6]], ptr [[TMP3]], align 1
1566 // LE-PWR8-NEXT: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
1567 // LE-PWR8-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[TMP0]], i64 1
1568 // LE-PWR8-NEXT: [[TMP10:%.*]] = extractelement <8 x i16> [[TMP7]], i64 4
1569 // LE-PWR8-NEXT: [[TMP11:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP10]])
1570 // LE-PWR8-NEXT: store i16 [[TMP11]], ptr [[TMP8]], align 1
1571 // LE-PWR8-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr [[TMP0]], i64 0
1572 // LE-PWR8-NEXT: [[TMP13:%.*]] = extractelement <16 x i8> [[TMP1]], i64 10
1573 // LE-PWR8-NEXT: store i8 [[TMP13]], ptr [[TMP12]], align 1
1574 // LE-PWR8-NEXT: ret void
1576 // BE-PWR9-LABEL: @test_strmb11(
1577 // BE-PWR9-NEXT: entry:
1578 // BE-PWR9-NEXT: [[__A_ADDR_I:%.*]] = alloca <16 x i8>, align 16
1579 // BE-PWR9-NEXT: [[__B_ADDR_I:%.*]] = alloca ptr, align 8
1580 // BE-PWR9-NEXT: [[__C_ADDR_I:%.*]] = alloca i64, align 8
1581 // BE-PWR9-NEXT: [[__MASK_I:%.*]] = alloca <16 x i8>, align 16
1582 // BE-PWR9-NEXT: [[__RES_I:%.*]] = alloca <16 x i8>, align 16
1583 // BE-PWR9-NEXT: [[PTR_ADDR:%.*]] = alloca ptr, align 8
1584 // BE-PWR9-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16
1585 // BE-PWR9-NEXT: store ptr [[PTR:%.*]], ptr [[PTR_ADDR]], align 8
1586 // BE-PWR9-NEXT: store <16 x i8> [[DATA:%.*]], ptr [[DATA_ADDR]], align 16
1587 // BE-PWR9-NEXT: [[TMP0:%.*]] = load <16 x i8>, ptr [[DATA_ADDR]], align 16
1588 // BE-PWR9-NEXT: [[TMP1:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8
1589 // BE-PWR9-NEXT: store <16 x i8> [[TMP0]], ptr [[__A_ADDR_I]], align 16
1590 // BE-PWR9-NEXT: store ptr [[TMP1]], ptr [[__B_ADDR_I]], align 8
1591 // BE-PWR9-NEXT: store i64 11, ptr [[__C_ADDR_I]], align 8
1592 // BE-PWR9-NEXT: [[TMP2:%.*]] = load i64, ptr [[__C_ADDR_I]], align 8
1593 // BE-PWR9-NEXT: [[SUB_I:%.*]] = sub i64 16, [[TMP2]]
1594 // BE-PWR9-NEXT: [[CONV_I:%.*]] = trunc i64 [[SUB_I]] to i8
1595 // BE-PWR9-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr null, i8 [[CONV_I]]
1596 // BE-PWR9-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(ptr [[TMP3]])
1597 // BE-PWR9-NEXT: store <16 x i8> [[TMP4]], ptr [[__MASK_I]], align 16
1598 // BE-PWR9-NEXT: [[TMP5:%.*]] = load <16 x i8>, ptr [[__A_ADDR_I]], align 16
1599 // BE-PWR9-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
1600 // BE-PWR9-NEXT: [[TMP7:%.*]] = load <16 x i8>, ptr [[__A_ADDR_I]], align 16
1601 // BE-PWR9-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
1602 // BE-PWR9-NEXT: [[TMP9:%.*]] = load <16 x i8>, ptr [[__MASK_I]], align 16
1603 // BE-PWR9-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]])
1604 // BE-PWR9-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8>
1605 // BE-PWR9-NEXT: store <16 x i8> [[TMP11]], ptr [[__RES_I]], align 16
1606 // BE-PWR9-NEXT: [[TMP12:%.*]] = load <16 x i8>, ptr [[__RES_I]], align 16
1607 // BE-PWR9-NEXT: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP12]] to <4 x i32>
1608 // BE-PWR9-NEXT: [[TMP14:%.*]] = load ptr, ptr [[__B_ADDR_I]], align 8
1609 // BE-PWR9-NEXT: [[TMP15:%.*]] = load i64, ptr [[__C_ADDR_I]], align 8
1610 // BE-PWR9-NEXT: [[SHL_I:%.*]] = shl i64 [[TMP15]], 56
1611 // BE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], ptr [[TMP14]], i64 [[SHL_I]])
1612 // BE-PWR9-NEXT: ret void
1614 // LE-PWR9-LABEL: @test_strmb11(
1615 // LE-PWR9-NEXT: entry:
1616 // LE-PWR9-NEXT: [[__A_ADDR_I:%.*]] = alloca <16 x i8>, align 16
1617 // LE-PWR9-NEXT: [[__B_ADDR_I:%.*]] = alloca ptr, align 8
1618 // LE-PWR9-NEXT: [[__C_ADDR_I:%.*]] = alloca i64, align 8
1619 // LE-PWR9-NEXT: [[__MASK_I:%.*]] = alloca <16 x i8>, align 16
1620 // LE-PWR9-NEXT: [[__RES_I:%.*]] = alloca <16 x i8>, align 16
1621 // LE-PWR9-NEXT: [[PTR_ADDR:%.*]] = alloca ptr, align 8
1622 // LE-PWR9-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16
1623 // LE-PWR9-NEXT: store ptr [[PTR:%.*]], ptr [[PTR_ADDR]], align 8
1624 // LE-PWR9-NEXT: store <16 x i8> [[DATA:%.*]], ptr [[DATA_ADDR]], align 16
1625 // LE-PWR9-NEXT: [[TMP0:%.*]] = load <16 x i8>, ptr [[DATA_ADDR]], align 16
1626 // LE-PWR9-NEXT: [[TMP1:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8
1627 // LE-PWR9-NEXT: store <16 x i8> [[TMP0]], ptr [[__A_ADDR_I]], align 16
1628 // LE-PWR9-NEXT: store ptr [[TMP1]], ptr [[__B_ADDR_I]], align 8
1629 // LE-PWR9-NEXT: store i64 11, ptr [[__C_ADDR_I]], align 8
1630 // LE-PWR9-NEXT: [[TMP2:%.*]] = load i64, ptr [[__C_ADDR_I]], align 8
1631 // LE-PWR9-NEXT: [[SUB_I:%.*]] = sub i64 16, [[TMP2]]
1632 // LE-PWR9-NEXT: [[CONV_I:%.*]] = trunc i64 [[SUB_I]] to i8
1633 // LE-PWR9-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr null, i8 [[CONV_I]]
1634 // LE-PWR9-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(ptr [[TMP3]])
1635 // LE-PWR9-NEXT: store <16 x i8> [[TMP4]], ptr [[__MASK_I]], align 16
1636 // LE-PWR9-NEXT: [[TMP5:%.*]] = load <16 x i8>, ptr [[__A_ADDR_I]], align 16
1637 // LE-PWR9-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
1638 // LE-PWR9-NEXT: [[TMP7:%.*]] = load <16 x i8>, ptr [[__A_ADDR_I]], align 16
1639 // LE-PWR9-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
1640 // LE-PWR9-NEXT: [[TMP9:%.*]] = load <16 x i8>, ptr [[__MASK_I]], align 16
1641 // LE-PWR9-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]])
1642 // LE-PWR9-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8>
1643 // LE-PWR9-NEXT: store <16 x i8> [[TMP11]], ptr [[__RES_I]], align 16
1644 // LE-PWR9-NEXT: [[TMP12:%.*]] = load <16 x i8>, ptr [[__RES_I]], align 16
1645 // LE-PWR9-NEXT: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP12]] to <4 x i32>
1646 // LE-PWR9-NEXT: [[TMP14:%.*]] = load ptr, ptr [[__B_ADDR_I]], align 8
1647 // LE-PWR9-NEXT: [[TMP15:%.*]] = load i64, ptr [[__C_ADDR_I]], align 8
1648 // LE-PWR9-NEXT: [[SHL_I:%.*]] = shl i64 [[TMP15]], 56
1649 // LE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], ptr [[TMP14]], i64 [[SHL_I]])
1650 // LE-PWR9-NEXT: ret void
1652 // BE32-PWR9-LABEL: @test_strmb11(
1653 // BE32-PWR9-NEXT: entry:
1654 // BE32-PWR9-NEXT: [[PTR_ADDR:%.*]] = alloca ptr, align 4
1655 // BE32-PWR9-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16
1656 // BE32-PWR9-NEXT: store ptr [[PTR:%.*]], ptr [[PTR_ADDR]], align 4
1657 // BE32-PWR9-NEXT: store <16 x i8> [[DATA:%.*]], ptr [[DATA_ADDR]], align 16
1658 // BE32-PWR9-NEXT: [[TMP0:%.*]] = load ptr, ptr [[PTR_ADDR]], align 4
1659 // BE32-PWR9-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr [[DATA_ADDR]], align 16
1660 // BE32-PWR9-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
1661 // BE32-PWR9-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[TMP0]], i64 3
1662 // BE32-PWR9-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP2]], i64 1
1663 // BE32-PWR9-NEXT: store i64 [[TMP5]], ptr [[TMP3]], align 1
1664 // BE32-PWR9-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
1665 // BE32-PWR9-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP0]], i64 1
1666 // BE32-PWR9-NEXT: [[TMP9:%.*]] = extractelement <8 x i16> [[TMP6]], i64 3
1667 // BE32-PWR9-NEXT: store i16 [[TMP9]], ptr [[TMP7]], align 1
1668 // BE32-PWR9-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[TMP0]], i64 0
1669 // BE32-PWR9-NEXT: [[TMP11:%.*]] = extractelement <16 x i8> [[TMP1]], i64 5
1670 // BE32-PWR9-NEXT: store i8 [[TMP11]], ptr [[TMP10]], align 1
1671 // BE32-PWR9-NEXT: ret void
// Codegen test for __vec_strmb with a constant byte count of 11.
// The autogenerated checks above pin the emitted IR per RUN configuration:
// the 64-bit pwr9 runs (BE-PWR9, LE-PWR9) end in a call to
// @llvm.ppc.vsx.stxvll with the count shifted left by 56, while the
// pwr8 runs and the 32-bit pwr9 run expand to three stores: an i64 at byte
// offset 3, an i16 at offset 1 and an i8 at offset 0 (the i64 and i16 values
// are passed through llvm.bswap on LE-PWR8).
// Changing this body changes the IR, so regenerate the checks with
// utils/update_cc_test_checks.py instead of editing them by hand.
1673 void test_strmb11(char *ptr, vector unsigned char data) {
1674 __vec_strmb(ptr, 11, data);
1677 // BE-PWR8-LABEL: @test_strmb12(
1678 // BE-PWR8-NEXT: entry:
1679 // BE-PWR8-NEXT: [[PTR_ADDR:%.*]] = alloca ptr, align 8
1680 // BE-PWR8-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16
1681 // BE-PWR8-NEXT: store ptr [[PTR:%.*]], ptr [[PTR_ADDR]], align 8
1682 // BE-PWR8-NEXT: store <16 x i8> [[DATA:%.*]], ptr [[DATA_ADDR]], align 16
1683 // BE-PWR8-NEXT: [[TMP0:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8
1684 // BE-PWR8-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr [[DATA_ADDR]], align 16
1685 // BE-PWR8-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
1686 // BE-PWR8-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[TMP0]], i64 4
1687 // BE-PWR8-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP2]], i64 1
1688 // BE-PWR8-NEXT: store i64 [[TMP5]], ptr [[TMP3]], align 1
1689 // BE-PWR8-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
1690 // BE-PWR8-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP0]], i64 0
1691 // BE-PWR8-NEXT: [[TMP9:%.*]] = extractelement <4 x i32> [[TMP6]], i64 1
1692 // BE-PWR8-NEXT: store i32 [[TMP9]], ptr [[TMP7]], align 1
1693 // BE-PWR8-NEXT: ret void
1695 // LE-PWR8-LABEL: @test_strmb12(
1696 // LE-PWR8-NEXT: entry:
1697 // LE-PWR8-NEXT: [[PTR_ADDR:%.*]] = alloca ptr, align 8
1698 // LE-PWR8-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16
1699 // LE-PWR8-NEXT: store ptr [[PTR:%.*]], ptr [[PTR_ADDR]], align 8
1700 // LE-PWR8-NEXT: store <16 x i8> [[DATA:%.*]], ptr [[DATA_ADDR]], align 16
1701 // LE-PWR8-NEXT: [[TMP0:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8
1702 // LE-PWR8-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr [[DATA_ADDR]], align 16
1703 // LE-PWR8-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
1704 // LE-PWR8-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[TMP0]], i64 4
1705 // LE-PWR8-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP2]], i64 0
1706 // LE-PWR8-NEXT: [[TMP6:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP5]])
1707 // LE-PWR8-NEXT: store i64 [[TMP6]], ptr [[TMP3]], align 1
1708 // LE-PWR8-NEXT: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
1709 // LE-PWR8-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[TMP0]], i64 0
1710 // LE-PWR8-NEXT: [[TMP10:%.*]] = extractelement <4 x i32> [[TMP7]], i64 2
1711 // LE-PWR8-NEXT: [[TMP11:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
1712 // LE-PWR8-NEXT: store i32 [[TMP11]], ptr [[TMP8]], align 1
1713 // LE-PWR8-NEXT: ret void
1715 // BE-PWR9-LABEL: @test_strmb12(
1716 // BE-PWR9-NEXT: entry:
1717 // BE-PWR9-NEXT: [[__A_ADDR_I:%.*]] = alloca <16 x i8>, align 16
1718 // BE-PWR9-NEXT: [[__B_ADDR_I:%.*]] = alloca ptr, align 8
1719 // BE-PWR9-NEXT: [[__C_ADDR_I:%.*]] = alloca i64, align 8
1720 // BE-PWR9-NEXT: [[__MASK_I:%.*]] = alloca <16 x i8>, align 16
1721 // BE-PWR9-NEXT: [[__RES_I:%.*]] = alloca <16 x i8>, align 16
1722 // BE-PWR9-NEXT: [[PTR_ADDR:%.*]] = alloca ptr, align 8
1723 // BE-PWR9-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16
1724 // BE-PWR9-NEXT: store ptr [[PTR:%.*]], ptr [[PTR_ADDR]], align 8
1725 // BE-PWR9-NEXT: store <16 x i8> [[DATA:%.*]], ptr [[DATA_ADDR]], align 16
1726 // BE-PWR9-NEXT: [[TMP0:%.*]] = load <16 x i8>, ptr [[DATA_ADDR]], align 16
1727 // BE-PWR9-NEXT: [[TMP1:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8
1728 // BE-PWR9-NEXT: store <16 x i8> [[TMP0]], ptr [[__A_ADDR_I]], align 16
1729 // BE-PWR9-NEXT: store ptr [[TMP1]], ptr [[__B_ADDR_I]], align 8
1730 // BE-PWR9-NEXT: store i64 12, ptr [[__C_ADDR_I]], align 8
1731 // BE-PWR9-NEXT: [[TMP2:%.*]] = load i64, ptr [[__C_ADDR_I]], align 8
1732 // BE-PWR9-NEXT: [[SUB_I:%.*]] = sub i64 16, [[TMP2]]
1733 // BE-PWR9-NEXT: [[CONV_I:%.*]] = trunc i64 [[SUB_I]] to i8
1734 // BE-PWR9-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr null, i8 [[CONV_I]]
1735 // BE-PWR9-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(ptr [[TMP3]])
1736 // BE-PWR9-NEXT: store <16 x i8> [[TMP4]], ptr [[__MASK_I]], align 16
1737 // BE-PWR9-NEXT: [[TMP5:%.*]] = load <16 x i8>, ptr [[__A_ADDR_I]], align 16
1738 // BE-PWR9-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
1739 // BE-PWR9-NEXT: [[TMP7:%.*]] = load <16 x i8>, ptr [[__A_ADDR_I]], align 16
1740 // BE-PWR9-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
1741 // BE-PWR9-NEXT: [[TMP9:%.*]] = load <16 x i8>, ptr [[__MASK_I]], align 16
1742 // BE-PWR9-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]])
1743 // BE-PWR9-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8>
1744 // BE-PWR9-NEXT: store <16 x i8> [[TMP11]], ptr [[__RES_I]], align 16
1745 // BE-PWR9-NEXT: [[TMP12:%.*]] = load <16 x i8>, ptr [[__RES_I]], align 16
1746 // BE-PWR9-NEXT: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP12]] to <4 x i32>
1747 // BE-PWR9-NEXT: [[TMP14:%.*]] = load ptr, ptr [[__B_ADDR_I]], align 8
1748 // BE-PWR9-NEXT: [[TMP15:%.*]] = load i64, ptr [[__C_ADDR_I]], align 8
1749 // BE-PWR9-NEXT: [[SHL_I:%.*]] = shl i64 [[TMP15]], 56
1750 // BE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], ptr [[TMP14]], i64 [[SHL_I]])
1751 // BE-PWR9-NEXT: ret void
1753 // LE-PWR9-LABEL: @test_strmb12(
1754 // LE-PWR9-NEXT: entry:
1755 // LE-PWR9-NEXT: [[__A_ADDR_I:%.*]] = alloca <16 x i8>, align 16
1756 // LE-PWR9-NEXT: [[__B_ADDR_I:%.*]] = alloca ptr, align 8
1757 // LE-PWR9-NEXT: [[__C_ADDR_I:%.*]] = alloca i64, align 8
1758 // LE-PWR9-NEXT: [[__MASK_I:%.*]] = alloca <16 x i8>, align 16
1759 // LE-PWR9-NEXT: [[__RES_I:%.*]] = alloca <16 x i8>, align 16
1760 // LE-PWR9-NEXT: [[PTR_ADDR:%.*]] = alloca ptr, align 8
1761 // LE-PWR9-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16
1762 // LE-PWR9-NEXT: store ptr [[PTR:%.*]], ptr [[PTR_ADDR]], align 8
1763 // LE-PWR9-NEXT: store <16 x i8> [[DATA:%.*]], ptr [[DATA_ADDR]], align 16
1764 // LE-PWR9-NEXT: [[TMP0:%.*]] = load <16 x i8>, ptr [[DATA_ADDR]], align 16
1765 // LE-PWR9-NEXT: [[TMP1:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8
1766 // LE-PWR9-NEXT: store <16 x i8> [[TMP0]], ptr [[__A_ADDR_I]], align 16
1767 // LE-PWR9-NEXT: store ptr [[TMP1]], ptr [[__B_ADDR_I]], align 8
1768 // LE-PWR9-NEXT: store i64 12, ptr [[__C_ADDR_I]], align 8
1769 // LE-PWR9-NEXT: [[TMP2:%.*]] = load i64, ptr [[__C_ADDR_I]], align 8
1770 // LE-PWR9-NEXT: [[SUB_I:%.*]] = sub i64 16, [[TMP2]]
1771 // LE-PWR9-NEXT: [[CONV_I:%.*]] = trunc i64 [[SUB_I]] to i8
1772 // LE-PWR9-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr null, i8 [[CONV_I]]
1773 // LE-PWR9-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(ptr [[TMP3]])
1774 // LE-PWR9-NEXT: store <16 x i8> [[TMP4]], ptr [[__MASK_I]], align 16
1775 // LE-PWR9-NEXT: [[TMP5:%.*]] = load <16 x i8>, ptr [[__A_ADDR_I]], align 16
1776 // LE-PWR9-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
1777 // LE-PWR9-NEXT: [[TMP7:%.*]] = load <16 x i8>, ptr [[__A_ADDR_I]], align 16
1778 // LE-PWR9-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
1779 // LE-PWR9-NEXT: [[TMP9:%.*]] = load <16 x i8>, ptr [[__MASK_I]], align 16
1780 // LE-PWR9-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]])
1781 // LE-PWR9-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8>
1782 // LE-PWR9-NEXT: store <16 x i8> [[TMP11]], ptr [[__RES_I]], align 16
1783 // LE-PWR9-NEXT: [[TMP12:%.*]] = load <16 x i8>, ptr [[__RES_I]], align 16
1784 // LE-PWR9-NEXT: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP12]] to <4 x i32>
1785 // LE-PWR9-NEXT: [[TMP14:%.*]] = load ptr, ptr [[__B_ADDR_I]], align 8
1786 // LE-PWR9-NEXT: [[TMP15:%.*]] = load i64, ptr [[__C_ADDR_I]], align 8
1787 // LE-PWR9-NEXT: [[SHL_I:%.*]] = shl i64 [[TMP15]], 56
1788 // LE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], ptr [[TMP14]], i64 [[SHL_I]])
1789 // LE-PWR9-NEXT: ret void
1791 // BE32-PWR9-LABEL: @test_strmb12(
1792 // BE32-PWR9-NEXT: entry:
1793 // BE32-PWR9-NEXT: [[PTR_ADDR:%.*]] = alloca ptr, align 4
1794 // BE32-PWR9-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16
1795 // BE32-PWR9-NEXT: store ptr [[PTR:%.*]], ptr [[PTR_ADDR]], align 4
1796 // BE32-PWR9-NEXT: store <16 x i8> [[DATA:%.*]], ptr [[DATA_ADDR]], align 16
1797 // BE32-PWR9-NEXT: [[TMP0:%.*]] = load ptr, ptr [[PTR_ADDR]], align 4
1798 // BE32-PWR9-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr [[DATA_ADDR]], align 16
1799 // BE32-PWR9-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
1800 // BE32-PWR9-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[TMP0]], i64 4
1801 // BE32-PWR9-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP2]], i64 1
1802 // BE32-PWR9-NEXT: store i64 [[TMP5]], ptr [[TMP3]], align 1
1803 // BE32-PWR9-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
1804 // BE32-PWR9-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP0]], i64 0
1805 // BE32-PWR9-NEXT: [[TMP9:%.*]] = extractelement <4 x i32> [[TMP6]], i64 1
1806 // BE32-PWR9-NEXT: store i32 [[TMP9]], ptr [[TMP7]], align 1
1807 // BE32-PWR9-NEXT: ret void
// Codegen test for __vec_strmb with a constant byte count of 12.
// The autogenerated checks above pin the emitted IR per RUN configuration:
// the 64-bit pwr9 runs (BE-PWR9, LE-PWR9) end in a call to
// @llvm.ppc.vsx.stxvll with the count shifted left by 56, while the
// pwr8 runs and the 32-bit pwr9 run expand to two stores: an i64 at byte
// offset 4 and an i32 at offset 0 (both values are passed through
// llvm.bswap on LE-PWR8).
// Changing this body changes the IR, so regenerate the checks with
// utils/update_cc_test_checks.py instead of editing them by hand.
1809 void test_strmb12(char *ptr, vector unsigned char data) {
1810 __vec_strmb(ptr, 12, data);
1813 // BE-PWR8-LABEL: @test_strmb13(
1814 // BE-PWR8-NEXT: entry:
1815 // BE-PWR8-NEXT: [[PTR_ADDR:%.*]] = alloca ptr, align 8
1816 // BE-PWR8-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16
1817 // BE-PWR8-NEXT: store ptr [[PTR:%.*]], ptr [[PTR_ADDR]], align 8
1818 // BE-PWR8-NEXT: store <16 x i8> [[DATA:%.*]], ptr [[DATA_ADDR]], align 16
1819 // BE-PWR8-NEXT: [[TMP0:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8
1820 // BE-PWR8-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr [[DATA_ADDR]], align 16
1821 // BE-PWR8-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
1822 // BE-PWR8-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[TMP0]], i64 5
1823 // BE-PWR8-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP2]], i64 1
1824 // BE-PWR8-NEXT: store i64 [[TMP5]], ptr [[TMP3]], align 1
1825 // BE-PWR8-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
1826 // BE-PWR8-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP0]], i64 1
1827 // BE-PWR8-NEXT: [[TMP9:%.*]] = extractelement <4 x i32> [[TMP6]], i64 1
1828 // BE-PWR8-NEXT: store i32 [[TMP9]], ptr [[TMP7]], align 1
1829 // BE-PWR8-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[TMP0]], i64 0
1830 // BE-PWR8-NEXT: [[TMP11:%.*]] = extractelement <16 x i8> [[TMP1]], i64 3
1831 // BE-PWR8-NEXT: store i8 [[TMP11]], ptr [[TMP10]], align 1
1832 // BE-PWR8-NEXT: ret void
1834 // LE-PWR8-LABEL: @test_strmb13(
1835 // LE-PWR8-NEXT: entry:
1836 // LE-PWR8-NEXT: [[PTR_ADDR:%.*]] = alloca ptr, align 8
1837 // LE-PWR8-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16
1838 // LE-PWR8-NEXT: store ptr [[PTR:%.*]], ptr [[PTR_ADDR]], align 8
1839 // LE-PWR8-NEXT: store <16 x i8> [[DATA:%.*]], ptr [[DATA_ADDR]], align 16
1840 // LE-PWR8-NEXT: [[TMP0:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8
1841 // LE-PWR8-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr [[DATA_ADDR]], align 16
1842 // LE-PWR8-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
1843 // LE-PWR8-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[TMP0]], i64 5
1844 // LE-PWR8-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP2]], i64 0
1845 // LE-PWR8-NEXT: [[TMP6:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP5]])
1846 // LE-PWR8-NEXT: store i64 [[TMP6]], ptr [[TMP3]], align 1
1847 // LE-PWR8-NEXT: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
1848 // LE-PWR8-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[TMP0]], i64 1
1849 // LE-PWR8-NEXT: [[TMP10:%.*]] = extractelement <4 x i32> [[TMP7]], i64 2
1850 // LE-PWR8-NEXT: [[TMP11:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
1851 // LE-PWR8-NEXT: store i32 [[TMP11]], ptr [[TMP8]], align 1
1852 // LE-PWR8-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr [[TMP0]], i64 0
1853 // LE-PWR8-NEXT: [[TMP13:%.*]] = extractelement <16 x i8> [[TMP1]], i64 12
1854 // LE-PWR8-NEXT: store i8 [[TMP13]], ptr [[TMP12]], align 1
1855 // LE-PWR8-NEXT: ret void
1857 // BE-PWR9-LABEL: @test_strmb13(
1858 // BE-PWR9-NEXT: entry:
1859 // BE-PWR9-NEXT: [[__A_ADDR_I:%.*]] = alloca <16 x i8>, align 16
1860 // BE-PWR9-NEXT: [[__B_ADDR_I:%.*]] = alloca ptr, align 8
1861 // BE-PWR9-NEXT: [[__C_ADDR_I:%.*]] = alloca i64, align 8
1862 // BE-PWR9-NEXT: [[__MASK_I:%.*]] = alloca <16 x i8>, align 16
1863 // BE-PWR9-NEXT: [[__RES_I:%.*]] = alloca <16 x i8>, align 16
1864 // BE-PWR9-NEXT: [[PTR_ADDR:%.*]] = alloca ptr, align 8
1865 // BE-PWR9-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16
1866 // BE-PWR9-NEXT: store ptr [[PTR:%.*]], ptr [[PTR_ADDR]], align 8
1867 // BE-PWR9-NEXT: store <16 x i8> [[DATA:%.*]], ptr [[DATA_ADDR]], align 16
1868 // BE-PWR9-NEXT: [[TMP0:%.*]] = load <16 x i8>, ptr [[DATA_ADDR]], align 16
1869 // BE-PWR9-NEXT: [[TMP1:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8
1870 // BE-PWR9-NEXT: store <16 x i8> [[TMP0]], ptr [[__A_ADDR_I]], align 16
1871 // BE-PWR9-NEXT: store ptr [[TMP1]], ptr [[__B_ADDR_I]], align 8
1872 // BE-PWR9-NEXT: store i64 13, ptr [[__C_ADDR_I]], align 8
1873 // BE-PWR9-NEXT: [[TMP2:%.*]] = load i64, ptr [[__C_ADDR_I]], align 8
1874 // BE-PWR9-NEXT: [[SUB_I:%.*]] = sub i64 16, [[TMP2]]
1875 // BE-PWR9-NEXT: [[CONV_I:%.*]] = trunc i64 [[SUB_I]] to i8
1876 // BE-PWR9-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr null, i8 [[CONV_I]]
1877 // BE-PWR9-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(ptr [[TMP3]])
1878 // BE-PWR9-NEXT: store <16 x i8> [[TMP4]], ptr [[__MASK_I]], align 16
1879 // BE-PWR9-NEXT: [[TMP5:%.*]] = load <16 x i8>, ptr [[__A_ADDR_I]], align 16
1880 // BE-PWR9-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
1881 // BE-PWR9-NEXT: [[TMP7:%.*]] = load <16 x i8>, ptr [[__A_ADDR_I]], align 16
1882 // BE-PWR9-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
1883 // BE-PWR9-NEXT: [[TMP9:%.*]] = load <16 x i8>, ptr [[__MASK_I]], align 16
1884 // BE-PWR9-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]])
1885 // BE-PWR9-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8>
1886 // BE-PWR9-NEXT: store <16 x i8> [[TMP11]], ptr [[__RES_I]], align 16
1887 // BE-PWR9-NEXT: [[TMP12:%.*]] = load <16 x i8>, ptr [[__RES_I]], align 16
1888 // BE-PWR9-NEXT: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP12]] to <4 x i32>
1889 // BE-PWR9-NEXT: [[TMP14:%.*]] = load ptr, ptr [[__B_ADDR_I]], align 8
1890 // BE-PWR9-NEXT: [[TMP15:%.*]] = load i64, ptr [[__C_ADDR_I]], align 8
1891 // BE-PWR9-NEXT: [[SHL_I:%.*]] = shl i64 [[TMP15]], 56
1892 // BE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], ptr [[TMP14]], i64 [[SHL_I]])
1893 // BE-PWR9-NEXT: ret void
1895 // LE-PWR9-LABEL: @test_strmb13(
1896 // LE-PWR9-NEXT: entry:
1897 // LE-PWR9-NEXT: [[__A_ADDR_I:%.*]] = alloca <16 x i8>, align 16
1898 // LE-PWR9-NEXT: [[__B_ADDR_I:%.*]] = alloca ptr, align 8
1899 // LE-PWR9-NEXT: [[__C_ADDR_I:%.*]] = alloca i64, align 8
1900 // LE-PWR9-NEXT: [[__MASK_I:%.*]] = alloca <16 x i8>, align 16
1901 // LE-PWR9-NEXT: [[__RES_I:%.*]] = alloca <16 x i8>, align 16
1902 // LE-PWR9-NEXT: [[PTR_ADDR:%.*]] = alloca ptr, align 8
1903 // LE-PWR9-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16
1904 // LE-PWR9-NEXT: store ptr [[PTR:%.*]], ptr [[PTR_ADDR]], align 8
1905 // LE-PWR9-NEXT: store <16 x i8> [[DATA:%.*]], ptr [[DATA_ADDR]], align 16
1906 // LE-PWR9-NEXT: [[TMP0:%.*]] = load <16 x i8>, ptr [[DATA_ADDR]], align 16
1907 // LE-PWR9-NEXT: [[TMP1:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8
1908 // LE-PWR9-NEXT: store <16 x i8> [[TMP0]], ptr [[__A_ADDR_I]], align 16
1909 // LE-PWR9-NEXT: store ptr [[TMP1]], ptr [[__B_ADDR_I]], align 8
1910 // LE-PWR9-NEXT: store i64 13, ptr [[__C_ADDR_I]], align 8
1911 // LE-PWR9-NEXT: [[TMP2:%.*]] = load i64, ptr [[__C_ADDR_I]], align 8
1912 // LE-PWR9-NEXT: [[SUB_I:%.*]] = sub i64 16, [[TMP2]]
1913 // LE-PWR9-NEXT: [[CONV_I:%.*]] = trunc i64 [[SUB_I]] to i8
1914 // LE-PWR9-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr null, i8 [[CONV_I]]
1915 // LE-PWR9-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(ptr [[TMP3]])
1916 // LE-PWR9-NEXT: store <16 x i8> [[TMP4]], ptr [[__MASK_I]], align 16
1917 // LE-PWR9-NEXT: [[TMP5:%.*]] = load <16 x i8>, ptr [[__A_ADDR_I]], align 16
1918 // LE-PWR9-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
1919 // LE-PWR9-NEXT: [[TMP7:%.*]] = load <16 x i8>, ptr [[__A_ADDR_I]], align 16
1920 // LE-PWR9-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
1921 // LE-PWR9-NEXT: [[TMP9:%.*]] = load <16 x i8>, ptr [[__MASK_I]], align 16
1922 // LE-PWR9-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]])
1923 // LE-PWR9-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8>
1924 // LE-PWR9-NEXT: store <16 x i8> [[TMP11]], ptr [[__RES_I]], align 16
1925 // LE-PWR9-NEXT: [[TMP12:%.*]] = load <16 x i8>, ptr [[__RES_I]], align 16
1926 // LE-PWR9-NEXT: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP12]] to <4 x i32>
1927 // LE-PWR9-NEXT: [[TMP14:%.*]] = load ptr, ptr [[__B_ADDR_I]], align 8
1928 // LE-PWR9-NEXT: [[TMP15:%.*]] = load i64, ptr [[__C_ADDR_I]], align 8
1929 // LE-PWR9-NEXT: [[SHL_I:%.*]] = shl i64 [[TMP15]], 56
1930 // LE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], ptr [[TMP14]], i64 [[SHL_I]])
1931 // LE-PWR9-NEXT: ret void
1933 // BE32-PWR9-LABEL: @test_strmb13(
1934 // BE32-PWR9-NEXT: entry:
1935 // BE32-PWR9-NEXT: [[PTR_ADDR:%.*]] = alloca ptr, align 4
1936 // BE32-PWR9-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16
1937 // BE32-PWR9-NEXT: store ptr [[PTR:%.*]], ptr [[PTR_ADDR]], align 4
1938 // BE32-PWR9-NEXT: store <16 x i8> [[DATA:%.*]], ptr [[DATA_ADDR]], align 16
1939 // BE32-PWR9-NEXT: [[TMP0:%.*]] = load ptr, ptr [[PTR_ADDR]], align 4
1940 // BE32-PWR9-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr [[DATA_ADDR]], align 16
1941 // BE32-PWR9-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
1942 // BE32-PWR9-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[TMP0]], i64 5
1943 // BE32-PWR9-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP2]], i64 1
1944 // BE32-PWR9-NEXT: store i64 [[TMP5]], ptr [[TMP3]], align 1
1945 // BE32-PWR9-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
1946 // BE32-PWR9-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP0]], i64 1
1947 // BE32-PWR9-NEXT: [[TMP9:%.*]] = extractelement <4 x i32> [[TMP6]], i64 1
1948 // BE32-PWR9-NEXT: store i32 [[TMP9]], ptr [[TMP7]], align 1
1949 // BE32-PWR9-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[TMP0]], i64 0
1950 // BE32-PWR9-NEXT: [[TMP11:%.*]] = extractelement <16 x i8> [[TMP1]], i64 3
1951 // BE32-PWR9-NEXT: store i8 [[TMP11]], ptr [[TMP10]], align 1
1952 // BE32-PWR9-NEXT: ret void
// Calls __vec_strmb(ptr, 13, data). Per the autogenerated checks above, the
// 32-bit pwr9 run expects this to expand to i64/i32/i8 stores at byte offsets
// 5, 1 and 0 (13 bytes total), while the 64-bit pwr9 runs expect an lvsl/vperm
// shuffle followed by @llvm.ppc.vsx.stxvll with the length shifted left by 56.
// Keep the body as a single call: the CHECK-NEXT lines pin the exact -O0 IR.
1954 void test_strmb13(char *ptr, vector unsigned char data) {
1955 __vec_strmb(ptr, 13, data);
1958 // BE-PWR8-LABEL: @test_strmb14(
1959 // BE-PWR8-NEXT: entry:
1960 // BE-PWR8-NEXT: [[PTR_ADDR:%.*]] = alloca ptr, align 8
1961 // BE-PWR8-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16
1962 // BE-PWR8-NEXT: store ptr [[PTR:%.*]], ptr [[PTR_ADDR]], align 8
1963 // BE-PWR8-NEXT: store <16 x i8> [[DATA:%.*]], ptr [[DATA_ADDR]], align 16
1964 // BE-PWR8-NEXT: [[TMP0:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8
1965 // BE-PWR8-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr [[DATA_ADDR]], align 16
1966 // BE-PWR8-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
1967 // BE-PWR8-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[TMP0]], i64 6
1968 // BE-PWR8-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP2]], i64 1
1969 // BE-PWR8-NEXT: store i64 [[TMP5]], ptr [[TMP3]], align 1
1970 // BE-PWR8-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
1971 // BE-PWR8-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP0]], i64 2
1972 // BE-PWR8-NEXT: [[TMP9:%.*]] = extractelement <4 x i32> [[TMP6]], i64 1
1973 // BE-PWR8-NEXT: store i32 [[TMP9]], ptr [[TMP7]], align 1
1974 // BE-PWR8-NEXT: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
1975 // BE-PWR8-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr [[TMP0]], i64 0
1976 // BE-PWR8-NEXT: [[TMP13:%.*]] = extractelement <8 x i16> [[TMP10]], i64 1
1977 // BE-PWR8-NEXT: store i16 [[TMP13]], ptr [[TMP11]], align 1
1978 // BE-PWR8-NEXT: ret void
1980 // LE-PWR8-LABEL: @test_strmb14(
1981 // LE-PWR8-NEXT: entry:
1982 // LE-PWR8-NEXT: [[PTR_ADDR:%.*]] = alloca ptr, align 8
1983 // LE-PWR8-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16
1984 // LE-PWR8-NEXT: store ptr [[PTR:%.*]], ptr [[PTR_ADDR]], align 8
1985 // LE-PWR8-NEXT: store <16 x i8> [[DATA:%.*]], ptr [[DATA_ADDR]], align 16
1986 // LE-PWR8-NEXT: [[TMP0:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8
1987 // LE-PWR8-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr [[DATA_ADDR]], align 16
1988 // LE-PWR8-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
1989 // LE-PWR8-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[TMP0]], i64 6
1990 // LE-PWR8-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP2]], i64 0
1991 // LE-PWR8-NEXT: [[TMP6:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP5]])
1992 // LE-PWR8-NEXT: store i64 [[TMP6]], ptr [[TMP3]], align 1
1993 // LE-PWR8-NEXT: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
1994 // LE-PWR8-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[TMP0]], i64 2
1995 // LE-PWR8-NEXT: [[TMP10:%.*]] = extractelement <4 x i32> [[TMP7]], i64 2
1996 // LE-PWR8-NEXT: [[TMP11:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
1997 // LE-PWR8-NEXT: store i32 [[TMP11]], ptr [[TMP8]], align 1
1998 // LE-PWR8-NEXT: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
1999 // LE-PWR8-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[TMP0]], i64 0
2000 // LE-PWR8-NEXT: [[TMP15:%.*]] = extractelement <8 x i16> [[TMP12]], i64 6
2001 // LE-PWR8-NEXT: [[TMP16:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP15]])
2002 // LE-PWR8-NEXT: store i16 [[TMP16]], ptr [[TMP13]], align 1
2003 // LE-PWR8-NEXT: ret void
2005 // BE-PWR9-LABEL: @test_strmb14(
2006 // BE-PWR9-NEXT: entry:
2007 // BE-PWR9-NEXT: [[__A_ADDR_I:%.*]] = alloca <16 x i8>, align 16
2008 // BE-PWR9-NEXT: [[__B_ADDR_I:%.*]] = alloca ptr, align 8
2009 // BE-PWR9-NEXT: [[__C_ADDR_I:%.*]] = alloca i64, align 8
2010 // BE-PWR9-NEXT: [[__MASK_I:%.*]] = alloca <16 x i8>, align 16
2011 // BE-PWR9-NEXT: [[__RES_I:%.*]] = alloca <16 x i8>, align 16
2012 // BE-PWR9-NEXT: [[PTR_ADDR:%.*]] = alloca ptr, align 8
2013 // BE-PWR9-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16
2014 // BE-PWR9-NEXT: store ptr [[PTR:%.*]], ptr [[PTR_ADDR]], align 8
2015 // BE-PWR9-NEXT: store <16 x i8> [[DATA:%.*]], ptr [[DATA_ADDR]], align 16
2016 // BE-PWR9-NEXT: [[TMP0:%.*]] = load <16 x i8>, ptr [[DATA_ADDR]], align 16
2017 // BE-PWR9-NEXT: [[TMP1:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8
2018 // BE-PWR9-NEXT: store <16 x i8> [[TMP0]], ptr [[__A_ADDR_I]], align 16
2019 // BE-PWR9-NEXT: store ptr [[TMP1]], ptr [[__B_ADDR_I]], align 8
2020 // BE-PWR9-NEXT: store i64 14, ptr [[__C_ADDR_I]], align 8
2021 // BE-PWR9-NEXT: [[TMP2:%.*]] = load i64, ptr [[__C_ADDR_I]], align 8
2022 // BE-PWR9-NEXT: [[SUB_I:%.*]] = sub i64 16, [[TMP2]]
2023 // BE-PWR9-NEXT: [[CONV_I:%.*]] = trunc i64 [[SUB_I]] to i8
2024 // BE-PWR9-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr null, i8 [[CONV_I]]
2025 // BE-PWR9-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(ptr [[TMP3]])
2026 // BE-PWR9-NEXT: store <16 x i8> [[TMP4]], ptr [[__MASK_I]], align 16
2027 // BE-PWR9-NEXT: [[TMP5:%.*]] = load <16 x i8>, ptr [[__A_ADDR_I]], align 16
2028 // BE-PWR9-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
2029 // BE-PWR9-NEXT: [[TMP7:%.*]] = load <16 x i8>, ptr [[__A_ADDR_I]], align 16
2030 // BE-PWR9-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
2031 // BE-PWR9-NEXT: [[TMP9:%.*]] = load <16 x i8>, ptr [[__MASK_I]], align 16
2032 // BE-PWR9-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]])
2033 // BE-PWR9-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8>
2034 // BE-PWR9-NEXT: store <16 x i8> [[TMP11]], ptr [[__RES_I]], align 16
2035 // BE-PWR9-NEXT: [[TMP12:%.*]] = load <16 x i8>, ptr [[__RES_I]], align 16
2036 // BE-PWR9-NEXT: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP12]] to <4 x i32>
2037 // BE-PWR9-NEXT: [[TMP14:%.*]] = load ptr, ptr [[__B_ADDR_I]], align 8
2038 // BE-PWR9-NEXT: [[TMP15:%.*]] = load i64, ptr [[__C_ADDR_I]], align 8
2039 // BE-PWR9-NEXT: [[SHL_I:%.*]] = shl i64 [[TMP15]], 56
2040 // BE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], ptr [[TMP14]], i64 [[SHL_I]])
2041 // BE-PWR9-NEXT: ret void
2043 // LE-PWR9-LABEL: @test_strmb14(
2044 // LE-PWR9-NEXT: entry:
2045 // LE-PWR9-NEXT: [[__A_ADDR_I:%.*]] = alloca <16 x i8>, align 16
2046 // LE-PWR9-NEXT: [[__B_ADDR_I:%.*]] = alloca ptr, align 8
2047 // LE-PWR9-NEXT: [[__C_ADDR_I:%.*]] = alloca i64, align 8
2048 // LE-PWR9-NEXT: [[__MASK_I:%.*]] = alloca <16 x i8>, align 16
2049 // LE-PWR9-NEXT: [[__RES_I:%.*]] = alloca <16 x i8>, align 16
2050 // LE-PWR9-NEXT: [[PTR_ADDR:%.*]] = alloca ptr, align 8
2051 // LE-PWR9-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16
2052 // LE-PWR9-NEXT: store ptr [[PTR:%.*]], ptr [[PTR_ADDR]], align 8
2053 // LE-PWR9-NEXT: store <16 x i8> [[DATA:%.*]], ptr [[DATA_ADDR]], align 16
2054 // LE-PWR9-NEXT: [[TMP0:%.*]] = load <16 x i8>, ptr [[DATA_ADDR]], align 16
2055 // LE-PWR9-NEXT: [[TMP1:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8
2056 // LE-PWR9-NEXT: store <16 x i8> [[TMP0]], ptr [[__A_ADDR_I]], align 16
2057 // LE-PWR9-NEXT: store ptr [[TMP1]], ptr [[__B_ADDR_I]], align 8
2058 // LE-PWR9-NEXT: store i64 14, ptr [[__C_ADDR_I]], align 8
2059 // LE-PWR9-NEXT: [[TMP2:%.*]] = load i64, ptr [[__C_ADDR_I]], align 8
2060 // LE-PWR9-NEXT: [[SUB_I:%.*]] = sub i64 16, [[TMP2]]
2061 // LE-PWR9-NEXT: [[CONV_I:%.*]] = trunc i64 [[SUB_I]] to i8
2062 // LE-PWR9-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr null, i8 [[CONV_I]]
2063 // LE-PWR9-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(ptr [[TMP3]])
2064 // LE-PWR9-NEXT: store <16 x i8> [[TMP4]], ptr [[__MASK_I]], align 16
2065 // LE-PWR9-NEXT: [[TMP5:%.*]] = load <16 x i8>, ptr [[__A_ADDR_I]], align 16
2066 // LE-PWR9-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
2067 // LE-PWR9-NEXT: [[TMP7:%.*]] = load <16 x i8>, ptr [[__A_ADDR_I]], align 16
2068 // LE-PWR9-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
2069 // LE-PWR9-NEXT: [[TMP9:%.*]] = load <16 x i8>, ptr [[__MASK_I]], align 16
2070 // LE-PWR9-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]])
2071 // LE-PWR9-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8>
2072 // LE-PWR9-NEXT: store <16 x i8> [[TMP11]], ptr [[__RES_I]], align 16
2073 // LE-PWR9-NEXT: [[TMP12:%.*]] = load <16 x i8>, ptr [[__RES_I]], align 16
2074 // LE-PWR9-NEXT: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP12]] to <4 x i32>
2075 // LE-PWR9-NEXT: [[TMP14:%.*]] = load ptr, ptr [[__B_ADDR_I]], align 8
2076 // LE-PWR9-NEXT: [[TMP15:%.*]] = load i64, ptr [[__C_ADDR_I]], align 8
2077 // LE-PWR9-NEXT: [[SHL_I:%.*]] = shl i64 [[TMP15]], 56
2078 // LE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], ptr [[TMP14]], i64 [[SHL_I]])
2079 // LE-PWR9-NEXT: ret void
2081 // BE32-PWR9-LABEL: @test_strmb14(
2082 // BE32-PWR9-NEXT: entry:
2083 // BE32-PWR9-NEXT: [[PTR_ADDR:%.*]] = alloca ptr, align 4
2084 // BE32-PWR9-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16
2085 // BE32-PWR9-NEXT: store ptr [[PTR:%.*]], ptr [[PTR_ADDR]], align 4
2086 // BE32-PWR9-NEXT: store <16 x i8> [[DATA:%.*]], ptr [[DATA_ADDR]], align 16
2087 // BE32-PWR9-NEXT: [[TMP0:%.*]] = load ptr, ptr [[PTR_ADDR]], align 4
2088 // BE32-PWR9-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr [[DATA_ADDR]], align 16
2089 // BE32-PWR9-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
2090 // BE32-PWR9-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[TMP0]], i64 6
2091 // BE32-PWR9-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP2]], i64 1
2092 // BE32-PWR9-NEXT: store i64 [[TMP5]], ptr [[TMP3]], align 1
2093 // BE32-PWR9-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
2094 // BE32-PWR9-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP0]], i64 2
2095 // BE32-PWR9-NEXT: [[TMP9:%.*]] = extractelement <4 x i32> [[TMP6]], i64 1
2096 // BE32-PWR9-NEXT: store i32 [[TMP9]], ptr [[TMP7]], align 1
2097 // BE32-PWR9-NEXT: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
2098 // BE32-PWR9-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr [[TMP0]], i64 0
2099 // BE32-PWR9-NEXT: [[TMP13:%.*]] = extractelement <8 x i16> [[TMP10]], i64 1
2100 // BE32-PWR9-NEXT: store i16 [[TMP13]], ptr [[TMP11]], align 1
2101 // BE32-PWR9-NEXT: ret void
// Calls __vec_strmb(ptr, 14, data). Per the autogenerated checks above, the
// pwr8 and 32-bit pwr9 runs expect i64/i32/i16 stores at byte offsets 6, 2 and
// 0 (14 bytes total; the little-endian pwr8 run additionally byte-swaps each
// piece), while the 64-bit pwr9 runs expect an lvsl/vperm shuffle followed by
// @llvm.ppc.vsx.stxvll with the length shifted left by 56.
// Keep the body as a single call: the CHECK-NEXT lines pin the exact -O0 IR.
2103 void test_strmb14(char *ptr, vector unsigned char data) {
2104 __vec_strmb(ptr, 14, data);
2107 // BE-PWR8-LABEL: @test_strmb15(
2108 // BE-PWR8-NEXT: entry:
2109 // BE-PWR8-NEXT: [[PTR_ADDR:%.*]] = alloca ptr, align 8
2110 // BE-PWR8-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16
2111 // BE-PWR8-NEXT: store ptr [[PTR:%.*]], ptr [[PTR_ADDR]], align 8
2112 // BE-PWR8-NEXT: store <16 x i8> [[DATA:%.*]], ptr [[DATA_ADDR]], align 16
2113 // BE-PWR8-NEXT: [[TMP0:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8
2114 // BE-PWR8-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr [[DATA_ADDR]], align 16
2115 // BE-PWR8-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
2116 // BE-PWR8-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[TMP0]], i64 7
2117 // BE-PWR8-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP2]], i64 1
2118 // BE-PWR8-NEXT: store i64 [[TMP5]], ptr [[TMP3]], align 1
2119 // BE-PWR8-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
2120 // BE-PWR8-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP0]], i64 3
2121 // BE-PWR8-NEXT: [[TMP9:%.*]] = extractelement <4 x i32> [[TMP6]], i64 1
2122 // BE-PWR8-NEXT: store i32 [[TMP9]], ptr [[TMP7]], align 1
2123 // BE-PWR8-NEXT: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
2124 // BE-PWR8-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr [[TMP0]], i64 1
2125 // BE-PWR8-NEXT: [[TMP13:%.*]] = extractelement <8 x i16> [[TMP10]], i64 1
2126 // BE-PWR8-NEXT: store i16 [[TMP13]], ptr [[TMP11]], align 1
2127 // BE-PWR8-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[TMP0]], i64 0
2128 // BE-PWR8-NEXT: [[TMP15:%.*]] = extractelement <16 x i8> [[TMP1]], i64 1
2129 // BE-PWR8-NEXT: store i8 [[TMP15]], ptr [[TMP14]], align 1
2130 // BE-PWR8-NEXT: ret void
2132 // LE-PWR8-LABEL: @test_strmb15(
2133 // LE-PWR8-NEXT: entry:
2134 // LE-PWR8-NEXT: [[PTR_ADDR:%.*]] = alloca ptr, align 8
2135 // LE-PWR8-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16
2136 // LE-PWR8-NEXT: store ptr [[PTR:%.*]], ptr [[PTR_ADDR]], align 8
2137 // LE-PWR8-NEXT: store <16 x i8> [[DATA:%.*]], ptr [[DATA_ADDR]], align 16
2138 // LE-PWR8-NEXT: [[TMP0:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8
2139 // LE-PWR8-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr [[DATA_ADDR]], align 16
2140 // LE-PWR8-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
2141 // LE-PWR8-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[TMP0]], i64 7
2142 // LE-PWR8-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP2]], i64 0
2143 // LE-PWR8-NEXT: [[TMP6:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP5]])
2144 // LE-PWR8-NEXT: store i64 [[TMP6]], ptr [[TMP3]], align 1
2145 // LE-PWR8-NEXT: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
2146 // LE-PWR8-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[TMP0]], i64 3
2147 // LE-PWR8-NEXT: [[TMP10:%.*]] = extractelement <4 x i32> [[TMP7]], i64 2
2148 // LE-PWR8-NEXT: [[TMP11:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
2149 // LE-PWR8-NEXT: store i32 [[TMP11]], ptr [[TMP8]], align 1
2150 // LE-PWR8-NEXT: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
2151 // LE-PWR8-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[TMP0]], i64 1
2152 // LE-PWR8-NEXT: [[TMP15:%.*]] = extractelement <8 x i16> [[TMP12]], i64 6
2153 // LE-PWR8-NEXT: [[TMP16:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP15]])
2154 // LE-PWR8-NEXT: store i16 [[TMP16]], ptr [[TMP13]], align 1
2155 // LE-PWR8-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[TMP0]], i64 0
2156 // LE-PWR8-NEXT: [[TMP18:%.*]] = extractelement <16 x i8> [[TMP1]], i64 14
2157 // LE-PWR8-NEXT: store i8 [[TMP18]], ptr [[TMP17]], align 1
2158 // LE-PWR8-NEXT: ret void
2160 // BE-PWR9-LABEL: @test_strmb15(
2161 // BE-PWR9-NEXT: entry:
2162 // BE-PWR9-NEXT: [[__A_ADDR_I:%.*]] = alloca <16 x i8>, align 16
2163 // BE-PWR9-NEXT: [[__B_ADDR_I:%.*]] = alloca ptr, align 8
2164 // BE-PWR9-NEXT: [[__C_ADDR_I:%.*]] = alloca i64, align 8
2165 // BE-PWR9-NEXT: [[__MASK_I:%.*]] = alloca <16 x i8>, align 16
2166 // BE-PWR9-NEXT: [[__RES_I:%.*]] = alloca <16 x i8>, align 16
2167 // BE-PWR9-NEXT: [[PTR_ADDR:%.*]] = alloca ptr, align 8
2168 // BE-PWR9-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16
2169 // BE-PWR9-NEXT: store ptr [[PTR:%.*]], ptr [[PTR_ADDR]], align 8
2170 // BE-PWR9-NEXT: store <16 x i8> [[DATA:%.*]], ptr [[DATA_ADDR]], align 16
2171 // BE-PWR9-NEXT: [[TMP0:%.*]] = load <16 x i8>, ptr [[DATA_ADDR]], align 16
2172 // BE-PWR9-NEXT: [[TMP1:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8
2173 // BE-PWR9-NEXT: store <16 x i8> [[TMP0]], ptr [[__A_ADDR_I]], align 16
2174 // BE-PWR9-NEXT: store ptr [[TMP1]], ptr [[__B_ADDR_I]], align 8
2175 // BE-PWR9-NEXT: store i64 15, ptr [[__C_ADDR_I]], align 8
2176 // BE-PWR9-NEXT: [[TMP2:%.*]] = load i64, ptr [[__C_ADDR_I]], align 8
2177 // BE-PWR9-NEXT: [[SUB_I:%.*]] = sub i64 16, [[TMP2]]
2178 // BE-PWR9-NEXT: [[CONV_I:%.*]] = trunc i64 [[SUB_I]] to i8
2179 // BE-PWR9-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr null, i8 [[CONV_I]]
2180 // BE-PWR9-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(ptr [[TMP3]])
2181 // BE-PWR9-NEXT: store <16 x i8> [[TMP4]], ptr [[__MASK_I]], align 16
2182 // BE-PWR9-NEXT: [[TMP5:%.*]] = load <16 x i8>, ptr [[__A_ADDR_I]], align 16
2183 // BE-PWR9-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
2184 // BE-PWR9-NEXT: [[TMP7:%.*]] = load <16 x i8>, ptr [[__A_ADDR_I]], align 16
2185 // BE-PWR9-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
2186 // BE-PWR9-NEXT: [[TMP9:%.*]] = load <16 x i8>, ptr [[__MASK_I]], align 16
2187 // BE-PWR9-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]])
2188 // BE-PWR9-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8>
2189 // BE-PWR9-NEXT: store <16 x i8> [[TMP11]], ptr [[__RES_I]], align 16
2190 // BE-PWR9-NEXT: [[TMP12:%.*]] = load <16 x i8>, ptr [[__RES_I]], align 16
2191 // BE-PWR9-NEXT: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP12]] to <4 x i32>
2192 // BE-PWR9-NEXT: [[TMP14:%.*]] = load ptr, ptr [[__B_ADDR_I]], align 8
2193 // BE-PWR9-NEXT: [[TMP15:%.*]] = load i64, ptr [[__C_ADDR_I]], align 8
2194 // BE-PWR9-NEXT: [[SHL_I:%.*]] = shl i64 [[TMP15]], 56
2195 // BE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], ptr [[TMP14]], i64 [[SHL_I]])
2196 // BE-PWR9-NEXT: ret void
2198 // LE-PWR9-LABEL: @test_strmb15(
2199 // LE-PWR9-NEXT: entry:
2200 // LE-PWR9-NEXT: [[__A_ADDR_I:%.*]] = alloca <16 x i8>, align 16
2201 // LE-PWR9-NEXT: [[__B_ADDR_I:%.*]] = alloca ptr, align 8
2202 // LE-PWR9-NEXT: [[__C_ADDR_I:%.*]] = alloca i64, align 8
2203 // LE-PWR9-NEXT: [[__MASK_I:%.*]] = alloca <16 x i8>, align 16
2204 // LE-PWR9-NEXT: [[__RES_I:%.*]] = alloca <16 x i8>, align 16
2205 // LE-PWR9-NEXT: [[PTR_ADDR:%.*]] = alloca ptr, align 8
2206 // LE-PWR9-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16
2207 // LE-PWR9-NEXT: store ptr [[PTR:%.*]], ptr [[PTR_ADDR]], align 8
2208 // LE-PWR9-NEXT: store <16 x i8> [[DATA:%.*]], ptr [[DATA_ADDR]], align 16
2209 // LE-PWR9-NEXT: [[TMP0:%.*]] = load <16 x i8>, ptr [[DATA_ADDR]], align 16
2210 // LE-PWR9-NEXT: [[TMP1:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8
2211 // LE-PWR9-NEXT: store <16 x i8> [[TMP0]], ptr [[__A_ADDR_I]], align 16
2212 // LE-PWR9-NEXT: store ptr [[TMP1]], ptr [[__B_ADDR_I]], align 8
2213 // LE-PWR9-NEXT: store i64 15, ptr [[__C_ADDR_I]], align 8
2214 // LE-PWR9-NEXT: [[TMP2:%.*]] = load i64, ptr [[__C_ADDR_I]], align 8
2215 // LE-PWR9-NEXT: [[SUB_I:%.*]] = sub i64 16, [[TMP2]]
2216 // LE-PWR9-NEXT: [[CONV_I:%.*]] = trunc i64 [[SUB_I]] to i8
2217 // LE-PWR9-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr null, i8 [[CONV_I]]
2218 // LE-PWR9-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(ptr [[TMP3]])
2219 // LE-PWR9-NEXT: store <16 x i8> [[TMP4]], ptr [[__MASK_I]], align 16
2220 // LE-PWR9-NEXT: [[TMP5:%.*]] = load <16 x i8>, ptr [[__A_ADDR_I]], align 16
2221 // LE-PWR9-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
2222 // LE-PWR9-NEXT: [[TMP7:%.*]] = load <16 x i8>, ptr [[__A_ADDR_I]], align 16
2223 // LE-PWR9-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
2224 // LE-PWR9-NEXT: [[TMP9:%.*]] = load <16 x i8>, ptr [[__MASK_I]], align 16
2225 // LE-PWR9-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]])
2226 // LE-PWR9-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8>
2227 // LE-PWR9-NEXT: store <16 x i8> [[TMP11]], ptr [[__RES_I]], align 16
2228 // LE-PWR9-NEXT: [[TMP12:%.*]] = load <16 x i8>, ptr [[__RES_I]], align 16
2229 // LE-PWR9-NEXT: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP12]] to <4 x i32>
2230 // LE-PWR9-NEXT: [[TMP14:%.*]] = load ptr, ptr [[__B_ADDR_I]], align 8
2231 // LE-PWR9-NEXT: [[TMP15:%.*]] = load i64, ptr [[__C_ADDR_I]], align 8
2232 // LE-PWR9-NEXT: [[SHL_I:%.*]] = shl i64 [[TMP15]], 56
2233 // LE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], ptr [[TMP14]], i64 [[SHL_I]])
2234 // LE-PWR9-NEXT: ret void
2236 // BE32-PWR9-LABEL: @test_strmb15(
2237 // BE32-PWR9-NEXT: entry:
2238 // BE32-PWR9-NEXT: [[PTR_ADDR:%.*]] = alloca ptr, align 4
2239 // BE32-PWR9-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16
2240 // BE32-PWR9-NEXT: store ptr [[PTR:%.*]], ptr [[PTR_ADDR]], align 4
2241 // BE32-PWR9-NEXT: store <16 x i8> [[DATA:%.*]], ptr [[DATA_ADDR]], align 16
2242 // BE32-PWR9-NEXT: [[TMP0:%.*]] = load ptr, ptr [[PTR_ADDR]], align 4
2243 // BE32-PWR9-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr [[DATA_ADDR]], align 16
2244 // BE32-PWR9-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
2245 // BE32-PWR9-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[TMP0]], i64 7
2246 // BE32-PWR9-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP2]], i64 1
2247 // BE32-PWR9-NEXT: store i64 [[TMP5]], ptr [[TMP3]], align 1
2248 // BE32-PWR9-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
2249 // BE32-PWR9-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP0]], i64 3
2250 // BE32-PWR9-NEXT: [[TMP9:%.*]] = extractelement <4 x i32> [[TMP6]], i64 1
2251 // BE32-PWR9-NEXT: store i32 [[TMP9]], ptr [[TMP7]], align 1
2252 // BE32-PWR9-NEXT: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
2253 // BE32-PWR9-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr [[TMP0]], i64 1
2254 // BE32-PWR9-NEXT: [[TMP13:%.*]] = extractelement <8 x i16> [[TMP10]], i64 1
2255 // BE32-PWR9-NEXT: store i16 [[TMP13]], ptr [[TMP11]], align 1
2256 // BE32-PWR9-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[TMP0]], i64 0
2257 // BE32-PWR9-NEXT: [[TMP15:%.*]] = extractelement <16 x i8> [[TMP1]], i64 1
2258 // BE32-PWR9-NEXT: store i8 [[TMP15]], ptr [[TMP14]], align 1
2259 // BE32-PWR9-NEXT: ret void
// Calls __vec_strmb(ptr, 15, data). Per the autogenerated checks above, the
// pwr8 and 32-bit pwr9 runs expect i64/i32/i16/i8 stores at byte offsets 7, 3,
// 1 and 0 (15 bytes total; the little-endian pwr8 run additionally byte-swaps
// the multi-byte pieces), while the 64-bit pwr9 runs expect an lvsl/vperm
// shuffle followed by @llvm.ppc.vsx.stxvll with the length shifted left by 56.
// Keep the body as a single call: the CHECK-NEXT lines pin the exact -O0 IR.
2261 void test_strmb15(char *ptr, vector unsigned char data) {
2262 __vec_strmb(ptr, 15, data);
2264 // BE-PWR8-LABEL: @test_ldrmb16(
2265 // BE-PWR8-NEXT: entry:
2266 // BE-PWR8-NEXT: [[PTR_ADDR:%.*]] = alloca ptr, align 8
2267 // BE-PWR8-NEXT: store ptr [[PTR:%.*]], ptr [[PTR_ADDR]], align 8
2268 // BE-PWR8-NEXT: [[TMP0:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8
2269 // BE-PWR8-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr [[TMP0]], align 1
2270 // BE-PWR8-NEXT: ret <16 x i8> [[TMP2]]
2272 // LE-PWR8-LABEL: @test_ldrmb16(
2273 // LE-PWR8-NEXT: entry:
2274 // LE-PWR8-NEXT: [[PTR_ADDR:%.*]] = alloca ptr, align 8
2275 // LE-PWR8-NEXT: store ptr [[PTR:%.*]], ptr [[PTR_ADDR]], align 8
2276 // LE-PWR8-NEXT: [[TMP0:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8
2277 // LE-PWR8-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr [[TMP0]], align 1
2278 // LE-PWR8-NEXT: [[TMP3:%.*]] = shufflevector <16 x i8> [[TMP2]], <16 x i8> [[TMP2]], <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
2279 // LE-PWR8-NEXT: ret <16 x i8> [[TMP3]]
2281 // BE-PWR9-LABEL: @test_ldrmb16(
2282 // BE-PWR9-NEXT: entry:
2283 // BE-PWR9-NEXT: [[__A_ADDR_I:%.*]] = alloca ptr, align 8
2284 // BE-PWR9-NEXT: [[__B_ADDR_I:%.*]] = alloca i64, align 8
2285 // BE-PWR9-NEXT: [[__RES_I:%.*]] = alloca <16 x i8>, align 16
2286 // BE-PWR9-NEXT: [[__MASK_I:%.*]] = alloca <16 x i8>, align 16
2287 // BE-PWR9-NEXT: [[PTR_ADDR:%.*]] = alloca ptr, align 8
2288 // BE-PWR9-NEXT: store ptr [[PTR:%.*]], ptr [[PTR_ADDR]], align 8
2289 // BE-PWR9-NEXT: [[TMP0:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8
2290 // BE-PWR9-NEXT: store ptr [[TMP0]], ptr [[__A_ADDR_I]], align 8
2291 // BE-PWR9-NEXT: store i64 16, ptr [[__B_ADDR_I]], align 8
2292 // BE-PWR9-NEXT: [[TMP1:%.*]] = load ptr, ptr [[__A_ADDR_I]], align 8
2293 // BE-PWR9-NEXT: [[TMP2:%.*]] = load i64, ptr [[__B_ADDR_I]], align 8
2294 // BE-PWR9-NEXT: [[SHL_I:%.*]] = shl i64 [[TMP2]], 56
2295 // BE-PWR9-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.ppc.vsx.lxvll(ptr [[TMP1]], i64 [[SHL_I]])
2296 // BE-PWR9-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
2297 // BE-PWR9-NEXT: store <16 x i8> [[TMP4]], ptr [[__RES_I]], align 16
2298 // BE-PWR9-NEXT: [[TMP5:%.*]] = load i64, ptr [[__B_ADDR_I]], align 8
2299 // BE-PWR9-NEXT: [[SUB_I:%.*]] = sub i64 16, [[TMP5]]
2300 // BE-PWR9-NEXT: [[CONV_I:%.*]] = trunc i64 [[SUB_I]] to i8
2301 // BE-PWR9-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr null, i8 [[CONV_I]]
2302 // BE-PWR9-NEXT: [[TMP7:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsr(ptr [[TMP6]])
2303 // BE-PWR9-NEXT: store <16 x i8> [[TMP7]], ptr [[__MASK_I]], align 16
2304 // BE-PWR9-NEXT: [[TMP8:%.*]] = load <16 x i8>, ptr [[__RES_I]], align 16
2305 // BE-PWR9-NEXT: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32>
2306 // BE-PWR9-NEXT: [[TMP10:%.*]] = load <16 x i8>, ptr [[__RES_I]], align 16
2307 // BE-PWR9-NEXT: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP10]] to <4 x i32>
2308 // BE-PWR9-NEXT: [[TMP12:%.*]] = load <16 x i8>, ptr [[__MASK_I]], align 16
2309 // BE-PWR9-NEXT: [[TMP13:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP9]], <4 x i32> [[TMP11]], <16 x i8> [[TMP12]])
2310 // BE-PWR9-NEXT: [[TMP14:%.*]] = bitcast <4 x i32> [[TMP13]] to <16 x i8>
2311 // BE-PWR9-NEXT: ret <16 x i8> [[TMP14]]
2313 // LE-PWR9-LABEL: @test_ldrmb16(
2314 // LE-PWR9-NEXT: entry:
2315 // LE-PWR9-NEXT: [[__A_ADDR_I:%.*]] = alloca ptr, align 8
2316 // LE-PWR9-NEXT: [[__B_ADDR_I:%.*]] = alloca i64, align 8
2317 // LE-PWR9-NEXT: [[__RES_I:%.*]] = alloca <16 x i8>, align 16
2318 // LE-PWR9-NEXT: [[__MASK_I:%.*]] = alloca <16 x i8>, align 16
2319 // LE-PWR9-NEXT: [[PTR_ADDR:%.*]] = alloca ptr, align 8
2320 // LE-PWR9-NEXT: store ptr [[PTR:%.*]], ptr [[PTR_ADDR]], align 8
2321 // LE-PWR9-NEXT: [[TMP0:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8
2322 // LE-PWR9-NEXT: store ptr [[TMP0]], ptr [[__A_ADDR_I]], align 8
2323 // LE-PWR9-NEXT: store i64 16, ptr [[__B_ADDR_I]], align 8
2324 // LE-PWR9-NEXT: [[TMP1:%.*]] = load ptr, ptr [[__A_ADDR_I]], align 8
2325 // LE-PWR9-NEXT: [[TMP2:%.*]] = load i64, ptr [[__B_ADDR_I]], align 8
2326 // LE-PWR9-NEXT: [[SHL_I:%.*]] = shl i64 [[TMP2]], 56
2327 // LE-PWR9-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.ppc.vsx.lxvll(ptr [[TMP1]], i64 [[SHL_I]])
2328 // LE-PWR9-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
2329 // LE-PWR9-NEXT: store <16 x i8> [[TMP4]], ptr [[__RES_I]], align 16
2330 // LE-PWR9-NEXT: [[TMP5:%.*]] = load i64, ptr [[__B_ADDR_I]], align 8
2331 // LE-PWR9-NEXT: [[SUB_I:%.*]] = sub i64 16, [[TMP5]]
2332 // LE-PWR9-NEXT: [[CONV_I:%.*]] = trunc i64 [[SUB_I]] to i8
2333 // LE-PWR9-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr null, i8 [[CONV_I]]
2334 // LE-PWR9-NEXT: [[TMP7:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsr(ptr [[TMP6]])
2335 // LE-PWR9-NEXT: store <16 x i8> [[TMP7]], ptr [[__MASK_I]], align 16
2336 // LE-PWR9-NEXT: [[TMP8:%.*]] = load <16 x i8>, ptr [[__RES_I]], align 16
2337 // LE-PWR9-NEXT: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32>
2338 // LE-PWR9-NEXT: [[TMP10:%.*]] = load <16 x i8>, ptr [[__RES_I]], align 16
2339 // LE-PWR9-NEXT: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP10]] to <4 x i32>
2340 // LE-PWR9-NEXT: [[TMP12:%.*]] = load <16 x i8>, ptr [[__MASK_I]], align 16
2341 // LE-PWR9-NEXT: [[TMP13:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP9]], <4 x i32> [[TMP11]], <16 x i8> [[TMP12]])
2342 // LE-PWR9-NEXT: [[TMP14:%.*]] = bitcast <4 x i32> [[TMP13]] to <16 x i8>
2343 // LE-PWR9-NEXT: ret <16 x i8> [[TMP14]]
2345 // BE32-PWR9-LABEL: @test_ldrmb16(
2346 // BE32-PWR9-NEXT: entry:
2347 // BE32-PWR9-NEXT: [[PTR_ADDR:%.*]] = alloca ptr, align 4
2348 // BE32-PWR9-NEXT: store ptr [[PTR:%.*]], ptr [[PTR_ADDR]], align 4
2349 // BE32-PWR9-NEXT: [[TMP0:%.*]] = load ptr, ptr [[PTR_ADDR]], align 4
2350 // BE32-PWR9-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr [[TMP0]], align 1
2351 // BE32-PWR9-NEXT: ret <16 x i8> [[TMP2]]
// Returns __vec_ldrmb(ptr, 16). Per the autogenerated checks above, the pwr8
// and 32-bit pwr9 runs expect a single align-1 <16 x i8> load (followed by a
// byte-reversing shufflevector on little-endian pwr8), while the 64-bit pwr9
// runs expect @llvm.ppc.vsx.lxvll with the length shifted left by 56 and then
// an lvsr/vperm shuffle.
// Keep the body as a single return: the CHECK-NEXT lines pin the exact -O0 IR.
2353 vector unsigned char test_ldrmb16(char *ptr) { return __vec_ldrmb(ptr, 16); }
2355 // BE-PWR8-LABEL: @test_strmb16(
2356 // BE-PWR8-NEXT: entry:
2357 // BE-PWR8-NEXT: [[PTR_ADDR:%.*]] = alloca ptr, align 8
2358 // BE-PWR8-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16
2359 // BE-PWR8-NEXT: store ptr [[PTR:%.*]], ptr [[PTR_ADDR]], align 8
2360 // BE-PWR8-NEXT: store <16 x i8> [[DATA:%.*]], ptr [[DATA_ADDR]], align 16
2361 // BE-PWR8-NEXT: [[TMP0:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8
2362 // BE-PWR8-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr [[DATA_ADDR]], align 16
2363 // BE-PWR8-NEXT: store <16 x i8> [[TMP1]], ptr [[TMP0]], align 1
2364 // BE-PWR8-NEXT: ret void
2366 // LE-PWR8-LABEL: @test_strmb16(
2367 // LE-PWR8-NEXT: entry:
2368 // LE-PWR8-NEXT: [[PTR_ADDR:%.*]] = alloca ptr, align 8
2369 // LE-PWR8-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16
2370 // LE-PWR8-NEXT: store ptr [[PTR:%.*]], ptr [[PTR_ADDR]], align 8
2371 // LE-PWR8-NEXT: store <16 x i8> [[DATA:%.*]], ptr [[DATA_ADDR]], align 16
2372 // LE-PWR8-NEXT: [[TMP0:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8
2373 // LE-PWR8-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr [[DATA_ADDR]], align 16
2374 // LE-PWR8-NEXT: [[TMP3:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> [[TMP1]], <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
2375 // LE-PWR8-NEXT: store <16 x i8> [[TMP3]], ptr [[TMP0]], align 1
2376 // LE-PWR8-NEXT: ret void
2378 // BE-PWR9-LABEL: @test_strmb16(
2379 // BE-PWR9-NEXT: entry:
2380 // BE-PWR9-NEXT: [[__A_ADDR_I:%.*]] = alloca <16 x i8>, align 16
2381 // BE-PWR9-NEXT: [[__B_ADDR_I:%.*]] = alloca ptr, align 8
2382 // BE-PWR9-NEXT: [[__C_ADDR_I:%.*]] = alloca i64, align 8
2383 // BE-PWR9-NEXT: [[__MASK_I:%.*]] = alloca <16 x i8>, align 16
2384 // BE-PWR9-NEXT: [[__RES_I:%.*]] = alloca <16 x i8>, align 16
2385 // BE-PWR9-NEXT: [[PTR_ADDR:%.*]] = alloca ptr, align 8
2386 // BE-PWR9-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16
2387 // BE-PWR9-NEXT: store ptr [[PTR:%.*]], ptr [[PTR_ADDR]], align 8
2388 // BE-PWR9-NEXT: store <16 x i8> [[DATA:%.*]], ptr [[DATA_ADDR]], align 16
2389 // BE-PWR9-NEXT: [[TMP0:%.*]] = load <16 x i8>, ptr [[DATA_ADDR]], align 16
2390 // BE-PWR9-NEXT: [[TMP1:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8
2391 // BE-PWR9-NEXT: store <16 x i8> [[TMP0]], ptr [[__A_ADDR_I]], align 16
2392 // BE-PWR9-NEXT: store ptr [[TMP1]], ptr [[__B_ADDR_I]], align 8
2393 // BE-PWR9-NEXT: store i64 16, ptr [[__C_ADDR_I]], align 8
2394 // BE-PWR9-NEXT: [[TMP2:%.*]] = load i64, ptr [[__C_ADDR_I]], align 8
2395 // BE-PWR9-NEXT: [[SUB_I:%.*]] = sub i64 16, [[TMP2]]
2396 // BE-PWR9-NEXT: [[CONV_I:%.*]] = trunc i64 [[SUB_I]] to i8
2397 // BE-PWR9-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr null, i8 [[CONV_I]]
2398 // BE-PWR9-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(ptr [[TMP3]])
2399 // BE-PWR9-NEXT: store <16 x i8> [[TMP4]], ptr [[__MASK_I]], align 16
2400 // BE-PWR9-NEXT: [[TMP5:%.*]] = load <16 x i8>, ptr [[__A_ADDR_I]], align 16
2401 // BE-PWR9-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
2402 // BE-PWR9-NEXT: [[TMP7:%.*]] = load <16 x i8>, ptr [[__A_ADDR_I]], align 16
2403 // BE-PWR9-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
2404 // BE-PWR9-NEXT: [[TMP9:%.*]] = load <16 x i8>, ptr [[__MASK_I]], align 16
2405 // BE-PWR9-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]])
2406 // BE-PWR9-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8>
2407 // BE-PWR9-NEXT: store <16 x i8> [[TMP11]], ptr [[__RES_I]], align 16
2408 // BE-PWR9-NEXT: [[TMP12:%.*]] = load <16 x i8>, ptr [[__RES_I]], align 16
2409 // BE-PWR9-NEXT: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP12]] to <4 x i32>
2410 // BE-PWR9-NEXT: [[TMP14:%.*]] = load ptr, ptr [[__B_ADDR_I]], align 8
2411 // BE-PWR9-NEXT: [[TMP15:%.*]] = load i64, ptr [[__C_ADDR_I]], align 8
2412 // BE-PWR9-NEXT: [[SHL_I:%.*]] = shl i64 [[TMP15]], 56
2413 // BE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], ptr [[TMP14]], i64 [[SHL_I]])
2414 // BE-PWR9-NEXT: ret void
2416 // LE-PWR9-LABEL: @test_strmb16(
2417 // LE-PWR9-NEXT: entry:
2418 // LE-PWR9-NEXT: [[__A_ADDR_I:%.*]] = alloca <16 x i8>, align 16
2419 // LE-PWR9-NEXT: [[__B_ADDR_I:%.*]] = alloca ptr, align 8
2420 // LE-PWR9-NEXT: [[__C_ADDR_I:%.*]] = alloca i64, align 8
2421 // LE-PWR9-NEXT: [[__MASK_I:%.*]] = alloca <16 x i8>, align 16
2422 // LE-PWR9-NEXT: [[__RES_I:%.*]] = alloca <16 x i8>, align 16
2423 // LE-PWR9-NEXT: [[PTR_ADDR:%.*]] = alloca ptr, align 8
2424 // LE-PWR9-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16
2425 // LE-PWR9-NEXT: store ptr [[PTR:%.*]], ptr [[PTR_ADDR]], align 8
2426 // LE-PWR9-NEXT: store <16 x i8> [[DATA:%.*]], ptr [[DATA_ADDR]], align 16
2427 // LE-PWR9-NEXT: [[TMP0:%.*]] = load <16 x i8>, ptr [[DATA_ADDR]], align 16
2428 // LE-PWR9-NEXT: [[TMP1:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8
2429 // LE-PWR9-NEXT: store <16 x i8> [[TMP0]], ptr [[__A_ADDR_I]], align 16
2430 // LE-PWR9-NEXT: store ptr [[TMP1]], ptr [[__B_ADDR_I]], align 8
2431 // LE-PWR9-NEXT: store i64 16, ptr [[__C_ADDR_I]], align 8
2432 // LE-PWR9-NEXT: [[TMP2:%.*]] = load i64, ptr [[__C_ADDR_I]], align 8
2433 // LE-PWR9-NEXT: [[SUB_I:%.*]] = sub i64 16, [[TMP2]]
2434 // LE-PWR9-NEXT: [[CONV_I:%.*]] = trunc i64 [[SUB_I]] to i8
2435 // LE-PWR9-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr null, i8 [[CONV_I]]
2436 // LE-PWR9-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(ptr [[TMP3]])
2437 // LE-PWR9-NEXT: store <16 x i8> [[TMP4]], ptr [[__MASK_I]], align 16
2438 // LE-PWR9-NEXT: [[TMP5:%.*]] = load <16 x i8>, ptr [[__A_ADDR_I]], align 16
2439 // LE-PWR9-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
2440 // LE-PWR9-NEXT: [[TMP7:%.*]] = load <16 x i8>, ptr [[__A_ADDR_I]], align 16
2441 // LE-PWR9-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
2442 // LE-PWR9-NEXT: [[TMP9:%.*]] = load <16 x i8>, ptr [[__MASK_I]], align 16
2443 // LE-PWR9-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]])
2444 // LE-PWR9-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8>
2445 // LE-PWR9-NEXT: store <16 x i8> [[TMP11]], ptr [[__RES_I]], align 16
2446 // LE-PWR9-NEXT: [[TMP12:%.*]] = load <16 x i8>, ptr [[__RES_I]], align 16
2447 // LE-PWR9-NEXT: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP12]] to <4 x i32>
2448 // LE-PWR9-NEXT: [[TMP14:%.*]] = load ptr, ptr [[__B_ADDR_I]], align 8
2449 // LE-PWR9-NEXT: [[TMP15:%.*]] = load i64, ptr [[__C_ADDR_I]], align 8
2450 // LE-PWR9-NEXT: [[SHL_I:%.*]] = shl i64 [[TMP15]], 56
2451 // LE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], ptr [[TMP14]], i64 [[SHL_I]])
2452 // LE-PWR9-NEXT: ret void
2454 // BE32-PWR9-LABEL: @test_strmb16(
2455 // BE32-PWR9-NEXT: entry:
2456 // BE32-PWR9-NEXT: [[PTR_ADDR:%.*]] = alloca ptr, align 4
2457 // BE32-PWR9-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16
2458 // BE32-PWR9-NEXT: store ptr [[PTR:%.*]], ptr [[PTR_ADDR]], align 4
2459 // BE32-PWR9-NEXT: store <16 x i8> [[DATA:%.*]], ptr [[DATA_ADDR]], align 16
2460 // BE32-PWR9-NEXT: [[TMP0:%.*]] = load ptr, ptr [[PTR_ADDR]], align 4
2461 // BE32-PWR9-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr [[DATA_ADDR]], align 16
2462 // BE32-PWR9-NEXT: store <16 x i8> [[TMP1]], ptr [[TMP0]], align 1
2463 // BE32-PWR9-NEXT: ret void
2465 void test_strmb16(char *ptr, vector unsigned char data) {
2466 __vec_strmb(ptr, 16, data);