1 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
2 ; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=SSE
3 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=SSE
4 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=AVX,AVX1
5 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=AVX,AVX2
6 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -mattr=+prefer-256-bit -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=AVX,AVX256
7 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -mattr=-prefer-256-bit -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=AVX512
8 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=AVX512
; Global arrays used as test operands. Every array is 64 bytes in total
; (8 x i64, 16 x i32, 32 x i16, 64 x i8) and is declared with align 64.
; In the tests visible in this part of the file the a/b/c arrays are loaded
; as the three fshl operands and the d array receives the results.
; NOTE(review) no use of the i8 arrays is visible in this part of the file,
; presumably a later test covers them.
10 @a64 = common global [8 x i64] zeroinitializer, align 64
11 @b64 = common global [8 x i64] zeroinitializer, align 64
12 @c64 = common global [8 x i64] zeroinitializer, align 64
13 @d64 = common global [8 x i64] zeroinitializer, align 64
14 @a32 = common global [16 x i32] zeroinitializer, align 64
15 @b32 = common global [16 x i32] zeroinitializer, align 64
16 @c32 = common global [16 x i32] zeroinitializer, align 64
17 @d32 = common global [16 x i32] zeroinitializer, align 64
18 @a16 = common global [32 x i16] zeroinitializer, align 64
19 @b16 = common global [32 x i16] zeroinitializer, align 64
20 @c16 = common global [32 x i16] zeroinitializer, align 64
21 @d16 = common global [32 x i16] zeroinitializer, align 64
22 @a8 = common global [64 x i8] zeroinitializer, align 64
23 @b8 = common global [64 x i8] zeroinitializer, align 64
24 @c8 = common global [64 x i8] zeroinitializer, align 64
25 @d8 = common global [64 x i8] zeroinitializer, align 64
; Scalar funnel-shift-left intrinsics, one per element width under test.
; Per the LLVM Language Reference, fshl(a, b, c) concatenates a (high half)
; with b (low half), shifts the double-width value left by c modulo the bit
; width, and returns the high half.
; The test functions call these once per array element.
27 declare i64 @llvm.fshl.i64(i64, i64, i64)
28 declare i32 @llvm.fshl.i32(i32, i32, i32)
29 declare i16 @llvm.fshl.i16(i16, i16, i16)
30 declare i8 @llvm.fshl.i8 (i8 , i8 , i8 )
; Test with eight scalar llvm.fshl.i64 calls over elements 0-7 of @a64, @b64
; and @c64, each result stored to the matching element of @d64.
; What the assertions below expect from the SLP vectorizer
;   SSE runs keep all eight calls scalar
;   AVX1 runs use four <2 x i64> llvm.fshl.v2i64 calls
;   AVX2 and AVX256 runs use two <4 x i64> llvm.fshl.v4i64 calls
;   AVX512 runs use a single <8 x i64> llvm.fshl.v8i64 call
; The assertion lines are autogenerated (see the NOTE at the top of the file)
; and should be regenerated with the script rather than edited by hand.
32 define void @fshl_v8i64() {
33 ; SSE-LABEL: @fshl_v8i64(
34 ; SSE-NEXT: [[A0:%.*]] = load i64, ptr @a64, align 8
35 ; SSE-NEXT: [[A1:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 1), align 8
36 ; SSE-NEXT: [[A2:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 2), align 8
37 ; SSE-NEXT: [[A3:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 3), align 8
38 ; SSE-NEXT: [[A4:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 4), align 8
39 ; SSE-NEXT: [[A5:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 5), align 8
40 ; SSE-NEXT: [[A6:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 6), align 8
41 ; SSE-NEXT: [[A7:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 7), align 8
42 ; SSE-NEXT: [[B0:%.*]] = load i64, ptr @b64, align 8
43 ; SSE-NEXT: [[B1:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 1), align 8
44 ; SSE-NEXT: [[B2:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 2), align 8
45 ; SSE-NEXT: [[B3:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 3), align 8
46 ; SSE-NEXT: [[B4:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 4), align 8
47 ; SSE-NEXT: [[B5:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 5), align 8
48 ; SSE-NEXT: [[B6:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 6), align 8
49 ; SSE-NEXT: [[B7:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 7), align 8
50 ; SSE-NEXT: [[C0:%.*]] = load i64, ptr @c64, align 8
51 ; SSE-NEXT: [[C1:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 1), align 8
52 ; SSE-NEXT: [[C2:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 2), align 8
53 ; SSE-NEXT: [[C3:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 3), align 8
54 ; SSE-NEXT: [[C4:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 4), align 8
55 ; SSE-NEXT: [[C5:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 5), align 8
56 ; SSE-NEXT: [[C6:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 6), align 8
57 ; SSE-NEXT: [[C7:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 7), align 8
58 ; SSE-NEXT: [[R0:%.*]] = call i64 @llvm.fshl.i64(i64 [[A0]], i64 [[B0]], i64 [[C0]])
59 ; SSE-NEXT: [[R1:%.*]] = call i64 @llvm.fshl.i64(i64 [[A1]], i64 [[B1]], i64 [[C1]])
60 ; SSE-NEXT: [[R2:%.*]] = call i64 @llvm.fshl.i64(i64 [[A2]], i64 [[B2]], i64 [[C2]])
61 ; SSE-NEXT: [[R3:%.*]] = call i64 @llvm.fshl.i64(i64 [[A3]], i64 [[B3]], i64 [[C3]])
62 ; SSE-NEXT: [[R4:%.*]] = call i64 @llvm.fshl.i64(i64 [[A4]], i64 [[B4]], i64 [[C4]])
63 ; SSE-NEXT: [[R5:%.*]] = call i64 @llvm.fshl.i64(i64 [[A5]], i64 [[B5]], i64 [[C5]])
64 ; SSE-NEXT: [[R6:%.*]] = call i64 @llvm.fshl.i64(i64 [[A6]], i64 [[B6]], i64 [[C6]])
65 ; SSE-NEXT: [[R7:%.*]] = call i64 @llvm.fshl.i64(i64 [[A7]], i64 [[B7]], i64 [[C7]])
66 ; SSE-NEXT: store i64 [[R0]], ptr @d64, align 8
67 ; SSE-NEXT: store i64 [[R1]], ptr getelementptr inbounds ([8 x i64], ptr @d64, i32 0, i64 1), align 8
68 ; SSE-NEXT: store i64 [[R2]], ptr getelementptr inbounds ([8 x i64], ptr @d64, i32 0, i64 2), align 8
69 ; SSE-NEXT: store i64 [[R3]], ptr getelementptr inbounds ([8 x i64], ptr @d64, i32 0, i64 3), align 8
70 ; SSE-NEXT: store i64 [[R4]], ptr getelementptr inbounds ([8 x i64], ptr @d64, i32 0, i64 4), align 8
71 ; SSE-NEXT: store i64 [[R5]], ptr getelementptr inbounds ([8 x i64], ptr @d64, i32 0, i64 5), align 8
72 ; SSE-NEXT: store i64 [[R6]], ptr getelementptr inbounds ([8 x i64], ptr @d64, i32 0, i64 6), align 8
73 ; SSE-NEXT: store i64 [[R7]], ptr getelementptr inbounds ([8 x i64], ptr @d64, i32 0, i64 7), align 8
76 ; AVX1-LABEL: @fshl_v8i64(
77 ; AVX1-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @a64, align 8
78 ; AVX1-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr @b64, align 8
79 ; AVX1-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr @c64, align 8
80 ; AVX1-NEXT: [[TMP4:%.*]] = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> [[TMP1]], <2 x i64> [[TMP2]], <2 x i64> [[TMP3]])
81 ; AVX1-NEXT: store <2 x i64> [[TMP4]], ptr @d64, align 8
82 ; AVX1-NEXT: [[TMP5:%.*]] = load <2 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 2), align 8
83 ; AVX1-NEXT: [[TMP6:%.*]] = load <2 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 2), align 8
84 ; AVX1-NEXT: [[TMP7:%.*]] = load <2 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 2), align 8
85 ; AVX1-NEXT: [[TMP8:%.*]] = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> [[TMP5]], <2 x i64> [[TMP6]], <2 x i64> [[TMP7]])
86 ; AVX1-NEXT: store <2 x i64> [[TMP8]], ptr getelementptr inbounds ([8 x i64], ptr @d64, i32 0, i64 2), align 8
87 ; AVX1-NEXT: [[TMP9:%.*]] = load <2 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 4), align 8
88 ; AVX1-NEXT: [[TMP10:%.*]] = load <2 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 4), align 8
89 ; AVX1-NEXT: [[TMP11:%.*]] = load <2 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 4), align 8
90 ; AVX1-NEXT: [[TMP12:%.*]] = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> [[TMP9]], <2 x i64> [[TMP10]], <2 x i64> [[TMP11]])
91 ; AVX1-NEXT: store <2 x i64> [[TMP12]], ptr getelementptr inbounds ([8 x i64], ptr @d64, i32 0, i64 4), align 8
92 ; AVX1-NEXT: [[TMP13:%.*]] = load <2 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 6), align 8
93 ; AVX1-NEXT: [[TMP14:%.*]] = load <2 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 6), align 8
94 ; AVX1-NEXT: [[TMP15:%.*]] = load <2 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 6), align 8
95 ; AVX1-NEXT: [[TMP16:%.*]] = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> [[TMP13]], <2 x i64> [[TMP14]], <2 x i64> [[TMP15]])
96 ; AVX1-NEXT: store <2 x i64> [[TMP16]], ptr getelementptr inbounds ([8 x i64], ptr @d64, i32 0, i64 6), align 8
99 ; AVX2-LABEL: @fshl_v8i64(
100 ; AVX2-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @a64, align 8
101 ; AVX2-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr @b64, align 8
102 ; AVX2-NEXT: [[TMP3:%.*]] = load <4 x i64>, ptr @c64, align 8
103 ; AVX2-NEXT: [[TMP4:%.*]] = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> [[TMP1]], <4 x i64> [[TMP2]], <4 x i64> [[TMP3]])
104 ; AVX2-NEXT: store <4 x i64> [[TMP4]], ptr @d64, align 8
105 ; AVX2-NEXT: [[TMP5:%.*]] = load <4 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 4), align 8
106 ; AVX2-NEXT: [[TMP6:%.*]] = load <4 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 4), align 8
107 ; AVX2-NEXT: [[TMP7:%.*]] = load <4 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 4), align 8
108 ; AVX2-NEXT: [[TMP8:%.*]] = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> [[TMP5]], <4 x i64> [[TMP6]], <4 x i64> [[TMP7]])
109 ; AVX2-NEXT: store <4 x i64> [[TMP8]], ptr getelementptr inbounds ([8 x i64], ptr @d64, i32 0, i64 4), align 8
110 ; AVX2-NEXT: ret void
112 ; AVX256-LABEL: @fshl_v8i64(
113 ; AVX256-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @a64, align 8
114 ; AVX256-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr @b64, align 8
115 ; AVX256-NEXT: [[TMP3:%.*]] = load <4 x i64>, ptr @c64, align 8
116 ; AVX256-NEXT: [[TMP4:%.*]] = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> [[TMP1]], <4 x i64> [[TMP2]], <4 x i64> [[TMP3]])
117 ; AVX256-NEXT: store <4 x i64> [[TMP4]], ptr @d64, align 8
118 ; AVX256-NEXT: [[TMP5:%.*]] = load <4 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 4), align 8
119 ; AVX256-NEXT: [[TMP6:%.*]] = load <4 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 4), align 8
120 ; AVX256-NEXT: [[TMP7:%.*]] = load <4 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 4), align 8
121 ; AVX256-NEXT: [[TMP8:%.*]] = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> [[TMP5]], <4 x i64> [[TMP6]], <4 x i64> [[TMP7]])
122 ; AVX256-NEXT: store <4 x i64> [[TMP8]], ptr getelementptr inbounds ([8 x i64], ptr @d64, i32 0, i64 4), align 8
123 ; AVX256-NEXT: ret void
125 ; AVX512-LABEL: @fshl_v8i64(
126 ; AVX512-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @a64, align 8
127 ; AVX512-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @b64, align 8
128 ; AVX512-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr @c64, align 8
129 ; AVX512-NEXT: [[TMP4:%.*]] = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> [[TMP1]], <8 x i64> [[TMP2]], <8 x i64> [[TMP3]])
130 ; AVX512-NEXT: store <8 x i64> [[TMP4]], ptr @d64, align 8
131 ; AVX512-NEXT: ret void
; Scalar input IR that the RUN lines feed to the slp-vectorizer pass. All
; loads come first, then all fshl calls, then all stores.
133 %a0 = load i64, ptr @a64, align 8
134 %a1 = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 1), align 8
135 %a2 = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 2), align 8
136 %a3 = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 3), align 8
137 %a4 = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 4), align 8
138 %a5 = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 5), align 8
139 %a6 = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 6), align 8
140 %a7 = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 7), align 8
141 %b0 = load i64, ptr @b64, align 8
142 %b1 = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 1), align 8
143 %b2 = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 2), align 8
144 %b3 = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 3), align 8
145 %b4 = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 4), align 8
146 %b5 = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 5), align 8
147 %b6 = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 6), align 8
148 %b7 = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 7), align 8
149 %c0 = load i64, ptr @c64, align 8
150 %c1 = load i64, ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 1), align 8
151 %c2 = load i64, ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 2), align 8
152 %c3 = load i64, ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 3), align 8
153 %c4 = load i64, ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 4), align 8
154 %c5 = load i64, ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 5), align 8
155 %c6 = load i64, ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 6), align 8
156 %c7 = load i64, ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 7), align 8
157 %r0 = call i64 @llvm.fshl.i64(i64 %a0, i64 %b0, i64 %c0)
158 %r1 = call i64 @llvm.fshl.i64(i64 %a1, i64 %b1, i64 %c1)
159 %r2 = call i64 @llvm.fshl.i64(i64 %a2, i64 %b2, i64 %c2)
160 %r3 = call i64 @llvm.fshl.i64(i64 %a3, i64 %b3, i64 %c3)
161 %r4 = call i64 @llvm.fshl.i64(i64 %a4, i64 %b4, i64 %c4)
162 %r5 = call i64 @llvm.fshl.i64(i64 %a5, i64 %b5, i64 %c5)
163 %r6 = call i64 @llvm.fshl.i64(i64 %a6, i64 %b6, i64 %c6)
164 %r7 = call i64 @llvm.fshl.i64(i64 %a7, i64 %b7, i64 %c7)
165 store i64 %r0, ptr @d64, align 8
166 store i64 %r1, ptr getelementptr inbounds ([8 x i64], ptr @d64, i32 0, i64 1), align 8
167 store i64 %r2, ptr getelementptr inbounds ([8 x i64], ptr @d64, i32 0, i64 2), align 8
168 store i64 %r3, ptr getelementptr inbounds ([8 x i64], ptr @d64, i32 0, i64 3), align 8
169 store i64 %r4, ptr getelementptr inbounds ([8 x i64], ptr @d64, i32 0, i64 4), align 8
170 store i64 %r5, ptr getelementptr inbounds ([8 x i64], ptr @d64, i32 0, i64 5), align 8
171 store i64 %r6, ptr getelementptr inbounds ([8 x i64], ptr @d64, i32 0, i64 6), align 8
172 store i64 %r7, ptr getelementptr inbounds ([8 x i64], ptr @d64, i32 0, i64 7), align 8
; Test with sixteen scalar llvm.fshl.i32 calls over elements 0-15 of @a32,
; @b32 and @c32, each result stored to the matching element of @d32.
; What the assertions below expect from the SLP vectorizer
;   SSE runs keep all sixteen calls scalar
;   AVX runs (the prefix shared by the AVX1, AVX2 and AVX256 RUN lines) use
;   two <8 x i32> llvm.fshl.v8i32 calls
;   AVX512 runs use a single <16 x i32> llvm.fshl.v16i32 call
; The input IR addresses element 0 through an explicit zero-index
; getelementptr, while the assertions expect it printed as the bare global.
; The assertion lines are autogenerated (see the NOTE at the top of the file)
; and should be regenerated with the script rather than edited by hand.
176 define void @fshl_v16i32() {
177 ; SSE-LABEL: @fshl_v16i32(
178 ; SSE-NEXT: [[A0:%.*]] = load i32, ptr @a32, align 4
179 ; SSE-NEXT: [[A1:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 1), align 4
180 ; SSE-NEXT: [[A2:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 2), align 4
181 ; SSE-NEXT: [[A3:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 3), align 4
182 ; SSE-NEXT: [[A4:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 4), align 4
183 ; SSE-NEXT: [[A5:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 5), align 4
184 ; SSE-NEXT: [[A6:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 6), align 4
185 ; SSE-NEXT: [[A7:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 7), align 4
186 ; SSE-NEXT: [[A8:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 8), align 4
187 ; SSE-NEXT: [[A9:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 9), align 4
188 ; SSE-NEXT: [[A10:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 10), align 4
189 ; SSE-NEXT: [[A11:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 11), align 4
190 ; SSE-NEXT: [[A12:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 12), align 4
191 ; SSE-NEXT: [[A13:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 13), align 4
192 ; SSE-NEXT: [[A14:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 14), align 4
193 ; SSE-NEXT: [[A15:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 15), align 4
194 ; SSE-NEXT: [[B0:%.*]] = load i32, ptr @b32, align 4
195 ; SSE-NEXT: [[B1:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 1), align 4
196 ; SSE-NEXT: [[B2:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 2), align 4
197 ; SSE-NEXT: [[B3:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 3), align 4
198 ; SSE-NEXT: [[B4:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 4), align 4
199 ; SSE-NEXT: [[B5:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 5), align 4
200 ; SSE-NEXT: [[B6:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 6), align 4
201 ; SSE-NEXT: [[B7:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 7), align 4
202 ; SSE-NEXT: [[B8:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 8), align 4
203 ; SSE-NEXT: [[B9:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 9), align 4
204 ; SSE-NEXT: [[B10:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 10), align 4
205 ; SSE-NEXT: [[B11:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 11), align 4
206 ; SSE-NEXT: [[B12:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 12), align 4
207 ; SSE-NEXT: [[B13:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 13), align 4
208 ; SSE-NEXT: [[B14:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 14), align 4
209 ; SSE-NEXT: [[B15:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 15), align 4
210 ; SSE-NEXT: [[C0:%.*]] = load i32, ptr @c32, align 4
211 ; SSE-NEXT: [[C1:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 1), align 4
212 ; SSE-NEXT: [[C2:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 2), align 4
213 ; SSE-NEXT: [[C3:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 3), align 4
214 ; SSE-NEXT: [[C4:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 4), align 4
215 ; SSE-NEXT: [[C5:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 5), align 4
216 ; SSE-NEXT: [[C6:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 6), align 4
217 ; SSE-NEXT: [[C7:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 7), align 4
218 ; SSE-NEXT: [[C8:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 8), align 4
219 ; SSE-NEXT: [[C9:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 9), align 4
220 ; SSE-NEXT: [[C10:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 10), align 4
221 ; SSE-NEXT: [[C11:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 11), align 4
222 ; SSE-NEXT: [[C12:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 12), align 4
223 ; SSE-NEXT: [[C13:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 13), align 4
224 ; SSE-NEXT: [[C14:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 14), align 4
225 ; SSE-NEXT: [[C15:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 15), align 4
226 ; SSE-NEXT: [[R0:%.*]] = call i32 @llvm.fshl.i32(i32 [[A0]], i32 [[B0]], i32 [[C0]])
227 ; SSE-NEXT: [[R1:%.*]] = call i32 @llvm.fshl.i32(i32 [[A1]], i32 [[B1]], i32 [[C1]])
228 ; SSE-NEXT: [[R2:%.*]] = call i32 @llvm.fshl.i32(i32 [[A2]], i32 [[B2]], i32 [[C2]])
229 ; SSE-NEXT: [[R3:%.*]] = call i32 @llvm.fshl.i32(i32 [[A3]], i32 [[B3]], i32 [[C3]])
230 ; SSE-NEXT: [[R4:%.*]] = call i32 @llvm.fshl.i32(i32 [[A4]], i32 [[B4]], i32 [[C4]])
231 ; SSE-NEXT: [[R5:%.*]] = call i32 @llvm.fshl.i32(i32 [[A5]], i32 [[B5]], i32 [[C5]])
232 ; SSE-NEXT: [[R6:%.*]] = call i32 @llvm.fshl.i32(i32 [[A6]], i32 [[B6]], i32 [[C6]])
233 ; SSE-NEXT: [[R7:%.*]] = call i32 @llvm.fshl.i32(i32 [[A7]], i32 [[B7]], i32 [[C7]])
234 ; SSE-NEXT: [[R8:%.*]] = call i32 @llvm.fshl.i32(i32 [[A8]], i32 [[B8]], i32 [[C8]])
235 ; SSE-NEXT: [[R9:%.*]] = call i32 @llvm.fshl.i32(i32 [[A9]], i32 [[B9]], i32 [[C9]])
236 ; SSE-NEXT: [[R10:%.*]] = call i32 @llvm.fshl.i32(i32 [[A10]], i32 [[B10]], i32 [[C10]])
237 ; SSE-NEXT: [[R11:%.*]] = call i32 @llvm.fshl.i32(i32 [[A11]], i32 [[B11]], i32 [[C11]])
238 ; SSE-NEXT: [[R12:%.*]] = call i32 @llvm.fshl.i32(i32 [[A12]], i32 [[B12]], i32 [[C12]])
239 ; SSE-NEXT: [[R13:%.*]] = call i32 @llvm.fshl.i32(i32 [[A13]], i32 [[B13]], i32 [[C13]])
240 ; SSE-NEXT: [[R14:%.*]] = call i32 @llvm.fshl.i32(i32 [[A14]], i32 [[B14]], i32 [[C14]])
241 ; SSE-NEXT: [[R15:%.*]] = call i32 @llvm.fshl.i32(i32 [[A15]], i32 [[B15]], i32 [[C15]])
242 ; SSE-NEXT: store i32 [[R0]], ptr @d32, align 4
243 ; SSE-NEXT: store i32 [[R1]], ptr getelementptr inbounds ([16 x i32], ptr @d32, i32 0, i64 1), align 4
244 ; SSE-NEXT: store i32 [[R2]], ptr getelementptr inbounds ([16 x i32], ptr @d32, i32 0, i64 2), align 4
245 ; SSE-NEXT: store i32 [[R3]], ptr getelementptr inbounds ([16 x i32], ptr @d32, i32 0, i64 3), align 4
246 ; SSE-NEXT: store i32 [[R4]], ptr getelementptr inbounds ([16 x i32], ptr @d32, i32 0, i64 4), align 4
247 ; SSE-NEXT: store i32 [[R5]], ptr getelementptr inbounds ([16 x i32], ptr @d32, i32 0, i64 5), align 4
248 ; SSE-NEXT: store i32 [[R6]], ptr getelementptr inbounds ([16 x i32], ptr @d32, i32 0, i64 6), align 4
249 ; SSE-NEXT: store i32 [[R7]], ptr getelementptr inbounds ([16 x i32], ptr @d32, i32 0, i64 7), align 4
250 ; SSE-NEXT: store i32 [[R8]], ptr getelementptr inbounds ([16 x i32], ptr @d32, i32 0, i64 8), align 4
251 ; SSE-NEXT: store i32 [[R9]], ptr getelementptr inbounds ([16 x i32], ptr @d32, i32 0, i64 9), align 4
252 ; SSE-NEXT: store i32 [[R10]], ptr getelementptr inbounds ([16 x i32], ptr @d32, i32 0, i64 10), align 4
253 ; SSE-NEXT: store i32 [[R11]], ptr getelementptr inbounds ([16 x i32], ptr @d32, i32 0, i64 11), align 4
254 ; SSE-NEXT: store i32 [[R12]], ptr getelementptr inbounds ([16 x i32], ptr @d32, i32 0, i64 12), align 4
255 ; SSE-NEXT: store i32 [[R13]], ptr getelementptr inbounds ([16 x i32], ptr @d32, i32 0, i64 13), align 4
256 ; SSE-NEXT: store i32 [[R14]], ptr getelementptr inbounds ([16 x i32], ptr @d32, i32 0, i64 14), align 4
257 ; SSE-NEXT: store i32 [[R15]], ptr getelementptr inbounds ([16 x i32], ptr @d32, i32 0, i64 15), align 4
260 ; AVX-LABEL: @fshl_v16i32(
261 ; AVX-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @a32, align 4
262 ; AVX-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr @b32, align 4
263 ; AVX-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr @c32, align 4
264 ; AVX-NEXT: [[TMP4:%.*]] = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> [[TMP3]])
265 ; AVX-NEXT: store <8 x i32> [[TMP4]], ptr @d32, align 4
266 ; AVX-NEXT: [[TMP5:%.*]] = load <8 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 8), align 4
267 ; AVX-NEXT: [[TMP6:%.*]] = load <8 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 8), align 4
268 ; AVX-NEXT: [[TMP7:%.*]] = load <8 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 8), align 4
269 ; AVX-NEXT: [[TMP8:%.*]] = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> [[TMP5]], <8 x i32> [[TMP6]], <8 x i32> [[TMP7]])
270 ; AVX-NEXT: store <8 x i32> [[TMP8]], ptr getelementptr inbounds ([16 x i32], ptr @d32, i32 0, i64 8), align 4
273 ; AVX512-LABEL: @fshl_v16i32(
274 ; AVX512-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @a32, align 4
275 ; AVX512-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @b32, align 4
276 ; AVX512-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr @c32, align 4
277 ; AVX512-NEXT: [[TMP4:%.*]] = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> [[TMP1]], <16 x i32> [[TMP2]], <16 x i32> [[TMP3]])
278 ; AVX512-NEXT: store <16 x i32> [[TMP4]], ptr @d32, align 4
279 ; AVX512-NEXT: ret void
; Scalar input IR that the RUN lines feed to the slp-vectorizer pass. All
; loads come first, then all fshl calls, then all stores.
281 %a0 = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 0 ), align 4
282 %a1 = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 1 ), align 4
283 %a2 = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 2 ), align 4
284 %a3 = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 3 ), align 4
285 %a4 = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 4 ), align 4
286 %a5 = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 5 ), align 4
287 %a6 = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 6 ), align 4
288 %a7 = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 7 ), align 4
289 %a8 = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 8 ), align 4
290 %a9 = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 9 ), align 4
291 %a10 = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 10), align 4
292 %a11 = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 11), align 4
293 %a12 = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 12), align 4
294 %a13 = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 13), align 4
295 %a14 = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 14), align 4
296 %a15 = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 15), align 4
297 %b0 = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 0 ), align 4
298 %b1 = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 1 ), align 4
299 %b2 = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 2 ), align 4
300 %b3 = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 3 ), align 4
301 %b4 = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 4 ), align 4
302 %b5 = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 5 ), align 4
303 %b6 = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 6 ), align 4
304 %b7 = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 7 ), align 4
305 %b8 = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 8 ), align 4
306 %b9 = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 9 ), align 4
307 %b10 = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 10), align 4
308 %b11 = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 11), align 4
309 %b12 = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 12), align 4
310 %b13 = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 13), align 4
311 %b14 = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 14), align 4
312 %b15 = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 15), align 4
313 %c0 = load i32, ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 0 ), align 4
314 %c1 = load i32, ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 1 ), align 4
315 %c2 = load i32, ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 2 ), align 4
316 %c3 = load i32, ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 3 ), align 4
317 %c4 = load i32, ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 4 ), align 4
318 %c5 = load i32, ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 5 ), align 4
319 %c6 = load i32, ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 6 ), align 4
320 %c7 = load i32, ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 7 ), align 4
321 %c8 = load i32, ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 8 ), align 4
322 %c9 = load i32, ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 9 ), align 4
323 %c10 = load i32, ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 10), align 4
324 %c11 = load i32, ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 11), align 4
325 %c12 = load i32, ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 12), align 4
326 %c13 = load i32, ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 13), align 4
327 %c14 = load i32, ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 14), align 4
328 %c15 = load i32, ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 15), align 4
329 %r0 = call i32 @llvm.fshl.i32(i32 %a0 , i32 %b0 , i32 %c0 )
330 %r1 = call i32 @llvm.fshl.i32(i32 %a1 , i32 %b1 , i32 %c1 )
331 %r2 = call i32 @llvm.fshl.i32(i32 %a2 , i32 %b2 , i32 %c2 )
332 %r3 = call i32 @llvm.fshl.i32(i32 %a3 , i32 %b3 , i32 %c3 )
333 %r4 = call i32 @llvm.fshl.i32(i32 %a4 , i32 %b4 , i32 %c4 )
334 %r5 = call i32 @llvm.fshl.i32(i32 %a5 , i32 %b5 , i32 %c5 )
335 %r6 = call i32 @llvm.fshl.i32(i32 %a6 , i32 %b6 , i32 %c6 )
336 %r7 = call i32 @llvm.fshl.i32(i32 %a7 , i32 %b7 , i32 %c7 )
337 %r8 = call i32 @llvm.fshl.i32(i32 %a8 , i32 %b8 , i32 %c8 )
338 %r9 = call i32 @llvm.fshl.i32(i32 %a9 , i32 %b9 , i32 %c9 )
339 %r10 = call i32 @llvm.fshl.i32(i32 %a10, i32 %b10, i32 %c10)
340 %r11 = call i32 @llvm.fshl.i32(i32 %a11, i32 %b11, i32 %c11)
341 %r12 = call i32 @llvm.fshl.i32(i32 %a12, i32 %b12, i32 %c12)
342 %r13 = call i32 @llvm.fshl.i32(i32 %a13, i32 %b13, i32 %c13)
343 %r14 = call i32 @llvm.fshl.i32(i32 %a14, i32 %b14, i32 %c14)
344 %r15 = call i32 @llvm.fshl.i32(i32 %a15, i32 %b15, i32 %c15)
345 store i32 %r0 , ptr getelementptr inbounds ([16 x i32], ptr @d32, i32 0, i64 0 ), align 4
346 store i32 %r1 , ptr getelementptr inbounds ([16 x i32], ptr @d32, i32 0, i64 1 ), align 4
347 store i32 %r2 , ptr getelementptr inbounds ([16 x i32], ptr @d32, i32 0, i64 2 ), align 4
348 store i32 %r3 , ptr getelementptr inbounds ([16 x i32], ptr @d32, i32 0, i64 3 ), align 4
349 store i32 %r4 , ptr getelementptr inbounds ([16 x i32], ptr @d32, i32 0, i64 4 ), align 4
350 store i32 %r5 , ptr getelementptr inbounds ([16 x i32], ptr @d32, i32 0, i64 5 ), align 4
351 store i32 %r6 , ptr getelementptr inbounds ([16 x i32], ptr @d32, i32 0, i64 6 ), align 4
352 store i32 %r7 , ptr getelementptr inbounds ([16 x i32], ptr @d32, i32 0, i64 7 ), align 4
353 store i32 %r8 , ptr getelementptr inbounds ([16 x i32], ptr @d32, i32 0, i64 8 ), align 4
354 store i32 %r9 , ptr getelementptr inbounds ([16 x i32], ptr @d32, i32 0, i64 9 ), align 4
355 store i32 %r10, ptr getelementptr inbounds ([16 x i32], ptr @d32, i32 0, i64 10), align 4
356 store i32 %r11, ptr getelementptr inbounds ([16 x i32], ptr @d32, i32 0, i64 11), align 4
357 store i32 %r12, ptr getelementptr inbounds ([16 x i32], ptr @d32, i32 0, i64 12), align 4
358 store i32 %r13, ptr getelementptr inbounds ([16 x i32], ptr @d32, i32 0, i64 13), align 4
359 store i32 %r14, ptr getelementptr inbounds ([16 x i32], ptr @d32, i32 0, i64 14), align 4
360 store i32 %r15, ptr getelementptr inbounds ([16 x i32], ptr @d32, i32 0, i64 15), align 4
; fshl_v32i16 - scalar input for the SLP vectorizer: for each of the 32 i16 lanes it
; loads one element from @a16, @b16 and @c16, calls @llvm.fshl.i16 on the three
; values, and stores the result to the same lane of @d16.
;
; Expected output per check prefix (the prefixes are chosen by the RUN lines at the
; top of the file):
;   * SSE prefix    - four <8 x i16> load/fshl/store groups (lanes 0, 8, 16, 24)
;   * AVX prefix    - two <16 x i16> load/fshl/store groups (lanes 0, 16)
;   * AVX512 prefix - one <32 x i16> load/fshl/store group
;
; The assertions below are autogenerated by utils/update_test_checks.py (see the
; NOTE on the first line of the file); regenerate them with that script rather than
; editing them by hand.
364 define void @fshl_v32i16() {
365 ; SSE-LABEL: @fshl_v32i16(
366 ; SSE-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr @a16, align 2
367 ; SSE-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr @b16, align 2
368 ; SSE-NEXT: [[TMP3:%.*]] = load <8 x i16>, ptr @c16, align 2
369 ; SSE-NEXT: [[TMP4:%.*]] = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> [[TMP1]], <8 x i16> [[TMP2]], <8 x i16> [[TMP3]])
370 ; SSE-NEXT: store <8 x i16> [[TMP4]], ptr @d16, align 2
371 ; SSE-NEXT: [[TMP5:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 8), align 2
372 ; SSE-NEXT: [[TMP6:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 8), align 2
373 ; SSE-NEXT: [[TMP7:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 8), align 2
374 ; SSE-NEXT: [[TMP8:%.*]] = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> [[TMP5]], <8 x i16> [[TMP6]], <8 x i16> [[TMP7]])
375 ; SSE-NEXT: store <8 x i16> [[TMP8]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 8), align 2
376 ; SSE-NEXT: [[TMP9:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 16), align 2
377 ; SSE-NEXT: [[TMP10:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 16), align 2
378 ; SSE-NEXT: [[TMP11:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 16), align 2
379 ; SSE-NEXT: [[TMP12:%.*]] = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> [[TMP9]], <8 x i16> [[TMP10]], <8 x i16> [[TMP11]])
380 ; SSE-NEXT: store <8 x i16> [[TMP12]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 16), align 2
381 ; SSE-NEXT: [[TMP13:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 24), align 2
382 ; SSE-NEXT: [[TMP14:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 24), align 2
383 ; SSE-NEXT: [[TMP15:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 24), align 2
384 ; SSE-NEXT: [[TMP16:%.*]] = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> [[TMP13]], <8 x i16> [[TMP14]], <8 x i16> [[TMP15]])
385 ; SSE-NEXT: store <8 x i16> [[TMP16]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 24), align 2
388 ; AVX-LABEL: @fshl_v32i16(
389 ; AVX-NEXT: [[TMP1:%.*]] = load <16 x i16>, ptr @a16, align 2
390 ; AVX-NEXT: [[TMP2:%.*]] = load <16 x i16>, ptr @b16, align 2
391 ; AVX-NEXT: [[TMP3:%.*]] = load <16 x i16>, ptr @c16, align 2
392 ; AVX-NEXT: [[TMP4:%.*]] = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> [[TMP1]], <16 x i16> [[TMP2]], <16 x i16> [[TMP3]])
393 ; AVX-NEXT: store <16 x i16> [[TMP4]], ptr @d16, align 2
394 ; AVX-NEXT: [[TMP5:%.*]] = load <16 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 16), align 2
395 ; AVX-NEXT: [[TMP6:%.*]] = load <16 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 16), align 2
396 ; AVX-NEXT: [[TMP7:%.*]] = load <16 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 16), align 2
397 ; AVX-NEXT: [[TMP8:%.*]] = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> [[TMP5]], <16 x i16> [[TMP6]], <16 x i16> [[TMP7]])
398 ; AVX-NEXT: store <16 x i16> [[TMP8]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 16), align 2
401 ; AVX512-LABEL: @fshl_v32i16(
402 ; AVX512-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr @a16, align 2
403 ; AVX512-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr @b16, align 2
404 ; AVX512-NEXT: [[TMP3:%.*]] = load <32 x i16>, ptr @c16, align 2
405 ; AVX512-NEXT: [[TMP4:%.*]] = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> [[TMP1]], <32 x i16> [[TMP2]], <32 x i16> [[TMP3]])
406 ; AVX512-NEXT: store <32 x i16> [[TMP4]], ptr @d16, align 2
407 ; AVX512-NEXT: ret void
; Scalar loads of lanes 0-31 of @a16 (first operand of each fshl call below).
409 %a0 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 0 ), align 2
410 %a1 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 1 ), align 2
411 %a2 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 2 ), align 2
412 %a3 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 3 ), align 2
413 %a4 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 4 ), align 2
414 %a5 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 5 ), align 2
415 %a6 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 6 ), align 2
416 %a7 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 7 ), align 2
417 %a8 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 8 ), align 2
418 %a9 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 9 ), align 2
419 %a10 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 10), align 2
420 %a11 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 11), align 2
421 %a12 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 12), align 2
422 %a13 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 13), align 2
423 %a14 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 14), align 2
424 %a15 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 15), align 2
425 %a16 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 16), align 2
426 %a17 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 17), align 2
427 %a18 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 18), align 2
428 %a19 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 19), align 2
429 %a20 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 20), align 2
430 %a21 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 21), align 2
431 %a22 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 22), align 2
432 %a23 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 23), align 2
433 %a24 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 24), align 2
434 %a25 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 25), align 2
435 %a26 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 26), align 2
436 %a27 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 27), align 2
437 %a28 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 28), align 2
438 %a29 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 29), align 2
439 %a30 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 30), align 2
440 %a31 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 31), align 2
; Scalar loads of lanes 0-31 of @b16 (second operand of each fshl call below).
441 %b0 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 0 ), align 2
442 %b1 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 1 ), align 2
443 %b2 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 2 ), align 2
444 %b3 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 3 ), align 2
445 %b4 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 4 ), align 2
446 %b5 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 5 ), align 2
447 %b6 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 6 ), align 2
448 %b7 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 7 ), align 2
449 %b8 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 8 ), align 2
450 %b9 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 9 ), align 2
451 %b10 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 10), align 2
452 %b11 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 11), align 2
453 %b12 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 12), align 2
454 %b13 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 13), align 2
455 %b14 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 14), align 2
456 %b15 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 15), align 2
457 %b16 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 16), align 2
458 %b17 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 17), align 2
459 %b18 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 18), align 2
460 %b19 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 19), align 2
461 %b20 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 20), align 2
462 %b21 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 21), align 2
463 %b22 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 22), align 2
464 %b23 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 23), align 2
465 %b24 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 24), align 2
466 %b25 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 25), align 2
467 %b26 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 26), align 2
468 %b27 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 27), align 2
469 %b28 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 28), align 2
470 %b29 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 29), align 2
471 %b30 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 30), align 2
472 %b31 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 31), align 2
; Scalar loads of lanes 0-31 of @c16 (third operand of each fshl call below).
473 %c0 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 0 ), align 2
474 %c1 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 1 ), align 2
475 %c2 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 2 ), align 2
476 %c3 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 3 ), align 2
477 %c4 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 4 ), align 2
478 %c5 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 5 ), align 2
479 %c6 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 6 ), align 2
480 %c7 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 7 ), align 2
481 %c8 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 8 ), align 2
482 %c9 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 9 ), align 2
483 %c10 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 10), align 2
484 %c11 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 11), align 2
485 %c12 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 12), align 2
486 %c13 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 13), align 2
487 %c14 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 14), align 2
488 %c15 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 15), align 2
489 %c16 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 16), align 2
490 %c17 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 17), align 2
491 %c18 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 18), align 2
492 %c19 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 19), align 2
493 %c20 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 20), align 2
494 %c21 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 21), align 2
495 %c22 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 22), align 2
496 %c23 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 23), align 2
497 %c24 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 24), align 2
498 %c25 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 25), align 2
499 %c26 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 26), align 2
500 %c27 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 27), align 2
501 %c28 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 28), align 2
502 %c29 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 29), align 2
503 %c30 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 30), align 2
504 %c31 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 31), align 2
; One scalar @llvm.fshl.i16 call per lane: %rN = fshl(%aN, %bN, %cN).
505 %r0 = call i16 @llvm.fshl.i16(i16 %a0 , i16 %b0 , i16 %c0 )
506 %r1 = call i16 @llvm.fshl.i16(i16 %a1 , i16 %b1 , i16 %c1 )
507 %r2 = call i16 @llvm.fshl.i16(i16 %a2 , i16 %b2 , i16 %c2 )
508 %r3 = call i16 @llvm.fshl.i16(i16 %a3 , i16 %b3 , i16 %c3 )
509 %r4 = call i16 @llvm.fshl.i16(i16 %a4 , i16 %b4 , i16 %c4 )
510 %r5 = call i16 @llvm.fshl.i16(i16 %a5 , i16 %b5 , i16 %c5 )
511 %r6 = call i16 @llvm.fshl.i16(i16 %a6 , i16 %b6 , i16 %c6 )
512 %r7 = call i16 @llvm.fshl.i16(i16 %a7 , i16 %b7 , i16 %c7 )
513 %r8 = call i16 @llvm.fshl.i16(i16 %a8 , i16 %b8 , i16 %c8 )
514 %r9 = call i16 @llvm.fshl.i16(i16 %a9 , i16 %b9 , i16 %c9 )
515 %r10 = call i16 @llvm.fshl.i16(i16 %a10, i16 %b10, i16 %c10)
516 %r11 = call i16 @llvm.fshl.i16(i16 %a11, i16 %b11, i16 %c11)
517 %r12 = call i16 @llvm.fshl.i16(i16 %a12, i16 %b12, i16 %c12)
518 %r13 = call i16 @llvm.fshl.i16(i16 %a13, i16 %b13, i16 %c13)
519 %r14 = call i16 @llvm.fshl.i16(i16 %a14, i16 %b14, i16 %c14)
520 %r15 = call i16 @llvm.fshl.i16(i16 %a15, i16 %b15, i16 %c15)
521 %r16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %b16, i16 %c16)
522 %r17 = call i16 @llvm.fshl.i16(i16 %a17, i16 %b17, i16 %c17)
523 %r18 = call i16 @llvm.fshl.i16(i16 %a18, i16 %b18, i16 %c18)
524 %r19 = call i16 @llvm.fshl.i16(i16 %a19, i16 %b19, i16 %c19)
525 %r20 = call i16 @llvm.fshl.i16(i16 %a20, i16 %b20, i16 %c20)
526 %r21 = call i16 @llvm.fshl.i16(i16 %a21, i16 %b21, i16 %c21)
527 %r22 = call i16 @llvm.fshl.i16(i16 %a22, i16 %b22, i16 %c22)
528 %r23 = call i16 @llvm.fshl.i16(i16 %a23, i16 %b23, i16 %c23)
529 %r24 = call i16 @llvm.fshl.i16(i16 %a24, i16 %b24, i16 %c24)
530 %r25 = call i16 @llvm.fshl.i16(i16 %a25, i16 %b25, i16 %c25)
531 %r26 = call i16 @llvm.fshl.i16(i16 %a26, i16 %b26, i16 %c26)
532 %r27 = call i16 @llvm.fshl.i16(i16 %a27, i16 %b27, i16 %c27)
533 %r28 = call i16 @llvm.fshl.i16(i16 %a28, i16 %b28, i16 %c28)
534 %r29 = call i16 @llvm.fshl.i16(i16 %a29, i16 %b29, i16 %c29)
535 %r30 = call i16 @llvm.fshl.i16(i16 %a30, i16 %b30, i16 %c30)
536 %r31 = call i16 @llvm.fshl.i16(i16 %a31, i16 %b31, i16 %c31)
; Store each lane's result %rN to element N of @d16.
537 store i16 %r0 , ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 0 ), align 2
538 store i16 %r1 , ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 1 ), align 2
539 store i16 %r2 , ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 2 ), align 2
540 store i16 %r3 , ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 3 ), align 2
541 store i16 %r4 , ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 4 ), align 2
542 store i16 %r5 , ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 5 ), align 2
543 store i16 %r6 , ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 6 ), align 2
544 store i16 %r7 , ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 7 ), align 2
545 store i16 %r8 , ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 8 ), align 2
546 store i16 %r9 , ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 9 ), align 2
547 store i16 %r10, ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 10), align 2
548 store i16 %r11, ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 11), align 2
549 store i16 %r12, ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 12), align 2
550 store i16 %r13, ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 13), align 2
551 store i16 %r14, ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 14), align 2
552 store i16 %r15, ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 15), align 2
553 store i16 %r16, ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 16), align 2
554 store i16 %r17, ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 17), align 2
555 store i16 %r18, ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 18), align 2
556 store i16 %r19, ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 19), align 2
557 store i16 %r20, ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 20), align 2
558 store i16 %r21, ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 21), align 2
559 store i16 %r22, ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 22), align 2
560 store i16 %r23, ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 23), align 2
561 store i16 %r24, ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 24), align 2
562 store i16 %r25, ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 25), align 2
563 store i16 %r26, ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 26), align 2
564 store i16 %r27, ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 27), align 2
565 store i16 %r28, ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 28), align 2
566 store i16 %r29, ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 29), align 2
567 store i16 %r30, ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 30), align 2
568 store i16 %r31, ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 31), align 2
572 define void @fshl_v64i8() {
573 ; SSE-LABEL: @fshl_v64i8(
574 ; SSE-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr @a8, align 1
575 ; SSE-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr @b8, align 1
576 ; SSE-NEXT: [[TMP3:%.*]] = load <16 x i8>, ptr @c8, align 1
577 ; SSE-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]])
578 ; SSE-NEXT: store <16 x i8> [[TMP4]], ptr @d8, align 1
579 ; SSE-NEXT: [[TMP5:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 16), align 1
580 ; SSE-NEXT: [[TMP6:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 16), align 1
581 ; SSE-NEXT: [[TMP7:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 16), align 1
582 ; SSE-NEXT: [[TMP8:%.*]] = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> [[TMP5]], <16 x i8> [[TMP6]], <16 x i8> [[TMP7]])
583 ; SSE-NEXT: store <16 x i8> [[TMP8]], ptr getelementptr inbounds ([64 x i8], ptr @d8, i32 0, i64 16), align 1
584 ; SSE-NEXT: [[TMP9:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 32), align 1
585 ; SSE-NEXT: [[TMP10:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 32), align 1
586 ; SSE-NEXT: [[TMP11:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1
587 ; SSE-NEXT: [[TMP12:%.*]] = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> [[TMP9]], <16 x i8> [[TMP10]], <16 x i8> [[TMP11]])
588 ; SSE-NEXT: store <16 x i8> [[TMP12]], ptr getelementptr inbounds ([64 x i8], ptr @d8, i32 0, i64 32), align 1
589 ; SSE-NEXT: [[TMP13:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 48), align 1
590 ; SSE-NEXT: [[TMP14:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 48), align 1
591 ; SSE-NEXT: [[TMP15:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 48), align 1
592 ; SSE-NEXT: [[TMP16:%.*]] = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> [[TMP13]], <16 x i8> [[TMP14]], <16 x i8> [[TMP15]])
593 ; SSE-NEXT: store <16 x i8> [[TMP16]], ptr getelementptr inbounds ([64 x i8], ptr @d8, i32 0, i64 48), align 1
596 ; AVX-LABEL: @fshl_v64i8(
597 ; AVX-NEXT: [[TMP1:%.*]] = load <32 x i8>, ptr @a8, align 1
598 ; AVX-NEXT: [[TMP2:%.*]] = load <32 x i8>, ptr @b8, align 1
599 ; AVX-NEXT: [[TMP3:%.*]] = load <32 x i8>, ptr @c8, align 1
600 ; AVX-NEXT: [[TMP4:%.*]] = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> [[TMP1]], <32 x i8> [[TMP2]], <32 x i8> [[TMP3]])
601 ; AVX-NEXT: store <32 x i8> [[TMP4]], ptr @d8, align 1
602 ; AVX-NEXT: [[TMP5:%.*]] = load <32 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 32), align 1
603 ; AVX-NEXT: [[TMP6:%.*]] = load <32 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 32), align 1
604 ; AVX-NEXT: [[TMP7:%.*]] = load <32 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1
605 ; AVX-NEXT: [[TMP8:%.*]] = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> [[TMP5]], <32 x i8> [[TMP6]], <32 x i8> [[TMP7]])
606 ; AVX-NEXT: store <32 x i8> [[TMP8]], ptr getelementptr inbounds ([64 x i8], ptr @d8, i32 0, i64 32), align 1
609 ; AVX512-LABEL: @fshl_v64i8(
610 ; AVX512-NEXT: [[TMP1:%.*]] = load <64 x i8>, ptr @a8, align 1
611 ; AVX512-NEXT: [[TMP2:%.*]] = load <64 x i8>, ptr @b8, align 1
612 ; AVX512-NEXT: [[TMP3:%.*]] = load <64 x i8>, ptr @c8, align 1
613 ; AVX512-NEXT: [[TMP4:%.*]] = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> [[TMP1]], <64 x i8> [[TMP2]], <64 x i8> [[TMP3]])
614 ; AVX512-NEXT: store <64 x i8> [[TMP4]], ptr @d8, align 1
615 ; AVX512-NEXT: ret void
617 %a0 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 0 ), align 1
618 %a1 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 1 ), align 1
619 %a2 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 2 ), align 1
620 %a3 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 3 ), align 1
621 %a4 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 4 ), align 1
622 %a5 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 5 ), align 1
623 %a6 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 6 ), align 1
624 %a7 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 7 ), align 1
625 %a8 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 8 ), align 1
626 %a9 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 9 ), align 1
627 %a10 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 10), align 1
628 %a11 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 11), align 1
629 %a12 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 12), align 1
630 %a13 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 13), align 1
631 %a14 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 14), align 1
632 %a15 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 15), align 1
633 %a16 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 16), align 1
634 %a17 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 17), align 1
635 %a18 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 18), align 1
636 %a19 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 19), align 1
637 %a20 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 20), align 1
638 %a21 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 21), align 1
639 %a22 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 22), align 1
640 %a23 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 23), align 1
641 %a24 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 24), align 1
642 %a25 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 25), align 1
643 %a26 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 26), align 1
644 %a27 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 27), align 1
645 %a28 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 28), align 1
646 %a29 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 29), align 1
647 %a30 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 30), align 1
648 %a31 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 31), align 1
649 %a32 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 32), align 1
650 %a33 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 33), align 1
651 %a34 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 34), align 1
652 %a35 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 35), align 1
653 %a36 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 36), align 1
654 %a37 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 37), align 1
655 %a38 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 38), align 1
656 %a39 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 39), align 1
657 %a40 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 40), align 1
658 %a41 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 41), align 1
659 %a42 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 42), align 1
660 %a43 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 43), align 1
661 %a44 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 44), align 1
662 %a45 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 45), align 1
663 %a46 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 46), align 1
664 %a47 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 47), align 1
665 %a48 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 48), align 1
666 %a49 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 49), align 1
667 %a50 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 50), align 1
668 %a51 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 51), align 1
669 %a52 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 52), align 1
670 %a53 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 53), align 1
671 %a54 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 54), align 1
672 %a55 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 55), align 1
673 %a56 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 56), align 1
674 %a57 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 57), align 1
675 %a58 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 58), align 1
676 %a59 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 59), align 1
677 %a60 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 60), align 1
678 %a61 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 61), align 1
679 %a62 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 62), align 1
680 %a63 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 63), align 1
681 %b0 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 0 ), align 1
682 %b1 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 1 ), align 1
683 %b2 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 2 ), align 1
684 %b3 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 3 ), align 1
685 %b4 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 4 ), align 1
686 %b5 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 5 ), align 1
687 %b6 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 6 ), align 1
688 %b7 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 7 ), align 1
689 %b8 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 8 ), align 1
690 %b9 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 9 ), align 1
691 %b10 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 10), align 1
692 %b11 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 11), align 1
693 %b12 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 12), align 1
694 %b13 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 13), align 1
695 %b14 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 14), align 1
696 %b15 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 15), align 1
697 %b16 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 16), align 1
698 %b17 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 17), align 1
699 %b18 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 18), align 1
700 %b19 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 19), align 1
701 %b20 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 20), align 1
702 %b21 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 21), align 1
703 %b22 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 22), align 1
704 %b23 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 23), align 1
705 %b24 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 24), align 1
706 %b25 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 25), align 1
707 %b26 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 26), align 1
708 %b27 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 27), align 1
709 %b28 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 28), align 1
710 %b29 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 29), align 1
711 %b30 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 30), align 1
712 %b31 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 31), align 1
713 %b32 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 32), align 1
714 %b33 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 33), align 1
715 %b34 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 34), align 1
716 %b35 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 35), align 1
717 %b36 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 36), align 1
718 %b37 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 37), align 1
719 %b38 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 38), align 1
720 %b39 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 39), align 1
721 %b40 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 40), align 1
722 %b41 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 41), align 1
723 %b42 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 42), align 1
724 %b43 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 43), align 1
725 %b44 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 44), align 1
726 %b45 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 45), align 1
727 %b46 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 46), align 1
728 %b47 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 47), align 1
729 %b48 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 48), align 1
730 %b49 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 49), align 1
731 %b50 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 50), align 1
732 %b51 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 51), align 1
733 %b52 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 52), align 1
734 %b53 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 53), align 1
735 %b54 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 54), align 1
736 %b55 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 55), align 1
737 %b56 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 56), align 1
738 %b57 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 57), align 1
739 %b58 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 58), align 1
740 %b59 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 59), align 1
741 %b60 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 60), align 1
742 %b61 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 61), align 1
743 %b62 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 62), align 1
744 %b63 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 63), align 1
745 %c0 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 0 ), align 1
746 %c1 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 1 ), align 1
747 %c2 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 2 ), align 1
748 %c3 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 3 ), align 1
749 %c4 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 4 ), align 1
750 %c5 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 5 ), align 1
751 %c6 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 6 ), align 1
752 %c7 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 7 ), align 1
753 %c8 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 8 ), align 1
754 %c9 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 9 ), align 1
755 %c10 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 10), align 1
756 %c11 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 11), align 1
757 %c12 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 12), align 1
758 %c13 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 13), align 1
759 %c14 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 14), align 1
760 %c15 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 15), align 1
761 %c16 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 16), align 1
762 %c17 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 17), align 1
763 %c18 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 18), align 1
764 %c19 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 19), align 1
765 %c20 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 20), align 1
766 %c21 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 21), align 1
767 %c22 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 22), align 1
768 %c23 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 23), align 1
769 %c24 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 24), align 1
770 %c25 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 25), align 1
771 %c26 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 26), align 1
772 %c27 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 27), align 1
773 %c28 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 28), align 1
774 %c29 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 29), align 1
775 %c30 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 30), align 1
776 %c31 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 31), align 1
777 %c32 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1
778 %c33 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 33), align 1
779 %c34 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 34), align 1
780 %c35 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 35), align 1
781 %c36 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 36), align 1
782 %c37 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 37), align 1
783 %c38 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 38), align 1
784 %c39 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 39), align 1
785 %c40 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 40), align 1
786 %c41 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 41), align 1
787 %c42 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 42), align 1
788 %c43 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 43), align 1
789 %c44 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 44), align 1
790 %c45 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 45), align 1
791 %c46 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 46), align 1
792 %c47 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 47), align 1
793 %c48 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 48), align 1
794 %c49 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 49), align 1
795 %c50 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 50), align 1
796 %c51 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 51), align 1
797 %c52 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 52), align 1
798 %c53 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 53), align 1
799 %c54 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 54), align 1
800 %c55 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 55), align 1
801 %c56 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 56), align 1
802 %c57 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 57), align 1
803 %c58 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 58), align 1
804 %c59 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 59), align 1
805 %c60 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 60), align 1
806 %c61 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 61), align 1
807 %c62 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 62), align 1
808 %c63 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 63), align 1
809 %r0 = call i8 @llvm.fshl.i8(i8 %a0 , i8 %b0 , i8 %c0 )
810 %r1 = call i8 @llvm.fshl.i8(i8 %a1 , i8 %b1 , i8 %c1 )
811 %r2 = call i8 @llvm.fshl.i8(i8 %a2 , i8 %b2 , i8 %c2 )
812 %r3 = call i8 @llvm.fshl.i8(i8 %a3 , i8 %b3 , i8 %c3 )
813 %r4 = call i8 @llvm.fshl.i8(i8 %a4 , i8 %b4 , i8 %c4 )
814 %r5 = call i8 @llvm.fshl.i8(i8 %a5 , i8 %b5 , i8 %c5 )
815 %r6 = call i8 @llvm.fshl.i8(i8 %a6 , i8 %b6 , i8 %c6 )
816 %r7 = call i8 @llvm.fshl.i8(i8 %a7 , i8 %b7 , i8 %c7 )
817 %r8 = call i8 @llvm.fshl.i8(i8 %a8 , i8 %b8 , i8 %c8 )
818 %r9 = call i8 @llvm.fshl.i8(i8 %a9 , i8 %b9 , i8 %c9 )
819 %r10 = call i8 @llvm.fshl.i8(i8 %a10, i8 %b10, i8 %c10)
820 %r11 = call i8 @llvm.fshl.i8(i8 %a11, i8 %b11, i8 %c11)
821 %r12 = call i8 @llvm.fshl.i8(i8 %a12, i8 %b12, i8 %c12)
822 %r13 = call i8 @llvm.fshl.i8(i8 %a13, i8 %b13, i8 %c13)
823 %r14 = call i8 @llvm.fshl.i8(i8 %a14, i8 %b14, i8 %c14)
824 %r15 = call i8 @llvm.fshl.i8(i8 %a15, i8 %b15, i8 %c15)
825 %r16 = call i8 @llvm.fshl.i8(i8 %a16, i8 %b16, i8 %c16)
826 %r17 = call i8 @llvm.fshl.i8(i8 %a17, i8 %b17, i8 %c17)
827 %r18 = call i8 @llvm.fshl.i8(i8 %a18, i8 %b18, i8 %c18)
828 %r19 = call i8 @llvm.fshl.i8(i8 %a19, i8 %b19, i8 %c19)
829 %r20 = call i8 @llvm.fshl.i8(i8 %a20, i8 %b20, i8 %c20)
830 %r21 = call i8 @llvm.fshl.i8(i8 %a21, i8 %b21, i8 %c21)
831 %r22 = call i8 @llvm.fshl.i8(i8 %a22, i8 %b22, i8 %c22)
832 %r23 = call i8 @llvm.fshl.i8(i8 %a23, i8 %b23, i8 %c23)
833 %r24 = call i8 @llvm.fshl.i8(i8 %a24, i8 %b24, i8 %c24)
834 %r25 = call i8 @llvm.fshl.i8(i8 %a25, i8 %b25, i8 %c25)
835 %r26 = call i8 @llvm.fshl.i8(i8 %a26, i8 %b26, i8 %c26)
836 %r27 = call i8 @llvm.fshl.i8(i8 %a27, i8 %b27, i8 %c27)
837 %r28 = call i8 @llvm.fshl.i8(i8 %a28, i8 %b28, i8 %c28)
838 %r29 = call i8 @llvm.fshl.i8(i8 %a29, i8 %b29, i8 %c29)
839 %r30 = call i8 @llvm.fshl.i8(i8 %a30, i8 %b30, i8 %c30)
840 %r31 = call i8 @llvm.fshl.i8(i8 %a31, i8 %b31, i8 %c31)
841 %r32 = call i8 @llvm.fshl.i8(i8 %a32, i8 %b32, i8 %c32)
842 %r33 = call i8 @llvm.fshl.i8(i8 %a33, i8 %b33, i8 %c33)
843 %r34 = call i8 @llvm.fshl.i8(i8 %a34, i8 %b34, i8 %c34)
844 %r35 = call i8 @llvm.fshl.i8(i8 %a35, i8 %b35, i8 %c35)
845 %r36 = call i8 @llvm.fshl.i8(i8 %a36, i8 %b36, i8 %c36)
846 %r37 = call i8 @llvm.fshl.i8(i8 %a37, i8 %b37, i8 %c37)
847 %r38 = call i8 @llvm.fshl.i8(i8 %a38, i8 %b38, i8 %c38)
848 %r39 = call i8 @llvm.fshl.i8(i8 %a39, i8 %b39, i8 %c39)
849 %r40 = call i8 @llvm.fshl.i8(i8 %a40, i8 %b40, i8 %c40)
850 %r41 = call i8 @llvm.fshl.i8(i8 %a41, i8 %b41, i8 %c41)
851 %r42 = call i8 @llvm.fshl.i8(i8 %a42, i8 %b42, i8 %c42)
852 %r43 = call i8 @llvm.fshl.i8(i8 %a43, i8 %b43, i8 %c43)
853 %r44 = call i8 @llvm.fshl.i8(i8 %a44, i8 %b44, i8 %c44)
854 %r45 = call i8 @llvm.fshl.i8(i8 %a45, i8 %b45, i8 %c45)
855 %r46 = call i8 @llvm.fshl.i8(i8 %a46, i8 %b46, i8 %c46)
856 %r47 = call i8 @llvm.fshl.i8(i8 %a47, i8 %b47, i8 %c47)
857 %r48 = call i8 @llvm.fshl.i8(i8 %a48, i8 %b48, i8 %c48)
858 %r49 = call i8 @llvm.fshl.i8(i8 %a49, i8 %b49, i8 %c49)
859 %r50 = call i8 @llvm.fshl.i8(i8 %a50, i8 %b50, i8 %c50)
860 %r51 = call i8 @llvm.fshl.i8(i8 %a51, i8 %b51, i8 %c51)
861 %r52 = call i8 @llvm.fshl.i8(i8 %a52, i8 %b52, i8 %c52)
862 %r53 = call i8 @llvm.fshl.i8(i8 %a53, i8 %b53, i8 %c53)
863 %r54 = call i8 @llvm.fshl.i8(i8 %a54, i8 %b54, i8 %c54)
864 %r55 = call i8 @llvm.fshl.i8(i8 %a55, i8 %b55, i8 %c55)
865 %r56 = call i8 @llvm.fshl.i8(i8 %a56, i8 %b56, i8 %c56)
866 %r57 = call i8 @llvm.fshl.i8(i8 %a57, i8 %b57, i8 %c57)
867 %r58 = call i8 @llvm.fshl.i8(i8 %a58, i8 %b58, i8 %c58)
868 %r59 = call i8 @llvm.fshl.i8(i8 %a59, i8 %b59, i8 %c59)
869 %r60 = call i8 @llvm.fshl.i8(i8 %a60, i8 %b60, i8 %c60)
870 %r61 = call i8 @llvm.fshl.i8(i8 %a61, i8 %b61, i8 %c61)
871 %r62 = call i8 @llvm.fshl.i8(i8 %a62, i8 %b62, i8 %c62)
872 %r63 = call i8 @llvm.fshl.i8(i8 %a63, i8 %b63, i8 %c63)
873 store i8 %r0 , ptr getelementptr inbounds ([64 x i8], ptr @d8, i32 0, i64 0 ), align 1
874 store i8 %r1 , ptr getelementptr inbounds ([64 x i8], ptr @d8, i32 0, i64 1 ), align 1
875 store i8 %r2 , ptr getelementptr inbounds ([64 x i8], ptr @d8, i32 0, i64 2 ), align 1
876 store i8 %r3 , ptr getelementptr inbounds ([64 x i8], ptr @d8, i32 0, i64 3 ), align 1
877 store i8 %r4 , ptr getelementptr inbounds ([64 x i8], ptr @d8, i32 0, i64 4 ), align 1
878 store i8 %r5 , ptr getelementptr inbounds ([64 x i8], ptr @d8, i32 0, i64 5 ), align 1
879 store i8 %r6 , ptr getelementptr inbounds ([64 x i8], ptr @d8, i32 0, i64 6 ), align 1
880 store i8 %r7 , ptr getelementptr inbounds ([64 x i8], ptr @d8, i32 0, i64 7 ), align 1
881 store i8 %r8 , ptr getelementptr inbounds ([64 x i8], ptr @d8, i32 0, i64 8 ), align 1
882 store i8 %r9 , ptr getelementptr inbounds ([64 x i8], ptr @d8, i32 0, i64 9 ), align 1
883 store i8 %r10, ptr getelementptr inbounds ([64 x i8], ptr @d8, i32 0, i64 10), align 1
884 store i8 %r11, ptr getelementptr inbounds ([64 x i8], ptr @d8, i32 0, i64 11), align 1
885 store i8 %r12, ptr getelementptr inbounds ([64 x i8], ptr @d8, i32 0, i64 12), align 1
886 store i8 %r13, ptr getelementptr inbounds ([64 x i8], ptr @d8, i32 0, i64 13), align 1
887 store i8 %r14, ptr getelementptr inbounds ([64 x i8], ptr @d8, i32 0, i64 14), align 1
888 store i8 %r15, ptr getelementptr inbounds ([64 x i8], ptr @d8, i32 0, i64 15), align 1
889 store i8 %r16, ptr getelementptr inbounds ([64 x i8], ptr @d8, i32 0, i64 16), align 1
890 store i8 %r17, ptr getelementptr inbounds ([64 x i8], ptr @d8, i32 0, i64 17), align 1
891 store i8 %r18, ptr getelementptr inbounds ([64 x i8], ptr @d8, i32 0, i64 18), align 1
892 store i8 %r19, ptr getelementptr inbounds ([64 x i8], ptr @d8, i32 0, i64 19), align 1
893 store i8 %r20, ptr getelementptr inbounds ([64 x i8], ptr @d8, i32 0, i64 20), align 1
894 store i8 %r21, ptr getelementptr inbounds ([64 x i8], ptr @d8, i32 0, i64 21), align 1
895 store i8 %r22, ptr getelementptr inbounds ([64 x i8], ptr @d8, i32 0, i64 22), align 1
896 store i8 %r23, ptr getelementptr inbounds ([64 x i8], ptr @d8, i32 0, i64 23), align 1
897 store i8 %r24, ptr getelementptr inbounds ([64 x i8], ptr @d8, i32 0, i64 24), align 1
898 store i8 %r25, ptr getelementptr inbounds ([64 x i8], ptr @d8, i32 0, i64 25), align 1
899 store i8 %r26, ptr getelementptr inbounds ([64 x i8], ptr @d8, i32 0, i64 26), align 1
900 store i8 %r27, ptr getelementptr inbounds ([64 x i8], ptr @d8, i32 0, i64 27), align 1
901 store i8 %r28, ptr getelementptr inbounds ([64 x i8], ptr @d8, i32 0, i64 28), align 1
902 store i8 %r29, ptr getelementptr inbounds ([64 x i8], ptr @d8, i32 0, i64 29), align 1
903 store i8 %r30, ptr getelementptr inbounds ([64 x i8], ptr @d8, i32 0, i64 30), align 1
904 store i8 %r31, ptr getelementptr inbounds ([64 x i8], ptr @d8, i32 0, i64 31), align 1
905 store i8 %r32, ptr getelementptr inbounds ([64 x i8], ptr @d8, i32 0, i64 32), align 1
906 store i8 %r33, ptr getelementptr inbounds ([64 x i8], ptr @d8, i32 0, i64 33), align 1
907 store i8 %r34, ptr getelementptr inbounds ([64 x i8], ptr @d8, i32 0, i64 34), align 1
908 store i8 %r35, ptr getelementptr inbounds ([64 x i8], ptr @d8, i32 0, i64 35), align 1
909 store i8 %r36, ptr getelementptr inbounds ([64 x i8], ptr @d8, i32 0, i64 36), align 1
910 store i8 %r37, ptr getelementptr inbounds ([64 x i8], ptr @d8, i32 0, i64 37), align 1
911 store i8 %r38, ptr getelementptr inbounds ([64 x i8], ptr @d8, i32 0, i64 38), align 1
912 store i8 %r39, ptr getelementptr inbounds ([64 x i8], ptr @d8, i32 0, i64 39), align 1
913 store i8 %r40, ptr getelementptr inbounds ([64 x i8], ptr @d8, i32 0, i64 40), align 1
914 store i8 %r41, ptr getelementptr inbounds ([64 x i8], ptr @d8, i32 0, i64 41), align 1
915 store i8 %r42, ptr getelementptr inbounds ([64 x i8], ptr @d8, i32 0, i64 42), align 1
916 store i8 %r43, ptr getelementptr inbounds ([64 x i8], ptr @d8, i32 0, i64 43), align 1
917 store i8 %r44, ptr getelementptr inbounds ([64 x i8], ptr @d8, i32 0, i64 44), align 1
918 store i8 %r45, ptr getelementptr inbounds ([64 x i8], ptr @d8, i32 0, i64 45), align 1
919 store i8 %r46, ptr getelementptr inbounds ([64 x i8], ptr @d8, i32 0, i64 46), align 1
920 store i8 %r47, ptr getelementptr inbounds ([64 x i8], ptr @d8, i32 0, i64 47), align 1
921 store i8 %r48, ptr getelementptr inbounds ([64 x i8], ptr @d8, i32 0, i64 48), align 1
922 store i8 %r49, ptr getelementptr inbounds ([64 x i8], ptr @d8, i32 0, i64 49), align 1
923 store i8 %r50, ptr getelementptr inbounds ([64 x i8], ptr @d8, i32 0, i64 50), align 1
924 store i8 %r51, ptr getelementptr inbounds ([64 x i8], ptr @d8, i32 0, i64 51), align 1
925 store i8 %r52, ptr getelementptr inbounds ([64 x i8], ptr @d8, i32 0, i64 52), align 1
926 store i8 %r53, ptr getelementptr inbounds ([64 x i8], ptr @d8, i32 0, i64 53), align 1
927 store i8 %r54, ptr getelementptr inbounds ([64 x i8], ptr @d8, i32 0, i64 54), align 1
928 store i8 %r55, ptr getelementptr inbounds ([64 x i8], ptr @d8, i32 0, i64 55), align 1
929 store i8 %r56, ptr getelementptr inbounds ([64 x i8], ptr @d8, i32 0, i64 56), align 1
930 store i8 %r57, ptr getelementptr inbounds ([64 x i8], ptr @d8, i32 0, i64 57), align 1
931 store i8 %r58, ptr getelementptr inbounds ([64 x i8], ptr @d8, i32 0, i64 58), align 1
932 store i8 %r59, ptr getelementptr inbounds ([64 x i8], ptr @d8, i32 0, i64 59), align 1
933 store i8 %r60, ptr getelementptr inbounds ([64 x i8], ptr @d8, i32 0, i64 60), align 1
934 store i8 %r61, ptr getelementptr inbounds ([64 x i8], ptr @d8, i32 0, i64 61), align 1
935 store i8 %r62, ptr getelementptr inbounds ([64 x i8], ptr @d8, i32 0, i64 62), align 1
936 store i8 %r63, ptr getelementptr inbounds ([64 x i8], ptr @d8, i32 0, i64 63), align 1