1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -mtriple=i386-unknown-linux-gnu -mattr=sse2 | FileCheck %s
4 ; Source file looks something like this:
6 ; typedef int AAA[100][100];
8 ; void testCombineMultiplies(AAA a,int lll)
18 ; We want to make sure we don't generate 2 multiply instructions,
19 ; one for a[LOC][] and one for a[LOC+20]. visitMUL in DAGCombiner.cpp
20 ; should combine the instructions in such a way to avoid the extra multiply.
23 ; Output looks roughly like this:
27 ; imull $400, %ecx, %edx # imm = 0x190
28 ; leal (%edx,%eax), %esi
29 ; movl $11, 2020(%esi,%ecx,4)
30 ; movl $22, 2080(%edx,%eax)
31 ; movl $33, 10080(%edx,%eax)
33 ; Function Attrs: nounwind
; Scalar case. %a points at rows of 100 x i32 (400 bytes per row). The body
; stores 11 to a[lll+5][lll+5], 22 to a[lll+5][20] and 33 to a[lll+25][20].
; The assertions below expect a single imull by 400 whose result feeds all
; three stores, with the constant parts of each address folded into the
; displacement:
;   2020  = 5*400 + 5*4
;   2080  = 5*400 + 20*4
;   10080 = 25*400 + 20*4
34 define void @testCombineMultiplies([100 x i32]* nocapture %a, i32 %lll) nounwind {
35 ; CHECK-LABEL: testCombineMultiplies:
36 ; CHECK: # %bb.0: # %entry
37 ; CHECK-NEXT: pushl %esi
38 ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
39 ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
40 ; CHECK-NEXT: imull $400, %ecx, %edx # imm = 0x190
41 ; CHECK-NEXT: leal (%edx,%eax), %esi
42 ; CHECK-NEXT: movl $11, 2020(%esi,%ecx,4)
43 ; CHECK-NEXT: movl $22, 2080(%edx,%eax)
44 ; CHECK-NEXT: movl $33, 10080(%edx,%eax)
45 ; CHECK-NEXT: popl %esi
; %add = lll + 5 is used as both the row and the column index of the first store.
48 %add = add nsw i32 %lll, 5
49 %arrayidx1 = getelementptr inbounds [100 x i32], [100 x i32]* %a, i32 %add, i32 %add
50 store i32 11, i32* %arrayidx1, align 4
51 %arrayidx3 = getelementptr inbounds [100 x i32], [100 x i32]* %a, i32 %add, i32 20
52 store i32 22, i32* %arrayidx3, align 4
; %add4 = lll + 25, i.e. the row 20 past %add; this is the second row index
; that would otherwise need its own multiply.
53 %add4 = add nsw i32 %lll, 25
54 %arrayidx6 = getelementptr inbounds [100 x i32], [100 x i32]* %a, i32 %add4, i32 20
55 store i32 33, i32* %arrayidx6, align 4
60 ; Test for the same optimization on vector multiplies.
62 ; Source looks something like this:
64 ; typedef int v4int __attribute__((__vector_size__(16)));
68 ; void testCombineMultiplies_splat(v4int v1) {
69 ; v2 = (v1 + (v4int){ 11, 11, 11, 11 }) * (v4int) {22, 22, 22, 22};
70 ; v3 = (v1 + (v4int){ 33, 33, 33, 33 }) * (v4int) {22, 22, 22, 22};
71 ; x = (v1 + (v4int){ 11, 11, 11, 11 });
74 ; Output looks something like this:
76 ; testCombineMultiplies_splat: # @testCombineMultiplies_splat
78 ; movdqa .LCPI1_0, %xmm1 # xmm1 = [11,11,11,11]
80 ; movdqa .LCPI1_1, %xmm2 # xmm2 = [22,22,22,22]
81 ; pshufd $245, %xmm0, %xmm3 # xmm3 = xmm0[1,1,3,3]
82 ; pmuludq %xmm2, %xmm0
83 ; pshufd $232, %xmm0, %xmm0 # xmm0 = xmm0[0,2,2,3]
84 ; pmuludq %xmm2, %xmm3
85 ; pshufd $232, %xmm3, %xmm2 # xmm2 = xmm3[0,2,2,3]
86 ; punpckldq %xmm2, %xmm0 # xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
87 ; movdqa .LCPI1_2, %xmm2 # xmm2 = [242,242,242,242]
89 ; paddd .LCPI1_3, %xmm0
95 ; Again, we want to make sure we don't generate two different multiplies.
96 ; We should have a single multiply for "v1 * {22, 22, 22, 22}" (made up of two
97 ; pmuludq instructions), followed by two adds. Without this optimization, we'd
98 ; do 2 adds, followed by 2 multiplies (i.e. 4 pmuludq instructions).
; Output globals written by the vector tests in this file: @v2 and @v3 receive
; the two products, @x receives the first sum (keeping that add live).
100 @v2 = common global <4 x i32> zeroinitializer, align 16
101 @v3 = common global <4 x i32> zeroinitializer, align 16
102 @x = common global <4 x i32> zeroinitializer, align 16
104 ; Function Attrs: nounwind
; Splat-vector case. The body computes
;   %mul1 = (%v1 + 11) * 22   -> stored to @v2
;   %mul2 = (%v1 + 33) * 22   -> stored to @v3
;   %add1 =  %v1 + 11         -> stored to @x
; The assertions below expect %v1 to be multiplied by the [22,22,22,22] splat
; only once (the pmuludq pair), after which each product is formed with a
; paddd: [242,242,242,242] (242 = 11*22) for @v2 and a constant-pool operand
; for @v3. The pattern does not pin that second constant's value; presumably
; it is 33*22 = 726 per lane.
105 define void @testCombineMultiplies_splat(<4 x i32> %v1) nounwind {
106 ; CHECK-LABEL: testCombineMultiplies_splat:
107 ; CHECK: # %bb.0: # %entry
108 ; CHECK-NEXT: movdqa {{.*#+}} xmm1 = [11,11,11,11]
109 ; CHECK-NEXT: paddd %xmm0, %xmm1
110 ; CHECK-NEXT: movdqa {{.*#+}} xmm2 = [22,22,22,22]
111 ; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
112 ; CHECK-NEXT: pmuludq %xmm2, %xmm0
113 ; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
114 ; CHECK-NEXT: pmuludq %xmm2, %xmm3
115 ; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3]
116 ; CHECK-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
117 ; CHECK-NEXT: movdqa {{.*#+}} xmm2 = [242,242,242,242]
118 ; CHECK-NEXT: paddd %xmm0, %xmm2
119 ; CHECK-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
120 ; CHECK-NEXT: movdqa %xmm2, v2
121 ; CHECK-NEXT: movdqa %xmm0, v3
122 ; CHECK-NEXT: movdqa %xmm1, x
; Both multiplies share the operand %v1 and the multiplier splat; only the
; added constants (11 vs 33) differ.
125 %add1 = add <4 x i32> %v1, <i32 11, i32 11, i32 11, i32 11>
126 %mul1 = mul <4 x i32> %add1, <i32 22, i32 22, i32 22, i32 22>
127 %add2 = add <4 x i32> %v1, <i32 33, i32 33, i32 33, i32 33>
128 %mul2 = mul <4 x i32> %add2, <i32 22, i32 22, i32 22, i32 22>
129 store <4 x i32> %mul1, <4 x i32>* @v2, align 16
130 store <4 x i32> %mul2, <4 x i32>* @v3, align 16
; Storing %add1 itself gives the first add a second use besides %mul1.
131 store <4 x i32> %add1, <4 x i32>* @x, align 16
135 ; Finally, check the non-splatted vector case. This is very similar
136 ; to the previous test case, except for the vector values.
138 ; Function Attrs: nounwind
; Non-splat vector case. Same shape as the splat test, with per-lane constants:
;   %mul1 = (%v1 + <11,22,33,44>) * <22,33,44,55>   -> stored to @v2
;   %mul2 = (%v1 + <33,44,55,66>) * <22,33,44,55>   -> stored to @v3
;   %add1 =  %v1 + <11,22,33,44>                    -> stored to @x
; The assertions below expect one shared multiply of %v1 by <22,33,44,55>
; followed by two paddd. The folded constant for @v2 is the lane-wise product
; [242,726,1452,2420] = [11*22, 22*33, 33*44, 44*55]. The constant for @v3 is
; matched only as a constant-pool operand, so its value is not pinned here.
; Unlike the splat test, the expected code also shuffles the multiplier
; (xmm2[1,1,3,3]) before the second pmuludq.
139 define void @testCombineMultiplies_non_splat(<4 x i32> %v1) nounwind {
140 ; CHECK-LABEL: testCombineMultiplies_non_splat:
141 ; CHECK: # %bb.0: # %entry
142 ; CHECK-NEXT: movdqa {{.*#+}} xmm1 = [11,22,33,44]
143 ; CHECK-NEXT: paddd %xmm0, %xmm1
144 ; CHECK-NEXT: movdqa {{.*#+}} xmm2 = [22,33,44,55]
145 ; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
146 ; CHECK-NEXT: pmuludq %xmm2, %xmm0
147 ; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
148 ; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
149 ; CHECK-NEXT: pmuludq %xmm3, %xmm2
150 ; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
151 ; CHECK-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
152 ; CHECK-NEXT: movdqa {{.*#+}} xmm2 = [242,726,1452,2420]
153 ; CHECK-NEXT: paddd %xmm0, %xmm2
154 ; CHECK-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
155 ; CHECK-NEXT: movdqa %xmm2, v2
156 ; CHECK-NEXT: movdqa %xmm0, v3
157 ; CHECK-NEXT: movdqa %xmm1, x
; Both multiplies share the operand %v1 and the multiplier vector; only the
; added constant vectors differ.
160 %add1 = add <4 x i32> %v1, <i32 11, i32 22, i32 33, i32 44>
161 %mul1 = mul <4 x i32> %add1, <i32 22, i32 33, i32 44, i32 55>
162 %add2 = add <4 x i32> %v1, <i32 33, i32 44, i32 55, i32 66>
163 %mul2 = mul <4 x i32> %add2, <i32 22, i32 33, i32 44, i32 55>
164 store <4 x i32> %mul1, <4 x i32>* @v2, align 16
165 store <4 x i32> %mul2, <4 x i32>* @v3, align 16
; Storing %add1 itself gives the first add a second use besides %mul1.
166 store <4 x i32> %add1, <4 x i32>* @x, align 16