; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -passes=slp-vectorizer < %s | FileCheck -check-prefixes=GCN,GFX908 %s
; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -passes=slp-vectorizer < %s | FileCheck -check-prefixes=GCN,GFX90A %s
; Two adjacent scalar fadds: gfx90a (full-rate packed FP32) should SLP-combine
; them into one <2 x float> fadd; gfx908 should keep them scalar.
; GCN-LABEL: @fadd_combine
; GFX908: fadd float
; GFX908: fadd float
; GFX90A: fadd <2 x float>
define amdgpu_kernel void @fadd_combine(ptr addrspace(1) %arg) {
bb:
  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
  %tmp1 = zext i32 %tmp to i64
  %tmp2 = getelementptr inbounds float, ptr addrspace(1) %arg, i64 %tmp1
  %tmp3 = load float, ptr addrspace(1) %tmp2, align 4
  %tmp4 = fadd float %tmp3, 1.000000e+00
  store float %tmp4, ptr addrspace(1) %tmp2, align 4
  %tmp5 = add nuw nsw i64 %tmp1, 1
  %tmp6 = getelementptr inbounds float, ptr addrspace(1) %arg, i64 %tmp5
  %tmp7 = load float, ptr addrspace(1) %tmp6, align 4
  %tmp8 = fadd float %tmp7, 1.000000e+00
  store float %tmp8, ptr addrspace(1) %tmp6, align 4
  ret void
}
; Two adjacent scalar fmuls: gfx90a should SLP-combine them into one
; <2 x float> fmul; gfx908 should keep them scalar.
; GCN-LABEL: @fmul_combine
; GFX908: fmul float
; GFX908: fmul float
; GFX90A: fmul <2 x float>
define amdgpu_kernel void @fmul_combine(ptr addrspace(1) %arg) {
bb:
  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
  %tmp1 = zext i32 %tmp to i64
  %tmp2 = getelementptr inbounds float, ptr addrspace(1) %arg, i64 %tmp1
  %tmp3 = load float, ptr addrspace(1) %tmp2, align 4
  %tmp4 = fmul float %tmp3, 1.000000e+00
  store float %tmp4, ptr addrspace(1) %tmp2, align 4
  %tmp5 = add nuw nsw i64 %tmp1, 1
  %tmp6 = getelementptr inbounds float, ptr addrspace(1) %arg, i64 %tmp5
  %tmp7 = load float, ptr addrspace(1) %tmp6, align 4
  %tmp8 = fmul float %tmp7, 1.000000e+00
  store float %tmp8, ptr addrspace(1) %tmp6, align 4
  ret void
}
; Two adjacent llvm.fma.f32 calls: gfx90a should SLP-combine them into one
; llvm.fma.v2f32; gfx908 should keep them as scalar fma calls.
; GCN-LABEL: @fma_combine
; GFX908: call float @llvm.fma.f32
; GFX908: call float @llvm.fma.f32
; GFX90A: call <2 x float> @llvm.fma.v2f32
define amdgpu_kernel void @fma_combine(ptr addrspace(1) %arg) {
bb:
  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
  %tmp1 = zext i32 %tmp to i64
  %tmp2 = getelementptr inbounds float, ptr addrspace(1) %arg, i64 %tmp1
  %tmp3 = load float, ptr addrspace(1) %tmp2, align 4
  %tmp4 = tail call float @llvm.fma.f32(float %tmp3, float 1.000000e+00, float 1.000000e+00)
  store float %tmp4, ptr addrspace(1) %tmp2, align 4
  %tmp5 = add nuw nsw i64 %tmp1, 1
  %tmp6 = getelementptr inbounds float, ptr addrspace(1) %arg, i64 %tmp5
  %tmp7 = load float, ptr addrspace(1) %tmp6, align 4
  %tmp8 = tail call float @llvm.fma.f32(float %tmp7, float 1.000000e+00, float 1.000000e+00)
  store float %tmp8, ptr addrspace(1) %tmp6, align 4
  ret void
}
; Intrinsic declarations used by the kernels above.
declare i32 @llvm.amdgcn.workitem.id.x()
declare float @llvm.fma.f32(float, float, float)