1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s --mtriple=x86_64-unknown-linux-gnu -mattr=+sse4.1 | FileCheck %s
; pmuldq: signed 32x32 to 64-bit widening multiply, written as a full
; <2 x i64> multiply whose operands are both sign-extended in place from the
; low 32 bits of each lane (shl by 32 followed by ashr by 32).
; The assertions below expect the loop body to contain only a load, a single
; pmuldq and a store, with no shift instructions left over.
;   %0 - pointer to the <2 x i64> elements, which are read and written in place
;   %1 - i32 multiplier, broadcast to every 32-bit lane before the loop
;   %2 - number of <2 x i64> elements to process; zero skips the loop
4 define void @pmuldq(<2 x i64>* nocapture %0, i32 %1, i64 %2) {
7 ; CHECK-NEXT: testq %rdx, %rdx
8 ; CHECK-NEXT: je .LBB0_3
10 ; CHECK-NEXT: movd %esi, %xmm0
11 ; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
12 ; CHECK-NEXT: .p2align 4, 0x90
13 ; CHECK-NEXT: .LBB0_2: # =>This Inner Loop Header: Depth=1
14 ; CHECK-NEXT: movdqa (%rdi), %xmm1
15 ; CHECK-NEXT: pmuldq %xmm0, %xmm1
16 ; CHECK-NEXT: movdqa %xmm1, (%rdi)
17 ; CHECK-NEXT: addq $16, %rdi
18 ; CHECK-NEXT: decq %rdx
19 ; CHECK-NEXT: jne .LBB0_2
20 ; CHECK-NEXT: .LBB0_3:
; Broadcast %1 to all four i32 lanes (an all-zero shuffle mask selects lane 0
; everywhere), then reinterpret the vector as two i64 lanes.
22 %4 = insertelement <4 x i32> undef, i32 %1, i32 0
23 %5 = shufflevector <4 x i32> %4, <4 x i32> poison, <4 x i32> zeroinitializer
24 %6 = bitcast <4 x i32> %5 to <2 x i64>
; shl 32 then ashr 32 sign-extends the low 32 bits of each i64 lane.
25 %7 = shl <2 x i64> %6, <i64 32, i64 32>
26 %8 = ashr exact <2 x i64> %7, <i64 32, i64 32>
; Branch straight to the exit block when the element count is zero.
27 %9 = icmp eq i64 %2, 0
28 br i1 %9, label %10, label %11
; Loop body: %12 is the element index, 0 when entered from %3 and %18 on the
; back edge.
34 %12 = phi i64 [ %18, %11 ], [ 0, %3 ]
35 %13 = getelementptr inbounds <2 x i64>, <2 x i64>* %0, i64 %12
36 %14 = load <2 x i64>, <2 x i64>* %13, align 16
; Same sign-extend-in-place idiom applied to the loaded element.
37 %15 = shl <2 x i64> %14, <i64 32, i64 32>
38 %16 = ashr exact <2 x i64> %15, <i64 32, i64 32>
; Multiply the two sign-extended operands and store the product back to the
; address it was loaded from.
39 %17 = mul nsw <2 x i64> %16, %8
40 store <2 x i64> %17, <2 x i64>* %13, align 16
; Advance the index and repeat until it reaches %2.
41 %18 = add nuw i64 %12, 1
42 %19 = icmp eq i64 %18, %2
43 br i1 %19, label %10, label %11
; pmuludq: unsigned 32x32 to 64-bit widening multiply, written as a full
; <2 x i64> multiply whose operands both have the upper 32 bits of each lane
; cleared by an and-mask of 4294967295 (2^32 - 1).
; The assertions below expect the loop body to contain only a load, a single
; pmuludq and a store, with no masking instructions left over.
;   %0 - pointer to the <2 x i64> elements, which are read and written in place
;   %1 - i32 multiplier, broadcast to every 32-bit lane before the loop
;   %2 - number of <2 x i64> elements to process; zero skips the loop
46 define void @pmuludq(<2 x i64>* nocapture %0, i32 %1, i64 %2) {
47 ; CHECK-LABEL: pmuludq:
49 ; CHECK-NEXT: testq %rdx, %rdx
50 ; CHECK-NEXT: je .LBB1_3
51 ; CHECK-NEXT: # %bb.1:
52 ; CHECK-NEXT: movd %esi, %xmm0
53 ; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
54 ; CHECK-NEXT: .p2align 4, 0x90
55 ; CHECK-NEXT: .LBB1_2: # =>This Inner Loop Header: Depth=1
56 ; CHECK-NEXT: movdqa (%rdi), %xmm1
57 ; CHECK-NEXT: pmuludq %xmm0, %xmm1
58 ; CHECK-NEXT: movdqa %xmm1, (%rdi)
59 ; CHECK-NEXT: addq $16, %rdi
60 ; CHECK-NEXT: decq %rdx
61 ; CHECK-NEXT: jne .LBB1_2
62 ; CHECK-NEXT: .LBB1_3:
; Broadcast %1 to all four i32 lanes (an all-zero shuffle mask selects lane 0
; everywhere), then reinterpret the vector as two i64 lanes.
64 %4 = insertelement <4 x i32> undef, i32 %1, i32 0
65 %5 = shufflevector <4 x i32> %4, <4 x i32> poison, <4 x i32> zeroinitializer
66 %6 = bitcast <4 x i32> %5 to <2 x i64>
; Keep only the low 32 bits of each i64 lane, which zero-extends them.
67 %7 = and <2 x i64> %6, <i64 4294967295, i64 4294967295>
; Branch straight to the exit block when the element count is zero.
68 %8 = icmp eq i64 %2, 0
69 br i1 %8, label %9, label %10
; Loop body: %11 is the element index, 0 when entered from %3 and %16 on the
; back edge.
75 %11 = phi i64 [ %16, %10 ], [ 0, %3 ]
76 %12 = getelementptr inbounds <2 x i64>, <2 x i64>* %0, i64 %11
77 %13 = load <2 x i64>, <2 x i64>* %12, align 16
; Same low-32-bit mask applied to the loaded element.
78 %14 = and <2 x i64> %13, <i64 4294967295, i64 4294967295>
; Multiply the two zero-extended operands and store the product back to the
; address it was loaded from.
79 %15 = mul nuw <2 x i64> %14, %7
80 store <2 x i64> %15, <2 x i64>* %12, align 16
; Advance the index and repeat until it reaches %2.
81 %16 = add nuw i64 %11, 1
82 %17 = icmp eq i64 %16, %2
83 br i1 %17, label %9, label %10