diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -48,9 +48,10 @@
 #include "llvm/IR/Function.h"
 #include "llvm/IR/GlobalAlias.h"
 #include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/Intrinsics.h"
-#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/PatternMatch.h"
 #include "llvm/MC/MCAsmInfo.h"
 #include "llvm/MC/MCContext.h"
 #include "llvm/MC/MCExpr.h"
@@ -32091,6 +32092,38 @@
 
 bool X86TargetLowering::shouldSinkOperands(Instruction *I,
                                            SmallVectorImpl<Use *> &Ops) const {
+  using namespace llvm::PatternMatch;
+
+  // Only instructions producing a fixed-width vector are handled.
+  FixedVectorType *VTy = dyn_cast<FixedVectorType>(I->getType());
+  if (!VTy)
+    return false;
+
+  if (I->getOpcode() == Instruction::Mul &&
+      VTy->getElementType()->isIntegerTy(64)) {
+    for (auto &Op : I->operands()) {
+      // Make sure we are not already sinking this operand
+      if (any_of(Ops, [&](Use *U) { return U->get() == Op; }))
+        continue;
+
+      // Look for PMULDQ pattern where the input is a sext_inreg from vXi32 or
+      // the PMULUDQ pattern where the input is a zext_inreg from vXi32.
+      if (Subtarget.hasSSE41() &&
+          match(Op.get(), m_AShr(m_Shl(m_Value(), m_SpecificInt(32)),
+                                 m_SpecificInt(32)))) {
+        // Sink the shl feeding the ashr as well as the ashr itself.
+        Ops.push_back(&cast<Instruction>(Op)->getOperandUse(0));
+        Ops.push_back(&Op);
+      } else if (Subtarget.hasSSE2() &&
+                 match(Op.get(),
+                       m_And(m_Value(), m_SpecificInt(UINT64_C(0xffffffff))))) {
+        Ops.push_back(&Op);
+      }
+    }
+
+    return !Ops.empty();
+  }
+
   // A uniform shift amount in a vector shift or funnel shift may be much
   // cheaper than a generic variable vector shift, so make that pattern visible
   // to SDAG by sinking the shuffle instruction next to the shift.
diff --git a/llvm/test/CodeGen/X86/pr51371.ll b/llvm/test/CodeGen/X86/pr51371.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/X86/pr51371.ll
@@ -0,0 +1,88 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s --mtriple=x86_64-unknown-linux-gnu -mattr=+sse4.1 | FileCheck %s
+
+; The loop-invariant sext_inreg (shl + ashr by 32) of the splat is defined
+; outside the loop; the multiply in the loop should still select PMULDQ.
+define void @pmuldq(<2 x i64>* nocapture %0, i32 %1, i64 %2) {
+; CHECK-LABEL: pmuldq:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    testq %rdx, %rdx
+; CHECK-NEXT:    je .LBB0_3
+; CHECK-NEXT:  # %bb.1:
+; CHECK-NEXT:    movd %esi, %xmm0
+; CHECK-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; CHECK-NEXT:    .p2align 4, 0x90
+; CHECK-NEXT:  .LBB0_2: # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    movdqa (%rdi), %xmm1
+; CHECK-NEXT:    pmuldq %xmm0, %xmm1
+; CHECK-NEXT:    movdqa %xmm1, (%rdi)
+; CHECK-NEXT:    addq $16, %rdi
+; CHECK-NEXT:    decq %rdx
+; CHECK-NEXT:    jne .LBB0_2
+; CHECK-NEXT:  .LBB0_3:
+; CHECK-NEXT:    retq
+  %4 = insertelement <4 x i32> undef, i32 %1, i32 0
+  %5 = shufflevector <4 x i32> %4, <4 x i32> poison, <4 x i32> zeroinitializer
+  %6 = bitcast <4 x i32> %5 to <2 x i64>
+  %7 = shl <2 x i64> %6, <i64 32, i64 32>
+  %8 = ashr exact <2 x i64> %7, <i64 32, i64 32>
+  %9 = icmp eq i64 %2, 0
+  br i1 %9, label %10, label %11
+
+10:                                               ; preds = %11, %3
+  ret void
+
+11:                                               ; preds = %3, %11
+  %12 = phi i64 [ %18, %11 ], [ 0, %3 ]
+  %13 = getelementptr inbounds <2 x i64>, <2 x i64>* %0, i64 %12
+  %14 = load <2 x i64>, <2 x i64>* %13, align 16
+  %15 = shl <2 x i64> %14, <i64 32, i64 32>
+  %16 = ashr exact <2 x i64> %15, <i64 32, i64 32>
+  %17 = mul nsw <2 x i64> %16, %8
+  store <2 x i64> %17, <2 x i64>* %13, align 16
+  %18 = add nuw i64 %12, 1
+  %19 = icmp eq i64 %18, %2
+  br i1 %19, label %10, label %11
+}
+
+; The loop-invariant zext_inreg (and with 0xffffffff) of the splat is defined
+; outside the loop; the multiply in the loop should still select PMULUDQ.
+define void @pmuludq(<2 x i64>* nocapture %0, i32 %1, i64 %2) {
+; CHECK-LABEL: pmuludq:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    testq %rdx, %rdx
+; CHECK-NEXT:    je .LBB1_3
+; CHECK-NEXT:  # %bb.1:
+; CHECK-NEXT:    movd %esi, %xmm0
+; CHECK-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; CHECK-NEXT:    .p2align 4, 0x90
+; CHECK-NEXT:  .LBB1_2: # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    movdqa (%rdi), %xmm1
+; CHECK-NEXT:    pmuludq %xmm0, %xmm1
+; CHECK-NEXT:    movdqa %xmm1, (%rdi)
+; CHECK-NEXT:    addq $16, %rdi
+; CHECK-NEXT:    decq %rdx
+; CHECK-NEXT:    jne .LBB1_2
+; CHECK-NEXT:  .LBB1_3:
+; CHECK-NEXT:    retq
+  %4 = insertelement <4 x i32> undef, i32 %1, i32 0
+  %5 = shufflevector <4 x i32> %4, <4 x i32> poison, <4 x i32> zeroinitializer
+  %6 = bitcast <4 x i32> %5 to <2 x i64>
+  %7 = and <2 x i64> %6, <i64 4294967295, i64 4294967295>
+  %8 = icmp eq i64 %2, 0
+  br i1 %8, label %9, label %10
+
+9:                                                ; preds = %10, %3
+  ret void
+
+10:                                               ; preds = %3, %10
+  %11 = phi i64 [ %16, %10 ], [ 0, %3 ]
+  %12 = getelementptr inbounds <2 x i64>, <2 x i64>* %0, i64 %11
+  %13 = load <2 x i64>, <2 x i64>* %12, align 16
+  %14 = and <2 x i64> %13, <i64 4294967295, i64 4294967295>
+  %15 = mul nuw <2 x i64> %14, %7
+  store <2 x i64> %15, <2 x i64>* %12, align 16
+  %16 = add nuw i64 %11, 1
+  %17 = icmp eq i64 %16, %2
+  br i1 %17, label %9, label %10
+}