diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -34,6 +34,7 @@
 #define DEBUG_TYPE "vector-combine"
 STATISTIC(NumVecCmp, "Number of vector compares formed");
 STATISTIC(NumVecBO, "Number of vector binops formed");
+STATISTIC(NumScalarBO, "Number of scalar binops formed");
 
 static cl::opt<bool> DisableVectorCombine(
     "disable-vector-combine", cl::init(false), cl::Hidden,
@@ -308,6 +309,76 @@
   return true;
 }
 
+/// Match a vector binop whose operands are both one-use insertelement
+/// instructions that insert a scalar into a constant vector at the same
+/// constant index, and convert it to a scalar binop followed by a single
+/// insertelement into the constant-folded base vector.
+///
+/// \param I   Instruction to inspect; non-binops are rejected.
+/// \param TTI Cost model used to compare the vector and scalar sequences.
+/// \returns true if the uses of \p I were replaced by the scalarized sequence.
+static bool scalarizeBinop(Instruction &I, const TargetTransformInfo &TTI) {
+  Instruction *Ins0, *Ins1;
+  if (!match(&I, m_BinOp(m_Instruction(Ins0), m_Instruction(Ins1))))
+    return false;
+
+  // Both operands must insert into a constant vector at the same constant
+  // index. m_SpecificInt compares by value, so the index types may differ.
+  // TODO: Loosen restriction for one-use by adjusting cost equation.
+  // TODO: Deal with mismatched index constants and variable indexes?
+  Constant *VecC0, *VecC1;
+  Value *V0, *V1;
+  uint64_t Index;
+  if (!match(Ins0, m_OneUse(m_InsertElement(m_Constant(VecC0), m_Value(V0),
+                                            m_ConstantInt(Index)))) ||
+      !match(Ins1, m_OneUse(m_InsertElement(m_Constant(VecC1), m_Value(V1),
+                                            m_SpecificInt(Index)))))
+    return false;
+
+  Type *ScalarTy = V0->getType();
+  Type *VecTy = I.getType();
+  assert(VecTy->isVectorTy() && ScalarTy == V1->getType() &&
+         (ScalarTy->isIntegerTy() || ScalarTy->isFloatingPointTy()) &&
+         "Unexpected types for insert into binop");
+
+  Instruction::BinaryOps Opcode = cast<BinaryOperator>(&I)->getOpcode();
+  int ScalarOpCost = TTI.getArithmeticInstrCost(Opcode, ScalarTy);
+  int VectorOpCost = TTI.getArithmeticInstrCost(Opcode, VecTy);
+
+  // Get cost estimate for the insert element. This cost will factor into
+  // both sequences: the old sequence has two inserts feeding the vector op,
+  // the new sequence has one insert fed by the scalar op.
+  int InsertCost =
+      TTI.getVectorInstrCost(Instruction::InsertElement, VecTy, Index);
+  int OldCost = InsertCost + InsertCost + VectorOpCost;
+  int NewCost = ScalarOpCost + InsertCost;
+
+  // We want to scalarize unless the vector variant actually has lower cost.
+  if (OldCost < NewCost)
+    return false;
+
+  // vec_bo (inselt VecC0, V0, Index), (inselt VecC1, V1, Index) -->
+  // inselt NewVecC, (scalar_bo V0, V1), Index
+  ++NumScalarBO;
+  IRBuilder<> Builder(&I);
+  Value *Scalar = Builder.CreateBinOp(Opcode, V0, V1, I.getName() + ".scalar");
+
+  // All IR flags are safe to back-propagate. There is no potential for extra
+  // poison to be created by the scalar instruction. The dyn_cast guards
+  // against the builder returning a value that is not an instruction.
+  if (auto *ScalarInst = dyn_cast<Instruction>(Scalar))
+    ScalarInst->copyIRFlags(&I);
+
+  // Fold the vector constants in the original vectors into a new base vector.
+  Constant *NewVecC = ConstantExpr::get(Opcode, VecC0, VecC1);
+  Value *Insert = Builder.CreateInsertElement(NewVecC, Scalar, Index);
+  // Only the uses of I are redirected; this function does not erase the old
+  // binop or its insertelement operands.
+  I.replaceAllUsesWith(Insert);
+  Insert->takeName(&I);
+  return true;
+}
+
 /// This is the entry point for all transforms. Pass manager differences are
 /// handled in the callers of this function.
 static bool runImpl(Function &F, const TargetTransformInfo &TTI,
@@ -330,6 +401,7 @@
         continue;
       MadeChange |= foldExtractExtract(I, TTI);
       MadeChange |= foldBitcastShuf(I, TTI);
+      MadeChange |= scalarizeBinop(I, TTI);
     }
   }
 
diff --git a/llvm/test/Transforms/VectorCombine/X86/insert-binop.ll b/llvm/test/Transforms/VectorCombine/X86/insert-binop.ll
--- a/llvm/test/Transforms/VectorCombine/X86/insert-binop.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/insert-binop.ll
@@ -8,9 +8,8 @@
 
 define <16 x i8> @ins0_ins0_add(i8 %x, i8 %y) {
 ; CHECK-LABEL: @ins0_ins0_add(
-; CHECK-NEXT:    [[I0:%.*]] = insertelement <16 x i8> undef, i8 [[X:%.*]], i32 0
-; CHECK-NEXT:    [[I1:%.*]] = insertelement <16 x i8> undef, i8 [[Y:%.*]], i32 0
-; CHECK-NEXT:    [[R:%.*]] = add <16 x i8> [[I0]], [[I1]]
+; CHECK-NEXT:    [[R_SCALAR:%.*]] = add i8 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT:    [[R:%.*]] = insertelement <16 x i8> undef, i8 [[R_SCALAR]], i64 0
 ; CHECK-NEXT:    ret <16 x i8> [[R]]
 ;
   %i0 = insertelement <16 x i8> undef, i8 %x, i32 0
@@ -23,9 +22,8 @@
 
 define <8 x i16> @ins0_ins0_sub_flags(i16 %x, i16 %y) {
 ; CHECK-LABEL: @ins0_ins0_sub_flags(
-; CHECK-NEXT:    [[I0:%.*]] = insertelement <8 x i16> undef, i16 [[X:%.*]], i8 5
-; CHECK-NEXT:    [[I1:%.*]] = insertelement <8 x i16> undef, i16 [[Y:%.*]], i32 5
-; CHECK-NEXT:    [[R:%.*]] = sub nuw nsw <8 x i16> [[I0]], [[I1]]
+; CHECK-NEXT:    [[R_SCALAR:%.*]] = sub nuw nsw i16 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT:    [[R:%.*]] = insertelement <8 x i16> undef, i16 [[R_SCALAR]], i64 5
 ; CHECK-NEXT:    ret <8 x i16> [[R]]
 ;
   %i0 = insertelement <8 x i16> undef, i16 %x, i8 5
@@ -34,11 +32,13 @@
   ret <8 x i16> %r
 }
 
+; The new vector constant is calculated by constant folding.
+; This is conservatively created as zero rather than undef for 'undef ^ undef'.
+ define <2 x i64> @ins1_ins1_xor(i64 %x, i64 %y) { ; CHECK-LABEL: @ins1_ins1_xor( -; CHECK-NEXT: [[I0:%.*]] = insertelement <2 x i64> undef, i64 [[X:%.*]], i64 1 -; CHECK-NEXT: [[I1:%.*]] = insertelement <2 x i64> undef, i64 [[Y:%.*]], i32 1 -; CHECK-NEXT: [[R:%.*]] = xor <2 x i64> [[I0]], [[I1]] +; CHECK-NEXT: [[R_SCALAR:%.*]] = xor i64 [[X:%.*]], [[Y:%.*]] +; CHECK-NEXT: [[R:%.*]] = insertelement <2 x i64> zeroinitializer, i64 [[R_SCALAR]], i64 1 ; CHECK-NEXT: ret <2 x i64> [[R]] ; %i0 = insertelement <2 x i64> undef, i64 %x, i64 1 @@ -51,9 +51,8 @@ define <2 x double> @ins0_ins0_fadd(double %x, double %y) { ; CHECK-LABEL: @ins0_ins0_fadd( -; CHECK-NEXT: [[I0:%.*]] = insertelement <2 x double> undef, double [[X:%.*]], i32 0 -; CHECK-NEXT: [[I1:%.*]] = insertelement <2 x double> undef, double [[Y:%.*]], i32 0 -; CHECK-NEXT: [[R:%.*]] = fadd reassoc nsz <2 x double> [[I0]], [[I1]] +; CHECK-NEXT: [[R_SCALAR:%.*]] = fadd reassoc nsz double [[X:%.*]], [[Y:%.*]] +; CHECK-NEXT: [[R:%.*]] = insertelement <2 x double> undef, double [[R_SCALAR]], i64 0 ; CHECK-NEXT: ret <2 x double> [[R]] ; %i0 = insertelement <2 x double> undef, double %x, i32 0 @@ -62,6 +61,8 @@ ret <2 x double> %r } +; Negative test - mismatched indexes (but could fold this). + define <16 x i8> @ins1_ins0_add(i8 %x, i8 %y) { ; CHECK-LABEL: @ins1_ins0_add( ; CHECK-NEXT: [[I0:%.*]] = insertelement <16 x i8> undef, i8 [[X:%.*]], i32 1 @@ -75,11 +76,12 @@ ret <16 x i8> %r } +; Base vector does not have to be undef. 
+ define <4 x i32> @ins0_ins0_mul(i32 %x, i32 %y) { ; CHECK-LABEL: @ins0_ins0_mul( -; CHECK-NEXT: [[I0:%.*]] = insertelement <4 x i32> zeroinitializer, i32 [[X:%.*]], i32 0 -; CHECK-NEXT: [[I1:%.*]] = insertelement <4 x i32> undef, i32 [[Y:%.*]], i32 0 -; CHECK-NEXT: [[R:%.*]] = mul <4 x i32> [[I0]], [[I1]] +; CHECK-NEXT: [[R_SCALAR:%.*]] = mul i32 [[X:%.*]], [[Y:%.*]] +; CHECK-NEXT: [[R:%.*]] = insertelement <4 x i32> zeroinitializer, i32 [[R_SCALAR]], i64 0 ; CHECK-NEXT: ret <4 x i32> [[R]] ; %i0 = insertelement <4 x i32> zeroinitializer, i32 %x, i32 0 @@ -88,11 +90,12 @@ ret <4 x i32> %r } +; It is safe to scalarize any binop (no extra UB/poison danger). + define <2 x i64> @ins1_ins1_sdiv(i64 %x, i64 %y) { ; CHECK-LABEL: @ins1_ins1_sdiv( -; CHECK-NEXT: [[I0:%.*]] = insertelement <2 x i64> , i64 [[X:%.*]], i64 1 -; CHECK-NEXT: [[I1:%.*]] = insertelement <2 x i64> , i64 [[Y:%.*]], i32 1 -; CHECK-NEXT: [[R:%.*]] = sdiv <2 x i64> [[I0]], [[I1]] +; CHECK-NEXT: [[R_SCALAR:%.*]] = sdiv i64 [[X:%.*]], [[Y:%.*]] +; CHECK-NEXT: [[R:%.*]] = insertelement <2 x i64> , i64 [[R_SCALAR]], i64 1 ; CHECK-NEXT: ret <2 x i64> [[R]] ; %i0 = insertelement <2 x i64> , i64 %x, i64 1 @@ -101,11 +104,12 @@ ret <2 x i64> %r } +; Constant folding deals with undef per element - the entire value does not become undef. 
+ define <2 x i64> @ins1_ins1_udiv(i64 %x, i64 %y) { ; CHECK-LABEL: @ins1_ins1_udiv( -; CHECK-NEXT: [[I0:%.*]] = insertelement <2 x i64> , i64 [[X:%.*]], i32 1 -; CHECK-NEXT: [[I1:%.*]] = insertelement <2 x i64> , i64 [[Y:%.*]], i32 1 -; CHECK-NEXT: [[R:%.*]] = udiv <2 x i64> [[I0]], [[I1]] +; CHECK-NEXT: [[R_SCALAR:%.*]] = udiv i64 [[X:%.*]], [[Y:%.*]] +; CHECK-NEXT: [[R:%.*]] = insertelement <2 x i64> , i64 [[R_SCALAR]], i64 1 ; CHECK-NEXT: ret <2 x i64> [[R]] ; %i0 = insertelement <2 x i64> , i64 %x, i32 1 @@ -114,11 +118,13 @@ ret <2 x i64> %r } +; This could be simplified -- creates immediate UB without the transform because +; divisor has an undef element -- but that is hidden after the transform. + define <2 x i64> @ins1_ins1_urem(i64 %x, i64 %y) { ; CHECK-LABEL: @ins1_ins1_urem( -; CHECK-NEXT: [[I0:%.*]] = insertelement <2 x i64> , i64 [[X:%.*]], i64 1 -; CHECK-NEXT: [[I1:%.*]] = insertelement <2 x i64> , i64 [[Y:%.*]], i32 1 -; CHECK-NEXT: [[R:%.*]] = urem <2 x i64> [[I0]], [[I1]] +; CHECK-NEXT: [[R_SCALAR:%.*]] = urem i64 [[X:%.*]], [[Y:%.*]] +; CHECK-NEXT: [[R:%.*]] = insertelement <2 x i64> , i64 [[R_SCALAR]], i64 1 ; CHECK-NEXT: ret <2 x i64> [[R]] ; %i0 = insertelement <2 x i64> , i64 %x, i64 1 @@ -127,6 +133,9 @@ ret <2 x i64> %r } +; Negative test +; TODO: extra use can be accounted for in cost calculation. + define <4 x i32> @ins0_ins0_xor(i32 %x, i32 %y) { ; CHECK-LABEL: @ins0_ins0_xor( ; CHECK-NEXT: [[I0:%.*]] = insertelement <4 x i32> undef, i32 [[X:%.*]], i32 0