diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -312,23 +312,41 @@
 /// Match a vector binop instruction with inserted scalar operands and convert
 /// to scalar binop followed by insertelement.
 static bool scalarizeBinop(Instruction &I, const TargetTransformInfo &TTI) {
-  Instruction *Ins0, *Ins1;
-  if (!match(&I, m_BinOp(m_Instruction(Ins0), m_Instruction(Ins1))))
+  Value *Ins0, *Ins1;
+  if (!match(&I, m_BinOp(m_Value(Ins0), m_Value(Ins1))))
     return false;
 
+  // Match against one or both scalar values being inserted into constant
+  // vectors:
+  // vec_bo VecC0, (inselt VecC1, V1, Index)
+  // vec_bo (inselt VecC0, V0, Index), VecC1
+  // vec_bo (inselt VecC0, V0, Index), (inselt VecC1, V1, Index)
   // TODO: Deal with mismatched index constants and variable indexes?
   Constant *VecC0, *VecC1;
   Value *V0, *V1;
-  uint64_t Index;
+  uint64_t Index0, Index1;
   if (!match(Ins0, m_InsertElt(m_Constant(VecC0), m_Value(V0),
-                               m_ConstantInt(Index))) ||
-      !match(Ins1, m_InsertElt(m_Constant(VecC1), m_Value(V1),
-                               m_SpecificInt(Index))))
+                               m_ConstantInt(Index0)))) {
+    V0 = nullptr;
+    if (!match(Ins0, m_Constant(VecC0)))
+      return false;
+  }
+  if (!match(Ins1, m_InsertElt(m_Constant(VecC1), m_Value(V1),
+                               m_ConstantInt(Index1)))) {
+    V1 = nullptr;
+    if (!match(Ins1, m_Constant(VecC1)))
+      return false;
+  }
+  if (!V0 && !V1)
+    return false;
+  if (V0 && V1 && Index0 != Index1)
     return false;
 
-  Type *ScalarTy = V0->getType();
+  uint64_t Index = V0 ? Index0 : Index1;
+  Type *ScalarTy = V0 ? V0->getType() : V1->getType();
   Type *VecTy = I.getType();
-  assert(VecTy->isVectorTy() && ScalarTy == V1->getType() &&
+  assert(VecTy->isVectorTy() &&
+         (!V0 || !V1 || V0->getType() == V1->getType()) &&
          (ScalarTy->isIntegerTy() || ScalarTy->isFloatingPointTy()) &&
          "Unexpected types for insert into binop");
 
@@ -340,10 +358,10 @@
   // both sequences.
   int InsertCost =
       TTI.getVectorInstrCost(Instruction::InsertElement, VecTy, Index);
-  int OldCost = InsertCost + InsertCost + VectorOpCost;
+  int OldCost = (V0 ? InsertCost : 0) + (V1 ? InsertCost : 0) + VectorOpCost;
   int NewCost = ScalarOpCost + InsertCost +
-                !Ins0->hasOneUse() * InsertCost +
-                !Ins1->hasOneUse() * InsertCost;
+                (V0 ? !Ins0->hasOneUse() * InsertCost : 0) +
+                (V1 ? !Ins1->hasOneUse() * InsertCost : 0);
 
   // We want to scalarize unless the vector variant actually has lower cost.
   if (OldCost < NewCost)
@@ -353,6 +371,13 @@
   // inselt NewVecC, (scalar_bo V0, V1), Index
   ++NumScalarBO;
   IRBuilder<> Builder(&I);
+
+  // For constant cases, extract the scalar element, this should constant fold.
+  if (!V0)
+    V0 = Builder.CreateExtractElement(VecC0, Index);
+  if (!V1)
+    V1 = Builder.CreateExtractElement(VecC1, Index);
+
   Value *Scalar = Builder.CreateBinOp(Opcode, V0, V1, I.getName() + ".scalar");
 
   // All IR flags are safe to back-propagate. There is no potential for extra
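
The effect of the new matching is easiest to see on a small hand-written example (illustrative only, not part of the patch; value names such as %r.scalar approximate what IRBuilder produces from I.getName() + ".scalar"). Previously both binop operands had to be insertelement instructions with the same constant index; now a bare constant vector operand is also accepted, and its lane is extracted on demand, which constant-folds:

  ; before: %i is an insert, but <i64 1, i64 2> is a plain constant vector,
  ; so the old matcher gave up
  %i = insertelement <2 x i64> <i64 undef, i64 42>, i64 %x, i32 0
  %r = add <2 x i64> %i, <i64 1, i64 2>

  ; after scalarization: the untouched lanes fold away as
  ; NewVecC = add (<undef, 42>, <1, 2>) = <undef, 44>
  %r.scalar = add i64 %x, 1
  %r = insertelement <2 x i64> <i64 undef, i64 44>, i64 %r.scalar, i64 0

The cost model is adjusted to match: OldCost now only charges an insertelement for operands that really are inserts, so an all-constant operand no longer inflates the cost of the existing vector sequence.
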
diff --git a/llvm/test/Transforms/VectorCombine/X86/insert-binop.ll b/llvm/test/Transforms/VectorCombine/X86/insert-binop.ll
--- a/llvm/test/Transforms/VectorCombine/X86/insert-binop.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/insert-binop.ll
@@ -232,3 +232,55 @@
   %r = fdiv <4 x float> %i0, %i1
   ret <4 x float> %r
 }
+
+define <4 x i32> @PR42174(<4 x i32> %arg, i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, i32 %arg5, i32 %arg6, i32 %arg7, i32 %arg8) {
+; CHECK-LABEL: @PR42174(
+; CHECK-NEXT:    [[TMP:%.*]] = sdiv i32 [[ARG4:%.*]], 2
+; CHECK-NEXT:    [[TMP10:%.*]] = mul nsw i32 [[ARG6:%.*]], 6234
+; CHECK-NEXT:    [[TMP12:%.*]] = mul nsw i32 [[ARG3:%.*]], 75
+; CHECK-NEXT:    [[TMP14:%.*]] = sdiv i32 [[ARG7:%.*]], 3452
+; CHECK-NEXT:    [[TMP16:%.*]] = mul nsw i32 [[ARG5:%.*]], 53
+; CHECK-NEXT:    [[TMP18:%.*]] = sdiv i32 [[ARG2:%.*]], 820
+; CHECK-NEXT:    [[TMP20:%.*]] = shl nsw i32 [[ARG8:%.*]], 2
+; CHECK-NEXT:    [[TMP23_SCALAR:%.*]] = add i32 [[ARG1:%.*]], 1
+; CHECK-NEXT:    [[TMP24_SCALAR:%.*]] = add i32 [[TMP23_SCALAR]], [[TMP18]]
+; CHECK-NEXT:    [[TMP25_SCALAR:%.*]] = add i32 [[TMP24_SCALAR]], [[TMP12]]
+; CHECK-NEXT:    [[TMP26_SCALAR:%.*]] = add i32 [[TMP25_SCALAR]], [[TMP]]
+; CHECK-NEXT:    [[TMP27_SCALAR:%.*]] = add i32 [[TMP26_SCALAR]], [[TMP16]]
+; CHECK-NEXT:    [[TMP28_SCALAR:%.*]] = add i32 [[TMP27_SCALAR]], [[TMP10]]
+; CHECK-NEXT:    [[TMP29_SCALAR:%.*]] = add i32 [[TMP28_SCALAR]], [[TMP14]]
+; CHECK-NEXT:    [[TMP30_SCALAR:%.*]] = add i32 [[TMP29_SCALAR]], [[TMP20]]
+; CHECK-NEXT:    [[TMP31_SCALAR:%.*]] = add i32 [[TMP30_SCALAR]], 317425
+; CHECK-NEXT:    [[TMP31:%.*]] = insertelement <4 x i32> undef, i32 [[TMP31_SCALAR]], i64 0
+; CHECK-NEXT:    [[TMP32:%.*]] = shufflevector <4 x i32> [[TMP31]], <4 x i32> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP33:%.*]] = add <4 x i32> [[TMP32]], [[ARG:%.*]]
+; CHECK-NEXT:    ret <4 x i32> [[TMP33]]
+;
+  %tmp = sdiv i32 %arg4, 2
+  %tmp9 = insertelement <4 x i32> undef, i32 %tmp, i32 0
+  %tmp10 = mul nsw i32 %arg6, 6234
+  %tmp11 = insertelement <4 x i32> undef, i32 %tmp10, i32 0
+  %tmp12 = mul nsw i32 %arg3, 75
+  %tmp13 = insertelement <4 x i32> undef, i32 %tmp12, i32 0
+  %tmp14 = sdiv i32 %arg7, 3452
+  %tmp15 = insertelement <4 x i32> undef, i32 %tmp14, i32 0
+  %tmp16 = mul nsw i32 %arg5, 53
+  %tmp17 = insertelement <4 x i32> undef, i32 %tmp16, i32 0
+  %tmp18 = sdiv i32 %arg2, 820
+  %tmp19 = insertelement <4 x i32> undef, i32 %tmp18, i32 0
+  %tmp20 = shl nsw i32 %arg8, 2
+  %tmp21 = insertelement <4 x i32> undef, i32 %tmp20, i32 0
+  %tmp22 = insertelement <4 x i32> undef, i32 %arg1, i32 0
+  %tmp23 = add <4 x i32> %tmp22, <i32 1, i32 undef, i32 undef, i32 undef>
+  %tmp24 = add <4 x i32> %tmp23, %tmp19
+  %tmp25 = add <4 x i32> %tmp24, %tmp13
+  %tmp26 = add <4 x i32> %tmp25, %tmp9
+  %tmp27 = add <4 x i32> %tmp26, %tmp17
+  %tmp28 = add <4 x i32> %tmp27, %tmp11
+  %tmp29 = add <4 x i32> %tmp28, %tmp15
+  %tmp30 = add <4 x i32> %tmp29, %tmp21
+  %tmp31 = add <4 x i32> %tmp30, <i32 317425, i32 undef, i32 undef, i32 undef>
+  %tmp32 = shufflevector <4 x i32> %tmp31, <4 x i32> undef, <4 x i32> zeroinitializer
+  %tmp33 = add <4 x i32> %tmp32, %arg
+  ret <4 x i32> %tmp33
+}
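
The CHECK lines above follow the llvm/utils/update_test_checks.py style already used in this file, so they can be regenerated after rebasing. As a rough local check (a sketch; the authoritative flags are in the test's RUN line, which this hunk does not show), something like:

  opt -passes=vector-combine -S llvm/test/Transforms/VectorCombine/X86/insert-binop.ll

should show the eight feeding adds reduced to scalar ops with a single splatted insertelement at the end, which is the expected resolution of PR42174.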