Index: llvm/lib/Transforms/Vectorize/VectorCombine.cpp
===================================================================
--- llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -50,7 +50,7 @@
 /// and if one of the extracts should be transformed to a shufflevector, set
 /// \p ConvertToShuffle to that extract instruction.
 static bool isExtractExtractCheap(Instruction *Ext0, Instruction *Ext1,
-                                  unsigned Opcode,
+                                  const Instruction &OpInst,
                                   const TargetTransformInfo &TTI,
                                   Instruction *&ConvertToShuffle,
                                   unsigned PreferredExtractIndex) {
@@ -62,6 +62,7 @@
   int ScalarOpCost, VectorOpCost;
 
   // Get cost estimates for scalar and vector versions of the operation.
+  unsigned Opcode = OpInst.getOpcode();
   bool IsBinOp = Instruction::isBinaryOp(Opcode);
   if (IsBinOp) {
     ScalarOpCost = TTI.getArithmeticInstrCost(Opcode, ScalarTy);
@@ -79,6 +80,8 @@
   // both sequences.
   unsigned Ext0Index = cast<ConstantInt>(Ext0->getOperand(1))->getZExtValue();
   unsigned Ext1Index = cast<ConstantInt>(Ext1->getOperand(1))->getZExtValue();
+  Value *Ext0Vector = Ext0->getOperand(0);
+  Value *Ext1Vector = Ext1->getOperand(0);
 
   int Extract0Cost =
       TTI.getVectorInstrCost(Instruction::ExtractElement, VecTy, Ext0Index);
@@ -97,7 +100,7 @@
   // Extra uses of the extracts mean that we include those costs in the
   // vector total because those instructions will not be eliminated.
   int OldCost, NewCost;
-  if (Ext0->getOperand(0) == Ext1->getOperand(0) && Ext0Index == Ext1Index) {
+  if (Ext0Vector == Ext1Vector && Ext0Index == Ext1Index) {
     // Handle a special case. If the 2 extracts are identical, adjust the
     // formulas to account for that. The extra use charge allows for either the
     // CSE'd pattern or an unoptimized form with identical values:
@@ -118,35 +121,52 @@
   if (Ext0Index == Ext1Index) {
     // If the extract indexes are identical, no shuffle is needed.
     ConvertToShuffle = nullptr;
-  } else {
-    if (IsBinOp && DisableBinopExtractShuffle)
-      return true;
-
-    // If we are extracting from 2 different indexes, then one operand must be
-    // shuffled before performing the vector operation. The shuffle mask is
-    // undefined except for 1 lane that is being translated to the remaining
-    // extraction lane. Therefore, it is a splat shuffle. Ex:
-    // ShufMask = { undef, undef, 0, undef }
-    // TODO: The cost model has an option for a "broadcast" shuffle
-    // (splat-from-element-0), but no option for a more general splat.
-    NewCost +=
-        TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, VecTy);
-
-    // The more expensive extract will be replaced by a shuffle. If the costs
-    // are equal and there is a preferred extract index, shuffle the opposite
-    // operand. Otherwise, replace the extract with the higher index.
-    if (Extract0Cost > Extract1Cost)
-      ConvertToShuffle = Ext0;
-    else if (Extract1Cost > Extract0Cost)
-      ConvertToShuffle = Ext1;
-    else if (PreferredExtractIndex == Ext0Index)
-      ConvertToShuffle = Ext1;
-    else if (PreferredExtractIndex == Ext1Index)
-      ConvertToShuffle = Ext0;
-    else
-      ConvertToShuffle = Ext0Index > Ext1Index ? Ext0 : Ext1;
+    return OldCost < NewCost;
   }
+
+  if (Ext0Vector == Ext1Vector && OpInst.hasOneUse()) {
+    // If this matches a reduction pattern, do nothing.
+    const auto *ReductionInst = OpInst.user_back();
+    // The user instruction must be the same operation.
+    if (ReductionInst->getOpcode() == Opcode) {
+      assert(ReductionInst->getNumOperands() == 2 &&
+             "Expected a binary operation or compare");
+      Value *SecondOp = ReductionInst->getOperand(0) == &OpInst
+                            ? ReductionInst->getOperand(1)
+                            : ReductionInst->getOperand(0);
+      auto *Ext2 = dyn_cast<ExtractElementInst>(SecondOp);
+      if (Ext2 && Ext2->getOperand(0) == Ext0Vector)
+        return true;
+    }
+  }
+
+  if (IsBinOp && DisableBinopExtractShuffle)
+    return true;
+
+  // If we are extracting from 2 different indexes, then one operand must be
+  // shuffled before performing the vector operation. The shuffle mask is
+  // undefined except for 1 lane that is being translated to the remaining
+  // extraction lane. Therefore, it is a splat shuffle. Ex:
+  // ShufMask = { undef, undef, 0, undef }
+  // TODO: The cost model has an option for a "broadcast" shuffle
+  // (splat-from-element-0), but no option for a more general splat.
+  NewCost +=
+      TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, VecTy);
+
+  // The more expensive extract will be replaced by a shuffle. If the costs
+  // are equal and there is a preferred extract index, shuffle the opposite
+  // operand. Otherwise, replace the extract with the higher index.
+  if (Extract0Cost > Extract1Cost)
+    ConvertToShuffle = Ext0;
+  else if (Extract1Cost > Extract0Cost)
+    ConvertToShuffle = Ext1;
+  else if (PreferredExtractIndex == Ext0Index)
+    ConvertToShuffle = Ext1;
+  else if (PreferredExtractIndex == Ext1Index)
+    ConvertToShuffle = Ext0;
+  else
+    ConvertToShuffle = Ext0Index > Ext1Index ? Ext0 : Ext1;
 
   // Aggressively form a vector op if the cost is equal because the transform
   // may enable further optimization.
   // Codegen can reverse this transform (scalarize) if it was not profitable.
@@ -226,8 +246,7 @@
                        m_ConstantInt(InsertIndex)));
 
   Instruction *ConvertToShuffle;
-  if (isExtractExtractCheap(Ext0, Ext1, I.getOpcode(), TTI, ConvertToShuffle,
-                            InsertIndex))
+  if (isExtractExtractCheap(Ext0, Ext1, I, TTI, ConvertToShuffle, InsertIndex))
     return false;
 
   if (ConvertToShuffle) {
Index: llvm/test/Transforms/VectorCombine/X86/extract-binop.ll
===================================================================
--- llvm/test/Transforms/VectorCombine/X86/extract-binop.ll
+++ llvm/test/Transforms/VectorCombine/X86/extract-binop.ll
@@ -486,3 +486,23 @@
   %v3 = insertelement <4 x float> %v2, float %b23, i32 3
   ret <4 x float> %v3
 }
+
+; Negative test - SIMD reduction operation.
+
+define i32 @ext_ext_reduction(<4 x i32> %x, <4 x i32> %y) {
+; CHECK-LABEL: @ext_ext_reduction(
+; CHECK-NEXT: %and = and <4 x i32> [[X:%.*]], [[Y:%.*]]
+; CHECK-NOT: [[TMP1:%.*]] = shufflevector <4 x i32> %and, <4 x i32> undef, <4 x i32>
+;
+  %and = and <4 x i32> %x, %y
+  %vecext = extractelement <4 x i32> %and, i32 0
+  %vecext.1 = extractelement <4 x i32> %and, i32 1
+  %add.1 = or i32 %vecext.1, %vecext
+  %vecext.2 = extractelement <4 x i32> %and, i32 2
+  %add.2 = or i32 %vecext.2, %add.1
+  %vecext.3 = extractelement <4 x i32> %and, i32 3
+  %add.3 = or i32 %vecext.3, %add.2
+  ret i32 %add.3
+}
+
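For reference, here is the reduction-step bail-out from the first hunk pulled out as a standalone predicate. This is only an illustrative sketch, not part of the patch: the helper name isLikelyReductionStep is invented here, and in the patch the same logic is inlined into isExtractExtractCheap() behind the Ext0Vector == Ext1Vector guard, so it only fires when both extracts read the same vector, as in the @ext_ext_reduction test above.

// Illustrative sketch (assumed helper, not in VectorCombine.cpp): return true
// when OpInst looks like one step of a scalarized vector reduction, i.e. its
// sole user repeats the same opcode and consumes another extract of SrcVec.
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"

using namespace llvm;

static bool isLikelyReductionStep(const Instruction &OpInst,
                                  const Value *SrcVec) {
  if (!OpInst.hasOneUse())
    return false;
  const Instruction *UserI = OpInst.user_back();
  // The single user must perform the same operation (e.g. the next 'or' in an
  // or-reduction chain) with exactly two operands.
  if (UserI->getOpcode() != OpInst.getOpcode() || UserI->getNumOperands() != 2)
    return false;
  // Its other operand must be yet another extract from the same source
  // vector: the next lane being folded into the reduction.
  const Value *OtherOp = UserI->getOperand(0) == &OpInst ? UserI->getOperand(1)
                                                         : UserI->getOperand(0);
  const auto *NextExt = dyn_cast<ExtractElementInst>(OtherOp);
  return NextExt && NextExt->getVectorOperand() == SrcVec;
}

When such a chain is detected, the cost query reports the extract/extract pair as cheap (returns true), so the caller leaves the scalarized reduction intact for the backend to match rather than introducing a shuffle in the middle of it.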