Index: llvm/lib/Transforms/Vectorize/VectorCombine.cpp
===================================================================
--- llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -51,7 +51,7 @@
 /// and if one of the extracts should be transformed to a shufflevector, set
 /// \p ConvertToShuffle to that extract instruction.
 static bool isExtractExtractCheap(Instruction *Ext0, Instruction *Ext1,
-                                  unsigned Opcode,
+                                  const Instruction &OpInst,
                                   const TargetTransformInfo &TTI,
                                   Instruction *&ConvertToShuffle,
                                   unsigned PreferredExtractIndex) {
@@ -63,6 +63,7 @@
   int ScalarOpCost, VectorOpCost;
 
   // Get cost estimates for scalar and vector versions of the operation.
+  unsigned Opcode = OpInst.getOpcode();
   bool IsBinOp = Instruction::isBinaryOp(Opcode);
   if (IsBinOp) {
     ScalarOpCost = TTI.getArithmeticInstrCost(Opcode, ScalarTy);
@@ -80,6 +81,8 @@
   // both sequences.
   unsigned Ext0Index = cast<ConstantInt>(Ext0->getOperand(1))->getZExtValue();
   unsigned Ext1Index = cast<ConstantInt>(Ext1->getOperand(1))->getZExtValue();
+  Value *Ext0Vector = Ext0->getOperand(0);
+  Value *Ext1Vector = Ext1->getOperand(0);
 
   int Extract0Cost = TTI.getVectorInstrCost(Instruction::ExtractElement,
                                             VecTy, Ext0Index);
@@ -98,7 +101,7 @@
   // Extra uses of the extracts mean that we include those costs in the
   // vector total because those instructions will not be eliminated.
   int OldCost, NewCost;
-  if (Ext0->getOperand(0) == Ext1->getOperand(0) && Ext0Index == Ext1Index) {
+  if (Ext0Vector == Ext1Vector && Ext0Index == Ext1Index) {
     // Handle a special case. If the 2 extracts are identical, adjust the
     // formulas to account for that. The extra use charge allows for either the
     // CSE'd pattern or an unoptimized form with identical values:
@@ -119,35 +122,52 @@
   if (Ext0Index == Ext1Index) {
     // If the extract indexes are identical, no shuffle is needed.
     ConvertToShuffle = nullptr;
-  } else {
-    if (IsBinOp && DisableBinopExtractShuffle)
-      return true;
-
-    // If we are extracting from 2 different indexes, then one operand must be
-    // shuffled before performing the vector operation. The shuffle mask is
-    // undefined except for 1 lane that is being translated to the remaining
-    // extraction lane. Therefore, it is a splat shuffle. Ex:
-    // ShufMask = { undef, undef, 0, undef }
-    // TODO: The cost model has an option for a "broadcast" shuffle
-    // (splat-from-element-0), but no option for a more general splat.
-    NewCost +=
-        TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, VecTy);
-
-    // The more expensive extract will be replaced by a shuffle. If the costs
-    // are equal and there is a preferred extract index, shuffle the opposite
-    // operand. Otherwise, replace the extract with the higher index.
-    if (Extract0Cost > Extract1Cost)
-      ConvertToShuffle = Ext0;
-    else if (Extract1Cost > Extract0Cost)
-      ConvertToShuffle = Ext1;
-    else if (PreferredExtractIndex == Ext0Index)
-      ConvertToShuffle = Ext1;
-    else if (PreferredExtractIndex == Ext1Index)
-      ConvertToShuffle = Ext0;
-    else
-      ConvertToShuffle = Ext0Index > Ext1Index ? Ext0 : Ext1;
+    return OldCost < NewCost;
   }
 
+  if (Ext0Vector == Ext1Vector && OpInst.hasOneUse()) {
+    // If this op matches a reduction pattern, leave the extracts in place.
+    const auto *ReductionInst = OpInst.user_back();
+    // The single user performs the same operation.
+    if (ReductionInst->getOpcode() == Opcode) {
+      assert(ReductionInst->getNumOperands() == 2 &&
+             "Expected a binary operation or compare");
+      Value *SecondOp = ReductionInst->getOperand(0) == &OpInst
+                            ? ReductionInst->getOperand(1)
+                            : ReductionInst->getOperand(0);
+      auto *Ext2 = dyn_cast<ExtractElementInst>(SecondOp);
+      if (Ext2 && Ext2->getOperand(0) == Ext0Vector)
+        return true;
+    }
+  }
+
+  if (IsBinOp && DisableBinopExtractShuffle)
+    return true;
+
+  // If we are extracting from 2 different indexes, then one operand must be
+  // shuffled before performing the vector operation. The shuffle mask is
+  // undefined except for 1 lane that is being translated to the remaining
+  // extraction lane. Therefore, it is a splat shuffle. Ex:
+  // ShufMask = { undef, undef, 0, undef }
+  // TODO: The cost model has an option for a "broadcast" shuffle
+  // (splat-from-element-0), but no option for a more general splat.
+  NewCost +=
+      TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, VecTy);
+
+  // The more expensive extract will be replaced by a shuffle. If the costs
+  // are equal and there is a preferred extract index, shuffle the opposite
+  // operand. Otherwise, replace the extract with the higher index.
+  if (Extract0Cost > Extract1Cost)
+    ConvertToShuffle = Ext0;
+  else if (Extract1Cost > Extract0Cost)
+    ConvertToShuffle = Ext1;
+  else if (PreferredExtractIndex == Ext0Index)
+    ConvertToShuffle = Ext1;
+  else if (PreferredExtractIndex == Ext1Index)
+    ConvertToShuffle = Ext0;
+  else
+    ConvertToShuffle = Ext0Index > Ext1Index ? Ext0 : Ext1;
+
   // Aggressively form a vector op if the cost is equal because the transform
   // may enable further optimization.
   // Codegen can reverse this transform (scalarize) if it was not profitable.
@@ -227,8 +247,7 @@
                                  m_ConstantInt(InsertIndex)));
 
   Instruction *ConvertToShuffle;
-  if (isExtractExtractCheap(Ext0, Ext1, I.getOpcode(), TTI, ConvertToShuffle,
-                            InsertIndex))
+  if (isExtractExtractCheap(Ext0, Ext1, I, TTI, ConvertToShuffle, InsertIndex))
     return false;
 
   if (ConvertToShuffle) {
Index: llvm/test/Transforms/VectorCombine/X86/extract-binop.ll
===================================================================
--- llvm/test/Transforms/VectorCombine/X86/extract-binop.ll
+++ llvm/test/Transforms/VectorCombine/X86/extract-binop.ll
@@ -1,6 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt < %s -vector-combine -S -mtriple=x86_64-- -mattr=SSE2 | FileCheck %s --check-prefixes=CHECK,SSE
 ; RUN: opt < %s -vector-combine -S -mtriple=x86_64-- -mattr=AVX2 | FileCheck %s --check-prefixes=CHECK,AVX
+; RUN: opt < %s -O3 -S -mtriple=x86_64-- -mattr=avx | FileCheck %s --check-prefixes=PHASE
+; RUN: opt -passes='default<O3>' -S < %s | FileCheck %s --check-prefixes=PHASE
 
 declare void @use_i8(i8)
 declare void @use_f32(float)
@@ -486,3 +488,92 @@
   %v3 = insertelement <4 x float> %v2, float %b23, i32 3
   ret <4 x float> %v3
 }
+
+; SIMD reduction operation.
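+; All extracts come from the same 'and' and feed a chain of identical scalar
+; ops (a horizontal reduction), so the new check in isExtractExtractCheap()
+; leaves the chain alone for later reduction matching.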
+define i32 @ext_ext_reduction(<4 x i32> %x, <4 x i32> %y) {
+; CHECK-LABEL: @ext_ext_reduction(
+; CHECK-NEXT:    [[AND:%.*]] = and <4 x i32> [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT:    [[VECEXT:%.*]] = extractelement <4 x i32> [[AND]], i32 0
+; CHECK-NEXT:    [[VECEXT_1:%.*]] = extractelement <4 x i32> [[AND]], i32 1
+; CHECK-NEXT:    [[ADD_1:%.*]] = or i32 [[VECEXT_1]], [[VECEXT]]
+; CHECK-NEXT:    [[VECEXT_2:%.*]] = extractelement <4 x i32> [[AND]], i32 2
+; CHECK-NEXT:    [[ADD_2:%.*]] = or i32 [[VECEXT_2]], [[ADD_1]]
+; CHECK-NEXT:    [[VECEXT_3:%.*]] = extractelement <4 x i32> [[AND]], i32 3
+; CHECK-NEXT:    [[ADD_3:%.*]] = or i32 [[VECEXT_3]], [[ADD_2]]
+; CHECK-NEXT:    ret i32 [[ADD_3]]
+;
+; PHASE-LABEL: @ext_ext_reduction(
+; PHASE-NEXT:    [[AND:%.*]] = and <4 x i32> [[Y:%.*]], [[X:%.*]]
+; PHASE-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <4 x i32> [[AND]], <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+; PHASE-NEXT:    [[BIN_RDX:%.*]] = or <4 x i32> [[AND]], [[RDX_SHUF]]
+; PHASE-NEXT:    [[RDX_SHUF1:%.*]] = shufflevector <4 x i32> [[BIN_RDX]], <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+; PHASE-NEXT:    [[BIN_RDX2:%.*]] = or <4 x i32> [[BIN_RDX]], [[RDX_SHUF1]]
+; PHASE-NEXT:    [[TMP1:%.*]] = extractelement <4 x i32> [[BIN_RDX2]], i32 0
+; PHASE-NEXT:    ret i32 [[TMP1]]
+;
+  %and = and <4 x i32> %x, %y
+  %vecext = extractelement <4 x i32> %and, i32 0
+  %vecext.1 = extractelement <4 x i32> %and, i32 1
+  %add.1 = or i32 %vecext.1, %vecext
+  %vecext.2 = extractelement <4 x i32> %and, i32 2
+  %add.2 = or i32 %vecext.2, %add.1
+  %vecext.3 = extractelement <4 x i32> %and, i32 3
+  %add.3 = or i32 %vecext.3, %add.2
+  ret i32 %add.3
+}
+
+define i32 @ext_ext_reduction_1(<4 x i32> %x, <4 x i32> %y) {
+; CHECK-LABEL: @ext_ext_reduction_1(
+; CHECK-NEXT:    [[VECEXT:%.*]] = extractelement <4 x i32> [[Y:%.*]], i32 0
+; CHECK-NEXT:    [[VECEXT_1:%.*]] = extractelement <4 x i32> [[Y]], i32 1
+; CHECK-NEXT:    [[ADD_1:%.*]] = add i32 [[VECEXT_1]], [[VECEXT]]
+; CHECK-NEXT:    [[VECEXT_2:%.*]] = extractelement <4 x i32> [[Y]], i32 2
+; CHECK-NEXT:    [[ADD_2:%.*]] = add i32 [[VECEXT_2]], [[ADD_1]]
+; CHECK-NEXT:    [[VECEXT_3:%.*]] = extractelement <4 x i32> [[X:%.*]], i32 2
+; CHECK-NEXT:    [[ADD_3:%.*]] = add i32 [[VECEXT_3]], [[ADD_2]]
+; CHECK-NEXT:    ret i32 [[ADD_3]]
+;
+; PHASE-LABEL: @ext_ext_reduction_1(
+; PHASE-NEXT:    [[VECEXT_1:%.*]] = extractelement <4 x i32> [[Y:%.*]], i32 1
+; PHASE-NEXT:    [[VECEXT_2:%.*]] = extractelement <4 x i32> [[Y]], i32 2
+; PHASE-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> undef, <4 x i32> <i32 2, i32 undef, i32 undef, i32 undef>
+; PHASE-NEXT:    [[TMP2:%.*]] = add <4 x i32> [[TMP1]], [[Y]]
+; PHASE-NEXT:    [[TMP3:%.*]] = extractelement <4 x i32> [[TMP2]], i32 0
+; PHASE-NEXT:    [[ADD_2:%.*]] = add i32 [[TMP3]], [[VECEXT_1]]
+; PHASE-NEXT:    [[ADD_3:%.*]] = add i32 [[ADD_2]], [[VECEXT_2]]
+; PHASE-NEXT:    ret i32 [[ADD_3]]
+;
+  %vecext = extractelement <4 x i32> %y, i32 0
+  %vecext.1 = extractelement <4 x i32> %y, i32 1
+  %add.1 = add i32 %vecext.1, %vecext
+  %vecext.2 = extractelement <4 x i32> %y, i32 2
+  %add.2 = add i32 %vecext.2, %add.1
+  %vecext.3 = extractelement <4 x i32> %x, i32 2
+  %add.3 = add i32 %vecext.3, %add.2
+  ret i32 %add.3
+}
+
+define i32 @ext_ext_reduction_2(<4 x i32> %x, <4 x i32> %y) {
+; CHECK-LABEL: @ext_ext_reduction_2(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[Y:%.*]], <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[TMP2:%.*]] = add <4 x i32> [[TMP1]], [[Y]]
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x i32> [[TMP2]], i64 0
+; CHECK-NEXT:    [[VECEXT_2:%.*]] = extractelement <4 x i32> [[X:%.*]], i32 2
+; CHECK-NEXT:    [[ADD_2:%.*]] = add i32 [[VECEXT_2]], [[TMP3]]
+; CHECK-NEXT:    ret i32 [[ADD_2]]
+;
+; PHASE-LABEL: @ext_ext_reduction_2(
+; PHASE-NEXT:    [[VECEXT_1:%.*]] = extractelement <4 x i32> [[Y:%.*]], i32 1
+; PHASE-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> undef, <4 x i32> <i32 2, i32 undef, i32 undef, i32 undef>
+; PHASE-NEXT:    [[TMP2:%.*]] = add <4 x i32> [[TMP1]], [[Y]]
+; PHASE-NEXT:    [[TMP3:%.*]] = extractelement <4 x i32> [[TMP2]], i32 0
+; PHASE-NEXT:    [[ADD_2:%.*]] = add i32 [[TMP3]], [[VECEXT_1]]
+; PHASE-NEXT:    ret i32 [[ADD_2]]
+;
+  %vecext = extractelement <4 x i32> %y, i32 0
+  %vecext.1 = extractelement <4 x i32> %y, i32 1
+  %add.1 = add i32 %vecext.1, %vecext
+  %vecext.2 = extractelement <4 x i32> %x, i32 2
+  %add.2 = add i32 %vecext.2, %add.1
+  ret i32 %add.2
+}
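
Note for reviewers: the new early-out in isExtractExtractCheap() is the core
of this change. Lifted into a standalone helper, it reads as below. This is
only an illustrative sketch, not part of the patch: the helper name
feedsMatchingReduction is invented here, the patch inlines this logic (with
an assert instead of the arity check), and the sketch assumes the usual LLVM
headers are available.

  #include "llvm/IR/Instructions.h"
  using namespace llvm;

  // Returns true when OpInst's only user continues a same-opcode reduction
  // chain with another extract from the same source vector. In that case the
  // extract/extract transform is skipped so later passes can match the whole
  // horizontal reduction instead of forming a partial vector op.
  static bool feedsMatchingReduction(const Instruction &OpInst, Value *SrcVec) {
    if (!OpInst.hasOneUse())
      return false;
    const Instruction *ReductionInst = OpInst.user_back();
    if (ReductionInst->getOpcode() != OpInst.getOpcode() ||
        ReductionInst->getNumOperands() != 2)
      return false;
    // The user's operand that is not OpInst must be another extract from the
    // same source vector.
    Value *SecondOp = ReductionInst->getOperand(0) == &OpInst
                          ? ReductionInst->getOperand(1)
                          : ReductionInst->getOperand(0);
    auto *Ext2 = dyn_cast<ExtractElementInst>(SecondOp);
    return Ext2 && Ext2->getOperand(0) == SrcVec;
  }

The ext_ext_reduction test exercises exactly this shape: each scalar 'or'
feeds another 'or' whose other operand extracts from the same 'and' result,
so VectorCombine now leaves the chain intact and the -O3 pipeline forms the
log2 shuffle reduction shown in the PHASE lines.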