Index: llvm/lib/Transforms/Vectorize/VectorCombine.cpp =================================================================== --- llvm/lib/Transforms/Vectorize/VectorCombine.cpp +++ llvm/lib/Transforms/Vectorize/VectorCombine.cpp @@ -51,7 +51,8 @@ static bool isExtractExtractCheap(Instruction *Ext0, Instruction *Ext1, unsigned Opcode, const TargetTransformInfo &TTI, - Instruction *&ConvertToShuffle) { + Instruction *&ConvertToShuffle, + unsigned PreferredExtractIndex) { assert(isa<ConstantInt>(Ext0->getOperand(1)) && isa<ConstantInt>(Ext1->getOperand(1)) && "Expected constant extract indexes"); @@ -130,12 +131,17 @@ NewCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, VecTy); - // The more expensive extract will be replaced by a shuffle. If the extracts - // have the same cost, replace the extract with the higher index. + // The more expensive extract will be replaced by a shuffle. If the costs + // are equal and there is a preferred extract index, shuffle the opposite + // operand. Otherwise, replace the extract with the higher index. if (Extract0Cost > Extract1Cost) ConvertToShuffle = Ext0; else if (Extract1Cost > Extract0Cost) ConvertToShuffle = Ext1; + else if (PreferredExtractIndex == Ext0Index) + ConvertToShuffle = Ext1; + else if (PreferredExtractIndex == Ext1Index) + ConvertToShuffle = Ext0; else ConvertToShuffle = Ext0Index > Ext1Index ? Ext0 : Ext1; } @@ -208,8 +214,19 @@ V0->getType() != V1->getType()) return false; + // If the scalar value 'I' is going to be re-inserted into a vector, then try + // to create an extract to that same element. The extract/insert can be + // reduced to a "select shuffle". + // TODO: If we add a larger pattern match that starts from an insert, this + // probably becomes unnecessary. 
+ uint64_t InsertIndex = std::numeric_limits<uint64_t>::max(); + if (I.hasOneUse()) + match(I.user_back(), m_InsertElement(m_Value(), m_Value(), + m_ConstantInt(InsertIndex))); + Instruction *ConvertToShuffle; - if (isExtractExtractCheap(Ext0, Ext1, I.getOpcode(), TTI, ConvertToShuffle)) + if (isExtractExtractCheap(Ext0, Ext1, I.getOpcode(), TTI, ConvertToShuffle, + InsertIndex)) return false; if (ConvertToShuffle) { Index: llvm/test/Transforms/VectorCombine/X86/extract-binop.ll =================================================================== --- llvm/test/Transforms/VectorCombine/X86/extract-binop.ll +++ llvm/test/Transforms/VectorCombine/X86/extract-binop.ll @@ -418,9 +418,9 @@ define <4 x float> @ins_bo_ext_ext(<4 x float> %a, <4 x float> %b) { ; CHECK-LABEL: @ins_bo_ext_ext( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> undef, <4 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = fadd <4 x float> [[A]], [[TMP1]] -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[TMP2]], i32 2 +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> undef, <4 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = fadd <4 x float> [[TMP1]], [[A]] +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[TMP2]], i64 3 ; CHECK-NEXT: [[V3:%.*]] = insertelement <4 x float> [[B:%.*]], float [[TMP3]], i32 3 ; CHECK-NEXT: ret <4 x float> [[V3]] ; @@ -431,6 +431,9 @@ ret <4 x float> %v3 } +; TODO: This is conservatively left to extract from the lower index value, +; but it is likely that extracting from index 3 is the better option. 
+ define <4 x float> @ins_bo_ext_ext_uses(<4 x float> %a, <4 x float> %b) { ; CHECK-LABEL: @ins_bo_ext_ext_uses( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> undef, <4 x i32> @@ -452,13 +455,13 @@ ; CHECK-LABEL: @PR34724( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> undef, <4 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[B:%.*]], <4 x float> undef, <4 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[B]], <4 x float> undef, <4 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[B]], <4 x float> undef, <4 x i32> ; CHECK-NEXT: [[TMP4:%.*]] = fadd <4 x float> [[A]], [[TMP1]] ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[TMP4]], i32 2 ; CHECK-NEXT: [[TMP6:%.*]] = fadd <4 x float> [[B]], [[TMP2]] ; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x float> [[TMP6]], i32 0 -; CHECK-NEXT: [[TMP8:%.*]] = fadd <4 x float> [[B]], [[TMP3]] -; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x float> [[TMP8]], i32 2 +; CHECK-NEXT: [[TMP8:%.*]] = fadd <4 x float> [[TMP3]], [[B]] +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x float> [[TMP8]], i64 3 ; CHECK-NEXT: [[V1:%.*]] = insertelement <4 x float> undef, float [[TMP5]], i32 1 ; CHECK-NEXT: [[V2:%.*]] = insertelement <4 x float> [[V1]], float [[TMP7]], i32 2 ; CHECK-NEXT: [[V3:%.*]] = insertelement <4 x float> [[V2]], float [[TMP9]], i32 3 Index: llvm/test/Transforms/VectorCombine/X86/extract-cmp.ll =================================================================== --- llvm/test/Transforms/VectorCombine/X86/extract-cmp.ll +++ llvm/test/Transforms/VectorCombine/X86/extract-cmp.ll @@ -161,9 +161,9 @@ ; SSE-NEXT: ret <4 x i1> [[R]] ; ; AVX-LABEL: @ins_fcmp_ext_ext( -; AVX-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> undef, <4 x i32> -; AVX-NEXT: [[TMP2:%.*]] = fcmp ugt <4 x float> [[TMP1]], [[A]] -; AVX-NEXT: [[TMP3:%.*]] = extractelement <4 x i1> [[TMP2]], i64 1 +; AVX-NEXT: [[TMP1:%.*]] 
= shufflevector <4 x float> [[A:%.*]], <4 x float> undef, <4 x i32> +; AVX-NEXT: [[TMP2:%.*]] = fcmp ugt <4 x float> [[A]], [[TMP1]] +; AVX-NEXT: [[TMP3:%.*]] = extractelement <4 x i1> [[TMP2]], i32 2 ; AVX-NEXT: [[R:%.*]] = insertelement <4 x i1> [[B:%.*]], i1 [[TMP3]], i32 2 ; AVX-NEXT: ret <4 x i1> [[R]] ; @@ -176,9 +176,9 @@ define <4 x i1> @ins_icmp_ext_ext(<4 x i32> %a, <4 x i1> %b) { ; CHECK-LABEL: @ins_icmp_ext_ext( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> undef, <4 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = icmp ule <4 x i32> [[A]], [[TMP1]] -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i1> [[TMP2]], i32 2 +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> undef, <4 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = icmp ule <4 x i32> [[TMP1]], [[A]] +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i1> [[TMP2]], i64 3 ; CHECK-NEXT: [[R:%.*]] = insertelement <4 x i1> [[B:%.*]], i1 [[TMP3]], i32 3 ; CHECK-NEXT: ret <4 x i1> [[R]] ;