Index: llvm/lib/Transforms/Vectorize/VectorCombine.cpp
===================================================================
--- llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -97,6 +97,7 @@
   void foldExtExtBinop(ExtractElementInst *Ext0, ExtractElementInst *Ext1,
                        Instruction &I);
   bool foldExtractExtract(Instruction &I);
+  bool foldInsExtFNeg(Instruction &I);
   bool foldBitcastShuf(Instruction &I);
   bool scalarizeBinopOrCmp(Instruction &I);
   bool foldExtractedCmps(Instruction &I);
@@ -533,6 +534,66 @@
   return true;
 }
 
+/// Try to replace an extract + scalar fneg + insert with a vector fneg +
+/// shuffle.
+bool VectorCombine::foldInsExtFNeg(Instruction &I) {
+  auto *VecTy = dyn_cast<FixedVectorType>(I.getType());
+  if (!VecTy)
+    return false;
+
+  // Match an insert (op (extract)) pattern.
+  Value *DestVec;
+  uint64_t Index;
+  Instruction *FNeg;
+  if (!match(&I, m_InsertElt(m_Value(DestVec), m_OneUse(m_Instruction(FNeg)),
+                             m_ConstantInt(Index))))
+    return false;
+
+  Value *SrcVec;
+  if (!match(FNeg, m_FNeg(m_ExtractElt(m_Value(SrcVec), m_SpecificInt(Index)))))
+    return false;
+
+  if (SrcVec->getType() != VecTy)
+    return false;
+
+  // We are inserting the negated element into the same lane that we extracted
+  // from. This is equivalent to a select-shuffle that chooses all but the
+  // negated element from the destination vector.
+  SmallVector<int> Mask;
+  for (unsigned i = 0, e = VecTy->getNumElements(); i != e; ++i) {
+    if (i == Index)
+      Mask.push_back(Index + e);
+    else
+      Mask.push_back(i);
+  }
+
+  Type *ScalarTy = VecTy->getScalarType();
+  InstructionCost OldCost =
+      TTI.getArithmeticInstrCost(Instruction::FNeg, ScalarTy) +
+      TTI.getVectorInstrCost(I, VecTy, Index);
+
+  // If the extract has one use, it will be eliminated, so count it in the
+  // original cost. If it has more than one use, ignore the cost because it
+  // will be the same before/after.
+  Instruction *Extract = cast<Instruction>(FNeg->getOperand(0));
+  if (Extract->hasOneUse())
+    OldCost += TTI.getVectorInstrCost(*Extract, VecTy, Index);
+
+  InstructionCost NewCost =
+      TTI.getArithmeticInstrCost(Instruction::FNeg, VecTy) +
+      TTI.getShuffleCost(TargetTransformInfo::SK_Select, VecTy, Mask);
+
+  if (NewCost > OldCost)
+    return false;
+
+  // insertelt DestVec, (fneg (extractelt SrcVec, Index)), Index -->
+  // shuffle DestVec, (fneg SrcVec), Mask
+  Value *VecFNeg = Builder.CreateFNegFMF(SrcVec, FNeg);
+  Value *Shuf = Builder.CreateShuffleVector(DestVec, VecFNeg, Mask);
+  replaceValue(I, *Shuf);
+  return true;
+}
+
 /// If this is a bitcast of a shuffle, try to bitcast the source vector to the
 /// destination type followed by shuffle. This can enable further transforms by
 /// moving bitcasts or shuffles together.
@@ -1571,6 +1632,7 @@
   if (!ScalarizationOnly) {
     MadeChange |= vectorizeLoadInsert(I);
     MadeChange |= foldExtractExtract(I);
+    MadeChange |= foldInsExtFNeg(I);
     MadeChange |= foldBitcastShuf(I);
     MadeChange |= foldExtractedCmps(I);
     MadeChange |= foldShuffleOfBinops(I);
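[Reviewer note - not part of the patch] To make the select-shuffle mask
construction concrete: a minimal standalone sketch, assuming a 4-element
vector with Index == 2 (matching the ext2_v4f32 test below). The helper name
is mine, not the patch's:

    #include "llvm/ADT/SmallVector.h"
    using namespace llvm;

    // Identity mask over DestVec (elements 0..NumElts-1), except the negated
    // lane, which selects from the second shuffle operand (elements
    // NumElts..2*NumElts-1).
    SmallVector<int> buildSelectMask(unsigned NumElts, unsigned Index) {
      SmallVector<int> Mask;
      for (unsigned i = 0; i != NumElts; ++i)
        Mask.push_back(i == Index ? (int)(Index + NumElts) : (int)i);
      return Mask;
    }

buildSelectMask(4, 2) == {0, 1, 6, 3}, i.e. the <i32 0, i32 1, i32 6, i32 3>
mask in the ext2_v4f32 CHECK line. Every lane chooses from one operand or the
other without moving across lanes, which is why SK_Select is the right shuffle
kind for the cost query.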
Index: llvm/test/Transforms/VectorCombine/X86/extract-fneg-insert.ll
===================================================================
--- llvm/test/Transforms/VectorCombine/X86/extract-fneg-insert.ll
+++ llvm/test/Transforms/VectorCombine/X86/extract-fneg-insert.ll
@@ -4,6 +4,8 @@
 
 declare void @use(float)
 
+; TODO: The insert is costed as free, so creating a shuffle appears to be a loss.
+
 define <4 x float> @ext0_v4f32(<4 x float> %x, <4 x float> %y) {
 ; CHECK-LABEL: @ext0_v4f32(
 ; CHECK-NEXT:    [[E:%.*]] = extractelement <4 x float> [[X:%.*]], i32 0
@@ -21,9 +23,8 @@
 
 define <4 x float> @ext2_v4f32(<4 x float> %x, <4 x float> %y) {
 ; CHECK-LABEL: @ext2_v4f32(
-; CHECK-NEXT:    [[E:%.*]] = extractelement <4 x float> [[X:%.*]], i32 2
-; CHECK-NEXT:    [[N:%.*]] = fneg float [[E]]
-; CHECK-NEXT:    [[R:%.*]] = insertelement <4 x float> [[Y:%.*]], float [[N]], i32 2
+; CHECK-NEXT:    [[TMP1:%.*]] = fneg <4 x float> [[X:%.*]]
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x float> [[Y:%.*]], <4 x float> [[TMP1]], <4 x i32> <i32 0, i32 1, i32 6, i32 3>
 ; CHECK-NEXT:    ret <4 x float> [[R]]
 ;
   %e = extractelement <4 x float> %x, i32 2
@@ -36,9 +37,8 @@
 
 define <2 x double> @ext1_v2f64(<2 x double> %x, <2 x double> %y) {
 ; CHECK-LABEL: @ext1_v2f64(
-; CHECK-NEXT:    [[E:%.*]] = extractelement <2 x double> [[X:%.*]], i32 1
-; CHECK-NEXT:    [[N:%.*]] = fneg nsz double [[E]]
-; CHECK-NEXT:    [[R:%.*]] = insertelement <2 x double> [[Y:%.*]], double [[N]], i32 1
+; CHECK-NEXT:    [[TMP1:%.*]] = fneg nsz <2 x double> [[X:%.*]]
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <2 x double> [[Y:%.*]], <2 x double> [[TMP1]], <2 x i32> <i32 0, i32 3>
 ; CHECK-NEXT:    ret <2 x double> [[R]]
 ;
   %e = extractelement <2 x double> %x, i32 1
@@ -47,12 +47,20 @@
   ret <2 x double> %r
 }
 
+; The vector fneg would cost twice as much as the scalar op with SSE,
+; so we don't transform there (the shuffle would also be more expensive).
+
 define <8 x float> @ext7_v8f32(<8 x float> %x, <8 x float> %y) {
-; CHECK-LABEL: @ext7_v8f32(
-; CHECK-NEXT:    [[E:%.*]] = extractelement <8 x float> [[X:%.*]], i32 7
-; CHECK-NEXT:    [[N:%.*]] = fneg float [[E]]
-; CHECK-NEXT:    [[R:%.*]] = insertelement <8 x float> [[Y:%.*]], float [[N]], i32 7
-; CHECK-NEXT:    ret <8 x float> [[R]]
+; SSE-LABEL: @ext7_v8f32(
+; SSE-NEXT:    [[E:%.*]] = extractelement <8 x float> [[X:%.*]], i32 7
+; SSE-NEXT:    [[N:%.*]] = fneg float [[E]]
+; SSE-NEXT:    [[R:%.*]] = insertelement <8 x float> [[Y:%.*]], float [[N]], i32 7
+; SSE-NEXT:    ret <8 x float> [[R]]
+;
+; AVX-LABEL: @ext7_v8f32(
+; AVX-NEXT:    [[TMP1:%.*]] = fneg <8 x float> [[X:%.*]]
+; AVX-NEXT:    [[R:%.*]] = shufflevector <8 x float> [[Y:%.*]], <8 x float> [[TMP1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 15>
+; AVX-NEXT:    ret <8 x float> [[R]]
 ;
   %e = extractelement <8 x float> %x, i32 7
   %n = fneg float %e
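[Reviewer note - not part of the patch] Rough arithmetic behind the SSE/AVX
split above. The cost values are illustrative assumptions of mine, not the
actual X86 cost tables:

    // Old sequence for <8 x float>: extract + scalar fneg + insert
    // (the extract has one use here, so it is counted).
    InstructionCost OldCost = 1 /*extract*/ + 1 /*fneg*/ + 1 /*insert*/; // = 3
    // New sequence on SSE: the 256-bit fneg and select-shuffle each split
    // into two 128-bit ops.
    InstructionCost NewCostSSE = 2 /*fneg*/ + 2 /*shuffle*/;             // = 4
    // New sequence on AVX: a single 256-bit fneg and a single blend.
    InstructionCost NewCostAVX = 1 /*fneg*/ + 1 /*shuffle*/;             // = 2

With numbers like these, NewCostSSE > OldCost, so foldInsExtFNeg bails out on
the SSE run; NewCostAVX <= OldCost, so the AVX run folds to the vector fneg +
shufflevector shown in the AVX CHECK lines.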
@@ -60,13 +68,22 @@
   ret <8 x float> %r
 }
 
+; Same as above with an extra use of the extracted element.
+
 define <8 x float> @ext7_v8f32_use1(<8 x float> %x, <8 x float> %y) {
-; CHECK-LABEL: @ext7_v8f32_use1(
-; CHECK-NEXT:    [[E:%.*]] = extractelement <8 x float> [[X:%.*]], i32 5
-; CHECK-NEXT:    call void @use(float [[E]])
-; CHECK-NEXT:    [[N:%.*]] = fneg float [[E]]
-; CHECK-NEXT:    [[R:%.*]] = insertelement <8 x float> [[Y:%.*]], float [[N]], i32 5
-; CHECK-NEXT:    ret <8 x float> [[R]]
+; SSE-LABEL: @ext7_v8f32_use1(
+; SSE-NEXT:    [[E:%.*]] = extractelement <8 x float> [[X:%.*]], i32 5
+; SSE-NEXT:    call void @use(float [[E]])
+; SSE-NEXT:    [[N:%.*]] = fneg float [[E]]
+; SSE-NEXT:    [[R:%.*]] = insertelement <8 x float> [[Y:%.*]], float [[N]], i32 5
+; SSE-NEXT:    ret <8 x float> [[R]]
+;
+; AVX-LABEL: @ext7_v8f32_use1(
+; AVX-NEXT:    [[E:%.*]] = extractelement <8 x float> [[X:%.*]], i32 5
+; AVX-NEXT:    call void @use(float [[E]])
+; AVX-NEXT:    [[TMP1:%.*]] = fneg <8 x float> [[X]]
+; AVX-NEXT:    [[R:%.*]] = shufflevector <8 x float> [[Y:%.*]], <8 x float> [[TMP1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 13, i32 6, i32 7>
+; AVX-NEXT:    ret <8 x float> [[R]]
 ;
   %e = extractelement <8 x float> %x, i32 5
   call void @use(float %e)
@@ -75,6 +92,8 @@
   ret <8 x float> %r
 }
 
+; Negative test - the transform is likely not profitable if the fneg has another use.
+
 define <8 x float> @ext7_v8f32_use2(<8 x float> %x, <8 x float> %y) {
 ; CHECK-LABEL: @ext7_v8f32_use2(
 ; CHECK-NEXT:    [[E:%.*]] = extractelement <8 x float> [[X:%.*]], i32 3
@@ -90,6 +109,8 @@
   ret <8 x float> %r
 }
 
+; Negative test - can't convert variable index to a shuffle.
+
 define <2 x double> @ext_index_var_v2f64(<2 x double> %x, <2 x double> %y, i32 %index) {
 ; CHECK-LABEL: @ext_index_var_v2f64(
 ; CHECK-NEXT:    [[E:%.*]] = extractelement <2 x double> [[X:%.*]], i32 [[INDEX:%.*]]
@@ -103,6 +124,9 @@
   ret <2 x double> %r
 }
 
+; Negative test - require same extract/insert index for simple shuffle.
+; TODO: We could handle this by adjusting the cost calculation.
+
 define <2 x double> @ext1_v2f64_ins0(<2 x double> %x, <2 x double> %y) {
 ; CHECK-LABEL: @ext1_v2f64_ins0(
 ; CHECK-NEXT:    [[E:%.*]] = extractelement <2 x double> [[X:%.*]], i32 1
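[Reviewer note - not part of the patch] One edge case the fold does not guard
against: m_ConstantInt(Index) only proves the index is a constant, not that it
is in range for the vector type. For an out-of-range constant lane the
insert/extract pair already yields poison, so replacing it is still a legal
refinement, but it seems cleaner to skip the cost queries and mask
construction entirely. A possible guard (my sketch, not in the patch), placed
right after the SrcVec type check:

    // Bail out on a bogus out-of-range insert/extract index; the original
    // sequence is poison anyway, and the lane is meaningless to the cost
    // model and the select-shuffle mask.
    if (Index >= VecTy->getNumElements())
      return false;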