Index: lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp =================================================================== --- lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp +++ lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp @@ -1048,6 +1048,8 @@ if (TmpV) { I->setOperand(1, TmpV); MadeChange = true; } bool NewUndefElts = false; + unsigned LHSIdx = -1u; + unsigned RHSIdx = -1u; for (unsigned i = 0; i < VWidth; i++) { unsigned MaskVal = Shuffle->getMaskValue(i); if (MaskVal == -1u) { @@ -1059,15 +1061,50 @@ if (UndefElts4[MaskVal]) { NewUndefElts = true; UndefElts.setBit(i); - } + } else + LHSIdx = LHSIdx == -1u ? MaskVal : LHSVWidth; } else { if (UndefElts3[MaskVal - LHSVWidth]) { NewUndefElts = true; UndefElts.setBit(i); - } + } else + RHSIdx = RHSIdx == -1u ? MaskVal - LHSVWidth : LHSVWidth; } } + // Try to transform shuffle with constant vector and single element from + // this constant vector to single insertelement instruction. + // shufflevector V, C, <v1, v2, .., ci, .., vm> -> + // insertelement V, C[ci], ci-n + if (LHSVWidth == Shuffle->getType()->getNumElements()) { + Value *Op = nullptr; + Constant *Value = nullptr; + unsigned Idx = -1u; + + // Find constant vector with the single element in shuffle (LHS or RHS). + if (LHSIdx < LHSVWidth) { + if (auto *CV = dyn_cast(Shuffle->getOperand(0))) { + Op = Shuffle->getOperand(1); + Value = CV->getOperand(LHSIdx); + Idx = LHSIdx; + } + } + if (RHSIdx < LHSVWidth) { + if (auto *CV = dyn_cast(Shuffle->getOperand(1))) { + Op = Shuffle->getOperand(0); + Value = CV->getOperand(RHSIdx); + Idx = RHSIdx; + } + } + // Found constant vector with single element - convert to insertelement. + if (Op && Value) { + Instruction *New = InsertElementInst::Create( + Op, Value, ConstantInt::get(Type::getInt32Ty(I->getContext()), Idx), + Shuffle->getName()); + InsertNewInstWith(New, *Shuffle); + return New; + } + } if (NewUndefElts) { // Add additional discovered undefs. 
SmallVector Elts; Index: lib/Transforms/InstCombine/InstCombineVectorOps.cpp =================================================================== --- lib/Transforms/InstCombine/InstCombineVectorOps.cpp +++ lib/Transforms/InstCombine/InstCombineVectorOps.cpp @@ -585,58 +585,104 @@ return true; } -/// insertelt (shufflevector X, CVec, Mask), C, CIndex --> -/// shufflevector X, CVec', Mask' +/// insertelt (shufflevector X, CVec, Mask|insertelt X, C1, CIndex1), C, CIndex +/// --> shufflevector X, CVec', Mask' static Instruction *foldConstantInsEltIntoShuffle(InsertElementInst &InsElt) { - // Bail out if the shuffle has more than one use. In that case, we'd be + auto *Inst = dyn_cast(InsElt.getOperand(0)); + // Bail out if the parent has more than one use. In that case, we'd be // replacing the insertelt with a shuffle, and that's not a clear win. - auto *Shuf = dyn_cast(InsElt.getOperand(0)); - if (!Shuf || !Shuf->hasOneUse()) + if (!Inst || !Inst->hasOneUse()) return nullptr; + Value *X = nullptr; + Constant *NewShufVec = nullptr; + Constant *NewMask = nullptr; + if (auto *Shuf = dyn_cast(InsElt.getOperand(0))) { + // The shuffle must have a constant vector operand. The insertelt must have + // a constant scalar being inserted at a constant position in the vector. + Constant *ShufConstVec, *InsEltScalar; + uint64_t InsEltIndex; + if (!match(Shuf->getOperand(1), m_Constant(ShufConstVec)) || + !match(InsElt.getOperand(1), m_Constant(InsEltScalar)) || + !match(InsElt.getOperand(2), m_ConstantInt(InsEltIndex))) + return nullptr; - // The shuffle must have a constant vector operand. The insertelt must have a - // constant scalar being inserted at a constant position in the vector. 
- Constant *ShufConstVec, *InsEltScalar; - uint64_t InsEltIndex; - if (!match(Shuf->getOperand(1), m_Constant(ShufConstVec)) || - !match(InsElt.getOperand(1), m_Constant(InsEltScalar)) || - !match(InsElt.getOperand(2), m_ConstantInt(InsEltIndex))) - return nullptr; + // Adding an element to an arbitrary shuffle could be expensive, but a + // shuffle that selects elements from vectors without crossing lanes is + // assumed cheap. + // If we're just adding a constant into that shuffle, it will still be + // cheap. + if (!isShuffleEquivalentToSelect(*Shuf)) + return nullptr; - // Adding an element to an arbitrary shuffle could be expensive, but a shuffle - // that selects elements from vectors without crossing lanes is assumed cheap. - // If we're just adding a constant into that shuffle, it will still be cheap. - if (!isShuffleEquivalentToSelect(*Shuf)) - return nullptr; + // From the above 'select' check, we know that the mask has the same number + // of elements as the vector input operands. We also know that each constant + // input element is used in its lane and can not be used more than once by + // the shuffle. Therefore, replace the constant in the shuffle's constant + // vector with the insertelt constant. Replace the constant in the shuffle's + // mask vector with the insertelt index plus the length of the vector + // (because the constant vector operand of a shuffle is always the 2nd + // operand). + Constant *Mask = Shuf->getMask(); + unsigned NumElts = Mask->getType()->getVectorNumElements(); + SmallVector NewShufElts(NumElts); + SmallVector NewMaskElts(NumElts); + for (unsigned I = 0; I != NumElts; ++I) { + if (I == InsEltIndex) { + NewShufElts[I] = InsEltScalar; + Type *Int32Ty = Type::getInt32Ty(Shuf->getContext()); + NewMaskElts[I] = ConstantInt::get(Int32Ty, InsEltIndex + NumElts); + } else { + // Copy over the existing values. 
+ NewShufElts[I] = ShufConstVec->getAggregateElement(I); + NewMaskElts[I] = Mask->getAggregateElement(I); + } + } - // From the above 'select' check, we know that the mask has the same number of - // elements as the vector input operands. We also know that each constant - // input element is used in its lane and can not be used more than once by the - // shuffle. Therefore, replace the constant in the shuffle's constant vector - // with the insertelt constant. Replace the constant in the shuffle's mask - // vector with the insertelt index plus the length of the vector (because the - // constant vector operand of a shuffle is always the 2nd operand). - Constant *Mask = Shuf->getMask(); - unsigned NumElts = Mask->getType()->getVectorNumElements(); - SmallVector NewShufElts(NumElts); - SmallVector NewMaskElts(NumElts); - for (unsigned i = 0; i != NumElts; ++i) { - if (i == InsEltIndex) { - NewShufElts[i] = InsEltScalar; - Type *Int32Ty = Type::getInt32Ty(Shuf->getContext()); - NewMaskElts[i] = ConstantInt::get(Int32Ty, InsEltIndex + NumElts); - } else { - // Copy over the existing values. - NewShufElts[i] = ShufConstVec->getAggregateElement(i); - NewMaskElts[i] = Mask->getAggregateElement(i); + // Create new operands for a shuffle that includes the constant of the + // original insertelt. The old shuffle will be dead now. + X = Shuf->getOperand(0); + NewShufVec = ConstantVector::get(NewShufElts); + NewMask = ConstantVector::get(NewMaskElts); + } else if (auto *IEI = dyn_cast(Inst)) { + // Transform sequences of insertelements ops with constant data/indexes into + // a single shuffle op. 
+ unsigned NumElts = InsElt.getType()->getNumElements(); + + uint64_t InsertIdx[2]; + Constant *Val[2]; + if (!match(InsElt.getOperand(2), m_ConstantInt(InsertIdx[0])) || + !match(InsElt.getOperand(1), m_Constant(Val[0])) || + !match(IEI->getOperand(2), m_ConstantInt(InsertIdx[1])) || + !match(IEI->getOperand(1), m_Constant(Val[1]))) + return nullptr; + SmallVector Values(NumElts); + SmallVector Mask(NumElts); + auto ValI = std::begin(Val); + for (uint64_t I : InsertIdx) { + if (!Values[I]) { + assert(!Mask[I]); + Values[I] = *ValI; + Mask[I] = ConstantInt::get(Type::getInt32Ty(InsElt.getContext()), + NumElts + I); + } + ++ValI; } + for (unsigned I = 0; I < NumElts; ++I) { + if (!Values[I]) { + assert(!Mask[I]); + Values[I] = UndefValue::get(InsElt.getType()->getElementType()); + Mask[I] = ConstantInt::get(Type::getInt32Ty(InsElt.getContext()), I); + } + } + // Create new operands for a shuffle that includes the constant of the + // original insertelt. + X = IEI->getOperand(0); + NewShufVec = ConstantVector::get(Values); + NewMask = ConstantVector::get(Mask); } - - // Create new operands for a shuffle that includes the constant of the - // original insertelt. The old shuffle will be dead now. 
- Constant *NewShufVec = ConstantVector::get(NewShufElts); - Constant *NewMask = ConstantVector::get(NewMaskElts); - return new ShuffleVectorInst(Shuf->getOperand(0), NewShufVec, NewMask); + if (X && NewShufVec && NewMask) + return new ShuffleVectorInst(X, NewShufVec, NewMask); + return nullptr; } Instruction *InstCombiner::visitInsertElementInst(InsertElementInst &IE) { Index: test/Transforms/InstCombine/vec_demanded_elts.ll =================================================================== --- test/Transforms/InstCombine/vec_demanded_elts.ll +++ test/Transforms/InstCombine/vec_demanded_elts.ll @@ -215,8 +215,8 @@ define <4 x float> @test_select(float %f, float %g) { ; CHECK-LABEL: @test_select( ; CHECK-NEXT: [[A0:%.*]] = insertelement <4 x float> undef, float %f, i32 0 -; CHECK-NEXT: [[A3:%.*]] = insertelement <4 x float> [[A0]], float 3.000000e+00, i32 3 -; CHECK-NEXT: [[RET:%.*]] = select <4 x i1> , <4 x float> [[A3]], <4 x float> +; CHECK-NEXT: [[A31:%.*]] = insertelement <4 x float> [[A0]], float 3.000000e+00, i32 3 +; CHECK-NEXT: [[RET:%.*]] = select <4 x i1> , <4 x float> [[A31]], <4 x float> ; CHECK-NEXT: ret <4 x float> [[RET]] ; %a0 = insertelement <4 x float> undef, float %f, i32 0 Index: test/Transforms/InstCombine/vector_insertelt_shuffle.ll =================================================================== --- test/Transforms/InstCombine/vector_insertelt_shuffle.ll +++ test/Transforms/InstCombine/vector_insertelt_shuffle.ll @@ -7,10 +7,9 @@ ret<4 x float> %ins2 } -; FIXME: insertelements should fold to shuffle +; insertelements should fold to shuffle ; CHECK-LABEL: @foo -; CHECK-NEXT: insertelement <4 x float> %{{.+}}, float 1.000000e+00, i32 1 -; CHECK-NEXT: insertelement <4 x float> %{{.+}}, float 2.000000e+00, i32 2 +; CHECK-NEXT: shufflevector <4 x float> %{{.+}}, <4 x float> , <4 x i32> ; CHECK-NEXT: ret <4 x float> % define<4 x float> @bar(<4 x float> %x, float %a) { @@ -45,12 +44,11 @@ ret<4 x float> %ins6 } -; FIXME: insertelements should 
fold to shuffle +; insertelements should fold to shuffle ; CHECK-LABEL: @bazz ; CHECK-NEXT: insertelement <4 x float> %{{.+}}, float 1.000000e+00, i32 3 ; CHECK-NEXT: insertelement <4 x float> %{{.+}}, float 5.000000e+00, i32 % -; CHECK-NEXT: insertelement <4 x float> %{{.+}}, float 1.000000e+00, i32 1 -; CHECK-NEXT: insertelement <4 x float> %{{.+}}, float 2.000000e+00, i32 2 +; CHECK-NEXT: shufflevector <4 x float> %{{.+}}, <4 x float> , <4 x i32> ; CHECK-NEXT: insertelement <4 x float> %{{.+}}, float 7.000000e+00, i32 % ; CHECK-NEXT: ret <4 x float> % Index: test/Transforms/InstCombine/x86-insertps.ll =================================================================== --- test/Transforms/InstCombine/x86-insertps.ll +++ test/Transforms/InstCombine/x86-insertps.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt < %s -instcombine -S | FileCheck %s declare <4 x float> @llvm.x86.sse41.insertps(<4 x float>, <4 x float>, i8) nounwind readnone @@ -5,146 +6,161 @@ ; This should never happen, but make sure we don't crash handling a non-constant immediate byte. define <4 x float> @insertps_non_const_imm(<4 x float> %v1, <4 x float> %v2, i8 %c) { +; CHECK-LABEL: @insertps_non_const_imm( +; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %v1, <4 x float> %v2, i8 %c) +; CHECK-NEXT: ret <4 x float> [[RES]] +; %res = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %v1, <4 x float> %v2, i8 %c) ret <4 x float> %res -; CHECK-LABEL: @insertps_non_const_imm -; CHECK-NEXT: call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %v1, <4 x float> %v2, i8 %c) -; CHECK-NEXT: ret <4 x float> } ; If all zero mask bits are set, return a zero regardless of the other control bits. 
define <4 x float> @insertps_0x0f(<4 x float> %v1, <4 x float> %v2) { +; CHECK-LABEL: @insertps_0x0f( +; CHECK-NEXT: ret <4 x float> zeroinitializer +; %res = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %v1, <4 x float> %v2, i8 15) ret <4 x float> %res -; CHECK-LABEL: @insertps_0x0f -; CHECK-NEXT: ret <4 x float> zeroinitializer } define <4 x float> @insertps_0xff(<4 x float> %v1, <4 x float> %v2) { +; CHECK-LABEL: @insertps_0xff( +; CHECK-NEXT: ret <4 x float> zeroinitializer +; %res = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %v1, <4 x float> %v2, i8 255) ret <4 x float> %res -; CHECK-LABEL: @insertps_0xff -; CHECK-NEXT: ret <4 x float> zeroinitializer } ; If some zero mask bits are set that do not override the insertion, we do not change anything. define <4 x float> @insertps_0x0c(<4 x float> %v1, <4 x float> %v2) { +; CHECK-LABEL: @insertps_0x0c( +; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %v1, <4 x float> %v2, i8 12) +; CHECK-NEXT: ret <4 x float> [[RES]] +; %res = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %v1, <4 x float> %v2, i8 12) ret <4 x float> %res -; CHECK-LABEL: @insertps_0x0c -; CHECK-NEXT: call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %v1, <4 x float> %v2, i8 12) -; CHECK-NEXT: ret <4 x float> } ; ...unless both input vectors are the same operand. define <4 x float> @insertps_0x15_single_input(<4 x float> %v1) { +; CHECK-LABEL: @insertps_0x15_single_input( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> %v1, <4 x float> , <4 x i32> +; CHECK-NEXT: ret <4 x float> [[TMP1]] +; %res = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %v1, <4 x float> %v1, i8 21) ret <4 x float> %res -; CHECK-LABEL: @insertps_0x15_single_input -; CHECK-NEXT: shufflevector <4 x float> %v1, <4 x float> , <4 x i32> -; CHECK-NEXT: ret <4 x float> } ; The zero mask overrides the insertion lane. 
define <4 x float> @insertps_0x1a_single_input(<4 x float> %v1) { +; CHECK-LABEL: @insertps_0x1a_single_input( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> %v1, <4 x float> , <4 x i32> +; CHECK-NEXT: ret <4 x float> [[TMP1]] +; %res = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %v1, <4 x float> %v1, i8 26) ret <4 x float> %res -; CHECK-LABEL: @insertps_0x1a_single_input -; CHECK-NEXT: shufflevector <4 x float> %v1, <4 x float> , <4 x i32> -; CHECK-NEXT: ret <4 x float> } ; The zero mask overrides the insertion lane, so the second input vector is not used. define <4 x float> @insertps_0xc1(<4 x float> %v1, <4 x float> %v2) { +; CHECK-LABEL: @insertps_0xc1( +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x float> %v1, float 0.000000e+00, i32 0 +; CHECK-NEXT: ret <4 x float> [[TMP1]] +; %res = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %v1, <4 x float> %v2, i8 193) ret <4 x float> %res -; CHECK-LABEL: @insertps_0xc1 -; CHECK-NEXT: shufflevector <4 x float> %v1, <4 x float> , <4 x i32> -; CHECK-NEXT: ret <4 x float> } ; If no zero mask bits are set, convert to a shuffle. 
define <4 x float> @insertps_0x00(<4 x float> %v1, <4 x float> %v2) { +; CHECK-LABEL: @insertps_0x00( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> %v1, <4 x float> %v2, <4 x i32> +; CHECK-NEXT: ret <4 x float> [[TMP1]] +; %res = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %v1, <4 x float> %v2, i8 0) ret <4 x float> %res -; CHECK-LABEL: @insertps_0x00 -; CHECK-NEXT: shufflevector <4 x float> %v1, <4 x float> %v2, <4 x i32> -; CHECK-NEXT: ret <4 x float> } define <4 x float> @insertps_0x10(<4 x float> %v1, <4 x float> %v2) { +; CHECK-LABEL: @insertps_0x10( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> %v1, <4 x float> %v2, <4 x i32> +; CHECK-NEXT: ret <4 x float> [[TMP1]] +; %res = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %v1, <4 x float> %v2, i8 16) ret <4 x float> %res -; CHECK-LABEL: @insertps_0x10 -; CHECK-NEXT: shufflevector <4 x float> %v1, <4 x float> %v2, <4 x i32> -; CHECK-NEXT: ret <4 x float> } define <4 x float> @insertps_0x20(<4 x float> %v1, <4 x float> %v2) { +; CHECK-LABEL: @insertps_0x20( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> %v1, <4 x float> %v2, <4 x i32> +; CHECK-NEXT: ret <4 x float> [[TMP1]] +; %res = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %v1, <4 x float> %v2, i8 32) ret <4 x float> %res -; CHECK-LABEL: @insertps_0x20 -; CHECK-NEXT: shufflevector <4 x float> %v1, <4 x float> %v2, <4 x i32> -; CHECK-NEXT: ret <4 x float> } define <4 x float> @insertps_0x30(<4 x float> %v1, <4 x float> %v2) { +; CHECK-LABEL: @insertps_0x30( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> %v1, <4 x float> %v2, <4 x i32> +; CHECK-NEXT: ret <4 x float> [[TMP1]] +; %res = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %v1, <4 x float> %v2, i8 48) ret <4 x float> %res -; CHECK-LABEL: @insertps_0x30 -; CHECK-NEXT: shufflevector <4 x float> %v1, <4 x float> %v2, <4 x i32> -; CHECK-NEXT: ret <4 x float> } define <4 x float> @insertps_0xc0(<4 x float> %v1, <4 x float> %v2) { +; 
CHECK-LABEL: @insertps_0xc0( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> %v1, <4 x float> %v2, <4 x i32> +; CHECK-NEXT: ret <4 x float> [[TMP1]] +; %res = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %v1, <4 x float> %v2, i8 192) ret <4 x float> %res -; CHECK-LABEL: @insertps_0xc0 -; CHECK-NEXT: shufflevector <4 x float> %v1, <4 x float> %v2, <4 x i32> -; CHECK-NEXT: ret <4 x float> } define <4 x float> @insertps_0xd0(<4 x float> %v1, <4 x float> %v2) { +; CHECK-LABEL: @insertps_0xd0( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> %v1, <4 x float> %v2, <4 x i32> +; CHECK-NEXT: ret <4 x float> [[TMP1]] +; %res = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %v1, <4 x float> %v2, i8 208) ret <4 x float> %res -; CHECK-LABEL: @insertps_0xd0 -; CHECK-NEXT: shufflevector <4 x float> %v1, <4 x float> %v2, <4 x i32> -; CHECK-NEXT: ret <4 x float> } define <4 x float> @insertps_0xe0(<4 x float> %v1, <4 x float> %v2) { +; CHECK-LABEL: @insertps_0xe0( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> %v1, <4 x float> %v2, <4 x i32> +; CHECK-NEXT: ret <4 x float> [[TMP1]] +; %res = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %v1, <4 x float> %v2, i8 224) ret <4 x float> %res -; CHECK-LABEL: @insertps_0xe0 -; CHECK-NEXT: shufflevector <4 x float> %v1, <4 x float> %v2, <4 x i32> -; CHECK-NEXT: ret <4 x float> } define <4 x float> @insertps_0xf0(<4 x float> %v1, <4 x float> %v2) { +; CHECK-LABEL: @insertps_0xf0( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> %v1, <4 x float> %v2, <4 x i32> +; CHECK-NEXT: ret <4 x float> [[TMP1]] +; %res = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %v1, <4 x float> %v2, i8 240) ret <4 x float> %res -; CHECK-LABEL: @insertps_0xf0 -; CHECK-NEXT: shufflevector <4 x float> %v1, <4 x float> %v2, <4 x i32> -; CHECK-NEXT: ret <4 x float> }