Index: llvm/trunk/lib/Transforms/InstCombine/InstCombineVectorOps.cpp =================================================================== --- llvm/trunk/lib/Transforms/InstCombine/InstCombineVectorOps.cpp +++ llvm/trunk/lib/Transforms/InstCombine/InstCombineVectorOps.cpp @@ -704,10 +704,18 @@ CurrIE = NextIE; } - // Make sure we've seen an insert into every element. - if (llvm::any_of(ElementPresent, [](bool Present) { return !Present; })) + // If this is just a single insertelement (not a sequence), we are done. + if (FirstIE == &InsElt) return nullptr; + // If we are not inserting into an undef vector, make sure we've seen an + // insert into every element. + // TODO: If the base vector is not undef, it might be better to create a splat + // and then a select-shuffle (blend) with the base vector. + if (!isa<UndefValue>(FirstIE->getOperand(0))) + if (any_of(ElementPresent, [](bool Present) { return !Present; })) + return nullptr; + // Create the insert + shuffle. Type *Int32Ty = Type::getInt32Ty(InsElt.getContext()); UndefValue *UndefVec = UndefValue::get(VecTy); @@ -715,8 +723,13 @@ if (!cast<ConstantInt>(FirstIE->getOperand(2))->isZero()) FirstIE = InsertElementInst::Create(UndefVec, SplatVal, Zero, "", &InsElt); - Constant *ZeroMask = ConstantVector::getSplat(NumElements, Zero); - return new ShuffleVectorInst(FirstIE, UndefVec, ZeroMask); + // Splat from element 0, but replace absent elements with undef in the mask. 
+ SmallVector<Constant *, 16> Mask(NumElements, Zero); + for (unsigned i = 0; i != NumElements; ++i) + if (!ElementPresent[i]) + Mask[i] = UndefValue::get(Int32Ty); + + return new ShuffleVectorInst(FirstIE, UndefVec, ConstantVector::get(Mask)); } /// If we have an insertelement instruction feeding into another insertelement Index: llvm/trunk/test/Transforms/InstCombine/broadcast.ll =================================================================== --- llvm/trunk/test/Transforms/InstCombine/broadcast.ll +++ llvm/trunk/test/Transforms/InstCombine/broadcast.ll @@ -72,11 +72,12 @@ ret <4 x float> %res } -define <4 x float> @bad1(float %arg) { -; CHECK-LABEL: @bad1( -; CHECK-NEXT: [[T4:%.*]] = insertelement <4 x float> undef, float [[ARG:%.*]], i32 1 -; CHECK-NEXT: [[T5:%.*]] = insertelement <4 x float> [[T4]], float [[ARG]], i32 2 -; CHECK-NEXT: [[T6:%.*]] = insertelement <4 x float> [[T5]], float [[ARG]], i32 3 +; The insert is changed to allow the canonical shuffle-splat pattern from element 0. + +define <4 x float> @splat_undef1(float %arg) { +; CHECK-LABEL: @splat_undef1( +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x float> undef, float [[ARG:%.*]], i32 0 +; CHECK-NEXT: [[T6:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> undef, <4 x i32> <i32 undef, i32 0, i32 0, i32 0> ; CHECK-NEXT: ret <4 x float> [[T6]] ; %t = insertelement <4 x float> undef, float %arg, i32 1 @@ -86,11 +87,12 @@ ret <4 x float> %t6 } -define <4 x float> @bad2(float %arg) { -; CHECK-LABEL: @bad2( +; Re-uses the existing first insertelement. 
+ +define <4 x float> @splat_undef2(float %arg) { +; CHECK-LABEL: @splat_undef2( ; CHECK-NEXT: [[T:%.*]] = insertelement <4 x float> undef, float [[ARG:%.*]], i32 0 -; CHECK-NEXT: [[T5:%.*]] = insertelement <4 x float> [[T]], float [[ARG]], i32 2 -; CHECK-NEXT: [[T6:%.*]] = insertelement <4 x float> [[T5]], float [[ARG]], i32 3 +; CHECK-NEXT: [[T6:%.*]] = shufflevector <4 x float> [[T]], <4 x float> undef, <4 x i32> <i32 0, i32 undef, i32 0, i32 0> ; CHECK-NEXT: ret <4 x float> [[T6]] ; %t = insertelement <4 x float> undef, float %arg, i32 0 @@ -123,10 +125,13 @@ ret <1 x float> %t } -define <4 x float> @bad5(float %arg) { -; CHECK-LABEL: @bad5( +; Multiple undef elements are ok. +; TODO: Multiple uses triggers the transform at %t4, but we could form another splat from %t6 and simplify? + +define <4 x float> @splat_undef3(float %arg) { +; CHECK-LABEL: @splat_undef3( ; CHECK-NEXT: [[T:%.*]] = insertelement <4 x float> undef, float [[ARG:%.*]], i32 0 -; CHECK-NEXT: [[T4:%.*]] = insertelement <4 x float> [[T]], float [[ARG]], i32 1 +; CHECK-NEXT: [[T4:%.*]] = shufflevector <4 x float> [[T]], <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 undef, i32 undef> ; CHECK-NEXT: [[T5:%.*]] = insertelement <4 x float> [[T4]], float [[ARG]], i32 2 ; CHECK-NEXT: [[T6:%.*]] = insertelement <4 x float> [[T5]], float [[ARG]], i32 3 ; CHECK-NEXT: [[T7:%.*]] = fadd <4 x float> [[T6]], [[T4]]