Index: llvm/trunk/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
===================================================================
--- llvm/trunk/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
+++ llvm/trunk/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
@@ -593,6 +593,71 @@
   return true;
 }
 
+// Turn a chain of inserts that splats a value into a canonical insert + shuffle
+// splat. That is:
+// insertelt(insertelt(insertelt(insertelt X, %k, 0), %k, 1), %k, 2) ... ->
+// shufflevector(insertelt(undef, %k, 0), undef, zero)
+//
+// The fold only fires when InsElt is not feeding a single insertelement user,
+// every insert in the chain uses a constant in-range index and the same scalar,
+// no intermediate insert has another user, and every lane is written. Because
+// every lane is overwritten, the original base vector X is irrelevant and is
+// replaced with undef. Returns the new (not yet inserted) shufflevector, or
+// nullptr if the pattern does not match.
+static Instruction *foldInsSequenceIntoBroadcast(InsertElementInst &InsElt) {
+  // We are interested in the last insert in a chain. So, if this insert
+  // has a single user, and that user is an insert, bail.
+  if (InsElt.hasOneUse() && isa<InsertElementInst>(InsElt.user_back()))
+    return nullptr;
+
+  VectorType *VT = cast<VectorType>(InsElt.getType());
+  int NumElements = VT->getNumElements();
+
+  // Do not try to do this for a one-element vector, since that's a nop,
+  // and will cause an inf-loop.
+  if (NumElements == 1)
+    return nullptr;
+
+  Value *SplatVal = InsElt.getOperand(1);
+  InsertElementInst *CurrIE = &InsElt;
+  SmallVector<bool, 16> ElementPresent(NumElements, false);
+
+  // Walk the chain backwards, keeping track of which indices we inserted into,
+  // until we hit something that isn't an insert of the splatted value.
+  while (CurrIE) {
+    ConstantInt *Idx = dyn_cast<ConstantInt>(CurrIE->getOperand(2));
+    if (!Idx || CurrIE->getOperand(1) != SplatVal)
+      return nullptr;
+
+    // Check none of the intermediate steps have any additional uses.
+    if ((CurrIE != &InsElt) && !CurrIE->hasOneUse())
+      return nullptr;
+
+    // A constant index past the end of the vector is valid IR, but it must
+    // not be used to index ElementPresent (uge also rejects indices that do
+    // not fit in 64 bits, which getZExtValue would assert on).
+    if (Idx->uge(NumElements))
+      return nullptr;
+
+    ElementPresent[Idx->getZExtValue()] = true;
+    CurrIE = dyn_cast<InsertElementInst>(CurrIE->getOperand(0));
+  }
+
+  // Make sure we've seen an insert into every element.
+  if (llvm::any_of(ElementPresent, [](bool Present) { return !Present; }))
+    return nullptr;
+
+  // All right, create the insert + shuffle.
+  Instruction *InsertFirst = InsertElementInst::Create(
+      UndefValue::get(VT), SplatVal,
+      ConstantInt::get(Type::getInt32Ty(InsElt.getContext()), 0), "", &InsElt);
+
+  Constant *ZeroMask = ConstantAggregateZero::get(
+      VectorType::get(Type::getInt32Ty(InsElt.getContext()), NumElements));
+
+  return new ShuffleVectorInst(InsertFirst, UndefValue::get(VT), ZeroMask);
+}
+
 /// insertelt (shufflevector X, CVec, Mask|insertelt X, C1, CIndex1), C, CIndex
 /// --> shufflevector X, CVec', Mask'
 static Instruction *foldConstantInsEltIntoShuffle(InsertElementInst &InsElt) {
@@ -754,6 +819,11 @@
   if (Instruction *Shuf = foldConstantInsEltIntoShuffle(IE))
     return Shuf;
 
+  // Turn a sequence of inserts that broadcasts a scalar into a single
+  // insert + shufflevector.
+  if (Instruction *Broadcast = foldInsSequenceIntoBroadcast(IE))
+    return Broadcast;
+
   return nullptr;
 }
 
Index: llvm/trunk/test/Transforms/BBVectorize/X86/loop1.ll
===================================================================
--- llvm/trunk/test/Transforms/BBVectorize/X86/loop1.ll
+++ llvm/trunk/test/Transforms/BBVectorize/X86/loop1.ll
@@ -38,7 +38,7 @@
 ; CHECK-NEXT: insertelement
 ; CHECK-NEXT: fadd <2 x double>
 ; CHECK-NEXT: insertelement
-; CHECK-NEXT: insertelement
+; CHECK-NEXT: shufflevector
 ; CHECK-NEXT: fadd <2 x double>
 ; CHECK-NEXT: insertelement
 ; CHECK-NEXT: fmul <2 x double>
Index: llvm/trunk/test/Transforms/BBVectorize/loop1.ll
===================================================================
--- llvm/trunk/test/Transforms/BBVectorize/loop1.ll
+++ llvm/trunk/test/Transforms/BBVectorize/loop1.ll
@@ -46,10 +46,10 @@
 ; CHECK: %add4.v.i1.1 = insertelement <2 x double> undef, double %1, i32 0
 ; CHECK: %add4.v.i1.2 = insertelement <2 x double> %add4.v.i1.1, double %0, i32 1
 ; CHECK: %add4 = fadd <2 x double> %add4.v.i1.2, %add4.v.i1.2
-; CHECK: %add5.v.i1.1 = insertelement <2 x double> undef, double %0, i32 0
-; CHECK: %add5.v.i1.2 = insertelement <2 x double> %add5.v.i1.1, double %0, i32 1
+; CHECK: 
%2 = insertelement <2 x double> undef, double %0, i32 0 +; CHECK: %add5.v.i1.2 = shufflevector <2 x double> %2, <2 x double> undef, <2 x i32> zeroinitializer ; CHECK: %add5 = fadd <2 x double> %add4, %add5.v.i1.2 -; CHECK: %mul6.v.i0.2 = insertelement <2 x double> %add5.v.i1.1, double %mul8, i32 1 +; CHECK: %mul6.v.i0.2 = insertelement <2 x double> %2, double %mul8, i32 1 ; CHECK: %mul6 = fmul <2 x double> %mul6.v.i0.2, %add5 ; CHECK: %mul6.v.r1 = extractelement <2 x double> %mul6, i32 0 ; CHECK: %mul6.v.r2 = extractelement <2 x double> %mul6, i32 1 Index: llvm/trunk/test/Transforms/BBVectorize/simple-int.ll =================================================================== --- llvm/trunk/test/Transforms/BBVectorize/simple-int.ll +++ llvm/trunk/test/Transforms/BBVectorize/simple-int.ll @@ -177,7 +177,7 @@ ; CHECK: %X1.v.i0.1 = insertelement <2 x double> undef, double %A1, i32 0 ; CHECK: %X1.v.i0.2 = insertelement <2 x double> %X1.v.i0.1, double %A2, i32 1 ; CHECK: %X1 = fsub <2 x double> %X1.v.i0.2, %X1.v.i1.2 -; CHECK: %Y1.v.i1.2 = insertelement <2 x double> %X1.v.i0.1, double %A1, i32 1 +; CHECK: %Y1.v.i1.2 = shufflevector <2 x double> %X1.v.i0.1, <2 x double> undef, <2 x i32> zeroinitializer ; CHECK: %Y1 = call <2 x double> @llvm.copysign.v2f64(<2 x double> %X1, <2 x double> %Y1.v.i1.2) ; CHECK: %Z1 = fadd <2 x double> %Y1, %X1.v.i1.2 ; CHECK: %Z1.v.r1 = extractelement <2 x double> %Z1, i32 0 Index: llvm/trunk/test/Transforms/InstCombine/broadcast.ll =================================================================== --- llvm/trunk/test/Transforms/InstCombine/broadcast.ll +++ llvm/trunk/test/Transforms/InstCombine/broadcast.ll @@ -0,0 +1,109 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -instcombine -S < %s | FileCheck %s + +; CHECK-LABEL: good1 +; CHECK: %[[INS:.*]] = insertelement <4 x float> undef, float %arg, i32 0 +; CHECK-NEXT: %[[BCAST:.*]] = shufflevector <4 x float> %[[INS]], <4 x float> undef, <4 x i32> 
zeroinitializer +; CHECK-NEXT: ret <4 x float> %[[BCAST]] +define <4 x float> @good1(float %arg) { + %tmp = insertelement <4 x float> undef, float %arg, i32 0 + %tmp4 = insertelement <4 x float> %tmp, float %arg, i32 1 + %tmp5 = insertelement <4 x float> %tmp4, float %arg, i32 2 + %tmp6 = insertelement <4 x float> %tmp5, float %arg, i32 3 + ret <4 x float> %tmp6 +} + +; CHECK-LABEL: good2 +; CHECK: %[[INS:.*]] = insertelement <4 x float> undef, float %arg, i32 0 +; CHECK-NEXT: %[[BCAST:.*]] = shufflevector <4 x float> %[[INS]], <4 x float> undef, <4 x i32> zeroinitializer +; CHECK-NEXT: ret <4 x float> %[[BCAST]] +define <4 x float> @good2(float %arg) { + %tmp = insertelement <4 x float> undef, float %arg, i32 1 + %tmp4 = insertelement <4 x float> %tmp, float %arg, i32 2 + %tmp5 = insertelement <4 x float> %tmp4, float %arg, i32 0 + %tmp6 = insertelement <4 x float> %tmp5, float %arg, i32 3 + ret <4 x float> %tmp6 +} + +; CHECK-LABEL: good3 +; CHECK: %[[INS:.*]] = insertelement <4 x float> undef, float %arg, i32 0 +; CHECK-NEXT: %[[BCAST:.*]] = shufflevector <4 x float> %[[INS]], <4 x float> undef, <4 x i32> zeroinitializer +; CHECK-NEXT: ret <4 x float> %[[BCAST]] +define <4 x float> @good3(float %arg) { + %tmp = insertelement <4 x float> zeroinitializer, float %arg, i32 0 + %tmp4 = insertelement <4 x float> %tmp, float %arg, i32 1 + %tmp5 = insertelement <4 x float> %tmp4, float %arg, i32 2 + %tmp6 = insertelement <4 x float> %tmp5, float %arg, i32 3 + ret <4 x float> %tmp6 +} + +; CHECK-LABEL: good4 +; CHECK: %[[INS:.*]] = insertelement <4 x float> undef, float %arg, i32 0 +; CHECK-NEXT: %[[ADD:.*]] = fadd <4 x float> %[[INS]], %[[INS]] +; CHECK-NEXT: %[[BCAST:.*]] = shufflevector <4 x float> %[[ADD]], <4 x float> undef, <4 x i32> zeroinitializer +; CHECK-NEXT: ret <4 x float> %[[BCAST]] +define <4 x float> @good4(float %arg) { + %tmp = insertelement <4 x float> zeroinitializer, float %arg, i32 0 + %tmp4 = insertelement <4 x float> %tmp, float %arg, i32 1 + 
%tmp5 = insertelement <4 x float> %tmp4, float %arg, i32 2 + %tmp6 = insertelement <4 x float> %tmp5, float %arg, i32 3 + %tmp7 = fadd <4 x float> %tmp6, %tmp6 + ret <4 x float> %tmp7 +} + +; CHECK-LABEL: bad1 +; CHECK-NOT: shufflevector +define <4 x float> @bad1(float %arg) { + %tmp = insertelement <4 x float> undef, float %arg, i32 1 + %tmp4 = insertelement <4 x float> %tmp, float %arg, i32 1 + %tmp5 = insertelement <4 x float> %tmp4, float %arg, i32 2 + %tmp6 = insertelement <4 x float> %tmp5, float %arg, i32 3 + ret <4 x float> %tmp6 +} + +; CHECK-LABEL: bad2 +; CHECK-NOT: shufflevector +define <4 x float> @bad2(float %arg) { + %tmp = insertelement <4 x float> undef, float %arg, i32 0 + %tmp5 = insertelement <4 x float> %tmp, float %arg, i32 2 + %tmp6 = insertelement <4 x float> %tmp5, float %arg, i32 3 + ret <4 x float> %tmp6 +} + +; CHECK-LABEL: bad3 +; CHECK-NOT: shufflevector +define <4 x float> @bad3(float %arg, float %arg2) { + %tmp = insertelement <4 x float> undef, float %arg, i32 0 + %tmp4 = insertelement <4 x float> %tmp, float %arg2, i32 1 + %tmp5 = insertelement <4 x float> %tmp4, float %arg, i32 2 + %tmp6 = insertelement <4 x float> %tmp5, float %arg, i32 3 + ret <4 x float> %tmp6 +} + +; CHECK-LABEL: bad4 +; CHECK-NOT: shufflevector +define <1 x float> @bad4(float %arg) { + %tmp = insertelement <1 x float> undef, float %arg, i32 0 + ret <1 x float> %tmp +} + +; CHECK-LABEL: bad5 +; CHECK-NOT: shufflevector +define <4 x float> @bad5(float %arg) { + %tmp = insertelement <4 x float> undef, float %arg, i32 0 + %tmp4 = insertelement <4 x float> %tmp, float %arg, i32 1 + %tmp5 = insertelement <4 x float> %tmp4, float %arg, i32 2 + %tmp6 = insertelement <4 x float> %tmp5, float %arg, i32 3 + %tmp7 = fadd <4 x float> %tmp6, %tmp4 + ret <4 x float> %tmp7 +} + +; CHECK-LABEL: bad6 +; CHECK-NOT: shufflevector +define <4 x float> @bad6(float %arg, i32 %k) { + %tmp = insertelement <4 x float> undef, float %arg, i32 0 + %tmp4 = insertelement <4 x float> %tmp, 
float %arg, i32 1 + %tmp5 = insertelement <4 x float> %tmp4, float %arg, i32 %k + %tmp6 = insertelement <4 x float> %tmp5, float %arg, i32 3 + ret <4 x float> %tmp6 +} Index: llvm/trunk/test/Transforms/SLPVectorizer/X86/operandorder.ll =================================================================== --- llvm/trunk/test/Transforms/SLPVectorizer/X86/operandorder.ll +++ llvm/trunk/test/Transforms/SLPVectorizer/X86/operandorder.ll @@ -26,7 +26,7 @@ ; CHECK-LABEL: shuffle_preserve_broadcast ; CHECK: %[[BCAST:[a-z0-9]+]] = insertelement <2 x double> undef, double %v0_1 -; CHECK: = insertelement <2 x double> %[[BCAST]], double %v0_1 +; CHECK: = shufflevector <2 x double> %[[BCAST]], <2 x double> undef, <2 x i32> zeroinitializer define void @shuffle_preserve_broadcast(double * noalias %from, double * noalias %to, double %v1, double %v2) { @@ -51,7 +51,7 @@ ; CHECK-LABEL: shuffle_preserve_broadcast2 ; CHECK: %[[BCAST:[a-z0-9]+]] = insertelement <2 x double> undef, double %v0_1 -; CHECK: = insertelement <2 x double> %[[BCAST]], double %v0_1 +; CHECK: = shufflevector <2 x double> %[[BCAST]], <2 x double> undef, <2 x i32> zeroinitializer define void @shuffle_preserve_broadcast2(double * noalias %from, double * noalias %to, double %v1, double %v2) { @@ -76,7 +76,7 @@ ; CHECK-LABEL: shuffle_preserve_broadcast3 ; CHECK: %[[BCAST:[a-z0-9]+]] = insertelement <2 x double> undef, double %v0_1 -; CHECK: = insertelement <2 x double> %[[BCAST]], double %v0_1 +; CHECK: = shufflevector <2 x double> %[[BCAST]], <2 x double> undef, <2 x i32> zeroinitializer define void @shuffle_preserve_broadcast3(double * noalias %from, double * noalias %to, double %v1, double %v2) { @@ -102,7 +102,7 @@ ; CHECK-LABEL: shuffle_preserve_broadcast4 ; CHECK: %[[BCAST:[a-z0-9]+]] = insertelement <2 x double> undef, double %v0_1 -; CHECK: = insertelement <2 x double> %[[BCAST]], double %v0_1 +; CHECK: = shufflevector <2 x double> %[[BCAST]], <2 x double> undef, <2 x i32> zeroinitializer define void 
@shuffle_preserve_broadcast4(double * noalias %from, double * noalias %to, double %v1, double %v2) { @@ -127,7 +127,7 @@ ; CHECK-LABEL: shuffle_preserve_broadcast5 ; CHECK: %[[BCAST:[a-z0-9]+]] = insertelement <2 x double> undef, double %v0_1 -; CHECK: = insertelement <2 x double> %[[BCAST]], double %v0_1 +; CHECK: = shufflevector <2 x double> %[[BCAST]], <2 x double> undef, <2 x i32> zeroinitializer define void @shuffle_preserve_broadcast5(double * noalias %from, double * noalias %to, double %v1, double %v2) { @@ -153,7 +153,7 @@ ; CHECK-LABEL: shuffle_preserve_broadcast6 ; CHECK: %[[BCAST:[a-z0-9]+]] = insertelement <2 x double> undef, double %v0_1 -; CHECK: = insertelement <2 x double> %[[BCAST]], double %v0_1 +; CHECK: = shufflevector <2 x double> %[[BCAST]], <2 x double> undef, <2 x i32> zeroinitializer define void @shuffle_preserve_broadcast6(double * noalias %from, double * noalias %to, double %v1, double %v2) {