Index: llvm/trunk/include/llvm/Transforms/Vectorize/SLPVectorizer.h =================================================================== --- llvm/trunk/include/llvm/Transforms/Vectorize/SLPVectorizer.h +++ llvm/trunk/include/llvm/Transforms/Vectorize/SLPVectorizer.h @@ -95,9 +95,11 @@ bool tryToVectorizePair(Value *A, Value *B, slpvectorizer::BoUpSLP &R); /// \brief Try to vectorize a list of operands. + /// \param UserCost Cost of the user operations of \p VL if they may affect + /// the cost of the vectorization. /// \returns true if a value was vectorized. bool tryToVectorizeList(ArrayRef VL, slpvectorizer::BoUpSLP &R, - bool AllowReorder = false); + int UserCost = 0, bool AllowReorder = false); /// \brief Try to vectorize a chain that may start at the operands of \p I. bool tryToVectorize(Instruction *I, slpvectorizer::BoUpSLP &R); Index: llvm/trunk/lib/Transforms/Vectorize/SLPVectorizer.cpp =================================================================== --- llvm/trunk/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ llvm/trunk/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -4702,11 +4702,11 @@ if (!A || !B) return false; Value *VL[] = { A, B }; - return tryToVectorizeList(VL, R, true); + return tryToVectorizeList(VL, R, /*UserCost=*/0, true); } bool SLPVectorizerPass::tryToVectorizeList(ArrayRef VL, BoUpSLP &R, - bool AllowReorder) { + int UserCost, bool AllowReorder) { if (VL.size() < 2) return false; @@ -4815,7 +4815,7 @@ continue; R.computeMinimumValueSizes(); - int Cost = R.getTreeCost(); + int Cost = R.getTreeCost() - UserCost; CandidateFound = true; MinCost = std::min(MinCost, Cost); @@ -5748,9 +5748,17 @@ /// /// Returns true if it matches static bool findBuildVector(InsertElementInst *LastInsertElem, - SmallVectorImpl &BuildVectorOpds) { + TargetTransformInfo *TTI, + SmallVectorImpl &BuildVectorOpds, + int &UserCost) { + UserCost = 0; Value *V = nullptr; do { + if (auto *CI = dyn_cast(LastInsertElem->getOperand(2))) { + UserCost += TTI->getVectorInstrCost(Instruction::InsertElement, + LastInsertElem->getType(), + CI->getZExtValue()); + } BuildVectorOpds.push_back(LastInsertElem->getOperand(1)); V = LastInsertElem->getOperand(0); if (isa(V)) @@ -5965,13 +5973,17 @@ bool SLPVectorizerPass::vectorizeInsertElementInst(InsertElementInst *IEI, BasicBlock *BB, BoUpSLP &R) { + int UserCost; SmallVector BuildVectorOpds; - if (!findBuildVector(IEI, BuildVectorOpds)) + if (!findBuildVector(IEI, TTI, BuildVectorOpds, UserCost) || + (llvm::all_of(BuildVectorOpds, + [](Value *V) { return isa(V); }) && + isShuffle(BuildVectorOpds))) return false; // Vectorize starting with the build vector operands ignoring the BuildVector // instructions for the purpose of scheduling and user extraction. - return tryToVectorizeList(BuildVectorOpds, R); + return tryToVectorizeList(BuildVectorOpds, R, UserCost); } bool SLPVectorizerPass::vectorizeCmpInst(CmpInst *CI, BasicBlock *BB, @@ -6049,8 +6061,8 @@ // is done when there are exactly two elements since tryToVectorizeList // asserts that there are only two values when AllowReorder is true. bool AllowReorder = NumElts == 2; - if (NumElts > 1 && - tryToVectorizeList(makeArrayRef(IncIt, NumElts), R, AllowReorder)) { + if (NumElts > 1 && tryToVectorizeList(makeArrayRef(IncIt, NumElts), R, + /*UserCost=*/0, AllowReorder)) { // Success start over because instructions might have been changed. HaveVectorizedPhiNodes = true; Changed = true; Index: llvm/trunk/test/Transforms/SLPVectorizer/AArch64/gather-root.ll =================================================================== --- llvm/trunk/test/Transforms/SLPVectorizer/AArch64/gather-root.ll +++ llvm/trunk/test/Transforms/SLPVectorizer/AArch64/gather-root.ll @@ -31,50 +31,54 @@ ; ; GATHER-LABEL: @PR28330( ; GATHER-NEXT: entry: -; GATHER-NEXT: [[TMP0:%.*]] = load <2 x i8>, <2 x i8>* bitcast (i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 1) to <2 x i8>*), align 1 -; GATHER-NEXT: [[TMP1:%.*]] = icmp eq <2 x i8> [[TMP0]], zeroinitializer -; GATHER-NEXT: [[TMP4:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 3), align 1 -; GATHER-NEXT: [[TMP5:%.*]] = icmp eq i8 [[TMP4]], 0 -; GATHER-NEXT: [[TMP6:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 4), align 4 -; GATHER-NEXT: [[TMP7:%.*]] = icmp eq i8 [[TMP6]], 0 -; GATHER-NEXT: [[TMP8:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 5), align 1 -; GATHER-NEXT: [[TMP9:%.*]] = icmp eq i8 [[TMP8]], 0 -; GATHER-NEXT: [[TMP10:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 6), align 2 -; GATHER-NEXT: [[TMP11:%.*]] = icmp eq i8 [[TMP10]], 0 -; GATHER-NEXT: [[TMP12:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 7), align 1 -; GATHER-NEXT: [[TMP13:%.*]] = icmp eq i8 [[TMP12]], 0 -; GATHER-NEXT: [[TMP14:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 8), align 8 -; GATHER-NEXT: [[TMP15:%.*]] = icmp eq i8 [[TMP14]], 0 +; GATHER-NEXT: [[TMP0:%.*]] = load <8 x i8>, <8 x i8>* bitcast (i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 1) to <8 x i8>*), align 1 +; GATHER-NEXT: [[TMP1:%.*]] = icmp eq <8 x i8> [[TMP0]], zeroinitializer ; GATHER-NEXT: br label [[FOR_BODY:%.*]] ; GATHER: for.body: -; GATHER-NEXT: [[TMP17:%.*]] = phi i32 [ [[BIN_EXTRA:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ] -; GATHER-NEXT: [[TMP2:%.*]] = select <2 x i1> [[TMP1]], <2 x i32> , <2 x i32> -; GATHER-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP2]], i32 0 -; GATHER-NEXT: [[TMP20:%.*]] = add i32 [[TMP17]], [[TMP3]] -; GATHER-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[TMP2]], i32 1 -; GATHER-NEXT: [[TMP22:%.*]] = add i32 [[TMP20]], [[TMP4]] -; GATHER-NEXT: [[TMP23:%.*]] = select i1 [[TMP5]], i32 -720, i32 -80 -; GATHER-NEXT: [[TMP24:%.*]] = add i32 [[TMP22]], [[TMP23]] -; GATHER-NEXT: [[TMP25:%.*]] = select i1 [[TMP7]], i32 -720, i32 -80 -; GATHER-NEXT: [[TMP26:%.*]] = add i32 [[TMP24]], [[TMP25]] -; GATHER-NEXT: [[TMP27:%.*]] = select i1 [[TMP9]], i32 -720, i32 -80 -; GATHER-NEXT: [[TMP28:%.*]] = add i32 [[TMP26]], [[TMP27]] -; GATHER-NEXT: [[TMP29:%.*]] = select i1 [[TMP11]], i32 -720, i32 -80 -; GATHER-NEXT: [[TMP30:%.*]] = add i32 [[TMP28]], [[TMP29]] -; GATHER-NEXT: [[TMP31:%.*]] = select i1 [[TMP13]], i32 -720, i32 -80 -; GATHER-NEXT: [[TMP32:%.*]] = add i32 [[TMP30]], [[TMP31]] -; GATHER-NEXT: [[TMP33:%.*]] = select i1 [[TMP15]], i32 -720, i32 -80 -; GATHER-NEXT: [[TMP5:%.*]] = insertelement <8 x i32> undef, i32 [[TMP3]], i32 0 -; GATHER-NEXT: [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[TMP4]], i32 1 -; GATHER-NEXT: [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[TMP23]], i32 2 -; GATHER-NEXT: [[TMP8:%.*]] = insertelement <8 x i32> [[TMP7]], i32 [[TMP25]], i32 3 -; GATHER-NEXT: [[TMP9:%.*]] = insertelement <8 x i32> [[TMP8]], i32 [[TMP27]], i32 4 -; GATHER-NEXT: [[TMP10:%.*]] = insertelement <8 x i32> [[TMP9]], i32 [[TMP29]], i32 5 -; GATHER-NEXT: [[TMP11:%.*]] = insertelement <8 x i32> [[TMP10]], i32 [[TMP31]], i32 6 -; GATHER-NEXT: [[TMP12:%.*]] = insertelement <8 x i32> [[TMP11]], i32 [[TMP33]], i32 7 -; GATHER-NEXT: [[TMP13:%.*]] = call i32 @llvm.experimental.vector.reduce.add.i32.v8i32(<8 x i32> [[TMP12]]) -; GATHER-NEXT: [[BIN_EXTRA]] = add i32 [[TMP13]], [[TMP17]] -; GATHER-NEXT: [[TMP34:%.*]] = add i32 [[TMP32]], [[TMP33]] +; GATHER-NEXT: [[TMPP17:%.*]] = phi i32 [ [[OP_EXTRA:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ] +; GATHER-NEXT: [[TMP2:%.*]] = extractelement <8 x i1> [[TMP1]], i32 0 +; GATHER-NEXT: [[TMP3:%.*]] = insertelement <8 x i1> undef, i1 [[TMP2]], i32 0 +; GATHER-NEXT: [[TMP4:%.*]] = extractelement <8 x i1> [[TMP1]], i32 1 +; GATHER-NEXT: [[TMP5:%.*]] = insertelement <8 x i1> [[TMP3]], i1 [[TMP4]], i32 1 +; GATHER-NEXT: [[TMP6:%.*]] = extractelement <8 x i1> [[TMP1]], i32 2 +; GATHER-NEXT: [[TMP7:%.*]] = insertelement <8 x i1> [[TMP5]], i1 [[TMP6]], i32 2 +; GATHER-NEXT: [[TMP8:%.*]] = extractelement <8 x i1> [[TMP1]], i32 3 +; GATHER-NEXT: [[TMP9:%.*]] = insertelement <8 x i1> [[TMP7]], i1 [[TMP8]], i32 3 +; GATHER-NEXT: [[TMP10:%.*]] = extractelement <8 x i1> [[TMP1]], i32 4 +; GATHER-NEXT: [[TMP11:%.*]] = insertelement <8 x i1> [[TMP9]], i1 [[TMP10]], i32 4 +; GATHER-NEXT: [[TMP12:%.*]] = extractelement <8 x i1> [[TMP1]], i32 5 +; GATHER-NEXT: [[TMP13:%.*]] = insertelement <8 x i1> [[TMP11]], i1 [[TMP12]], i32 5 +; GATHER-NEXT: [[TMP14:%.*]] = extractelement <8 x i1> [[TMP1]], i32 6 +; GATHER-NEXT: [[TMP15:%.*]] = insertelement <8 x i1> [[TMP13]], i1 [[TMP14]], i32 6 +; GATHER-NEXT: [[TMP16:%.*]] = extractelement <8 x i1> [[TMP1]], i32 7 +; GATHER-NEXT: [[TMP17:%.*]] = insertelement <8 x i1> [[TMP15]], i1 [[TMP16]], i32 7 +; GATHER-NEXT: [[TMP18:%.*]] = select <8 x i1> [[TMP17]], <8 x i32> , <8 x i32> +; GATHER-NEXT: [[TMP19:%.*]] = extractelement <8 x i32> [[TMP18]], i32 0 +; GATHER-NEXT: [[TMPP20:%.*]] = add i32 [[TMPP17]], [[TMP19]] +; GATHER-NEXT: [[TMP20:%.*]] = extractelement <8 x i32> [[TMP18]], i32 1 +; GATHER-NEXT: [[TMPP22:%.*]] = add i32 [[TMPP20]], [[TMP20]] +; GATHER-NEXT: [[TMP21:%.*]] = extractelement <8 x i32> [[TMP18]], i32 2 +; GATHER-NEXT: [[TMPP24:%.*]] = add i32 [[TMPP22]], [[TMP21]] +; GATHER-NEXT: [[TMP22:%.*]] = extractelement <8 x i32> [[TMP18]], i32 3 +; GATHER-NEXT: [[TMPP26:%.*]] = add i32 [[TMPP24]], [[TMP22]] +; GATHER-NEXT: [[TMP23:%.*]] = extractelement <8 x i32> [[TMP18]], i32 4 +; GATHER-NEXT: [[TMPP28:%.*]] = add i32 [[TMPP26]], [[TMP23]] +; GATHER-NEXT: [[TMP24:%.*]] = extractelement <8 x i32> [[TMP18]], i32 5 +; GATHER-NEXT: [[TMPP30:%.*]] = add i32 [[TMPP28]], [[TMP24]] +; GATHER-NEXT: [[TMP25:%.*]] = extractelement <8 x i32> [[TMP18]], i32 6 +; GATHER-NEXT: [[TMPP32:%.*]] = add i32 [[TMPP30]], [[TMP25]] +; GATHER-NEXT: [[TMP26:%.*]] = insertelement <8 x i32> undef, i32 [[TMP19]], i32 0 +; GATHER-NEXT: [[TMP27:%.*]] = insertelement <8 x i32> [[TMP26]], i32 [[TMP20]], i32 1 +; GATHER-NEXT: [[TMP28:%.*]] = insertelement <8 x i32> [[TMP27]], i32 [[TMP21]], i32 2 +; GATHER-NEXT: [[TMP29:%.*]] = insertelement <8 x i32> [[TMP28]], i32 [[TMP22]], i32 3 +; GATHER-NEXT: [[TMP30:%.*]] = insertelement <8 x i32> [[TMP29]], i32 [[TMP23]], i32 4 +; GATHER-NEXT: [[TMP31:%.*]] = insertelement <8 x i32> [[TMP30]], i32 [[TMP24]], i32 5 +; GATHER-NEXT: [[TMP32:%.*]] = insertelement <8 x i32> [[TMP31]], i32 [[TMP25]], i32 6 +; GATHER-NEXT: [[TMP33:%.*]] = extractelement <8 x i32> [[TMP18]], i32 7 +; GATHER-NEXT: [[TMP34:%.*]] = insertelement <8 x i32> [[TMP32]], i32 [[TMP33]], i32 7 +; GATHER-NEXT: [[TMP35:%.*]] = call i32 @llvm.experimental.vector.reduce.add.i32.v8i32(<8 x i32> [[TMP34]]) +; GATHER-NEXT: [[OP_EXTRA]] = add i32 [[TMP35]], [[TMPP17]] +; GATHER-NEXT: [[TMP34:%.*]] = add i32 [[TMPP32]], [[TMP33]] ; GATHER-NEXT: br label [[FOR_BODY]] ; ; MAX-COST-LABEL: @PR28330( @@ -179,50 +183,54 @@ ; ; GATHER-LABEL: @PR32038( ; GATHER-NEXT: entry: -; GATHER-NEXT: [[TMP0:%.*]] = load <2 x i8>, <2 x i8>* bitcast (i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 1) to <2 x i8>*), align 1 -; GATHER-NEXT: [[TMP1:%.*]] = icmp eq <2 x i8> [[TMP0]], zeroinitializer -; GATHER-NEXT: [[TMP4:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 3), align 1 -; GATHER-NEXT: [[TMP5:%.*]] = icmp eq i8 [[TMP4]], 0 -; GATHER-NEXT: [[TMP6:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 4), align 4 -; GATHER-NEXT: [[TMP7:%.*]] = icmp eq i8 [[TMP6]], 0 -; GATHER-NEXT: [[TMP8:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 5), align 1 -; GATHER-NEXT: [[TMP9:%.*]] = icmp eq i8 [[TMP8]], 0 -; GATHER-NEXT: [[TMP10:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 6), align 2 -; GATHER-NEXT: [[TMP11:%.*]] = icmp eq i8 [[TMP10]], 0 -; GATHER-NEXT: [[TMP12:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 7), align 1 -; GATHER-NEXT: [[TMP13:%.*]] = icmp eq i8 [[TMP12]], 0 -; GATHER-NEXT: [[TMP14:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 8), align 8 -; GATHER-NEXT: [[TMP15:%.*]] = icmp eq i8 [[TMP14]], 0 +; GATHER-NEXT: [[TMP0:%.*]] = load <8 x i8>, <8 x i8>* bitcast (i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 1) to <8 x i8>*), align 1 +; GATHER-NEXT: [[TMP1:%.*]] = icmp eq <8 x i8> [[TMP0]], zeroinitializer ; GATHER-NEXT: br label [[FOR_BODY:%.*]] ; GATHER: for.body: -; GATHER-NEXT: [[TMP17:%.*]] = phi i32 [ [[BIN_EXTRA:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ] -; GATHER-NEXT: [[TMP2:%.*]] = select <2 x i1> [[TMP1]], <2 x i32> , <2 x i32> -; GATHER-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP2]], i32 0 -; GATHER-NEXT: [[TMP20:%.*]] = add i32 -5, [[TMP3]] -; GATHER-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[TMP2]], i32 1 -; GATHER-NEXT: [[TMP22:%.*]] = add i32 [[TMP20]], [[TMP4]] -; GATHER-NEXT: [[TMP23:%.*]] = select i1 [[TMP5]], i32 -720, i32 -80 -; GATHER-NEXT: [[TMP24:%.*]] = add i32 [[TMP22]], [[TMP23]] -; GATHER-NEXT: [[TMP25:%.*]] = select i1 [[TMP7]], i32 -720, i32 -80 -; GATHER-NEXT: [[TMP26:%.*]] = add i32 [[TMP24]], [[TMP25]] -; GATHER-NEXT: [[TMP27:%.*]] = select i1 [[TMP9]], i32 -720, i32 -80 -; GATHER-NEXT: [[TMP28:%.*]] = add i32 [[TMP26]], [[TMP27]] -; GATHER-NEXT: [[TMP29:%.*]] = select i1 [[TMP11]], i32 -720, i32 -80 -; GATHER-NEXT: [[TMP30:%.*]] = add i32 [[TMP28]], [[TMP29]] -; GATHER-NEXT: [[TMP31:%.*]] = select i1 [[TMP13]], i32 -720, i32 -80 -; GATHER-NEXT: [[TMP32:%.*]] = add i32 [[TMP30]], [[TMP31]] -; GATHER-NEXT: [[TMP33:%.*]] = select i1 [[TMP15]], i32 -720, i32 -80 -; GATHER-NEXT: [[TMP5:%.*]] = insertelement <8 x i32> undef, i32 [[TMP3]], i32 0 -; GATHER-NEXT: [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[TMP4]], i32 1 -; GATHER-NEXT: [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[TMP23]], i32 2 -; GATHER-NEXT: [[TMP8:%.*]] = insertelement <8 x i32> [[TMP7]], i32 [[TMP25]], i32 3 -; GATHER-NEXT: [[TMP9:%.*]] = insertelement <8 x i32> [[TMP8]], i32 [[TMP27]], i32 4 -; GATHER-NEXT: [[TMP10:%.*]] = insertelement <8 x i32> [[TMP9]], i32 [[TMP29]], i32 5 -; GATHER-NEXT: [[TMP11:%.*]] = insertelement <8 x i32> [[TMP10]], i32 [[TMP31]], i32 6 -; GATHER-NEXT: [[TMP12:%.*]] = insertelement <8 x i32> [[TMP11]], i32 [[TMP33]], i32 7 -; GATHER-NEXT: [[TMP13:%.*]] = call i32 @llvm.experimental.vector.reduce.add.i32.v8i32(<8 x i32> [[TMP12]]) -; GATHER-NEXT: [[BIN_EXTRA]] = add i32 [[TMP13]], -5 -; GATHER-NEXT: [[TMP34:%.*]] = add i32 [[TMP32]], [[TMP33]] +; GATHER-NEXT: [[TMP17:%.*]] = phi i32 [ [[OP_EXTRA:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ] +; GATHER-NEXT: [[TMP2:%.*]] = extractelement <8 x i1> [[TMP1]], i32 0 +; GATHER-NEXT: [[TMP3:%.*]] = insertelement <8 x i1> undef, i1 [[TMP2]], i32 0 +; GATHER-NEXT: [[TMP4:%.*]] = extractelement <8 x i1> [[TMP1]], i32 1 +; GATHER-NEXT: [[TMP5:%.*]] = insertelement <8 x i1> [[TMP3]], i1 [[TMP4]], i32 1 +; GATHER-NEXT: [[TMP6:%.*]] = extractelement <8 x i1> [[TMP1]], i32 2 +; GATHER-NEXT: [[TMP7:%.*]] = insertelement <8 x i1> [[TMP5]], i1 [[TMP6]], i32 2 +; GATHER-NEXT: [[TMP8:%.*]] = extractelement <8 x i1> [[TMP1]], i32 3 +; GATHER-NEXT: [[TMP9:%.*]] = insertelement <8 x i1> [[TMP7]], i1 [[TMP8]], i32 3 +; GATHER-NEXT: [[TMP10:%.*]] = extractelement <8 x i1> [[TMP1]], i32 4 +; GATHER-NEXT: [[TMP11:%.*]] = insertelement <8 x i1> [[TMP9]], i1 [[TMP10]], i32 4 +; GATHER-NEXT: [[TMP12:%.*]] = extractelement <8 x i1> [[TMP1]], i32 5 +; GATHER-NEXT: [[TMP13:%.*]] = insertelement <8 x i1> [[TMP11]], i1 [[TMP12]], i32 5 +; GATHER-NEXT: [[TMP14:%.*]] = extractelement <8 x i1> [[TMP1]], i32 6 +; GATHER-NEXT: [[TMP15:%.*]] = insertelement <8 x i1> [[TMP13]], i1 [[TMP14]], i32 6 +; GATHER-NEXT: [[TMP16:%.*]] = extractelement <8 x i1> [[TMP1]], i32 7 +; GATHER-NEXT: [[TMP17:%.*]] = insertelement <8 x i1> [[TMP15]], i1 [[TMP16]], i32 7 +; GATHER-NEXT: [[TMP18:%.*]] = select <8 x i1> [[TMP17]], <8 x i32> , <8 x i32> +; GATHER-NEXT: [[TMP19:%.*]] = extractelement <8 x i32> [[TMP18]], i32 0 +; GATHER-NEXT: [[TMPP20:%.*]] = add i32 -5, [[TMP19]] +; GATHER-NEXT: [[TMP20:%.*]] = extractelement <8 x i32> [[TMP18]], i32 1 +; GATHER-NEXT: [[TMPP22:%.*]] = add i32 [[TMPP20]], [[TMP20]] +; GATHER-NEXT: [[TMP21:%.*]] = extractelement <8 x i32> [[TMP18]], i32 2 +; GATHER-NEXT: [[TMPP24:%.*]] = add i32 [[TMPP22]], [[TMP21]] +; GATHER-NEXT: [[TMP22:%.*]] = extractelement <8 x i32> [[TMP18]], i32 3 +; GATHER-NEXT: [[TMPP26:%.*]] = add i32 [[TMPP24]], [[TMP22]] +; GATHER-NEXT: [[TMP23:%.*]] = extractelement <8 x i32> [[TMP18]], i32 4 +; GATHER-NEXT: [[TMPP28:%.*]] = add i32 [[TMPP26]], [[TMP23]] +; GATHER-NEXT: [[TMP24:%.*]] = extractelement <8 x i32> [[TMP18]], i32 5 +; GATHER-NEXT: [[TMPP30:%.*]] = add i32 [[TMPP28]], [[TMP24]] +; GATHER-NEXT: [[TMP25:%.*]] = extractelement <8 x i32> [[TMP18]], i32 6 +; GATHER-NEXT: [[TMPP32:%.*]] = add i32 [[TMPP30]], [[TMP25]] +; GATHER-NEXT: [[TMP26:%.*]] = insertelement <8 x i32> undef, i32 [[TMP19]], i32 0 +; GATHER-NEXT: [[TMP27:%.*]] = insertelement <8 x i32> [[TMP26]], i32 [[TMP20]], i32 1 +; GATHER-NEXT: [[TMP28:%.*]] = insertelement <8 x i32> [[TMP27]], i32 [[TMP21]], i32 2 +; GATHER-NEXT: [[TMP29:%.*]] = insertelement <8 x i32> [[TMP28]], i32 [[TMP22]], i32 3 +; GATHER-NEXT: [[TMP30:%.*]] = insertelement <8 x i32> [[TMP29]], i32 [[TMP23]], i32 4 +; GATHER-NEXT: [[TMP31:%.*]] = insertelement <8 x i32> [[TMP30]], i32 [[TMP24]], i32 5 +; GATHER-NEXT: [[TMP32:%.*]] = insertelement <8 x i32> [[TMP31]], i32 [[TMP25]], i32 6 +; GATHER-NEXT: [[TMP33:%.*]] = extractelement <8 x i32> [[TMP18]], i32 7 +; GATHER-NEXT: [[TMP34:%.*]] = insertelement <8 x i32> [[TMP32]], i32 [[TMP33]], i32 7 +; GATHER-NEXT: [[TMP35:%.*]] = call i32 @llvm.experimental.vector.reduce.add.i32.v8i32(<8 x i32> [[TMP34]]) +; GATHER-NEXT: [[OP_EXTRA]] = add i32 [[TMP35]], -5 +; GATHER-NEXT: [[TMP34:%.*]] = add i32 [[TMPP32]], [[TMP33]] ; GATHER-NEXT: br label [[FOR_BODY]] ; ; MAX-COST-LABEL: @PR32038( Index: llvm/trunk/test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll =================================================================== --- llvm/trunk/test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll +++ llvm/trunk/test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll @@ -387,14 +387,14 @@ ; to do this backwards this backwards define <4 x i32> @reconstruct(<4 x i32> %c) #0 { ; CHECK-LABEL: @reconstruct( -; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i32> [[C:%.*]], i32 3 -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i32> [[C]], i32 2 -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i32> [[C]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x i32> [[C]], i32 0 -; CHECK-NEXT: [[RA:%.*]] = insertelement <4 x i32> undef, i32 [[TMP4]], i32 0 -; CHECK-NEXT: [[RB:%.*]] = insertelement <4 x i32> [[RA]], i32 [[TMP3]], i32 1 -; CHECK-NEXT: [[RC:%.*]] = insertelement <4 x i32> [[RB]], i32 [[TMP2]], i32 2 -; CHECK-NEXT: [[RD:%.*]] = insertelement <4 x i32> [[RC]], i32 [[TMP1]], i32 3 +; CHECK-NEXT: [[C0:%.*]] = extractelement <4 x i32> [[C:%.*]], i32 0 +; CHECK-NEXT: [[C1:%.*]] = extractelement <4 x i32> [[C]], i32 1 +; CHECK-NEXT: [[C2:%.*]] = extractelement <4 x i32> [[C]], i32 2 +; CHECK-NEXT: [[C3:%.*]] = extractelement <4 x i32> [[C]], i32 3 +; CHECK-NEXT: [[RA:%.*]] = insertelement <4 x i32> undef, i32 [[C0]], i32 0 +; CHECK-NEXT: [[RB:%.*]] = insertelement <4 x i32> [[RA]], i32 [[C1]], i32 1 +; CHECK-NEXT: [[RC:%.*]] = insertelement <4 x i32> [[RB]], i32 [[C2]], i32 2 +; CHECK-NEXT: [[RD:%.*]] = insertelement <4 x i32> [[RC]], i32 [[C3]], i32 3 ; CHECK-NEXT: ret <4 x i32> [[RD]] ; ; ZEROTHRESH-LABEL: @reconstruct( Index: llvm/trunk/test/Transforms/SLPVectorizer/X86/jumbled-load-multiuse.ll =================================================================== --- llvm/trunk/test/Transforms/SLPVectorizer/X86/jumbled-load-multiuse.ll +++ llvm/trunk/test/Transforms/SLPVectorizer/X86/jumbled-load-multiuse.ll @@ -12,19 +12,20 @@ ; CHECK-LABEL: @fn1( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @b, i64 0, i32 0), align 4 -; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @b, i64 0, i32 1), align 4 -; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @b, i64 0, i32 2), align 4 -; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @b, i64 0, i32 3), align 4 -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> undef, i32 [[TMP1]], i32 0 -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> [[TMP4]], i32 [[TMP2]], i32 1 -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[TMP3]], i32 2 -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> [[TMP6]], i32 [[TMP0]], i32 3 -; CHECK-NEXT: [[TMP8:%.*]] = icmp sgt <4 x i32> [[TMP7]], zeroinitializer -; CHECK-NEXT: [[TMP9:%.*]] = insertelement <4 x i32> [[TMP4]], i32 ptrtoint (i32 ()* @fn1 to i32), i32 1 -; CHECK-NEXT: [[TMP10:%.*]] = insertelement <4 x i32> [[TMP9]], i32 ptrtoint (i32 ()* @fn1 to i32), i32 2 -; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> [[TMP10]], i32 8, i32 3 -; CHECK-NEXT: [[TMP12:%.*]] = select <4 x i1> [[TMP8]], <4 x i32> [[TMP11]], <4 x i32> -; CHECK-NEXT: store <4 x i32> [[TMP12]], <4 x i32>* bitcast ([4 x i32]* @a to <4 x i32>*), align 4 +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, <2 x i32>* bitcast (i32* getelementptr inbounds ([4 x i32], [4 x i32]* @b, i64 0, i32 1) to <2 x i32>*), align 4 +; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @b, i64 0, i32 3), align 4 +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> undef, i32 [[TMP3]], i32 0 +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1 +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x i32> [[TMP4]], i32 [[TMP5]], i32 1 +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> [[TMP6]], i32 [[TMP2]], i32 2 +; CHECK-NEXT: [[TMP8:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[TMP0]], i32 3 +; CHECK-NEXT: [[TMP9:%.*]] = icmp sgt <4 x i32> [[TMP8]], zeroinitializer +; CHECK-NEXT: [[TMP10:%.*]] = insertelement <4 x i32> [[TMP4]], i32 ptrtoint (i32 ()* @fn1 to i32), i32 1 +; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> [[TMP10]], i32 ptrtoint (i32 ()* @fn1 to i32), i32 2 +; CHECK-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 8, i32 3 +; CHECK-NEXT: [[TMP13:%.*]] = select <4 x i1> [[TMP9]], <4 x i32> [[TMP12]], <4 x i32> +; CHECK-NEXT: store <4 x i32> [[TMP13]], <4 x i32>* bitcast ([4 x i32]* @a to <4 x i32>*), align 4 ; CHECK-NEXT: ret i32 0 ; entry: Index: llvm/trunk/test/Transforms/SLPVectorizer/X86/sign-extend.ll =================================================================== --- llvm/trunk/test/Transforms/SLPVectorizer/X86/sign-extend.ll +++ llvm/trunk/test/Transforms/SLPVectorizer/X86/sign-extend.ll @@ -34,18 +34,15 @@ define <4 x i16> @truncate_v_v(<4 x i32> %lhs) { ; CHECK-LABEL: @truncate_v_v( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[VECEXT:%.*]] = extractelement <4 x i32> [[LHS:%.*]], i32 0 -; CHECK-NEXT: [[CONV:%.*]] = trunc i32 [[VECEXT]] to i16 -; CHECK-NEXT: [[VECINIT:%.*]] = insertelement <4 x i16> undef, i16 [[CONV]], i32 0 -; CHECK-NEXT: [[VECEXT1:%.*]] = extractelement <4 x i32> [[LHS]], i32 1 -; CHECK-NEXT: [[CONV2:%.*]] = trunc i32 [[VECEXT1]] to i16 -; CHECK-NEXT: [[VECINIT3:%.*]] = insertelement <4 x i16> [[VECINIT]], i16 [[CONV2]], i32 1 -; CHECK-NEXT: [[VECEXT4:%.*]] = extractelement <4 x i32> [[LHS]], i32 2 -; CHECK-NEXT: [[CONV5:%.*]] = trunc i32 [[VECEXT4]] to i16 -; CHECK-NEXT: [[VECINIT6:%.*]] = insertelement <4 x i16> [[VECINIT3]], i16 [[CONV5]], i32 2 -; CHECK-NEXT: [[VECEXT7:%.*]] = extractelement <4 x i32> [[LHS]], i32 3 -; CHECK-NEXT: [[CONV8:%.*]] = trunc i32 [[VECEXT7]] to i16 -; CHECK-NEXT: [[VECINIT9:%.*]] = insertelement <4 x i16> [[VECINIT6]], i16 [[CONV8]], i32 3 +; CHECK-NEXT: [[TMP0:%.*]] = trunc <4 x i32> [[LHS:%.*]] to <4 x i16> +; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i16> [[TMP0]], i32 0 +; CHECK-NEXT: [[VECINIT:%.*]] = insertelement <4 x i16> undef, i16 [[TMP1]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i16> [[TMP0]], i32 1 +; CHECK-NEXT: [[VECINIT3:%.*]] = insertelement <4 x i16> [[VECINIT]], i16 [[TMP2]], i32 1 +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i16> [[TMP0]], i32 2 +; CHECK-NEXT: [[VECINIT6:%.*]] = insertelement <4 x i16> [[VECINIT3]], i16 [[TMP3]], i32 2 +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x i16> [[TMP0]], i32 3 +; CHECK-NEXT: [[VECINIT9:%.*]] = insertelement <4 x i16> [[VECINIT6]], i16 [[TMP4]], i32 3 ; CHECK-NEXT: ret <4 x i16> [[VECINIT9]] ; entry: Index: llvm/trunk/test/Transforms/SLPVectorizer/X86/value-bug.ll =================================================================== --- llvm/trunk/test/Transforms/SLPVectorizer/X86/value-bug.ll +++ llvm/trunk/test/Transforms/SLPVectorizer/X86/value-bug.ll @@ -11,34 +11,32 @@ define void @test() { ; CHECK-LABEL: @test( ; CHECK-NEXT: bb279: +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x float> undef, float undef, i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x float> [[TMP0]], float undef, i32 1 ; CHECK-NEXT: br label [[BB283:%.*]] ; CHECK: bb283: -; CHECK-NEXT: [[TMP0:%.*]] = phi <2 x float> [ undef, [[BB279:%.*]] ], [ [[TMP11:%.*]], [[EXIT:%.*]] ] -; CHECK-NEXT: [[TMP1:%.*]] = phi <2 x float> [ undef, [[BB279]] ], [ [[TMP13:%.*]], [[EXIT]] ] +; CHECK-NEXT: [[TMP2:%.*]] = phi <2 x float> [ undef, [[BB279:%.*]] ], [ [[TMP13:%.*]], [[EXIT:%.*]] ] +; CHECK-NEXT: [[TMP3:%.*]] = phi <2 x float> [ undef, [[BB279]] ], [ [[TMP1]], [[EXIT]] ] ; CHECK-NEXT: br label [[BB284:%.*]] ; CHECK: bb284: -; CHECK-NEXT: [[TMP2:%.*]] = fpext <2 x float> [[TMP0]] to <2 x double> -; CHECK-NEXT: [[TMP3:%.*]] = fsub <2 x double> [[TMP2]], undef -; CHECK-NEXT: [[TMP4:%.*]] = fsub <2 x double> [[TMP3]], undef +; CHECK-NEXT: [[TMP4:%.*]] = fpext <2 x float> [[TMP2]] to <2 x double> +; CHECK-NEXT: [[TMP5:%.*]] = fsub <2 x double> [[TMP4]], undef +; CHECK-NEXT: [[TMP6:%.*]] = fsub <2 x double> [[TMP5]], undef ; CHECK-NEXT: br label [[BB21_I:%.*]] ; CHECK: bb21.i: ; CHECK-NEXT: br i1 undef, label [[BB22_I:%.*]], label [[EXIT]] ; CHECK: bb22.i: -; CHECK-NEXT: [[TMP5:%.*]] = fadd <2 x double> undef, [[TMP4]] +; CHECK-NEXT: [[TMP7:%.*]] = fadd <2 x double> undef, [[TMP6]] ; CHECK-NEXT: br label [[BB32_I:%.*]] ; CHECK: bb32.i: -; CHECK-NEXT: [[TMP6:%.*]] = phi <2 x double> [ [[TMP5]], [[BB22_I]] ], [ zeroinitializer, [[BB32_I]] ] +; CHECK-NEXT: [[TMP8:%.*]] = phi <2 x double> [ [[TMP7]], [[BB22_I]] ], [ zeroinitializer, [[BB32_I]] ] ; CHECK-NEXT: br i1 undef, label [[BB32_I]], label [[BB21_I]] ; CHECK: exit: -; CHECK-NEXT: [[TMP7:%.*]] = fpext <2 x float> [[TMP1]] to <2 x double> -; CHECK-NEXT: [[TMP8:%.*]] = fmul <2 x double> , [[TMP7]] -; CHECK-NEXT: [[TMP9:%.*]] = fadd <2 x double> undef, [[TMP8]] -; CHECK-NEXT: [[TMP10:%.*]] = fadd <2 x double> undef, [[TMP9]] -; CHECK-NEXT: [[TMP11]] = fptrunc <2 x double> [[TMP10]] to <2 x float> -; CHECK-NEXT: [[TMP317:%.*]] = fptrunc double undef to float -; CHECK-NEXT: [[TMP319:%.*]] = fptrunc double undef to float -; CHECK-NEXT: [[TMP12:%.*]] = insertelement <2 x float> undef, float [[TMP317]], i32 0 -; CHECK-NEXT: [[TMP13]] = insertelement <2 x float> [[TMP12]], float [[TMP319]], i32 1 +; CHECK-NEXT: [[TMP9:%.*]] = fpext <2 x float> [[TMP3]] to <2 x double> +; CHECK-NEXT: [[TMP10:%.*]] = fmul <2 x double> , [[TMP9]] +; CHECK-NEXT: [[TMP11:%.*]] = fadd <2 x double> undef, [[TMP10]] +; CHECK-NEXT: [[TMP12:%.*]] = fadd <2 x double> undef, [[TMP11]] +; CHECK-NEXT: [[TMP13]] = fptrunc <2 x double> [[TMP12]] to <2 x float> ; CHECK-NEXT: br label [[BB283]] ; bb279: