diff --git a/llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h b/llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h
--- a/llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h
+++ b/llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h
@@ -128,13 +128,11 @@
   bool vectorizeInsertElementInst(InsertElementInst *IEI, BasicBlock *BB,
                                   slpvectorizer::BoUpSLP &R);
 
-  /// Try to vectorize trees that start at compare instructions.
-  bool vectorizeCmpInst(CmpInst *CI, BasicBlock *BB, slpvectorizer::BoUpSLP &R);
-
   /// Tries to vectorize constructs started from CmpInst, InsertValueInst or
   /// InsertElementInst instructions.
   bool vectorizeSimpleInstructions(SmallVectorImpl<Instruction *> &Instructions,
-                                   BasicBlock *BB, slpvectorizer::BoUpSLP &R);
+                                   BasicBlock *BB, slpvectorizer::BoUpSLP &R,
+                                   bool AtTerminator);
 
   /// Scan the basic block and look for patterns that are likely to start
   /// a vectorization chain.
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -7601,22 +7601,11 @@
                             BuildVectorInsts);
 }
 
-bool SLPVectorizerPass::vectorizeCmpInst(CmpInst *CI, BasicBlock *BB,
-                                         BoUpSLP &R) {
-  if (tryToVectorizePair(CI->getOperand(0), CI->getOperand(1), R))
-    return true;
-
-  bool OpsChanged = false;
-  for (int Idx = 0; Idx < 2; ++Idx) {
-    OpsChanged |=
-        vectorizeRootInstruction(nullptr, CI->getOperand(Idx), BB, R, TTI);
-  }
-  return OpsChanged;
-}
-
 bool SLPVectorizerPass::vectorizeSimpleInstructions(
-    SmallVectorImpl<Instruction *> &Instructions, BasicBlock *BB, BoUpSLP &R) {
+    SmallVectorImpl<Instruction *> &Instructions, BasicBlock *BB, BoUpSLP &R,
+    bool AtTerminator) {
   bool OpsChanged = false;
+  SmallVector<Instruction *, 4> PostponedCmps;
   for (auto *I : reverse(Instructions)) {
     if (R.isDeleted(I))
       continue;
@@ -7624,10 +7613,29 @@
       OpsChanged |= vectorizeInsertValueInst(LastInsertValue, BB, R);
     else if (auto *LastInsertElem = dyn_cast<InsertElementInst>(I))
       OpsChanged |= vectorizeInsertElementInst(LastInsertElem, BB, R);
-    else if (auto *CI = dyn_cast<CmpInst>(I))
-      OpsChanged |= vectorizeCmpInst(CI, BB, R);
+    else if (isa<CmpInst>(I))
+      PostponedCmps.push_back(I);
+  }
+  if (AtTerminator) {
+    // Try to find reductions first.
+    for (Instruction *I : PostponedCmps) {
+      if (R.isDeleted(I))
+        continue;
+      for (Value *Op : I->operands())
+        OpsChanged |= vectorizeRootInstruction(nullptr, Op, BB, R, TTI);
+    }
+    // Try to vectorize operands as vector bundles.
+    for (Instruction *I : PostponedCmps) {
+      if (R.isDeleted(I))
+        continue;
+      OpsChanged |= tryToVectorize(I, R);
+    }
+    Instructions.clear();
+  } else {
+    // Insert in reverse order since the PostponedCmps vector was filled in
+    // reverse order.
+    Instructions.assign(PostponedCmps.rbegin(), PostponedCmps.rend());
   }
-  Instructions.clear();
   return OpsChanged;
 }
 
@@ -7705,7 +7713,8 @@
     // We may go through BB multiple times so skip the one we have checked.
     if (!VisitedInstrs.insert(&*it).second) {
       if (it->use_empty() && KeyNodes.contains(&*it) &&
-          vectorizeSimpleInstructions(PostProcessInstructions, BB, R)) {
+          vectorizeSimpleInstructions(PostProcessInstructions, BB, R,
+                                      it->isTerminator())) {
         // We would like to start over since some instructions are deleted
         // and the iterator may become invalid value.
         Changed = true;
@@ -7764,7 +7773,8 @@
       // Start vectorization of post-process list of instructions from the
       // top-tree instructions to try to vectorize as many instructions as
       // possible.
-      OpsChanged |= vectorizeSimpleInstructions(PostProcessInstructions, BB, R);
+      OpsChanged |= vectorizeSimpleInstructions(PostProcessInstructions, BB, R,
+                                                it->isTerminator());
       if (OpsChanged) {
         // We would like to start over since some instructions are deleted
         // and the iterator may become invalid value.
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll b/llvm/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll
@@ -48,65 +48,11 @@
 }
 
 define i32 @maxi8_store_in(i32) {
-; SSE-LABEL: @maxi8_store_in(
-; SSE-NEXT: [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([32 x i32]* @arr to <8 x i32>*), align 16
-; SSE-NEXT: store i32 0, i32* @var, align 8
-; SSE-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.smax.v8i32(<8 x i32> [[TMP2]])
-; SSE-NEXT: ret i32 [[TMP3]]
-;
-; AVX-LABEL: @maxi8_store_in(
-; AVX-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([32 x i32]* @arr to <4 x i32>*), align 16
-; AVX-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[TMP2]])
-; AVX-NEXT: [[TMP4:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 4), align 16
-; AVX-NEXT: [[TMP5:%.*]] = icmp sgt i32 [[TMP3]], [[TMP4]]
-; AVX-NEXT: [[TMP6:%.*]] = select i1 [[TMP5]], i32 [[TMP3]], i32 [[TMP4]]
-; AVX-NEXT: store i32 0, i32* @var, align 8
-; AVX-NEXT: [[TMP7:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 5), align 4
-; AVX-NEXT: [[TMP8:%.*]] = icmp sgt i32 [[TMP6]], [[TMP7]]
-; AVX-NEXT: [[TMP9:%.*]] = select i1 [[TMP8]], i32 [[TMP6]], i32 [[TMP7]]
-; AVX-NEXT: [[TMP10:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 6), align 8
-; AVX-NEXT: [[TMP11:%.*]] = icmp sgt i32 [[TMP9]], [[TMP10]]
-; AVX-NEXT: [[TMP12:%.*]] = select i1 [[TMP11]], i32 [[TMP9]], i32 [[TMP10]]
-; AVX-NEXT: [[TMP13:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 7), align 4
-; AVX-NEXT: [[TMP14:%.*]] = icmp sgt i32 [[TMP12]], [[TMP13]]
-; AVX-NEXT: [[TMP15:%.*]] = select i1 [[TMP14]], i32 [[TMP12]], i32 [[TMP13]]
-; AVX-NEXT: ret i32 [[TMP15]]
-;
-; AVX2-LABEL: @maxi8_store_in(
-; AVX2-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([32 x i32]* @arr to <4 x i32>*), align 16
-; AVX2-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[TMP2]])
-; AVX2-NEXT: [[TMP4:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 4), align 16
-; AVX2-NEXT: [[TMP5:%.*]] = icmp sgt i32 [[TMP3]], [[TMP4]]
-; AVX2-NEXT: [[TMP6:%.*]] = select i1 [[TMP5]], i32 [[TMP3]], i32 [[TMP4]]
-; AVX2-NEXT: store i32 0, i32* @var, align 8
-; AVX2-NEXT: [[TMP7:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 5), align 4
-; AVX2-NEXT: [[TMP8:%.*]] = icmp sgt i32 [[TMP6]], [[TMP7]]
-; AVX2-NEXT: [[TMP9:%.*]] = select i1 [[TMP8]], i32 [[TMP6]], i32 [[TMP7]]
-; AVX2-NEXT: [[TMP10:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 6), align 8
-; AVX2-NEXT: [[TMP11:%.*]] = icmp sgt i32 [[TMP9]], [[TMP10]]
-; AVX2-NEXT: [[TMP12:%.*]] = select i1 [[TMP11]], i32 [[TMP9]], i32 [[TMP10]]
-; AVX2-NEXT: [[TMP13:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 7), align 4
-; AVX2-NEXT: [[TMP14:%.*]] = icmp sgt i32 [[TMP12]], [[TMP13]]
-; AVX2-NEXT: [[TMP15:%.*]] = select i1 [[TMP14]], i32 [[TMP12]], i32 [[TMP13]]
-; AVX2-NEXT: ret i32 [[TMP15]]
-;
-; THRESH-LABEL: @maxi8_store_in(
-; THRESH-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([32 x i32]* @arr to <4 x i32>*), align 16
-; THRESH-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[TMP2]])
-; THRESH-NEXT: [[TMP4:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 4), align 16
-; THRESH-NEXT: [[TMP5:%.*]] = icmp sgt i32 [[TMP3]], [[TMP4]]
-; THRESH-NEXT: [[TMP6:%.*]] = select i1 [[TMP5]], i32 [[TMP3]], i32 [[TMP4]]
-; THRESH-NEXT: store i32 0, i32* @var, align 8
-; THRESH-NEXT: [[TMP7:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 5), align 4
-; THRESH-NEXT: [[TMP8:%.*]] = icmp sgt i32 [[TMP6]], [[TMP7]]
-; THRESH-NEXT: [[TMP9:%.*]] = select i1 [[TMP8]], i32 [[TMP6]], i32 [[TMP7]]
-; THRESH-NEXT: [[TMP10:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 6), align 8
-; THRESH-NEXT: [[TMP11:%.*]] = icmp sgt i32 [[TMP9]], [[TMP10]]
-; THRESH-NEXT: [[TMP12:%.*]] = select i1 [[TMP11]], i32 [[TMP9]], i32 [[TMP10]]
-; THRESH-NEXT: [[TMP13:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 7), align 4
-; THRESH-NEXT: [[TMP14:%.*]] = icmp sgt i32 [[TMP12]], [[TMP13]]
-; THRESH-NEXT: [[TMP15:%.*]] = select i1 [[TMP14]], i32 [[TMP12]], i32 [[TMP13]]
-; THRESH-NEXT: ret i32 [[TMP15]]
+; CHECK-LABEL: @maxi8_store_in(
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([32 x i32]* @arr to <8 x i32>*), align 16
+; CHECK-NEXT: store i32 0, i32* @var, align 8
+; CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.smax.v8i32(<8 x i32> [[TMP2]])
+; CHECK-NEXT: ret i32 [[TMP3]]
 ;
   %2 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 0), align 16
   %3 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 1), align 4
@@ -852,17 +798,17 @@
 ; AVX-NEXT: [[TMP5:%.*]] = select i1 [[TMP4]], i32 [[TMP2]], i32 [[TMP3]]
 ; AVX-NEXT: [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 2) to <4 x i32>*), align 8
 ; AVX-NEXT: [[TMP7:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 6), align 8
-; AVX-NEXT: [[TMP8:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[TMP6]])
-; AVX-NEXT: [[TMP9:%.*]] = icmp sgt i32 [[TMP8]], [[TMP7]]
-; AVX-NEXT: [[TMP10:%.*]] = select i1 [[TMP9]], i32 [[TMP8]], i32 [[TMP7]]
-; AVX-NEXT: [[OP_EXTRA:%.*]] = icmp sgt i32 [[TMP10]], [[TMP5]]
-; AVX-NEXT: [[OP_EXTRA1:%.*]] = select i1 [[OP_EXTRA]], i32 [[TMP10]], i32 [[TMP5]]
-; AVX-NEXT: [[TMP11:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 7), align 4
-; AVX-NEXT: [[TMP12:%.*]] = icmp sgt i32 [[OP_EXTRA1]], [[TMP11]]
-; AVX-NEXT: [[TMP13:%.*]] = select i1 [[TMP12]], i32 [[OP_EXTRA1]], i32 [[TMP11]]
+; AVX-NEXT: [[TMP8:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 7), align 4
+; AVX-NEXT: [[TMP9:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[TMP6]])
+; AVX-NEXT: [[TMP10:%.*]] = icmp sgt i32 [[TMP9]], [[TMP7]]
+; AVX-NEXT: [[TMP11:%.*]] = select i1 [[TMP10]], i32 [[TMP9]], i32 [[TMP7]]
+; AVX-NEXT: [[TMP12:%.*]] = icmp sgt i32 [[TMP11]], [[TMP8]]
+; AVX-NEXT: [[TMP13:%.*]] = select i1 [[TMP12]], i32 [[TMP11]], i32 [[TMP8]]
+; AVX-NEXT: [[OP_EXTRA:%.*]] = icmp sgt i32 [[TMP13]], [[TMP5]]
+; AVX-NEXT: [[OP_EXTRA1:%.*]] = select i1 [[OP_EXTRA]], i32 [[TMP13]], i32 [[TMP5]]
 ; AVX-NEXT: [[TMP14:%.*]] = select i1 [[TMP4]], i32 3, i32 4
 ; AVX-NEXT: store i32 [[TMP14]], i32* @var, align 8
-; AVX-NEXT: ret i32 [[TMP13]]
+; AVX-NEXT: ret i32 [[OP_EXTRA1]]
 ;
 ; AVX2-LABEL: @maxi8_mutiple_uses(
 ; AVX2-NEXT: [[TMP2:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 0), align 16
@@ -871,17 +817,17 @@
 ; AVX2-NEXT: [[TMP5:%.*]] = select i1 [[TMP4]], i32 [[TMP2]], i32 [[TMP3]]
 ; AVX2-NEXT: [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 2) to <4 x i32>*), align 8
 ; AVX2-NEXT: [[TMP7:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 6), align 8
-; AVX2-NEXT: [[TMP8:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[TMP6]])
-; AVX2-NEXT: [[TMP9:%.*]] = icmp sgt i32 [[TMP8]], [[TMP7]]
-; AVX2-NEXT: [[TMP10:%.*]] = select i1 [[TMP9]], i32 [[TMP8]], i32 [[TMP7]]
-; AVX2-NEXT: [[OP_EXTRA:%.*]] = icmp sgt i32 [[TMP10]], [[TMP5]]
-; AVX2-NEXT: [[OP_EXTRA1:%.*]] = select i1 [[OP_EXTRA]], i32 [[TMP10]], i32 [[TMP5]]
-; AVX2-NEXT: [[TMP11:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 7), align 4
-; AVX2-NEXT: [[TMP12:%.*]] = icmp sgt i32 [[OP_EXTRA1]], [[TMP11]]
-; AVX2-NEXT: [[TMP13:%.*]] = select i1 [[TMP12]], i32 [[OP_EXTRA1]], i32 [[TMP11]]
+; AVX2-NEXT: [[TMP8:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 7), align 4
+; AVX2-NEXT: [[TMP9:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[TMP6]])
+; AVX2-NEXT: [[TMP10:%.*]] = icmp sgt i32 [[TMP9]], [[TMP7]]
+; AVX2-NEXT: [[TMP11:%.*]] = select i1 [[TMP10]], i32 [[TMP9]], i32 [[TMP7]]
+; AVX2-NEXT: [[TMP12:%.*]] = icmp sgt i32 [[TMP11]], [[TMP8]]
+; AVX2-NEXT: [[TMP13:%.*]] = select i1 [[TMP12]], i32 [[TMP11]], i32 [[TMP8]]
+; AVX2-NEXT: [[OP_EXTRA:%.*]] = icmp sgt i32 [[TMP13]], [[TMP5]]
+; AVX2-NEXT: [[OP_EXTRA1:%.*]] = select i1 [[OP_EXTRA]], i32 [[TMP13]], i32 [[TMP5]]
 ; AVX2-NEXT: [[TMP14:%.*]] = select i1 [[TMP4]], i32 3, i32 4
 ; AVX2-NEXT: store i32 [[TMP14]], i32* @var, align 8
-; AVX2-NEXT: ret i32 [[TMP13]]
+; AVX2-NEXT: ret i32 [[OP_EXTRA1]]
 ;
 ; THRESH-LABEL: @maxi8_mutiple_uses(
 ; THRESH-NEXT: [[TMP2:%.*]] = load <2 x i32>, <2 x i32>* bitcast ([32 x i32]* @arr to <2 x i32>*), align 16
@@ -889,24 +835,24 @@
 ; THRESH-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[TMP2]], i32 1
 ; THRESH-NEXT: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 2) to <4 x i32>*), align 8
 ; THRESH-NEXT: [[TMP6:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 6), align 8
-; THRESH-NEXT: [[TMP7:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[TMP5]])
-; THRESH-NEXT: [[TMP8:%.*]] = insertelement <2 x i32> poison, i32 [[TMP7]], i32 0
-; THRESH-NEXT: [[TMP9:%.*]] = insertelement <2 x i32> [[TMP8]], i32 [[TMP3]], i32 1
-; THRESH-NEXT: [[TMP10:%.*]] = insertelement <2 x i32> poison, i32 [[TMP6]], i32 0
-; THRESH-NEXT: [[TMP11:%.*]] = insertelement <2 x i32> [[TMP10]], i32 [[TMP4]], i32 1
-; THRESH-NEXT: [[TMP12:%.*]] = icmp sgt <2 x i32> [[TMP9]], [[TMP11]]
-; THRESH-NEXT: [[TMP13:%.*]] = select <2 x i1> [[TMP12]], <2 x i32> [[TMP9]], <2 x i32> [[TMP11]]
-; THRESH-NEXT: [[TMP14:%.*]] = extractelement <2 x i32> [[TMP13]], i32 1
-; THRESH-NEXT: [[TMP15:%.*]] = extractelement <2 x i32> [[TMP13]], i32 0
-; THRESH-NEXT: [[OP_EXTRA:%.*]] = icmp sgt i32 [[TMP15]], [[TMP14]]
-; THRESH-NEXT: [[OP_EXTRA1:%.*]] = select i1 [[OP_EXTRA]], i32 [[TMP15]], i32 [[TMP14]]
-; THRESH-NEXT: [[TMP16:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 7), align 4
-; THRESH-NEXT: [[TMP17:%.*]] = icmp sgt i32 [[OP_EXTRA1]], [[TMP16]]
-; THRESH-NEXT: [[TMP18:%.*]] = select i1 [[TMP17]], i32 [[OP_EXTRA1]], i32 [[TMP16]]
-; THRESH-NEXT: [[TMP19:%.*]] = extractelement <2 x i1> [[TMP12]], i32 1
+; THRESH-NEXT: [[TMP7:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 7), align 4
+; THRESH-NEXT: [[TMP8:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[TMP5]])
+; THRESH-NEXT: [[TMP9:%.*]] = icmp sgt i32 [[TMP8]], [[TMP6]]
+; THRESH-NEXT: [[TMP10:%.*]] = select i1 [[TMP9]], i32 [[TMP8]], i32 [[TMP6]]
+; THRESH-NEXT: [[TMP11:%.*]] = insertelement <2 x i32> poison, i32 [[TMP10]], i32 0
+; THRESH-NEXT: [[TMP12:%.*]] = insertelement <2 x i32> [[TMP11]], i32 [[TMP3]], i32 1
+; THRESH-NEXT: [[TMP13:%.*]] = insertelement <2 x i32> poison, i32 [[TMP7]], i32 0
+; THRESH-NEXT: [[TMP14:%.*]] = insertelement <2 x i32> [[TMP13]], i32 [[TMP4]], i32 1
+; THRESH-NEXT: [[TMP15:%.*]] = icmp sgt <2 x i32> [[TMP12]], [[TMP14]]
+; THRESH-NEXT: [[TMP16:%.*]] = select <2 x i1> [[TMP15]], <2 x i32> [[TMP12]], <2 x i32> [[TMP14]]
+; THRESH-NEXT: [[TMP17:%.*]] = extractelement <2 x i32> [[TMP16]], i32 1
+; THRESH-NEXT: [[TMP18:%.*]] = extractelement <2 x i32> [[TMP16]], i32 0
+; THRESH-NEXT: [[OP_EXTRA:%.*]] = icmp sgt i32 [[TMP18]], [[TMP17]]
+; THRESH-NEXT: [[OP_EXTRA1:%.*]] = select i1 [[OP_EXTRA]], i32 [[TMP18]], i32 [[TMP17]]
+; THRESH-NEXT: [[TMP19:%.*]] = extractelement <2 x i1> [[TMP15]], i32 1
 ; THRESH-NEXT: [[TMP20:%.*]] = select i1 [[TMP19]], i32 3, i32 4
 ; THRESH-NEXT: store i32 [[TMP20]], i32* @var, align 8
-; THRESH-NEXT: ret i32 [[TMP18]]
+; THRESH-NEXT: ret i32 [[OP_EXTRA1]]
 ;
   %2 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 0), align 16
   %3 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 1), align 4