diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp --- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp +++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp @@ -28,6 +28,7 @@ #include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/CommandLine.h" +#include "llvm/Transforms/InstCombine/InstCombineWorklist.h" #include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Vectorize.h" @@ -73,6 +74,7 @@ const DominatorTree &DT; AAResults &AA; AssumptionCache &AC; + InstCombineWorklist Worklist; bool vectorizeLoadInsert(Instruction &I); ExtractElementInst *getShuffleExtract(ExtractElementInst *Ext0, @@ -92,14 +94,29 @@ bool foldExtractedCmps(Instruction &I); bool foldSingleElementStore(Instruction &I); bool scalarizeLoadExtract(Instruction &I); + + void replaceValue(Value &Old, Value &New) { + Old.replaceAllUsesWith(&New); + New.takeName(&Old); + if (auto *NewI = dyn_cast(&New)) { + Worklist.pushUsersToWorkList(*NewI); + Worklist.pushValue(NewI); + } + if (auto *OldI = dyn_cast(&Old)) { + Worklist.pushValue(OldI); + } + } + + void eraseInstruction(Instruction &I) { + for (Value *Op : I.operands()) + if (auto *OpI = dyn_cast(Op)) + Worklist.add(OpI); + Worklist.remove(&I); + I.eraseFromParent(); + } }; } // namespace -static void replaceValue(Value &Old, Value &New) { - Old.replaceAllUsesWith(&New); - New.takeName(&Old); -} - bool VectorCombine::vectorizeLoadInsert(Instruction &I) { // Match insert into fixed vector of scalar value. // TODO: Handle non-zero insert index. @@ -501,6 +518,8 @@ else foldExtExtBinop(Ext0, Ext1, I); + Worklist.push(Ext0); + Worklist.push(Ext1); return true; } @@ -927,8 +946,7 @@ DL); NSI->setAlignment(ScalarOpAlignment); replaceValue(I, *NSI); - // Need erasing the store manually. - I.eraseFromParent(); + eraseInstruction(I); return true; } @@ -938,11 +956,10 @@ /// Try to scalarize vector loads feeding extractelement instructions. bool VectorCombine::scalarizeLoadExtract(Instruction &I) { Value *Ptr; - Value *Idx; - if (!match(&I, m_ExtractElt(m_Load(m_Value(Ptr)), m_Value(Idx)))) + if (!match(&I, m_Load(m_Value(Ptr)))) return false; - auto *LI = cast(I.getOperand(0)); + auto *LI = cast(&I); const DataLayout &DL = I.getModule()->getDataLayout(); if (LI->isVolatile() || !DL.typeSizeEqualsStoreSize(LI->getType())) return false; @@ -1038,29 +1055,45 @@ return false; bool MadeChange = false; + auto SimplifyInst = [this](Instruction &I) { + Builder.SetInsertPoint(&I); + return vectorizeLoadInsert(I) || foldExtractExtract(I) || + foldBitcastShuf(I) || scalarizeBinopOrCmp(I) || + foldExtractedCmps(I) || scalarizeLoadExtract(I) || + foldSingleElementStore(I); + }; for (BasicBlock &BB : F) { // Ignore unreachable basic blocks. if (!DT.isReachableFromEntry(&BB)) continue; - // Use early increment range so that we can erase instructions in loop. - for (Instruction &I : make_early_inc_range(BB)) { - if (isa(I)) + for (Instruction &I : make_early_inc_range(BB.instructionsWithoutDebug())) + MadeChange |= SimplifyInst(I); + } + + while (!Worklist.isEmpty()) { + // Walk deferred instructions in reverse order, and push them to the + // worklist, which means they'll end up popped from the worklist in-order. + while (Instruction *I = Worklist.popDeferred()) { + // Check to see if we can DCE the instruction. We do this already here to + // reduce the number of uses and thus allow other folds to trigger. + if (isInstructionTriviallyDead(I)) { + eraseInstruction(*I); continue; - Builder.SetInsertPoint(&I); - MadeChange |= vectorizeLoadInsert(I); - MadeChange |= foldExtractExtract(I); - MadeChange |= foldBitcastShuf(I); - MadeChange |= scalarizeBinopOrCmp(I); - MadeChange |= foldExtractedCmps(I); - MadeChange |= scalarizeLoadExtract(I); - MadeChange |= foldSingleElementStore(I); + } + Worklist.push(I); } - } - // We're done with transforms, so remove dead instructions. - if (MadeChange) - for (BasicBlock &BB : F) - SimplifyInstructionsInBlock(&BB); + Instruction *I = Worklist.removeOne(); + if (!I) + continue; + + if (isInstructionTriviallyDead(I)) { + eraseInstruction(*I); + continue; + } + + MadeChange |= SimplifyInst(*I); + } return MadeChange; } diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/matrix-extract-insert.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/matrix-extract-insert.ll --- a/llvm/test/Transforms/PhaseOrdering/AArch64/matrix-extract-insert.ll +++ b/llvm/test/Transforms/PhaseOrdering/AArch64/matrix-extract-insert.ll @@ -20,13 +20,14 @@ ; CHECK-NEXT: [[TMP6:%.*]] = icmp ult i64 [[TMP5]], 225 ; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP6]]) ; CHECK-NEXT: [[TMP7:%.*]] = bitcast [225 x double]* [[B:%.*]] to <225 x double>* -; CHECK-NEXT: [[TMP8:%.*]] = load <225 x double>, <225 x double>* [[TMP7]], align 8 -; CHECK-NEXT: [[MATRIXEXT4:%.*]] = extractelement <225 x double> [[TMP8]], i64 [[TMP5]] +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds <225 x double>, <225 x double>* [[TMP7]], i64 0, i64 [[TMP5]] +; CHECK-NEXT: [[MATRIXEXT4:%.*]] = load double, double* [[TMP8]], align 8 ; CHECK-NEXT: [[MUL:%.*]] = fmul double [[MATRIXEXT]], [[MATRIXEXT4]] -; CHECK-NEXT: [[MATRIXEXT7:%.*]] = extractelement <225 x double> [[TMP8]], i64 [[TMP1]] -; CHECK-NEXT: [[SUB:%.*]] = fsub double [[MATRIXEXT7]], [[MUL]] ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds <225 x double>, <225 x double>* [[TMP7]], i64 0, i64 [[TMP1]] -; CHECK-NEXT: store double [[SUB]], double* [[TMP9]], align 8 +; CHECK-NEXT: [[MATRIXEXT7:%.*]] = load double, double* [[TMP9]], align 8 +; CHECK-NEXT: [[SUB:%.*]] = fsub double [[MATRIXEXT7]], [[MUL]] +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds <225 x double>, <225 x double>* [[TMP7]], i64 0, i64 [[TMP1]] +; CHECK-NEXT: store double [[SUB]], double* [[TMP10]], align 8 ; CHECK-NEXT: ret void ; entry: diff --git a/llvm/test/Transforms/VectorCombine/AArch64/load-extract-insert-store-scalarization.ll b/llvm/test/Transforms/VectorCombine/AArch64/load-extract-insert-store-scalarization.ll --- a/llvm/test/Transforms/VectorCombine/AArch64/load-extract-insert-store-scalarization.ll +++ b/llvm/test/Transforms/VectorCombine/AArch64/load-extract-insert-store-scalarization.ll @@ -6,18 +6,19 @@ define void @load_extract_insert_store_const_idx(<225 x double>* %A) { ; CHECK-LABEL: @load_extract_insert_store_const_idx( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[LV:%.*]] = load <225 x double>, <225 x double>* [[A:%.*]], align 8 -; CHECK-NEXT: [[EXT_0:%.*]] = extractelement <225 x double> [[LV]], i64 0 +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds <225 x double>, <225 x double>* [[A:%.*]], i32 0, i64 1 +; CHECK-NEXT: [[EXT_0:%.*]] = load double, double* [[TMP0]], align 8 ; CHECK-NEXT: [[MUL:%.*]] = fmul double 2.000000e+01, [[EXT_0]] -; CHECK-NEXT: [[EXT_1:%.*]] = extractelement <225 x double> [[LV]], i64 1 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds <225 x double>, <225 x double>* [[A]], i32 0, i64 1 +; CHECK-NEXT: [[EXT_1:%.*]] = load double, double* [[TMP1]], align 8 ; CHECK-NEXT: [[SUB:%.*]] = fsub double [[EXT_1]], [[MUL]] -; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds <225 x double>, <225 x double>* [[A]], i64 0, i64 1 -; CHECK-NEXT: store double [[SUB]], double* [[TMP0]], align 8 +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds <225 x double>, <225 x double>* [[A]], i64 0, i64 1 +; CHECK-NEXT: store double [[SUB]], double* [[TMP2]], align 8 ; CHECK-NEXT: ret void ; entry: %lv = load <225 x double>, <225 x double>* %A, align 8 - %ext.0 = extractelement <225 x double> %lv, i64 0 + %ext.0 = extractelement <225 x double> %lv, i64 1 %mul = fmul double 20.0, %ext.0 %ext.1 = extractelement <225 x double> %lv, i64 1 %sub = fsub double %ext.1, %mul @@ -33,13 +34,14 @@ ; CHECK-NEXT: call void @llvm.assume(i1 [[CMP_1]]) ; CHECK-NEXT: [[CMP_2:%.*]] = icmp ult i64 [[IDX_2:%.*]], 225 ; CHECK-NEXT: call void @llvm.assume(i1 [[CMP_2]]) -; CHECK-NEXT: [[LV:%.*]] = load <225 x double>, <225 x double>* [[A:%.*]], align 8 -; CHECK-NEXT: [[EXT_0:%.*]] = extractelement <225 x double> [[LV]], i64 [[IDX_1]] +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds <225 x double>, <225 x double>* [[A:%.*]], i32 0, i64 [[IDX_1]] +; CHECK-NEXT: [[EXT_0:%.*]] = load double, double* [[TMP0]], align 8 ; CHECK-NEXT: [[MUL:%.*]] = fmul double 2.000000e+01, [[EXT_0]] -; CHECK-NEXT: [[EXT_1:%.*]] = extractelement <225 x double> [[LV]], i64 [[IDX_2]] +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds <225 x double>, <225 x double>* [[A]], i32 0, i64 [[IDX_2]] +; CHECK-NEXT: [[EXT_1:%.*]] = load double, double* [[TMP1]], align 8 ; CHECK-NEXT: [[SUB:%.*]] = fsub double [[EXT_1]], [[MUL]] -; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds <225 x double>, <225 x double>* [[A]], i64 0, i64 [[IDX_1]] -; CHECK-NEXT: store double [[SUB]], double* [[TMP0]], align 8 +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds <225 x double>, <225 x double>* [[A]], i64 0, i64 [[IDX_1]] +; CHECK-NEXT: store double [[SUB]], double* [[TMP2]], align 8 ; CHECK-NEXT: ret void ; entry: diff --git a/llvm/test/Transforms/VectorCombine/X86/extract-binop-inseltpoison.ll b/llvm/test/Transforms/VectorCombine/X86/extract-binop-inseltpoison.ll --- a/llvm/test/Transforms/VectorCombine/X86/extract-binop-inseltpoison.ll +++ b/llvm/test/Transforms/VectorCombine/X86/extract-binop-inseltpoison.ll @@ -453,7 +453,9 @@ define <4 x float> @PR34724(<4 x float> %a, <4 x float> %b) { ; CHECK-LABEL: @PR34724( -; CHECK-NEXT: [[SHIFT:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> poison, <4 x i32> +; CHECK-NEXT: [[A0:%.*]] = extractelement <4 x float> [[A:%.*]], i32 0 +; CHECK-NEXT: [[A1:%.*]] = extractelement <4 x float> [[A]], i32 1 +; CHECK-NEXT: [[SHIFT:%.*]] = shufflevector <4 x float> [[A]], <4 x float> poison, <4 x i32> ; CHECK-NEXT: [[TMP1:%.*]] = fadd <4 x float> [[A]], [[SHIFT]] ; CHECK-NEXT: [[A23:%.*]] = extractelement <4 x float> [[TMP1]], i32 2 ; CHECK-NEXT: [[SHIFT1:%.*]] = shufflevector <4 x float> [[B:%.*]], <4 x float> poison, <4 x i32> diff --git a/llvm/test/Transforms/VectorCombine/X86/extract-binop.ll b/llvm/test/Transforms/VectorCombine/X86/extract-binop.ll --- a/llvm/test/Transforms/VectorCombine/X86/extract-binop.ll +++ b/llvm/test/Transforms/VectorCombine/X86/extract-binop.ll @@ -453,7 +453,9 @@ define <4 x float> @PR34724(<4 x float> %a, <4 x float> %b) { ; CHECK-LABEL: @PR34724( -; CHECK-NEXT: [[SHIFT:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> poison, <4 x i32> +; CHECK-NEXT: [[A0:%.*]] = extractelement <4 x float> [[A:%.*]], i32 0 +; CHECK-NEXT: [[A1:%.*]] = extractelement <4 x float> [[A]], i32 1 +; CHECK-NEXT: [[SHIFT:%.*]] = shufflevector <4 x float> [[A]], <4 x float> poison, <4 x i32> ; CHECK-NEXT: [[TMP1:%.*]] = fadd <4 x float> [[A]], [[SHIFT]] ; CHECK-NEXT: [[A23:%.*]] = extractelement <4 x float> [[TMP1]], i32 2 ; CHECK-NEXT: [[SHIFT1:%.*]] = shufflevector <4 x float> [[B:%.*]], <4 x float> poison, <4 x i32> diff --git a/llvm/test/Transforms/VectorCombine/load-insert-store.ll b/llvm/test/Transforms/VectorCombine/load-insert-store.ll --- a/llvm/test/Transforms/VectorCombine/load-insert-store.ll +++ b/llvm/test/Transforms/VectorCombine/load-insert-store.ll @@ -465,7 +465,9 @@ ; CHECK-LABEL: @insert_store_ptr_strip( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[ADDR0:%.*]] = bitcast <16 x i8>* [[Q:%.*]] to <2 x i64>* -; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds <16 x i8>, <16 x i8>* [[Q]], i32 0, i32 3 +; CHECK-NEXT: [[ADDR1:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[ADDR0]], i64 0 +; CHECK-NEXT: [[ADDR2:%.*]] = bitcast <2 x i64>* [[ADDR1]] to <16 x i8>* +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds <16 x i8>, <16 x i8>* [[ADDR2]], i32 0, i32 3 ; CHECK-NEXT: store i8 [[S:%.*]], i8* [[TMP0]], align 1 ; CHECK-NEXT: ret void ;