diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp --- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp +++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp @@ -31,10 +31,12 @@ #include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Vectorize.h" +#define DEBUG_TYPE "vector-combine" +#include "llvm/Transforms/Utils/InstructionWorklist.h" + using namespace llvm; using namespace llvm::PatternMatch; -#define DEBUG_TYPE "vector-combine" STATISTIC(NumVecLoad, "Number of vector loads formed"); STATISTIC(NumVecCmp, "Number of vector compares formed"); STATISTIC(NumVecBO, "Number of vector binops formed"); @@ -73,6 +75,7 @@ const DominatorTree &DT; AAResults &AA; AssumptionCache &AC; + InstructionWorklist Worklist; bool vectorizeLoadInsert(Instruction &I); ExtractElementInst *getShuffleExtract(ExtractElementInst *Ext0, @@ -92,14 +95,26 @@ bool foldExtractedCmps(Instruction &I); bool foldSingleElementStore(Instruction &I); bool scalarizeLoadExtract(Instruction &I); + + void replaceValue(Value &Old, Value &New) { + Old.replaceAllUsesWith(&New); + New.takeName(&Old); + if (auto *NewI = dyn_cast(&New)) { + Worklist.pushUsersToWorkList(*NewI); + Worklist.pushValue(NewI); + } + Worklist.pushValue(&Old); + } + + void eraseInstruction(Instruction &I) { + for (Value *Op : I.operands()) + Worklist.pushValue(Op); + Worklist.remove(&I); + I.eraseFromParent(); + } }; } // namespace -static void replaceValue(Value &Old, Value &New) { - Old.replaceAllUsesWith(&New); - New.takeName(&Old); -} - bool VectorCombine::vectorizeLoadInsert(Instruction &I) { // Match insert into fixed vector of scalar value. // TODO: Handle non-zero insert index. @@ -501,6 +516,8 @@ else foldExtExtBinop(Ext0, Ext1, I); + Worklist.push(Ext0); + Worklist.push(Ext1); return true; } @@ -928,8 +945,7 @@ DL); NSI->setAlignment(ScalarOpAlignment); replaceValue(I, *NSI); - // Need erasing the store manually. - I.eraseFromParent(); + eraseInstruction(I); return true; } @@ -939,11 +955,10 @@ /// Try to scalarize vector loads feeding extractelement instructions. bool VectorCombine::scalarizeLoadExtract(Instruction &I) { Value *Ptr; - Value *Idx; - if (!match(&I, m_ExtractElt(m_Load(m_Value(Ptr)), m_Value(Idx)))) + if (!match(&I, m_Load(m_Value(Ptr)))) return false; - auto *LI = cast(I.getOperand(0)); + auto *LI = cast(&I); const DataLayout &DL = I.getModule()->getDataLayout(); if (LI->isVolatile() || !DL.typeSizeEqualsStoreSize(LI->getType())) return false; @@ -1039,6 +1054,16 @@ return false; bool MadeChange = false; + auto FoldInst = [this, &MadeChange](Instruction &I) { + Builder.SetInsertPoint(&I); + MadeChange |= vectorizeLoadInsert(I); + MadeChange |= foldExtractExtract(I); + MadeChange |= foldBitcastShuf(I); + MadeChange |= scalarizeBinopOrCmp(I); + MadeChange |= foldExtractedCmps(I); + MadeChange |= scalarizeLoadExtract(I); + MadeChange |= foldSingleElementStore(I); + }; for (BasicBlock &BB : F) { // Ignore unreachable basic blocks. if (!DT.isReachableFromEntry(&BB)) @@ -1047,21 +1072,22 @@ for (Instruction &I : make_early_inc_range(BB)) { if (isa(I)) continue; - Builder.SetInsertPoint(&I); - MadeChange |= vectorizeLoadInsert(I); - MadeChange |= foldExtractExtract(I); - MadeChange |= foldBitcastShuf(I); - MadeChange |= scalarizeBinopOrCmp(I); - MadeChange |= foldExtractedCmps(I); - MadeChange |= scalarizeLoadExtract(I); - MadeChange |= foldSingleElementStore(I); + FoldInst(I); } } - // We're done with transforms, so remove dead instructions. - if (MadeChange) - for (BasicBlock &BB : F) - SimplifyInstructionsInBlock(&BB); + while (!Worklist.isEmpty()) { + Instruction *I = Worklist.removeOne(); + if (!I) + continue; + + if (isInstructionTriviallyDead(I)) { + eraseInstruction(*I); + continue; + } + + FoldInst(*I); + } return MadeChange; } diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/matrix-extract-insert.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/matrix-extract-insert.ll --- a/llvm/test/Transforms/PhaseOrdering/AArch64/matrix-extract-insert.ll +++ b/llvm/test/Transforms/PhaseOrdering/AArch64/matrix-extract-insert.ll @@ -20,13 +20,14 @@ ; CHECK-NEXT: [[TMP6:%.*]] = icmp ult i64 [[TMP5]], 225 ; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP6]]) ; CHECK-NEXT: [[TMP7:%.*]] = bitcast [225 x double]* [[B:%.*]] to <225 x double>* -; CHECK-NEXT: [[TMP8:%.*]] = load <225 x double>, <225 x double>* [[TMP7]], align 8 -; CHECK-NEXT: [[MATRIXEXT4:%.*]] = extractelement <225 x double> [[TMP8]], i64 [[TMP5]] +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds <225 x double>, <225 x double>* [[TMP7]], i64 0, i64 [[TMP5]] +; CHECK-NEXT: [[MATRIXEXT4:%.*]] = load double, double* [[TMP8]], align 8 ; CHECK-NEXT: [[MUL:%.*]] = fmul double [[MATRIXEXT]], [[MATRIXEXT4]] -; CHECK-NEXT: [[MATRIXEXT7:%.*]] = extractelement <225 x double> [[TMP8]], i64 [[TMP1]] -; CHECK-NEXT: [[SUB:%.*]] = fsub double [[MATRIXEXT7]], [[MUL]] ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds <225 x double>, <225 x double>* [[TMP7]], i64 0, i64 [[TMP1]] -; CHECK-NEXT: store double [[SUB]], double* [[TMP9]], align 8 +; CHECK-NEXT: [[MATRIXEXT7:%.*]] = load double, double* [[TMP9]], align 8 +; CHECK-NEXT: [[SUB:%.*]] = fsub double [[MATRIXEXT7]], [[MUL]] +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds <225 x double>, <225 x double>* [[TMP7]], i64 0, i64 [[TMP1]] +; CHECK-NEXT: store double [[SUB]], double* [[TMP10]], align 8 ; CHECK-NEXT: ret void ; entry: diff --git a/llvm/test/Transforms/VectorCombine/AArch64/load-extract-insert-store-scalarization.ll b/llvm/test/Transforms/VectorCombine/AArch64/load-extract-insert-store-scalarization.ll --- a/llvm/test/Transforms/VectorCombine/AArch64/load-extract-insert-store-scalarization.ll +++ b/llvm/test/Transforms/VectorCombine/AArch64/load-extract-insert-store-scalarization.ll @@ -6,13 +6,14 @@ define void @load_extract_insert_store_const_idx(<225 x double>* %A) { ; CHECK-LABEL: @load_extract_insert_store_const_idx( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[LV:%.*]] = load <225 x double>, <225 x double>* [[A:%.*]], align 8 -; CHECK-NEXT: [[EXT_0:%.*]] = extractelement <225 x double> [[LV]], i64 0 +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds <225 x double>, <225 x double>* [[A:%.*]], i32 0, i64 0 +; CHECK-NEXT: [[EXT_0:%.*]] = load double, double* [[TMP0]], align 8 ; CHECK-NEXT: [[MUL:%.*]] = fmul double 2.000000e+01, [[EXT_0]] -; CHECK-NEXT: [[EXT_1:%.*]] = extractelement <225 x double> [[LV]], i64 1 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds <225 x double>, <225 x double>* [[A]], i32 0, i64 1 +; CHECK-NEXT: [[EXT_1:%.*]] = load double, double* [[TMP1]], align 8 ; CHECK-NEXT: [[SUB:%.*]] = fsub double [[EXT_1]], [[MUL]] -; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds <225 x double>, <225 x double>* [[A]], i64 0, i64 1 -; CHECK-NEXT: store double [[SUB]], double* [[TMP0]], align 8 +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds <225 x double>, <225 x double>* [[A]], i64 0, i64 1 +; CHECK-NEXT: store double [[SUB]], double* [[TMP2]], align 8 ; CHECK-NEXT: ret void ; entry: @@ -33,13 +34,14 @@ ; CHECK-NEXT: call void @llvm.assume(i1 [[CMP_1]]) ; CHECK-NEXT: [[CMP_2:%.*]] = icmp ult i64 [[IDX_2:%.*]], 225 ; CHECK-NEXT: call void @llvm.assume(i1 [[CMP_2]]) -; CHECK-NEXT: [[LV:%.*]] = load <225 x double>, <225 x double>* [[A:%.*]], align 8 -; CHECK-NEXT: [[EXT_0:%.*]] = extractelement <225 x double> [[LV]], i64 [[IDX_1]] +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds <225 x double>, <225 x double>* [[A:%.*]], i32 0, i64 [[IDX_1]] +; CHECK-NEXT: [[EXT_0:%.*]] = load double, double* [[TMP0]], align 8 ; CHECK-NEXT: [[MUL:%.*]] = fmul double 2.000000e+01, [[EXT_0]] -; CHECK-NEXT: [[EXT_1:%.*]] = extractelement <225 x double> [[LV]], i64 [[IDX_2]] +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds <225 x double>, <225 x double>* [[A]], i32 0, i64 [[IDX_2]] +; CHECK-NEXT: [[EXT_1:%.*]] = load double, double* [[TMP1]], align 8 ; CHECK-NEXT: [[SUB:%.*]] = fsub double [[EXT_1]], [[MUL]] -; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds <225 x double>, <225 x double>* [[A]], i64 0, i64 [[IDX_1]] -; CHECK-NEXT: store double [[SUB]], double* [[TMP0]], align 8 +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds <225 x double>, <225 x double>* [[A]], i64 0, i64 [[IDX_1]] +; CHECK-NEXT: store double [[SUB]], double* [[TMP2]], align 8 ; CHECK-NEXT: ret void ; entry: @@ -69,13 +71,14 @@ ; CHECK-NEXT: call void @llvm.assume(i1 [[CMP_2]]) ; CHECK-NEXT: br i1 [[C_1:%.*]], label [[LOOP:%.*]], label [[EXIT:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[LV:%.*]] = load <225 x double>, <225 x double>* [[A:%.*]], align 8 -; CHECK-NEXT: [[EXT_0:%.*]] = extractelement <225 x double> [[LV]], i64 [[IDX_1]] +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds <225 x double>, <225 x double>* [[A:%.*]], i32 0, i64 [[IDX_1]] +; CHECK-NEXT: [[EXT_0:%.*]] = load double, double* [[TMP0]], align 8 ; CHECK-NEXT: [[MUL:%.*]] = fmul double 2.000000e+01, [[EXT_0]] -; CHECK-NEXT: [[EXT_1:%.*]] = extractelement <225 x double> [[LV]], i64 [[IDX_2]] +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds <225 x double>, <225 x double>* [[A]], i32 0, i64 [[IDX_2]] +; CHECK-NEXT: [[EXT_1:%.*]] = load double, double* [[TMP1]], align 8 ; CHECK-NEXT: [[SUB:%.*]] = fsub double [[EXT_1]], [[MUL]] -; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds <225 x double>, <225 x double>* [[A]], i64 0, i64 [[IDX_1]] -; CHECK-NEXT: store double [[SUB]], double* [[TMP0]], align 8 +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds <225 x double>, <225 x double>* [[A]], i64 0, i64 [[IDX_1]] +; CHECK-NEXT: store double [[SUB]], double* [[TMP2]], align 8 ; CHECK-NEXT: [[C_2:%.*]] = call i1 @cond() ; CHECK-NEXT: br i1 [[C_2]], label [[LOOP]], label [[EXIT]] ; CHECK: exit: diff --git a/llvm/test/Transforms/VectorCombine/X86/extract-binop-inseltpoison.ll b/llvm/test/Transforms/VectorCombine/X86/extract-binop-inseltpoison.ll --- a/llvm/test/Transforms/VectorCombine/X86/extract-binop-inseltpoison.ll +++ b/llvm/test/Transforms/VectorCombine/X86/extract-binop-inseltpoison.ll @@ -453,7 +453,9 @@ define <4 x float> @PR34724(<4 x float> %a, <4 x float> %b) { ; CHECK-LABEL: @PR34724( -; CHECK-NEXT: [[SHIFT:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> poison, <4 x i32> +; CHECK-NEXT: [[A0:%.*]] = extractelement <4 x float> [[A:%.*]], i32 0 +; CHECK-NEXT: [[A1:%.*]] = extractelement <4 x float> [[A]], i32 1 +; CHECK-NEXT: [[SHIFT:%.*]] = shufflevector <4 x float> [[A]], <4 x float> poison, <4 x i32> ; CHECK-NEXT: [[TMP1:%.*]] = fadd <4 x float> [[A]], [[SHIFT]] ; CHECK-NEXT: [[A23:%.*]] = extractelement <4 x float> [[TMP1]], i32 2 ; CHECK-NEXT: [[SHIFT1:%.*]] = shufflevector <4 x float> [[B:%.*]], <4 x float> poison, <4 x i32> diff --git a/llvm/test/Transforms/VectorCombine/X86/extract-binop.ll b/llvm/test/Transforms/VectorCombine/X86/extract-binop.ll --- a/llvm/test/Transforms/VectorCombine/X86/extract-binop.ll +++ b/llvm/test/Transforms/VectorCombine/X86/extract-binop.ll @@ -453,7 +453,9 @@ define <4 x float> @PR34724(<4 x float> %a, <4 x float> %b) { ; CHECK-LABEL: @PR34724( -; CHECK-NEXT: [[SHIFT:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> poison, <4 x i32> +; CHECK-NEXT: [[A0:%.*]] = extractelement <4 x float> [[A:%.*]], i32 0 +; CHECK-NEXT: [[A1:%.*]] = extractelement <4 x float> [[A]], i32 1 +; CHECK-NEXT: [[SHIFT:%.*]] = shufflevector <4 x float> [[A]], <4 x float> poison, <4 x i32> ; CHECK-NEXT: [[TMP1:%.*]] = fadd <4 x float> [[A]], [[SHIFT]] ; CHECK-NEXT: [[A23:%.*]] = extractelement <4 x float> [[TMP1]], i32 2 ; CHECK-NEXT: [[SHIFT1:%.*]] = shufflevector <4 x float> [[B:%.*]], <4 x float> poison, <4 x i32> diff --git a/llvm/test/Transforms/VectorCombine/load-insert-store.ll b/llvm/test/Transforms/VectorCombine/load-insert-store.ll --- a/llvm/test/Transforms/VectorCombine/load-insert-store.ll +++ b/llvm/test/Transforms/VectorCombine/load-insert-store.ll @@ -465,7 +465,9 @@ ; CHECK-LABEL: @insert_store_ptr_strip( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[ADDR0:%.*]] = bitcast <16 x i8>* [[Q:%.*]] to <2 x i64>* -; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds <16 x i8>, <16 x i8>* [[Q]], i32 0, i32 3 +; CHECK-NEXT: [[ADDR1:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[ADDR0]], i64 0 +; CHECK-NEXT: [[ADDR2:%.*]] = bitcast <2 x i64>* [[ADDR1]] to <16 x i8>* +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds <16 x i8>, <16 x i8>* [[ADDR2]], i32 0, i32 3 ; CHECK-NEXT: store i8 [[S:%.*]], i8* [[TMP0]], align 1 ; CHECK-NEXT: ret void ;