diff --git a/llvm/lib/Transforms/Scalar/NewGVN.cpp b/llvm/lib/Transforms/Scalar/NewGVN.cpp
--- a/llvm/lib/Transforms/Scalar/NewGVN.cpp
+++ b/llvm/lib/Transforms/Scalar/NewGVN.cpp
@@ -656,6 +656,15 @@
   // Deletion info.
   SmallPtrSet<Instruction *, 8> InstructionsToErase;
 
+  // Keeps the load instructions that should be optimized with load coercion.
+  // The key is the load instruction that should be optimized and the mapped
+  // value is the memory access of the memory instruction that the load
+  // depends on.
+  std::map<Instruction *, MemoryAccess *> LoadCoercion;
+
+  // Keeps the load instructions that have been optimized with load coercion.
+  SmallPtrSet<LoadInst *, 8> OptimizedLoads;
+
 public:
   NewGVN(Function &F, DominatorTree *DT, AssumptionCache *AC,
          TargetLibraryInfo *TLI, AliasAnalysis *AA, MemorySSA *MSSA,
@@ -833,6 +842,7 @@
                                     SmallVectorImpl<ValueDFS> &) const;
 
   bool eliminateInstructions(Function &);
+  bool implementLoadCoercion();
   void replaceInstruction(Instruction *, Value *);
   void markInstructionForDeletion(Instruction *);
   void deleteInstructionsInBlock(BasicBlock *);
@@ -1456,6 +1466,14 @@
                           << " to constant " << *C << "\n");
         return createConstantExpression(
             getConstantStoreValueForLoad(C, Offset, LoadType, DL));
+      } else {
+        // Record the load instructions that can be optimized with load
+        // coercion in the LoadCoercion map.
+        const_cast<NewGVN *>(this)->LoadCoercion[LI] = DefiningAccess;
+        // Load coercion occurs before the elimination phase. The loads that
+        // can be optimized with load coercion are not added to any congruence
+        // class. Thus, we do not create any load expression for them.
+        return nullptr;
       }
     }
   } else if (auto *DepLI = dyn_cast<LoadInst>(DepInst)) {
@@ -2977,6 +2995,8 @@
   MemoryToUsers.clear();
   RevisitOnReachabilityChange.clear();
   IntrinsicInstPred.clear();
+  LoadCoercion.clear();
+  OptimizedLoads.clear();
 }
 
 // Assign local DFS number mapping to instructions, and leave space for Value
@@ -3481,6 +3501,17 @@
   verifyIterationSettled(F);
   verifyStoreExpressions();
 
+  // During load coercion, we replace the candidate load instructions with a
+  // new sequence of instructions. Next, we run value numbering, which adds the
+  // new instructions to the right congruence classes. In this way, any
+  // redundant instructions will be optimized away in the elimination phase.
+  // Hence, it makes sense to do load coercion before the elimination phase.
+  if (implementLoadCoercion())
+    // The newly generated instructions need to get a DFS number for the
+    // elimination phase.
+    // TODO: Update DFS numbers faster.
+    ICount = updateDFSNumbers(ICount);
+
   Changed |= eliminateInstructions(F);
 
   // Delete all instructions marked for deletion.
@@ -3816,6 +3847,100 @@
   return nullptr;
 }
 
+// Iterate over the load instructions of the LoadCoercion map and replace them
+// with the right sequence of instructions. Next, check whether any other load
+// in the LoadCoercion map can be optimized with the newly generated
+// instructions. Finally, run value numbering for the new instructions so that
+// any redundant instructions are eliminated in the elimination phase.
+bool NewGVN::implementLoadCoercion() {
+  bool AnythingReplaced = false;
+  for (const auto &P : LoadCoercion) {
+    LoadInst *LoadToOptimize = cast<LoadInst>(P.first);
+    // The LoadCoercion map might have multiple load instructions that can be
+    // optimized with the same sequence of instructions. In this case, we
+    // process only the first load and optimize the other loads with the
+    // sequence of instructions that we emitted for the first load. Here, we
+    // skip the loads that have already been processed.
+    if (OptimizedLoads.count(LoadToOptimize))
+      continue;
+
+    Type *LoadToOptimizeTy = LoadToOptimize->getType();
+    Value *NewValue = nullptr;
+    MemoryAccess *MA = P.second;
+    Instruction *DependingInsn = cast<MemoryUseOrDef>(MA)->getMemoryInst();
+    BasicBlock *DependingInsnBB = DependingInsn->getParent();
+    Instruction *InsnBeforeLoadToOptimize = LoadToOptimize->getPrevNode();
+    Instruction *InsnBeforeTerminatorInDependingInsnBB =
+        DependingInsnBB->getTerminator()->getPrevNode();
+    bool DefUseAreInDifferentBB =
+        DependingInsnBB != LoadToOptimize->getParent();
+    // If the load and the depending instruction are in different basic blocks,
+    // then the new sequence of instructions is emitted at the end of the basic
+    // block of the depending instruction. In this way, the newly generated
+    // instructions can be used by other loads in basic blocks that are
+    // dominated by the basic block of the depending instruction. If the load
+    // and the depending instruction are in the same basic block, then we emit
+    // them just before the load.
+    Instruction *InsertPtr = DefUseAreInDifferentBB
+                                 ? DependingInsnBB->getTerminator()
+                                 : LoadToOptimize;
+    if (StoreInst *Store = dyn_cast<StoreInst>(DependingInsn)) {
+      int Offset = analyzeLoadFromClobberingStore(
+          LoadToOptimizeTy, LoadToOptimize->getPointerOperand(), Store, DL);
+      // Emit the sequence of instructions that replaces the load.
+      NewValue = getStoreValueForLoad(Store->getValueOperand(), Offset,
+                                      LoadToOptimizeTy, InsertPtr, DL);
+    }
+    OptimizedLoads.insert(LoadToOptimize);
+    InstructionsToErase.insert(LoadToOptimize);
+    LoadToOptimize->replaceAllUsesWith(NewValue);
+    LLVM_DEBUG(dbgs() << "Load coercion: The load " << *LoadToOptimize
+                      << " was eliminated and its uses were replaced by "
+                      << *NewValue << "\n");
+
+    // Iterate over the LoadCoercion map and check if there is any other load
+    // instruction that can be replaced with the same sequence of instructions.
+    for (const auto &Q : LoadCoercion) {
+      LoadInst *LI = cast<LoadInst>(Q.first);
+
+      if (LI == LoadToOptimize)
+        continue;
+
+      // Check if the two load instructions have the same type and if the
+      // memory instructions that they depend on have the same memory access.
+      if (LoadToOptimize->getType() == LI->getType() && MA == Q.second) {
+        OptimizedLoads.insert(LI);
+        InstructionsToErase.insert(LI);
+        LI->replaceAllUsesWith(NewValue);
+        LLVM_DEBUG(dbgs() << "Load coercion: The load " << *LI
+                          << " was eliminated and its uses were replaced by "
+                          << *NewValue << "\n");
+      }
+    }
+
+    // Collect the newly generated instructions and run value numbering for
+    // them. In this way, the new instructions will be inserted in the
+    // corresponding congruence classes. In case any of these instructions are
+    // redundant, they will be optimized away in the elimination phase.
+    Instruction *FirstNewlyGeneratedInsn =
+        DefUseAreInDifferentBB
+            ? InsnBeforeTerminatorInDependingInsnBB->getNextNode()
+            : InsnBeforeLoadToOptimize->getNextNode();
+    Instruction *LastNewlyGeneratedInsn = DefUseAreInDifferentBB
+                                              ? DependingInsnBB->getTerminator()
+                                              : LoadToOptimize;
+    for (Instruction *CurInsn = FirstNewlyGeneratedInsn;
+         CurInsn != LastNewlyGeneratedInsn; CurInsn = CurInsn->getNextNode()) {
+      TOPClass->insert(CurInsn);
+      ValueToClass[CurInsn] = TOPClass;
+      valueNumberInstruction(CurInsn);
+      updateProcessedCount(CurInsn);
+    }
+    AnythingReplaced = true;
+  }
+  return AnythingReplaced;
+}
+
 bool NewGVN::eliminateInstructions(Function &F) {
   // This is a non-standard eliminator. The normal way to eliminate is
   // to walk the dominator tree in order, keeping track of available
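Note (not part of the patch): the replacement sequence is produced by the existing VNCoercion helpers that implementLoadCoercion() calls above, namely analyzeLoadFromClobberingStore and getStoreValueForLoad. The following minimal sketch shows how those two calls fit together for a single store/load pair; the free-standing helper name tryForwardStoreToLoad and its standalone framing are illustrative assumptions, while the two VNCoercion calls mirror the patch.

// Illustrative sketch only: drives the same VNCoercion helpers that
// implementLoadCoercion() uses. The helper name is hypothetical.
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Instructions.h"
#include "llvm/Transforms/Utils/VNCoercion.h"

using namespace llvm;

// Returns a value equivalent to what Load would read out of DepStore's
// stored operand, or nullptr if the stored bits cannot be reused.
static Value *tryForwardStoreToLoad(LoadInst *Load, StoreInst *DepStore,
                                    const DataLayout &DL) {
  using namespace VNCoercion;
  Type *LoadTy = Load->getType();
  // A negative offset means the store does not cover the loaded bits.
  int Offset = analyzeLoadFromClobberingStore(
      LoadTy, Load->getPointerOperand(), DepStore, DL);
  if (Offset < 0)
    return nullptr;
  // Emits the coercion sequence (bitcast/lshr/trunc/inttoptr as needed)
  // right before Load and returns the coerced value.
  return getStoreValueForLoad(DepStore->getValueOperand(), Offset, LoadTy,
                              Load, DL);
}

The patch differs from this sketch in one respect: when the load and the clobbering store are in different basic blocks, NewGVN inserts the sequence at the end of the store's block rather than before the load, so dominated loads (see test9 and test11 below) can reuse the same instructions.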
diff --git a/llvm/test/Transforms/NewGVN/load_coercion_between_store_and_load.ll b/llvm/test/Transforms/NewGVN/load_coercion_between_store_and_load.ll
--- a/llvm/test/Transforms/NewGVN/load_coercion_between_store_and_load.ll
+++ b/llvm/test/Transforms/NewGVN/load_coercion_between_store_and_load.ll
@@ -12,8 +12,8 @@
 ; NEWGVN-LABEL: @test1(
 ; NEWGVN-NEXT:    store i32 [[V1:%.*]], i32* [[P:%.*]], align 4
 ; NEWGVN-NEXT:    [[P1:%.*]] = bitcast i32* [[P]] to float*
-; NEWGVN-NEXT:    [[V2:%.*]] = load float, float* [[P1]], align 4
-; NEWGVN-NEXT:    ret float [[V2]]
+; NEWGVN-NEXT:    [[TMP1:%.*]] = bitcast i32 [[V1]] to float
+; NEWGVN-NEXT:    ret float [[TMP1]]
 ;
   store i32 %V1, i32* %P
   %P1 = bitcast i32* %P to float*
@@ -33,8 +33,10 @@
 ; NEWGVN-LABEL: @test2(
 ; NEWGVN-NEXT:    store i64* [[V1:%.*]], i64** [[P1:%.*]], align 8
 ; NEWGVN-NEXT:    [[P2:%.*]] = bitcast i64** [[P1]] to float*
-; NEWGVN-NEXT:    [[V2:%.*]] = load float, float* [[P2]], align 4
-; NEWGVN-NEXT:    ret float [[V2]]
+; NEWGVN-NEXT:    [[TMP1:%.*]] = ptrtoint i64* [[V1]] to i64
+; NEWGVN-NEXT:    [[TMP2:%.*]] = trunc i64 [[TMP1]] to i32
+; NEWGVN-NEXT:    [[TMP3:%.*]] = bitcast i32 [[TMP2]] to float
+; NEWGVN-NEXT:    ret float [[TMP3]]
 ;
   store i64* %V1, i64** %P1
   %P2 = bitcast i64** %P1 to float*
@@ -52,8 +54,8 @@
 ; NEWGVN-LABEL: @test3(
 ; NEWGVN-NEXT:    store i32 [[V1:%.*]], i32* [[P1:%.*]], align 4
 ; NEWGVN-NEXT:    [[P2:%.*]] = bitcast i32* [[P1]] to i8*
-; NEWGVN-NEXT:    [[V2:%.*]] = load i8, i8* [[P2]], align 1
-; NEWGVN-NEXT:    ret i8 [[V2]]
+; NEWGVN-NEXT:    [[TMP1:%.*]] = trunc i32 [[V1]] to i8
+; NEWGVN-NEXT:    ret i8 [[TMP1]]
 ;
   store i32 %V1, i32* %P1
   %P2 = bitcast i32* %P1 to i8*
@@ -72,8 +74,9 @@
 ; NEWGVN-LABEL: @test4(
 ; NEWGVN-NEXT:    store i64 [[V1:%.*]], i64* [[P1:%.*]], align 4
 ; NEWGVN-NEXT:    [[P2:%.*]] = bitcast i64* [[P1]] to float*
-; NEWGVN-NEXT:    [[V2:%.*]] = load float, float* [[P2]], align 4
-; NEWGVN-NEXT:    ret float [[V2]]
+; NEWGVN-NEXT:    [[TMP1:%.*]] = trunc i64 [[V1]] to i32
+; NEWGVN-NEXT:    [[TMP2:%.*]] = bitcast i32 [[TMP1]] to float
+; NEWGVN-NEXT:    ret float [[TMP2]]
 ;
   store i64 %V1, i64* %P1
   %P2 = bitcast i64* %P1 to float*
@@ -115,8 +118,8 @@
 ; NEWGVN-LABEL: @test6(
 ; NEWGVN-NEXT:    store i64 [[V1:%.*]], i64* [[P1:%.*]], align 4
 ; NEWGVN-NEXT:    [[P2:%.*]] = bitcast i64* [[P1]] to i8**
-; NEWGVN-NEXT:    [[V2:%.*]] = load i8*, i8** [[P2]], align 8
-; NEWGVN-NEXT:    ret i8* [[V2]]
+; NEWGVN-NEXT:    [[TMP1:%.*]] = inttoptr i64 [[V1]] to i8*
+; NEWGVN-NEXT:    ret i8* [[TMP1]]
 ;
   store i64 %V1, i64* %P1
   %P2 = bitcast i64* %P1 to i8**
@@ -135,8 +138,9 @@
 ; NEWGVN-LABEL: @test7(
 ; NEWGVN-NEXT:    store double [[V1:%.*]], double* [[P1:%.*]], align 8
 ; NEWGVN-NEXT:    [[P2:%.*]] = bitcast double* [[P1]] to i32*
-; NEWGVN-NEXT:    [[V2:%.*]] = load i32, i32* [[P2]], align 4
-; NEWGVN-NEXT:    ret i32 [[V2]]
+; NEWGVN-NEXT:    [[TMP1:%.*]] = bitcast double [[V1]] to i64
+; NEWGVN-NEXT:    [[TMP2:%.*]] = trunc i64 [[TMP1]] to i32
+; NEWGVN-NEXT:    ret i32 [[TMP2]]
 ;
   store double %V1, double* %P1
   %P2 = bitcast double* %P1 to i32*
@@ -157,8 +161,9 @@
 ; NEWGVN-NEXT:    store i32 [[V1:%.*]], i32* [[P1:%.*]], align 4
 ; NEWGVN-NEXT:    [[P2:%.*]] = bitcast i32* [[P1]] to i8*
 ; NEWGVN-NEXT:    [[P3:%.*]] = getelementptr i8, i8* [[P2]], i32 2
-; NEWGVN-NEXT:    [[V2:%.*]] = load i8, i8* [[P3]], align 1
-; NEWGVN-NEXT:    ret i8 [[V2]]
+; NEWGVN-NEXT:    [[TMP1:%.*]] = lshr i32 [[V1]], 16
+; NEWGVN-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i8
+; NEWGVN-NEXT:    ret i8 [[TMP2]]
 ;
   store i32 %V1, i32* %P1
   %P2 = bitcast i32* %P1 to i8*
@@ -180,14 +185,12 @@
 ;
 ; NEWGVN-LABEL: @test9(
 ; NEWGVN-NEXT:    store i64 [[V:%.*]], i64* [[P1:%.*]], align 4
-; NEWGVN-NEXT:    [[P3:%.*]] = bitcast i64* [[P1]] to double*
+; NEWGVN-NEXT:    [[TMP1:%.*]] = bitcast i64 [[V]] to double
 ; NEWGVN-NEXT:    br i1 [[COND:%.*]], label [[T:%.*]], label [[F:%.*]]
 ; NEWGVN:       T:
-; NEWGVN-NEXT:    [[B:%.*]] = load double, double* [[P3]], align 8
-; NEWGVN-NEXT:    ret double [[B]]
+; NEWGVN-NEXT:    ret double [[TMP1]]
 ; NEWGVN:       F:
-; NEWGVN-NEXT:    [[C:%.*]] = load double, double* [[P3]], align 8
-; NEWGVN-NEXT:    ret double [[C]]
+; NEWGVN-NEXT:    ret double [[TMP1]]
 ;
   %A = load i64 , i64* %P1
   store i64 %V, i64* %P1
@@ -217,11 +220,11 @@
 ; NEWGVN-LABEL: @test10(
 ; NEWGVN-NEXT:    store i32 [[V0:%.*]], i32* [[P:%.*]], align 4
 ; NEWGVN-NEXT:    [[P1:%.*]] = bitcast i32* [[P]] to float*
-; NEWGVN-NEXT:    [[V1:%.*]] = load float, float* [[P1]], align 4
+; NEWGVN-NEXT:    [[TMP1:%.*]] = bitcast i32 [[V0]] to float
 ; NEWGVN-NEXT:    [[P2:%.*]] = bitcast i32* [[P]] to i8*
-; NEWGVN-NEXT:    [[V2:%.*]] = load i8, i8* [[P2]], align 1
-; NEWGVN-NEXT:    [[I1:%.*]] = insertvalue <{ i8, float }> undef, i8 [[V2]], 0
-; NEWGVN-NEXT:    [[I2:%.*]] = insertvalue <{ i8, float }> [[I1]], float [[V1]], 1
+; NEWGVN-NEXT:    [[TMP2:%.*]] = trunc i32 [[V0]] to i8
+; NEWGVN-NEXT:    [[I1:%.*]] = insertvalue <{ i8, float }> undef, i8 [[TMP2]], 0
+; NEWGVN-NEXT:    [[I2:%.*]] = insertvalue <{ i8, float }> [[I1]], float [[TMP1]], 1
 ; NEWGVN-NEXT:    ret <{ i8, float }> [[I2]]
 ;
   store i32 %V0, i32* %P
@@ -251,16 +254,16 @@
 ;
 ; NEWGVN-LABEL: @test11(
 ; NEWGVN-NEXT:    store i32 [[V0:%.*]], i32* [[P:%.*]], align 4
+; NEWGVN-NEXT:    [[TMP1:%.*]] = bitcast i32 [[V0]] to float
+; NEWGVN-NEXT:    [[TMP2:%.*]] = trunc i32 [[V0]] to i8
 ; NEWGVN-NEXT:    br i1 [[COND:%.*]], label [[T:%.*]], label [[F:%.*]]
 ; NEWGVN:       T:
 ; NEWGVN-NEXT:    [[P1:%.*]] = bitcast i32* [[P]] to float*
-; NEWGVN-NEXT:    [[V1:%.*]] = load float, float* [[P1]], align 4
-; NEWGVN-NEXT:    [[I1:%.*]] = insertvalue <{ i8, float }> undef, float [[V1]], 1
+; NEWGVN-NEXT:    [[I1:%.*]] = insertvalue <{ i8, float }> undef, float [[TMP1]], 1
 ; NEWGVN-NEXT:    ret <{ i8, float }> [[I1]]
 ; NEWGVN:       F:
 ; NEWGVN-NEXT:    [[P2:%.*]] = bitcast i32* [[P]] to i8*
-; NEWGVN-NEXT:    [[V2:%.*]] = load i8, i8* [[P2]], align 1
-; NEWGVN-NEXT:    [[I2:%.*]] = insertvalue <{ i8, float }> undef, i8 [[V2]], 0
+; NEWGVN-NEXT:    [[I2:%.*]] = insertvalue <{ i8, float }> undef, i8 [[TMP2]], 0
 ; NEWGVN-NEXT:    ret <{ i8, float }> [[I2]]
 ;
   store i32 %V0, i32* %P
@@ -295,16 +298,13 @@
 ;
 ; NEWGVN-LABEL: @test12(
 ; NEWGVN-NEXT:    store i32 [[V0:%.*]], i32* [[P:%.*]], align 4
+; NEWGVN-NEXT:    [[TMP1:%.*]] = bitcast i32 [[V0]] to float
 ; NEWGVN-NEXT:    br i1 [[COND:%.*]], label [[T:%.*]], label [[F:%.*]]
 ; NEWGVN:       T:
-; NEWGVN-NEXT:    [[P1:%.*]] = bitcast i32* [[P]] to float*
-; NEWGVN-NEXT:    [[V1:%.*]] = load float, float* [[P1]], align 4
-; NEWGVN-NEXT:    [[I1:%.*]] = insertvalue <{ float, float }> undef, float [[V1]], 1
+; NEWGVN-NEXT:    [[I1:%.*]] = insertvalue <{ float, float }> undef, float [[TMP1]], 1
 ; NEWGVN-NEXT:    ret <{ float, float }> [[I1]]
 ; NEWGVN:       F:
-; NEWGVN-NEXT:    [[P2:%.*]] = bitcast i32* [[P]] to float*
-; NEWGVN-NEXT:    [[V2:%.*]] = load float, float* [[P2]], align 4
-; NEWGVN-NEXT:    [[I2:%.*]] = insertvalue <{ float, float }> undef, float [[V2]], 0
+; NEWGVN-NEXT:    [[I2:%.*]] = insertvalue <{ float, float }> undef, float [[TMP1]], 0
 ; NEWGVN-NEXT:    ret <{ float, float }> [[I2]]
 ;
   store i32 %V0, i32* %P
@@ -335,10 +335,9 @@
 ; NEWGVN-LABEL: @test13(
 ; NEWGVN-NEXT:    store i32 [[V1:%.*]], i32* [[P1:%.*]], align 4
 ; NEWGVN-NEXT:    [[P2:%.*]] = bitcast i32* [[P1]] to i8*
-; NEWGVN-NEXT:    [[V2:%.*]] = load i8, i8* [[P2]], align 1
+; NEWGVN-NEXT:    [[TMP1:%.*]] = trunc i32 [[V1]] to i8
 ; NEWGVN-NEXT:    [[P3:%.*]] = bitcast i32* [[P1]] to i64*
-; NEWGVN-NEXT:    [[V4:%.*]] = trunc i32 [[V1]] to i8
-; NEWGVN-NEXT:    [[V5:%.*]] = add i8 [[V2]], [[V4]]
+; NEWGVN-NEXT:    [[V5:%.*]] = add i8 [[TMP1]], [[TMP1]]
 ; NEWGVN-NEXT:    ret i8 [[V5]]
 ;
   store i32 %V1, i32* %P1
@@ -361,9 +360,8 @@
 ;
 ; NEWGVN-LABEL: @test14(
 ; NEWGVN-NEXT:    store i32 [[V1:%.*]], i32* [[P1:%.*]], align 4
-; NEWGVN-NEXT:    [[P2:%.*]] = bitcast i32* [[P1]] to i8*
-; NEWGVN-NEXT:    [[V2:%.*]] = load i8, i8* [[P2]], align 1
-; NEWGVN-NEXT:    [[V5:%.*]] = add i8 [[V2]], [[V2]]
+; NEWGVN-NEXT:    [[TMP1:%.*]] = trunc i32 [[V1]] to i8
+; NEWGVN-NEXT:    [[V5:%.*]] = add i8 [[TMP1]], [[TMP1]]
 ; NEWGVN-NEXT:    ret i8 [[V5]]
 ;
   store i32 %V1, i32* %P1
diff --git a/llvm/test/Transforms/NewGVN/pr14166-xfail.ll b/llvm/test/Transforms/NewGVN/pr14166-xfail.ll
--- a/llvm/test/Transforms/NewGVN/pr14166-xfail.ll
+++ b/llvm/test/Transforms/NewGVN/pr14166-xfail.ll
@@ -1,4 +1,3 @@
-; XFAIL: *
 ; RUN: opt -disable-basic-aa -passes=newgvn -S < %s | FileCheck %s
 ; NewGVN fails this due to missing load coercion
 target datalayout = "e-p:32:32:32"
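Note (not part of the patch): the NEWGVN check lines above follow the llvm/utils/update_test_checks.py format, and the RUN lines exercise the pass through `opt -passes=newgvn`. For completeness, a minimal sketch of driving NewGVN programmatically through the new pass manager is shown below; the harness function runNewGVN and its setup are illustrative assumptions, while NewGVNPass, PassBuilder, and createModuleToFunctionPassAdaptor are standard LLVM APIs.

// Illustrative sketch only: run NewGVN over a module, equivalent to
// `opt -passes=newgvn`. The harness itself is hypothetical.
#include "llvm/Analysis/CGSCCPassManager.h"
#include "llvm/Analysis/LoopAnalysisManager.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/PassManager.h"
#include "llvm/Passes/PassBuilder.h"
#include "llvm/Transforms/Scalar/NewGVN.h"

using namespace llvm;

static void runNewGVN(Module &M) {
  LoopAnalysisManager LAM;
  FunctionAnalysisManager FAM;
  CGSCCAnalysisManager CGAM;
  ModuleAnalysisManager MAM;

  PassBuilder PB;
  PB.registerModuleAnalyses(MAM);
  PB.registerCGSCCAnalyses(CGAM);
  PB.registerFunctionAnalyses(FAM);
  PB.registerLoopAnalyses(LAM);
  PB.crossRegisterProxies(LAM, FAM, CGAM, MAM);

  // NewGVN is a function pass; adapt it so it runs on every function in M.
  ModulePassManager MPM;
  MPM.addPass(createModuleToFunctionPassAdaptor(NewGVNPass()));
  MPM.run(M, MAM);
}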