Index: include/llvm/IR/Intrinsics.td =================================================================== --- include/llvm/IR/Intrinsics.td +++ include/llvm/IR/Intrinsics.td @@ -955,6 +955,11 @@ def int_ssa_copy : Intrinsic<[llvm_any_ty], [LLVMMatchType<0>], [IntrNoMem, Returned<0>]>; + +def int_phantom_mem : Intrinsic<[], + [llvm_anyptr_ty, llvm_anyptr_ty, llvm_i64_ty], + [IntrArgMemOnly]>; + //===----------------------------------------------------------------------===// // Target-specific intrinsics //===----------------------------------------------------------------------===// Index: lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp =================================================================== --- lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -5986,6 +5986,8 @@ visitVectorReduce(I, Intrinsic); return nullptr; } + case Intrinsic::phantom_mem: + return nullptr; } } Index: lib/Transforms/InstCombine/InstCombineInternal.h =================================================================== --- lib/Transforms/InstCombine/InstCombineInternal.h +++ lib/Transforms/InstCombine/InstCombineInternal.h @@ -196,6 +196,8 @@ } } +typedef DenseMap> PtrInfoTy; + /// \brief The core instruction combiner logic. /// /// This class provides both the logic to recursively visit instructions and @@ -230,6 +232,7 @@ // Optional analyses. When non-null, these can both be used to do better // combining and will be updated to reflect any changes. LoopInfo *LI; + PtrInfoTy &Ptrs; bool MadeIRChange; @@ -238,10 +241,10 @@ bool MinimizeSize, bool ExpensiveCombines, AliasAnalysis *AA, AssumptionCache &AC, TargetLibraryInfo &TLI, DominatorTree &DT, OptimizationRemarkEmitter &ORE, const DataLayout &DL, - LoopInfo *LI) + LoopInfo *LI, PtrInfoTy &Ptrs) : Worklist(Worklist), Builder(Builder), MinimizeSize(MinimizeSize), ExpensiveCombines(ExpensiveCombines), AA(AA), AC(AC), TLI(TLI), DT(DT), - DL(DL), SQ(DL, &TLI, &DT, &AC), ORE(ORE), LI(LI), MadeIRChange(false) {} + DL(DL), SQ(DL, &TLI, &DT, &AC), ORE(ORE), LI(LI), Ptrs(Ptrs), MadeIRChange(false) {} /// \brief Run the combiner over the entire worklist until it is empty. /// Index: lib/Transforms/InstCombine/InstructionCombining.cpp =================================================================== --- lib/Transforms/InstCombine/InstructionCombining.cpp +++ lib/Transforms/InstCombine/InstructionCombining.cpp @@ -1466,6 +1466,27 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) { SmallVector Ops(GEP.op_begin(), GEP.op_end()); + // Here we have to avoid any GEPs with vector types or with + // number of operands that is not equal to two or GEPs + // in a loop, except it's origin is in the same basic block. + if (!dyn_cast(Ops[0]->getType()) && GEP.getNumOperands() == 2 && + LI != nullptr) { + bool ValidGEP = (LI->getLoopDepth(GEP.getParent()) == 0); + Value *Ptr = GEP.getOperand(0); + if (!ValidGEP) + if (Instruction *PtrInst = dyn_cast(Ptr)) + ValidGEP = PtrInst->getParent() == GEP.getParent(); + if (ValidGEP) + if (ConstantInt *CST = dyn_cast(GEP.getOperand(1))) { + uint64_t NewOffset = CST->getZExtValue(); + if (Ptrs.find(Ptr) != Ptrs.end()) { + if (NewOffset > Ptrs[Ptr].first && !Ptrs[Ptr].second) + Ptrs[Ptr].first = NewOffset; + } else + Ptrs[Ptr] = {NewOffset, false}; + } + } + if (Value *V = SimplifyGEPInst(GEP.getSourceElementType(), Ops, SQ.getWithInstruction(&GEP))) return replaceInstUsesWith(GEP, V); @@ -2896,6 +2917,47 @@ return true; } +static void recordDeadLoad(LoadInst *Load, InstCombiner::BuilderTy &Builder, + PtrInfoTy &Ptrs) { + BasicBlock *BB = Load->getParent(); + for (BasicBlock::iterator Iter = BB->begin(), E = BB->end(); Iter != E;) { + long long MaxOffset = 0; + Value *Ptr = nullptr; + if (LoadInst *LInstr = dyn_cast(Iter++)) { + if (Ptrs.find(LInstr->getOperand(0)) != Ptrs.end()) { + Ptr = LInstr->getOperand(0); + MaxOffset = Ptrs[Ptr].first; + } else if (GetElementPtrInst *GEP = + dyn_cast(LInstr->getOperand(0))) + if (Ptrs.find(GEP->getOperand(0)) != Ptrs.end()) { + Ptr = GEP->getOperand(0); + MaxOffset = Ptrs[Ptr].first; + } + if (Ptr && MaxOffset > 0 && !Ptrs[Ptr].second) { + Instruction *PtrInst = dyn_cast(Ptr); + // if PtrInst is not an Instruction then we consider + // it as a parameter to the function. + if (PtrInst == nullptr) { + BasicBlock *BBEntry = &BB->getParent()->getEntryBlock(); + assert(BBEntry && "Entry block not found!"); + BasicBlock::iterator ItNew; + ItNew = BBEntry->begin(); + Builder.SetInsertPoint(&*ItNew); + } else + Builder.SetInsertPoint(BB, ++PtrInst->getIterator()); + Value *Ops[] = {Ptr, Constant::getNullValue(Ptr->getType()), + Builder.getInt64(MaxOffset)}; + Module *M = BB->getParent()->getParent(); + Value *TheFn = Intrinsic::getDeclaration( + M, Intrinsic::phantom_mem, {Ptr->getType(), Ptr->getType()}); + const Twine &Name = ""; + Builder.CreateCall(TheFn, Ops, Name); + Ptrs[Ptr].second = true; + } + } + } +} + bool InstCombiner::run() { while (!Worklist.isEmpty()) { Instruction *I = Worklist.RemoveOne(); @@ -2904,6 +2966,8 @@ // Check to see if we can DCE the instruction. if (isInstructionTriviallyDead(I, &TLI)) { DEBUG(dbgs() << "IC: DCE: " << *I << '\n'); + if (LoadInst *Load = dyn_cast(I)) + recordDeadLoad(Load, Builder, Ptrs); eraseInstFromFunction(*I); ++NumDeadInst; MadeIRChange = true; @@ -3214,6 +3278,8 @@ AC.registerAssumption(cast(I)); })); + PtrInfoTy Ptrs; + // Lower dbg.declare intrinsics otherwise their value may be clobbered // by instcombiner. bool MadeIRChange = false; @@ -3230,7 +3296,7 @@ MadeIRChange |= prepareICWorklistFromFunction(F, DL, &TLI, Worklist); InstCombiner IC(Worklist, Builder, F.optForMinSize(), ExpensiveCombines, AA, - AC, TLI, DT, ORE, DL, LI); + AC, TLI, DT, ORE, DL, LI, Ptrs); IC.MaxArraySizeForCombine = MaxArraySize; if (!IC.run()) Index: test/Transforms/InstCombine/phantommem.ll =================================================================== --- /dev/null +++ test/Transforms/InstCombine/phantommem.ll @@ -0,0 +1,32 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -instcombine -mtriple=x86_64-unknown-linux-gnu -mcpu=bdver2 -S | FileCheck %s + +define <4 x double> @load_four_scalars_but_use_two(double* %ptr) { +; CHECK-LABEL: @load_four_scalars_but_use_two( +; CHECK-NEXT: call void @llvm.phantom.mem.p0f64.p0f64(double* [[PTR:%.*]], double* null, i64 3) +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds double, double* [[PTR]], i64 2 +; CHECK-NEXT: [[LD0:%.*]] = load double, double* [[PTR]], align 8 +; CHECK-NEXT: [[LD2:%.*]] = load double, double* [[ARRAYIDX2]], align 8 +; CHECK-NEXT: [[INS0:%.*]] = insertelement <4 x double> undef, double [[LD0]], i32 0 +; CHECK-NEXT: [[INS2:%.*]] = insertelement <4 x double> [[INS0]], double [[LD2]], i32 2 +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x double> [[INS2]], <4 x double> undef, <4 x i32> +; CHECK-NEXT: ret <4 x double> [[SHUFFLE]] +; + %arrayidx0 = getelementptr inbounds double, double* %ptr, i64 0 + %arrayidx1 = getelementptr inbounds double, double* %ptr, i64 1 + %arrayidx2 = getelementptr inbounds double, double* %ptr, i64 2 + %arrayidx3 = getelementptr inbounds double, double* %ptr, i64 3 + + %ld0 = load double, double* %arrayidx0 + %ld1 = load double, double* %arrayidx1 + %ld2 = load double, double* %arrayidx2 + %ld3 = load double, double* %arrayidx3 + + %ins0 = insertelement <4 x double> undef, double %ld0, i32 0 + %ins1 = insertelement <4 x double> %ins0, double %ld1, i32 1 + %ins2 = insertelement <4 x double> %ins1, double %ld2, i32 2 + %ins3 = insertelement <4 x double> %ins2, double %ld3, i32 3 + + %shuffle = shufflevector <4 x double> %ins3, <4 x double> undef, <4 x i32> + ret <4 x double> %shuffle +}