Index: include/llvm/Transforms/Vectorize/SLPVectorizer.h =================================================================== --- include/llvm/Transforms/Vectorize/SLPVectorizer.h +++ include/llvm/Transforms/Vectorize/SLPVectorizer.h @@ -94,6 +94,10 @@ /// collected in GEPs. bool vectorizeGEPIndices(BasicBlock *BB, slpvectorizer::BoUpSLP &R); + /// \brief Restore inserts, loads out of dereferenceable_or_null attribute. + InsertElementInst * restoreInserts(InsertElementInst *FirstInsertElem, + LoadInst *LInstr, const MDNode *MD); + /// Try to find horizontal reduction or otherwise vectorize a chain of binary /// operators. bool vectorizeRootInstruction(PHINode *P, Value *V, BasicBlock *BB, Index: lib/IR/Verifier.cpp =================================================================== --- lib/IR/Verifier.cpp +++ lib/IR/Verifier.cpp @@ -3714,8 +3714,6 @@ } void Verifier::visitDereferenceableMetadata(Instruction& I, MDNode* MD) { - Assert(I.getType()->isPointerTy(), "dereferenceable, dereferenceable_or_null " - "apply only to pointer types", &I); Assert(isa(I), "dereferenceable, dereferenceable_or_null apply only to load" " instructions, use attributes for calls or invokes", &I); Index: lib/Transforms/InstCombine/InstructionCombining.cpp =================================================================== --- lib/Transforms/InstCombine/InstructionCombining.cpp +++ lib/Transforms/InstCombine/InstructionCombining.cpp @@ -57,6 +57,7 @@ #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/PatternMatch.h" #include "llvm/IR/ValueHandle.h" +#include "llvm/IR/MDBuilder.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/KnownBits.h" @@ -87,6 +88,10 @@ MaxArraySize("instcombine-maxarray-size", cl::init(1024), cl::desc("Maximum array size considered when doing a combine")); +DenseMap Bases; + +DenseMap> GEPs; + Value *InstCombiner::EmitGEPOffset(User *GEP) { return llvm::EmitGEPOffset(Builder, DL, GEP); } @@ -1415,6 +1420,15 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) { SmallVector Ops(GEP.op_begin(), GEP.op_end()); + if (!dyn_cast(Ops[0]->getType()) && GEP.getNumOperands() == 2) + if (Constant *CST = dyn_cast(GEP.getOperand(1))) { + Value *PtrOp = GEP.getOperand(0); + Value *GEPResult = llvm::cast(&GEP); + if (CST > Bases[PtrOp]) + Bases[PtrOp]=CST; + GEPs[GEPResult] = std::make_pair(PtrOp, CST); + } + if (Value *V = SimplifyGEPInst(GEP.getSourceElementType(), Ops, SQ.getWithInstruction(&GEP))) return replaceInstUsesWith(GEP, V); @@ -2839,6 +2853,42 @@ // Check to see if we can DCE the instruction. if (isInstructionTriviallyDead(I, &TLI)) { DEBUG(dbgs() << "IC: DCE: " << *I << '\n'); + // If we have a load here then we need to + // update other loads with dereferenceable_or_null + // for the same base pointer. + if (LoadInst *Load = dyn_cast(I)) { + BasicBlock *BB = Load->getParent(); + for (BasicBlock::iterator Iter = BB->begin(), + E = BB->end(); Iter != E; ) { + long long MaxOffset=0, CurPos=0; + if (LoadInst *LInstr = dyn_cast(Iter++)) { + if (LInstr->getMetadata( + LLVMContext::MD_dereferenceable_or_null) != nullptr) + continue; + if (Bases[LInstr->getOperand(0)]) + MaxOffset = (dyn_cast + (Bases[LInstr->getOperand(0)]))->getZExtValue(); + else + if (GEPs.count(LInstr->getOperand(0)) > 0) { + MaxOffset = (dyn_cast + (Bases[GEPs[LInstr->getOperand(0)].first]))->getZExtValue(); + CurPos = (dyn_cast( + GEPs[LInstr->getOperand(0)].second))->getZExtValue(); + } + if ((MaxOffset-CurPos) != 0) + { + Constant *C = ConstantInt::get(LInstr->getContext(), + APInt(64, (MaxOffset - CurPos) + *DL.getTypeStoreSize(LInstr->getType()))); + SmallVector Vals; + MDBuilder MDB(LInstr->getContext()); + Vals.push_back(MDB.createConstant(C)); + LInstr->setMetadata(LLVMContext::MD_dereferenceable_or_null, + MDNode::get(LInstr->getContext(), Vals)); + } + } + } + } eraseInstFromFunction(*I); ++NumDeadInst; MadeIRChange = true; @@ -3134,6 +3184,8 @@ LoopInfo *LI = nullptr) { auto &DL = F.getParent()->getDataLayout(); ExpensiveCombines |= EnableExpensiveCombines; + Bases.clear(); + GEPs.clear(); /// Builder - This is an IRBuilder that automatically inserts new /// instructions into the worklist when they are created. Index: lib/Transforms/Vectorize/SLPVectorizer.cpp =================================================================== --- lib/Transforms/Vectorize/SLPVectorizer.cpp +++ lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -4836,6 +4836,138 @@ return Res; } +InsertElementInst* SLPVectorizerPass::restoreInserts(InsertElementInst* + FirstInsertElem, LoadInst *LInstr, + const MDNode *MD) +{ + Value *Ptr; + Value *Use = nullptr; + SmallVector Offsets; + SmallDenseMap Inserts; + BasicBlock *BB = FirstInsertElem->getParent(); + int64_t FirstAt = 0; + int64_t TypeSize = DL->getTypeStoreSize(LInstr->getType()); + VectorType *VecType = FirstInsertElem->getType(); + + ConstantInt *C = cast(FirstInsertElem->getOperand(2)); + if (C->getZExtValue() == 0) { + // We found the first InsertElem in + // this chain. + Inserts[0] = FirstInsertElem; + Offsets.push_back(0); + } + if (GetElementPtrInst *GEP = + dyn_cast(LInstr->getOperand(0))) + { + Ptr = GEP->getOperand(0); + // Looks like the FirstInsertElem was not first in the chain, + // saving its offset in FirstAt. + if (ConstantInt *C = dyn_cast(GEP->getOperand(1))) + FirstAt = C->getZExtValue() * DL->getTypeStoreSize(LInstr->getType()); + else + // Complex GEP for vector insert. + return nullptr; + } else + Ptr = LInstr->getOperand(0); + ConstantAsMetadata *ValMD = + dyn_cast(MD->getOperand(0)); + int64_t MaxOffset = + cast(ValMD->getValue())->getZExtValue() + + FirstAt; + IRBuilder Builder(LInstr->getParent(), + ++BasicBlock::iterator(LInstr)); + DEBUG(dbgs() << "Found maxmimum offset for " << *Ptr << " " << + MaxOffset << "\n"); + for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; it++) + if (GetElementPtrInst *GEP = dyn_cast(it)) + { + // Examining the chain of GEP, Load, Insert. + if (Ptr == GEP->getOperand(0)) { + if (ConstantInt *C = dyn_cast(GEP->getOperand(1))) + { + int64_t Off = C->getZExtValue()*TypeSize; + Offsets.push_back(Off); + if (GEP->use_empty()) + return nullptr; + if (!GEP->hasOneUse()) + return nullptr; + LoadInst *Load = dyn_cast(GEP->user_back()); + if (!Load) + return nullptr; + if (Load->use_empty()) + return nullptr; + if (!Load->hasOneUse()) + return nullptr; + InsertElementInst *Insert = + dyn_cast(Load->user_back()); + if (!Insert) + return nullptr; + ConstantInt *C1 = cast(Insert->getOperand(2)); + int64_t Off1 = C1->getZExtValue()*TypeSize; + // Offset from GEP must be equal to offset + // from Load otherwise ignoring. + if (Off != Off1) + return nullptr; + Inserts[Off] = Insert; + if (!Insert->use_empty()) + Use = Insert->user_back(); + } else // Complex GEP for vector insert. + return nullptr; + } + } + + for (int i=0; i(Inserts[i]); + assert(Ins != nullptr && "Expect insert element instruction here"); + // We don't need to update InsertElement for + // offset quals to 0, it should be Undef. + if (i != 0 && (Ins->getOperand(0) != Inserts[i-TypeSize])) + Ins->setOperand(0, Inserts[i-TypeSize]); + Found=true; + } + if (!Found) { + // Building GEP, Load, Insert for + // this Offset. + Builder.SetInsertPoint(LInstr); + Builder.SetCurrentDebugLocation(LInstr->getDebugLoc()); + Value *NewGEP = Builder.CreateGEP(LInstr->getType(), Ptr, + Builder.getInt64(i/TypeSize)); + Value *NewLoad = Builder.CreateLoad(NewGEP); + if (i==0) { + // For Offset 0 it should be Undef. + Inserts[0] = Builder.CreateInsertElement( + UndefValue::get(VecType), NewLoad, Builder.getInt64(0)); + FirstInsertElem = dyn_cast(Inserts[0]); + } else { + // InsertElement must be inserted just after + // previous InsertElement. + InsertElementInst *PrevIns = dyn_cast( + Inserts[i-TypeSize]); + Instruction *InsAt = PrevIns->getNextNode(); + IRBuilder Builder1(InsAt->getParent(), + ++BasicBlock::iterator(InsAt)); + Builder1.SetInsertPoint(InsAt); + Builder1.SetCurrentDebugLocation(InsAt->getDebugLoc()); + Inserts[i] = Builder1.CreateInsertElement( + PrevIns, NewLoad, Builder1.getInt64(i/TypeSize)); + } + } + } + if (ShuffleVectorInst* Shuffle = dyn_cast(Use)) + { + // Update ShuffleVector with restored insert element. + Shuffle->setOperand(0, Inserts[MaxOffset]); + return FirstInsertElem; + } + return nullptr; +} + bool SLPVectorizerPass::vectorizeRootInstruction(PHINode *P, Value *V, BasicBlock *BB, BoUpSLP &R, TargetTransformInfo *TTI) { @@ -4995,6 +5127,16 @@ if (!findBuildVector(FirstInsertElem, BuildVector, BuildVectorOpds)) continue; + if (LoadInst *LInstr = dyn_cast( + FirstInsertElem->getOperand(1))) + if (const MDNode *MD = LInstr->getMetadata( + LLVMContext::MD_dereferenceable_or_null)) + { + if (InsertElementInst *Insert = + restoreInserts(FirstInsertElem, LInstr, MD)) + findBuildVector(Insert, BuildVector, BuildVectorOpds); + } + // Vectorize starting with the build vector operands ignoring the // BuildVector instructions for the purpose of scheduling and user // extraction. Index: test/Transforms/SLPVectorizer/X86/21780.ll =================================================================== --- /dev/null +++ test/Transforms/SLPVectorizer/X86/21780.ll @@ -0,0 +1,97 @@ +; RUN: opt < %s -instcombine -slp-vectorizer -mtriple=x86_64-unknown-linux-gnu -mcpu=bdver2 -S | FileCheck %s --check-prefix=SLP +; RUN: opt < %s -instcombine -mtriple=x86_64-unknown-linux-gnu -mcpu=bdver2 -S | FileCheck %s --check-prefix=ICOMB + +; SLP-LABEL: load_four_scalars_but_use_two +; SLP: %arrayidx2 = getelementptr inbounds double, double* %ptr, i64 2 +; SLP-NEXT: %1 = getelementptr double, double* %ptr, i64 1 +; SLP-NEXT: %2 = getelementptr double, double* %ptr, i64 3 +; SLP-NEXT: %3 = bitcast double* %ptr to <4 x double>* +; SLP-NEXT: %4 = load <4 x double>, <4 x double>* %3, align 8 +; SLP-NEXT: %5 = extractelement <4 x double> %4, i32 0 +; SLP-NEXT: %ins0 = insertelement <4 x double> undef, double %5, i32 0 +; SLP-NEXT: %6 = extractelement <4 x double> %4, i32 1 +; SLP-NEXT: %7 = insertelement <4 x double> %ins0, double %6, i64 1 +; SLP-NEXT: %8 = extractelement <4 x double> %4, i32 2 +; SLP-NEXT: %ins2 = insertelement <4 x double> %7, double %8, i32 2 +; SLP-NEXT: %9 = extractelement <4 x double> %4, i32 3 +; SLP-NEXT: %10 = insertelement <4 x double> %ins2, double %9, i64 3 +; SLP-NEXT: %shuffle = shufflevector <4 x double> %10, <4 x double> undef, <4 x i32> +; SLP-NEXT: ret <4 x double> %shuffle + +; ICOMB-LABEL: load_four_scalars_but_use_two +; ICOMB: %arrayidx2 = getelementptr inbounds double, double* %ptr, i64 2 +; ICOMB-NEXT: %ld0 = load double, double* %ptr, align 8, !dereferenceable_or_null !0 +; ICOMB-NEXT: %ld2 = load double, double* %arrayidx2, align 8, !dereferenceable_or_null !1 +; ICOMB-NEXT: %ins0 = insertelement <4 x double> undef, double %ld0, i32 0 +; ICOMB-NEXT: %ins2 = insertelement <4 x double> %ins0, double %ld2, i32 2 +; ICOMB-NEXT: %shuffle = shufflevector <4 x double> %ins2, <4 x double> undef, <4 x i32> +; ICOMB-NEXT: ret <4 x double> %shuffle + +define <4 x double> @load_four_scalars_but_use_two(double* %ptr) { + %arrayidx0 = getelementptr inbounds double, double* %ptr, i64 0 + %arrayidx1 = getelementptr inbounds double, double* %ptr, i64 1 + %arrayidx2 = getelementptr inbounds double, double* %ptr, i64 2 + %arrayidx3 = getelementptr inbounds double, double* %ptr, i64 3 + + %ld0 = load double, double* %arrayidx0 + %ld1 = load double, double* %arrayidx1 + %ld2 = load double, double* %arrayidx2 + %ld3 = load double, double* %arrayidx3 + + %ins0 = insertelement <4 x double> undef, double %ld0, i32 0 + %ins1 = insertelement <4 x double> %ins0, double %ld1, i32 1 + %ins2 = insertelement <4 x double> %ins1, double %ld2, i32 2 + %ins3 = insertelement <4 x double> %ins2, double %ld3, i32 3 + + %shuffle = shufflevector <4 x double> %ins3, <4 x double> undef, <4 x i32> + ret <4 x double> %shuffle +} + +; SLP-LABEL: load_four_scalars_but_use_two1 +; SLP: %arrayidx1 = getelementptr inbounds double, double* %ptr, i64 1 +; SLP-NEXT: %arrayidx2 = getelementptr inbounds double, double* %ptr, i64 2 +; SLP-NEXT: %1 = getelementptr double, double* %ptr, i64 0 +; SLP-NEXT: %2 = getelementptr double, double* %ptr, i64 3 +; SLP-NEXT: %3 = bitcast double* %1 to <4 x double>* +; SLP-NEXT: %4 = load <4 x double>, <4 x double>* %3, align 8 +; SLP-NEXT: %5 = extractelement <4 x double> %4, i32 0 +; SLP-NEXT: %6 = insertelement <4 x double> undef, double %5, i64 0 +; SLP-NEXT: %7 = extractelement <4 x double> %4, i32 1 +; SLP-NEXT: %ins1 = insertelement <4 x double> %6, double %7, i32 1 +; SLP-NEXT: %8 = extractelement <4 x double> %4, i32 2 +; SLP-NEXT: %ins2 = insertelement <4 x double> %ins1, double %8, i32 2 +; SLP-NEXT: %9 = extractelement <4 x double> %4, i32 3 +; SLP-NEXT: %10 = insertelement <4 x double> %ins2, double %9, i64 3 +; SLP-NEXT: %shuffle = shufflevector <4 x double> %10, <4 x double> undef, <4 x i32> +; SLP-NEXT: ret <4 x double> %shuffle + +; ICOMB-LABEL: load_four_scalars_but_use_two1 +; ICOMB: %arrayidx1 = getelementptr inbounds double, double* %ptr, i64 1 +; ICOMB-NEXT: %arrayidx2 = getelementptr inbounds double, double* %ptr, i64 2 +; ICOMB-NEXT: %ld1 = load double, double* %arrayidx1, align 8, !dereferenceable_or_null !2 +; ICOMB-NEXT: %ld2 = load double, double* %arrayidx2, align 8, !dereferenceable_or_null !1 +; ICOMB-NEXT: %ins1 = insertelement <4 x double> undef, double %ld1, i32 1 +; ICOMB-NEXT: %ins2 = insertelement <4 x double> %ins1, double %ld2, i32 2 +; ICOMB-NEXT: %shuffle = shufflevector <4 x double> %ins2, <4 x double> undef, <4 x i32> +; ICOMB-NEXT: ret <4 x double> %shuffle + +define <4 x double> @load_four_scalars_but_use_two1(double* %ptr) { + %arrayidx0 = getelementptr inbounds double, double* %ptr, i64 0 + %arrayidx1 = getelementptr inbounds double, double* %ptr, i64 1 + %arrayidx2 = getelementptr inbounds double, double* %ptr, i64 2 + %arrayidx3 = getelementptr inbounds double, double* %ptr, i64 3 + + %ld0 = load double, double* %arrayidx0 + %ld1 = load double, double* %arrayidx1 + %ld2 = load double, double* %arrayidx2 + %ld3 = load double, double* %arrayidx3 + + %ins0 = insertelement <4 x double> undef, double %ld0, i32 0 + %ins1 = insertelement <4 x double> %ins0, double %ld1, i32 1 + %ins2 = insertelement <4 x double> %ins1, double %ld2, i32 2 + %ins3 = insertelement <4 x double> %ins2, double %ld3, i32 3 + + %shuffle = shufflevector <4 x double> %ins3, <4 x double> undef, <4 x i32> + ret <4 x double> %shuffle +} + Index: test/Verifier/dereferenceable-md.ll =================================================================== --- test/Verifier/dereferenceable-md.ll +++ test/Verifier/dereferenceable-md.ll @@ -18,22 +18,6 @@ ; CHECK: dereferenceable, dereferenceable_or_null apply only to load instructions, use attributes for calls or invokes ; CHECK-NEXT: call i8* @foo() -define i8 @f3(i8* %x) { -entry: - %y = load i8, i8* %x, !dereferenceable !{i64 2} - ret i8 %y -} -; CHECK: dereferenceable, dereferenceable_or_null apply only to pointer types -; CHECK-NEXT: load i8, i8* %x - -define i8 @f4(i8* %x) { -entry: - %y = load i8, i8* %x, !dereferenceable_or_null !{i64 2} - ret i8 %y -} -; CHECK: dereferenceable, dereferenceable_or_null apply only to pointer types -; CHECK-NEXT: load i8, i8* %x - define i8* @f5(i8** %x) { entry: %y = load i8*, i8** %x, !dereferenceable !{} @@ -83,4 +67,4 @@ ret i8* %y } ; CHECK: dereferenceable, dereferenceable_or_null metadata value must be an i64! -; CHECK-NEXT: load i8*, i8** %x \ No newline at end of file +; CHECK-NEXT: load i8*, i8** %x