Index: include/llvm/IR/Intrinsics.td
===================================================================
--- include/llvm/IR/Intrinsics.td
+++ include/llvm/IR/Intrinsics.td
@@ -955,6 +955,11 @@
 def int_ssa_copy : Intrinsic<[llvm_any_ty], [LLVMMatchType<0>],
                              [IntrNoMem, Returned<0>]>;
+
+def int_phantom_mem : Intrinsic<[],
+                                [llvm_anyptr_ty, llvm_anyptr_ty, llvm_i64_ty],
+                                [IntrArgMemOnly]>;
+
 
 //===----------------------------------------------------------------------===//
 // Target-specific intrinsics
 //===----------------------------------------------------------------------===//
Index: include/llvm/Transforms/Vectorize/SLPVectorizer.h
===================================================================
--- include/llvm/Transforms/Vectorize/SLPVectorizer.h
+++ include/llvm/Transforms/Vectorize/SLPVectorizer.h
@@ -112,6 +112,10 @@
   /// collected in GEPs.
   bool vectorizeGEPIndices(BasicBlock *BB, slpvectorizer::BoUpSLP &R);
 
+  /// \brief Restore inserts, loads out of phantom_mem intrinsic.
+  InsertElementInst * restoreInserts(InsertElementInst *FirstInsertElem,
+                                     LoadInst *LInstr, Value* Ptr);
+
   /// Try to find horizontal reduction or otherwise vectorize a chain of binary
   /// operators.
   bool vectorizeRootInstruction(PHINode *P, Value *V, BasicBlock *BB,
@@ -148,6 +152,8 @@
   /// The getelementptr instructions in a basic block organized by
   /// base pointer.
   WeakTrackingVHListMap GEPs;
+
+  DenseMap<Value *, uint64_t> PhantomMem;
 };
 
 } // end namespace llvm
Index: lib/Transforms/Vectorize/SLPVectorizer.cpp
===================================================================
--- lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -4198,6 +4198,21 @@
 
   // A general note: the vectorizer must use BoUpSLP::eraseInstruction() to
   // delete instructions.
+  for (auto &BB : F)
+    for (BasicBlock::iterator BBI = BB.begin(), E = BB.end(); BBI != E;) {
+      Instruction *I = &*BBI++;
+
+      CallInst *CI = dyn_cast<CallInst>(I);
+      if (!CI)
+        continue;
+      if (getIntrinsicForCallSite(CI, TLI) == Intrinsic::phantom_mem) {
+        ConstantInt *MaxOffset = cast<ConstantInt>(CI->getOperand(2));
+        PhantomMem[CI->getOperand(0)] = MaxOffset->getZExtValue();
+        CI->removeFromParent();
+        CI->dropAllReferences();
+      }
+    }
+
   // Scan the blocks in the function in post order.
   for (auto BB : post_order(&F.getEntryBlock())) {
     collectSeedInstructions(BB);
@@ -5327,6 +5342,130 @@
   return tryToVectorizeList(BuildVectorOpds, R, BuildVector, false);
 }
 
+InsertElementInst *
+SLPVectorizerPass::restoreInserts(InsertElementInst *FirstInsertElem,
+                                  LoadInst *LInstr, Value *Ptr) {
+  ShuffleVectorInst *Use = nullptr;
+  DenseMap<uint64_t, LoadInst *> Loads;
+  DenseMap<uint64_t, InsertElementInst *> Inserts;
+  DenseMap<uint64_t, GetElementPtrInst *> GEPs;
+  BasicBlock *BB = FirstInsertElem->getParent();
+  VectorType *VecType = FirstInsertElem->getType();
+  uint64_t MaxOffset = PhantomMem[Ptr];
+
+  ConstantInt *C = dyn_cast<ConstantInt>(FirstInsertElem->getOperand(2));
+
+  if (!C || C->getZExtValue() > MaxOffset)
+    return nullptr;
+
+  // We found the first InsertElem in
+  // this chain.
+  uint64_t Offset = C->getZExtValue();
+  Inserts[Offset] = FirstInsertElem;
+  Loads[Offset] = LInstr;
+  GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(LInstr->getOperand(0));
+  if (GEP)
+    GEPs[Offset] = GEP;
+
+  for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; it++) {
+    if (InsertElementInst *Insert = dyn_cast<InsertElementInst>(it)) {
+      ConstantInt *C = dyn_cast<ConstantInt>(Insert->getOperand(2));
+      LoadInst *Load = dyn_cast<LoadInst>(Insert->getOperand(1));
+      if (!Load || Insert == FirstInsertElem)
+        continue;
+      GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Load->getOperand(0));
+      Value *Base = Load->getOperand(0);
+      if (Base == Ptr) {
+        if (!C)
+          return nullptr;
+        uint64_t Offset = C->getZExtValue();
+        if (Offset > MaxOffset || !Load->hasOneUse())
+          return nullptr;
+        Inserts[Offset] = Insert;
+        Loads[Offset] = Load;
+        // GEP instruction might be missing for some offsets.
+        if (GEP)
+          GEPs[Offset] = GEP;
+        continue;
+      }
+    }
+    if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(it))
+      // Examining the chain of GEP, Load, Insert.
+      if (Ptr == GEP->getOperand(0)) {
+        ConstantInt *C = dyn_cast<ConstantInt>(GEP->getOperand(1));
+        if (!C || GEP->use_empty() || !GEP->hasOneUse())
+          return nullptr;
+        uint64_t Off = C->getZExtValue();
+        LoadInst *Load = dyn_cast<LoadInst>(GEP->user_back());
+        if (!Load || Load->use_empty() || !Load->hasOneUse())
+          return nullptr;
+        InsertElementInst *Insert =
+            dyn_cast<InsertElementInst>(Load->user_back());
+        if (!Insert)
+          return nullptr;
+        ConstantInt *C1 = cast<ConstantInt>(Insert->getOperand(2));
+        uint64_t Off_Ins = C1->getZExtValue();
+        // Offset from GEP must be equal to offset from
+        // Insert otherwise we have to ignore.
+        if (Off != Off_Ins)
+          return nullptr;
+        if (!Insert->use_empty())
+          if (ShuffleVectorInst *Shuffle =
+                  dyn_cast<ShuffleVectorInst>(Insert->user_back()))
+            Use = Shuffle;
+        Inserts[Off] = Insert;
+        Loads[Off] = Load;
+        GEPs[Off] = GEP;
+      }
+  }
+  if (!Use || Use->getParent() != BB)
+    return nullptr;
+
+  IRBuilder<> Builder(LInstr->getParent(), ++BasicBlock::iterator(LInstr));
+  if (!Inserts[0]) {
+    Builder.SetInsertPoint(LInstr->getPrevNode());
+    LoadInst *NewLoad = Builder.CreateLoad(Ptr);
+    NewLoad->setAlignment(LInstr->getAlignment());
+    Loads[0] = NewLoad;
+    Value *NewInsert = Builder.CreateInsertElement(
+        UndefValue::get(VecType), NewLoad, Builder.getInt32(0));
+    Inserts[0] = cast<InsertElementInst>(NewInsert);
+  }
+  for (unsigned i = 1, e = MaxOffset + 1; i < e; i++) {
+    // Building GEP, Load, Insert for
+    // this Offset.
+    GetElementPtrInst *PrevGEP = GEPs[i - 1];
+    if (!Inserts[i]) {
+      assert(Loads[i - 1] != nullptr &&
+             "Couldn't find previous load in the chain.");
+      if (PrevGEP == nullptr)
+        Builder.SetInsertPoint(BB, ++Loads[i - 1]->getIterator());
+      else
+        Builder.SetInsertPoint(BB, ++PrevGEP->getIterator());
+      Value *NewGEP =
+          Builder.CreateGEP(LInstr->getType(), Ptr, Builder.getInt64(i));
+      GEPs[i] = cast<GetElementPtrInst>(NewGEP);
+      Builder.SetInsertPoint(BB, ++Loads[i - 1]->getIterator());
+      Builder.SetCurrentDebugLocation(LInstr->getDebugLoc());
+      LoadInst *NewLoad = Builder.CreateLoad(NewGEP);
+      NewLoad->setAlignment(LInstr->getAlignment());
+      Loads[i] = NewLoad;
+      // InsertElement must be inserted just after
+      // previous InsertElement.
+      InsertElementInst *PrevIns = Inserts[i - 1];
+      Builder.SetInsertPoint(BB, ++PrevIns->getIterator());
+      Value *NewInsert =
+          Builder.CreateInsertElement(PrevIns, NewLoad, Builder.getInt32(i));
+      Inserts[i] = cast<InsertElementInst>(NewInsert);
+    }
+    Inserts[i]->setOperand(0, Inserts[i - 1]);
+  }
+
+  // Update ShuffleVector with restored insert element.
+  Use->setOperand(0, Inserts[MaxOffset]);
+  return cast<InsertElementInst>(Inserts[MaxOffset]);
+}
+
 bool SLPVectorizerPass::vectorizeInsertElementInst(InsertElementInst *IEI,
                                                    BasicBlock *BB, BoUpSLP &R) {
   SmallVector<Value *, 16> BuildVector;
@@ -5334,6 +5473,25 @@
   if (!findBuildVector(IEI, BuildVector, BuildVectorOpds))
     return false;
 
+  if (LoadInst *LInstr = dyn_cast<LoadInst>(IEI->getOperand(1))) {
+    Value *Ptr = nullptr;
+    if (PhantomMem.find(LInstr->getOperand(0)) != PhantomMem.end())
+      Ptr = LInstr->getOperand(0);
+    else if (GetElementPtrInst *GEP =
+                 dyn_cast<GetElementPtrInst>(LInstr->getOperand(0)))
+      if (PhantomMem.find(GEP->getOperand(0)) != PhantomMem.end())
+        Ptr = GEP->getOperand(0);
+
+    if (!Ptr)
+      return false;
+    if (InsertElementInst *Insert = restoreInserts(IEI, LInstr, Ptr)) {
+      BuildVector.clear();
+      BuildVectorOpds.clear();
+      if (!findBuildVector(Insert, BuildVector, BuildVectorOpds))
+        return false;
+    }
+  }
+
   // Vectorize starting with the build vector operands ignoring the BuildVector
   // instructions for the purpose of scheduling and user extraction.
   return tryToVectorizeList(BuildVectorOpds, R, BuildVector);
Index: test/Transforms/SLPVectorizer/X86/pr21780.ll
===================================================================
--- /dev/null
+++ test/Transforms/SLPVectorizer/X86/pr21780.ll
@@ -0,0 +1,62 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -slp-vectorizer -mtriple=x86_64-unknown-linux-gnu -mcpu=bdver2 -S | FileCheck %s
+
+define <4 x double> @load_four_scalars_but_use_two(double* %ptr) #0 {
+; CHECK-LABEL: @load_four_scalars_but_use_two(
+; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds double, double* [[PTR:%.*]], i64 2
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr double, double* [[PTR]], i64 3
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr double, double* [[PTR]], i64 1
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast double* [[PTR]] to <4 x double>*
+; CHECK-NEXT:    [[TMP4:%.*]] = load <4 x double>, <4 x double>* [[TMP3]], align 8
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x double> [[TMP4]], i32 0
+; CHECK-NEXT:    [[INS0:%.*]] = insertelement <4 x double> undef, double [[TMP5]], i32 0
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <4 x double> [[TMP4]], i32 1
+; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <4 x double> [[INS0]], double [[TMP6]], i32 1
+; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <4 x double> [[TMP4]], i32 2
+; CHECK-NEXT:    [[INS2:%.*]] = insertelement <4 x double> [[TMP7]], double [[TMP8]], i32 2
+; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <4 x double> [[TMP4]], i32 3
+; CHECK-NEXT:    [[TMP10:%.*]] = insertelement <4 x double> [[INS2]], double [[TMP9]], i32 3
+; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x double> [[TMP10]], <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
+; CHECK-NEXT:    ret <4 x double> [[SHUFFLE]]
+;
+  call void @llvm.phantom.mem.p0f64.p0f64(double* %ptr, double* null, i64 3)
+  %arrayidx2 = getelementptr inbounds double, double* %ptr, i64 2
+  %ld0 = load double, double* %ptr, align 8
+  %ld2 = load double, double* %arrayidx2, align 8
+  %ins0 = insertelement <4 x double> undef, double %ld0, i32 0
+  %ins2 = insertelement <4 x double> %ins0, double %ld2, i32 2
+  %shuffle = shufflevector <4 x double> %ins2, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
+  ret <4 x double> %shuffle
+}
+
+define <4 x double> @load_four_scalars_but_use_two1(double* %ptr) #0 {
+; CHECK-LABEL: @load_four_scalars_but_use_two1(
+; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds double, double* [[PTR:%.*]], i64 1
+; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds double, double* [[PTR]], i64 2
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr double, double* [[PTR]], i64 3
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast double* [[PTR]] to <4 x double>*
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x double>, <4 x double>* [[TMP2]], align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x double> [[TMP3]], i32 0
+; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <4 x double> undef, double [[TMP4]], i32 0
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <4 x double> [[TMP3]], i32 1
+; CHECK-NEXT:    [[INS1:%.*]] = insertelement <4 x double> [[TMP5]], double [[TMP6]], i32 1
+; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <4 x double> [[TMP3]], i32 2
+; CHECK-NEXT:    [[INS2:%.*]] = insertelement <4 x double> [[INS1]], double [[TMP7]], i32 2
+; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <4 x double> [[TMP3]], i32 3
+; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <4 x double> [[INS2]], double [[TMP8]], i32 3
+; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x double> [[TMP9]], <4 x double> undef, <4 x i32> <i32 1, i32 1, i32 2, i32 2>
+; CHECK-NEXT:    ret <4 x double> [[SHUFFLE]]
+;
+  call void @llvm.phantom.mem.p0f64.p0f64(double* %ptr, double* null, i64 3)
+  %arrayidx1 = getelementptr inbounds double, double* %ptr, i64 1
+  %arrayidx2 = getelementptr inbounds double, double* %ptr, i64 2
+  %ld1 = load double, double* %arrayidx1, align 8
+  %ld2 = load double, double* %arrayidx2, align 8
+  %ins1 = insertelement <4 x double> undef, double %ld1, i32 1
+  %ins2 = insertelement <4 x double> %ins1, double %ld2, i32 2
+  %shuffle = shufflevector <4 x double> %ins2, <4 x double> undef, <4 x i32> <i32 1, i32 1, i32 2, i32 2>
+  ret <4 x double> %shuffle
+}
+
+; Function Attrs: argmemonly nounwind
+declare void @llvm.phantom.mem.p0f64.p0f64(double*, double*, i64) #1