diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h --- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h +++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h @@ -216,6 +216,9 @@ Value *getOrCreateVectorValues(Value *V, unsigned Part) override; Value *getOrCreateScalarValue(Value *V, const VPIteration &Instance) override; + + void setVectorValue(Value *Key, unsigned Part, + Value *Vector) override; }; /// A builder used to construct the current plan. diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -406,9 +406,6 @@ /// Return the pre-header block of the new loop. BasicBlock *createVectorizedLoopSkeleton(); - /// Widen a single instruction within the innermost loop. - void widenInstruction(Instruction &I); - /// Fix the vectorized code, taking care of header phi's, live-outs, and more. void fixVectorizedLoop(); @@ -4225,213 +4222,6 @@ return !CInt || CInt->isZero(); } -void InnerLoopVectorizer::widenInstruction(Instruction &I) { - switch (I.getOpcode()) { - case Instruction::Br: - case Instruction::PHI: - case Instruction::GetElementPtr: - llvm_unreachable("This instruction is handled by a different recipe."); - case Instruction::UDiv: - case Instruction::SDiv: - case Instruction::SRem: - case Instruction::URem: - case Instruction::Add: - case Instruction::FAdd: - case Instruction::Sub: - case Instruction::FSub: - case Instruction::FNeg: - case Instruction::Mul: - case Instruction::FMul: - case Instruction::FDiv: - case Instruction::FRem: - case Instruction::Shl: - case Instruction::LShr: - case Instruction::AShr: - case Instruction::And: - case Instruction::Or: - case Instruction::Xor: { - // Just widen unops and binops. 
- setDebugLocFromInst(Builder, &I); - - for (unsigned Part = 0; Part < UF; ++Part) { - SmallVector Ops; - for (Value *Op : I.operands()) - Ops.push_back(getOrCreateVectorValue(Op, Part)); - - Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops); - - if (auto *VecOp = dyn_cast(V)) - VecOp->copyIRFlags(&I); - - // Use this vector value for all users of the original instruction. - VectorLoopValueMap.setVectorValue(&I, Part, V); - addMetadata(V, &I); - } - - break; - } - case Instruction::Select: { - // Widen selects. - // If the selector is loop invariant we can create a select - // instruction with a scalar condition. Otherwise, use vector-select. - auto *SE = PSE.getSE(); - bool InvariantCond = - SE->isLoopInvariant(PSE.getSCEV(I.getOperand(0)), OrigLoop); - setDebugLocFromInst(Builder, &I); - - // The condition can be loop invariant but still defined inside the - // loop. This means that we can't just use the original 'cond' value. - // We have to take the 'vectorized' value and pick the first lane. - // Instcombine will make this a no-op. - - auto *ScalarCond = getOrCreateScalarValue(I.getOperand(0), {0, 0}); - - for (unsigned Part = 0; Part < UF; ++Part) { - Value *Cond = getOrCreateVectorValue(I.getOperand(0), Part); - Value *Op0 = getOrCreateVectorValue(I.getOperand(1), Part); - Value *Op1 = getOrCreateVectorValue(I.getOperand(2), Part); - Value *Sel = - Builder.CreateSelect(InvariantCond ? ScalarCond : Cond, Op0, Op1); - VectorLoopValueMap.setVectorValue(&I, Part, Sel); - addMetadata(Sel, &I); - } - - break; - } - - case Instruction::ICmp: - case Instruction::FCmp: { - // Widen compares. Generate vector compares. - bool FCmp = (I.getOpcode() == Instruction::FCmp); - auto *Cmp = cast(&I); - setDebugLocFromInst(Builder, Cmp); - for (unsigned Part = 0; Part < UF; ++Part) { - Value *A = getOrCreateVectorValue(Cmp->getOperand(0), Part); - Value *B = getOrCreateVectorValue(Cmp->getOperand(1), Part); - Value *C = nullptr; - if (FCmp) { - // Propagate fast math flags. 
- IRBuilder<>::FastMathFlagGuard FMFG(Builder); - Builder.setFastMathFlags(Cmp->getFastMathFlags()); - C = Builder.CreateFCmp(Cmp->getPredicate(), A, B); - } else { - C = Builder.CreateICmp(Cmp->getPredicate(), A, B); - } - VectorLoopValueMap.setVectorValue(&I, Part, C); - addMetadata(C, &I); - } - - break; - } - - case Instruction::ZExt: - case Instruction::SExt: - case Instruction::FPToUI: - case Instruction::FPToSI: - case Instruction::FPExt: - case Instruction::PtrToInt: - case Instruction::IntToPtr: - case Instruction::SIToFP: - case Instruction::UIToFP: - case Instruction::Trunc: - case Instruction::FPTrunc: - case Instruction::BitCast: { - auto *CI = cast(&I); - setDebugLocFromInst(Builder, CI); - - /// Vectorize casts. - Type *DestTy = - (VF == 1) ? CI->getType() : VectorType::get(CI->getType(), VF); - - for (unsigned Part = 0; Part < UF; ++Part) { - Value *A = getOrCreateVectorValue(CI->getOperand(0), Part); - Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy); - VectorLoopValueMap.setVectorValue(&I, Part, Cast); - addMetadata(Cast, &I); - } - break; - } - - case Instruction::Call: { - // Ignore dbg intrinsics. - if (isa(I)) - break; - setDebugLocFromInst(Builder, &I); - - Module *M = I.getParent()->getParent()->getParent(); - auto *CI = cast(&I); - - SmallVector Tys; - for (Value *ArgOperand : CI->arg_operands()) - Tys.push_back(ToVectorTy(ArgOperand->getType(), VF)); - - Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); - - // The flag shows whether we use Intrinsic or a usual Call for vectorized - // version of the instruction. - // Is it beneficial to perform intrinsic call compared to lib call? 
- bool NeedToScalarize = false; - unsigned CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize); - bool UseVectorIntrinsic = - ID && Cost->getVectorIntrinsicCost(CI, VF) <= CallCost; - assert((UseVectorIntrinsic || !NeedToScalarize) && - "Instruction should be scalarized elsewhere."); - - for (unsigned Part = 0; Part < UF; ++Part) { - SmallVector Args; - for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) { - Value *Arg = CI->getArgOperand(i); - // Some intrinsics have a scalar argument - don't replace it with a - // vector. - if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, i)) - Arg = getOrCreateVectorValue(CI->getArgOperand(i), Part); - Args.push_back(Arg); - } - - Function *VectorF; - if (UseVectorIntrinsic) { - // Use vector version of the intrinsic. - Type *TysForDecl[] = {CI->getType()}; - if (VF > 1) - TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF); - VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl); - } else { - // Use vector version of the function call. - const VFShape Shape = - VFShape::get(*CI, {VF, false} /*EC*/, false /*HasGlobalPred*/); -#ifndef NDEBUG - const SmallVector Infos = VFDatabase::getMappings(*CI); - assert(std::find_if(Infos.begin(), Infos.end(), - [&Shape](const VFInfo &Info) { - return Info.Shape == Shape; - }) != Infos.end() && - "Vector function shape is missing from the database."); -#endif - VectorF = VFDatabase(*CI).getVectorizedFunction(Shape); - } - assert(VectorF && "Can't create vector function."); - - SmallVector OpBundles; - CI->getOperandBundlesAsDefs(OpBundles); - CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles); - - if (isa(V)) - V->copyFastMathFlags(CI); - - VectorLoopValueMap.setVectorValue(&I, Part, V); - addMetadata(V, &I); - } - - break; - } - - default: - // This instruction is not vectorized by simple widening. - LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I); - llvm_unreachable("Unhandled instruction!"); - } // end of switch. 
-} - void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) { // We should not collect Scalars more than once per VF. Right now, this // function is called from collectUniformsAndScalars(), which already does @@ -6982,7 +6772,12 @@ return false; // Success: widen this instruction. - VPWidenRecipe *WidenRecipe = new VPWidenRecipe(I); + VPlan &Plan = *VPBB->getPlan(); + // Create VPValue operands. + auto VPValues = map_range( + I->operands(), [&Plan](Value *Op) { return Plan.getOrAddVPValue(Op); }); + SmallVector Values(VPValues.begin(), VPValues.end()); + VPWidenRecipe *WidenRecipe = new VPWidenRecipe(I, Values); setRecipe(I, WidenRecipe); VPBB->appendRecipe(WidenRecipe); return true; @@ -7365,8 +7160,17 @@ ILV->addMetadata(To, From); } -void VPTransformState::setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr) { - ILV->setDebugLocFromInst(B, Ptr); +void VPTransformState::setDebugLocFromInst(const Value *Ptr) { + ILV->setDebugLocFromInst(Builder, Ptr); +} + +unsigned VPTransformState::getVectorCallCost(CallInst *CI, unsigned VF, + bool &NeedToScalarize) { + return ILV->Cost->getVectorCallCost(CI, VF, NeedToScalarize); +} + +unsigned VPTransformState::getVectorIntrinsicCost(CallInst *CI, unsigned VF) { + return ILV->Cost->getVectorIntrinsicCost(CI, VF); } void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent, @@ -7389,7 +7193,222 @@ } void VPWidenRecipe::execute(VPTransformState &State) { - State.ILV->widenInstruction(*Ingredient); + Instruction &I = *Ingredient; + auto GetVectorOps = [&State](ArrayRef Ops, unsigned Part) { + SmallVector VecOps; + for (VPValue *Op : Ops) + VecOps.push_back(State.get(Op, Part)); + return VecOps; + }; + switch (I.getOpcode()) { + case Instruction::Br: + case Instruction::PHI: + case Instruction::GetElementPtr: + llvm_unreachable("This instruction is handled by a different recipe."); + case Instruction::UDiv: + case Instruction::SDiv: + case Instruction::SRem: + case Instruction::URem: + case Instruction::Add: + 
case Instruction::FAdd: + case Instruction::Sub: + case Instruction::FSub: + case Instruction::FNeg: + case Instruction::Mul: + case Instruction::FMul: + case Instruction::FDiv: + case Instruction::FRem: + case Instruction::Shl: + case Instruction::LShr: + case Instruction::AShr: + case Instruction::And: + case Instruction::Or: + case Instruction::Xor: { + // Just widen unops and binops. + State.setDebugLocFromInst(&I); + + for (unsigned Part = 0; Part < State.UF; ++Part) { + Value *V = State.Builder.CreateNAryOp( + I.getOpcode(), GetVectorOps(User.getOperandsRef(), Part)); + + if (auto *VecOp = dyn_cast(V)) + VecOp->copyIRFlags(&I); + + // Use this vector value for all users of the original instruction. + State.Callback.setVectorValue(&I, Part, V); + State.addMetadata(V, &I); + } + + break; + } + case Instruction::Select: { + // Widen selects. + // If the selector is loop invariant we can create a select + // instruction with a scalar condition. Otherwise, use vector-select. + auto *SE = State.PSE.getSE(); + bool InvariantCond = SE->isLoopInvariant(State.PSE.getSCEV(I.getOperand(0)), + State.OriginalLoop); + State.setDebugLocFromInst(&I); + + // The condition can be loop invariant but still defined inside the + // loop. This means that we can't just use the original 'cond' value. + // We have to take the 'vectorized' value and pick the first lane. + // Instcombine will make this a no-op. + + auto *ScalarCond = + State.Callback.getOrCreateScalarValue(I.getOperand(0), {0, 0}); + + for (unsigned Part = 0; Part < State.UF; ++Part) { + auto VectorOps = GetVectorOps(User.getOperandsRef(), Part); + Value *Cond = VectorOps[0]; + Value *Op0 = VectorOps[1]; + Value *Op1 = VectorOps[2]; + Value *Sel = State.Builder.CreateSelect(InvariantCond ? ScalarCond : Cond, + Op0, Op1); + State.Callback.setVectorValue(&I, Part, Sel); + State.addMetadata(Sel, &I); + } + + break; + } + + case Instruction::ICmp: + case Instruction::FCmp: { + // Widen compares. Generate vector compares. 
+ bool FCmp = (I.getOpcode() == Instruction::FCmp); + auto *Cmp = cast(&I); + State.setDebugLocFromInst(Cmp); + for (unsigned Part = 0; Part < State.UF; ++Part) { + auto VectorOps = GetVectorOps(User.getOperandsRef(), Part); + Value *A = VectorOps[0]; + Value *B = VectorOps[1]; + Value *C = nullptr; + if (FCmp) { + // Propagate fast math flags. + IRBuilder<>::FastMathFlagGuard FMFG(State.Builder); + State.Builder.setFastMathFlags(Cmp->getFastMathFlags()); + C = State.Builder.CreateFCmp(Cmp->getPredicate(), A, B); + } else { + C = State.Builder.CreateICmp(Cmp->getPredicate(), A, B); + } + State.Callback.setVectorValue(&I, Part, C); + State.addMetadata(C, &I); + } + + break; + } + + case Instruction::ZExt: + case Instruction::SExt: + case Instruction::FPToUI: + case Instruction::FPToSI: + case Instruction::FPExt: + case Instruction::PtrToInt: + case Instruction::IntToPtr: + case Instruction::SIToFP: + case Instruction::UIToFP: + case Instruction::Trunc: + case Instruction::FPTrunc: + case Instruction::BitCast: { + auto *CI = cast(&I); + State.setDebugLocFromInst(CI); + + /// Vectorize casts. + Type *DestTy = (State.VF == 1) ? CI->getType() + : VectorType::get(CI->getType(), State.VF); + + for (unsigned Part = 0; Part < State.UF; ++Part) { + auto VectorOps = GetVectorOps(User.getOperandsRef(), Part); + Value *A = VectorOps[0]; + Value *Cast = State.Builder.CreateCast(CI->getOpcode(), A, DestTy); + State.Callback.setVectorValue(&I, Part, Cast); + State.addMetadata(Cast, &I); + } + break; + } + + case Instruction::Call: { + // Ignore dbg intrinsics. 
+ if (isa(I)) + break; + State.setDebugLocFromInst(&I); + + Module *M = I.getParent()->getParent()->getParent(); + auto *CI = cast(&I); + + SmallVector Tys; + for (Value *ArgOperand : CI->arg_operands()) + Tys.push_back(ToVectorTy(ArgOperand->getType(), State.VF)); + + Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, State.TLI); + + // The flag shows whether we use Intrinsic or a usual Call for vectorized + // version of the instruction. + // Is it beneficial to perform intrinsic call compared to lib call? + bool NeedToScalarize = false; + unsigned CallCost = State.getVectorCallCost(CI, State.VF, NeedToScalarize); + bool UseVectorIntrinsic = + ID && State.getVectorIntrinsicCost(CI, State.VF) <= CallCost; + assert((UseVectorIntrinsic || !NeedToScalarize) && + "Instruction should be scalarized elsewhere."); + + for (unsigned Part = 0; Part < State.UF; ++Part) { + SmallVector Args; + // Need to use CI->getNumArgOperands, to skip the called function and + // operand bundles. + for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) { + VPValue *Arg = User.getOperand(i); + // Some intrinsics have a scalar argument - don't replace it with a + // vector. + Value *ArgVal = Arg->getUnderlyingValue(); + if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, i)) + ArgVal = State.get(Arg, Part); + Args.push_back(ArgVal); + } + + Function *VectorF; + if (UseVectorIntrinsic) { + // Use vector version of the intrinsic. + Type *TysForDecl[] = {CI->getType()}; + if (State.VF > 1) + TysForDecl[0] = + VectorType::get(CI->getType()->getScalarType(), State.VF); + VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl); + } else { + // Use vector version of the function call. 
+ const VFShape Shape = VFShape::get(*CI, /*EC=*/{State.VF, false}, + /*HasGlobalPred=*/false); +#ifndef NDEBUG + const SmallVector Infos = VFDatabase::getMappings(*CI); + assert(std::find_if(Infos.begin(), Infos.end(), + [&Shape](const VFInfo &Info) { + return Info.Shape == Shape; + }) != Infos.end() && + "Vector function shape is missing from the database."); +#endif + VectorF = VFDatabase(*CI).getVectorizedFunction(Shape); + } + assert(VectorF && "Can't create vector function."); + + SmallVector OpBundles; + CI->getOperandBundlesAsDefs(OpBundles); + CallInst *V = State.Builder.CreateCall(VectorF, Args, OpBundles); + + if (isa(V)) + V->copyFastMathFlags(CI); + + State.Callback.setVectorValue(&I, Part, V); + State.addMetadata(V, &I); + } + + break; + } + + default: + // This instruction is not vectorized by simple widening. + LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I); + llvm_unreachable("Unhandled instruction!"); + } // } void VPWidenGEPRecipe::execute(VPTransformState &State) { diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -230,6 +230,7 @@ virtual Value *getOrCreateVectorValues(Value *V, unsigned Part) = 0; virtual Value *getOrCreateScalarValue(Value *V, const VPIteration &Instance) = 0; + virtual void setVectorValue(Value *Key, unsigned Part, Value *Vector) = 0; }; /// VPTransformState holds information passed down when "executing" a VPlan, @@ -293,7 +294,7 @@ void addMetadata(ArrayRef To, Instruction *From); - void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr); + void setDebugLocFromInst(const Value *Ptr); unsigned getVectorCallCost(CallInst *CI, unsigned VF, bool &NeedToScalarize); @@ -784,8 +785,11 @@ /// Hold the ingredients by pointing to their original BasicBlock location. 
Instruction *Ingredient; + VPUser User; + public: - VPWidenRecipe(Instruction *I) : VPRecipeBase(VPWidenSC), Ingredient(I) {} + VPWidenRecipe(Instruction *I, ArrayRef Operands) + : VPRecipeBase(VPWidenSC), Ingredient(I), User(Operands) {} ~VPWidenRecipe() override = default; diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -71,8 +71,14 @@ NewRecipe = new VPWidenPHIRecipe(Phi); } else if (GetElementPtrInst *GEP = dyn_cast(Inst)) { NewRecipe = new VPWidenGEPRecipe(GEP, OrigLoop); - } else - NewRecipe = new VPWidenRecipe(Inst); + } else { + // Create VPValue operands. + auto VPValues = map_range(Inst->operands(), [&Plan](Value *Op) { + return Plan.getOrAddVPValue(Op); + }); + SmallVector Values(VPValues.begin(), VPValues.end()); + NewRecipe = new VPWidenRecipe(Inst, Values); + } NewRecipe->insertBefore(Ingredient); Ingredient->eraseFromParent(); diff --git a/llvm/lib/Transforms/Vectorize/VPlanValue.h b/llvm/lib/Transforms/Vectorize/VPlanValue.h --- a/llvm/lib/Transforms/Vectorize/VPlanValue.h +++ b/llvm/lib/Transforms/Vectorize/VPlanValue.h @@ -43,6 +43,7 @@ friend class VPBasicBlock; friend class VPInterleavedAccessInfo; friend class VPSlotTracker; + friend class VPWidenRecipe; private: const unsigned char SubclassID; ///< Subclass identifier (for isa/dyn_cast). @@ -182,6 +183,8 @@ const_operand_range operands() const { return const_operand_range(op_begin(), op_end()); } + + ArrayRef getOperandsRef() { return {Operands}; } }; class VPlan; class VPBasicBlock;