Index: llvm/trunk/include/llvm/Transforms/Utils/LoopUtils.h =================================================================== --- llvm/trunk/include/llvm/Transforms/Utils/LoopUtils.h +++ llvm/trunk/include/llvm/Transforms/Utils/LoopUtils.h @@ -263,13 +263,15 @@ enum InductionKind { IK_NoInduction, ///< Not an induction variable. IK_IntInduction, ///< Integer induction variable. Step = C. - IK_PtrInduction ///< Pointer induction var. Step = C / sizeof(elem). + IK_PtrInduction, ///< Pointer induction var. Step = C / sizeof(elem). + IK_FpInduction ///< Floating point induction variable. }; public: /// Default constructor - creates an invalid induction. InductionDescriptor() - : StartValue(nullptr), IK(IK_NoInduction), Step(nullptr) {} + : StartValue(nullptr), IK(IK_NoInduction), Step(nullptr), + InductionBinOp(nullptr) {} /// Get the consecutive direction. Returns: /// 0 - unknown or non-consecutive. @@ -291,26 +293,58 @@ const SCEV *getStep() const { return Step; } ConstantInt *getConstIntStepValue() const; - /// Returns true if \p Phi is an induction. If \p Phi is an induction, - /// the induction descriptor \p D will contain the data describing this - /// induction. If by some other means the caller has a better SCEV + /// Returns true if \p Phi is an induction in the loop \p L. If \p Phi is an + /// induction, the induction descriptor \p D will contain the data describing + /// this induction. If by some other means the caller has a better SCEV /// expression for \p Phi than the one returned by the ScalarEvolution /// analysis, it can be passed through \p Expr. - static bool isInductionPHI(PHINode *Phi, ScalarEvolution *SE, + static bool isInductionPHI(PHINode *Phi, const Loop* L, ScalarEvolution *SE, InductionDescriptor &D, const SCEV *Expr = nullptr); - /// Returns true if \p Phi is an induction, in the context associated with - /// the run-time predicate of PSE. 
If \p Assume is true, this can add further - /// SCEV predicates to \p PSE in order to prove that \p Phi is an induction. + /// Returns true if \p Phi is a floating point induction in the loop \p L. + /// If \p Phi is an induction, the induction descriptor \p D will contain + /// the data describing this induction. + static bool isFPInductionPHI(PHINode *Phi, const Loop* L, + ScalarEvolution *SE, InductionDescriptor &D); + + /// Returns true if \p Phi is an induction in the loop \p L, in the context + /// associated with the run-time predicate of PSE. If \p Assume is true, this + /// can add further SCEV predicates to \p PSE in order to prove that \p Phi is + /// an induction. /// If \p Phi is an induction, \p D will contain the data describing this /// induction. - static bool isInductionPHI(PHINode *Phi, PredicatedScalarEvolution &PSE, + static bool isInductionPHI(PHINode *Phi, const Loop* L, + PredicatedScalarEvolution &PSE, InductionDescriptor &D, bool Assume = false); + /// Returns true if the induction type is FP and the binary operator does + /// not have the "fast-math" property. Such an operation requires a relaxed + /// FP mode. + bool hasUnsafeAlgebra() { + return InductionBinOp && + !cast(InductionBinOp)->hasUnsafeAlgebra(); + } + + /// Returns the induction operator if it does not have the "fast-math" + /// property and therefore requires FP unsafe mode; returns nullptr otherwise. + Instruction *getUnsafeAlgebraInst() { + if (!InductionBinOp || + cast(InductionBinOp)->hasUnsafeAlgebra()) + return nullptr; + return InductionBinOp; + } + + /// Returns binary opcode of the induction operator. + Instruction::BinaryOps getInductionOpcode() const { + return InductionBinOp ? InductionBinOp->getOpcode() : + Instruction::BinaryOpsEnd; + } + private: /// Private constructor - used by \c isInductionPHI. - InductionDescriptor(Value *Start, InductionKind K, const SCEV *Step); + InductionDescriptor(Value *Start, InductionKind K, const SCEV *Step, + BinaryOperator *InductionBinOp = nullptr); /// Start value. 
TrackingVH StartValue; @@ -318,6 +352,8 @@ InductionKind IK; /// Step value. const SCEV *Step; + // Instruction that advances induction variable. + BinaryOperator *InductionBinOp; }; BasicBlock *InsertPreheaderForLoop(Loop *L, DominatorTree *DT, LoopInfo *LI, Index: llvm/trunk/lib/Transforms/Scalar/LoopInterchange.cpp =================================================================== --- llvm/trunk/lib/Transforms/Scalar/LoopInterchange.cpp +++ llvm/trunk/lib/Transforms/Scalar/LoopInterchange.cpp @@ -703,7 +703,7 @@ RecurrenceDescriptor RD; InductionDescriptor ID; PHINode *PHI = cast(I); - if (InductionDescriptor::isInductionPHI(PHI, SE, ID)) + if (InductionDescriptor::isInductionPHI(PHI, L, SE, ID)) Inductions.push_back(PHI); else if (RecurrenceDescriptor::isReductionPHI(PHI, L, RD)) Reductions.push_back(PHI); Index: llvm/trunk/lib/Transforms/Utils/LoopUtils.cpp =================================================================== --- llvm/trunk/lib/Transforms/Utils/LoopUtils.cpp +++ llvm/trunk/lib/Transforms/Utils/LoopUtils.cpp @@ -654,8 +654,8 @@ } InductionDescriptor::InductionDescriptor(Value *Start, InductionKind K, - const SCEV *Step) - : StartValue(Start), IK(K), Step(Step) { + const SCEV *Step, BinaryOperator *BOp) + : StartValue(Start), IK(K), Step(Step), InductionBinOp(BOp) { assert(IK != IK_NoInduction && "Not an induction"); // Start value type should match the induction kind and the value @@ -672,7 +672,15 @@ assert((IK != IK_PtrInduction || getConstIntStepValue()) && "Step value should be constant for pointer induction"); - assert(Step->getType()->isIntegerTy() && "StepValue is not an integer"); + assert((IK == IK_FpInduction || Step->getType()->isIntegerTy()) && + "StepValue is not an integer"); + + assert((IK != IK_FpInduction || Step->getType()->isFloatingPointTy()) && + "StepValue is not FP for FpInduction"); + assert((IK != IK_FpInduction || (InductionBinOp && + (InductionBinOp->getOpcode() == Instruction::FAdd || + InductionBinOp->getOpcode() == 
Instruction::FSub))) && + "Binary opcode should be specified for FP induction"); } int InductionDescriptor::getConsecutiveDirection() const { @@ -693,6 +701,8 @@ const DataLayout& DL) const { SCEVExpander Exp(*SE, DL, "induction"); + assert(Index->getType() == Step->getType() && + "Index type does not match StepValue type"); switch (IK) { case IK_IntInduction: { assert(Index->getType() == StartValue->getType() && @@ -717,29 +727,113 @@ return Exp.expandCodeFor(S, StartValue->getType(), &*B.GetInsertPoint()); } case IK_PtrInduction: { - assert(Index->getType() == Step->getType() && - "Index type does not match StepValue type"); assert(isa(Step) && "Expected constant step for pointer induction"); const SCEV *S = SE->getMulExpr(SE->getSCEV(Index), Step); Index = Exp.expandCodeFor(S, Index->getType(), &*B.GetInsertPoint()); return B.CreateGEP(nullptr, StartValue, Index); } + case IK_FpInduction: { + assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value"); + assert(InductionBinOp && + (InductionBinOp->getOpcode() == Instruction::FAdd || + InductionBinOp->getOpcode() == Instruction::FSub) && + "Original bin op should be defined for FP induction"); + + Value *StepValue = cast(Step)->getValue(); + + // Floating point operations had to be 'fast' to enable the induction. + FastMathFlags Flags; + Flags.setUnsafeAlgebra(); + + Value *MulExp = B.CreateFMul(StepValue, Index); + if (isa(MulExp)) + // We have to check, the MulExp may be a constant. + cast(MulExp)->setFastMathFlags(Flags); + + Value *BOp = B.CreateBinOp(InductionBinOp->getOpcode() , StartValue, + MulExp, "induction"); + if (isa(BOp)) + cast(BOp)->setFastMathFlags(Flags); + + return BOp; + } case IK_NoInduction: return nullptr; } llvm_unreachable("invalid enum"); } -bool InductionDescriptor::isInductionPHI(PHINode *Phi, +bool InductionDescriptor::isFPInductionPHI(PHINode *Phi, const Loop *TheLoop, + ScalarEvolution *SE, + InductionDescriptor &D) { + + // Here we only handle FP induction variables. 
+ assert(Phi->getType()->isFloatingPointTy() && "Unexpected Phi type"); + + if (TheLoop->getHeader() != Phi->getParent()) + return false; + + // The loop may have multiple entrances or multiple exits; we can analyze + // this phi if it has a unique entry value and a unique backedge value. + if (Phi->getNumIncomingValues() != 2) + return false; + Value *BEValue = nullptr, *StartValue = nullptr; + if (TheLoop->contains(Phi->getIncomingBlock(0))) { + BEValue = Phi->getIncomingValue(0); + StartValue = Phi->getIncomingValue(1); + } else { + assert(TheLoop->contains(Phi->getIncomingBlock(1)) && + "Unexpected Phi node in the loop"); + BEValue = Phi->getIncomingValue(1); + StartValue = Phi->getIncomingValue(0); + } + + BinaryOperator *BOp = dyn_cast(BEValue); + if (!BOp) + return false; + + Value *Addend = nullptr; + if (BOp->getOpcode() == Instruction::FAdd) { + if (BOp->getOperand(0) == Phi) + Addend = BOp->getOperand(1); + else if (BOp->getOperand(1) == Phi) + Addend = BOp->getOperand(0); + } else if (BOp->getOpcode() == Instruction::FSub) + if (BOp->getOperand(0) == Phi) + Addend = BOp->getOperand(1); + + if (!Addend) + return false; + + // The addend should be loop invariant + if (auto *I = dyn_cast(Addend)) + if (TheLoop->contains(I)) + return false; + + // FP Step has unknown SCEV + const SCEV *Step = SE->getUnknown(Addend); + D = InductionDescriptor(StartValue, IK_FpInduction, Step, BOp); + return true; +} + +bool InductionDescriptor::isInductionPHI(PHINode *Phi, const Loop *TheLoop, PredicatedScalarEvolution &PSE, InductionDescriptor &D, bool Assume) { Type *PhiTy = Phi->getType(); - // We only handle integer and pointer inductions variables. - if (!PhiTy->isIntegerTy() && !PhiTy->isPointerTy()) + + // Handle integer and pointer induction variables. + // FP inductions are handled as well, but without trying to build a + // recurrence expression from the PHI node in place. 
+ + if (!PhiTy->isIntegerTy() && !PhiTy->isPointerTy() && + !PhiTy->isFloatTy() && !PhiTy->isDoubleTy() && !PhiTy->isHalfTy()) return false; + if (PhiTy->isFloatingPointTy()) + return isFPInductionPHI(Phi, TheLoop, PSE.getSE(), D); + const SCEV *PhiScev = PSE.getSCEV(Phi); const auto *AR = dyn_cast(PhiScev); @@ -752,10 +846,10 @@ return false; } - return isInductionPHI(Phi, PSE.getSE(), D, AR); + return isInductionPHI(Phi, TheLoop, PSE.getSE(), D, AR); } -bool InductionDescriptor::isInductionPHI(PHINode *Phi, +bool InductionDescriptor::isInductionPHI(PHINode *Phi, const Loop *TheLoop, ScalarEvolution *SE, InductionDescriptor &D, const SCEV *Expr) { @@ -773,7 +867,7 @@ return false; } - assert(AR->getLoop()->getHeader() == Phi->getParent() && + assert(TheLoop->getHeader() == Phi->getParent() && "PHI is an AddRec for a different loop?!"); Value *StartValue = Phi->getIncomingValueForBlock(AR->getLoop()->getLoopPreheader()); @@ -781,7 +875,7 @@ // Calculate the pointer stride and check if it is consecutive. // The stride may be a constant or a loop invariant integer value. const SCEVConstant *ConstStep = dyn_cast(Step); - if (!ConstStep && !SE->isLoopInvariant(Step, AR->getLoop())) + if (!ConstStep && !SE->isLoopInvariant(Step, TheLoop)) return false; if (PhiTy->isIntegerTy()) { Index: llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp =================================================================== --- llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp +++ llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -402,7 +402,10 @@ /// This function adds (StartIdx, StartIdx + Step, StartIdx + 2*Step, ...) /// to each vector element of Val. The sequence starts at StartIndex. - virtual Value *getStepVector(Value *Val, int StartIdx, Value *Step); + /// \p Opcode is relevant for FP induction variable. 
+ virtual Value *getStepVector(Value *Val, int StartIdx, Value *Step, + Instruction::BinaryOps Opcode = + Instruction::BinaryOpsEnd); /// Compute scalar induction steps. \p ScalarIV is the scalar induction /// variable on which to base the steps, \p Step is the size of the step, and @@ -625,7 +628,9 @@ bool IfPredicateStore = false) override; void vectorizeMemoryInstruction(Instruction *Instr) override; Value *getBroadcastInstrs(Value *V) override; - Value *getStepVector(Value *Val, int StartIdx, Value *Step) override; + Value *getStepVector(Value *Val, int StartIdx, Value *Step, + Instruction::BinaryOps Opcode = + Instruction::BinaryOpsEnd) override; Value *reverseVector(Value *Vec) override; }; @@ -2000,32 +2005,60 @@ } } -Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, - Value *Step) { +Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step, + Instruction::BinaryOps BinOp) { + // Create and check the types. assert(Val->getType()->isVectorTy() && "Must be a vector"); - assert(Val->getType()->getScalarType()->isIntegerTy() && - "Elem must be an integer"); - assert(Step->getType() == Val->getType()->getScalarType() && - "Step has wrong type"); - // Create the types. - Type *ITy = Val->getType()->getScalarType(); - VectorType *Ty = cast(Val->getType()); - int VLen = Ty->getNumElements(); + int VLen = Val->getType()->getVectorNumElements(); + + Type *STy = Val->getType()->getScalarType(); + assert((STy->isIntegerTy() || STy->isFloatingPointTy()) && + "Induction Step must be an integer or FP"); + assert(Step->getType() == STy && "Step has wrong type"); + SmallVector Indices; + if (STy->isIntegerTy()) { + // Create a vector of consecutive numbers from zero to VF. + for (int i = 0; i < VLen; ++i) + Indices.push_back(ConstantInt::get(STy, StartIdx + i)); + + // Add the consecutive indices to the vector value. 
+ Constant *Cv = ConstantVector::get(Indices); + assert(Cv->getType() == Val->getType() && "Invalid consecutive vec"); + Step = Builder.CreateVectorSplat(VLen, Step); + assert(Step->getType() == Val->getType() && "Invalid step vec"); + // FIXME: The newly created binary instructions should contain nsw/nuw flags, + // which can be found from the original scalar operations. + Step = Builder.CreateMul(Cv, Step); + return Builder.CreateAdd(Val, Step, "induction"); + } + + // Floating point induction. + assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) && + "Binary Opcode should be specified for FP induction"); // Create a vector of consecutive numbers from zero to VF. for (int i = 0; i < VLen; ++i) - Indices.push_back(ConstantInt::get(ITy, StartIdx + i)); + Indices.push_back(ConstantFP::get(STy, (double)(StartIdx + i))); // Add the consecutive indices to the vector value. Constant *Cv = ConstantVector::get(Indices); - assert(Cv->getType() == Val->getType() && "Invalid consecutive vec"); + Step = Builder.CreateVectorSplat(VLen, Step); - assert(Step->getType() == Val->getType() && "Invalid step vec"); - // FIXME: The newly created binary instructions should contain nsw/nuw flags, - // which can be found from the original scalar operations. - Step = Builder.CreateMul(Cv, Step); - return Builder.CreateAdd(Val, Step, "induction"); + + // Floating point operations had to be 'fast' to enable the induction. 
+ FastMathFlags Flags; + Flags.setUnsafeAlgebra(); + + Value *MulOp = Builder.CreateFMul(Cv, Step); + if (isa(MulOp)) + // Have to check, MulOp may be a constant + cast(MulOp)->setFastMathFlags(Flags); + + Value *BOp = Builder.CreateBinOp(BinOp, Val, MulOp, "induction"); + if (isa(BOp)) + cast(BOp)->setFastMathFlags(Flags); + return BOp; } void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step, @@ -3099,8 +3132,10 @@ EndValue = CountRoundDown; } else { IRBuilder<> B(LoopBypassBlocks.back()->getTerminator()); - Value *CRD = B.CreateSExtOrTrunc(CountRoundDown, - II.getStep()->getType(), "cast.crd"); + Type *StepType = II.getStep()->getType(); + Instruction::CastOps CastOp = + CastInst::getCastOpcode(CountRoundDown, true, StepType, true); + Value *CRD = B.CreateCast(CastOp, CountRoundDown, StepType, "cast.crd"); const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout(); EndValue = II.transform(B, CRD, PSE.getSE(), DL); EndValue->setName("ind.end"); @@ -4047,7 +4082,7 @@ llvm_unreachable("Unknown induction"); case InductionDescriptor::IK_IntInduction: return widenIntInduction(P, Entry); - case InductionDescriptor::IK_PtrInduction: + case InductionDescriptor::IK_PtrInduction: { // Handle the pointer induction variable case. assert(P->getType()->isPointerTy() && "Unexpected type."); // This is the normalized GEP that starts counting at zero. @@ -4080,6 +4115,29 @@ } return; } + case InductionDescriptor::IK_FpInduction: { + assert(P->getType() == II.getStartValue()->getType() && + "Types must match"); + // Handle other induction variables that are now based on the + // canonical one. 
+ assert(P != OldInduction && "Primary induction can be integer only"); + + Value *V = Builder.CreateCast(Instruction::SIToFP, Induction, P->getType()); + V = II.transform(Builder, V, PSE.getSE(), DL); + V->setName("fp.offset.idx"); + + // Now we have scalar op: %fp.offset.idx = StartVal +/- Induction*StepVal + + Value *Broadcasted = getBroadcastInstrs(V); + // After broadcasting the induction variable we need to make the vector + // consecutive by adding StepVal*0, StepVal*1, StepVal*2, etc. + Value *StepVal = cast(II.getStep())->getValue(); + for (unsigned part = 0; part < UF; ++part) + Entry[part] = getStepVector(Broadcasted, VF * part, StepVal, + II.getInductionOpcode()); + return; + } + } } void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) { @@ -4565,10 +4623,12 @@ const DataLayout &DL = Phi->getModule()->getDataLayout(); // Get the widest type. - if (!WidestIndTy) - WidestIndTy = convertPointerToIntegerType(DL, PhiTy); - else - WidestIndTy = getWiderType(DL, PhiTy, WidestIndTy); + if (!PhiTy->isFloatingPointTy()) { + if (!WidestIndTy) + WidestIndTy = convertPointerToIntegerType(DL, PhiTy); + else + WidestIndTy = getWiderType(DL, PhiTy, WidestIndTy); + } // Int inductions are special because we only allow one IV. if (ID.getKind() == InductionDescriptor::IK_IntInduction && @@ -4649,8 +4709,10 @@ } InductionDescriptor ID; - if (InductionDescriptor::isInductionPHI(Phi, PSE, ID)) { + if (InductionDescriptor::isInductionPHI(Phi, TheLoop, PSE, ID)) { addInductionPhi(Phi, ID, AllowedExit); + if (ID.hasUnsafeAlgebra() && !HasFunNoNaNAttr) + Requirements->addUnsafeAlgebraInst(ID.getUnsafeAlgebraInst()); continue; } @@ -4661,7 +4723,7 @@ // As a last resort, coerce the PHI to a AddRec expression // and re-try classifying it a an induction PHI. 
- if (InductionDescriptor::isInductionPHI(Phi, PSE, ID, true)) { + if (InductionDescriptor::isInductionPHI(Phi, TheLoop, PSE, ID, true)) { addInductionPhi(Phi, ID, AllowedExit); continue; } @@ -6348,11 +6410,20 @@ Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; } -Value *InnerLoopUnroller::getStepVector(Value *Val, int StartIdx, Value *Step) { +Value *InnerLoopUnroller::getStepVector(Value *Val, int StartIdx, Value *Step, + Instruction::BinaryOps BinOp) { // When unrolling and the VF is 1, we only need to add a simple scalar. - Type *ITy = Val->getType(); - assert(!ITy->isVectorTy() && "Val must be a scalar"); - Constant *C = ConstantInt::get(ITy, StartIdx); + Type *Ty = Val->getType(); + assert(!Ty->isVectorTy() && "Val must be a scalar"); + + if (Ty->isFloatingPointTy()) { + Constant *C = ConstantFP::get(Ty, (double)StartIdx); + + // Floating point operations had to be 'fast' to enable the unrolling. + Value *MulOp = addFastMathFlag(Builder.CreateFMul(C, Step)); + return addFastMathFlag(Builder.CreateBinOp(BinOp, Val, MulOp)); + } + Constant *C = ConstantInt::get(Ty, StartIdx); return Builder.CreateAdd(Val, Builder.CreateMul(C, Step), "induction"); } Index: llvm/trunk/test/Transforms/LoopVectorize/X86/float-induction-x86.ll =================================================================== --- llvm/trunk/test/Transforms/LoopVectorize/X86/float-induction-x86.ll +++ llvm/trunk/test/Transforms/LoopVectorize/X86/float-induction-x86.ll @@ -0,0 +1,86 @@ +; RUN: opt < %s -O3 -mcpu=core-avx2 -mtriple=x86_64-unknown-linux-gnu -S | FileCheck --check-prefix AUTO_VEC %s + +; This test checks auto-vectorization with FP induction variable. +; The FP operation is not "fast" and requires "fast-math" function attribute. 
+ +;void fp_iv_loop1(float * __restrict__ A, int N) { +; float x = 1.0; +; for (int i=0; i < N; ++i) { +; A[i] = x; +; x += 0.5; +; } +;} + + +; AUTO_VEC-LABEL: @fp_iv_loop1( +; AUTO_VEC: vector.body +; AUTO_VEC: store <8 x float> + +define void @fp_iv_loop1(float* noalias nocapture %A, i32 %N) #0 { +entry: + %cmp4 = icmp sgt i32 %N, 0 + br i1 %cmp4, label %for.body.preheader, label %for.end + +for.body.preheader: ; preds = %entry + br label %for.body + +for.body: ; preds = %for.body.preheader, %for.body + %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ] + %x.06 = phi float [ %conv1, %for.body ], [ 1.000000e+00, %for.body.preheader ] + %arrayidx = getelementptr inbounds float, float* %A, i64 %indvars.iv + store float %x.06, float* %arrayidx, align 4 + %conv1 = fadd float %x.06, 5.000000e-01 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, %N + br i1 %exitcond, label %for.end.loopexit, label %for.body + +for.end.loopexit: ; preds = %for.body + br label %for.end + +for.end: ; preds = %for.end.loopexit, %entry + ret void +} + +; The same as the previous, FP operation is not fast, different function attribute +; Vectorization should be rejected. 
+;void fp_iv_loop2(float * __restrict__ A, int N) { +; float x = 1.0; +; for (int i=0; i < N; ++i) { +; A[i] = x; +; x += 0.5; +; } +;} + +; AUTO_VEC-LABEL: @fp_iv_loop2( +; AUTO_VEC-NOT: vector.body +; AUTO_VEC-NOT: store <{{.*}} x float> + +define void @fp_iv_loop2(float* noalias nocapture %A, i32 %N) #1 { +entry: + %cmp4 = icmp sgt i32 %N, 0 + br i1 %cmp4, label %for.body.preheader, label %for.end + +for.body.preheader: ; preds = %entry + br label %for.body + +for.body: ; preds = %for.body.preheader, %for.body + %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ] + %x.06 = phi float [ %conv1, %for.body ], [ 1.000000e+00, %for.body.preheader ] + %arrayidx = getelementptr inbounds float, float* %A, i64 %indvars.iv + store float %x.06, float* %arrayidx, align 4 + %conv1 = fadd float %x.06, 5.000000e-01 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, %N + br i1 %exitcond, label %for.end.loopexit, label %for.body + +for.end.loopexit: ; preds = %for.body + br label %for.end + +for.end: ; preds = %for.end.loopexit, %entry + ret void +} + +attributes #0 = { "no-nans-fp-math"="true" } +attributes #1 = { "no-nans-fp-math"="false" } Index: llvm/trunk/test/Transforms/LoopVectorize/float-induction.ll =================================================================== --- llvm/trunk/test/Transforms/LoopVectorize/float-induction.ll +++ llvm/trunk/test/Transforms/LoopVectorize/float-induction.ll @@ -0,0 +1,218 @@ +; RUN: opt < %s -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -dce -instcombine -S | FileCheck --check-prefix VEC4_INTERL1 %s +; RUN: opt < %s -loop-vectorize -force-vector-interleave=2 -force-vector-width=4 -dce -instcombine -S | FileCheck --check-prefix VEC4_INTERL2 %s +; RUN: opt < %s -loop-vectorize -force-vector-interleave=2 -force-vector-width=1 -dce -instcombine -S | FileCheck --check-prefix VEC1_INTERL2 %s + +; 
VEC4_INTERL1-LABEL: @fp_iv_loop1( +; VEC4_INTERL1: %[[FP_INC:.*]] = load float, float* @fp_inc +; VEC4_INTERL1: vector.body: +; VEC4_INTERL1: %[[FP_INDEX:.*]] = sitofp i64 {{.*}} to float +; VEC4_INTERL1: %[[VEC_INCR:.*]] = fmul fast float {{.*}}, %[[FP_INDEX]] +; VEC4_INTERL1: %[[FP_OFFSET_IDX:.*]] = fsub fast float %init, %[[VEC_INCR]] +; VEC4_INTERL1: %[[BRCT_INSERT:.*]] = insertelement <4 x float> undef, float %[[FP_OFFSET_IDX]], i32 0 +; VEC4_INTERL1-NEXT: %[[BRCT_SPLAT:.*]] = shufflevector <4 x float> %[[BRCT_INSERT]], <4 x float> undef, <4 x i32> zeroinitializer +; VEC4_INTERL1: %[[BRCT_INSERT:.*]] = insertelement {{.*}} %[[FP_INC]] +; VEC4_INTERL1-NEXT: %[[FP_INC_BCST:.*]] = shufflevector <4 x float> %[[BRCT_INSERT]], {{.*}} zeroinitializer +; VEC4_INTERL1: %[[VSTEP:.*]] = fmul fast <4 x float> %[[FP_INC_BCST]], +; VEC4_INTERL1-NEXT: %[[VEC_INDUCTION:.*]] = fsub fast <4 x float> %[[BRCT_SPLAT]], %[[VSTEP]] +; VEC4_INTERL1: store <4 x float> %[[VEC_INDUCTION]] + +; VEC4_INTERL2-LABEL: @fp_iv_loop1( +; VEC4_INTERL2: %[[FP_INC:.*]] = load float, float* @fp_inc +; VEC4_INTERL2: vector.body: +; VEC4_INTERL2: %[[INDEX:.*]] = sitofp i64 {{.*}} to float +; VEC4_INTERL2: %[[VEC_INCR:.*]] = fmul fast float %{{.*}}, %[[INDEX]] +; VEC4_INTERL2: fsub fast float %init, %[[VEC_INCR]] +; VEC4_INTERL2: %[[VSTEP1:.*]] = fmul fast <4 x float> %{{.*}}, +; VEC4_INTERL2-NEXT: %[[VEC_INDUCTION1:.*]] = fsub fast <4 x float> {{.*}}, %[[VSTEP1]] +; VEC4_INTERL2: %[[VSTEP2:.*]] = fmul fast <4 x float> %{{.*}}, +; VEC4_INTERL2-NEXT: %[[VEC_INDUCTION2:.*]] = fsub fast <4 x float> {{.*}}, %[[VSTEP2]] +; VEC4_INTERL2: store <4 x float> %[[VEC_INDUCTION1]] +; VEC4_INTERL2: store <4 x float> %[[VEC_INDUCTION2]] + +; VEC1_INTERL2-LABEL: @fp_iv_loop1( +; VEC1_INTERL2: %[[FP_INC:.*]] = load float, float* @fp_inc +; VEC1_INTERL2: vector.body: +; VEC1_INTERL2: %[[INDEX:.*]] = sitofp i64 {{.*}} to float +; VEC1_INTERL2: %[[STEP:.*]] = fmul fast float %{{.*}}, %[[INDEX]] +; VEC1_INTERL2: 
%[[FP_OFFSET_IDX:.*]] = fsub fast float %init, %[[STEP]] +; VEC1_INTERL2: %[[SCALAR_INDUCTION2:.*]] = fsub fast float %[[FP_OFFSET_IDX]], %[[FP_INC]] +; VEC1_INTERL2: store float %[[FP_OFFSET_IDX]] +; VEC1_INTERL2: store float %[[SCALAR_INDUCTION2]] + +@fp_inc = common global float 0.000000e+00, align 4 + +;void fp_iv_loop1(float init, float * __restrict__ A, int N) { +; float x = init; +; for (int i=0; i < N; ++i) { +; A[i] = x; +; x -= fp_inc; +; } +;} + +define void @fp_iv_loop1(float %init, float* noalias nocapture %A, i32 %N) #1 { +entry: + %cmp4 = icmp sgt i32 %N, 0 + br i1 %cmp4, label %for.body.lr.ph, label %for.end + +for.body.lr.ph: ; preds = %entry + %fpinc = load float, float* @fp_inc, align 4 + br label %for.body + +for.body: ; preds = %for.body, %for.body.lr.ph + %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.body ] + %x.05 = phi float [ %init, %for.body.lr.ph ], [ %add, %for.body ] + %arrayidx = getelementptr inbounds float, float* %A, i64 %indvars.iv + store float %x.05, float* %arrayidx, align 4 + %add = fsub fast float %x.05, %fpinc + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, %N + br i1 %exitcond, label %for.end.loopexit, label %for.body + +for.end.loopexit: ; preds = %for.body + br label %for.end + +for.end: ; preds = %for.end.loopexit, %entry + ret void +} + +;void fp_iv_loop2(float init, float * __restrict__ A, int N) { +; float x = init; +; for (int i=0; i < N; ++i) { +; A[i] = x; +; x += 0.5; +; } +;} + +; VEC4_INTERL1-LABEL: @fp_iv_loop2( +; VEC4_INTERL1: vector.body +; VEC4_INTERL1: %[[index:.*]] = phi i64 [ 0, %vector.ph ] +; VEC4_INTERL1: sitofp i64 %[[index]] to float +; VEC4_INTERL1: %[[VAR1:.*]] = fmul fast float {{.*}}, 5.000000e-01 +; VEC4_INTERL1: %[[VAR2:.*]] = fadd fast float %[[VAR1]] +; VEC4_INTERL1: insertelement <4 x float> undef, float %[[VAR2]], i32 0 +; VEC4_INTERL1: shufflevector <4 x float> {{.*}}, <4 
x float> undef, <4 x i32> zeroinitializer +; VEC4_INTERL1: fadd fast <4 x float> {{.*}}, +; VEC4_INTERL1: store <4 x float> + +define void @fp_iv_loop2(float %init, float* noalias nocapture %A, i32 %N) #0 { +entry: + %cmp4 = icmp sgt i32 %N, 0 + br i1 %cmp4, label %for.body.preheader, label %for.end + +for.body.preheader: ; preds = %entry + br label %for.body + +for.body: ; preds = %for.body.preheader, %for.body + %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ] + %x.06 = phi float [ %conv1, %for.body ], [ %init, %for.body.preheader ] + %arrayidx = getelementptr inbounds float, float* %A, i64 %indvars.iv + store float %x.06, float* %arrayidx, align 4 + %conv1 = fadd fast float %x.06, 5.000000e-01 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, %N + br i1 %exitcond, label %for.end.loopexit, label %for.body + +for.end.loopexit: ; preds = %for.body + br label %for.end + +for.end: ; preds = %for.end.loopexit, %entry + ret void +} + +;void fp_iv_loop3(float init, float * __restrict__ A, float * __restrict__ B, float * __restrict__ C, int N) { +; int i = 0; +; float x = init; +; float y = 0.1; +; for (; i < N; ++i) { +; A[i] = x; +; x += fp_inc; +; y -= 0.5; +; B[i] = x + y; +; C[i] = y; +; } +;} +; VEC4_INTERL1-LABEL: @fp_iv_loop3( +; VEC4_INTERL1: vector.body +; VEC4_INTERL1: %[[index:.*]] = phi i64 [ 0, %vector.ph ] +; VEC4_INTERL1: sitofp i64 %[[index]] to float +; VEC4_INTERL1: %[[VAR1:.*]] = fmul fast float {{.*}}, -5.000000e-01 +; VEC4_INTERL1: fadd fast float %[[VAR1]] +; VEC4_INTERL1: fadd fast <4 x float> {{.*}}, +; VEC4_INTERL1: store <4 x float> + +define void @fp_iv_loop3(float %init, float* noalias nocapture %A, float* noalias nocapture %B, float* noalias nocapture %C, i32 %N) #1 { +entry: + %cmp9 = icmp sgt i32 %N, 0 + br i1 %cmp9, label %for.body.lr.ph, label %for.end + +for.body.lr.ph: ; preds = %entry + %0 = load float, float* 
@fp_inc, align 4 + br label %for.body + +for.body: ; preds = %for.body, %for.body.lr.ph + %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.body ] + %y.012 = phi float [ 0x3FB99999A0000000, %for.body.lr.ph ], [ %conv1, %for.body ] + %x.011 = phi float [ %init, %for.body.lr.ph ], [ %add, %for.body ] + %arrayidx = getelementptr inbounds float, float* %A, i64 %indvars.iv + store float %x.011, float* %arrayidx, align 4 + %add = fadd fast float %x.011, %0 + %conv1 = fadd fast float %y.012, -5.000000e-01 + %add2 = fadd fast float %conv1, %add + %arrayidx4 = getelementptr inbounds float, float* %B, i64 %indvars.iv + store float %add2, float* %arrayidx4, align 4 + %arrayidx6 = getelementptr inbounds float, float* %C, i64 %indvars.iv + store float %conv1, float* %arrayidx6, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, %N + br i1 %exitcond, label %for.end.loopexit, label %for.body + +for.end.loopexit: + br label %for.end + +for.end: + ret void +} + +; Start and step values are constants. 
There is no 'fmul' operation in this case +;void fp_iv_loop4(float * __restrict__ A, int N) { +; float x = 1.0; +; for (int i=0; i < N; ++i) { +; A[i] = x; +; x += 0.5; +; } +;} + +; VEC4_INTERL1-LABEL: @fp_iv_loop4( +; VEC4_INTERL1: vector.body +; VEC4_INTERL1-NOT: fmul fast <4 x float> +; VEC4_INTERL1: %[[induction:.*]] = fadd fast <4 x float> %{{.*}}, +; VEC4_INTERL1: store <4 x float> %[[induction]] + +define void @fp_iv_loop4(float* noalias nocapture %A, i32 %N) { +entry: + %cmp4 = icmp sgt i32 %N, 0 + br i1 %cmp4, label %for.body.preheader, label %for.end + +for.body.preheader: ; preds = %entry + br label %for.body + +for.body: ; preds = %for.body.preheader, %for.body + %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ] + %x.06 = phi float [ %conv1, %for.body ], [ 1.000000e+00, %for.body.preheader ] + %arrayidx = getelementptr inbounds float, float* %A, i64 %indvars.iv + store float %x.06, float* %arrayidx, align 4 + %conv1 = fadd fast float %x.06, 5.000000e-01 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, %N + br i1 %exitcond, label %for.end.loopexit, label %for.body + +for.end.loopexit: ; preds = %for.body + br label %for.end + +for.end: ; preds = %for.end.loopexit, %entry + ret void +}