Index: include/llvm/Analysis/ScalarEvolution.h =================================================================== --- include/llvm/Analysis/ScalarEvolution.h +++ include/llvm/Analysis/ScalarEvolution.h @@ -1623,6 +1623,11 @@ return F.getParent()->getDataLayout(); } + /// Return Loop pointer for basic block + const Loop* getLoopFor(const BasicBlock *BB) const { + return LI.getLoopFor(BB); + } + const SCEVPredicate *getEqualPredicate(const SCEVUnknown *LHS, const SCEVConstant *RHS); Index: include/llvm/Transforms/Utils/LoopUtils.h =================================================================== --- include/llvm/Transforms/Utils/LoopUtils.h +++ include/llvm/Transforms/Utils/LoopUtils.h @@ -263,7 +263,8 @@ enum InductionKind { IK_NoInduction, ///< Not an induction variable. IK_IntInduction, ///< Integer induction variable. Step = C. - IK_PtrInduction ///< Pointer induction var. Step = C / sizeof(elem). + IK_PtrInduction, ///< Pointer induction var. Step = C / sizeof(elem). + IK_FpInduction ///< Floating point induction variable. }; public: @@ -300,6 +301,12 @@ InductionDescriptor &D, const SCEV *Expr = nullptr); + /// Returns true if \p Phi is a floating point induction. + /// If \p Phi is an induction, the induction descriptor \p D will contain + /// the data describing this induction. + static bool isFpInductionPHI(PHINode *Phi, ScalarEvolution *SE, + InductionDescriptor &D); + /// Returns true if \p Phi is an induction, in the context associated with /// the run-time predicate of PSE. If \p Assume is true, this can add further /// SCEV predicates to \p PSE in order to prove that \p Phi is an induction. Index: lib/Analysis/ScalarEvolutionExpander.cpp =================================================================== --- lib/Analysis/ScalarEvolutionExpander.cpp +++ lib/Analysis/ScalarEvolutionExpander.cpp @@ -1607,7 +1607,7 @@ Value *SCEVExpander::expandCodeFor(const SCEV *SH, Type *Ty) { // Expand the code for this SCEV. Value *V = expand(SH); - if (Ty) { + if (Ty && SE.isSCEVable(Ty)) { assert(SE.getTypeSizeInBits(Ty) == SE.getTypeSizeInBits(SH->getType()) && "non-trivial casts should be done with the SCEVs directly!"); V = InsertNoopCastOfTo(V, Ty); Index: lib/Transforms/Utils/LoopUtils.cpp =================================================================== --- lib/Transforms/Utils/LoopUtils.cpp +++ lib/Transforms/Utils/LoopUtils.cpp @@ -672,7 +672,11 @@ assert((IK != IK_PtrInduction || getConstIntStepValue()) && "Step value should be constant for pointer induction"); - assert(Step->getType()->isIntegerTy() && "StepValue is not an integer"); + assert(((IK != IK_PtrInduction && IK != IK_IntInduction) || + Step->getType()->isIntegerTy()) && "StepValue is not an integer"); + + assert((IK != IK_FpInduction || Step->getType()->isFloatingPointTy()) && + "StepValue is not FP for FpInduction"); } int InductionDescriptor::getConsecutiveDirection() const { @@ -725,21 +729,107 @@ Index = Exp.expandCodeFor(S, Index->getType(), &*B.GetInsertPoint()); return B.CreateGEP(nullptr, StartValue, Index); } + case IK_FpInduction: { + assert(Index->getType() == Step->getType() && + "Index type does not match StepValue type"); + assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value"); + + Value *StepValue = cast(Step)->getValue(); + if (BinaryOperator::isFNeg(StepValue)) { + Value *MulExp = + B.CreateFMul(cast(StepValue)->getOperand(1), Index); + return B.CreateFSub(StartValue, MulExp); + } + Value *MulExp = B.CreateFMul(StepValue, Index); + return B.CreateFAdd(StartValue, MulExp); + } case IK_NoInduction: return nullptr; } llvm_unreachable("invalid enum"); } +bool InductionDescriptor::isFpInductionPHI(PHINode *Phi, ScalarEvolution *SE, + InductionDescriptor &D) { + + // Here we only handle FP induction variables. + assert(Phi->getType()->isFloatingPointTy() && "Unexpected Phi type"); + + const Loop *L = SE->getLoopFor(Phi->getParent()); + if (!L || L->getHeader() != Phi->getParent()) + return nullptr; + + // The loop may have multiple entrances or multiple exits; we can analyze + // this phi if it has a unique entry value and a unique backedge value. + if (Phi->getNumIncomingValues() != 2) + return false; + Value *BEValueV = nullptr, *StartValueV = nullptr; + for (BasicBlock *Bb : Phi->blocks()) { + Value *V = Phi->getIncomingValueForBlock(Bb); + if (L->contains(Bb)) { + if (!BEValueV) + BEValueV = V; + else if (BEValueV != V) + return false; + } else if (!StartValueV) + StartValueV = V; + else if (StartValueV != V) + return false; + } + + if (auto *BOp = dyn_cast(BEValueV)) { + if (BOp->getOpcode() != Instruction::FAdd && + BOp->getOpcode() != Instruction::FSub) + return false; + bool PhiUse = false; + Value *Addend = nullptr; + for (Value *Op : BOp->operands()) { + if (Op == Phi) + PhiUse = true; + else + Addend = Op; + } + if (!PhiUse) + return false; + + // The addend should be loop invariant + if (auto *I = dyn_cast(Addend)) { + if (L->contains(I)) + return false; + if (BOp->getOpcode() == Instruction::FSub) { + Addend = BinaryOperator::CreateFNeg(I); + cast(Addend)->insertAfter(I); + } + } else if (auto *C = dyn_cast(Addend)) { + if (BOp->getOpcode() == Instruction::FSub) + Addend = ConstantExpr::getFNeg(C); + } else + return false; + + const SCEV *Step = SE->getUnknown(Addend); + D = InductionDescriptor(StartValueV, IK_FpInduction, Step); + return true; + } + return false; +} + bool InductionDescriptor::isInductionPHI(PHINode *Phi, PredicatedScalarEvolution &PSE, InductionDescriptor &D, bool Assume) { Type *PhiTy = Phi->getType(); - // We only handle integer and pointer inductions variables. - if (!PhiTy->isIntegerTy() && !PhiTy->isPointerTy()) + + // Handle integer and pointer inductions variables. + // Now we handle also FP induction but not trying to make a + // recurrent expression from the PHI node in-place. + + if (!PhiTy->isIntegerTy() && !PhiTy->isPointerTy() && + !PhiTy->isFloatTy() && !PhiTy->isDoubleTy() && !PhiTy->isHalfTy()) return false; + if (PhiTy->isFloatingPointTy()) + return isFpInductionPHI(Phi, PSE.getSE(), D); + const SCEV *PhiScev = PSE.getSCEV(Phi); const auto *AR = dyn_cast(PhiScev); Index: lib/Transforms/Vectorize/LoopVectorize.cpp =================================================================== --- lib/Transforms/Vectorize/LoopVectorize.cpp +++ lib/Transforms/Vectorize/LoopVectorize.cpp @@ -2145,29 +2145,49 @@ Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step) { assert(Val->getType()->isVectorTy() && "Must be a vector"); - assert(Val->getType()->getScalarType()->isIntegerTy() && - "Elem must be an integer"); + assert((Val->getType()->getScalarType()->isIntegerTy() || + Val->getType()->getScalarType()->isFloatingPointTy()) && + "Induction Step must be an integer or FP"); assert(Step->getType() == Val->getType()->getScalarType() && "Step has wrong type"); // Create the types. Type *ITy = Val->getType()->getScalarType(); - VectorType *Ty = cast(Val->getType()); - int VLen = Ty->getNumElements(); + int VLen = Val->getType()->getVectorNumElements(); SmallVector Indices; + if (Step->getType()->isIntegerTy()) { + // Create a vector of consecutive numbers from zero to VF. + for (int i = 0; i < VLen; ++i) + Indices.push_back(ConstantInt::get(ITy, StartIdx + i)); + + Constant *Cv = ConstantVector::get(Indices); + Step = Builder.CreateVectorSplat(VLen, Step); + + // FIXME: The newly created binary instructions should contain nsw/nuw flags, + // which can be found from the original scalar operations. + Step = Builder.CreateMul(Cv, Step); + return Builder.CreateAdd(Val, Step, "induction"); + } + + // Create a vector of consecutive numbers from zero to VF. for (int i = 0; i < VLen; ++i) - Indices.push_back(ConstantInt::get(ITy, StartIdx + i)); + Indices.push_back(ConstantFP::get(ITy, (double)(StartIdx + i))); + + bool FNegativeStep = false; + if (BinaryOperator::isFNeg(Step)) { + Step = cast(Step)->getOperand(1); + FNegativeStep = true; + } // Add the consecutive indices to the vector value. Constant *Cv = ConstantVector::get(Indices); - assert(Cv->getType() == Val->getType() && "Invalid consecutive vec"); Step = Builder.CreateVectorSplat(VLen, Step); - assert(Step->getType() == Val->getType() && "Invalid step vec"); - // FIXME: The newly created binary instructions should contain nsw/nuw flags, - // which can be found from the original scalar operations. - Step = Builder.CreateMul(Cv, Step); - return Builder.CreateAdd(Val, Step, "induction"); + + Step = Builder.CreateFMul(Cv, Step); + if (FNegativeStep) + return Builder.CreateFSub(Val, Step, "induction"); + return Builder.CreateFAdd(Val, Step, "induction"); } int LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) { @@ -3208,8 +3228,13 @@ EndValue = CountRoundDown; } else { IRBuilder<> B(LoopBypassBlocks.back()->getTerminator()); - Value *CRD = B.CreateSExtOrTrunc(CountRoundDown, - II.getStep()->getType(), "cast.crd"); + Value *CRD; + if (II.getStep()->getType()->isIntegerTy()) + CRD = B.CreateSExtOrTrunc(CountRoundDown, II.getStep()->getType(), + "cast.crd"); + else + CRD = B.CreateCast(Instruction::SIToFP, CountRoundDown, + II.getStep()->getType(), "cast.crd"); const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout(); EndValue = II.transform(B, CRD, PSE.getSE(), DL); EndValue->setName("ind.end"); @@ -4121,6 +4146,24 @@ } return; } + case InductionDescriptor::IK_FpInduction: { + assert(P->getType() == II.getStartValue()->getType() && + "Types must match"); + // Handle other induction variables that are now based on the + // canonical one. + assert(P != OldInduction && "Main induction can be integer only"); + + Value *V = Builder.CreateCast(Instruction::SIToFP, Induction, P->getType()); + V = II.transform(Builder, V, PSE.getSE(), DL); + V->setName("fp.offset.idx"); + + Value *Broadcasted = getBroadcastInstrs(V); + // After broadcasting the induction variable we need to make the vector + // consecutive by adding 0, 1, 2, etc. + for (unsigned part = 0; part < UF; ++part) + Entry[part] = getStepVector(Broadcasted, VF * part, II.getStep()); + return; + } case InductionDescriptor::IK_PtrInduction: // Handle the pointer induction variable case. assert(P->getType()->isPointerTy() && "Unexpected type."); @@ -4662,10 +4705,12 @@ const DataLayout &DL = Phi->getModule()->getDataLayout(); // Get the widest type. - if (!WidestIndTy) - WidestIndTy = convertPointerToIntegerType(DL, PhiTy); - else - WidestIndTy = getWiderType(DL, PhiTy, WidestIndTy); + if (!PhiTy->isFloatingPointTy()) { + if (!WidestIndTy) + WidestIndTy = convertPointerToIntegerType(DL, PhiTy); + else + WidestIndTy = getWiderType(DL, PhiTy, WidestIndTy); + } // Int inductions are special because we only allow one IV. if (ID.getKind() == InductionDescriptor::IK_IntInduction && @@ -6410,6 +6455,15 @@ // When unrolling and the VF is 1, we only need to add a simple scalar. Type *ITy = Val->getType(); assert(!ITy->isVectorTy() && "Val must be a scalar"); + + if (ITy->isFloatingPointTy()) { + Constant *C = ConstantFP::get(ITy, (double)StartIdx); + if (BinaryOperator::isFNeg(Step)) { + Step = dyn_cast(Step)->getOperand(1); + return Builder.CreateFSub(Val, Builder.CreateFMul(C, Step), "induction"); + } + return Builder.CreateFAdd(Val, Builder.CreateFMul(C, Step), "induction"); + } Constant *C = ConstantInt::get(ITy, StartIdx); return Builder.CreateAdd(Val, Builder.CreateMul(C, Step), "induction"); } Index: test/Transforms/LoopVectorize/float-induction.ll =================================================================== --- test/Transforms/LoopVectorize/float-induction.ll +++ test/Transforms/LoopVectorize/float-induction.ll @@ -0,0 +1,150 @@ +; RUN: opt < %s -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -dce -instcombine -S | FileCheck %s + + +; CHECK-LABEL: @fp_iv_loop1( +; CHECK: %[[FP_INC:.*]] = load float, float* @fp_inc, align 4 +; CHECK: vector.body: +; CHECK: %[[VAR1:.*]] = insertelement <4 x float> undef, float %[[FP_INC]], i32 0 +; CHECK: %[[VAR2:.*]] = shufflevector <4 x float> %[[VAR1]], <4 x float> undef, <4 x i32> zeroinitializer +; CHECK: %[[VAR3:.*]] = fmul <4 x float> %[[VAR2]], +; CHECK: %[[VAR4:.*]] = fsub <4 x float>{{.*}}, %[[VAR3]] +; CHECK: store <4 x float> %[[VAR4]] + +@fp_inc = common global float 0.000000e+00, align 4 + +;void fp_iv_loop1(float init, float * __restrict__ A, int N) { +; float x = init; +; for (int i=0; i < N; ++i) { +; A[i] = x; +; x -= fp_inc; +; } +;} + +define void @fp_iv_loop1(float %init, float* noalias nocapture %A, i32 %N) #0 { +entry: + %cmp4 = icmp sgt i32 %N, 0 + br i1 %cmp4, label %for.body.lr.ph, label %for.end + +for.body.lr.ph: ; preds = %entry + %0 = load float, float* @fp_inc, align 4 + br label %for.body + +for.body: ; preds = %for.body, %for.body.lr.ph + %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.body ] + %x.05 = phi float [ %init, %for.body.lr.ph ], [ %add, %for.body ] + %arrayidx = getelementptr inbounds float, float* %A, i64 %indvars.iv + store float %x.05, float* %arrayidx, align 4 + %add = fsub float %x.05, %0 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, %N + br i1 %exitcond, label %for.end.loopexit, label %for.body + +for.end.loopexit: ; preds = %for.body + br label %for.end + +for.end: ; preds = %for.end.loopexit, %entry + ret void +} + +;void fp_iv_loop2(float init, float * __restrict__ A, int N) { +; float x = init; +; for (int i=0; i < N; ++i) { +; A[i] = x; +; x += 0.5; +; } +;} + +; CHECK-LABEL: @fp_iv_loop2( +; CHECK: vector.body +; CHECK: %[[index:.*]] = phi i64 [ 0, %vector.ph ] +; CHECK: sitofp i64 %[[index]] to float +; CHECK: %[[VAR1:.*]] = fmul float {{.*}}, 5.000000e-01 +; CHECK: %[[VAR2:.*]] = fadd float %[[VAR1]] +; CHECK: insertelement <4 x float> undef, float %[[VAR2]], i32 0 +; CHECK: shufflevector <4 x float> {{.*}}, <4 x float> undef, <4 x i32> zeroinitializer +; CHECK: fadd <4 x float> {{.*}}, +; CHECK: store <4 x float> + +define void @fp_iv_loop2(float %init, float* noalias nocapture %A, i32 %N) #0 { +entry: + %cmp4 = icmp sgt i32 %N, 0 + br i1 %cmp4, label %for.body.preheader, label %for.end + +for.body.preheader: ; preds = %entry + br label %for.body + +for.body: ; preds = %for.body.preheader, %for.body + %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ] + %x.06 = phi float [ %conv1, %for.body ], [ %init, %for.body.preheader ] + %arrayidx = getelementptr inbounds float, float* %A, i64 %indvars.iv + store float %x.06, float* %arrayidx, align 4 + %conv1 = fadd float %x.06, 5.000000e-01 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, %N + br i1 %exitcond, label %for.end.loopexit, label %for.body + +for.end.loopexit: ; preds = %for.body + br label %for.end + +for.end: ; preds = %for.end.loopexit, %entry + ret void +} + +;void fp_iv_loop3(float init, float * __restrict__ A, float * __restrict__ B, float * __restrict__ C, int N) { +; int i = 0; +; float x = init; +; float y = 0.1; +; for (; i < N; ++i) { +; A[i] = x; +; x += fp_inc; +; y -= 0.5; +; B[i] = x + y; +; C[i] = y; +; } +;} +; CHECK-LABEL: @fp_iv_loop3( +; CHECK: vector.body +; CHECK: %[[index:.*]] = phi i64 [ 0, %vector.ph ] +; CHECK: sitofp i64 %[[index]] to float +; CHECK: %[[VAR1:.*]] = fmul float {{.*}}, -5.000000e-01 +; CHECK: %[[VAR2:.*]] = fadd float %[[VAR1]] +; CHECK: insertelement <4 x float> undef, float %[[VAR2]], i32 0 +; CHECK: shufflevector <4 x float> {{.*}}, <4 x float> undef, <4 x i32> zeroinitializer +; CHECK: fadd <4 x float> {{.*}}, +; CHECK: store <4 x float> + +define void @fp_iv_loop3(float %init, float* noalias nocapture %A, float* noalias nocapture %B, float* noalias nocapture %C, i32 %N) #0 { +entry: + %cmp9 = icmp sgt i32 %N, 0 + br i1 %cmp9, label %for.body.lr.ph, label %for.end + +for.body.lr.ph: ; preds = %entry + %0 = load float, float* @fp_inc, align 4 + br label %for.body + +for.body: ; preds = %for.body, %for.body.lr.ph + %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.body ] + %y.012 = phi float [ 0x3FB99999A0000000, %for.body.lr.ph ], [ %conv1, %for.body ] + %x.011 = phi float [ %init, %for.body.lr.ph ], [ %add, %for.body ] + %arrayidx = getelementptr inbounds float, float* %A, i64 %indvars.iv + store float %x.011, float* %arrayidx, align 4 + %add = fadd float %x.011, %0 + %conv1 = fadd float %y.012, -5.000000e-01 + %add2 = fadd float %conv1, %add + %arrayidx4 = getelementptr inbounds float, float* %B, i64 %indvars.iv + store float %add2, float* %arrayidx4, align 4 + %arrayidx6 = getelementptr inbounds float, float* %C, i64 %indvars.iv + store float %conv1, float* %arrayidx6, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, %N + br i1 %exitcond, label %for.end.loopexit, label %for.body + +for.end.loopexit: + br label %for.end + +for.end: + ret void +}