Index: lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- lib/Transforms/Vectorize/LoopVectorize.cpp
+++ lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -384,6 +384,22 @@
 /// we always assume predicated blocks have a 50% chance of executing.
 static unsigned getReciprocalPredBlockProb() { return 2; }
 
+/// A helper function that adds a 'fast' flag to floating point operations.
+static Value *addFastMathFlag(Value *V) {
+  if (isa<FPMathOperator>(V)) {
+    FastMathFlags Flags;
+    Flags.setUnsafeAlgebra();
+    cast<Instruction>(V)->setFastMathFlags(Flags);
+  }
+  return V;
+}
+
+/// A helper function that returns an integer or floating-point constant with
+/// value C.
+static Constant *getIntOrFpConstant(Type *Ty, unsigned C) {
+  return Ty->isIntegerTy() ? ConstantInt::get(Ty, C) : ConstantFP::get(Ty, C);
+}
+
 /// InnerLoopVectorizer vectorizes loops which contain only one basic
 /// block to a specified vectorization factor (VF).
 /// This class performs the widening of scalars into vectors, or multiple
@@ -544,12 +560,13 @@
   /// node, and \p Step is the loop-invariant step. If \p EntryVal is a
   /// truncate instruction, instead of widening the original IV, we widen a
   /// version of the IV truncated to \p EntryVal's type.
-  void createVectorIntInductionPHI(const InductionDescriptor &II, Value *Step,
-                                   Instruction *EntryVal);
+  void createVectorIntOrFpInductionPHI(const InductionDescriptor &II,
+                                       Value *Step, Instruction *EntryVal);
 
-  /// Widen an integer induction variable \p IV. If \p Trunc is provided, the
-  /// induction variable will first be truncated to the corresponding type.
-  void widenIntInduction(PHINode *IV, TruncInst *Trunc = nullptr);
+  /// Widen an integer or floating-point induction variable \p IV. If \p Trunc
+  /// is provided, the integer induction variable will first be truncated to
+  /// the corresponding type.
+  void widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc = nullptr);
 
   /// Returns true if an instruction \p I should be scalarized instead of
   /// vectorized for the chosen vectorization factor.
@@ -2355,30 +2372,46 @@
   return Shuf;
 }
 
-void InnerLoopVectorizer::createVectorIntInductionPHI(
+void InnerLoopVectorizer::createVectorIntOrFpInductionPHI(
     const InductionDescriptor &II, Value *Step, Instruction *EntryVal) {
   Value *Start = II.getStartValue();
-  assert(Step->getType()->isIntegerTy() &&
-         "Cannot widen an IV having a step with a non-integer type");
 
   // Construct the initial value of the vector IV in the vector loop preheader
   auto CurrIP = Builder.saveIP();
   Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
   if (isa<TruncInst>(EntryVal)) {
+    assert(Start->getType()->isIntegerTy() &&
+           "Truncation requires an integer type");
     auto *TruncType = cast<IntegerType>(EntryVal->getType());
     Step = Builder.CreateTrunc(Step, TruncType);
     Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType);
   }
   Value *SplatStart = Builder.CreateVectorSplat(VF, Start);
-  Value *SteppedStart = getStepVector(SplatStart, 0, Step);
+  Value *SteppedStart =
+      getStepVector(SplatStart, 0, Step, II.getInductionOpcode());
+
+  // We create vector phi nodes for both integer and floating-point induction
+  // variables. Here, we determine the kind of arithmetic we will perform.
+  Instruction::BinaryOps AddOp;
+  Instruction::BinaryOps MulOp;
+  if (Step->getType()->isIntegerTy()) {
+    AddOp = Instruction::Add;
+    MulOp = Instruction::Mul;
+  } else {
+    AddOp = Instruction::FAdd;
+    MulOp = Instruction::FMul;
+  }
+
+  // Multiply the vectorization factor by the step using integer or
+  // floating-point arithmetic as appropriate.
+  Value *ConstVF = getIntOrFpConstant(Step->getType(), VF);
+  Value *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, Step, ConstVF));
 
   // Create a vector splat to use in the induction update.
   //
   // FIXME: If the step is non-constant, we create the vector splat with
   //        IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't
   //        handle a constant vector splat.
-  auto *ConstVF = ConstantInt::getSigned(Step->getType(), VF);
-  auto *Mul = Builder.CreateMul(Step, ConstVF);
   Value *SplatVF = isa<Constant>(Mul)
                        ? ConstantVector::getSplat(VF, cast<Constant>(Mul))
                        : Builder.CreateVectorSplat(VF, Mul);
@@ -2392,8 +2425,8 @@
   VectorParts Entry(UF);
   for (unsigned Part = 0; Part < UF; ++Part) {
     Entry[Part] = LastInduction;
-    LastInduction = cast<Instruction>(
-        Builder.CreateAdd(LastInduction, SplatVF, "step.add"));
+    LastInduction = cast<Instruction>(addFastMathFlag(
+        Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add")));
   }
   VectorLoopValueMap.initVector(EntryVal, Entry);
   if (isa<TruncInst>(EntryVal))
@@ -2426,7 +2459,11 @@
   return any_of(IV->users(), isScalarInst);
 }
 
-void InnerLoopVectorizer::widenIntInduction(PHINode *IV, TruncInst *Trunc) {
+void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc) {
+  // Only integer IVs may serve as the primary induction variable; a
+  // floating-point IV must therefore be distinct from OldInduction.
+  assert((IV->getType()->isIntegerTy() || IV != OldInduction) &&
+         "Primary induction variable must have an integer type");
 
   auto II = Legal->getInductionVars()->find(IV);
   assert(II != Legal->getInductionVars()->end() && "IV is not an induction");
@@ -2455,15 +2492,20 @@
   assert(PSE.getSE()->isLoopInvariant(ID.getStep(), OrigLoop) &&
          "Induction step should be loop invariant");
   auto &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
-  SCEVExpander Exp(*PSE.getSE(), DL, "induction");
-  Value *Step = Exp.expandCodeFor(ID.getStep(), ID.getStep()->getType(),
-                                  LoopVectorPreHeader->getTerminator());
+  Value *Step = nullptr;
+  if (PSE.getSE()->isSCEVable(IV->getType())) {
+    SCEVExpander Exp(*PSE.getSE(), DL, "induction");
+    Step = Exp.expandCodeFor(ID.getStep(), ID.getStep()->getType(),
+                             LoopVectorPreHeader->getTerminator());
+  } else {
+    Step = cast<SCEVUnknown>(ID.getStep())->getValue();
+  }
 
   // Try to create a new independent vector induction variable. If we can't
   // create the phi node, we will splat the scalar induction variable in each
   // loop iteration.
   if (VF > 1 && !shouldScalarizeInstruction(EntryVal)) {
-    createVectorIntInductionPHI(ID, Step, EntryVal);
+    createVectorIntOrFpInductionPHI(ID, Step, EntryVal);
     VectorizedIV = true;
   }
 
@@ -2482,7 +2524,10 @@
     } else {
       ScalarIV = Induction;
       if (IV != OldInduction) {
-        ScalarIV = Builder.CreateSExtOrTrunc(ScalarIV, IV->getType());
+        ScalarIV = IV->getType()->isIntegerTy()
+                       ? Builder.CreateSExtOrTrunc(ScalarIV, IV->getType())
+                       : Builder.CreateCast(Instruction::SIToFP, ScalarIV,
+                                            IV->getType());
         ScalarIV = ID.transform(Builder, ScalarIV, PSE.getSE(), DL);
         ScalarIV->setName("offset.idx");
       }
@@ -2495,7 +2540,8 @@
     Value *Broadcasted = getBroadcastInstrs(ScalarIV);
     VectorParts Entry(UF);
     for (unsigned Part = 0; Part < UF; ++Part)
-      Entry[Part] = getStepVector(Broadcasted, VF * Part, Step);
+      Entry[Part] =
+          getStepVector(Broadcasted, VF * Part, Step, ID.getInductionOpcode());
     VectorLoopValueMap.initVector(EntryVal, Entry);
     if (Trunc)
       addMetadata(Entry, Trunc);
@@ -2575,8 +2621,20 @@
 
-  // Get the value type and ensure it and the step have the same integer type.
+  // Get the value type and ensure it and the step have the same type.
   Type *ScalarIVTy = ScalarIV->getType()->getScalarType();
-  assert(ScalarIVTy->isIntegerTy() && ScalarIVTy == Step->getType() &&
-         "Val and Step should have the same integer type");
+  assert(ScalarIVTy == Step->getType() &&
+         "Val and Step should have the same type");
+
+  // We build scalar steps for both integer and floating-point induction
+  // variables. Here, we determine the kind of arithmetic we will perform.
+  Instruction::BinaryOps AddOp;
+  Instruction::BinaryOps MulOp;
+  if (ScalarIVTy->isIntegerTy()) {
+    AddOp = Instruction::Add;
+    MulOp = Instruction::Mul;
+  } else {
+    AddOp = Instruction::FAdd;
+    MulOp = Instruction::FMul;
+  }
 
   // Determine the number of scalars we need to generate for each unroll
   // iteration. If EntryVal is uniform, we only need to generate the first
@@ -2589,9 +2647,9 @@
   for (unsigned Part = 0; Part < UF; ++Part) {
     Entry[Part].resize(VF);
     for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
-      auto *StartIdx = ConstantInt::get(ScalarIVTy, VF * Part + Lane);
-      auto *Mul = Builder.CreateMul(StartIdx, Step);
-      auto *Add = Builder.CreateAdd(ScalarIV, Mul);
+      auto *StartIdx = getIntOrFpConstant(ScalarIVTy, VF * Part + Lane);
+      auto *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, StartIdx, Step));
+      auto *Add = addFastMathFlag(Builder.CreateBinOp(AddOp, ScalarIV, Mul));
       Entry[Part][Lane] = Add;
     }
   }
@@ -3674,16 +3732,6 @@
   }
 }
 
-/// \brief Adds a 'fast' flag to floating point operations.
-static Value *addFastMathFlag(Value *V) {
-  if (isa<FPMathOperator>(V)) {
-    FastMathFlags Flags;
-    Flags.setUnsafeAlgebra();
-    cast<Instruction>(V)->setFastMathFlags(Flags);
-  }
-  return V;
-}
-
 /// \brief Estimate the overhead of scalarizing an instruction. This is a
 /// convenience wrapper for the type-based getScalarizationOverhead API.
 static unsigned getScalarizationOverhead(Instruction *I, unsigned VF,
@@ -4684,7 +4732,8 @@
   case InductionDescriptor::IK_NoInduction:
     llvm_unreachable("Unknown induction");
   case InductionDescriptor::IK_IntInduction:
-    return widenIntInduction(P);
+  case InductionDescriptor::IK_FpInduction:
+    return widenIntOrFpInduction(P);
   case InductionDescriptor::IK_PtrInduction: {
     // Handle the pointer induction variable case.
     assert(P->getType()->isPointerTy() && "Unexpected type.");
@@ -4711,30 +4760,6 @@
     VectorLoopValueMap.initScalar(P, Entry);
     return;
   }
-  case InductionDescriptor::IK_FpInduction: {
-    assert(P->getType() == II.getStartValue()->getType() &&
-           "Types must match");
-    // Handle other induction variables that are now based on the
-    // canonical one.
-    assert(P != OldInduction && "Primary induction can be integer only");
-
-    Value *V = Builder.CreateCast(Instruction::SIToFP, Induction, P->getType());
-    V = II.transform(Builder, V, PSE.getSE(), DL);
-    V->setName("fp.offset.idx");
-
-    // Now we have scalar op: %fp.offset.idx = StartVal +/- Induction*StepVal
-
-    Value *Broadcasted = getBroadcastInstrs(V);
-    // After broadcasting the induction variable we need to make the vector
-    // consecutive by adding StepVal*0, StepVal*1, StepVal*2, etc.
-    Value *StepVal = cast<SCEVUnknown>(II.getStep())->getValue();
-    VectorParts Entry(UF);
-    for (unsigned part = 0; part < UF; ++part)
-      Entry[part] = getStepVector(Broadcasted, VF * part, StepVal,
-                                  II.getInductionOpcode());
-    VectorLoopValueMap.initVector(P, Entry);
-    return;
-  }
   }
 }
 
@@ -4909,8 +4934,8 @@
       // because (a) FP conversions lose precision, (b) sext/zext may wrap, and
       // (c) other casts depend on pointer size.
       if (Cost->isOptimizableIVTruncate(CI, VF)) {
-        widenIntInduction(cast<PHINode>(CI->getOperand(0)),
-                          cast<TruncInst>(CI));
+        widenIntOrFpInduction(cast<PHINode>(CI->getOperand(0)),
+                              cast<TruncInst>(CI));
         break;
       }
 
Index: test/Transforms/LoopVectorize/float-induction.ll
===================================================================
--- test/Transforms/LoopVectorize/float-induction.ll
+++ test/Transforms/LoopVectorize/float-induction.ll
@@ -1,33 +1,55 @@
 ; RUN: opt < %s -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -dce -instcombine -S | FileCheck --check-prefix VEC4_INTERL1 %s
 ; RUN: opt < %s -loop-vectorize -force-vector-interleave=2 -force-vector-width=4 -dce -instcombine -S | FileCheck --check-prefix VEC4_INTERL2 %s
 ; RUN: opt < %s -loop-vectorize -force-vector-interleave=2 -force-vector-width=1 -dce -instcombine -S | FileCheck --check-prefix VEC1_INTERL2 %s
+; RUN: opt < %s -loop-vectorize -force-vector-interleave=1 -force-vector-width=2 -dce -simplifycfg -instcombine -S | FileCheck --check-prefix=VEC2_INTERL1_PRED_STORE %s
 
 ; VEC4_INTERL1-LABEL: @fp_iv_loop1(
-; VEC4_INTERL1: 
%[[FP_INC:.*]] = load float, float* @fp_inc -; VEC4_INTERL1: vector.body: -; VEC4_INTERL1: %[[FP_INDEX:.*]] = sitofp i64 {{.*}} to float -; VEC4_INTERL1: %[[VEC_INCR:.*]] = fmul fast float {{.*}}, %[[FP_INDEX]] -; VEC4_INTERL1: %[[FP_OFFSET_IDX:.*]] = fsub fast float %init, %[[VEC_INCR]] -; VEC4_INTERL1: %[[BRCT_INSERT:.*]] = insertelement <4 x float> undef, float %[[FP_OFFSET_IDX]], i32 0 -; VEC4_INTERL1-NEXT: %[[BRCT_SPLAT:.*]] = shufflevector <4 x float> %[[BRCT_INSERT]], <4 x float> undef, <4 x i32> zeroinitializer -; VEC4_INTERL1: %[[BRCT_INSERT:.*]] = insertelement {{.*}} %[[FP_INC]] -; VEC4_INTERL1-NEXT: %[[FP_INC_BCST:.*]] = shufflevector <4 x float> %[[BRCT_INSERT]], {{.*}} zeroinitializer -; VEC4_INTERL1: %[[VSTEP:.*]] = fmul fast <4 x float> %[[FP_INC_BCST]], -; VEC4_INTERL1-NEXT: %[[VEC_INDUCTION:.*]] = fsub fast <4 x float> %[[BRCT_SPLAT]], %[[VSTEP]] -; VEC4_INTERL1: store <4 x float> %[[VEC_INDUCTION]] +; VEC4_INTERL1: vector.ph: +; VEC4_INTERL1-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x float> undef, float %init, i32 0 +; VEC4_INTERL1-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT]], <4 x float> undef, <4 x i32> zeroinitializer +; VEC4_INTERL1-NEXT: [[DOTSPLATINSERT2:%.*]] = insertelement <4 x float> undef, float %fpinc, i32 0 +; VEC4_INTERL1-NEXT: [[DOTSPLAT3:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT2]], <4 x float> undef, <4 x i32> zeroinitializer +; VEC4_INTERL1-NEXT: [[TMP5:%.*]] = fmul fast <4 x float> [[DOTSPLAT3]], +; VEC4_INTERL1-NEXT: [[INDUCTION4:%.*]] = fsub fast <4 x float> [[DOTSPLAT]], [[TMP5]] +; VEC4_INTERL1-NEXT: [[TMP6:%.*]] = fmul fast float %fpinc, 4.000000e+00 +; VEC4_INTERL1-NEXT: [[DOTSPLATINSERT5:%.*]] = insertelement <4 x float> undef, float [[TMP6]], i32 0 +; VEC4_INTERL1-NEXT: [[DOTSPLAT6:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT5]], <4 x float> undef, <4 x i32> zeroinitializer +; VEC4_INTERL1-NEXT: br label %vector.body +; VEC4_INTERL1: vector.body: +; VEC4_INTERL1-NEXT: 
[[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ] +; VEC4_INTERL1-NEXT: [[VEC_IND:%.*]] = phi <4 x float> [ [[INDUCTION4]], %vector.ph ], [ [[VEC_IND_NEXT:%.*]], %vector.body ] +; VEC4_INTERL1-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, float* %A, i64 [[INDEX]] +; VEC4_INTERL1-NEXT: [[TMP8:%.*]] = bitcast float* [[TMP7]] to <4 x float>* +; VEC4_INTERL1-NEXT: store <4 x float> [[VEC_IND]], <4 x float>* [[TMP8]], align 4 +; VEC4_INTERL1-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 +; VEC4_INTERL1-NEXT: [[VEC_IND_NEXT]] = fadd fast <4 x float> [[VEC_IND]], [[DOTSPLAT6]] +; VEC4_INTERL1: br i1 {{.*}}, label %middle.block, label %vector.body ; VEC4_INTERL2-LABEL: @fp_iv_loop1( -; VEC4_INTERL2: %[[FP_INC:.*]] = load float, float* @fp_inc -; VEC4_INTERL2: vector.body: -; VEC4_INTERL2: %[[INDEX:.*]] = sitofp i64 {{.*}} to float -; VEC4_INTERL2: %[[VEC_INCR:.*]] = fmul fast float %{{.*}}, %[[INDEX]] -; VEC4_INTERL2: fsub fast float %init, %[[VEC_INCR]] -; VEC4_INTERL2: %[[VSTEP1:.*]] = fmul fast <4 x float> %{{.*}}, -; VEC4_INTERL2-NEXT: %[[VEC_INDUCTION1:.*]] = fsub fast <4 x float> {{.*}}, %[[VSTEP1]] -; VEC4_INTERL2: %[[VSTEP2:.*]] = fmul fast <4 x float> %{{.*}}, -; VEC4_INTERL2-NEXT: %[[VEC_INDUCTION2:.*]] = fsub fast <4 x float> {{.*}}, %[[VSTEP2]] -; VEC4_INTERL2: store <4 x float> %[[VEC_INDUCTION1]] -; VEC4_INTERL2: store <4 x float> %[[VEC_INDUCTION2]] +; VEC4_INTERL2: vector.ph: +; VEC4_INTERL2-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x float> undef, float %init, i32 0 +; VEC4_INTERL2-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT]], <4 x float> undef, <4 x i32> zeroinitializer +; VEC4_INTERL2-NEXT: [[DOTSPLATINSERT3:%.*]] = insertelement <4 x float> undef, float %fpinc, i32 0 +; VEC4_INTERL2-NEXT: [[DOTSPLAT4:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT3]], <4 x float> undef, <4 x i32> zeroinitializer +; VEC4_INTERL2-NEXT: [[TMP5:%.*]] = fmul fast <4 x float> [[DOTSPLAT4]], +; 
VEC4_INTERL2-NEXT: [[INDUCTION5:%.*]] = fsub fast <4 x float> [[DOTSPLAT]], [[TMP5]] +; VEC4_INTERL2-NEXT: [[TMP6:%.*]] = fmul fast float %fpinc, 4.000000e+00 +; VEC4_INTERL2-NEXT: [[DOTSPLATINSERT6:%.*]] = insertelement <4 x float> undef, float [[TMP6]], i32 0 +; VEC4_INTERL2-NEXT: [[DOTSPLAT7:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT6]], <4 x float> undef, <4 x i32> zeroinitializer +; VEC4_INTERL2-NEXT: br label %vector.body +; VEC4_INTERL2: vector.body: +; VEC4_INTERL2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ] +; VEC4_INTERL2-NEXT: [[VEC_IND:%.*]] = phi <4 x float> [ [[INDUCTION5]], %vector.ph ], [ [[VEC_IND_NEXT:%.*]], %vector.body ] +; VEC4_INTERL2-NEXT: [[STEP_ADD:%.*]] = fadd fast <4 x float> [[VEC_IND]], [[DOTSPLAT7]] +; VEC4_INTERL2-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, float* %A, i64 [[INDEX]] +; VEC4_INTERL2-NEXT: [[TMP8:%.*]] = bitcast float* [[TMP7]] to <4 x float>* +; VEC4_INTERL2-NEXT: store <4 x float> [[VEC_IND]], <4 x float>* [[TMP8]], align 4 +; VEC4_INTERL2-NEXT: [[TMP9:%.*]] = getelementptr float, float* [[TMP7]], i64 4 +; VEC4_INTERL2-NEXT: [[TMP10:%.*]] = bitcast float* [[TMP9]] to <4 x float>* +; VEC4_INTERL2-NEXT: store <4 x float> [[STEP_ADD]], <4 x float>* [[TMP10]], align 4 +; VEC4_INTERL2-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 8 +; VEC4_INTERL2-NEXT: [[VEC_IND_NEXT]] = fadd fast <4 x float> [[STEP_ADD]], [[DOTSPLAT7]] +; VEC4_INTERL2: br i1 {{.*}}, label %middle.block, label %vector.body ; VEC1_INTERL2-LABEL: @fp_iv_loop1( ; VEC1_INTERL2: %[[FP_INC:.*]] = load float, float* @fp_inc @@ -85,15 +107,20 @@ ;} ; VEC4_INTERL1-LABEL: @fp_iv_loop2( -; VEC4_INTERL1: vector.body -; VEC4_INTERL1: %[[index:.*]] = phi i64 [ 0, %vector.ph ] -; VEC4_INTERL1: sitofp i64 %[[index]] to float -; VEC4_INTERL1: %[[VAR1:.*]] = fmul fast float {{.*}}, 5.000000e-01 -; VEC4_INTERL1: %[[VAR2:.*]] = fadd fast float %[[VAR1]] -; VEC4_INTERL1: insertelement <4 x float> undef, float %[[VAR2]], i32 
0 -; VEC4_INTERL1: shufflevector <4 x float> {{.*}}, <4 x float> undef, <4 x i32> zeroinitializer -; VEC4_INTERL1: fadd fast <4 x float> {{.*}}, -; VEC4_INTERL1: store <4 x float> +; VEC4_INTERL1: vector.ph: +; VEC4_INTERL1-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x float> undef, float %init, i32 0 +; VEC4_INTERL1-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT]], <4 x float> undef, <4 x i32> zeroinitializer +; VEC4_INTERL1-NEXT: [[INDUCTION2:%.*]] = fadd fast <4 x float> [[DOTSPLAT]], +; VEC4_INTERL1-NEXT: br label %vector.body +; VEC4_INTERL1: vector.body: +; VEC4_INTERL1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ] +; VEC4_INTERL1-NEXT: [[VEC_IND:%.*]] = phi <4 x float> [ [[INDUCTION2]], %vector.ph ], [ [[VEC_IND_NEXT:%.*]], %vector.body ] +; VEC4_INTERL1-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, float* %A, i64 [[INDEX]] +; VEC4_INTERL1-NEXT: [[TMP6:%.*]] = bitcast float* [[TMP5]] to <4 x float>* +; VEC4_INTERL1-NEXT: store <4 x float> [[VEC_IND]], <4 x float>* [[TMP6]], align 4 +; VEC4_INTERL1-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 +; VEC4_INTERL1-NEXT: [[VEC_IND_NEXT]] = fadd fast <4 x float> [[VEC_IND]], +; VEC4_INTERL1: br i1 {{.*}}, label %middle.block, label %vector.body define void @fp_iv_loop2(float %init, float* noalias nocapture %A, i32 %N) #0 { entry: @@ -133,14 +160,43 @@ ; C[i] = y; ; } ;} + ; VEC4_INTERL1-LABEL: @fp_iv_loop3( -; VEC4_INTERL1: vector.body -; VEC4_INTERL1: %[[index:.*]] = phi i64 [ 0, %vector.ph ] -; VEC4_INTERL1: sitofp i64 %[[index]] to float -; VEC4_INTERL1: %[[VAR1:.*]] = fmul fast float {{.*}}, -5.000000e-01 -; VEC4_INTERL1: fadd fast float %[[VAR1]] -; VEC4_INTERL1: fadd fast <4 x float> {{.*}}, -; VEC4_INTERL1: store <4 x float> +; VEC4_INTERL1: for.body.lr.ph: +; VEC4_INTERL1: [[TMP0:%.*]] = load float, float* @fp_inc, align 4 +; VEC4_INTERL1: vector.ph: +; VEC4_INTERL1-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x float> undef, float %init, 
i32 0 +; VEC4_INTERL1-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT]], <4 x float> undef, <4 x i32> zeroinitializer +; VEC4_INTERL1-NEXT: [[DOTSPLATINSERT5:%.*]] = insertelement <4 x float> undef, float [[TMP0]], i32 0 +; VEC4_INTERL1-NEXT: [[DOTSPLAT6:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT5]], <4 x float> undef, <4 x i32> zeroinitializer +; VEC4_INTERL1-NEXT: [[TMP7:%.*]] = fmul fast <4 x float> [[DOTSPLAT6]], +; VEC4_INTERL1-NEXT: [[INDUCTION7:%.*]] = fadd fast <4 x float> [[DOTSPLAT]], [[TMP7]] +; VEC4_INTERL1-NEXT: [[TMP8:%.*]] = fmul fast float [[TMP0]], 4.000000e+00 +; VEC4_INTERL1-NEXT: [[DOTSPLATINSERT8:%.*]] = insertelement <4 x float> undef, float [[TMP8]], i32 0 +; VEC4_INTERL1-NEXT: [[DOTSPLAT9:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT8]], <4 x float> undef, <4 x i32> zeroinitializer +; VEC4_INTERL1-NEXT: [[BROADCAST_SPLATINSERT12:%.*]] = insertelement <4 x float> undef, float [[TMP0]], i32 0 +; VEC4_INTERL1-NEXT: [[BROADCAST_SPLAT13:%.*]] = shufflevector <4 x float> [[BROADCAST_SPLATINSERT12]], <4 x float> undef, <4 x i32> zeroinitializer +; VEC4_INTERL1-NEXT: br label [[VECTOR_BODY:%.*]] +; VEC4_INTERL1: vector.body: +; VEC4_INTERL1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ] +; VEC4_INTERL1-NEXT: [[VEC_IND:%.*]] = phi <4 x float> [ , %vector.ph ], [ [[VEC_IND_NEXT:%.*]], %vector.body ] +; VEC4_INTERL1-NEXT: [[VEC_IND10:%.*]] = phi <4 x float> [ [[INDUCTION7]], %vector.ph ], [ [[VEC_IND_NEXT11:%.*]], %vector.body ] +; VEC4_INTERL1-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, float* %A, i64 [[INDEX]] +; VEC4_INTERL1-NEXT: [[TMP10:%.*]] = bitcast float* [[TMP9]] to <4 x float>* +; VEC4_INTERL1-NEXT: store <4 x float> [[VEC_IND10]], <4 x float>* [[TMP10]], align 4 +; VEC4_INTERL1-NEXT: [[TMP11:%.*]] = fadd fast <4 x float> [[VEC_IND10]], [[BROADCAST_SPLAT13]] +; VEC4_INTERL1-NEXT: [[TMP12:%.*]] = fadd fast <4 x float> [[VEC_IND]], +; VEC4_INTERL1-NEXT: 
[[TMP13:%.*]] = fadd fast <4 x float> [[TMP12]], [[TMP11]] +; VEC4_INTERL1-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, float* %B, i64 [[INDEX]] +; VEC4_INTERL1-NEXT: [[TMP15:%.*]] = bitcast float* [[TMP14]] to <4 x float>* +; VEC4_INTERL1-NEXT: store <4 x float> [[TMP13]], <4 x float>* [[TMP15]], align 4 +; VEC4_INTERL1-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, float* %C, i64 [[INDEX]] +; VEC4_INTERL1-NEXT: [[TMP17:%.*]] = bitcast float* [[TMP16]] to <4 x float>* +; VEC4_INTERL1-NEXT: store <4 x float> [[TMP12]], <4 x float>* [[TMP17]], align 4 +; VEC4_INTERL1-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 +; VEC4_INTERL1-NEXT: [[VEC_IND_NEXT]] = fadd fast <4 x float> [[VEC_IND]], +; VEC4_INTERL1-NEXT: [[VEC_IND_NEXT11]] = fadd fast <4 x float> [[VEC_IND10]], [[DOTSPLAT9]] +; VEC4_INTERL1: br i1 {{.*}}, label %middle.block, label %vector.body define void @fp_iv_loop3(float %init, float* noalias nocapture %A, float* noalias nocapture %B, float* noalias nocapture %C, i32 %N) #1 { entry: @@ -186,10 +242,17 @@ ;} ; VEC4_INTERL1-LABEL: @fp_iv_loop4( -; VEC4_INTERL1: vector.body -; VEC4_INTERL1-NOT: fmul fast <4 x float> -; VEC4_INTERL1: %[[induction:.*]] = fadd fast <4 x float> %{{.*}}, -; VEC4_INTERL1: store <4 x float> %[[induction]] +; VEC4_INTERL1: vector.ph: +; VEC4_INTERL1-NEXT: br label %vector.body +; VEC4_INTERL1: vector.body: +; VEC4_INTERL1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ] +; VEC4_INTERL1-NEXT: [[VEC_IND:%.*]] = phi <4 x float> [ , %vector.ph ], [ [[VEC_IND_NEXT:%.*]], %vector.body ] +; VEC4_INTERL1-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, float* %A, i64 [[INDEX]] +; VEC4_INTERL1-NEXT: [[TMP6:%.*]] = bitcast float* [[TMP5]] to <4 x float>* +; VEC4_INTERL1-NEXT: store <4 x float> [[VEC_IND]], <4 x float>* [[TMP6]], align 4 +; VEC4_INTERL1-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 +; VEC4_INTERL1-NEXT: [[VEC_IND_NEXT]] = fadd fast <4 x float> [[VEC_IND]], +; VEC4_INTERL1: br i1 
{{.*}}, label %middle.block, label %vector.body define void @fp_iv_loop4(float* noalias nocapture %A, i32 %N) { entry: @@ -216,3 +279,56 @@ for.end: ; preds = %for.end.loopexit, %entry ret void } + +; VEC2_INTERL1_PRED_STORE-LABEL: @non_primary_iv_float_scalar( +; VEC2_INTERL1_PRED_STORE: vector.body: +; VEC2_INTERL1_PRED_STORE-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE4:.*]] ], [ 0, %min.iters.checked ] +; VEC2_INTERL1_PRED_STORE-NEXT: [[TMP1:%.*]] = sitofp i64 [[INDEX]] to float +; VEC2_INTERL1_PRED_STORE-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, float* %A, i64 [[INDEX]] +; VEC2_INTERL1_PRED_STORE-NEXT: [[TMP3:%.*]] = bitcast float* [[TMP2]] to <2 x float>* +; VEC2_INTERL1_PRED_STORE-NEXT: [[WIDE_LOAD:%.*]] = load <2 x float>, <2 x float>* [[TMP3]], align 4 +; VEC2_INTERL1_PRED_STORE-NEXT: [[TMP4:%.*]] = fcmp fast oeq <2 x float> [[WIDE_LOAD]], zeroinitializer +; VEC2_INTERL1_PRED_STORE-NEXT: [[TMP5:%.*]] = extractelement <2 x i1> [[TMP4]], i32 0 +; VEC2_INTERL1_PRED_STORE-NEXT: br i1 [[TMP5]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]] +; VEC2_INTERL1_PRED_STORE: [[PRED_STORE_IF]]: +; VEC2_INTERL1_PRED_STORE-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, float* %A, i64 [[INDEX]] +; VEC2_INTERL1_PRED_STORE-NEXT: store float [[TMP1]], float* [[TMP6]], align 4 +; VEC2_INTERL1_PRED_STORE-NEXT: br label %[[PRED_STORE_CONTINUE]] +; VEC2_INTERL1_PRED_STORE: [[PRED_STORE_CONTINUE]]: +; VEC2_INTERL1_PRED_STORE-NEXT: [[TMP7:%.*]] = extractelement <2 x i1> [[TMP4]], i32 1 +; VEC2_INTERL1_PRED_STORE-NEXT: br i1 [[TMP7]], label %[[PRED_STORE_IF3:.*]], label %[[PRED_STORE_CONTINUE4]] +; VEC2_INTERL1_PRED_STORE: [[PRED_STORE_IF3]]: +; VEC2_INTERL1_PRED_STORE-NEXT: [[TMP8:%.*]] = fadd fast float [[TMP1]], 1.000000e+00 +; VEC2_INTERL1_PRED_STORE-NEXT: [[TMP9:%.*]] = or i64 [[INDEX]], 1 +; VEC2_INTERL1_PRED_STORE-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, float* %A, i64 [[TMP9]] +; 
VEC2_INTERL1_PRED_STORE-NEXT: store float [[TMP8]], float* [[TMP10]], align 4 +; VEC2_INTERL1_PRED_STORE-NEXT: br label %[[PRED_STORE_CONTINUE4]] +; VEC2_INTERL1_PRED_STORE: [[PRED_STORE_CONTINUE4]]: +; VEC2_INTERL1_PRED_STORE-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 2 +; VEC2_INTERL1_PRED_STORE: br i1 {{.*}}, label %middle.block, label %vector.body + +define void @non_primary_iv_float_scalar(float* %A, i64 %N) { +entry: + br label %for.body + +for.body: + %i = phi i64 [ %i.next, %for.inc ], [ 0, %entry ] + %j = phi float [ %j.next, %for.inc ], [ 0.0, %entry ] + %tmp0 = getelementptr inbounds float, float* %A, i64 %i + %tmp1 = load float, float* %tmp0, align 4 + %tmp2 = fcmp fast oeq float %tmp1, 0.0 + br i1 %tmp2, label %if.pred, label %for.inc + +if.pred: + store float %j, float* %tmp0, align 4 + br label %for.inc + +for.inc: + %i.next = add nuw nsw i64 %i, 1 + %j.next = fadd fast float %j, 1.0 + %cond = icmp slt i64 %i.next, %N + br i1 %cond, label %for.body, label %for.end + +for.end: + ret void +}