Index: llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp =================================================================== --- llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp +++ llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -369,6 +369,23 @@ /// we always assume predicated blocks have a 50% chance of executing. static unsigned getReciprocalPredBlockProb() { return 2; } +/// A helper function that adds a 'fast' flag to floating-point operations. +static Value *addFastMathFlag(Value *V) { + if (isa(V)) { + FastMathFlags Flags; + Flags.setUnsafeAlgebra(); + cast(V)->setFastMathFlags(Flags); + } + return V; +} + +/// A helper function that returns an integer or floating-point constant with +/// value C. +static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) { + return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C) + : ConstantFP::get(Ty, C); +} + /// InnerLoopVectorizer vectorizes loops which contain only one basic /// block to a specified vectorization factor (VF). /// This class performs the widening of scalars into vectors, or multiple @@ -522,19 +539,21 @@ /// \p EntryVal is the value from the original loop that maps to the steps. /// Note that \p EntryVal doesn't have to be an induction variable (e.g., it /// can be a truncate instruction). - void buildScalarSteps(Value *ScalarIV, Value *Step, Value *EntryVal); + void buildScalarSteps(Value *ScalarIV, Value *Step, Value *EntryVal, + const InductionDescriptor &ID); /// Create a vector induction phi node based on an existing scalar one. \p /// EntryVal is the value from the original loop that maps to the vector phi /// node, and \p Step is the loop-invariant step. If \p EntryVal is a /// truncate instruction, instead of widening the original IV, we widen a /// version of the IV truncated to \p EntryVal's type. - void createVectorIntInductionPHI(const InductionDescriptor &II, Value *Step, - Instruction *EntryVal); + void createVectorIntOrFpInductionPHI(const InductionDescriptor &II, + Value *Step, Instruction *EntryVal); - /// Widen an integer induction variable \p IV. If \p Trunc is provided, the - /// induction variable will first be truncated to the corresponding type. - void widenIntInduction(PHINode *IV, TruncInst *Trunc = nullptr); + /// Widen an integer or floating-point induction variable \p IV. If \p Trunc + /// is provided, the integer induction variable will first be truncated to + /// the corresponding type. + void widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc = nullptr); /// Returns true if an instruction \p I should be scalarized instead of /// vectorized for the chosen vectorization factor. @@ -2324,30 +2343,46 @@ return Shuf; } -void InnerLoopVectorizer::createVectorIntInductionPHI( +void InnerLoopVectorizer::createVectorIntOrFpInductionPHI( const InductionDescriptor &II, Value *Step, Instruction *EntryVal) { Value *Start = II.getStartValue(); - assert(Step->getType()->isIntegerTy() && - "Cannot widen an IV having a step with a non-integer type"); // Construct the initial value of the vector IV in the vector loop preheader auto CurrIP = Builder.saveIP(); Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); if (isa(EntryVal)) { + assert(Start->getType()->isIntegerTy() && + "Truncation requires an integer type"); auto *TruncType = cast(EntryVal->getType()); Step = Builder.CreateTrunc(Step, TruncType); Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType); } Value *SplatStart = Builder.CreateVectorSplat(VF, Start); - Value *SteppedStart = getStepVector(SplatStart, 0, Step); + Value *SteppedStart = + getStepVector(SplatStart, 0, Step, II.getInductionOpcode()); + + // We create vector phi nodes for both integer and floating-point induction + // variables. Here, we determine the kind of arithmetic we will perform. + Instruction::BinaryOps AddOp; + Instruction::BinaryOps MulOp; + if (Step->getType()->isIntegerTy()) { + AddOp = Instruction::Add; + MulOp = Instruction::Mul; + } else { + AddOp = II.getInductionOpcode(); + MulOp = Instruction::FMul; + } + + // Multiply the vectorization factor by the step using integer or + // floating-point arithmetic as appropriate. + Value *ConstVF = getSignedIntOrFpConstant(Step->getType(), VF); + Value *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, Step, ConstVF)); // Create a vector splat to use in the induction update. // // FIXME: If the step is non-constant, we create the vector splat with // IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't // handle a constant vector splat. - auto *ConstVF = ConstantInt::getSigned(Step->getType(), VF); - auto *Mul = Builder.CreateMul(Step, ConstVF); Value *SplatVF = isa(Mul) ? ConstantVector::getSplat(VF, cast(Mul)) : Builder.CreateVectorSplat(VF, Mul); @@ -2361,8 +2396,8 @@ VectorParts Entry(UF); for (unsigned Part = 0; Part < UF; ++Part) { Entry[Part] = LastInduction; - LastInduction = cast( - Builder.CreateAdd(LastInduction, SplatVF, "step.add")); + LastInduction = cast(addFastMathFlag( + Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add"))); } VectorLoopValueMap.initVector(EntryVal, Entry); if (isa(EntryVal)) @@ -2395,7 +2430,10 @@ return any_of(IV->users(), isScalarInst); } -void InnerLoopVectorizer::widenIntInduction(PHINode *IV, TruncInst *Trunc) { +void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc) { + + assert((IV->getType()->isIntegerTy() || IV != OldInduction) && + "Primary induction variable must have an integer type"); auto II = Legal->getInductionVars()->find(IV); assert(II != Legal->getInductionVars()->end() && "IV is not an induction"); @@ -2424,15 +2462,20 @@ assert(PSE.getSE()->isLoopInvariant(ID.getStep(), OrigLoop) && "Induction step should be loop invariant"); auto &DL = OrigLoop->getHeader()->getModule()->getDataLayout(); - SCEVExpander Exp(*PSE.getSE(), DL, "induction"); - Value *Step = Exp.expandCodeFor(ID.getStep(), ID.getStep()->getType(), - LoopVectorPreHeader->getTerminator()); + Value *Step = nullptr; + if (PSE.getSE()->isSCEVable(IV->getType())) { + SCEVExpander Exp(*PSE.getSE(), DL, "induction"); + Step = Exp.expandCodeFor(ID.getStep(), ID.getStep()->getType(), + LoopVectorPreHeader->getTerminator()); + } else { + Step = cast(ID.getStep())->getValue(); + } // Try to create a new independent vector induction variable. If we can't // create the phi node, we will splat the scalar induction variable in each // loop iteration. if (VF > 1 && !shouldScalarizeInstruction(EntryVal)) { - createVectorIntInductionPHI(ID, Step, EntryVal); + createVectorIntOrFpInductionPHI(ID, Step, EntryVal); VectorizedIV = true; } @@ -2451,7 +2494,10 @@ } else { ScalarIV = Induction; if (IV != OldInduction) { - ScalarIV = Builder.CreateSExtOrTrunc(ScalarIV, IV->getType()); + ScalarIV = IV->getType()->isIntegerTy() + ? Builder.CreateSExtOrTrunc(ScalarIV, IV->getType()) + : Builder.CreateCast(Instruction::SIToFP, Induction, + IV->getType()); ScalarIV = ID.transform(Builder, ScalarIV, PSE.getSE(), DL); ScalarIV->setName("offset.idx"); } @@ -2464,7 +2510,8 @@ Value *Broadcasted = getBroadcastInstrs(ScalarIV); VectorParts Entry(UF); for (unsigned Part = 0; Part < UF; ++Part) - Entry[Part] = getStepVector(Broadcasted, VF * Part, Step); + Entry[Part] = + getStepVector(Broadcasted, VF * Part, Step, ID.getInductionOpcode()); VectorLoopValueMap.initVector(EntryVal, Entry); if (Trunc) addMetadata(Entry, Trunc); @@ -2477,7 +2524,7 @@ // in the loop in the common case prior to InstCombine. We will be trading // one vector extract for each scalar step. if (NeedsScalarIV) - buildScalarSteps(ScalarIV, Step, EntryVal); + buildScalarSteps(ScalarIV, Step, EntryVal, ID); } Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step, @@ -2537,15 +2584,28 @@ } void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step, - Value *EntryVal) { + Value *EntryVal, + const InductionDescriptor &ID) { // We shouldn't have to build scalar steps if we aren't vectorizing. assert(VF > 1 && "VF should be greater than one"); // Get the value type and ensure it and the step have the same integer type. Type *ScalarIVTy = ScalarIV->getType()->getScalarType(); - assert(ScalarIVTy->isIntegerTy() && ScalarIVTy == Step->getType() && - "Val and Step should have the same integer type"); + assert(ScalarIVTy == Step->getType() && + "Val and Step should have the same type"); + + // We build scalar steps for both integer and floating-point induction + // variables. Here, we determine the kind of arithmetic we will perform. + Instruction::BinaryOps AddOp; + Instruction::BinaryOps MulOp; + if (ScalarIVTy->isIntegerTy()) { + AddOp = Instruction::Add; + MulOp = Instruction::Mul; + } else { + AddOp = ID.getInductionOpcode(); + MulOp = Instruction::FMul; + } // Determine the number of scalars we need to generate for each unroll // iteration. If EntryVal is uniform, we only need to generate the first @@ -2558,9 +2618,9 @@ for (unsigned Part = 0; Part < UF; ++Part) { Entry[Part].resize(VF); for (unsigned Lane = 0; Lane < Lanes; ++Lane) { - auto *StartIdx = ConstantInt::get(ScalarIVTy, VF * Part + Lane); - auto *Mul = Builder.CreateMul(StartIdx, Step); - auto *Add = Builder.CreateAdd(ScalarIV, Mul); + auto *StartIdx = getSignedIntOrFpConstant(ScalarIVTy, VF * Part + Lane); + auto *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, StartIdx, Step)); + auto *Add = addFastMathFlag(Builder.CreateBinOp(AddOp, ScalarIV, Mul)); Entry[Part][Lane] = Add; } } @@ -3643,16 +3703,6 @@ } } -/// \brief Adds a 'fast' flag to floating point operations. -static Value *addFastMathFlag(Value *V) { - if (isa(V)) { - FastMathFlags Flags; - Flags.setUnsafeAlgebra(); - cast(V)->setFastMathFlags(Flags); - } - return V; -} - /// \brief Estimate the overhead of scalarizing an instruction. This is a /// convenience wrapper for the type-based getScalarizationOverhead API. static unsigned getScalarizationOverhead(Instruction *I, unsigned VF, @@ -4653,7 +4703,8 @@ case InductionDescriptor::IK_NoInduction: llvm_unreachable("Unknown induction"); case InductionDescriptor::IK_IntInduction: - return widenIntInduction(P); + case InductionDescriptor::IK_FpInduction: + return widenIntOrFpInduction(P); case InductionDescriptor::IK_PtrInduction: { // Handle the pointer induction variable case. assert(P->getType()->isPointerTy() && "Unexpected type."); @@ -4680,30 +4731,6 @@ VectorLoopValueMap.initScalar(P, Entry); return; } - case InductionDescriptor::IK_FpInduction: { - assert(P->getType() == II.getStartValue()->getType() && - "Types must match"); - // Handle other induction variables that are now based on the - // canonical one. - assert(P != OldInduction && "Primary induction can be integer only"); - - Value *V = Builder.CreateCast(Instruction::SIToFP, Induction, P->getType()); - V = II.transform(Builder, V, PSE.getSE(), DL); - V->setName("fp.offset.idx"); - - // Now we have scalar op: %fp.offset.idx = StartVal +/- Induction*StepVal - - Value *Broadcasted = getBroadcastInstrs(V); - // After broadcasting the induction variable we need to make the vector - // consecutive by adding StepVal*0, StepVal*1, StepVal*2, etc. - Value *StepVal = cast(II.getStep())->getValue(); - VectorParts Entry(UF); - for (unsigned part = 0; part < UF; ++part) - Entry[part] = getStepVector(Broadcasted, VF * part, StepVal, - II.getInductionOpcode()); - VectorLoopValueMap.initVector(P, Entry); - return; - } } } @@ -4878,8 +4905,8 @@ // because (a) FP conversions lose precision, (b) sext/zext may wrap, and // (c) other casts depend on pointer size. if (Cost->isOptimizableIVTruncate(CI, VF)) { - widenIntInduction(cast(CI->getOperand(0)), - cast(CI)); + widenIntOrFpInduction(cast(CI->getOperand(0)), + cast(CI)); break; } Index: llvm/trunk/test/Transforms/LoopVectorize/float-induction.ll =================================================================== --- llvm/trunk/test/Transforms/LoopVectorize/float-induction.ll +++ llvm/trunk/test/Transforms/LoopVectorize/float-induction.ll @@ -15,47 +15,50 @@ ; VEC4_INTERL1-LABEL: @fp_iv_loop1( ; VEC4_INTERL1: vector.ph: +; VEC4_INTERL1-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x float> undef, float %init, i32 0 +; VEC4_INTERL1-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT]], <4 x float> undef, <4 x i32> zeroinitializer +; VEC4_INTERL1-NEXT: [[DOTSPLATINSERT2:%.*]] = insertelement <4 x float> undef, float %fpinc, i32 0 +; VEC4_INTERL1-NEXT: [[DOTSPLAT3:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT2]], <4 x float> undef, <4 x i32> zeroinitializer +; VEC4_INTERL1-NEXT: [[TMP5:%.*]] = fmul fast <4 x float> [[DOTSPLAT3]], +; VEC4_INTERL1-NEXT: [[INDUCTION4:%.*]] = fsub fast <4 x float> [[DOTSPLAT]], [[TMP5]] +; VEC4_INTERL1-NEXT: [[TMP6:%.*]] = fmul fast float %fpinc, 4.000000e+00 +; VEC4_INTERL1-NEXT: [[DOTSPLATINSERT5:%.*]] = insertelement <4 x float> undef, float [[TMP6]], i32 0 +; VEC4_INTERL1-NEXT: [[DOTSPLAT6:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT5]], <4 x float> undef, <4 x i32> zeroinitializer ; VEC4_INTERL1-NEXT: br label %vector.body ; VEC4_INTERL1: vector.body: ; VEC4_INTERL1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ] -; VEC4_INTERL1-NEXT: [[TMP5:%.*]] = sitofp i64 [[INDEX]] to float -; VEC4_INTERL1-NEXT: [[TMP6:%.*]] = fmul fast float %fpinc, [[TMP5]] -; VEC4_INTERL1-NEXT: [[FP_OFFSET_IDX:%.*]] = fsub fast float %init, [[TMP6]] -; VEC4_INTERL1-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <4 x float> undef, float [[FP_OFFSET_IDX]], i32 0 -; VEC4_INTERL1-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector <4 x float> [[BROADCAST_SPLATINSERT3]], <4 x float> undef, <4 x i32> zeroinitializer -; VEC4_INTERL1-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x float> undef, float %fpinc, i32 0 -; VEC4_INTERL1-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT]], <4 x float> undef, <4 x i32> zeroinitializer -; VEC4_INTERL1-NEXT: [[TMP7:%.*]] = fmul fast <4 x float> [[DOTSPLAT]], -; VEC4_INTERL1-NEXT: [[INDUCTION5:%.*]] = fsub fast <4 x float> [[BROADCAST_SPLAT4]], [[TMP7]] +; VEC4_INTERL1-NEXT: [[VEC_IND:%.*]] = phi <4 x float> [ [[INDUCTION4]], %vector.ph ], [ [[VEC_IND_NEXT:%.*]], %vector.body ] ; VEC4_INTERL1-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, float* %A, i64 [[INDEX]] ; VEC4_INTERL1-NEXT: [[TMP9:%.*]] = bitcast float* [[TMP8]] to <4 x float>* -; VEC4_INTERL1-NEXT: store <4 x float> [[INDUCTION5]], <4 x float>* [[TMP9]], align 4 +; VEC4_INTERL1-NEXT: store <4 x float> [[VEC_IND]], <4 x float>* [[TMP9]], align 4 ; VEC4_INTERL1-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 +; VEC4_INTERL1-NEXT: [[VEC_IND_NEXT]] = fsub fast <4 x float> [[VEC_IND]], [[DOTSPLAT6]] ; VEC4_INTERL1: br i1 {{.*}}, label %middle.block, label %vector.body ; VEC4_INTERL2-LABEL: @fp_iv_loop1( ; VEC4_INTERL2: vector.ph: +; VEC4_INTERL2-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x float> undef, float %init, i32 0 +; VEC4_INTERL2-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT]], <4 x float> undef, <4 x i32> zeroinitializer +; VEC4_INTERL2-NEXT: [[DOTSPLATINSERT3:%.*]] = insertelement <4 x float> undef, float %fpinc, i32 0 +; VEC4_INTERL2-NEXT: [[DOTSPLAT4:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT3]], <4 x float> undef, <4 x i32> zeroinitializer +; VEC4_INTERL2-NEXT: [[TMP5:%.*]] = fmul fast <4 x float> [[DOTSPLAT4]], +; VEC4_INTERL2-NEXT: [[INDUCTION5:%.*]] = fsub fast <4 x float> [[DOTSPLAT]], [[TMP5]] +; VEC4_INTERL2-NEXT: [[TMP6:%.*]] = fmul fast float %fpinc, 4.000000e+00 +; VEC4_INTERL2-NEXT: [[DOTSPLATINSERT6:%.*]] = insertelement <4 x float> undef, float [[TMP6]], i32 0 +; VEC4_INTERL2-NEXT: [[DOTSPLAT7:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT6]], <4 x float> undef, <4 x i32> zeroinitializer ; VEC4_INTERL2-NEXT: br label %vector.body ; VEC4_INTERL2: vector.body: ; VEC4_INTERL2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ] -; VEC4_INTERL2-NEXT: [[TMP5:%.*]] = sitofp i64 [[INDEX]] to float -; VEC4_INTERL2-NEXT: [[TMP6:%.*]] = fmul fast float %fpinc, [[TMP5]] -; VEC4_INTERL2-NEXT: [[FP_OFFSET_IDX:%.*]] = fsub fast float %init, [[TMP6]] -; VEC4_INTERL2-NEXT: [[BROADCAST_SPLATINSERT4:%.*]] = insertelement <4 x float> undef, float [[FP_OFFSET_IDX]], i32 0 -; VEC4_INTERL2-NEXT: [[BROADCAST_SPLAT5:%.*]] = shufflevector <4 x float> [[BROADCAST_SPLATINSERT4]], <4 x float> undef, <4 x i32> zeroinitializer -; VEC4_INTERL2-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x float> undef, float %fpinc, i32 0 -; VEC4_INTERL2-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT]], <4 x float> undef, <4 x i32> zeroinitializer -; VEC4_INTERL2-NEXT: [[TMP7:%.*]] = fmul fast <4 x float> [[DOTSPLAT]], -; VEC4_INTERL2-NEXT: [[INDUCTION6:%.*]] = fsub fast <4 x float> [[BROADCAST_SPLAT5]], [[TMP7]] -; VEC4_INTERL2-NEXT: [[TMP8:%.*]] = fmul fast <4 x float> [[DOTSPLAT]], -; VEC4_INTERL2-NEXT: [[INDUCTION9:%.*]] = fsub fast <4 x float> [[BROADCAST_SPLAT5]], [[TMP8]] +; VEC4_INTERL2-NEXT: [[VEC_IND:%.*]] = phi <4 x float> [ [[INDUCTION5]], %vector.ph ], [ [[VEC_IND_NEXT:%.*]], %vector.body ] +; VEC4_INTERL2-NEXT: [[STEP_ADD:%.*]] = fsub fast <4 x float> [[VEC_IND]], [[DOTSPLAT7]] ; VEC4_INTERL2-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, float* %A, i64 [[INDEX]] ; VEC4_INTERL2-NEXT: [[TMP10:%.*]] = bitcast float* [[TMP9]] to <4 x float>* -; VEC4_INTERL2-NEXT: store <4 x float> [[INDUCTION6]], <4 x float>* [[TMP10]], align 4 +; VEC4_INTERL2-NEXT: store <4 x float> [[VEC_IND]], <4 x float>* [[TMP10]], align 4 ; VEC4_INTERL2-NEXT: [[TMP11:%.*]] = getelementptr float, float* [[TMP9]], i64 4 ; VEC4_INTERL2-NEXT: [[TMP12:%.*]] = bitcast float* [[TMP11]] to <4 x float>* -; VEC4_INTERL2-NEXT: store <4 x float> [[INDUCTION9]], <4 x float>* [[TMP12]], align 4 +; VEC4_INTERL2-NEXT: store <4 x float> [[STEP_ADD]], <4 x float>* [[TMP12]], align 4 ; VEC4_INTERL2-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 8 +; VEC4_INTERL2-NEXT: [[VEC_IND_NEXT]] = fsub fast <4 x float> [[STEP_ADD]], [[DOTSPLAT7]] ; VEC4_INTERL2: br i1 {{.*}}, label %middle.block, label %vector.body ; VEC1_INTERL2-LABEL: @fp_iv_loop1( @@ -112,19 +115,18 @@ ; VEC4_INTERL1-LABEL: @fp_iv_loop2( ; VEC4_INTERL1: vector.ph: +; VEC4_INTERL1-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x float> undef, float %init, i32 0 +; VEC4_INTERL1-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT]], <4 x float> undef, <4 x i32> zeroinitializer +; VEC4_INTERL1-NEXT: [[INDUCTION2:%.*]] = fadd fast <4 x float> [[DOTSPLAT]], ; VEC4_INTERL1-NEXT: br label %vector.body ; VEC4_INTERL1: vector.body: ; VEC4_INTERL1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ] -; VEC4_INTERL1-NEXT: [[TMP5:%.*]] = sitofp i64 [[INDEX]] to float -; VEC4_INTERL1-NEXT: [[TMP6:%.*]] = fmul fast float [[TMP5]], 5.000000e-01 -; VEC4_INTERL1-NEXT: [[FP_OFFSET_IDX:%.*]] = fadd fast float [[TMP6]], %init -; VEC4_INTERL1-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <4 x float> undef, float [[FP_OFFSET_IDX]], i32 0 -; VEC4_INTERL1-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector <4 x float> [[BROADCAST_SPLATINSERT3]], <4 x float> undef, <4 x i32> zeroinitializer -; VEC4_INTERL1-NEXT: [[INDUCTION5:%.*]] = fadd fast <4 x float> [[BROADCAST_SPLAT4]], +; VEC4_INTERL1-NEXT: [[VEC_IND:%.*]] = phi <4 x float> [ [[INDUCTION2]], %vector.ph ], [ [[VEC_IND_NEXT:%.*]], %vector.body ] ; VEC4_INTERL1-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, float* %A, i64 [[INDEX]] ; VEC4_INTERL1-NEXT: [[TMP8:%.*]] = bitcast float* [[TMP7]] to <4 x float>* -; VEC4_INTERL1-NEXT: store <4 x float> [[INDUCTION5]], <4 x float>* [[TMP8]], align 4 +; VEC4_INTERL1-NEXT: store <4 x float> [[VEC_IND]], <4 x float>* [[TMP8]], align 4 ; VEC4_INTERL1-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 +; VEC4_INTERL1-NEXT: [[VEC_IND_NEXT]] = fadd fast <4 x float> [[VEC_IND]], ; VEC4_INTERL1: br i1 {{.*}}, label %middle.block, label %vector.body define void @fp_iv_loop2(float %init, float* noalias nocapture %A, i32 %N) #0 { @@ -170,30 +172,27 @@ ; VEC4_INTERL1: for.body.lr.ph: ; VEC4_INTERL1: [[TMP0:%.*]] = load float, float* @fp_inc, align 4 ; VEC4_INTERL1: vector.ph: -; VEC4_INTERL1-NEXT: [[BROADCAST_SPLATINSERT14:%.*]] = insertelement <4 x float> undef, float [[TMP0]], i32 0 -; VEC4_INTERL1-NEXT: [[BROADCAST_SPLAT15:%.*]] = shufflevector <4 x float> [[BROADCAST_SPLATINSERT14]], <4 x float> undef, <4 x i32> zeroinitializer -; VEC4_INTERL1-NEXT: br label %vector.body +; VEC4_INTERL1-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x float> undef, float %init, i32 0 +; VEC4_INTERL1-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT]], <4 x float> undef, <4 x i32> zeroinitializer +; VEC4_INTERL1-NEXT: [[DOTSPLATINSERT5:%.*]] = insertelement <4 x float> undef, float [[TMP0]], i32 0 +; VEC4_INTERL1-NEXT: [[DOTSPLAT6:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT5]], <4 x float> undef, <4 x i32> zeroinitializer +; VEC4_INTERL1-NEXT: [[TMP7:%.*]] = fmul fast <4 x float> [[DOTSPLAT6]], +; VEC4_INTERL1-NEXT: [[INDUCTION7:%.*]] = fadd fast <4 x float> [[DOTSPLAT]], [[TMP7]] +; VEC4_INTERL1-NEXT: [[TMP8:%.*]] = fmul fast float [[TMP0]], 4.000000e+00 +; VEC4_INTERL1-NEXT: [[DOTSPLATINSERT8:%.*]] = insertelement <4 x float> undef, float [[TMP8]], i32 0 +; VEC4_INTERL1-NEXT: [[DOTSPLAT9:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT8]], <4 x float> undef, <4 x i32> zeroinitializer +; VEC4_INTERL1-NEXT: [[BROADCAST_SPLATINSERT12:%.*]] = insertelement <4 x float> undef, float [[TMP0]], i32 0 +; VEC4_INTERL1-NEXT: [[BROADCAST_SPLAT13:%.*]] = shufflevector <4 x float> [[BROADCAST_SPLATINSERT12]], <4 x float> undef, <4 x i32> zeroinitializer +; VEC4_INTERL1-NEXT: br label [[VECTOR_BODY:%.*]] ; VEC4_INTERL1: vector.body: ; VEC4_INTERL1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ] -; VEC4_INTERL1-NEXT: [[TMP7:%.*]] = sitofp i64 [[INDEX]] to float -; VEC4_INTERL1-NEXT: [[TMP8:%.*]] = fmul fast float [[TMP7]], -5.000000e-01 -; VEC4_INTERL1-NEXT: [[FP_OFFSET_IDX:%.*]] = fadd fast float [[TMP8]], 0x3FB99999A0000000 -; VEC4_INTERL1-NEXT: [[BROADCAST_SPLATINSERT6:%.*]] = insertelement <4 x float> undef, float [[FP_OFFSET_IDX]], i32 0 -; VEC4_INTERL1-NEXT: [[BROADCAST_SPLAT7:%.*]] = shufflevector <4 x float> [[BROADCAST_SPLATINSERT6]], <4 x float> undef, <4 x i32> zeroinitializer -; VEC4_INTERL1-NEXT: [[TMP9:%.*]] = sitofp i64 [[INDEX]] to float -; VEC4_INTERL1-NEXT: [[TMP10:%.*]] = fmul fast float [[TMP0]], [[TMP9]] -; VEC4_INTERL1-NEXT: [[FP_OFFSET_IDX10:%.*]] = fadd fast float [[TMP10]], %init -; VEC4_INTERL1-NEXT: [[BROADCAST_SPLATINSERT11:%.*]] = insertelement <4 x float> undef, float [[FP_OFFSET_IDX10]], i32 0 -; VEC4_INTERL1-NEXT: [[BROADCAST_SPLAT12:%.*]] = shufflevector <4 x float> [[BROADCAST_SPLATINSERT11]], <4 x float> undef, <4 x i32> zeroinitializer -; VEC4_INTERL1-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x float> undef, float [[TMP0]], i32 0 -; VEC4_INTERL1-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT]], <4 x float> undef, <4 x i32> zeroinitializer -; VEC4_INTERL1-NEXT: [[TMP11:%.*]] = fmul fast <4 x float> [[DOTSPLAT]], -; VEC4_INTERL1-NEXT: [[INDUCTION13:%.*]] = fadd fast <4 x float> [[BROADCAST_SPLAT12]], [[TMP11]] +; VEC4_INTERL1-NEXT: [[VEC_IND:%.*]] = phi <4 x float> [ , %vector.ph ], [ [[VEC_IND_NEXT:%.*]], %vector.body ] +; VEC4_INTERL1-NEXT: [[VEC_IND10:%.*]] = phi <4 x float> [ [[INDUCTION7]], %vector.ph ], [ [[VEC_IND_NEXT11:%.*]], %vector.body ] ; VEC4_INTERL1-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, float* %A, i64 [[INDEX]] ; VEC4_INTERL1-NEXT: [[TMP13:%.*]] = bitcast float* [[TMP12]] to <4 x float>* -; VEC4_INTERL1-NEXT: store <4 x float> [[INDUCTION13]], <4 x float>* [[TMP13]], align 4 -; VEC4_INTERL1-NEXT: [[TMP14:%.*]] = fadd fast <4 x float> [[INDUCTION13]], [[BROADCAST_SPLAT15]] -; VEC4_INTERL1-NEXT: [[TMP15:%.*]] = fadd fast <4 x float> [[BROADCAST_SPLAT7]], +; VEC4_INTERL1-NEXT: store <4 x float> [[VEC_IND10]], <4 x float>* [[TMP13]], align 4 +; VEC4_INTERL1-NEXT: [[TMP14:%.*]] = fadd fast <4 x float> [[VEC_IND10]], [[BROADCAST_SPLAT13]] +; VEC4_INTERL1-NEXT: [[TMP15:%.*]] = fadd fast <4 x float> [[VEC_IND]], ; VEC4_INTERL1-NEXT: [[TMP16:%.*]] = fadd fast <4 x float> [[TMP15]], [[TMP14]] ; VEC4_INTERL1-NEXT: [[TMP17:%.*]] = getelementptr inbounds float, float* %B, i64 [[INDEX]] ; VEC4_INTERL1-NEXT: [[TMP18:%.*]] = bitcast float* [[TMP17]] to <4 x float>* @@ -202,6 +201,8 @@ ; VEC4_INTERL1-NEXT: [[TMP20:%.*]] = bitcast float* [[TMP19]] to <4 x float>* ; VEC4_INTERL1-NEXT: store <4 x float> [[TMP15]], <4 x float>* [[TMP20]], align 4 ; VEC4_INTERL1-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 +; VEC4_INTERL1-NEXT: [[VEC_IND_NEXT]] = fadd fast <4 x float> [[VEC_IND]], +; VEC4_INTERL1-NEXT: [[VEC_IND_NEXT11]] = fadd fast <4 x float> [[VEC_IND10]], [[DOTSPLAT9]] ; VEC4_INTERL1: br i1 {{.*}}, label %middle.block, label %vector.body define void @fp_iv_loop3(float %init, float* noalias nocapture %A, float* noalias nocapture %B, float* noalias nocapture %C, i32 %N) #1 { @@ -252,16 +253,12 @@ ; VEC4_INTERL1-NEXT: br label %vector.body ; VEC4_INTERL1: vector.body: ; VEC4_INTERL1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ] -; VEC4_INTERL1-NEXT: [[TMP5:%.*]] = sitofp i64 [[INDEX]] to float -; VEC4_INTERL1-NEXT: [[TMP6:%.*]] = fmul fast float [[TMP5]], 5.000000e-01 -; VEC4_INTERL1-NEXT: [[FP_OFFSET_IDX:%.*]] = fadd fast float [[TMP6]], 1.000000e+00 -; VEC4_INTERL1-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <4 x float> undef, float [[FP_OFFSET_IDX]], i32 0 -; VEC4_INTERL1-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector <4 x float> [[BROADCAST_SPLATINSERT3]], <4 x float> undef, <4 x i32> zeroinitializer -; VEC4_INTERL1-NEXT: [[INDUCTION5:%.*]] = fadd fast <4 x float> [[BROADCAST_SPLAT4]], +; VEC4_INTERL1-NEXT: [[VEC_IND:%.*]] = phi <4 x float> [ , %vector.ph ], [ [[VEC_IND_NEXT:%.*]], %vector.body ] ; VEC4_INTERL1-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, float* %A, i64 [[INDEX]] ; VEC4_INTERL1-NEXT: [[TMP8:%.*]] = bitcast float* [[TMP7]] to <4 x float>* -; VEC4_INTERL1-NEXT: store <4 x float> [[INDUCTION5]], <4 x float>* [[TMP8]], align 4 +; VEC4_INTERL1-NEXT: store <4 x float> [[VEC_IND]], <4 x float>* [[TMP8]], align 4 ; VEC4_INTERL1-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 +; VEC4_INTERL1-NEXT: [[VEC_IND_NEXT]] = fadd fast <4 x float> [[VEC_IND]], ; VEC4_INTERL1: br i1 {{.*}}, label %middle.block, label %vector.body define void @fp_iv_loop4(float* noalias nocapture %A, i32 %N) { @@ -294,9 +291,6 @@ ; VEC2_INTERL1_PRED_STORE: vector.body: ; VEC2_INTERL1_PRED_STORE-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE7:.*]] ], [ 0, %min.iters.checked ] ; VEC2_INTERL1_PRED_STORE-NEXT: [[TMP1:%.*]] = sitofp i64 [[INDEX]] to float -; VEC2_INTERL1_PRED_STORE-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <2 x float> undef, float [[TMP1]], i32 0 -; VEC2_INTERL1_PRED_STORE-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector <2 x float> [[BROADCAST_SPLATINSERT3]], <2 x float> undef, <2 x i32> zeroinitializer -; VEC2_INTERL1_PRED_STORE-NEXT: [[INDUCTION5:%.*]] = fadd fast <2 x float> [[BROADCAST_SPLAT4]], ; VEC2_INTERL1_PRED_STORE-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, float* %A, i64 [[INDEX]] ; VEC2_INTERL1_PRED_STORE-NEXT: [[TMP3:%.*]] = bitcast float* [[TMP2]] to <2 x float>* ; VEC2_INTERL1_PRED_STORE-NEXT: [[WIDE_LOAD:%.*]] = load <2 x float>, <2 x float>* [[TMP3]], align 4 @@ -304,15 +298,14 @@ ; VEC2_INTERL1_PRED_STORE-NEXT: [[TMP5:%.*]] = extractelement <2 x i1> [[TMP4]], i32 0 ; VEC2_INTERL1_PRED_STORE-NEXT: br i1 [[TMP5]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]] ; VEC2_INTERL1_PRED_STORE: [[PRED_STORE_IF]]: -; VEC2_INTERL1_PRED_STORE-NEXT: [[TMP6:%.*]] = extractelement <2 x float> [[INDUCTION5]], i32 0 ; VEC2_INTERL1_PRED_STORE-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, float* %A, i64 [[INDEX]] -; VEC2_INTERL1_PRED_STORE-NEXT: store float [[TMP6]], float* [[TMP7]], align 4 +; VEC2_INTERL1_PRED_STORE-NEXT: store float [[TMP1]], float* [[TMP7]], align 4 ; VEC2_INTERL1_PRED_STORE-NEXT: br label %[[PRED_STORE_CONTINUE]] ; VEC2_INTERL1_PRED_STORE: [[PRED_STORE_CONTINUE]]: ; VEC2_INTERL1_PRED_STORE-NEXT: [[TMP8:%.*]] = extractelement <2 x i1> [[TMP4]], i32 1 ; VEC2_INTERL1_PRED_STORE-NEXT: br i1 [[TMP8]], label %[[PRED_STORE_IF6:.*]], label %[[PRED_STORE_CONTINUE7]] ; VEC2_INTERL1_PRED_STORE: [[PRED_STORE_IF6]]: -; VEC2_INTERL1_PRED_STORE-NEXT: [[TMP9:%.*]] = extractelement <2 x float> [[INDUCTION5]], i32 1 +; VEC2_INTERL1_PRED_STORE-NEXT: [[TMP9:%.*]] = fadd fast float [[TMP1]], 1.000000e+00 ; VEC2_INTERL1_PRED_STORE-NEXT: [[TMP10:%.*]] = or i64 [[INDEX]], 1 ; VEC2_INTERL1_PRED_STORE-NEXT: [[TMP11:%.*]] = getelementptr inbounds float, float* %A, i64 [[TMP10]] ; VEC2_INTERL1_PRED_STORE-NEXT: store float [[TMP9]], float* [[TMP11]], align 4