Index: llvm/trunk/include/llvm/Transforms/Utils/LoopUtils.h =================================================================== --- llvm/trunk/include/llvm/Transforms/Utils/LoopUtils.h +++ llvm/trunk/include/llvm/Transforms/Utils/LoopUtils.h @@ -175,6 +175,13 @@ static bool isReductionPHI(PHINode *Phi, Loop *TheLoop, RecurrenceDescriptor &RedDes); + /// Returns true if Phi is a first-order recurrence. A first-order recurrence + /// is a non-reduction recurrence relation in which the value of the + /// recurrence in the current loop iteration equals a value defined in the + /// previous iteration. + static bool isFirstOrderRecurrence(PHINode *Phi, Loop *TheLoop, + DominatorTree *DT); + RecurrenceKind getRecurrenceKind() { return Kind; } MinMaxRecurrenceKind getMinMaxRecurrenceKind() { return MinMaxKind; } Index: llvm/trunk/lib/Transforms/Utils/LoopUtils.cpp =================================================================== --- llvm/trunk/lib/Transforms/Utils/LoopUtils.cpp +++ llvm/trunk/lib/Transforms/Utils/LoopUtils.cpp @@ -520,6 +520,43 @@ return false; } +bool RecurrenceDescriptor::isFirstOrderRecurrence(PHINode *Phi, Loop *TheLoop, + DominatorTree *DT) { + + // Ensure the phi node is in the loop header and has two incoming values. + if (Phi->getParent() != TheLoop->getHeader() || + Phi->getNumIncomingValues() != 2) + return false; + + // Ensure the loop has a preheader and a single latch block. The loop + // vectorizer will need the latch to set up the next iteration of the loop. + auto *Preheader = TheLoop->getLoopPreheader(); + auto *Latch = TheLoop->getLoopLatch(); + if (!Preheader || !Latch) + return false; + + // Ensure the phi node's incoming blocks are the loop preheader and latch. + if (Phi->getBasicBlockIndex(Preheader) < 0 || + Phi->getBasicBlockIndex(Latch) < 0) + return false; + + // Get the previous value. The previous value comes from the latch edge while + // the initial value comes form the preheader edge. + auto *Previous = dyn_cast(Phi->getIncomingValueForBlock(Latch)); + if (!Previous) + return false; + + // Ensure every user of the phi node is dominated by the previous value. The + // dominance requirement ensures the loop vectorizer will not need to + // vectorize the initial value prior to the first iteration of the loop. + for (User *U : Phi->users()) + if (auto *I = dyn_cast(U)) + if (!DT->dominates(Previous, I)) + return false; + + return true; +} + /// This function returns the identity element (or neutral element) for /// the operation K. Constant *RecurrenceDescriptor::getRecurrenceIdentity(RecurrenceKind K, Index: llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp =================================================================== --- llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp +++ llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -364,6 +364,10 @@ /// Copy and widen the instructions from the old loop. virtual void vectorizeLoop(); + /// Fix a first-order recurrence. This is the second phase of vectorizing + /// this phi node. + void fixFirstOrderRecurrence(PHINode *Phi); + /// \brief The Loop exit block may have single value PHI nodes where the /// incoming value is 'Undef'. While vectorizing we only handled real values /// that were defined inside the loop. Here we fix the 'undef case'. @@ -1201,6 +1205,10 @@ /// induction descriptor. typedef MapVector InductionList; + /// RecurrenceSet contains the phi nodes that are recurrences other than + /// inductions and reductions. + typedef SmallPtrSet RecurrenceSet; + /// Returns true if it is legal to vectorize this loop. /// This does not mean that it is profitable to vectorize this /// loop, only that it is legal to do so. @@ -1215,6 +1223,9 @@ /// Returns the induction variables found in the loop. InductionList *getInductionVars() { return &Inductions; } + /// Return the first-order recurrences found in the loop. + RecurrenceSet *getFirstOrderRecurrences() { return &FirstOrderRecurrences; } + /// Returns the widest induction type. Type *getWidestInductionType() { return WidestIndTy; } @@ -1224,6 +1235,9 @@ /// Returns True if PN is a reduction variable in this loop. bool isReductionVariable(PHINode *PN) { return Reductions.count(PN); } + /// Returns True if Phi is a first-order recurrence in this loop. + bool isFirstOrderRecurrence(const PHINode *Phi); + /// Return true if the block BB needs to be predicated in order for the loop /// to be vectorized. bool blockNeedsPredication(BasicBlock *BB); @@ -1384,6 +1398,8 @@ /// Notice that inductions don't need to start at zero and that induction /// variables can be pointers. InductionList Inductions; + /// Holds the phi nodes that are first-order recurrences. + RecurrenceSet FirstOrderRecurrences; /// Holds the widest induction type encountered. Type *WidestIndTy; @@ -3386,8 +3402,14 @@ for (PHINode *Phi : PHIsToFix) { assert(Phi && "Unable to recover vectorized PHI"); - // We currently only handle reductions. Ensure the PHI node to be fixed is - // a reduction, and get its reduction variable descriptor. + // Handle first-order recurrences that need to be fixed. + if (Legal->isFirstOrderRecurrence(Phi)) { + fixFirstOrderRecurrence(Phi); + continue; + } + + // If the phi node is not a first-order recurrence, it must be a reduction. + // Get it's reduction variable descriptor. assert(Legal->isReductionVariable(Phi) && "Unable to find the reduction variable"); RecurrenceDescriptor RdxDesc = (*Legal->getReductionVars())[Phi]; @@ -3613,6 +3635,158 @@ cse(LoopVectorBody); } +void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) { + + // This is the second phase of vectorizing first-order rececurrences. An + // overview of the transformation is described below. Suppose we have the + // following loop. + // + // for (int i = 0; i < n; ++i) + // b[i] = a[i] - a[i - 1]; + // + // There is a first-order recurrence on "a". For this loop, the shorthand + // scalar IR looks like: + // + // scalar.ph: + // s_init = a[-1] + // br scalar.body + // + // scalar.body: + // i = phi [0, scalar.ph], [i+1, scalar.body] + // s1 = phi [s_init, scalar.ph], [s2, scalar.body] + // s2 = a[i] + // b[i] = s2 - s1 + // br cond, scalar.body, ... + // + // In this example, s1 is a recurrence because it's value depends on the + // previous iteration. In the first phase of vectorization, we created a + // temporary value for s1. We now complete the vectorization and produce the + // shorthand vector IR shown below (for VF = 4, UF = 1). + // + // vector.ph: + // v_init = vector(..., ..., ..., a[-1]) + // br vector.body + // + // vector.body + // i = phi [0, vector.ph], [i+4, vector.body] + // v1 = phi [v_init, vector.ph], [v2, vector.body] + // v2 = a[i, i+1, i+2, i+3]; + // v3 = vector(v1(3), v2(0, 1, 2)) + // b[i, i+1, i+2, i+3] = v2 - v3 + // br cond, vector.body, middle.block + // + // middle.block: + // x = v2(3) + // br scalar.ph + // + // scalar.ph: + // s_init = phi [x, middle.block], [a[-1], otherwise] + // br scalar.body + // + // After execution completes the vector loop, we extract the next value of + // the recurrence (x) to use as the initial value in the scalar loop. + + // Get the original loop preheader and single loop latch. + auto *Preheader = OrigLoop->getLoopPreheader(); + auto *Latch = OrigLoop->getLoopLatch(); + + // Get the initial and previous values of the scalar recurrence. + auto *ScalarInit = Phi->getIncomingValueForBlock(Preheader); + auto *Previous = Phi->getIncomingValueForBlock(Latch); + + // Create a vector from the initial value. + auto *VectorInit = ScalarInit; + if (VF > 1) { + Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); + VectorInit = Builder.CreateInsertElement( + UndefValue::get(VectorType::get(VectorInit->getType(), VF)), VectorInit, + Builder.getInt32(VF - 1), "vector.recur.init"); + } + + // We constructed a temporary phi node in the first phase of vectorization. + // This phi node will eventually be deleted. + auto &PhiParts = getVectorValue(Phi); + Builder.SetInsertPoint(cast(PhiParts[0])); + + // Create a phi node for the new recurrence. The current value will either be + // the initial value inserted into a vector or loop-varying vector value. + auto *VecPhi = Builder.CreatePHI(VectorInit->getType(), 2, "vector.recur"); + VecPhi->addIncoming(VectorInit, LoopVectorPreHeader); + + // Get the vectorized previous value. We ensured the previous values was an + // instruction when detecting the recurrence. + auto &PreviousParts = getVectorValue(Previous); + + // Set the insertion point to be after this instruction. We ensured the + // previous value dominated all uses of the phi when detecting the + // recurrence. + Builder.SetInsertPoint( + &*++BasicBlock::iterator(cast(PreviousParts[UF - 1]))); + + // We will construct a vector for the recurrence by combining the values for + // the current and previous iterations. This is the required shuffle mask. + SmallVector ShuffleMask(VF); + ShuffleMask[0] = Builder.getInt32(VF - 1); + for (unsigned I = 1; I < VF; ++I) + ShuffleMask[I] = Builder.getInt32(I + VF - 1); + + // The vector from which to take the initial value for the current iteration + // (actual or unrolled). Initially, this is the vector phi node. + Value *Incoming = VecPhi; + + // Shuffle the current and previous vector and update the vector parts. + for (unsigned Part = 0; Part < UF; ++Part) { + auto *Shuffle = + VF > 1 + ? Builder.CreateShuffleVector(Incoming, PreviousParts[Part], + ConstantVector::get(ShuffleMask)) + : Incoming; + PhiParts[Part]->replaceAllUsesWith(Shuffle); + cast(PhiParts[Part])->eraseFromParent(); + PhiParts[Part] = Shuffle; + Incoming = PreviousParts[Part]; + } + + // Fix the latch value of the new recurrence in the vector loop. + VecPhi->addIncoming(Incoming, + LI->getLoopFor(LoopVectorBody[0])->getLoopLatch()); + + // Extract the last vector element in the middle block. This will be the + // initial value for the recurrence when jumping to the scalar loop. + auto *Extract = Incoming; + if (VF > 1) { + Builder.SetInsertPoint(LoopMiddleBlock->getTerminator()); + Extract = Builder.CreateExtractElement(Extract, Builder.getInt32(VF - 1), + "vector.recur.extract"); + } + + // Fix the initial value of the original recurrence in the scalar loop. + Builder.SetInsertPoint(&*LoopScalarPreHeader->begin()); + auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init"); + for (auto *BB : predecessors(LoopScalarPreHeader)) { + auto *Incoming = BB == LoopMiddleBlock ? Extract : ScalarInit; + Start->addIncoming(Incoming, BB); + } + + Phi->setIncomingValue(Phi->getBasicBlockIndex(LoopScalarPreHeader), Start); + Phi->setName("scalar.recur"); + + // Finally, fix users of the recurrence outside the loop. The users will need + // either the last value of the scalar recurrence or the last value of the + // vector recurrence we extracted in the middle block. Since the loop is in + // LCSSA form, we just need to find the phi node for the original scalar + // recurrence in the exit block, and then add an edge for the middle block. + for (auto &I : *LoopExitBlock) { + auto *LCSSAPhi = dyn_cast(&I); + if (!LCSSAPhi) + break; + if (LCSSAPhi->getIncomingValue(0) == Phi) { + LCSSAPhi->addIncoming(Extract, LoopMiddleBlock); + break; + } + } +} + void InnerLoopVectorizer::fixLCSSAPHIs() { for (BasicBlock::iterator LEI = LoopExitBlock->begin(), LEE = LoopExitBlock->end(); LEI != LEE; ++LEI) { @@ -3687,8 +3861,8 @@ Instruction *PN, InnerLoopVectorizer::VectorParts &Entry, unsigned UF, unsigned VF, PhiVector *PV) { PHINode* P = cast(PN); - // Handle reduction variables: - if (Legal->isReductionVariable(P)) { + // Handle recurrences. + if (Legal->isReductionVariable(P) || Legal->isFirstOrderRecurrence(P)) { for (unsigned part = 0; part < UF; ++part) { // This is phase one of vectorizing PHIs. Type *VecTy = (VF == 1) ? PN->getType() : @@ -4384,6 +4558,11 @@ continue; } + if (RecurrenceDescriptor::isFirstOrderRecurrence(Phi, TheLoop, DT)) { + FirstOrderRecurrences.insert(Phi); + continue; + } + emitAnalysis(VectorizationReport(&*it) << "value that could not be identified as " "reduction is used outside the loop"); @@ -4561,6 +4740,10 @@ return Inductions.count(PN); } +bool LoopVectorizationLegality::isFirstOrderRecurrence(const PHINode *Phi) { + return FirstOrderRecurrences.count(Phi); +} + bool LoopVectorizationLegality::blockNeedsPredication(BasicBlock *BB) { return LoopAccessInfo::blockNeedsPredication(BB, TheLoop, DT); } @@ -5450,9 +5633,17 @@ case Instruction::Br: { return TTI.getCFInstrCost(I->getOpcode()); } - case Instruction::PHI: - //TODO: IF-converted IFs become selects. + case Instruction::PHI: { + auto *Phi = cast(I); + + // First-order recurrences are replaced by vector shuffles inside the loop. + if (VF > 1 && Legal->isFirstOrderRecurrence(Phi)) + return TTI.getShuffleCost(TargetTransformInfo::SK_ExtractSubvector, + VectorTy, VF - 1, VectorTy); + + // TODO: IF-converted IFs become selects. return 0; + } case Instruction::Add: case Instruction::FAdd: case Instruction::Sub: Index: llvm/trunk/test/Transforms/LoopVectorize/AArch64/first-order-recurrence.ll =================================================================== --- llvm/trunk/test/Transforms/LoopVectorize/AArch64/first-order-recurrence.ll +++ llvm/trunk/test/Transforms/LoopVectorize/AArch64/first-order-recurrence.ll @@ -0,0 +1,209 @@ +; RUN: opt < %s -loop-vectorize -force-vector-width=4 -force-vector-interleave=1 -dce -instcombine -S | FileCheck %s +; RUN: opt < %s -loop-vectorize -force-vector-width=4 -force-vector-interleave=2 -dce -instcombine -S | FileCheck %s --check-prefix=UNROLL + +target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128" + +; CHECK-LABEL: @recurrence_1 +; +; void recurrence_1(int *a, int *b, int n) { +; for(int i = 0; i < n; i++) +; b[i] = a[i] + a[i - 1] +; } +; +; CHECK: vector.ph: +; CHECK: %vector.recur.init = insertelement <4 x i32> undef, i32 %pre_load, i32 3 +; +; CHECK: vector.body: +; CHECK: %vector.recur = phi <4 x i32> [ %vector.recur.init, %vector.ph ], [ [[L1:%[a-zA-Z0-9.]+]], %vector.body ] +; CHECK: [[L1]] = load <4 x i32> +; CHECK: {{.*}} = shufflevector <4 x i32> %vector.recur, <4 x i32> [[L1]], <4 x i32> +; +; CHECK: middle.block: +; CHECK: %vector.recur.extract = extractelement <4 x i32> [[L1]], i32 3 +; +; CHECK: scalar.ph: +; CHECK: %scalar.recur.init = phi i32 [ %vector.recur.extract, %middle.block ], [ %pre_load, %vector.memcheck ], [ %pre_load, %min.iters.checked ], [ %pre_load, %for.preheader ] +; +; CHECK: scalar.body: +; CHECK: %scalar.recur = phi i32 [ %scalar.recur.init, %scalar.ph ], [ {{.*}}, %scalar.body ] +; +; UNROLL: vector.body: +; UNROLL: %vector.recur = phi <4 x i32> [ %vector.recur.init, %vector.ph ], [ [[L2:%[a-zA-Z0-9.]+]], %vector.body ] +; UNROLL: [[L1:%[a-zA-Z0-9.]+]] = load <4 x i32> +; UNROLL: [[L2]] = load <4 x i32> +; UNROLL: {{.*}} = shufflevector <4 x i32> %vector.recur, <4 x i32> [[L1]], <4 x i32> +; UNROLL: {{.*}} = shufflevector <4 x i32> [[L1]], <4 x i32> [[L2]], <4 x i32> +; +; UNROLL: middle.block: +; UNROLL: %vector.recur.extract = extractelement <4 x i32> [[L2]], i32 3 +; +define void @recurrence_1(i32* nocapture readonly %a, i32* nocapture %b, i32 %n) { +entry: + br label %for.preheader + +for.preheader: + %arrayidx.phi.trans.insert = getelementptr inbounds i32, i32* %a, i64 0 + %pre_load = load i32, i32* %arrayidx.phi.trans.insert + br label %scalar.body + +scalar.body: + %0 = phi i32 [ %pre_load, %for.preheader ], [ %1, %scalar.body ] + %indvars.iv = phi i64 [ 0, %for.preheader ], [ %indvars.iv.next, %scalar.body ] + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %arrayidx32 = getelementptr inbounds i32, i32* %a, i64 %indvars.iv.next + %1 = load i32, i32* %arrayidx32 + %arrayidx34 = getelementptr inbounds i32, i32* %b, i64 %indvars.iv + %add35 = add i32 %1, %0 + store i32 %add35, i32* %arrayidx34 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, %n + br i1 %exitcond, label %for.exit, label %scalar.body + +for.exit: + ret void +} + +; CHECK-LABEL: @recurrence_2 +; +; int recurrence_2(int *a, int n) { +; int minmax; +; for (int i = 0; i < n; ++i) +; minmax = min(minmax, max(a[i] - a[i-1], 0)); +; return minmax; +; } +; +; CHECK: vector.ph: +; CHECK: %vector.recur.init = insertelement <4 x i32> undef, i32 %.pre, i32 3 +; +; CHECK: vector.body: +; CHECK: %vector.recur = phi <4 x i32> [ %vector.recur.init, %vector.ph ], [ [[L1:%[a-zA-Z0-9.]+]], %vector.body ] +; CHECK: [[L1]] = load <4 x i32> +; CHECK: {{.*}} = shufflevector <4 x i32> %vector.recur, <4 x i32> [[L1]], <4 x i32> +; +; CHECK: middle.block: +; CHECK: %vector.recur.extract = extractelement <4 x i32> [[L1]], i32 3 +; +; CHECK: scalar.ph: +; CHECK: %scalar.recur.init = phi i32 [ %vector.recur.extract, %middle.block ], [ %.pre, %min.iters.checked ], [ %.pre, %for.preheader ] +; +; CHECK: scalar.body: +; CHECK: %scalar.recur = phi i32 [ %scalar.recur.init, %scalar.ph ], [ {{.*}}, %scalar.body ] +; +; UNROLL: vector.body: +; UNROLL: %vector.recur = phi <4 x i32> [ %vector.recur.init, %vector.ph ], [ [[L2:%[a-zA-Z0-9.]+]], %vector.body ] +; UNROLL: [[L1:%[a-zA-Z0-9.]+]] = load <4 x i32> +; UNROLL: [[L2]] = load <4 x i32> +; UNROLL: {{.*}} = shufflevector <4 x i32> %vector.recur, <4 x i32> [[L1]], <4 x i32> +; UNROLL: {{.*}} = shufflevector <4 x i32> [[L1]], <4 x i32> [[L2]], <4 x i32> +; +; UNROLL: middle.block: +; UNROLL: %vector.recur.extract = extractelement <4 x i32> [[L2]], i32 3 +; +define i32 @recurrence_2(i32* nocapture readonly %a, i32 %n) { +entry: + %cmp27 = icmp sgt i32 %n, 0 + br i1 %cmp27, label %for.preheader, label %for.cond.cleanup + +for.preheader: + %arrayidx2.phi.trans.insert = getelementptr inbounds i32, i32* %a, i64 -1 + %.pre = load i32, i32* %arrayidx2.phi.trans.insert, align 4 + br label %scalar.body + +for.cond.cleanup.loopexit: + %minmax.0.cond.lcssa = phi i32 [ %minmax.0.cond, %scalar.body ] + br label %for.cond.cleanup + +for.cond.cleanup: + %minmax.0.lcssa = phi i32 [ undef, %entry ], [ %minmax.0.cond.lcssa, %for.cond.cleanup.loopexit ] + ret i32 %minmax.0.lcssa + +scalar.body: + %0 = phi i32 [ %.pre, %for.preheader ], [ %1, %scalar.body ] + %indvars.iv = phi i64 [ 0, %for.preheader ], [ %indvars.iv.next, %scalar.body ] + %minmax.028 = phi i32 [ undef, %for.preheader ], [ %minmax.0.cond, %scalar.body ] + %arrayidx = getelementptr inbounds i32, i32* %a, i64 %indvars.iv + %1 = load i32, i32* %arrayidx, align 4 + %sub3 = sub nsw i32 %1, %0 + %cmp4 = icmp sgt i32 %sub3, 0 + %cond = select i1 %cmp4, i32 %sub3, i32 0 + %cmp5 = icmp slt i32 %minmax.028, %cond + %minmax.0.cond = select i1 %cmp5, i32 %minmax.028, i32 %cond + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, %n + br i1 %exitcond, label %for.cond.cleanup.loopexit, label %scalar.body +} + +; CHECK-LABEL: @recurrence_3 +; +; void recurrence_3(short *a, double *b, int n, float f, short p) { +; b[0] = (double)a[0] - f * (double)p; +; for (int i = 1; i < n; i++) +; b[i] = (double)a[i] - f * (double)a[i - 1]; +; } +; +; +; CHECK: vector.ph: +; CHECK: %vector.recur.init = insertelement <4 x i16> undef, i16 %0, i32 3 +; +; CHECK: vector.body: +; CHECK: %vector.recur = phi <4 x i16> [ %vector.recur.init, %vector.ph ], [ [[L1:%[a-zA-Z0-9.]+]], %vector.body ] +; CHECK: [[L1]] = load <4 x i16> +; CHECK: {{.*}} = shufflevector <4 x i16> %vector.recur, <4 x i16> [[L1]], <4 x i32> +; +; CHECK: middle.block: +; CHECK: %vector.recur.extract = extractelement <4 x i16> [[L1]], i32 3 +; +; CHECK: scalar.ph: +; CHECK: %scalar.recur.init = phi i16 [ %vector.recur.extract, %middle.block ], [ %0, %vector.memcheck ], [ %0, %min.iters.checked ], [ %0, %for.preheader ] +; +; CHECK: scalar.body: +; CHECK: %scalar.recur = phi i16 [ %scalar.recur.init, %scalar.ph ], [ {{.*}}, %scalar.body ] +; +; UNROLL: vector.body: +; UNROLL: %vector.recur = phi <4 x i16> [ %vector.recur.init, %vector.ph ], [ [[L2:%[a-zA-Z0-9.]+]], %vector.body ] +; UNROLL: [[L1:%[a-zA-Z0-9.]+]] = load <4 x i16> +; UNROLL: [[L2]] = load <4 x i16> +; UNROLL: {{.*}} = shufflevector <4 x i16> %vector.recur, <4 x i16> [[L1]], <4 x i32> +; UNROLL: {{.*}} = shufflevector <4 x i16> [[L1]], <4 x i16> [[L2]], <4 x i32> +; +; UNROLL: middle.block: +; UNROLL: %vector.recur.extract = extractelement <4 x i16> [[L2]], i32 3 +; +define void @recurrence_3(i16* nocapture readonly %a, double* nocapture %b, i32 %n, float %f, i16 %p) { +entry: + %0 = load i16, i16* %a, align 2 + %conv = sitofp i16 %0 to double + %conv1 = fpext float %f to double + %conv2 = sitofp i16 %p to double + %mul = fmul fast double %conv2, %conv1 + %sub = fsub fast double %conv, %mul + store double %sub, double* %b, align 8 + %cmp25 = icmp sgt i32 %n, 1 + br i1 %cmp25, label %for.preheader, label %for.end + +for.preheader: + br label %scalar.body + +scalar.body: + %1 = phi i16 [ %0, %for.preheader ], [ %2, %scalar.body ] + %advars.iv = phi i64 [ %advars.iv.next, %scalar.body ], [ 1, %for.preheader ] + %arrayidx5 = getelementptr inbounds i16, i16* %a, i64 %advars.iv + %2 = load i16, i16* %arrayidx5, align 2 + %conv6 = sitofp i16 %2 to double + %conv11 = sitofp i16 %1 to double + %mul12 = fmul fast double %conv11, %conv1 + %sub13 = fsub fast double %conv6, %mul12 + %arrayidx15 = getelementptr inbounds double, double* %b, i64 %advars.iv + store double %sub13, double* %arrayidx15, align 8 + %advars.iv.next = add nuw nsw i64 %advars.iv, 1 + %lftr.wideiv = trunc i64 %advars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, %n + br i1 %exitcond, label %for.end.loopexit, label %scalar.body + +for.end.loopexit: + br label %for.end + +for.end: + ret void +}