Index: llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
===================================================================
--- llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
+++ llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
@@ -196,10 +196,6 @@
   Instruction *getExactFPInst() { return ExactFPMathInst; }

-  bool canVectorizeFPMath(const LoopVectorizeHints &Hints,
-                          bool HintsAllowReordering) const {
-    return !ExactFPMathInst || Hints.allowReordering(HintsAllowReordering);
-  }
-
   unsigned getNumRuntimePointerChecks() const {
     return NumRuntimePointerChecks;
   }
@@ -257,6 +253,12 @@
   /// If false, good old LV code.
   bool canVectorize(bool UseVPlanNativePath);

+  /// Returns true if it is legal to vectorize the FP math operations in this
+  /// loop. Vectorizing is legal if we allow reordering of FP operations, or if
+  /// we can use in-order reductions.
+  bool canVectorizeFPMath(bool EnableStrictReductions,
+                          bool HintsAllowReordering);
+
   /// Return true if we can vectorize this loop while folding its tail by
   /// masking, and mark all respective loads/stores for masking.
   /// This object's state is only modified iff this function returns true.
Index: llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
===================================================================
--- llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
+++ llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
@@ -857,6 +857,35 @@
   return true;
 }

+bool LoopVectorizationLegality::canVectorizeFPMath(
+    bool EnableStrictReductions, bool HintsAllowReordering) {
+
+  // First check whether there is any exact FP math, or whether we are
+  // allowed to reassociate FP operations.
+  if (!Requirements->getExactFPInst() ||
+      Hints->allowReordering(HintsAllowReordering))
+    return true;
+
+  if (!EnableStrictReductions)
+    return false;
+
+  // At this point the loop has exact FP math and we may not reorder it.
+  // First check whether there are any exact FP induction variables, which
+  // we cannot vectorize.
+  if (any_of(getInductionVars(), [&](auto &Induction) -> bool {
+        InductionDescriptor IndDesc = Induction.second;
+        return IndDesc.getExactFPMathInst();
+      }))
+    return false;
+
+  // We can now only vectorize if all reductions with exact FP math also
+  // have the isOrdered flag set, which indicates that we can move the
+  // reduction operations in-loop.
+  return (all_of(getReductionVars(), [&](auto &Reduction) -> bool {
+    RecurrenceDescriptor RdxDesc = Reduction.second;
+    return !RdxDesc.hasExactFPMath() || RdxDesc.isOrdered();
+  }));
+}
+
 bool LoopVectorizationLegality::isInductionPhi(const Value *V) {
   Value *In0 = const_cast<Value *>(V);
   PHINode *PN = dyn_cast_or_null<PHINode>(In0);
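For readers reviewing the legality change: the whole question exists because floating-point addition is not associative, so a reordered (wide) reduction can produce a different result than the scalar loop. A minimal standalone C++ illustration of the effect (illustrative only, not part of the patch):

    #include <cstdio>

    int main() {
      // In-order sum, as the scalar loop computes it: each 1.0 is absorbed,
      // because 1e16 + 1.0 rounds back to 1e16 in double precision.
      double InOrder = 1e16;
      for (int I = 0; I < 8; ++I)
        InOrder += 1.0;

      // Reassociated sum, as a reordered reduction might compute it: the
      // small terms are combined first, so they survive the final add.
      double Partial = 0.0;
      for (int I = 0; I < 8; ++I)
        Partial += 1.0;
      double Reassoc = 1e16 + Partial;

      std::printf("in-order:     %.1f\n", InOrder); // 10000000000000000.0
      std::printf("reassociated: %.1f\n", Reassoc); // 10000000000000008.0
      return 0;
    }

The new predicate therefore only admits such loops when the hints permit reordering, or when every exact-FP reduction can be lowered in-order.
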
Index: llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -9938,7 +9938,7 @@
     return false;
   }

-  if (!Requirements.canVectorizeFPMath(Hints, HintsAllowReordering)) {
+  if (!LVL.canVectorizeFPMath(EnableStrictReductions, HintsAllowReordering)) {
     ORE->emit([&]() {
       auto *ExactFPMathInst = Requirements.getExactFPInst();
       return OptimizationRemarkAnalysisFPCommute(DEBUG_TYPE, "CantReorderFPOps",
Index: llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll
===================================================================
--- llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll
+++ llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll
@@ -1,6 +1,7 @@
-; RUN: opt < %s -loop-vectorize -mtriple aarch64-unknown-linux-gnu -mattr=+sve -enable-strict-reductions -S | FileCheck %s -check-prefix=CHECK
-; RUN: opt < %s -loop-vectorize -mtriple aarch64-unknown-linux-gnu -mattr=+sve -enable-strict-reductions -hints-allow-reordering=false -S | FileCheck %s -check-prefix=CHECK-NO-REORDER
+; RUN: opt < %s -loop-vectorize -mtriple aarch64-unknown-linux-gnu -mattr=+sve -enable-strict-reductions -hints-allow-reordering=false -pass-remarks=loop-vectorize -pass-remarks-analysis=loop-vectorize -S 2>%t | FileCheck %s
+; RUN: cat %t | FileCheck %s -check-prefix=CHECK-REMARKS

+; CHECK-REMARKS: vectorized loop (vectorization width: 8, interleaved count: 1)
 define float @fadd_strict(float* noalias nocapture readonly %a, i64 %n) {
 ; CHECK-LABEL: @fadd_strict
 ; CHECK: vector.body:
@@ -10,9 +11,6 @@
 ; CHECK: for.end
 ; CHECK: %[[PHI:.*]] = phi float [ %[[SCALAR:.*]], %for.body ], [ %[[RDX]], %middle.block ]
 ; CHECK: ret float %[[PHI]]
-
-; CHECK-NO-REORDER-LABEL: @fadd_strict
-; CHECK-NO-REORDER-NOT: vector.body
 entry:
   br label %for.body

@@ -30,6 +28,7 @@
   ret float %add
 }

+; CHECK-REMARKS: vectorized loop (vectorization width: 8, interleaved count: 4)
 define float @fadd_strict_unroll(float* noalias nocapture readonly %a, i64 %n) {
 ; CHECK-LABEL: @fadd_strict_unroll
 ; CHECK: vector.body:
@@ -46,9 +45,6 @@
 ; CHECK: for.end
 ; CHECK: %[[PHI:.*]] = phi float [ %[[SCALAR:.*]], %for.body ], [ %[[RDX4]], %middle.block ]
 ; CHECK: ret float %[[PHI]]
-
-; CHECK-NO-REORDER-LABEL: @fadd_strict_unroll
-; CHECK-NO-REORDER-NOT: vector.body
 entry:
   br label %for.body

@@ -66,6 +62,7 @@
   ret float %add
 }

+; CHECK-REMARKS: vectorized loop (vectorization width: 4, interleaved count: 2)
 define void @fadd_strict_interleave(float* noalias nocapture readonly %a, float* noalias nocapture readonly %b, i64 %n) {
 ; CHECK-LABEL: @fadd_strict_interleave
 ; CHECK: entry
@@ -90,9 +87,6 @@
 ; CHECK: %[[RDX2]] = call float @llvm.vector.reduce.fadd.nxv4f32(float %[[VEC_PHI2]], <vscale x 4 x float> %[[MGATHER2]])
 ; CHECK: for.end
 ; CHECK: ret void
-
-; CHECK-NO-REORDER-LABEL: @fadd_strict_interleave
-; CHECK-NO-REORDER-NOT: vector.body
 entry:
   %arrayidxa = getelementptr inbounds float, float* %a, i64 1
   %a1 = load float, float* %a, align 4
   %a2 = load float, float* %arrayidxa, align 4
   br label %for.body

@@ -120,6 +114,7 @@
   ret void
 }

+; CHECK-REMARKS: vectorized loop (vectorization width: 4, interleaved count: 1)
 define float @fadd_invariant(float* noalias nocapture readonly %a, float* noalias nocapture readonly %b, i64 %n) {
 ; CHECK-LABEL: @fadd_invariant
 ; CHECK: vector.body
@@ -133,9 +128,6 @@
 ; CHECK: for.end
 ; CHECK: %[[PHI:.*]] = phi float [ 0.000000e+00, %entry ], [ %[[EXIT_PHI]], %for.end.loopexit ]
 ; CHECK: ret float %[[PHI]]
-
-; CHECK-NO-REORDER-LABEL: @fadd_invariant
-; CHECK-NO-REORDER-NOT: vector.body
 entry:
   %arrayidx = getelementptr inbounds float, float* %a, i64 1
   %0 = load float, float* %arrayidx, align 4
@@ -160,6 +152,7 @@
   ret float %res
 }

+; CHECK-REMARKS: vectorized loop (vectorization width: 4, interleaved count: 1)
 define float @fadd_conditional(float* noalias nocapture readonly %a, float* noalias nocapture readonly %b, i64 %n) {
 ; CHECK-LABEL: @fadd_conditional
 ; CHECK: vector.body
@@ -182,9 +175,6 @@
 ; CHECK: for.end
 ; CHECK: %[[RDX_PHI:.*]] = phi float [ %[[FADD]], %for.inc ], [ %[[RDX]], %middle.block ]
 ; CHECK: ret float %[[RDX_PHI]]
-
-; CHECK-NO-REORDER-LABEL: @fadd_conditional
-; CHECK-NO-REORDER-NOT: vector.body
 entry:
   br label %for.body

@@ -214,30 +204,10 @@
 }

 ; Negative test - loop contains multiple fadds which we cannot safely reorder
-; Note: This test vectorizes the loop with a non-strict implementation, which reorders the FAdd operations.
-; This is happening because we are using hints, where allowReordering returns true.
+; CHECK-REMARKS: loop not vectorized: cannot prove it is safe to reorder floating-point operations
 define float @fadd_multiple(float* noalias nocapture %a, float* noalias nocapture %b, i64 %n) {
 ; CHECK-LABEL: @fadd_multiple
-; CHECK: vector.body
-; CHECK: %[[PHI:.*]] = phi <vscale x 8 x float> [ insertelement (<vscale x 8 x float> shufflevector (<vscale x 8 x float> insertelement (<vscale x 8 x float> undef, float -0.000000e+00, i32 0), <vscale x 8 x float> undef, <vscale x 8 x i32> zeroinitializer), float -0.000000e+00, i32 0), %vector.ph ], [ %[[VEC_FADD2:.*]], %vector.body ]
-; CHECK: %[[VEC_LOAD1:.*]] = load <vscale x 8 x float>,
-; CHECK: %[[VEC_FADD1:.*]] = fadd <vscale x 8 x float> %[[PHI]], %[[VEC_LOAD1]]
-; CHECK: %[[VEC_LOAD2:.*]] = load <vscale x 8 x float>,
-; CHECK: %[[VEC_FADD2]] = fadd <vscale x 8 x float> %[[VEC_FADD1]], %[[VEC_LOAD2]]
-; CHECK: middle.block
-; CHECK: %[[RDX:.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float -0.000000e+00, <vscale x 8 x float> %[[VEC_FADD2]])
-; CHECK: for.body
-; CHECK: %[[SUM:.*]] = phi float [ %bc.merge.rdx, %scalar.ph ], [ %[[FADD2:.*]], %for.body ]
-; CHECK: %[[LOAD1:.*]] = load float, float*
-; CHECK: %[[FADD1:.*]] = fadd float %[[SUM]], %[[LOAD1]]
-; CHECK: %[[LOAD2:.*]] = load float, float*
-; CHECK: %[[FADD2]] = fadd float %[[FADD1]], %[[LOAD2]]
-; CHECK: for.end
-; CHECK: %[[RET:.*]] = phi float [ %[[FADD2]], %for.body ], [ %[[RDX]], %middle.block ]
-; CHECK: ret float %[[RET]]
-
-; CHECK-NO-REORDER-LABEL: @fadd_multiple
-; CHECK-NO-REORDER-NOT: vector.body
+; CHECK-NOT: vector.body
 entry:
   br label %for.body
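For intuition about what the @llvm.vector.reduce.fadd calls in the checks above compute: per the LangRef, when the call carries no reassociation flags the reduction is a sequential, left-to-right fold of the lanes starting from the scalar start value, which is what makes the in-loop lowering exact. A rough C++ model of that semantics (illustrative only; the names are invented):

    #include <cstddef>

    // Model of the ordered semantics of llvm.vector.reduce.fadd(start, v)
    // when the call has no reassociation flags: a strict left-to-right
    // fold of the lanes onto the running scalar sum.
    float OrderedReduceFAdd(float Start, const float *Lanes, size_t VF) {
      float Acc = Start;
      for (size_t I = 0; I < VF; ++I)
        Acc += Lanes[I]; // same association order as the scalar loop
      return Acc;
    }

    // In-loop reduction as the vectorized body uses it: one ordered fold
    // per vector iteration; remainder iterations run in a scalar epilogue.
    float StrictSum(const float *A, size_t N, size_t VF) {
      float Sum = 0.0f;
      size_t I = 0;
      for (; I + VF <= N; I += VF)
        Sum = OrderedReduceFAdd(Sum, &A[I], VF);
      for (; I < N; ++I) // scalar epilogue
        Sum += A[I];
      return Sum;
    }
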
Index: llvm/test/Transforms/LoopVectorize/AArch64/strict-fadd.ll
===================================================================
--- llvm/test/Transforms/LoopVectorize/AArch64/strict-fadd.ll
+++ llvm/test/Transforms/LoopVectorize/AArch64/strict-fadd.ll
@@ -1,6 +1,7 @@
-; RUN: opt < %s -loop-vectorize -mtriple aarch64-unknown-linux-gnu -enable-strict-reductions -S | FileCheck %s -check-prefix=CHECK
-; RUN: opt < %s -loop-vectorize -mtriple aarch64-unknown-linux-gnu -enable-strict-reductions -hints-allow-reordering=false -S | FileCheck %s -check-prefix=CHECK-NO-REORDER
+; RUN: opt < %s -loop-vectorize -mtriple aarch64-unknown-linux-gnu -enable-strict-reductions -hints-allow-reordering=false -pass-remarks=loop-vectorize -pass-remarks-analysis=loop-vectorize -S 2>%t | FileCheck %s
+; RUN: cat %t | FileCheck %s -check-prefix=CHECK-REMARKS

+; CHECK-REMARKS: vectorized loop (vectorization width: 8, interleaved count: 1)
 define float @fadd_strict(float* noalias nocapture readonly %a, i64 %n) {
 ; CHECK-LABEL: @fadd_strict
 ; CHECK: vector.body:
@@ -10,9 +11,6 @@
 ; CHECK: for.end
 ; CHECK: %[[PHI:.*]] = phi float [ %[[SCALAR:.*]], %for.body ], [ %[[RDX]], %middle.block ]
 ; CHECK: ret float %[[PHI]]
-
-; CHECK-NO-REORDER-LABEL: @fadd_strict
-; CHECK-NO-REORDER-NOT: vector.body
 entry:
   br label %for.body

@@ -30,6 +28,7 @@
   ret float %add
 }

+; CHECK-REMARKS: vectorized loop (vectorization width: 8, interleaved count: 4)
 define float @fadd_strict_unroll(float* noalias nocapture readonly %a, i64 %n) {
 ; CHECK-LABEL: @fadd_strict_unroll
 ; CHECK: vector.body:
@@ -46,9 +45,6 @@
 ; CHECK: for.end
 ; CHECK: %[[PHI:.*]] = phi float [ %[[SCALAR:.*]], %for.body ], [ %[[RDX4]], %middle.block ]
 ; CHECK: ret float %[[PHI]]
-
-; CHECK-NO-REORDER-LABEL: @fadd_strict_unroll
-; CHECK-NO-REORDER-NOT: vector.body
 entry:
   br label %for.body

@@ -74,6 +70,7 @@
 ; }
 ; return sum;

+; CHECK-REMARKS: vectorized loop (vectorization width: 8, interleaved count: 4)
 define float @fadd_strict_unroll_last_val(float* noalias nocapture readonly %a, float* noalias nocapture readonly %b, i64 %n) {
 ; CHECK-LABEL: @fadd_strict_unroll_last_val
 ; CHECK: vector.body
@@ -98,9 +95,6 @@
 ; CHECK: for.end
 ; CHECK: %[[SUM_LCSSA:.*]] = phi float [ %[[FADD_LCSSA]], %for.cond.cleanup ], [ 0.000000e+00, %entry ]
 ; CHECK: ret float %[[SUM_LCSSA]]
-
-; CHECK-NO-REORDER-LABEL: @fadd_strict_unroll_last_val
-; CHECK-NO-REORDER-NOT: vector.body
 entry:
   %cmp = icmp sgt i64 %n, 0
   br i1 %cmp, label %for.body, label %for.end

@@ -126,6 +120,7 @@
   ret float %sum.lcssa
 }

+; CHECK-REMARKS: vectorized loop (vectorization width: 4, interleaved count: 1)
 define void @fadd_strict_interleave(float* noalias nocapture readonly %a, float* noalias nocapture readonly %b, i64 %n) {
 ; CHECK-LABEL: @fadd_strict_interleave
 ; CHECK: entry
@@ -142,9 +137,6 @@
 ; CHECK: %[[RDX2]] = call float @llvm.vector.reduce.fadd.v4f32(float %[[VEC_PHI1]], <4 x float> %[[STRIDED2]])
 ; CHECK: for.end
 ; CHECK: ret void
-
-; CHECK-NO-REORDER-LABEL: @fadd_strict_interleave
-; CHECK-NO-REORDER-NOT: vector.body
 entry:
   %arrayidxa = getelementptr inbounds float, float* %a, i64 1
   %a1 = load float, float* %a, align 4
   %a2 = load float, float* %arrayidxa, align 4
   br label %for.body

@@ -172,6 +164,7 @@
   ret void
 }

+; CHECK-REMARKS: vectorized loop (vectorization width: 4, interleaved count: 1)
 define float @fadd_invariant(float* noalias nocapture readonly %a, float* noalias nocapture readonly %b, i64 %n) {
 ; CHECK-LABEL: @fadd_invariant
 ; CHECK: vector.body
@@ -185,9 +178,6 @@
 ; CHECK: for.end
 ; CHECK: %[[PHI:.*]] = phi float [ 0.000000e+00, %entry ], [ %[[EXIT_PHI]], %for.end.loopexit ]
 ; CHECK: ret float %[[PHI]]
-
-; CHECK-NO-REORDER-LABEL: @fadd_invariant
-; CHECK-NO-REORDER-NOT: vector.body
 entry:
   %arrayidx = getelementptr inbounds float, float* %a, i64 1
   %0 = load float, float* %arrayidx, align 4
@@ -212,6 +202,7 @@
   ret float %res
 }

+; CHECK-REMARKS: vectorized loop (vectorization width: 4, interleaved count: 1)
 define float @fadd_conditional(float* noalias nocapture readonly %a, float* noalias nocapture readonly %b, i64 %n) {
 ; CHECK-LABEL: @fadd_conditional
 ; CHECK: vector.body:
@@ -239,9 +230,6 @@
 ; CHECK: for.end
 ; CHECK: %[[RDX_PHI:.*]] = phi float [ %[[FADD]], %for.inc ], [ %[[RDX]], %middle.block ]
 ; CHECK: ret float %[[RDX_PHI]]
-
-; CHECK-NO-REORDER-LABEL: @fadd_conditional
-; CHECK-NO-REORDER-NOT: vector.body
 entry:
   br label %for.body

@@ -271,6 +259,7 @@
 }

"llvm.loop.vectorize.predicate.enable" attribute +; CHECK-REMARKS: vectorized loop (vectorization width: 2, interleaved count: 1) define float @fadd_predicated(float* noalias nocapture %a, i64 %n) { ; CHECK-LABEL: @fadd_predicated ; CHECK: vector.ph @@ -286,9 +275,6 @@ ; CHECK: for.end: ; CHECK: %[[RES_PHI:.*]] = phi float [ %[[FADD:.*]], %for.body ], [ %[[RDX]], %middle.block ] ; CHECK: ret float %[[RES_PHI]] - -; CHECK-NO-REORDER-LABEL: @fadd_predicated -; CHECK-NO-REORDER-NOT: vector.body entry: br label %for.body @@ -308,28 +294,10 @@ } ; Negative test - loop contains multiple fadds which we cannot safely reorder +; CHECK-REMARKS: loop not vectorized: cannot prove it is safe to reorder floating-point operations define float @fadd_multiple(float* noalias nocapture %a, float* noalias nocapture %b, i64 %n) { ; CHECK-LABEL: @fadd_multiple -; CHECK: vector.body -; CHECK: %[[PHI:.*]] = phi <8 x float> [ , %vector.ph ], [ %[[VEC_FADD2:.*]], %vector.body ] -; CHECK: %[[VEC_LOAD1:.*]] = load <8 x float>, <8 x float> -; CHECK: %[[VEC_FADD1:.*]] = fadd <8 x float> %[[PHI]], %[[VEC_LOAD1]] -; CHECK: %[[VEC_LOAD2:.*]] = load <8 x float>, <8 x float> -; CHECK: %[[VEC_FADD2]] = fadd <8 x float> %[[VEC_FADD1]], %[[VEC_LOAD2]] -; CHECK: middle.block -; CHECK: %[[RDX:.*]] = call float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> %[[VEC_FADD2]]) -; CHECK: for.body -; CHECK: %[[SUM:.*]] = phi float [ %bc.merge.rdx, %scalar.ph ], [ %[[FADD2:.*]], %for.body ] -; CHECK: %[[LOAD1:.*]] = load float, float* -; CHECK: %[[FADD1:.*]] = fadd float %sum, %[[LOAD1]] -; CHECK: %[[LOAD2:.*]] = load float, float* -; CHECK: %[[FADD2]] = fadd float %[[FADD1]], %[[LOAD2]] -; CHECK: for.end -; CHECK: %[[RET:.*]] = phi float [ %[[FADD2]], %for.body ], [ %[[RDX]], %middle.block ] -; CHECK: ret float %[[RET]] - -; CHECK-NO-REORDER-LABEL: @fadd_multiple -; CHECK-NO-REORDER-NOT: vector.body +; CHECK-NOT: vector.body entry: br label %for.body @@ -351,6 +319,121 @@ ret float %rdx } +; Tests with both a floating point reduction & induction, e.g. 
+;
+; float fp_iv_rdx_loop(float *values, float init, float * __restrict__ A, int N) {
+;   float fp_inc = 2.0;
+;   float x = init;
+;   float sum = 0.0;
+;   for (int i = 0; i < N; ++i) {
+;     A[i] = x;
+;     x += fp_inc;
+;     sum += values[i];
+;   }
+;   return sum;
+; }

+; Strict reduction could be performed in-loop, but ordered FP induction variables are not supported
+; CHECK-REMARKS: loop not vectorized: cannot prove it is safe to reorder floating-point operations
+define float @induction_and_reduction(float* nocapture readonly %values, float %init, float* noalias nocapture %A, i64 %N) {
+; CHECK-LABEL: @induction_and_reduction
+; CHECK-NOT: vector.body
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %sum.015 = phi float [ 0.000000e+00, %entry ], [ %add3, %for.body ]
+  %x.014 = phi float [ %init, %entry ], [ %add, %for.body ]
+  %arrayidx = getelementptr inbounds float, float* %A, i64 %iv
+  store float %x.014, float* %arrayidx, align 4
+  %add = fadd float %x.014, 2.000000e+00
+  %arrayidx2 = getelementptr inbounds float, float* %values, i64 %iv
+  %0 = load float, float* %arrayidx2, align 4
+  %add3 = fadd float %sum.015, %0
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, %N
+  br i1 %exitcond.not, label %for.end, label %for.body
+
+for.end:
+  ret float %add3
+}
+
+; As above, but with the FP induction being unordered (fast) the loop can be vectorized
+; CHECK-REMARKS: vectorized loop (vectorization width: 4, interleaved count: 2)
+define float @fast_induction_and_reduction(float* nocapture readonly %values, float %init, float* noalias nocapture %A, i64 %N) {
+; CHECK-LABEL: @fast_induction_and_reduction
+; CHECK: vector.ph
+; CHECK: %[[INDUCTION:.*]] = fadd fast <4 x float> {{.*}}, <float 0.000000e+00, float 2.000000e+00, float 4.000000e+00, float 6.000000e+00>
+; CHECK: vector.body
+; CHECK: %[[RDX_PHI:.*]] = phi float [ 0.000000e+00, %vector.ph ], [ %[[FADD2:.*]], %vector.body ]
+; CHECK: %[[IND_PHI:.*]] = phi <4 x float> [ %[[INDUCTION]], %vector.ph ], [ %[[VEC_IND_NEXT:.*]], %vector.body ]
+; CHECK: %[[STEP_ADD:.*]] = fadd fast <4 x float> %[[IND_PHI]], <float 8.000000e+00, float 8.000000e+00, float 8.000000e+00, float 8.000000e+00>
+; CHECK: %[[LOAD1:.*]] = load <4 x float>, <4 x float>*
+; CHECK: %[[LOAD2:.*]] = load <4 x float>, <4 x float>*
+; CHECK: %[[FADD1:.*]] = call float @llvm.vector.reduce.fadd.v4f32(float %[[RDX_PHI]], <4 x float> %[[LOAD1]])
+; CHECK: %[[FADD2]] = call float @llvm.vector.reduce.fadd.v4f32(float %[[FADD1]], <4 x float> %[[LOAD2]])
+; CHECK: %[[VEC_IND_NEXT]] = fadd fast <4 x float> %[[STEP_ADD]], <float 8.000000e+00, float 8.000000e+00, float 8.000000e+00, float 8.000000e+00>
+; CHECK: for.body
+; CHECK: %[[RDX_SUM_PHI:.*]] = phi float [ {{.*}}, %scalar.ph ], [ %[[FADD3:.*]], %for.body ]
+; CHECK: %[[IND_SUM_PHI:.*]] = phi fast float [ {{.*}}, %scalar.ph ], [ %[[ADD_IND:.*]], %for.body ]
+; CHECK: store float %[[IND_SUM_PHI]], float*
+; CHECK: %[[ADD_IND]] = fadd fast float %[[IND_SUM_PHI]], 2.000000e+00
+; CHECK: %[[LOAD3:.*]] = load float, float*
+; CHECK: %[[FADD3]] = fadd float %[[RDX_SUM_PHI]], %[[LOAD3]]
+; CHECK: for.end
+; CHECK: %[[RES_PHI:.*]] = phi float [ %[[FADD3]], %for.body ], [ %[[FADD2]], %middle.block ]
+; CHECK: ret float %[[RES_PHI]]
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %sum.015 = phi float [ 0.000000e+00, %entry ], [ %add3, %for.body ]
+  %x.014 = phi fast float [ %init, %entry ], [ %add, %for.body ]
+  %arrayidx = getelementptr inbounds float, float* %A, i64 %iv
+  store float %x.014, float* %arrayidx, align 4
+  %add = fadd fast float %x.014, 2.000000e+00
+  %arrayidx2 = getelementptr inbounds float, float* %values, i64 %iv
+  %0 = load float, float* %arrayidx2, align 4
+  %add3 = fadd float %sum.015, %0
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, %N
+  br i1 %exitcond.not, label %for.end, label %for.body
+
+for.end:
+  ret float %add3
+}
+
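The split between the two tests above comes down to how a widened FP induction must be materialized. A scalar C++ sketch of the difference (illustrative only, with invented names; not part of the test file):

    // A widened FP induction materializes lane I of the IV as
    // Init + I * Step, but the strict scalar loop computes
    // ((Init + Step) + Step) + ... instead. The two forms are only
    // interchangeable under reassociation, hence the IV's fadd must
    // carry 'fast' (or at least reassociation) flags to be widened.
    float WidenedIVLane(float Init, float Step, unsigned I) {
      return Init + static_cast<float>(I) * Step; // reassociated form
    }

    float ScalarIVAtIteration(float Init, float Step, unsigned I) {
      float X = Init;
      for (unsigned J = 0; J < I; ++J)
        X += Step; // the order the strict scalar loop actually performs
      return X;
    }
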
+; The FP induction is fast, but here we can't vectorize because only one of the reductions is an FAdd that can be performed in-loop
+; CHECK-REMARKS: loop not vectorized: cannot prove it is safe to reorder floating-point operations
+define float @fast_induction_unordered_reduction(float* nocapture readonly %values, float %init, float* noalias nocapture %A, float* noalias nocapture %B, i64 %N) {
+; CHECK-LABEL: @fast_induction_unordered_reduction
+; CHECK-NOT: vector.body
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %sum2.023 = phi float [ 3.000000e+00, %entry ], [ %mul, %for.body ]
+  %sum.022 = phi float [ 0.000000e+00, %entry ], [ %add3, %for.body ]
+  %x.021 = phi float [ %init, %entry ], [ %add, %for.body ]
+  %arrayidx = getelementptr inbounds float, float* %A, i64 %iv
+  store float %x.021, float* %arrayidx, align 4
+  %add = fadd fast float %x.021, 2.000000e+00
+  %arrayidx2 = getelementptr inbounds float, float* %values, i64 %iv
+  %0 = load float, float* %arrayidx2, align 4
+  %add3 = fadd float %sum.022, %0
+  %mul = fmul float %sum2.023, %0
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, %N
+  br i1 %exitcond.not, label %for.end, label %for.body
+
+for.end:
+  %add6 = fadd float %add3, %mul
+  ret float %add6
+}
+
 !0 = distinct !{!0, !4, !7, !9}
 !1 = distinct !{!1, !4, !8, !9}
 !2 = distinct !{!2, !5, !7, !9}
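One more piece of intuition for the final negative test: the new all_of check in LoopVectorizationLegality.cpp rejects the loop because the strict fmul recurrence has exact FP math but no ordered in-loop lowering, even though the fadd reduction on its own would be fine. A hypothetical, simplified C++ mirror of that predicate (names invented for illustration; not the actual LLVM API):

    #include <vector>

    // HasExactFPMath: the reduction ops carry no reassociation flags.
    // IsOrdered: LV knows an in-loop (ordered) lowering for the reduction.
    struct ReductionSummary {
      bool HasExactFPMath;
      bool IsOrdered;
    };

    bool canVectorizeExactFPReductions(const std::vector<ReductionSummary> &Rdxs) {
      for (const ReductionSummary &R : Rdxs)
        if (R.HasExactFPMath && !R.IsOrdered)
          return false; // e.g. the strict fmul in the final test
      return true;      // every exact-FP reduction can stay in-order
    }

A single unordered exact-FP recurrence is therefore enough to keep the whole loop scalar, which is exactly what the CHECK-NOT and the remark in @fast_induction_unordered_reduction verify.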