Index: llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -247,6 +247,12 @@
     cl::desc(
         "Enable runtime interleaving until load/store ports are saturated"));
 
+/// Interleave small loops with scalar reductions.
+static cl::opt<bool> InterleaveSmallLoopScalarReduction(
+    "interleave-small-loop-scalar-reduction", cl::init(false), cl::Hidden,
+    cl::desc("Enable interleaving for small loops with scalar reductions "
+             "to expose ILP."));
+
 /// The number of stores in a loop that are allowed to need predication.
 static cl::opt<unsigned> NumberOfStoresToPredicate(
     "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
@@ -5250,10 +5256,18 @@
   if (Legal->getMaxSafeDepDistBytes() != -1U)
     return 1;
 
-  // Do not interleave loops with a relatively small known or estimated trip
-  // count.
   auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop);
-  if (BestKnownTC && *BestKnownTC < TinyTripCountInterleaveThreshold)
+  const bool HasReductions = !Legal->getReductionVars().empty();
+  const bool ScalarReductionCond =
+      InterleaveSmallLoopScalarReduction && HasReductions && VF == 1;
+  // Do not interleave loops with a relatively small known or estimated trip
+  // count. But we will interleave when ScalarReductionCond is satisfied, i.e.
+  // InterleaveSmallLoopScalarReduction is enabled and the loop has scalar
+  // reductions (HasReductions && VF == 1), because under these conditions
+  // interleaving can expose ILP, break cross-iteration dependences for
+  // reductions, and benefit the SLP vectorizer in a later pass.
+  if (BestKnownTC && (*BestKnownTC < TinyTripCountInterleaveThreshold) &&
+      !ScalarReductionCond)
     return 1;
 
   RegisterUsage R = calculateRegisterUsage({VF})[0];
@@ -5338,7 +5352,7 @@
 
   // Interleave if we vectorized this loop and there is a reduction that could
   // benefit from interleaving.
-  if (VF > 1 && !Legal->getReductionVars().empty()) {
+  if (VF > 1 && HasReductions) {
     LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n");
     return IC;
   }
@@ -5350,7 +5364,11 @@
 
   // We want to interleave small loops in order to reduce the loop overhead and
   // potentially expose ILP opportunities.
-  LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n');
+  LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n'
+                    << "LV: IC is " << IC << '\n'
+                    << "LV: VF is " << VF << '\n');
+  const bool AggressivelyInterleaveReductions =
+      TTI.enableAggressiveInterleaving(HasReductions);
   if (!InterleavingRequiresRuntimePointerCheck && LoopCost < SmallLoopCost) {
     // We assume that the cost overhead is 1 and we use the cost model
     // to estimate the cost of the loop and interleave until the cost of the
@@ -5369,7 +5387,7 @@
     // by this point), we can increase the critical path length if the loop
     // we're interleaving is inside another loop. Limit, by default to 2, so the
     // critical path only gets increased by one reduction operation.
-    if (!Legal->getReductionVars().empty() && TheLoop->getLoopDepth() > 1) {
+    if (HasReductions && TheLoop->getLoopDepth() > 1) {
       unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC);
       SmallIC = std::min(SmallIC, F);
       StoresIC = std::min(StoresIC, F);
@@ -5383,14 +5401,23 @@
       return std::max(StoresIC, LoadsIC);
     }
 
-    LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
-    return SmallIC;
+    // If there are scalar reductions and TTI has enabled aggressive
+    // interleaving for reductions, we will interleave to expose ILP.
+    if (InterleaveSmallLoopScalarReduction && VF == 1 &&
+        AggressivelyInterleaveReductions) {
+      LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
+      // Interleave no less than SmallIC but not as aggressive as the normal IC
+      // to satisfy the rare situation when resources are too limited.
+      return std::max(IC / 2, SmallIC);
+    } else {
+      LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
+      return SmallIC;
+    }
   }
 
   // Interleave if this is a large loop (small loops are already dealt with by
   // this point) that could benefit from interleaving.
-  bool HasReductions = !Legal->getReductionVars().empty();
-  if (TTI.enableAggressiveInterleaving(HasReductions)) {
+  if (AggressivelyInterleaveReductions) {
     LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
     return IC;
   }
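The rationale in the comment above can be illustrated with a small, self-contained sketch (not part of the patch; dot and dot_interleaved_by_4 are made-up names, and reassociating the floating-point sum is only legal under fast-math, which is what the `fast` flags in the tests below provide). With VF = 1 and an interleave count of 4, the vectorizer effectively turns the single accumulator of a scalar reduction into four independent partial sums, which removes the loop-carried dependence, exposes ILP, and leaves a shape the SLP vectorizer can pack into <4 x double> operations in a later pass:

// Illustrative only: what IC=4 interleaving of a VF=1 reduction amounts to.
double dot(const double *A, const double *B, unsigned N) {
  double Sum = 0.0;
  for (unsigned I = 0; I < N; ++I)
    Sum += A[I] * B[I]; // single loop-carried dependence on Sum
  return Sum;
}

double dot_interleaved_by_4(const double *A, const double *B, unsigned N) {
  double S0 = 0.0, S1 = 0.0, S2 = 0.0, S3 = 0.0;
  unsigned I = 0;
  for (; I + 4 <= N; I += 4) {
    S0 += A[I + 0] * B[I + 0]; // four independent chains: more ILP, and a
    S1 += A[I + 1] * B[I + 1]; // pattern the SLP vectorizer can later turn
    S2 += A[I + 2] * B[I + 2]; // into <4 x double> fmul/fadd
    S3 += A[I + 3] * B[I + 3];
  }
  double Sum = (S0 + S1) + (S2 + S3);
  for (; I < N; ++I) // scalar remainder
    Sum += A[I] * B[I];
  return Sum;
}

The behaviour stays off by default (cl::init(false)); the RUN lines in the tests below enable it explicitly with -interleave-small-loop-scalar-reduction=true when driving opt, and the same cl::opt should also be reachable from clang via -mllvm -interleave-small-loop-scalar-reduction=true.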
Index: llvm/test/Transforms/LoopVectorize/PowerPC/interleave_IC.ll
===================================================================
--- /dev/null
+++ llvm/test/Transforms/LoopVectorize/PowerPC/interleave_IC.ll
@@ -0,0 +1,93 @@
+; RUN: opt < %s -loop-vectorize -mtriple=powerpc64le-unknown-linux -S -mcpu=pwr9 -interleave-small-loop-scalar-reduction=true 2>&1 | FileCheck %s
+; RUN: opt < %s -passes='loop-vectorize' -mtriple=powerpc64le-unknown-linux -S -mcpu=pwr9 -interleave-small-loop-scalar-reduction=true 2>&1 | FileCheck %s
+
+;void fun(Vector &MatrixB,
+;         const Vector &MatrixA,
+;         const unsigned int * const start,
+;         const unsigned int * const end,
+;         const double * val) const
+;{
+;  const unsigned int N=MatrixB.size();
+;  MatrixB = MatrixA;
+;  for (unsigned int row=0; row
+%3 = type { %7*, i8* }
+%4 = type { %5 }
+%5 = type { %6 }
+%6 = type { i32**, i32**, i32** }
+%7 = type <{ %8, i32, i32, i32, [4 x i8], i64, i32, [4 x i8], i64*, i32*, i8, i8, [6 x i8] }>
+%8 = type { i32 (...)**, i32, %9, %16* }
+%9 = type { %10 }
+%10 = type { %11 }
+%11 = type { %12, %14 }
+%12 = type { %13 }
+%13 = type { i8 }
+%14 = type { %15, i64 }
+%15 = type { i32, %15*, %15*, %15* }
+%16 = type { i32 (...)**, i8* }
+%17 = type { %8, i32, i32, double* }
+
+$test = comdat any
+define dso_local void @test(%0* %arg, %17* dereferenceable(88) %arg1) comdat align 2 {
+  %tmp14 = getelementptr %0, %0* %arg, i64 0, i32 0, i32 3, i32 0, i32 0, i32 0
+  %tmp15 = load i32**, i32*** %tmp14, align 8
+  %tmp18 = getelementptr inbounds %17, %17* %arg1, i64 0, i32 3
+  %tmp19 = load double*, double** %tmp18, align 8
+  br label %bb22
+bb22: ; preds = %bb33, %bb
+  %tmp26 = add i64 0, 1
+  %tmp27 = getelementptr inbounds i32, i32* null, i64 %tmp26
+  %tmp28 = getelementptr inbounds i32*, i32** %tmp15, i64 undef
+  %tmp29 = load i32*, i32** %tmp28, align 8
+  %tmp32 = getelementptr inbounds double, double* null, i64 %tmp26
+  br label %bb40
+bb33: ; preds = %bb40
+  %tmp35 = getelementptr inbounds double, double* %tmp19, i64 undef
+  %tmp37 = fsub fast double 0.000000e+00, %tmp50
+  store double %tmp37, double* %tmp35, align 8
+  br label %bb22
+bb40: ; preds = %bb40, %bb22
+  %tmp41 = phi i32* [ %tmp51, %bb40 ], [ %tmp27, %bb22 ]
+  %tmp42 = phi double* [ %tmp52, %bb40 ], [ %tmp32, %bb22 ]
+  %tmp43 = phi double [ %tmp50, %bb40 ], [ 0.000000e+00, %bb22 ]
+  %tmp44 = load double, double* %tmp42, align 8
+  %tmp45 = load i32, i32* %tmp41, align 4
+  %tmp46 = zext i32 %tmp45 to i64
+  %tmp47 = getelementptr inbounds double, double* %tmp19, i64 %tmp46
+  %tmp48 = load double, double* %tmp47, align 8
+  %tmp49 = fmul fast double %tmp48, %tmp44
+  %tmp50 = fadd fast double %tmp49, %tmp43
+  %tmp51 = getelementptr inbounds i32, i32* %tmp41, i64 1
+  %tmp52 = getelementptr inbounds double, double* %tmp42, i64 1
+  %tmp53 = icmp eq i32* %tmp51, %tmp29
+  br i1 %tmp53, label %bb33, label %bb40
+}
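The C++ comment at the top of this test (and of the two tests that follow) is truncated in this copy of the patch, so as orientation here is a rough sketch of the kind of kernel the IR corresponds to, inferred from its loads and stores rather than taken from the original source; rowstart and colnums are illustrative names, while fun, MatrixB, val and N come from the truncated comment. Each outer iteration runs a short scalar dot-product reduction over one sparse row, which is exactly the small-trip-count, VF = 1 reduction the new flag targets:

// Hypothetical reconstruction, for orientation only.
void fun(double *MatrixB, const double *val, const unsigned *colnums,
         const unsigned *rowstart, unsigned N) {
  for (unsigned Row = 0; Row < N; ++Row) {
    double Sum = 0.0;
    // Short per-row scalar reduction: the interleaving candidate.
    for (unsigned J = rowstart[Row] + 1; J < rowstart[Row + 1]; ++J)
      Sum += val[J] * MatrixB[colnums[J]];
    MatrixB[Row] -= Sum; // this first test stores -Sum instead
  }
}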
Index: llvm/test/Transforms/PhaseOrdering/interleave_LV_SLP.ll
===================================================================
--- /dev/null
+++ llvm/test/Transforms/PhaseOrdering/interleave_LV_SLP.ll
@@ -0,0 +1,174 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; REQUIRES: powerpc-registered-target
+; RUN: opt < %s -O2 -mtriple=powerpc64le-unknown-linux -S -mcpu=pwr9 -interleave-small-loop-scalar-reduction=true 2>&1 | FileCheck %s
+; RUN: opt < %s -passes='default<O2>' -mtriple=powerpc64le-unknown-linux -S -mcpu=pwr9 -interleave-small-loop-scalar-reduction=true 2>&1 | FileCheck %s
+
+;void fun(Vector &MatrixB,
+;         const Vector &MatrixA,
+;         const unsigned int * const start,
+;         const unsigned int * const end,
+;         const double * val) const
+;{
+;  const unsigned int N=MatrixB.size();
+;  MatrixB = MatrixA;
+;  for (unsigned int row=0; row
+%3 = type { %7*, i8* }
+%4 = type { %5 }
+%5 = type { %6 }
+%6 = type { i32**, i32**, i32** }
+%7 = type <{ %8, i32, i32, i32, [4 x i8], i64, i32, [4 x i8], i64*, i32*, i8, i8, [6 x i8] }>
+%8 = type { i32 (...)**, i32, %9, %16* }
+%9 = type { %10 }
+%10 = type { %11 }
+%11 = type { %12, %14 }
+%12 = type { %13 }
+%13 = type { i8 }
+%14 = type { %15, i64 }
+%15 = type { i32, %15*, %15*, %15* }
+%16 = type { i32 (...)**, i8* }
+%17 = type { %8, i32, i32, double* }
+
+$test = comdat any
+define dso_local void @test(%0* %arg, %17* dereferenceable(88) %arg1) comdat align 2 {
+; CHECK-LABEL: @test(
+; CHECK-NEXT: [[TMP14:%.*]] = getelementptr [[TMP0:%.*]], %0* [[ARG:%.*]], i64 0, i32 0, i32 3, i32 0, i32 0, i32 0
+; CHECK-NEXT: [[TMP15:%.*]] = load i32**, i32*** [[TMP14]], align 8
+; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds [[TMP17:%.*]], %17* [[ARG1:%.*]], i64 0, i32 3
+; CHECK-NEXT: [[TMP19:%.*]] = load double*, double** [[TMP18]], align 8
+; CHECK-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32*, i32** [[TMP15]], i64 undef
+; CHECK-NEXT: [[TMP35:%.*]] = getelementptr inbounds double, double* [[TMP19]], i64 undef
+; CHECK-NEXT: br label [[BB22:%.*]]
+; CHECK: bb22:
+; CHECK-NEXT: [[TMP29:%.*]] = load i32*, i32** [[TMP28]], align 8
+; CHECK-NEXT: [[UGLYGEP17:%.*]] = getelementptr i32, i32* [[TMP29]], i64 -2
+; CHECK-NEXT: [[UGLYGEP2:%.*]] = ptrtoint i32* [[UGLYGEP17]] to i64
+; CHECK-NEXT: [[TMP1:%.*]] = lshr i64 [[UGLYGEP2]], 2
+; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32* [[UGLYGEP17]], inttoptr (i64 12 to i32*)
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[BB40_PREHEADER:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK: bb40.preheader:
+; CHECK-NEXT: [[TMP41_PH:%.*]] = phi i32* [ inttoptr (i64 4 to i32*), [[BB22]] ], [ [[IND_END:%.*]], [[MIDDLE_BLOCK:%.*]] ]
+; CHECK-NEXT: [[TMP42_PH:%.*]] = phi double* [ inttoptr (i64 8 to double*), [[BB22]] ], [ [[IND_END4:%.*]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: [[TMP43_PH:%.*]] = phi double [ 0.000000e+00, [[BB22]] ], [ [[BIN_RDX16:%.*]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: br label [[BB40:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[TMP2]], 9223372036854775804
+; CHECK-NEXT: [[IND_END]] = getelementptr i32, i32* inttoptr (i64 4 to i32*), i64 [[N_VEC]]
+; CHECK-NEXT: [[IND_END4]] = getelementptr double, double* inttoptr (i64 8 to double*), i64 [[N_VEC]]
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP3:%.*]] = phi <4 x double> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP30:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i32, i32* inttoptr (i64 4 to i32*), i64 [[INDEX]]
+; CHECK-NEXT: [[TMP4:%.*]] = or i64 [[INDEX]], 1
+; CHECK-NEXT: [[NEXT_GEP5:%.*]] = getelementptr i32, i32* inttoptr (i64 4 to i32*), i64 [[TMP4]]
+; CHECK-NEXT: [[TMP5:%.*]] = or i64 [[INDEX]], 2
+; CHECK-NEXT: [[NEXT_GEP6:%.*]] = getelementptr i32, i32* inttoptr (i64 4 to i32*), i64 [[TMP5]]
+; CHECK-NEXT: [[TMP6:%.*]] = or i64 [[INDEX]], 3
+; CHECK-NEXT: [[NEXT_GEP7:%.*]] = getelementptr i32, i32* inttoptr (i64 4 to i32*), i64 [[TMP6]]
+; CHECK-NEXT: [[NEXT_GEP8:%.*]] = getelementptr double, double* inttoptr (i64 8 to double*), i64 [[INDEX]]
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast double* [[NEXT_GEP8]] to <4 x double>*
+; CHECK-NEXT: [[TMP8:%.*]] = load <4 x double>, <4 x double>* [[TMP7]], align 8
+; CHECK-NEXT: [[TMP9:%.*]] = load i32, i32* [[NEXT_GEP]], align 4
+; CHECK-NEXT: [[TMP10:%.*]] = load i32, i32* [[NEXT_GEP5]], align 4
+; CHECK-NEXT: [[TMP11:%.*]] = load i32, i32* [[NEXT_GEP6]], align 4
+; CHECK-NEXT: [[TMP12:%.*]] = load i32, i32* [[NEXT_GEP7]], align 4
+; CHECK-NEXT: [[TMP13:%.*]] = zext i32 [[TMP9]] to i64
+; CHECK-NEXT: [[TMP14:%.*]] = zext i32 [[TMP10]] to i64
+; CHECK-NEXT: [[TMP15:%.*]] = zext i32 [[TMP11]] to i64
+; CHECK-NEXT: [[TMP16:%.*]] = zext i32 [[TMP12]] to i64
+; CHECK-NEXT: [[TMP17]] = getelementptr inbounds double, double* [[TMP19]], i64 [[TMP13]]
+; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds double, double* [[TMP19]], i64 [[TMP14]]
+; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds double, double* [[TMP19]], i64 [[TMP15]]
+; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds double, double* [[TMP19]], i64 [[TMP16]]
+; CHECK-NEXT: [[TMP21:%.*]] = load double, double* [[TMP17]], align 8
+; CHECK-NEXT: [[TMP22:%.*]] = load double, double* [[TMP18]], align 8
+; CHECK-NEXT: [[TMP23:%.*]] = load double, double* [[TMP19]], align 8
+; CHECK-NEXT: [[TMP24:%.*]] = load double, double* [[TMP20]], align 8
+; CHECK-NEXT: [[TMP25:%.*]] = insertelement <4 x double> undef, double [[TMP21]], i32 0
+; CHECK-NEXT: [[TMP26:%.*]] = insertelement <4 x double> [[TMP25]], double [[TMP22]], i32 1
+; CHECK-NEXT: [[TMP27:%.*]] = insertelement <4 x double> [[TMP26]], double [[TMP23]], i32 2
+; CHECK-NEXT: [[TMP28:%.*]] = insertelement <4 x double> [[TMP27]], double [[TMP24]], i32 3
+; CHECK-NEXT: [[TMP29:%.*]] = fmul fast <4 x double> [[TMP28]], [[TMP8]]
+; CHECK-NEXT: [[TMP30]] = fadd fast <4 x double> [[TMP29]], [[TMP3]]
+; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4
+; CHECK-NEXT: [[TMP31:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP31]], label [[MIDDLE_BLOCK]], label [[VECTOR_BODY]], !llvm.loop !0
+; CHECK: middle.block:
+; CHECK-NEXT: [[TMP32:%.*]] = extractelement <4 x double> [[TMP30]], i32 0
+; CHECK-NEXT: [[TMP33:%.*]] = extractelement <4 x double> [[TMP30]], i32 1
+; CHECK-NEXT: [[BIN_RDX:%.*]] = fadd fast double [[TMP33]], [[TMP32]]
+; CHECK-NEXT: [[TMP34:%.*]] = extractelement <4 x double> [[TMP30]], i32 2
+; CHECK-NEXT: [[BIN_RDX15:%.*]] = fadd fast double [[TMP34]], [[BIN_RDX]]
+; CHECK-NEXT: [[TMP35:%.*]] = extractelement <4 x double> [[TMP30]], i32 3
+; CHECK-NEXT: [[BIN_RDX16]] = fadd fast double [[TMP35]], [[BIN_RDX15]]
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[CMP_N]], label [[BB33:%.*]], label [[BB40_PREHEADER]]
+; CHECK: bb33:
+; CHECK-NEXT: [[TMP50_LCSSA:%.*]] = phi double [ [[BIN_RDX16]], [[MIDDLE_BLOCK]] ], [ [[TMP50:%.*]], [[BB40]] ]
+; CHECK-NEXT: [[TMP37:%.*]] = fneg fast double [[TMP50_LCSSA]]
+; CHECK-NEXT: store double [[TMP37]], double* [[TMP35]], align 8
+; CHECK-NEXT: br label [[BB22]]
+; CHECK: bb40:
+; CHECK-NEXT: [[TMP41:%.*]] = phi i32* [ [[TMP51:%.*]], [[BB40]] ], [ [[TMP41_PH]], [[BB40_PREHEADER]] ]
+; CHECK-NEXT: [[TMP42:%.*]] = phi double* [ [[TMP52:%.*]], [[BB40]] ], [ [[TMP42_PH]], [[BB40_PREHEADER]] ]
+; CHECK-NEXT: [[TMP43:%.*]] = phi double [ [[TMP50]], [[BB40]] ], [ [[TMP43_PH]], [[BB40_PREHEADER]] ]
+; CHECK-NEXT: [[TMP44:%.*]] = load double, double* [[TMP42]], align 8
+; CHECK-NEXT: [[TMP45:%.*]] = load i32, i32* [[TMP41]], align 4
+; CHECK-NEXT: [[TMP46:%.*]] = zext i32 [[TMP45]] to i64
+; CHECK-NEXT: [[TMP47:%.*]] = getelementptr inbounds double, double* [[TMP19]], i64 [[TMP46]]
+; CHECK-NEXT: [[TMP48:%.*]] = load double, double* [[TMP47]], align 8
+; CHECK-NEXT: [[TMP49:%.*]] = fmul fast double [[TMP48]], [[TMP44]]
+; CHECK-NEXT: [[TMP50]] = fadd fast double [[TMP49]], [[TMP43]]
+; CHECK-NEXT: [[TMP51]] = getelementptr inbounds i32, i32* [[TMP41]], i64 1
+; CHECK-NEXT: [[TMP52]] = getelementptr inbounds double, double* [[TMP42]], i64 1
+; CHECK-NEXT: [[TMP53:%.*]] = icmp eq i32* [[TMP51]], [[TMP29]]
+; CHECK-NEXT: br i1 [[TMP53]], label [[BB33]], label [[BB40]], !llvm.loop !2
+;
+  %tmp14 = getelementptr %0, %0* %arg, i64 0, i32 0, i32 3, i32 0, i32 0, i32 0
+  %tmp15 = load i32**, i32*** %tmp14, align 8
+  %tmp18 = getelementptr inbounds %17, %17* %arg1, i64 0, i32 3
+  %tmp19 = load double*, double** %tmp18, align 8
+  br label %bb22
+bb22: ; preds = %bb33, %bb
+  %tmp26 = add i64 0, 1
+  %tmp27 = getelementptr inbounds i32, i32* null, i64 %tmp26
+  %tmp28 = getelementptr inbounds i32*, i32** %tmp15, i64 undef
+  %tmp29 = load i32*, i32** %tmp28, align 8
+  %tmp32 = getelementptr inbounds double, double* null, i64 %tmp26
+  br label %bb40
+bb33: ; preds = %bb40
+  %tmp35 = getelementptr inbounds double, double* %tmp19, i64 undef
+  %tmp37 = fsub fast double 0.000000e+00, %tmp50
+  store double %tmp37, double* %tmp35, align 8
+  br label %bb22
+bb40: ; preds = %bb40, %bb22
+  %tmp41 = phi i32* [ %tmp51, %bb40 ], [ %tmp27, %bb22 ]
+  %tmp42 = phi double* [ %tmp52, %bb40 ], [ %tmp32, %bb22 ]
+  %tmp43 = phi double [ %tmp50, %bb40 ], [ 0.000000e+00, %bb22 ]
+  %tmp44 = load double, double* %tmp42, align 8
+  %tmp45 = load i32, i32* %tmp41, align 4
+  %tmp46 = zext i32 %tmp45 to i64
+  %tmp47 = getelementptr inbounds double, double* %tmp19, i64 %tmp46
+  %tmp48 = load double, double* %tmp47, align 8
+  %tmp49 = fmul fast double %tmp48, %tmp44
+  %tmp50 = fadd fast double %tmp49, %tmp43
+  %tmp51 = getelementptr inbounds i32, i32* %tmp41, i64 1
+  %tmp52 = getelementptr inbounds double, double* %tmp42, i64 1
+  %tmp53 = icmp eq i32* %tmp51, %tmp29
+  br i1 %tmp53, label %bb33, label %bb40
+}
Index: llvm/test/Transforms/SLPVectorizer/PowerPC/interleave_SLP.ll
===================================================================
--- /dev/null
+++ llvm/test/Transforms/SLPVectorizer/PowerPC/interleave_SLP.ll
@@ -0,0 +1,213 @@
+; RUN: opt -S -mcpu=pwr9 -slp-vectorizer -interleave-small-loop-scalar-reduction=true < %s | FileCheck %s
+; RUN: opt -S -mcpu=pwr9 -passes='slp-vectorizer' -interleave-small-loop-scalar-reduction=true < %s | FileCheck %s
+
+; CHECK-LABEL: vector.body
+
+; CHECK: load <4 x double>, <4 x double>*
+
+; CHECK: fmul fast <4 x double>
+
+; CHECK: fadd fast <4 x double>
+
+target datalayout = "e-m:e-i64:64-n32:64"
+target triple = "powerpc64le-unknown-linux"
+
+%0 = type { i8 }
+%1 = type { %2, %9 }
+%2 = type { %3, i8, double, %5, %8* }
+%3 = type <{ i32 (...)**, %4, double*, i32 }>
+%4 = type { %8*, i8* }
+%5 = type { %6 }
+%6 = type { %7 }
+%7 = type { i32**, i32**, i32** }
+%8 = type <{ %9, i32, i32, i32, [4 x i8], i64, i32, [4 x i8], i64*, i32*, i8, i8, [6 x i8] }>
+%9 = type { i32 (...)**, i32, %10, %17* }
+%10 = type { %11 }
+%11 = type { %12 }
+%12 = type { %13, %15 }
+%13 = type { %14 }
+%14 = type { i8 }
+%15 = type { %16, i64 }
+%16 = type { i32, %16*, %16*, %16* }
+%17 = type { i32 (...)**, i8* }
+%18 = type { %9, i32, i32, double* }
+%19 = type <{ i32 (...)**, %4, double*, i32, [4 x i8], %9 }>
+
+$test0 = comdat any
+
+@0 = internal global %0 zeroinitializer, align 1
+@__dso_handle = external hidden global i8
+@llvm.global_ctors = appending global [1 x { i32, void ()*, i8* }] [{ i32, void ()*, i8* } { i32 65535, void ()* @1, i8* null }]
+declare void @test3(%0*)
+declare void @test4(%0*)
+; Function Attrs: nofree nounwind
+declare i32 @__cxa_atexit(void (i8*)*, i8*, i8*)
+define weak_odr dso_local void @test0(%1* %arg, %18* dereferenceable(88) %arg1, %18* dereferenceable(88) %arg2) local_unnamed_addr comdat align 2 {
+bb:
+  %tmp = getelementptr inbounds %18, %18* %arg1, i64 0, i32 1
+  %tmp3 = load i32, i32* %tmp, align 8
+  %tmp4 = bitcast %1* %arg to %19*
+  %tmp5 = tail call dereferenceable(128) %8* @test1(%19* %tmp4)
+  %tmp6 = getelementptr inbounds %8, %8* %tmp5, i64 0, i32 8
+  %tmp7 = load i64*, i64** %tmp6, align 8
+  %tmp8 = tail call dereferenceable(128) %8* @test1(%19* %tmp4)
+  %tmp9 = getelementptr inbounds %8, %8* %tmp8, i64 0, i32 9
+  %tmp10 = load i32*, i32** %tmp9, align 8
+  %tmp102 = ptrtoint i32* %tmp10 to i64
+  %tmp11 = tail call dereferenceable(88) %18* @test2(%18* nonnull %arg1, %18* nonnull dereferenceable(88) %arg2)
+  %tmp12 = icmp eq i32 %tmp3, 0
+  br i1 %tmp12, label %bb21, label %bb13
+
+bb13: ; preds = %bb
+  %tmp14 = getelementptr %1, %1* %arg, i64 0, i32 0, i32 3, i32 0, i32 0, i32 0
+  %tmp15 = load i32**, i32*** %tmp14, align 8
+  %tmp16 = getelementptr inbounds %1, %1* %arg, i64 0, i32 0, i32 0, i32 2
+  %tmp17 = load double*, double** %tmp16, align 8
+  %tmp18 = getelementptr inbounds %18, %18* %arg1, i64 0, i32 3
+  %tmp19 = load double*, double** %tmp18, align 8
+  %tmp20 = zext i32 %tmp3 to i64
+  %0 = sub i64 0, %tmp102
+  br label %bb22
+
+bb21.loopexit: ; preds = %bb33
+  br label %bb21
+
+bb21: ; preds = %bb21.loopexit, %bb
+  ret void
+
+bb22: ; preds = %bb33, %bb13
+  %tmp23 = phi i64 [ 0, %bb13 ], [ %tmp38, %bb33 ]
+  %tmp24 = getelementptr inbounds i64, i64* %tmp7, i64 %tmp23
+  %tmp25 = load i64, i64* %tmp24, align 8
+  %tmp26 = add i64 %tmp25, 1
+  %tmp27 = getelementptr inbounds i32, i32* %tmp10, i64 %tmp26
+  %tmp28 = getelementptr inbounds i32*, i32** %tmp15, i64 %tmp23
+  %tmp29 = load i32*, i32** %tmp28, align 8
+  %tmp30 = icmp eq i32* %tmp27, %tmp29
+  br i1 %tmp30, label %bb33, label %bb31
+
+bb31: ; preds = %bb22
+  %tmp32 = getelementptr inbounds double, double* %tmp17, i64 %tmp26
+  %scevgep = getelementptr i32, i32* %tmp29, i64 -2
+  %scevgep1 = bitcast i32* %scevgep to i8*
+  %uglygep = getelementptr i8, i8* %scevgep1, i64 %0
+  %1 = mul i64 %tmp25, -4
+  %scevgep3 = getelementptr i8, i8* %uglygep, i64 %1
+  %scevgep34 = ptrtoint i8* %scevgep3 to i64
+  %2 = lshr i64 %scevgep34, 2
+  %3 = add nuw nsw i64 %2, 1
+  %min.iters.check = icmp ult i64 %3, 4
+  br i1 %min.iters.check, label %scalar.ph, label %vector.ph
+
+vector.ph: ; preds = %bb31
+  %n.mod.vf = urem i64 %3, 4
+  %n.vec = sub i64 %3, %n.mod.vf
+  %ind.end = getelementptr i32, i32* %tmp27, i64 %n.vec
+  %ind.end6 = getelementptr double, double* %tmp32, i64 %n.vec
+  br label %vector.body
+
+vector.body: ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %vec.phi = phi double [ 0.000000e+00, %vector.ph ], [ %36, %vector.body ]
+  %vec.phi14 = phi double [ 0.000000e+00, %vector.ph ], [ %37, %vector.body ]
+  %vec.phi15 = phi double [ 0.000000e+00, %vector.ph ], [ %38, %vector.body ]
+  %vec.phi16 = phi double [ 0.000000e+00, %vector.ph ], [ %39, %vector.body ]
+  %4 = add i64 %index, 0
+  %next.gep = getelementptr i32, i32* %tmp27, i64 %4
+  %5 = add i64 %index, 1
+  %next.gep7 = getelementptr i32, i32* %tmp27, i64 %5
+  %6 = add i64 %index, 2
+  %next.gep8 = getelementptr i32, i32* %tmp27, i64 %6
+  %7 = add i64 %index, 3
+  %next.gep9 = getelementptr i32, i32* %tmp27, i64 %7
+  %8 = add i64 %index, 0
+  %next.gep10 = getelementptr double, double* %tmp32, i64 %8
+  %9 = add i64 %index, 1
+  %next.gep11 = getelementptr double, double* %tmp32, i64 %9
+  %10 = add i64 %index, 2
+  %next.gep12 = getelementptr double, double* %tmp32, i64 %10
+  %11 = add i64 %index, 3
+  %next.gep13 = getelementptr double, double* %tmp32, i64 %11
+  %12 = load double, double* %next.gep10, align 8
+  %13 = load double, double* %next.gep11, align 8
+  %14 = load double, double* %next.gep12, align 8
+  %15 = load double, double* %next.gep13, align 8
+  %16 = load i32, i32* %next.gep, align 4
+  %17 = load i32, i32* %next.gep7, align 4
+  %18 = load i32, i32* %next.gep8, align 4
+  %19 = load i32, i32* %next.gep9, align 4
+  %20 = zext i32 %16 to i64
+  %21 = zext i32 %17 to i64
+  %22 = zext i32 %18 to i64
+  %23 = zext i32 %19 to i64
+  %24 = getelementptr inbounds double, double* %tmp19, i64 %20
+  %25 = getelementptr inbounds double, double* %tmp19, i64 %21
+  %26 = getelementptr inbounds double, double* %tmp19, i64 %22
+  %27 = getelementptr inbounds double, double* %tmp19, i64 %23
+  %28 = load double, double* %24, align 8
+  %29 = load double, double* %25, align 8
+  %30 = load double, double* %26, align 8
+  %31 = load double, double* %27, align 8
+  %32 = fmul fast double %28, %12
+  %33 = fmul fast double %29, %13
+  %34 = fmul fast double %30, %14
+  %35 = fmul fast double %31, %15
+  %36 = fadd fast double %32, %vec.phi
+  %37 = fadd fast double %33, %vec.phi14
+  %38 = fadd fast double %34, %vec.phi15
+  %39 = fadd fast double %35, %vec.phi16
+  %index.next = add i64 %index, 4
+  %40 = icmp eq i64 %index.next, %n.vec
+  br i1 %40, label %middle.block, label %vector.body
+
+middle.block: ; preds = %vector.body
+  %bin.rdx = fadd fast double %37, %36
+  %bin.rdx17 = fadd fast double %38, %bin.rdx
+  %bin.rdx18 = fadd fast double %39, %bin.rdx17
+  %cmp.n = icmp eq i64 %3, %n.vec
+  br i1 %cmp.n, label %bb33.loopexit, label %scalar.ph
+
+scalar.ph: ; preds = %middle.block, %bb31
+  %bc.resume.val = phi i32* [ %ind.end, %middle.block ], [ %tmp27, %bb31 ]
+  %bc.resume.val5 = phi double* [ %ind.end6, %middle.block ], [ %tmp32, %bb31 ]
+  %bc.merge.rdx = phi double [ 0.000000e+00, %bb31 ], [ %bin.rdx18, %middle.block ]
+  br label %bb40
+
+bb33.loopexit: ; preds = %middle.block, %bb40
+  %tmp50.lcssa = phi double [ %tmp50, %bb40 ], [ %bin.rdx18, %middle.block ]
+  br label %bb33
+
+bb33: ; preds = %bb33.loopexit, %bb22
+  %tmp34 = phi double [ 0.000000e+00, %bb22 ], [ %tmp50.lcssa, %bb33.loopexit ]
+  %tmp35 = getelementptr inbounds double, double* %tmp19, i64 %tmp23
+  %tmp36 = load double, double* %tmp35, align 8
+  %tmp37 = fsub fast double %tmp36, %tmp34
+  store double %tmp37, double* %tmp35, align 8
+  %tmp38 = add nuw nsw i64 %tmp23, 1
+  %tmp39 = icmp eq i64 %tmp38, %tmp20
+  br i1 %tmp39, label %bb21.loopexit, label %bb22
+
+bb40: ; preds = %bb40, %scalar.ph
+  %tmp41 = phi i32* [ %tmp51, %bb40 ], [ %bc.resume.val, %scalar.ph ]
+  %tmp42 = phi double* [ %tmp52, %bb40 ], [ %bc.resume.val5, %scalar.ph ]
+  %tmp43 = phi double [ %tmp50, %bb40 ], [ %bc.merge.rdx, %scalar.ph ]
+  %tmp44 = load double, double* %tmp42, align 8
+  %tmp45 = load i32, i32* %tmp41, align 4
+  %tmp46 = zext i32 %tmp45 to i64
+  %tmp47 = getelementptr inbounds double, double* %tmp19, i64 %tmp46
+  %tmp48 = load double, double* %tmp47, align 8
+  %tmp49 = fmul fast double %tmp48, %tmp44
+  %tmp50 = fadd fast double %tmp49, %tmp43
+  %tmp51 = getelementptr inbounds i32, i32* %tmp41, i64 1
+  %tmp52 = getelementptr inbounds double, double* %tmp42, i64 1
+  %tmp53 = icmp eq i32* %tmp51, %tmp29
+  br i1 %tmp53, label %bb33.loopexit, label %bb40
+}
+declare dereferenceable(128) %8* @test1(%19*)
+declare dereferenceable(88) %18* @test2(%18*, %18* dereferenceable(88))
+define internal void @1() section ".text.startup" {
+bb:
+  tail call void @test3(%0* nonnull @0)
+  %tmp = tail call i32 @__cxa_atexit(void (i8*)* bitcast (void (%0*)* @test4 to void (i8*)*), i8* getelementptr inbounds (%0, %0* @0, i64 0, i32 0), i8* nonnull @__dso_handle)
+  ret void
+}