Index: llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -247,6 +247,12 @@
     cl::desc(
         "Enable runtime interleaving until load/store ports are saturated"));
 
+/// Interleave small loops with scalar reductions.
+static cl::opt<bool> InterleaveSmallLoopScalarReduction(
+    "interleave-small-loop-scalar-reduction", cl::init(false), cl::Hidden,
+    cl::desc("Enable interleaving for small loops with scalar reductions "
+             "to expose ILP."));
+
 /// The number of stores in a loop that are allowed to need predication.
 static cl::opt<unsigned> NumberOfStoresToPredicate(
     "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
@@ -5252,10 +5258,18 @@
   if (Legal->getMaxSafeDepDistBytes() != -1U)
     return 1;
 
-  // Do not interleave loops with a relatively small known or estimated trip
-  // count.
   auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop);
-  if (BestKnownTC && *BestKnownTC < TinyTripCountInterleaveThreshold)
+  const bool HasReductions = !Legal->getReductionVars().empty();
+  const bool ScalarReductionCond =
+      InterleaveSmallLoopScalarReduction && HasReductions && VF == 1;
+  // Do not interleave loops with a relatively small known or estimated trip
+  // count. However, we do interleave them when ScalarReductionCond is
+  // satisfied, i.e. InterleaveSmallLoopScalarReduction is enabled and the
+  // loop has scalar reductions (HasReductions && VF == 1): interleaving can
+  // then expose ILP, break the cross-iteration dependences of the reductions,
+  // and benefit the SLP vectorizer in a later pass.
+  if (BestKnownTC && (*BestKnownTC < TinyTripCountInterleaveThreshold) &&
+      !ScalarReductionCond)
     return 1;
 
   RegisterUsage R = calculateRegisterUsage({VF})[0];
@@ -5340,7 +5354,7 @@
 
   // Interleave if we vectorized this loop and there is a reduction that could
   // benefit from interleaving.
-  if (VF > 1 && !Legal->getReductionVars().empty()) {
+  if (VF > 1 && HasReductions) {
     LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n");
     return IC;
   }
@@ -5352,7 +5366,11 @@
 
   // We want to interleave small loops in order to reduce the loop overhead and
   // potentially expose ILP opportunities.
-  LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n');
+  LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n'
+                    << "LV: IC is " << IC << '\n'
+                    << "LV: VF is " << VF << '\n');
+  const bool AggressivelyInterleaveReductions =
+      TTI.enableAggressiveInterleaving(HasReductions);
   if (!InterleavingRequiresRuntimePointerCheck && LoopCost < SmallLoopCost) {
     // We assume that the cost overhead is 1 and we use the cost model
     // to estimate the cost of the loop and interleave until the cost of the
@@ -5371,7 +5389,7 @@
     // by this point), we can increase the critical path length if the loop
     // we're interleaving is inside another loop. Limit, by default to 2, so the
     // critical path only gets increased by one reduction operation.
-    if (!Legal->getReductionVars().empty() && TheLoop->getLoopDepth() > 1) {
+    if (HasReductions && TheLoop->getLoopDepth() > 1) {
       unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC);
       SmallIC = std::min(SmallIC, F);
       StoresIC = std::min(StoresIC, F);
@@ -5385,14 +5403,23 @@
       return std::max(StoresIC, LoadsIC);
     }
 
-    LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
-    return SmallIC;
+    // If there are scalar reductions and TTI has enabled aggressive
+    // interleaving for reductions, we will interleave to expose ILP.
+    if (InterleaveSmallLoopScalarReduction && VF == 1 &&
+        AggressivelyInterleaveReductions) {
+      LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
+      // Interleave no less than SmallIC, but not as aggressively as the normal
+      // IC, to cover the rare situation where resources are too limited.
+      return std::max(IC / 2, SmallIC);
+    } else {
+      LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
+      return SmallIC;
+    }
   }
 
   // Interleave if this is a large loop (small loops are already dealt with by
   // this point) that could benefit from interleaving.
-  bool HasReductions = !Legal->getReductionVars().empty();
-  if (TTI.enableAggressiveInterleaving(HasReductions)) {
+  if (AggressivelyInterleaveReductions) {
     LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
     return IC;
   }
Index: llvm/test/Transforms/LoopVectorize/PowerPC/interleave_IC.ll
===================================================================
--- /dev/null
+++ llvm/test/Transforms/LoopVectorize/PowerPC/interleave_IC.ll
@@ -0,0 +1,93 @@
+; RUN: opt < %s -loop-vectorize -S -mcpu=pwr9 -interleave-small-loop-scalar-reduction=true 2>&1 | FileCheck %s
+; RUN: opt < %s -passes='loop-vectorize' -S -mcpu=pwr9 -interleave-small-loop-scalar-reduction=true 2>&1 | FileCheck %s
+
+;void fun(Vector &MatrixB,
+;         const Vector &MatrixA,
+;         const unsigned int * const start,
+;         const unsigned int * const end,
+;         const double * val) const
+;{
+;  const unsigned int N=MatrixB.size();
+;  MatrixB = MatrixA;
+;  for (unsigned int row=0; row
+%3 = type { %7*, i8* }
+%4 = type { %5 }
+%5 = type { %6 }
+%6 = type { i32**, i32**, i32** }
+%7 = type <{ %8, i32, i32, i32, [4 x i8], i64, i32, [4 x i8], i64*, i32*, i8, i8, [6 x i8] }>
+%8 = type { i32 (...)**, i32, %9, %16* }
+%9 = type { %10 }
+%10 = type { %11 }
+%11 = type { %12, %14 }
+%12 = type { %13 }
+%13 = type { i8 }
+%14 = type { %15, i64 }
+%15 = type { i32, %15*, %15*, %15* }
+%16 = type { i32 (...)**, i8* }
+%17 = type { %8, i32, i32, double* }
+
+$test = comdat any
+define dso_local void @test(%0* %arg, %17* dereferenceable(88) %arg1) comdat align 2 {
+  %tpm14 = getelementptr %0, %0* %arg, i64 0, i32 0, i32 3, i32 0, i32 0, i32 0
+  %tpm15 = load i32**, i32*** %tpm14, align 8
+  %tpm18 = getelementptr inbounds %17, %17* %arg1, i64 0, i32 3
+  %tpm19 = load double*, double** %tpm18, align 8
+  br label %bb22
+bb22:                                   ; preds = %bb33, %bb
+  %tpm26 = add i64 0, 1
+  %tpm27 = getelementptr inbounds i32, i32* null, i64 %tpm26
+  %tpm28 = getelementptr inbounds i32*, i32** %tpm15, i64 undef
+  %tpm29 = load i32*, i32** %tpm28, align 8
+  %tpm32 = getelementptr inbounds double, double* null, i64 %tpm26
+  br label %bb40
+bb33:                                   ; preds = %bb40
+  %tpm35 = getelementptr inbounds double, double* %tpm19, i64 undef
+  %tpm37 = fsub fast double 0.000000e+00, %tpm50
+  store double %tpm37, double* %tpm35, align 8
+  br label %bb22
+bb40:                                   ; preds = %bb40, %bb22
+  %tpm41 = phi i32* [ %tpm51, %bb40 ], [ %tpm27, %bb22 ]
+  %tpm42 = phi double* [ %tpm52, %bb40 ], [ %tpm32, %bb22 ]
+  %tpm43 = phi double [ %tpm50, %bb40 ], [ 0.000000e+00, %bb22 ]
+  %tpm44 = load double, double* %tpm42, align 8
+  %tpm45 = load i32, i32* %tpm41, align 4
+  %tpm46 = zext i32 %tpm45 to i64
+  %tpm47 = getelementptr inbounds double, double* %tpm19, i64 %tpm46
+  %tpm48 = load double, double* %tpm47, align 8
+  %tpm49 = fmul fast double %tpm48, %tpm44
+  %tpm50 = fadd fast double %tpm49, %tpm43
+  %tpm51 = getelementptr inbounds i32, i32* %tpm41, i64 1
+  %tpm52 = getelementptr inbounds double, double* %tpm42, i64 1
+  %tpm53 = icmp eq i32* %tpm51, %tpm29
+  br i1 %tpm53, label %bb33, label %bb40
+}
Index: llvm/test/Transforms/PhaseOrdering/interleave_LV_SLP.ll
===================================================================
--- /dev/null
+++ llvm/test/Transforms/PhaseOrdering/interleave_LV_SLP.ll
@@ -0,0 +1,174 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; REQUIRES: powerpc-registered-target
+; RUN: opt < %s -O2 -S -mcpu=pwr9 -interleave-small-loop-scalar-reduction=true 2>&1 | FileCheck %s
+; RUN: opt < %s -passes='default<O2>' -S -mcpu=pwr9 -interleave-small-loop-scalar-reduction=true 2>&1 | FileCheck %s
+
+;void fun(Vector &MatrixB,
+;         const Vector &MatrixA,
+;         const unsigned int * const start,
+;         const unsigned int * const end,
+;         const double * val) const
+;{
+;  const unsigned int N=MatrixB.size();
+;  MatrixB = MatrixA;
+;  for (unsigned int row=0; row
+%3 = type { %7*, i8* }
+%4 = type { %5 }
+%5 = type { %6 }
+%6 = type { i32**, i32**, i32** }
+%7 = type <{ %8, i32, i32, i32, [4 x i8], i64, i32, [4 x i8], i64*, i32*, i8, i8, [6 x i8] }>
+%8 = type { i32 (...)**, i32, %9, %16* }
+%9 = type { %10 }
+%10 = type { %11 }
+%11 = type { %12, %14 }
+%12 = type { %13 }
+%13 = type { i8 }
+%14 = type { %15, i64 }
+%15 = type { i32, %15*, %15*, %15* }
+%16 = type { i32 (...)**, i8* }
+%17 = type { %8, i32, i32, double* }
+
+$test = comdat any
+define dso_local void @test(%0* %arg, %17* dereferenceable(88) %arg1) comdat align 2 {
+; CHECK-LABEL: @test(
+; CHECK-NEXT:    [[TPM14:%.*]] = getelementptr [[TMP0:%.*]], %0* [[ARG:%.*]], i64 0, i32 0, i32 3, i32 0, i32 0, i32 0
+; CHECK-NEXT:    [[TPM15:%.*]] = load i32**, i32*** [[TPM14]], align 8
+; CHECK-NEXT:    [[TPM18:%.*]] = getelementptr inbounds [[TMP17:%.*]], %17* [[ARG1:%.*]], i64 0, i32 3
+; CHECK-NEXT:    [[TPM19:%.*]] = load double*, double** [[TPM18]], align 8
+; CHECK-NEXT:    [[TPM28:%.*]] = getelementptr inbounds i32*, i32** [[TPM15]], i64 undef
+; CHECK-NEXT:    [[TPM35:%.*]] = getelementptr inbounds double, double* [[TPM19]], i64 undef
+; CHECK-NEXT:    br label [[BB22:%.*]]
+; CHECK:       bb22:
+; CHECK-NEXT:    [[TPM29:%.*]] = load i32*, i32** [[TPM28]], align 8
+; CHECK-NEXT:    [[UGLYGEP17:%.*]] = getelementptr i32, i32* [[TPM29]], i64 -2
+; CHECK-NEXT:    [[UGLYGEP2:%.*]] = ptrtoint i32* [[UGLYGEP17]] to i64
+; CHECK-NEXT:    [[TMP1:%.*]] = lshr i64 [[UGLYGEP2]], 2
+; CHECK-NEXT:    [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i32* [[UGLYGEP17]], inttoptr (i64 12 to i32*)
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[BB40_PREHEADER:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       bb40.preheader:
+; CHECK-NEXT:    [[TPM41_PH:%.*]] = phi i32* [ inttoptr (i64 4 to i32*), [[BB22]] ], [ [[IND_END:%.*]], [[MIDDLE_BLOCK:%.*]] ]
+; CHECK-NEXT:    [[TPM42_PH:%.*]] = phi double* [ inttoptr (i64 8 to double*), [[BB22]] ], [ [[IND_END4:%.*]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[TPM43_PH:%.*]] = phi double [ 0.000000e+00, [[BB22]] ], [ [[BIN_RDX16:%.*]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    br label [[BB40:%.*]]
+; CHECK:       vector.ph:
+;
CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[TMP2]], 9223372036854775804 +; CHECK-NEXT: [[IND_END]] = getelementptr i32, i32* inttoptr (i64 4 to i32*), i64 [[N_VEC]] +; CHECK-NEXT: [[IND_END4]] = getelementptr double, double* inttoptr (i64 8 to double*), i64 [[N_VEC]] +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP3:%.*]] = phi <4 x double> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP30:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i32, i32* inttoptr (i64 4 to i32*), i64 [[INDEX]] +; CHECK-NEXT: [[TMP4:%.*]] = or i64 [[INDEX]], 1 +; CHECK-NEXT: [[NEXT_GEP5:%.*]] = getelementptr i32, i32* inttoptr (i64 4 to i32*), i64 [[TMP4]] +; CHECK-NEXT: [[TMP5:%.*]] = or i64 [[INDEX]], 2 +; CHECK-NEXT: [[NEXT_GEP6:%.*]] = getelementptr i32, i32* inttoptr (i64 4 to i32*), i64 [[TMP5]] +; CHECK-NEXT: [[TMP6:%.*]] = or i64 [[INDEX]], 3 +; CHECK-NEXT: [[NEXT_GEP7:%.*]] = getelementptr i32, i32* inttoptr (i64 4 to i32*), i64 [[TMP6]] +; CHECK-NEXT: [[NEXT_GEP8:%.*]] = getelementptr double, double* inttoptr (i64 8 to double*), i64 [[INDEX]] +; CHECK-NEXT: [[TMP7:%.*]] = bitcast double* [[NEXT_GEP8]] to <4 x double>* +; CHECK-NEXT: [[TMP8:%.*]] = load <4 x double>, <4 x double>* [[TMP7]], align 8 +; CHECK-NEXT: [[TMP9:%.*]] = load i32, i32* [[NEXT_GEP]], align 4 +; CHECK-NEXT: [[TMP10:%.*]] = load i32, i32* [[NEXT_GEP5]], align 4 +; CHECK-NEXT: [[TMP11:%.*]] = load i32, i32* [[NEXT_GEP6]], align 4 +; CHECK-NEXT: [[TMP12:%.*]] = load i32, i32* [[NEXT_GEP7]], align 4 +; CHECK-NEXT: [[TMP13:%.*]] = zext i32 [[TMP9]] to i64 +; CHECK-NEXT: [[TMP14:%.*]] = zext i32 [[TMP10]] to i64 +; CHECK-NEXT: [[TMP15:%.*]] = zext i32 [[TMP11]] to i64 +; CHECK-NEXT: [[TMP16:%.*]] = zext i32 [[TMP12]] to i64 +; CHECK-NEXT: [[TMP17]] = getelementptr inbounds double, double* [[TPM19]], i64 [[TMP13]] +; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds double, double* [[TPM19]], i64 [[TMP14]] +; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds double, double* [[TPM19]], i64 [[TMP15]] +; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds double, double* [[TPM19]], i64 [[TMP16]] +; CHECK-NEXT: [[TMP21:%.*]] = load double, double* [[TMP17]], align 8 +; CHECK-NEXT: [[TMP22:%.*]] = load double, double* [[TMP18]], align 8 +; CHECK-NEXT: [[TMP23:%.*]] = load double, double* [[TMP19]], align 8 +; CHECK-NEXT: [[TMP24:%.*]] = load double, double* [[TMP20]], align 8 +; CHECK-NEXT: [[TMP25:%.*]] = insertelement <4 x double> undef, double [[TMP21]], i32 0 +; CHECK-NEXT: [[TMP26:%.*]] = insertelement <4 x double> [[TMP25]], double [[TMP22]], i32 1 +; CHECK-NEXT: [[TMP27:%.*]] = insertelement <4 x double> [[TMP26]], double [[TMP23]], i32 2 +; CHECK-NEXT: [[TMP28:%.*]] = insertelement <4 x double> [[TMP27]], double [[TMP24]], i32 3 +; CHECK-NEXT: [[TMP29:%.*]] = fmul fast <4 x double> [[TMP28]], [[TMP8]] +; CHECK-NEXT: [[TMP30]] = fadd fast <4 x double> [[TMP29]], [[TMP3]] +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 +; CHECK-NEXT: [[TMP31:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP31]], label [[MIDDLE_BLOCK]], label [[VECTOR_BODY]], !llvm.loop !0 +; CHECK: middle.block: +; CHECK-NEXT: [[TMP32:%.*]] = extractelement <4 x double> [[TMP30]], i32 0 +; CHECK-NEXT: [[TMP33:%.*]] = extractelement <4 x double> [[TMP30]], i32 1 +; CHECK-NEXT: [[BIN_RDX:%.*]] = fadd fast double [[TMP33]], [[TMP32]] +; CHECK-NEXT: [[TMP34:%.*]] = extractelement <4 x double> 
[[TMP30]], i32 2 +; CHECK-NEXT: [[BIN_RDX15:%.*]] = fadd fast double [[TMP34]], [[BIN_RDX]] +; CHECK-NEXT: [[TMP35:%.*]] = extractelement <4 x double> [[TMP30]], i32 3 +; CHECK-NEXT: [[BIN_RDX16]] = fadd fast double [[TMP35]], [[BIN_RDX15]] +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label [[BB33:%.*]], label [[BB40_PREHEADER]] +; CHECK: bb33: +; CHECK-NEXT: [[TPM50_LCSSA:%.*]] = phi double [ [[BIN_RDX16]], [[MIDDLE_BLOCK]] ], [ [[TPM50:%.*]], [[BB40]] ] +; CHECK-NEXT: [[TPM37:%.*]] = fneg fast double [[TPM50_LCSSA]] +; CHECK-NEXT: store double [[TPM37]], double* [[TPM35]], align 8 +; CHECK-NEXT: br label [[BB22]] +; CHECK: bb40: +; CHECK-NEXT: [[TPM41:%.*]] = phi i32* [ [[TPM51:%.*]], [[BB40]] ], [ [[TPM41_PH]], [[BB40_PREHEADER]] ] +; CHECK-NEXT: [[TPM42:%.*]] = phi double* [ [[TPM52:%.*]], [[BB40]] ], [ [[TPM42_PH]], [[BB40_PREHEADER]] ] +; CHECK-NEXT: [[TPM43:%.*]] = phi double [ [[TPM50]], [[BB40]] ], [ [[TPM43_PH]], [[BB40_PREHEADER]] ] +; CHECK-NEXT: [[TPM44:%.*]] = load double, double* [[TPM42]], align 8 +; CHECK-NEXT: [[TPM45:%.*]] = load i32, i32* [[TPM41]], align 4 +; CHECK-NEXT: [[TPM46:%.*]] = zext i32 [[TPM45]] to i64 +; CHECK-NEXT: [[TPM47:%.*]] = getelementptr inbounds double, double* [[TPM19]], i64 [[TPM46]] +; CHECK-NEXT: [[TPM48:%.*]] = load double, double* [[TPM47]], align 8 +; CHECK-NEXT: [[TPM49:%.*]] = fmul fast double [[TPM48]], [[TPM44]] +; CHECK-NEXT: [[TPM50]] = fadd fast double [[TPM49]], [[TPM43]] +; CHECK-NEXT: [[TPM51]] = getelementptr inbounds i32, i32* [[TPM41]], i64 1 +; CHECK-NEXT: [[TPM52]] = getelementptr inbounds double, double* [[TPM42]], i64 1 +; CHECK-NEXT: [[TPM53:%.*]] = icmp eq i32* [[TPM51]], [[TPM29]] +; CHECK-NEXT: br i1 [[TPM53]], label [[BB33]], label [[BB40]], !llvm.loop !2 +; + %tpm14 = getelementptr %0, %0* %arg, i64 0, i32 0, i32 3, i32 0, i32 0, i32 0 + %tpm15 = load i32**, i32*** %tpm14, align 8 + %tpm18 = getelementptr inbounds %17, %17* %arg1, i64 0, i32 3 + %tpm19 = load double*, double** %tpm18, align 8 + br label %bb22 +bb22: ; preds = %bb33, %bb + %tpm26 = add i64 0, 1 + %tpm27 = getelementptr inbounds i32, i32* null, i64 %tpm26 + %tpm28 = getelementptr inbounds i32*, i32** %tpm15, i64 undef + %tpm29 = load i32*, i32** %tpm28, align 8 + %tpm32 = getelementptr inbounds double, double* null, i64 %tpm26 + br label %bb40 +bb33: ; preds = %bb40 + %tpm35 = getelementptr inbounds double, double* %tpm19, i64 undef + %tpm37 = fsub fast double 0.000000e+00, %tpm50 + store double %tpm37, double* %tpm35, align 8 + br label %bb22 +bb40: ; preds = %bb40, %bb22 + %tpm41 = phi i32* [ %tpm51, %bb40 ], [ %tpm27, %bb22 ] + %tpm42 = phi double* [ %tpm52, %bb40 ], [ %tpm32, %bb22 ] + %tpm43 = phi double [ %tpm50, %bb40 ], [ 0.000000e+00, %bb22 ] + %tpm44 = load double, double* %tpm42, align 8 + %tpm45 = load i32, i32* %tpm41, align 4 + %tpm46 = zext i32 %tpm45 to i64 + %tpm47 = getelementptr inbounds double, double* %tpm19, i64 %tpm46 + %tpm48 = load double, double* %tpm47, align 8 + %tpm49 = fmul fast double %tpm48, %tpm44 + %tpm50 = fadd fast double %tpm49, %tpm43 + %tpm51 = getelementptr inbounds i32, i32* %tpm41, i64 1 + %tpm52 = getelementptr inbounds double, double* %tpm42, i64 1 + %tpm53 = icmp eq i32* %tpm51, %tpm29 + br i1 %tpm53, label %bb33, label %bb40 +} Index: llvm/test/Transforms/PhaseOrdering/interleave_LV_SLP_false.ll =================================================================== --- /dev/null +++ llvm/test/Transforms/PhaseOrdering/interleave_LV_SLP_false.ll @@ -0,0 +1,220 @@ +; 
NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; REQUIRES: powerpc-registered-target
+; RUN: opt < %s -O2 -S -mcpu=pwr9 -interleave-small-loop-scalar-reduction=false 2>&1 | FileCheck %s
+; RUN: opt < %s -passes='default<O2>' -S -mcpu=pwr9 -interleave-small-loop-scalar-reduction=false 2>&1 | FileCheck %s
+
+;void fun(Vector &MatrixB,
+;         const Vector &MatrixA,
+;         const unsigned int * const start,
+;         const unsigned int * const end,
+;         const double * val) const
+;{
+;  const unsigned int N=MatrixB.size();
+;  MatrixB = MatrixA;
+;  for (unsigned int row=0; row
+%3 = type { %7*, i8* }
+%4 = type { %5 }
+%5 = type { %6 }
+%6 = type { i32**, i32**, i32** }
+%7 = type <{ %8, i32, i32, i32, [4 x i8], i64, i32, [4 x i8], i64*, i32*, i8, i8, [6 x i8] }>
+%8 = type { i32 (...)**, i32, %9, %16* }
+%9 = type { %10 }
+%10 = type { %11 }
+%11 = type { %12, %14 }
+%12 = type { %13 }
+%13 = type { i8 }
+%14 = type { %15, i64 }
+%15 = type { i32, %15*, %15*, %15* }
+%16 = type { i32 (...)**, i8* }
+%17 = type { %8, i32, i32, double* }
+
+$test = comdat any
+define dso_local void @test(%0* %arg, %17* dereferenceable(88) %arg1) comdat align 2 {
+; CHECK-LABEL: @test(
+; CHECK-NEXT:    [[TPM14:%.*]] = getelementptr [[TMP0:%.*]], %0* [[ARG:%.*]], i64 0, i32 0, i32 3, i32 0, i32 0, i32 0
+; CHECK-NEXT:    [[TPM15:%.*]] = load i32**, i32*** [[TPM14]], align 8
+; CHECK-NEXT:    [[TPM18:%.*]] = getelementptr inbounds [[TMP17:%.*]], %17* [[ARG1:%.*]], i64 0, i32 3
+; CHECK-NEXT:    [[TPM19:%.*]] = load double*, double** [[TPM18]], align 8
+; CHECK-NEXT:    [[TPM28:%.*]] = getelementptr inbounds i32*, i32** [[TPM15]], i64 undef
+; CHECK-NEXT:    [[TPM35:%.*]] = getelementptr inbounds double, double* [[TPM19]], i64 undef
+; CHECK-NEXT:    br label [[BB22:%.*]]
+; CHECK:       bb22:
+; CHECK-NEXT:    [[TPM29:%.*]] = load i32*, i32** [[TPM28]], align 8
+; CHECK-NEXT:    [[UGLYGEP9:%.*]] = getelementptr i32, i32* [[TPM29]], i64 -2
+; CHECK-NEXT:    [[UGLYGEP2:%.*]] = ptrtoint i32* [[UGLYGEP9]] to i64
+; CHECK-NEXT:    [[TMP1:%.*]] = lshr i64 [[UGLYGEP2]], 2
+; CHECK-NEXT:    [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp eq i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[BB40_PREHEADER:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       bb40.preheader:
+; CHECK-NEXT:    [[TPM41_PH:%.*]] = phi i32* [ inttoptr (i64 4 to i32*), [[BB22]] ], [ [[IND_END:%.*]], [[MIDDLE_BLOCK:%.*]] ]
+; CHECK-NEXT:    [[TPM42_PH:%.*]] = phi double* [ inttoptr (i64 8 to double*), [[BB22]] ], [ [[IND_END4:%.*]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[TPM43_PH:%.*]] = phi double [ 0.000000e+00, [[BB22]] ], [ [[BIN_RDX:%.*]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    br label [[BB40:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[N_VEC:%.*]] = and i64 [[TMP2]], 9223372036854775806
+; CHECK-NEXT:    [[IND_END]] = getelementptr i32, i32* inttoptr (i64 4 to i32*), i64 [[N_VEC]]
+; CHECK-NEXT:    [[IND_END4]] = getelementptr double, double* inttoptr (i64 8 to double*), i64 [[N_VEC]]
+; CHECK-NEXT:    [[TMP3:%.*]] = add nsw i64 [[N_VEC]], -2
+; CHECK-NEXT:    [[TMP4:%.*]] = lshr exact i64 [[TMP3]], 1
+; CHECK-NEXT:    [[TMP5:%.*]] = add nuw i64 [[TMP4]], 1
+; CHECK-NEXT:    [[XTRAITER:%.*]] = and i64 [[TMP5]], 1
+; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[TMP3]], 0
+; CHECK-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK_UNR_LCSSA:%.*]], label [[VECTOR_PH_NEW:%.*]]
+; CHECK:       vector.ph.new:
+; CHECK-NEXT:    [[UNROLL_ITER:%.*]] = and i64 [[TMP5]], -2
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0,
[[VECTOR_PH_NEW]] ], [ [[INDEX_NEXT_1:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi double [ 0.000000e+00, [[VECTOR_PH_NEW]] ], [ [[TMP37:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI8:%.*]] = phi double [ 0.000000e+00, [[VECTOR_PH_NEW]] ], [ [[TMP38:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[NITER:%.*]] = phi i64 [ [[UNROLL_ITER]], [[VECTOR_PH_NEW]] ], [ [[NITER_NSUB_1:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i32, i32* inttoptr (i64 4 to i32*), i64 [[INDEX]] +; CHECK-NEXT: [[TMP7:%.*]] = or i64 [[INDEX]], 1 +; CHECK-NEXT: [[NEXT_GEP5:%.*]] = getelementptr i32, i32* inttoptr (i64 4 to i32*), i64 [[TMP7]] +; CHECK-NEXT: [[NEXT_GEP6:%.*]] = getelementptr double, double* inttoptr (i64 8 to double*), i64 [[INDEX]] +; CHECK-NEXT: [[TMP8:%.*]] = or i64 [[INDEX]], 1 +; CHECK-NEXT: [[NEXT_GEP7:%.*]] = getelementptr double, double* inttoptr (i64 8 to double*), i64 [[TMP8]] +; CHECK-NEXT: [[TMP9:%.*]] = load double, double* [[NEXT_GEP6]], align 8 +; CHECK-NEXT: [[TMP10:%.*]] = load double, double* [[NEXT_GEP7]], align 8 +; CHECK-NEXT: [[TMP11:%.*]] = load i32, i32* [[NEXT_GEP]], align 4 +; CHECK-NEXT: [[TMP12:%.*]] = load i32, i32* [[NEXT_GEP5]], align 4 +; CHECK-NEXT: [[TMP13:%.*]] = zext i32 [[TMP11]] to i64 +; CHECK-NEXT: [[TMP14:%.*]] = zext i32 [[TMP12]] to i64 +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds double, double* [[TPM19]], i64 [[TMP13]] +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds double, double* [[TPM19]], i64 [[TMP14]] +; CHECK-NEXT: [[TMP17]] = load double, double* [[TMP15]], align 8 +; CHECK-NEXT: [[TMP18:%.*]] = load double, double* [[TMP16]], align 8 +; CHECK-NEXT: [[TMP19:%.*]] = fmul fast double [[TMP17]], [[TMP9]] +; CHECK-NEXT: [[TMP20:%.*]] = fmul fast double [[TMP18]], [[TMP10]] +; CHECK-NEXT: [[TMP21:%.*]] = fadd fast double [[TMP19]], [[VEC_PHI]] +; CHECK-NEXT: [[TMP22:%.*]] = fadd fast double [[TMP20]], [[VEC_PHI8]] +; CHECK-NEXT: [[INDEX_NEXT:%.*]] = or i64 [[INDEX]], 2 +; CHECK-NEXT: [[NEXT_GEP_1:%.*]] = getelementptr i32, i32* inttoptr (i64 4 to i32*), i64 [[INDEX_NEXT]] +; CHECK-NEXT: [[TMP23:%.*]] = or i64 [[INDEX]], 3 +; CHECK-NEXT: [[NEXT_GEP5_1:%.*]] = getelementptr i32, i32* inttoptr (i64 4 to i32*), i64 [[TMP23]] +; CHECK-NEXT: [[NEXT_GEP6_1:%.*]] = getelementptr double, double* inttoptr (i64 8 to double*), i64 [[INDEX_NEXT]] +; CHECK-NEXT: [[TMP24:%.*]] = or i64 [[INDEX]], 3 +; CHECK-NEXT: [[NEXT_GEP7_1:%.*]] = getelementptr double, double* inttoptr (i64 8 to double*), i64 [[TMP24]] +; CHECK-NEXT: [[TMP25:%.*]] = load double, double* [[NEXT_GEP6_1]], align 8 +; CHECK-NEXT: [[TMP26:%.*]] = load double, double* [[NEXT_GEP7_1]], align 8 +; CHECK-NEXT: [[TMP27:%.*]] = load i32, i32* [[NEXT_GEP_1]], align 4 +; CHECK-NEXT: [[TMP28:%.*]] = load i32, i32* [[NEXT_GEP5_1]], align 4 +; CHECK-NEXT: [[TMP29:%.*]] = zext i32 [[TMP27]] to i64 +; CHECK-NEXT: [[TMP30:%.*]] = zext i32 [[TMP28]] to i64 +; CHECK-NEXT: [[TMP31:%.*]] = getelementptr inbounds double, double* [[TPM19]], i64 [[TMP29]] +; CHECK-NEXT: [[TMP32:%.*]] = getelementptr inbounds double, double* [[TPM19]], i64 [[TMP30]] +; CHECK-NEXT: [[TMP33:%.*]] = load double, double* [[TMP31]], align 8 +; CHECK-NEXT: [[TMP34:%.*]] = load double, double* [[TMP32]], align 8 +; CHECK-NEXT: [[TMP35:%.*]] = fmul fast double [[TMP33]], [[TMP25]] +; CHECK-NEXT: [[TMP36:%.*]] = fmul fast double [[TMP34]], [[TMP26]] +; CHECK-NEXT: [[TMP37]] = fadd fast double [[TMP35]], [[TMP21]] +; CHECK-NEXT: [[TMP38]] = fadd fast double [[TMP36]], [[TMP22]] +; CHECK-NEXT: 
[[INDEX_NEXT_1]] = add i64 [[INDEX]], 4 +; CHECK-NEXT: [[NITER_NSUB_1]] = add i64 [[NITER]], -2 +; CHECK-NEXT: [[NITER_NCMP_1:%.*]] = icmp eq i64 [[NITER_NSUB_1]], 0 +; CHECK-NEXT: br i1 [[NITER_NCMP_1]], label [[MIDDLE_BLOCK_UNR_LCSSA]], label [[VECTOR_BODY]], !llvm.loop !0 +; CHECK: middle.block.unr-lcssa: +; CHECK-NEXT: [[DOTLCSSA10_PH:%.*]] = phi double [ undef, [[VECTOR_PH]] ], [ [[TMP37]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[DOTLCSSA_PH:%.*]] = phi double [ undef, [[VECTOR_PH]] ], [ [[TMP38]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[INDEX_UNR:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT_1]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI_UNR:%.*]] = phi double [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP37]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI8_UNR:%.*]] = phi double [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP38]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[LCMP_MOD:%.*]] = icmp eq i64 [[XTRAITER]], 0 +; CHECK-NEXT: br i1 [[LCMP_MOD]], label [[MIDDLE_BLOCK]], label [[MIDDLE_BLOCK_EPILOG_LCSSA:%.*]] +; CHECK: middle.block.epilog-lcssa: +; CHECK-NEXT: [[TMP39:%.*]] = or i64 [[INDEX_UNR]], 1 +; CHECK-NEXT: [[NEXT_GEP5_EPIL:%.*]] = getelementptr i32, i32* inttoptr (i64 4 to i32*), i64 [[TMP39]] +; CHECK-NEXT: [[TMP40:%.*]] = load i32, i32* [[NEXT_GEP5_EPIL]], align 4 +; CHECK-NEXT: [[TMP41:%.*]] = zext i32 [[TMP40]] to i64 +; CHECK-NEXT: [[TMP42:%.*]] = getelementptr inbounds double, double* [[TPM19]], i64 [[TMP41]] +; CHECK-NEXT: [[TMP43:%.*]] = load double, double* [[TMP42]], align 8 +; CHECK-NEXT: [[TMP44:%.*]] = or i64 [[INDEX_UNR]], 1 +; CHECK-NEXT: [[NEXT_GEP7_EPIL:%.*]] = getelementptr double, double* inttoptr (i64 8 to double*), i64 [[TMP44]] +; CHECK-NEXT: [[TMP45:%.*]] = load double, double* [[NEXT_GEP7_EPIL]], align 8 +; CHECK-NEXT: [[TMP46:%.*]] = fmul fast double [[TMP43]], [[TMP45]] +; CHECK-NEXT: [[TMP47:%.*]] = fadd fast double [[TMP46]], [[VEC_PHI8_UNR]] +; CHECK-NEXT: [[NEXT_GEP_EPIL:%.*]] = getelementptr i32, i32* inttoptr (i64 4 to i32*), i64 [[INDEX_UNR]] +; CHECK-NEXT: [[TMP48:%.*]] = load i32, i32* [[NEXT_GEP_EPIL]], align 4 +; CHECK-NEXT: [[TMP49:%.*]] = zext i32 [[TMP48]] to i64 +; CHECK-NEXT: [[TMP50:%.*]] = getelementptr inbounds double, double* [[TPM19]], i64 [[TMP49]] +; CHECK-NEXT: [[TMP51:%.*]] = load double, double* [[TMP50]], align 8 +; CHECK-NEXT: [[NEXT_GEP6_EPIL:%.*]] = getelementptr double, double* inttoptr (i64 8 to double*), i64 [[INDEX_UNR]] +; CHECK-NEXT: [[TMP52:%.*]] = load double, double* [[NEXT_GEP6_EPIL]], align 8 +; CHECK-NEXT: [[TMP53:%.*]] = fmul fast double [[TMP51]], [[TMP52]] +; CHECK-NEXT: [[TMP54:%.*]] = fadd fast double [[TMP53]], [[VEC_PHI_UNR]] +; CHECK-NEXT: br label [[MIDDLE_BLOCK]] +; CHECK: middle.block: +; CHECK-NEXT: [[DOTLCSSA10:%.*]] = phi double [ [[DOTLCSSA10_PH]], [[MIDDLE_BLOCK_UNR_LCSSA]] ], [ [[TMP54]], [[MIDDLE_BLOCK_EPILOG_LCSSA]] ] +; CHECK-NEXT: [[DOTLCSSA:%.*]] = phi double [ [[DOTLCSSA_PH]], [[MIDDLE_BLOCK_UNR_LCSSA]] ], [ [[TMP47]], [[MIDDLE_BLOCK_EPILOG_LCSSA]] ] +; CHECK-NEXT: [[BIN_RDX]] = fadd fast double [[DOTLCSSA]], [[DOTLCSSA10]] +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label [[BB33:%.*]], label [[BB40_PREHEADER]] +; CHECK: bb33: +; CHECK-NEXT: [[TPM50_LCSSA:%.*]] = phi double [ [[BIN_RDX]], [[MIDDLE_BLOCK]] ], [ [[TPM50:%.*]], [[BB40]] ] +; CHECK-NEXT: [[TPM37:%.*]] = fneg fast double [[TPM50_LCSSA]] +; CHECK-NEXT: store double [[TPM37]], double* [[TPM35]], align 8 +; CHECK-NEXT: br label [[BB22]] +; CHECK: bb40: +; CHECK-NEXT: [[TPM41:%.*]] = phi i32* [ 
[[TPM51:%.*]], [[BB40]] ], [ [[TPM41_PH]], [[BB40_PREHEADER]] ] +; CHECK-NEXT: [[TPM42:%.*]] = phi double* [ [[TPM52:%.*]], [[BB40]] ], [ [[TPM42_PH]], [[BB40_PREHEADER]] ] +; CHECK-NEXT: [[TPM43:%.*]] = phi double [ [[TPM50]], [[BB40]] ], [ [[TPM43_PH]], [[BB40_PREHEADER]] ] +; CHECK-NEXT: [[TPM44:%.*]] = load double, double* [[TPM42]], align 8 +; CHECK-NEXT: [[TPM45:%.*]] = load i32, i32* [[TPM41]], align 4 +; CHECK-NEXT: [[TPM46:%.*]] = zext i32 [[TPM45]] to i64 +; CHECK-NEXT: [[TPM47:%.*]] = getelementptr inbounds double, double* [[TPM19]], i64 [[TPM46]] +; CHECK-NEXT: [[TPM48:%.*]] = load double, double* [[TPM47]], align 8 +; CHECK-NEXT: [[TPM49:%.*]] = fmul fast double [[TPM48]], [[TPM44]] +; CHECK-NEXT: [[TPM50]] = fadd fast double [[TPM49]], [[TPM43]] +; CHECK-NEXT: [[TPM51]] = getelementptr inbounds i32, i32* [[TPM41]], i64 1 +; CHECK-NEXT: [[TPM52]] = getelementptr inbounds double, double* [[TPM42]], i64 1 +; CHECK-NEXT: [[TPM53:%.*]] = icmp eq i32* [[TPM51]], [[TPM29]] +; CHECK-NEXT: br i1 [[TPM53]], label [[BB33]], label [[BB40]], !llvm.loop !2 +; + %tpm14 = getelementptr %0, %0* %arg, i64 0, i32 0, i32 3, i32 0, i32 0, i32 0 + %tpm15 = load i32**, i32*** %tpm14, align 8 + %tpm18 = getelementptr inbounds %17, %17* %arg1, i64 0, i32 3 + %tpm19 = load double*, double** %tpm18, align 8 + br label %bb22 +bb22: ; preds = %bb33, %bb + %tpm26 = add i64 0, 1 + %tpm27 = getelementptr inbounds i32, i32* null, i64 %tpm26 + %tpm28 = getelementptr inbounds i32*, i32** %tpm15, i64 undef + %tpm29 = load i32*, i32** %tpm28, align 8 + %tpm32 = getelementptr inbounds double, double* null, i64 %tpm26 + br label %bb40 +bb33: ; preds = %bb40 + %tpm35 = getelementptr inbounds double, double* %tpm19, i64 undef + %tpm37 = fsub fast double 0.000000e+00, %tpm50 + store double %tpm37, double* %tpm35, align 8 + br label %bb22 +bb40: ; preds = %bb40, %bb22 + %tpm41 = phi i32* [ %tpm51, %bb40 ], [ %tpm27, %bb22 ] + %tpm42 = phi double* [ %tpm52, %bb40 ], [ %tpm32, %bb22 ] + %tpm43 = phi double [ %tpm50, %bb40 ], [ 0.000000e+00, %bb22 ] + %tpm44 = load double, double* %tpm42, align 8 + %tpm45 = load i32, i32* %tpm41, align 4 + %tpm46 = zext i32 %tpm45 to i64 + %tpm47 = getelementptr inbounds double, double* %tpm19, i64 %tpm46 + %tpm48 = load double, double* %tpm47, align 8 + %tpm49 = fmul fast double %tpm48, %tpm44 + %tpm50 = fadd fast double %tpm49, %tpm43 + %tpm51 = getelementptr inbounds i32, i32* %tpm41, i64 1 + %tpm52 = getelementptr inbounds double, double* %tpm42, i64 1 + %tpm53 = icmp eq i32* %tpm51, %tpm29 + br i1 %tpm53, label %bb33, label %bb40 +} Index: llvm/test/Transforms/SLPVectorizer/PowerPC/interleave_SLP.ll =================================================================== --- /dev/null +++ llvm/test/Transforms/SLPVectorizer/PowerPC/interleave_SLP.ll @@ -0,0 +1,213 @@ +; RUN: opt -S -mcpu=pwr9 -slp-vectorizer -interleave-small-loop-scalar-reduction=true < %s | FileCheck %s +; RUN: opt -S -mcpu=pwr9 -passes='slp-vectorizer' -interleave-small-loop-scalar-reduction=true < %s | FileCheck %s + +; CHECK-LABEL: vector.body + +; CHECK: load <4 x double>, <4 x double>* + +; CHECK: fmul fast <4 x double> + +; CHECK: fadd fast <4 x double> + +target datalayout = "e-m:e-i64:64-n32:64" +target triple = "powerpc64le-unknown-linux" + +%0 = type { i8 } +%1 = type { %2, %9 } +%2 = type { %3, i8, double, %5, %8* } +%3 = type <{ i32 (...)**, %4, double*, i32 }> +%4 = type { %8*, i8* } +%5 = type { %6 } +%6 = type { %7 } +%7 = type { i32**, i32**, i32** } +%8 = type <{ %9, i32, i32, i32, [4 x i8], i64, i32, 
[4 x i8], i64*, i32*, i8, i8, [6 x i8] }> +%9 = type { i32 (...)**, i32, %10, %17* } +%10 = type { %11 } +%11 = type { %12 } +%12 = type { %13, %15 } +%13 = type { %14 } +%14 = type { i8 } +%15 = type { %16, i64 } +%16 = type { i32, %16*, %16*, %16* } +%17 = type { i32 (...)**, i8* } +%18 = type { %9, i32, i32, double* } +%19 = type <{ i32 (...)**, %4, double*, i32, [4 x i8], %9 }> + +$test0 = comdat any + +@0 = internal global %0 zeroinitializer, align 1 +@__dso_handle = external hidden global i8 +@llvm.global_ctors = appending global [1 x { i32, void ()*, i8* }] [{ i32, void ()*, i8* } { i32 65535, void ()* @1, i8* null }] +declare void @test3(%0*) +declare void @test4(%0*) +; Function Attrs: nofree nounwind +declare i32 @__cxa_atexit(void (i8*)*, i8*, i8*) +define weak_odr dso_local void @test0(%1* %arg, %18* dereferenceable(88) %arg1, %18* dereferenceable(88) %arg2) local_unnamed_addr comdat align 2 { +bb: + %tpm = getelementptr inbounds %18, %18* %arg1, i64 0, i32 1 + %tpm3 = load i32, i32* %tpm, align 8 + %tpm4 = bitcast %1* %arg to %19* + %tpm5 = tail call dereferenceable(128) %8* @test1(%19* %tpm4) + %tpm6 = getelementptr inbounds %8, %8* %tpm5, i64 0, i32 8 + %tpm7 = load i64*, i64** %tpm6, align 8 + %tpm8 = tail call dereferenceable(128) %8* @test1(%19* %tpm4) + %tpm9 = getelementptr inbounds %8, %8* %tpm8, i64 0, i32 9 + %tpm10 = load i32*, i32** %tpm9, align 8 + %tpm102 = ptrtoint i32* %tpm10 to i64 + %tpm11 = tail call dereferenceable(88) %18* @test2(%18* nonnull %arg1, %18* nonnull dereferenceable(88) %arg2) + %tpm12 = icmp eq i32 %tpm3, 0 + br i1 %tpm12, label %bb21, label %bb13 + +bb13: ; preds = %bb + %tpm14 = getelementptr %1, %1* %arg, i64 0, i32 0, i32 3, i32 0, i32 0, i32 0 + %tpm15 = load i32**, i32*** %tpm14, align 8 + %tpm16 = getelementptr inbounds %1, %1* %arg, i64 0, i32 0, i32 0, i32 2 + %tpm17 = load double*, double** %tpm16, align 8 + %tpm18 = getelementptr inbounds %18, %18* %arg1, i64 0, i32 3 + %tpm19 = load double*, double** %tpm18, align 8 + %tpm20 = zext i32 %tpm3 to i64 + %0 = sub i64 0, %tpm102 + br label %bb22 + +bb21.loopexit: ; preds = %bb33 + br label %bb21 + +bb21: ; preds = %bb21.loopexit, %bb + ret void + +bb22: ; preds = %bb33, %bb13 + %tpm23 = phi i64 [ 0, %bb13 ], [ %tpm38, %bb33 ] + %tpm24 = getelementptr inbounds i64, i64* %tpm7, i64 %tpm23 + %tpm25 = load i64, i64* %tpm24, align 8 + %tpm26 = add i64 %tpm25, 1 + %tpm27 = getelementptr inbounds i32, i32* %tpm10, i64 %tpm26 + %tpm28 = getelementptr inbounds i32*, i32** %tpm15, i64 %tpm23 + %tpm29 = load i32*, i32** %tpm28, align 8 + %tpm30 = icmp eq i32* %tpm27, %tpm29 + br i1 %tpm30, label %bb33, label %bb31 + +bb31: ; preds = %bb22 + %tpm32 = getelementptr inbounds double, double* %tpm17, i64 %tpm26 + %scevgep = getelementptr i32, i32* %tpm29, i64 -2 + %scevgep1 = bitcast i32* %scevgep to i8* + %uglygep = getelementptr i8, i8* %scevgep1, i64 %0 + %1 = mul i64 %tpm25, -4 + %scevgep3 = getelementptr i8, i8* %uglygep, i64 %1 + %scevgep34 = ptrtoint i8* %scevgep3 to i64 + %2 = lshr i64 %scevgep34, 2 + %3 = add nuw nsw i64 %2, 1 + %min.iters.check = icmp ult i64 %3, 4 + br i1 %min.iters.check, label %scalar.ph, label %vector.ph + +vector.ph: ; preds = %bb31 + %n.mod.vf = urem i64 %3, 4 + %n.vec = sub i64 %3, %n.mod.vf + %ind.end = getelementptr i32, i32* %tpm27, i64 %n.vec + %ind.end6 = getelementptr double, double* %tpm32, i64 %n.vec + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %vec.phi = phi 
double [ 0.000000e+00, %vector.ph ], [ %36, %vector.body ] + %vec.phi14 = phi double [ 0.000000e+00, %vector.ph ], [ %37, %vector.body ] + %vec.phi15 = phi double [ 0.000000e+00, %vector.ph ], [ %38, %vector.body ] + %vec.phi16 = phi double [ 0.000000e+00, %vector.ph ], [ %39, %vector.body ] + %4 = add i64 %index, 0 + %next.gep = getelementptr i32, i32* %tpm27, i64 %4 + %5 = add i64 %index, 1 + %next.gep7 = getelementptr i32, i32* %tpm27, i64 %5 + %6 = add i64 %index, 2 + %next.gep8 = getelementptr i32, i32* %tpm27, i64 %6 + %7 = add i64 %index, 3 + %next.gep9 = getelementptr i32, i32* %tpm27, i64 %7 + %8 = add i64 %index, 0 + %next.gep10 = getelementptr double, double* %tpm32, i64 %8 + %9 = add i64 %index, 1 + %next.gep11 = getelementptr double, double* %tpm32, i64 %9 + %10 = add i64 %index, 2 + %next.gep12 = getelementptr double, double* %tpm32, i64 %10 + %11 = add i64 %index, 3 + %next.gep13 = getelementptr double, double* %tpm32, i64 %11 + %12 = load double, double* %next.gep10, align 8 + %13 = load double, double* %next.gep11, align 8 + %14 = load double, double* %next.gep12, align 8 + %15 = load double, double* %next.gep13, align 8 + %16 = load i32, i32* %next.gep, align 4 + %17 = load i32, i32* %next.gep7, align 4 + %18 = load i32, i32* %next.gep8, align 4 + %19 = load i32, i32* %next.gep9, align 4 + %20 = zext i32 %16 to i64 + %21 = zext i32 %17 to i64 + %22 = zext i32 %18 to i64 + %23 = zext i32 %19 to i64 + %24 = getelementptr inbounds double, double* %tpm19, i64 %20 + %25 = getelementptr inbounds double, double* %tpm19, i64 %21 + %26 = getelementptr inbounds double, double* %tpm19, i64 %22 + %27 = getelementptr inbounds double, double* %tpm19, i64 %23 + %28 = load double, double* %24, align 8 + %29 = load double, double* %25, align 8 + %30 = load double, double* %26, align 8 + %31 = load double, double* %27, align 8 + %32 = fmul fast double %28, %12 + %33 = fmul fast double %29, %13 + %34 = fmul fast double %30, %14 + %35 = fmul fast double %31, %15 + %36 = fadd fast double %32, %vec.phi + %37 = fadd fast double %33, %vec.phi14 + %38 = fadd fast double %34, %vec.phi15 + %39 = fadd fast double %35, %vec.phi16 + %index.next = add i64 %index, 4 + %40 = icmp eq i64 %index.next, %n.vec + br i1 %40, label %middle.block, label %vector.body + +middle.block: ; preds = %vector.body + %bin.rdx = fadd fast double %37, %36 + %bin.rdx17 = fadd fast double %38, %bin.rdx + %bin.rdx18 = fadd fast double %39, %bin.rdx17 + %cmp.n = icmp eq i64 %3, %n.vec + br i1 %cmp.n, label %bb33.loopexit, label %scalar.ph + +scalar.ph: ; preds = %middle.block, %bb31 + %bc.resume.val = phi i32* [ %ind.end, %middle.block ], [ %tpm27, %bb31 ] + %bc.resume.val5 = phi double* [ %ind.end6, %middle.block ], [ %tpm32, %bb31 ] + %bc.merge.rdx = phi double [ 0.000000e+00, %bb31 ], [ %bin.rdx18, %middle.block ] + br label %bb40 + +bb33.loopexit: ; preds = %middle.block, %bb40 + %tpm50.lcssa = phi double [ %tpm50, %bb40 ], [ %bin.rdx18, %middle.block ] + br label %bb33 + +bb33: ; preds = %bb33.loopexit, %bb22 + %tpm34 = phi double [ 0.000000e+00, %bb22 ], [ %tpm50.lcssa, %bb33.loopexit ] + %tpm35 = getelementptr inbounds double, double* %tpm19, i64 %tpm23 + %tpm36 = load double, double* %tpm35, align 8 + %tpm37 = fsub fast double %tpm36, %tpm34 + store double %tpm37, double* %tpm35, align 8 + %tpm38 = add nuw nsw i64 %tpm23, 1 + %tpm39 = icmp eq i64 %tpm38, %tpm20 + br i1 %tpm39, label %bb21.loopexit, label %bb22 + +bb40: ; preds = %bb40, %scalar.ph + %tpm41 = phi i32* [ %tpm51, %bb40 ], [ %bc.resume.val, %scalar.ph ] + 
%tpm42 = phi double* [ %tpm52, %bb40 ], [ %bc.resume.val5, %scalar.ph ] + %tpm43 = phi double [ %tpm50, %bb40 ], [ %bc.merge.rdx, %scalar.ph ] + %tpm44 = load double, double* %tpm42, align 8 + %tpm45 = load i32, i32* %tpm41, align 4 + %tpm46 = zext i32 %tpm45 to i64 + %tpm47 = getelementptr inbounds double, double* %tpm19, i64 %tpm46 + %tpm48 = load double, double* %tpm47, align 8 + %tpm49 = fmul fast double %tpm48, %tpm44 + %tpm50 = fadd fast double %tpm49, %tpm43 + %tpm51 = getelementptr inbounds i32, i32* %tpm41, i64 1 + %tpm52 = getelementptr inbounds double, double* %tpm42, i64 1 + %tpm53 = icmp eq i32* %tpm51, %tpm29 + br i1 %tpm53, label %bb33.loopexit, label %bb40 +} +declare dereferenceable(128) %8* @test1(%19*) +declare dereferenceable(88) %18* @test2(%18*, %18* dereferenceable(88)) +define internal void @1() section ".text.startup" { +bb: + tail call void @test3(%0* nonnull @0) + %tpm = tail call i32 @__cxa_atexit(void (i8*)* bitcast (void (%0*)* @test4 to void (i8*)*), i8* getelementptr inbounds (%0, %0* @0, i64 0, i32 0), i8* nonnull @__dso_handle) + ret void +}
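
For reference, a minimal C++ sketch (illustrative only, not taken from this patch or its tests) of the kind of loop the new flag targets: a short inner loop whose only cross-iteration dependence is a scalar floating-point reduction, fed by a gather-like access that usually keeps the loop vectorizer at VF == 1. The function and parameter names below are hypothetical.

// Illustrative sketch only. With -interleave-small-loop-scalar-reduction,
// the interleaved copies of 'sum' break the cross-iteration dependence and
// leave independent multiply-add chains for the SLP vectorizer to pack later.
double sparse_dot(const double *val, const unsigned *idx, const double *x,
                  unsigned n) {
  double sum = 0.0;
  for (unsigned i = 0; i < n; ++i)
    sum += val[i] * x[idx[i]]; // indirect load defeats profitable vector VFs
  return sum;
}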