diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -436,26 +436,6 @@ /// i32 6> /// %2 = mul <4 x i8> %1, %1 /// ret <4 x i8> %2 -/// We convert this initially to something like: -/// %x0 = extractelement <4 x i8> %x, i32 0 -/// %x3 = extractelement <4 x i8> %x, i32 3 -/// %y1 = extractelement <4 x i8> %y, i32 1 -/// %y2 = extractelement <4 x i8> %y, i32 2 -/// %1 = insertelement <4 x i8> poison, i8 %x0, i32 0 -/// %2 = insertelement <4 x i8> %1, i8 %x3, i32 1 -/// %3 = insertelement <4 x i8> %2, i8 %y1, i32 2 -/// %4 = insertelement <4 x i8> %3, i8 %y2, i32 3 -/// %5 = mul <4 x i8> %4, %4 -/// %6 = extractelement <4 x i8> %5, i32 0 -/// %ins1 = insertelement <4 x i8> poison, i8 %6, i32 0 -/// %7 = extractelement <4 x i8> %5, i32 1 -/// %ins2 = insertelement <4 x i8> %ins1, i8 %7, i32 1 -/// %8 = extractelement <4 x i8> %5, i32 2 -/// %ins3 = insertelement <4 x i8> %ins2, i8 %8, i32 2 -/// %9 = extractelement <4 x i8> %5, i32 3 -/// %ins4 = insertelement <4 x i8> %ins3, i8 %9, i32 3 -/// ret <4 x i8> %ins4 -/// InstCombiner transforms this into a shuffle and vector mul /// Mask will return the Shuffle Mask equivalent to the extracted elements. /// TODO: Can we split off and reuse the shuffle mask detection from /// ShuffleVectorInst/getShuffleCost? @@ -7505,6 +7485,14 @@ } return VecBase; } + /// Checks if the specified entry \p E needs to be delayed because of its + /// dependency nodes. + std::optional + needToDelay(const TreeEntry *, + ArrayRef>) const { + // No need to delay the cost estimation during analysis. + return std::nullopt; + } void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef Mask) { if (&E1 == &E2) { assert(all_of(Mask, @@ -7619,13 +7607,16 @@ if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem) CommonMask[Idx] = Mask[Idx] + VF; } - Value *gather(ArrayRef VL, Value *Root = nullptr) { + Value *gather(ArrayRef VL, unsigned MaskVF = 0, + Value *Root = nullptr) { Cost += getBuildVectorCost(VL, Root); if (!Root) { - assert(InVectors.empty() && "Unexpected input vectors for buildvector."); // FIXME: Need to find a way to avoid use of getNullValue here. SmallVector Vals; - for (Value *V : VL) { + unsigned VF = VL.size(); + if (MaskVF != 0) + VF = std::min(VF, MaskVF); + for (Value *V : VL.take_front(VF)) { if (isa(V)) { Vals.push_back(cast(V)); continue; @@ -7635,9 +7626,11 @@ return ConstantVector::get(Vals); } return ConstantVector::getSplat( - ElementCount::getFixed(VL.size()), + ElementCount::getFixed( + cast(Root->getType())->getNumElements()), getAllOnesValue(*R.DL, VL.front()->getType())); } + InstructionCost createFreeze(InstructionCost Cost) { return Cost; } /// Finalize emission of the shuffles. InstructionCost finalize(ArrayRef ExtMask, unsigned VF = 0, @@ -7659,8 +7652,10 @@ InVectors.front() = V; } ::addMask(CommonMask, ExtMask, /*ExtendingManyInputs=*/true); - if (CommonMask.empty()) + if (CommonMask.empty()) { + assert(InVectors.size() == 1 && "Expected only one vector with no mask"); return Cost; + } return Cost + createShuffle(InVectors.front(), InVectors.size() == 2 ? InVectors.back() : nullptr, @@ -7737,189 +7732,8 @@ return 0; if (isa(VL[0])) return InstructionCost::getInvalid(); - ShuffleCostEstimator Estimator(*TTI, VectorizedVals, *this, - CheckedExtracts); - unsigned VF = E->getVectorFactor(); - SmallVector ReuseShuffleIndicies(E->ReuseShuffleIndices.begin(), - E->ReuseShuffleIndices.end()); - SmallVector GatheredScalars(E->Scalars.begin(), E->Scalars.end()); - // Build a mask out of the reorder indices and reorder scalars per this - // mask. - SmallVector ReorderMask; - inversePermutation(E->ReorderIndices, ReorderMask); - if (!ReorderMask.empty()) - reorderScalars(GatheredScalars, ReorderMask); - SmallVector Mask; - SmallVector ExtractMask; - Value *ExtractVecBase = nullptr; - bool UseVecBaseAsInput = false; - SmallVector> GatherShuffles; - SmallVector> Entries; - SmallVector> ExtractShuffles; - // Check for gathered extracts. - bool Resized = false; - unsigned NumParts = TTI->getNumberOfParts(VecTy); - if (NumParts == 0 || NumParts >= GatheredScalars.size()) - NumParts = 1; - if (!all_of(GatheredScalars, UndefValue::classof)) { - ExtractShuffles = - tryToGatherExtractElements(GatheredScalars, ExtractMask, NumParts); - if (!ExtractShuffles.empty()) { - if (Value *VecBase = Estimator.adjustExtracts( - E, ExtractMask, ExtractShuffles, NumParts, UseVecBaseAsInput)) { - if (auto *VecBaseTy = dyn_cast(VecBase->getType())) - if (VF == VecBaseTy->getNumElements() && - GatheredScalars.size() != VF) { - Resized = true; - GatheredScalars.append(VF - GatheredScalars.size(), - PoisonValue::get(ScalarTy)); - } - } - } - - // Do not try to look for reshuffled loads for gathered loads (they will - // be handled later), for vectorized scalars, and cases, which are - // definitely not profitable (splats and small gather nodes.) - if (!ExtractShuffles.empty() || E->getOpcode() != Instruction::Load || - E->isAltShuffle() || - all_of(E->Scalars, [this](Value *V) { return getTreeEntry(V); }) || - isSplat(E->Scalars) || - (E->Scalars != GatheredScalars && GatheredScalars.size() <= 2)) - GatherShuffles = - isGatherShuffledEntry(E, GatheredScalars, Mask, Entries, NumParts); - } - if (!GatherShuffles.empty()) { - if (GatherShuffles.size() == 1 && - *GatherShuffles.front() == TTI::SK_PermuteSingleSrc && - Entries.front().front()->isSame(E->Scalars)) { - // Perfect match in the graph, will reuse the previously vectorized - // node. Cost is 0. - LLVM_DEBUG( - dbgs() - << "SLP: perfect diamond match for gather bundle " - << shortBundleName(VL) << ".\n"); - // Restore the mask for previous partially matched values. - Mask.resize(E->Scalars.size()); - const TreeEntry *FrontTE = Entries.front().front(); - if (FrontTE->ReorderIndices.empty() && - ((FrontTE->ReuseShuffleIndices.empty() && - E->Scalars.size() == FrontTE->Scalars.size()) || - (E->Scalars.size() == FrontTE->ReuseShuffleIndices.size()))) { - std::iota(Mask.begin(), Mask.end(), 0); - } else { - for (auto [I, V] : enumerate(E->Scalars)) { - if (isa(V)) { - Mask[I] = PoisonMaskElem; - continue; - } - Mask[I] = FrontTE->findLaneForValue(V); - } - } - Estimator.add(*FrontTE, Mask); - return Estimator.finalize(E->getCommonMask()); - } - if (!Resized) { - if (GatheredScalars.size() != VF && - any_of(Entries, [&](ArrayRef TEs) { - return any_of(TEs, [&](const TreeEntry *TE) { - return TE->getVectorFactor() == VF; - }); - })) - GatheredScalars.append(VF - GatheredScalars.size(), - PoisonValue::get(ScalarTy)); - } - // Remove shuffled elements from list of gathers. - for (int I = 0, Sz = Mask.size(); I < Sz; ++I) { - if (Mask[I] != PoisonMaskElem) - GatheredScalars[I] = PoisonValue::get(ScalarTy); - } - LLVM_DEBUG(dbgs() << "SLP: shuffled " << Entries.size() - << " entries for bundle " - << shortBundleName(VL) << ".\n"); - unsigned SliceSize = E->Scalars.size() / NumParts; - SmallVector VecMask(Mask.size(), PoisonMaskElem); - for (const auto [I, TEs] : enumerate(Entries)) { - if (TEs.empty()) { - assert(!GatherShuffles[I] && - "No shuffles with empty entries list expected."); - continue; - } - assert((TEs.size() == 1 || TEs.size() == 2) && - "Expected shuffle of 1 or 2 entries."); - auto SubMask = ArrayRef(Mask).slice(I * SliceSize, SliceSize); - VecMask.assign(VecMask.size(), PoisonMaskElem); - copy(SubMask, std::next(VecMask.begin(), I * SliceSize)); - Estimator.add(*TEs.front(), *TEs.back(), VecMask); - } - if (all_of(GatheredScalars, PoisonValue ::classof)) - return Estimator.finalize(E->ReuseShuffleIndices); - return Estimator.finalize( - E->ReuseShuffleIndices, E->Scalars.size(), - [&](Value *&Vec, SmallVectorImpl &Mask) { - Vec = Estimator.gather(GatheredScalars, - Constant::getNullValue(FixedVectorType::get( - ScalarTy, GatheredScalars.size()))); - }); - } - if (!ExtractShuffles.empty()) { - Value *Vec1 = nullptr; - // Gather of extractelements can be represented as just a shuffle of - // a single/two vectors the scalars are extracted from. - // Find input vectors. - Value *Vec2 = nullptr; - for (unsigned I = 0, Sz = ExtractMask.size(); I < Sz; ++I) { - if (!Mask.empty() && Mask[I] != PoisonMaskElem) - ExtractMask[I] = PoisonMaskElem; - } - if (UseVecBaseAsInput) { - Vec1 = ExtractVecBase; - } else { - for (unsigned I = 0, Sz = ExtractMask.size(); I < Sz; ++I) { - if (ExtractMask[I] == PoisonMaskElem) - continue; - if (isa(E->Scalars[I])) - continue; - auto *EI = cast(E->Scalars[I]); - Value *VecOp = EI->getVectorOperand(); - if (const auto *TE = getTreeEntry(VecOp)) - if (TE->VectorizedValue) - VecOp = TE->VectorizedValue; - if (!Vec1) { - Vec1 = VecOp; - } else if (Vec1 != EI->getVectorOperand()) { - assert((!Vec2 || Vec2 == EI->getVectorOperand()) && - "Expected only 1 or 2 vectors shuffle."); - Vec2 = VecOp; - } - } - } - if (Vec2) { - Estimator.add(Vec1, Vec2, ExtractMask); - } else if (Vec1) { - Estimator.add(Vec1, ExtractMask, /*ForExtracts=*/true); - } else { - Estimator.add(PoisonValue::get(FixedVectorType::get( - ScalarTy, GatheredScalars.size())), - ExtractMask, /*ForExtracts=*/true); - } - } - if (!all_of(GatheredScalars, PoisonValue::classof)) { - auto Gathers = ArrayRef(GatheredScalars).take_front(VL.size()); - bool SameGathers = VL.equals(Gathers); - if (!SameGathers) - return Estimator.finalize( - E->ReuseShuffleIndices, E->Scalars.size(), - [&](Value *&Vec, SmallVectorImpl &Mask) { - Vec = Estimator.gather( - GatheredScalars, Constant::getNullValue(FixedVectorType::get( - ScalarTy, GatheredScalars.size()))); - }); - Value *BV = Estimator.gather(Gathers); - SmallVector ReuseMask(Gathers.size(), PoisonMaskElem); - std::iota(ReuseMask.begin(), ReuseMask.end(), 0); - Estimator.add(BV, ReuseMask); - } - return Estimator.finalize(E->ReuseShuffleIndices); + return processBuildVector( + E, *TTI, VectorizedVals, *this, CheckedExtracts); } InstructionCost CommonCost = 0; SmallVector Mask; @@ -10337,6 +10151,7 @@ /// Adjusts extractelements after reusing them. Value *adjustExtracts(const TreeEntry *E, MutableArrayRef Mask, + ArrayRef> ShuffleKinds, unsigned NumParts, bool &UseVecBaseAsInput) { UseVecBaseAsInput = false; SmallPtrSet UniqueBases; @@ -10441,14 +10256,15 @@ } /// Checks if the specified entry \p E needs to be delayed because of its /// dependency nodes. - Value *needToDelay(const TreeEntry *E, - ArrayRef> Deps) const { + std::optional + needToDelay(const TreeEntry *E, + ArrayRef> Deps) const { // No need to delay emission if all deps are ready. if (all_of(Deps, [](ArrayRef TEs) { return all_of( TEs, [](const TreeEntry *TE) { return TE->VectorizedValue; }); })) - return nullptr; + return std::nullopt; // Postpone gather emission, will be emitted after the end of the // process to keep correct order. auto *VecTy = FixedVectorType::get(E->Scalars.front()->getType(), @@ -10558,7 +10374,8 @@ inversePermutation(Order, NewMask); add(V1, NewMask); } - Value *gather(ArrayRef VL, Value *Root = nullptr) { + Value *gather(ArrayRef VL, unsigned MaskVF = 0, + Value *Root = nullptr) { return R.gather(VL, Root); } Value *createFreeze(Value *V) { return Builder.CreateFreeze(V); } @@ -10819,15 +10636,16 @@ cast(E->Scalars[Idx])->getVectorOperand())) ExtractEntries.push_back(TE); } - if (Value *Delayed = ShuffleBuilder.needToDelay(E, ExtractEntries)) { + if (std::optional Delayed = + ShuffleBuilder.needToDelay(E, ExtractEntries)) { // Delay emission of gathers which are not ready yet. PostponedGathers.insert(E); // Postpone gather emission, will be emitted after the end of the // process to keep correct order. - return Delayed; + return *Delayed; } if (Value *VecBase = ShuffleBuilder.adjustExtracts( - E, ExtractMask, NumParts, UseVecBaseAsInput)) { + E, ExtractMask, ExtractShuffles, NumParts, UseVecBaseAsInput)) { ExtractVecBase = VecBase; if (auto *VecBaseTy = dyn_cast(VecBase->getType())) if (VF == VecBaseTy->getNumElements() && @@ -10848,12 +10666,13 @@ isGatherShuffledEntry(E, GatheredScalars, Mask, Entries, NumParts); } if (!GatherShuffles.empty()) { - if (Value *Delayed = ShuffleBuilder.needToDelay(E, Entries)) { + if (std::optional Delayed = + ShuffleBuilder.needToDelay(E, Entries)) { // Delay emission of gathers which are not ready yet. PostponedGathers.insert(E); // Postpone gather emission, will be emitted after the end of the // process to keep correct order. - return Delayed; + return *Delayed; } if (GatherShuffles.size() == 1 && *GatherShuffles.front() == TTI::SK_PermuteSingleSrc && @@ -11062,14 +10881,16 @@ IsUsedInExpr &= FindReusedSplat(VecMask, TEs.front()->getVectorFactor()); ShuffleBuilder.add(*TEs.front(), VecMask); - IsNonPoisoned &= - isGuaranteedNotToBePoison(TEs.front()->VectorizedValue); + if (TEs.front()->VectorizedValue) + IsNonPoisoned &= + isGuaranteedNotToBePoison(TEs.front()->VectorizedValue); } else { IsUsedInExpr = false; ShuffleBuilder.add(*TEs.front(), *TEs.back(), VecMask); - IsNonPoisoned &= - isGuaranteedNotToBePoison(TEs.front()->VectorizedValue) && - isGuaranteedNotToBePoison(TEs.back()->VectorizedValue); + if (TEs.front()->VectorizedValue && TEs.back()->VectorizedValue) + IsNonPoisoned &= + isGuaranteedNotToBePoison(TEs.front()->VectorizedValue) && + isGuaranteedNotToBePoison(TEs.back()->VectorizedValue); } } } @@ -11128,7 +10949,7 @@ if (!all_of(GatheredScalars, PoisonValue::classof)) { SmallVector BVMask(GatheredScalars.size(), PoisonMaskElem); TryPackScalars(GatheredScalars, BVMask, /*IsRootPoison=*/true); - Value *BV = ShuffleBuilder.gather(GatheredScalars); + Value *BV = ShuffleBuilder.gather(GatheredScalars, BVMask.size()); ShuffleBuilder.add(BV, BVMask); } if (all_of(NonConstants, [=](Value *V) { @@ -11142,13 +10963,13 @@ E->ReuseShuffleIndices, E->Scalars.size(), [&](Value *&Vec, SmallVectorImpl &Mask) { TryPackScalars(NonConstants, Mask, /*IsRootPoison=*/false); - Vec = ShuffleBuilder.gather(NonConstants, Vec); + Vec = ShuffleBuilder.gather(NonConstants, Mask.size(), Vec); }); } else if (!allConstant(GatheredScalars)) { // Gather unique scalars and all constants. SmallVector ReuseMask(GatheredScalars.size(), PoisonMaskElem); TryPackScalars(GatheredScalars, ReuseMask, /*IsRootPoison=*/true); - Value *BV = ShuffleBuilder.gather(GatheredScalars); + Value *BV = ShuffleBuilder.gather(GatheredScalars, ReuseMask.size()); ShuffleBuilder.add(BV, ReuseMask); Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices); } else { diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/scalarization-overhead.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/scalarization-overhead.ll --- a/llvm/test/Transforms/SLPVectorizer/AArch64/scalarization-overhead.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/scalarization-overhead.ll @@ -6,58 +6,36 @@ define fastcc i64 @zot(float %arg, float %arg1, float %arg2, float %arg3, float %arg4, ptr %arg5, i1 %arg6, i1 %arg7, i1 %arg8) { ; CHECK-LABEL: @zot( ; CHECK-NEXT: bb: -; CHECK-NEXT: [[VAL:%.*]] = fmul fast float 0.000000e+00, 0.000000e+00 -; CHECK-NEXT: [[VAL9:%.*]] = fmul fast float 0.000000e+00, [[ARG:%.*]] -; CHECK-NEXT: [[VAL10:%.*]] = fmul fast float [[ARG3:%.*]], 1.000000e+00 -; CHECK-NEXT: [[VAL11:%.*]] = fmul fast float [[ARG3]], 1.000000e+00 +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x float> , float [[ARG:%.*]], i32 1 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x float> [[TMP0]], float [[ARG3:%.*]], i32 2 +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <4 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = fmul fast <4 x float> , [[TMP2]] ; CHECK-NEXT: [[VAL12:%.*]] = fadd fast float [[ARG3]], 1.000000e+00 -; CHECK-NEXT: [[VAL13:%.*]] = fadd fast float [[VAL12]], 2.000000e+00 -; CHECK-NEXT: [[VAL14:%.*]] = fadd fast float 0.000000e+00, 0.000000e+00 -; CHECK-NEXT: [[VAL15:%.*]] = fadd fast float [[VAL14]], 1.000000e+00 -; CHECK-NEXT: [[VAL16:%.*]] = fadd fast float [[ARG3]], 1.000000e+00 -; CHECK-NEXT: [[VAL17:%.*]] = fadd fast float [[ARG3]], 1.000000e+00 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x float> [[TMP2]], float [[VAL12]], i32 0 +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x float> [[TMP4]], float 0.000000e+00, i32 1 +; CHECK-NEXT: [[TMP6:%.*]] = fadd fast <4 x float> [[TMP5]], ; CHECK-NEXT: br i1 [[ARG6:%.*]], label [[BB18:%.*]], label [[BB57:%.*]] ; CHECK: bb18: -; CHECK-NEXT: [[VAL19:%.*]] = phi float [ [[VAL13]], [[BB:%.*]] ] -; CHECK-NEXT: [[VAL20:%.*]] = phi float [ [[VAL15]], [[BB]] ] -; CHECK-NEXT: [[VAL21:%.*]] = phi float [ [[VAL16]], [[BB]] ] -; CHECK-NEXT: [[VAL22:%.*]] = phi float [ [[VAL17]], [[BB]] ] -; CHECK-NEXT: [[VAL23:%.*]] = fmul fast float [[VAL16]], 2.000000e+00 -; CHECK-NEXT: [[VAL24:%.*]] = fmul fast float [[VAL17]], 3.000000e+00 +; CHECK-NEXT: [[TMP7:%.*]] = phi <4 x float> [ [[TMP6]], [[BB:%.*]] ] +; CHECK-NEXT: [[TMP8:%.*]] = extractelement <4 x float> [[TMP6]], i32 2 +; CHECK-NEXT: [[VAL23:%.*]] = fmul fast float [[TMP8]], 2.000000e+00 +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x float> [[TMP6]], i32 3 +; CHECK-NEXT: [[VAL24:%.*]] = fmul fast float [[TMP9]], 3.000000e+00 ; CHECK-NEXT: br i1 [[ARG7:%.*]], label [[BB25:%.*]], label [[BB57]] ; CHECK: bb25: -; CHECK-NEXT: [[VAL26:%.*]] = phi float [ [[VAL19]], [[BB18]] ] -; CHECK-NEXT: [[VAL27:%.*]] = phi float [ [[VAL20]], [[BB18]] ] -; CHECK-NEXT: [[VAL28:%.*]] = phi float [ [[VAL21]], [[BB18]] ] -; CHECK-NEXT: [[VAL29:%.*]] = phi float [ [[VAL22]], [[BB18]] ] +; CHECK-NEXT: [[TMP10:%.*]] = phi <4 x float> [ [[TMP7]], [[BB18]] ] +; CHECK-NEXT: [[TMP11:%.*]] = extractelement <4 x float> [[TMP3]], i32 1 ; CHECK-NEXT: br label [[BB30:%.*]] ; CHECK: bb30: ; CHECK-NEXT: [[VAL31:%.*]] = phi float [ [[VAL55:%.*]], [[BB30]] ], [ 0.000000e+00, [[BB25]] ] -; CHECK-NEXT: [[VAL32:%.*]] = phi float [ [[VAL9]], [[BB30]] ], [ 0.000000e+00, [[BB25]] ] -; CHECK-NEXT: [[VAL33:%.*]] = load i8, ptr [[ARG5:%.*]], align 1 -; CHECK-NEXT: [[VAL34:%.*]] = uitofp i8 [[VAL33]] to float -; CHECK-NEXT: [[VAL35:%.*]] = getelementptr inbounds i8, ptr [[ARG5]], i64 1 -; CHECK-NEXT: [[VAL36:%.*]] = load i8, ptr [[VAL35]], align 1 -; CHECK-NEXT: [[VAL37:%.*]] = uitofp i8 [[VAL36]] to float -; CHECK-NEXT: [[VAL38:%.*]] = getelementptr inbounds i8, ptr [[ARG5]], i64 2 -; CHECK-NEXT: [[VAL39:%.*]] = load i8, ptr [[VAL38]], align 1 -; CHECK-NEXT: [[VAL40:%.*]] = uitofp i8 [[VAL39]] to float -; CHECK-NEXT: [[VAL41:%.*]] = getelementptr inbounds i8, ptr [[ARG5]], i64 3 -; CHECK-NEXT: [[VAL42:%.*]] = load i8, ptr [[VAL41]], align 1 -; CHECK-NEXT: [[VAL43:%.*]] = uitofp i8 [[VAL42]] to float -; CHECK-NEXT: [[VAL44:%.*]] = fsub fast float [[VAL34]], [[VAL]] -; CHECK-NEXT: [[VAL45:%.*]] = fsub fast float [[VAL37]], [[VAL9]] -; CHECK-NEXT: [[VAL46:%.*]] = fsub fast float [[VAL40]], [[VAL10]] -; CHECK-NEXT: [[VAL47:%.*]] = fsub fast float [[VAL43]], [[VAL11]] -; CHECK-NEXT: [[VAL48:%.*]] = fmul fast float [[VAL44]], [[VAL26]] -; CHECK-NEXT: [[VAL49:%.*]] = fmul fast float [[VAL45]], [[VAL27]] -; CHECK-NEXT: [[VAL50:%.*]] = fadd fast float [[VAL49]], [[VAL48]] -; CHECK-NEXT: [[VAL51:%.*]] = fmul fast float [[VAL46]], [[VAL28]] -; CHECK-NEXT: [[VAL52:%.*]] = fadd fast float [[VAL50]], [[VAL51]] -; CHECK-NEXT: [[VAL53:%.*]] = fmul fast float [[VAL47]], [[VAL29]] -; CHECK-NEXT: [[VAL54:%.*]] = fadd fast float [[VAL52]], [[VAL53]] +; CHECK-NEXT: [[VAL32:%.*]] = phi float [ [[TMP11]], [[BB30]] ], [ 0.000000e+00, [[BB25]] ] +; CHECK-NEXT: [[TMP12:%.*]] = load <4 x i8>, ptr [[ARG5:%.*]], align 1 +; CHECK-NEXT: [[TMP13:%.*]] = uitofp <4 x i8> [[TMP12]] to <4 x float> +; CHECK-NEXT: [[TMP14:%.*]] = fsub fast <4 x float> [[TMP13]], [[TMP3]] +; CHECK-NEXT: [[TMP15:%.*]] = fmul fast <4 x float> [[TMP14]], [[TMP10]] +; CHECK-NEXT: [[TMP16:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP15]]) ; CHECK-NEXT: [[VAL55]] = tail call fast float @llvm.minnum.f32(float [[VAL31]], float [[ARG1:%.*]]) -; CHECK-NEXT: [[VAL56:%.*]] = tail call fast float @llvm.maxnum.f32(float [[ARG2:%.*]], float [[VAL54]]) +; CHECK-NEXT: [[VAL56:%.*]] = tail call fast float @llvm.maxnum.f32(float [[ARG2:%.*]], float [[TMP16]]) ; CHECK-NEXT: call void @ham(float [[VAL55]], float [[VAL56]]) ; CHECK-NEXT: br i1 [[ARG8:%.*]], label [[BB30]], label [[BB57]] ; CHECK: bb57: