diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h --- a/llvm/lib/Target/AMDGPU/AMDGPU.h +++ b/llvm/lib/Target/AMDGPU/AMDGPU.h @@ -56,6 +56,7 @@ FunctionPass *createSIWholeQuadModePass(); FunctionPass *createSIFixControlFlowLiveIntervalsPass(); FunctionPass *createSIOptimizeExecMaskingPreRAPass(); +FunctionPass *createSIOptimizeVGPRLiveRangePass(); FunctionPass *createSIFixSGPRCopiesPass(); FunctionPass *createSIMemoryLegalizerPass(); FunctionPass *createSIInsertWaitcntsPass(); @@ -297,6 +298,9 @@ void initializeSIOptimizeExecMaskingPreRAPass(PassRegistry&); extern char &SIOptimizeExecMaskingPreRAID; +void initializeSIOptimizeVGPRLiveRangePass(PassRegistry &); +extern char &SIOptimizeVGPRLiveRangeID; + void initializeAMDGPUAnnotateUniformValuesPass(PassRegistry&); extern char &AMDGPUAnnotateUniformValuesPassID; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -162,6 +162,11 @@ cl::init(true), cl::Hidden); +static cl::opt OptVGPRLiveRange( + "amdgpu-opt-vgpr-liverange", + cl::desc("Enable VGPR liverange optimizations for if-else structure"), + cl::init(true), cl::Hidden); + // Enable atomic optimization static cl::opt EnableAtomicOptimizations( "amdgpu-atomic-optimizations", @@ -225,6 +230,7 @@ initializeSIPeepholeSDWAPass(*PR); initializeSIShrinkInstructionsPass(*PR); initializeSIOptimizeExecMaskingPreRAPass(*PR); + initializeSIOptimizeVGPRLiveRangePass(*PR); initializeSILoadStoreOptimizerPass(*PR); initializeAMDGPUFixFunctionBitcastsPass(*PR); initializeAMDGPUAlwaysInlinePass(*PR); @@ -1190,6 +1196,12 @@ if (TM->getOptLevel() > CodeGenOpt::Less) insertPass(&MachineSchedulerID, &SIFormMemoryClausesID); + // FIXME: when an instruction has a Killed operand, and the instruction is + // inside a bundle, seems only the BUNDLE instruction appears as the Kills of + // the 
register in LiveVariables, this would trigger a failure in verifier, + // we should fix it and enable the verifier. + if (OptVGPRLiveRange) + insertPass(&LiveVariablesID, &SIOptimizeVGPRLiveRangeID, false); // This must be run immediately after phi elimination and before // TwoAddressInstructions, otherwise the processing of the tied operand of // SI_ELSE will introduce a copy of the tied operand source after the else. diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt --- a/llvm/lib/Target/AMDGPU/CMakeLists.txt +++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt @@ -132,6 +132,7 @@ SIMemoryLegalizer.cpp SIOptimizeExecMasking.cpp SIOptimizeExecMaskingPreRA.cpp + SIOptimizeVGPRLiveRange.cpp SIPeepholeSDWA.cpp SIPostRABundler.cpp SIPreEmitPeephole.cpp diff --git a/llvm/lib/Target/AMDGPU/SIOptimizeVGPRLiveRange.cpp b/llvm/lib/Target/AMDGPU/SIOptimizeVGPRLiveRange.cpp new file mode 100644 --- /dev/null +++ b/llvm/lib/Target/AMDGPU/SIOptimizeVGPRLiveRange.cpp @@ -0,0 +1,497 @@ +//===--------------------- SIOptimizeVGPRLiveRange.cpp -------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +/// \file +/// This pass tries to remove unnecessary VGPR live range in divergent if-else +/// structure. +/// +/// When we do structurization, we usually transform an if-else into two +/// successive if-then (with a flow block to do predicate inversion). Consider a +/// simple case after structurization: A divergent value %a was defined before +/// if-else and used in both THEN (use in THEN is optional) and ELSE part: +/// bb.if: +/// %a = ... +/// ... +/// bb.then: +/// ... = op %a +/// ... // %a can be dead here +/// bb.flow: +/// ... +/// bb.else: +/// ... = %a +/// ... 
+/// bb.endif +/// +/// As register allocator has no idea of the thread-control-flow, it will just +/// assume %a would be alive in the whole range of bb.then because of a later +/// use in bb.else. On AMDGPU architecture, the VGPR was accessed with respect +/// to exec mask. For this if-else case, the lanes active in bb.then will be +/// inactive in bb.else, and vice-versa. So we are safe to say that %a was dead +/// after the last use in bb.then until the end of the block. The reason is +/// the instructions in bb.then will only overwrite lanes that will never be +/// accessed in bb.else. +/// +/// This pass aims to tell register allocator that %a is in fact dead, +/// through inserting a phi-node in bb.flow saying that %a is undef when coming +/// from bb.then, and then replace the uses in the bb.else with the result of +/// newly inserted phi. +/// +/// Two key conditions must be met to ensure correctness: +/// 1.) The def-point should be in the same loop-level as if-else-endif to make +/// sure the second loop iteration still gets correct data. +/// 2.) There should be no further uses after the IF-ELSE region. 
+/// +// +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "GCNSubtarget.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" +#include "SIMachineFunctionInfo.h" +#include "llvm/CodeGen/LiveVariables.h" +#include "llvm/CodeGen/MachineDominators.h" +#include "llvm/CodeGen/MachineLoopInfo.h" +#include "llvm/CodeGen/TargetRegisterInfo.h" +#include "llvm/InitializePasses.h" + +using namespace llvm; + +#define DEBUG_TYPE "si-opt-vgpr-liverange" + +namespace { + +class SIOptimizeVGPRLiveRange : public MachineFunctionPass { +private: + const SIRegisterInfo *TRI = nullptr; + const SIInstrInfo *TII = nullptr; + LiveVariables *LV = nullptr; + MachineDominatorTree *MDT = nullptr; + const MachineLoopInfo *Loops = nullptr; + MachineRegisterInfo *MRI = nullptr; + +public: + static char ID; + + MachineBasicBlock *getElseTarget(MachineBasicBlock *MBB) const; + + void collectElseRegionBlocks(MachineBasicBlock *Flow, + MachineBasicBlock *Endif, + SmallSetVector &) const; + + void + collectCandidateRegisters(MachineBasicBlock *If, MachineBasicBlock *Flow, + MachineBasicBlock *Endif, + SmallSetVector &ElseBlocks, + SmallVectorImpl &CandidateRegs) const; + + void findNonPHIUsesInBlock(Register Reg, MachineBasicBlock *MBB, + SmallVectorImpl &Uses) const; + + void updateLiveRangeInThenRegion(Register Reg, MachineBasicBlock *If, + MachineBasicBlock *Flow) const; + + void updateLiveRangeInElseRegion( + Register Reg, Register NewReg, MachineBasicBlock *Flow, + MachineBasicBlock *Endif, + SmallSetVector &ElseBlocks) const; + + void + optimizeLiveRange(Register Reg, MachineBasicBlock *If, + MachineBasicBlock *Flow, MachineBasicBlock *Endif, + SmallSetVector &ElseBlocks) const; + + SIOptimizeVGPRLiveRange() : MachineFunctionPass(ID) {} + + bool runOnMachineFunction(MachineFunction &MF) override; + + StringRef getPassName() const override { + return "SI Optimize VGPR LiveRange"; + } + + void getAnalysisUsage(AnalysisUsage &AU) const 
override { + AU.addRequired(); + AU.addRequired(); + AU.addRequired(); + AU.addPreserved(); + AU.addPreserved(); + AU.addPreserved(); + MachineFunctionPass::getAnalysisUsage(AU); + } + + MachineFunctionProperties getRequiredProperties() const override { + return MachineFunctionProperties().set( + MachineFunctionProperties::Property::IsSSA); + } +}; + +} // end anonymous namespace + +// Check whether the MBB is a else flow block and get the branching target which +// is the Endif block +MachineBasicBlock * +SIOptimizeVGPRLiveRange::getElseTarget(MachineBasicBlock *MBB) const { + for (auto &BR : MBB->terminators()) { + if (BR.getOpcode() == AMDGPU::SI_ELSE) + return BR.getOperand(2).getMBB(); + } + return nullptr; +} + +void SIOptimizeVGPRLiveRange::collectElseRegionBlocks( + MachineBasicBlock *Flow, MachineBasicBlock *Endif, + SmallSetVector &Blocks) const { + assert(Flow != Endif); + + MachineBasicBlock *MBB = Endif; + unsigned Cur = 0; + while (MBB) { + for (auto *Pred : MBB->predecessors()) { + if (Pred != Flow && !Blocks.contains(Pred)) + Blocks.insert(Pred); + } + + if (Cur < Blocks.size()) + MBB = Blocks[Cur++]; + else + MBB = nullptr; + } + + LLVM_DEBUG(dbgs() << "Found Else blocks: "); + for (auto *MBB : Blocks) + LLVM_DEBUG(dbgs() << printMBBReference(*MBB) << ' '); + LLVM_DEBUG(dbgs() << '\n'); +} + +/// Find the instructions(excluding phi) in \p MBB that uses the \p Reg. +void SIOptimizeVGPRLiveRange::findNonPHIUsesInBlock( + Register Reg, MachineBasicBlock *MBB, + SmallVectorImpl &Uses) const { + for (auto &UseMI : MRI->use_nodbg_instructions(Reg)) { + if (UseMI.getParent() == MBB && !UseMI.isPHI()) + Uses.push_back(&UseMI); + } +} + +/// Collect the killed registers in the ELSE region which are not alive through +/// the whole THEN region. 
+void SIOptimizeVGPRLiveRange::collectCandidateRegisters( + MachineBasicBlock *If, MachineBasicBlock *Flow, MachineBasicBlock *Endif, + SmallSetVector &ElseBlocks, + SmallVectorImpl &CandidateRegs) const { + + SmallSet KillsInElse; + + for (auto *Else : ElseBlocks) { + for (auto &MI : Else->instrs()) { + if (MI.isDebugInstr()) + continue; + + for (auto &MO : MI.operands()) { + if (!MO.isReg() || !MO.getReg() || MO.isDef()) + continue; + + Register MOReg = MO.getReg(); + // We can only optimize AGPR/VGPR virtual register + if (MOReg.isPhysical() || !TRI->isVectorRegister(*MRI, MOReg)) + continue; + + if (MO.isKill() && MO.readsReg()) { + LiveVariables::VarInfo &VI = LV->getVarInfo(MOReg); + const MachineBasicBlock *DefMBB = MRI->getVRegDef(MOReg)->getParent(); + // Make sure two conditions are met: + // a.) the value is defined before/in the IF block + // b.) should be defined in the same loop-level. + if ((VI.AliveBlocks.test(If->getNumber()) || DefMBB == If) && + Loops->getLoopFor(DefMBB) == Loops->getLoopFor(If)) + KillsInElse.insert(MOReg); + } + } + } + } + + // Check the phis in the Endif, looking for value coming from the ELSE + // region. Make sure the phi-use is the last use. + for (auto &MI : Endif->phis()) { + for (unsigned Idx = 1; Idx < MI.getNumOperands(); Idx += 2) { + auto &MO = MI.getOperand(Idx); + auto *Pred = MI.getOperand(Idx + 1).getMBB(); + if (Pred == Flow) + continue; + assert(ElseBlocks.contains(Pred) && "Should be from Else region\n"); + + if (!MO.isReg() || !MO.getReg() || MO.isUndef()) + continue; + + Register Reg = MO.getReg(); + if (Reg.isPhysical() || !TRI->isVectorRegister(*MRI, Reg)) + continue; + + LiveVariables::VarInfo &VI = LV->getVarInfo(Reg); + + if (VI.isLiveIn(*Endif, Reg, *MRI)) { + LLVM_DEBUG(dbgs() << "Excluding " << printReg(Reg, TRI) + << " as Live in Endif\n"); + continue; + } + // Make sure two conditions are met: + // a.) the value is defined before/in the IF block + // b.) should be defined in the same loop-level. 
+ const MachineBasicBlock *DefMBB = MRI->getVRegDef(Reg)->getParent(); + if ((VI.AliveBlocks.test(If->getNumber()) || DefMBB == If) && + Loops->getLoopFor(DefMBB) == Loops->getLoopFor(If)) + KillsInElse.insert(Reg); + } + } + + auto IsLiveThroughThen = [&](Register Reg) { + for (auto I = MRI->use_nodbg_begin(Reg), E = MRI->use_nodbg_end(); I != E; + ++I) { + if (!I->readsReg()) + continue; + auto *UseMI = I->getParent(); + auto *UseMBB = UseMI->getParent(); + if (UseMBB == Flow || UseMBB == Endif) { + if (!UseMI->isPHI()) + return true; + + auto *IncomingMBB = UseMI->getOperand(I.getOperandNo() + 1).getMBB(); + // The register is live through the path If->Flow or Flow->Endif. + // we should not optimize for such cases. + if ((UseMBB == Flow && IncomingMBB != If) || + (UseMBB == Endif && IncomingMBB == Flow)) + return true; + } + } + return false; + }; + + for (auto Reg : KillsInElse) { + if (!IsLiveThroughThen(Reg)) + CandidateRegs.push_back(Reg); + } +} + +// Re-calculate the liveness of \p Reg in the THEN-region +void SIOptimizeVGPRLiveRange::updateLiveRangeInThenRegion( + Register Reg, MachineBasicBlock *If, MachineBasicBlock *Flow) const { + + SmallPtrSet PHIIncoming; + + MachineBasicBlock *ThenEntry = nullptr; + for (auto *Succ : If->successors()) { + if (Succ != Flow) { + ThenEntry = Succ; + break; + } + } + assert(ThenEntry && "No successor in Then region?"); + + LiveVariables::VarInfo &OldVarInfo = LV->getVarInfo(Reg); + df_iterator_default_set Visited; + + for (MachineBasicBlock *MBB : depth_first_ext(ThenEntry, Visited)) { + if (MBB == Flow) + break; + + // Clear Live bit, as we will recalculate afterwards + LLVM_DEBUG(dbgs() << "Clear AliveBlock " << printMBBReference(*MBB) + << '\n'); + OldVarInfo.AliveBlocks.reset(MBB->getNumber()); + } + + // Get the blocks the Reg should be alive through + for (auto I = MRI->use_nodbg_begin(Reg), E = MRI->use_nodbg_end(); I != E; + ++I) { + auto *UseMI = I->getParent(); + if (UseMI->isPHI() && I->readsReg()) { + if 
(Visited.contains(UseMI->getParent())) + PHIIncoming.insert(UseMI->getOperand(I.getOperandNo() + 1).getMBB()); + } + } + + Visited.clear(); + + for (MachineBasicBlock *MBB : depth_first_ext(ThenEntry, Visited)) { + if (MBB == Flow) + break; + + SmallVector Uses; + // PHI instructions has been processed before. + findNonPHIUsesInBlock(Reg, MBB, Uses); + + if (Uses.size() == 1) { + LLVM_DEBUG(dbgs() << "Found one Non-PHI use in " + << printMBBReference(*MBB) << '\n'); + LV->HandleVirtRegUse(Reg, MBB, *(*Uses.begin())); + } else if (Uses.size() > 1) { + // Process the instructions in-order + LLVM_DEBUG(dbgs() << "Found " << Uses.size() << " Non-PHI uses in " + << printMBBReference(*MBB) << '\n'); + for (MachineInstr &MI : *MBB) { + if (llvm::is_contained(Uses, &MI)) + LV->HandleVirtRegUse(Reg, MBB, MI); + } + } + + // Mark Reg alive through the block if this is a PHI incoming block + if (PHIIncoming.contains(MBB)) + LV->MarkVirtRegAliveInBlock(OldVarInfo, MRI->getVRegDef(Reg)->getParent(), + MBB); + } + + // Set the isKilled flag if we get new Kills in the THEN region. 
+ for (auto *MI : OldVarInfo.Kills) { + if (Visited.contains(MI->getParent())) + MI->addRegisterKilled(Reg, TRI); + } +} + +void SIOptimizeVGPRLiveRange::updateLiveRangeInElseRegion( + Register Reg, Register NewReg, MachineBasicBlock *Flow, + MachineBasicBlock *Endif, + SmallSetVector &ElseBlocks) const { + LiveVariables::VarInfo &NewVarInfo = LV->getVarInfo(NewReg); + LiveVariables::VarInfo &OldVarInfo = LV->getVarInfo(Reg); + + // Transfer aliveBlocks from Reg to NewReg + for (auto *MBB : ElseBlocks) { + unsigned BBNum = MBB->getNumber(); + if (OldVarInfo.AliveBlocks.test(BBNum)) { + NewVarInfo.AliveBlocks.set(BBNum); + LLVM_DEBUG(dbgs() << "Removing ALiveBlock " << printMBBReference(*MBB) + << '\n'); + OldVarInfo.AliveBlocks.reset(BBNum); + } + } + + // Transfer the possible Kills in ElseBlocks from Reg to NewReg + auto I = OldVarInfo.Kills.begin(); + while (I != OldVarInfo.Kills.end()) { + if (ElseBlocks.contains((*I)->getParent())) { + NewVarInfo.Kills.push_back(*I); + I = OldVarInfo.Kills.erase(I); + } else { + ++I; + } + } +} + +void SIOptimizeVGPRLiveRange::optimizeLiveRange( + Register Reg, MachineBasicBlock *If, MachineBasicBlock *Flow, + MachineBasicBlock *Endif, + SmallSetVector &ElseBlocks) const { + // Insert a new PHI, marking the value from the THEN region being + // undef. 
+ LLVM_DEBUG(dbgs() << "Optimizing " << printReg(Reg, TRI) << '\n'); + const auto *RC = MRI->getRegClass(Reg); + Register NewReg = MRI->createVirtualRegister(RC); + Register UndefReg = MRI->createVirtualRegister(RC); + MachineInstrBuilder PHI = BuildMI(*Flow, Flow->getFirstNonPHI(), DebugLoc(), + TII->get(TargetOpcode::PHI), NewReg); + for (auto *Pred : Flow->predecessors()) { + if (Pred == If) + PHI.addReg(Reg).addMBB(Pred); + else + PHI.addReg(UndefReg, RegState::Undef).addMBB(Pred); + } + + // Replace all uses in the ELSE region or the PHIs in ENDIF block + for (auto I = MRI->use_begin(Reg), E = MRI->use_end(); I != E;) { + MachineOperand &O = *I; + // This is a little bit tricky, the setReg() will update the linked list, + // so we have to increment the iterator before setReg() to avoid skipping + // some uses. + ++I; + auto *UseMI = O.getParent(); + auto *UseBlock = UseMI->getParent(); + // Replace uses in Endif block + if (UseBlock == Endif) { + assert(UseMI->isPHI() && "Uses should be PHI in Endif block"); + O.setReg(NewReg); + continue; + } + + // Replace uses in Else region + if (ElseBlocks.contains(UseBlock)) + O.setReg(NewReg); + } + + // The optimized Reg is not alive through Flow blocks anymore. 
+ LiveVariables::VarInfo &OldVarInfo = LV->getVarInfo(Reg); + OldVarInfo.AliveBlocks.reset(Flow->getNumber()); + + updateLiveRangeInElseRegion(Reg, NewReg, Flow, Endif, ElseBlocks); + updateLiveRangeInThenRegion(Reg, If, Flow); +} + +char SIOptimizeVGPRLiveRange::ID = 0; + +INITIALIZE_PASS_BEGIN(SIOptimizeVGPRLiveRange, DEBUG_TYPE, + "SI Optimize VGPR LiveRange", false, false) +INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) +INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo) +INITIALIZE_PASS_DEPENDENCY(LiveVariables) +INITIALIZE_PASS_END(SIOptimizeVGPRLiveRange, DEBUG_TYPE, + "SI Optimize VGPR LiveRange", false, false) + +char &llvm::SIOptimizeVGPRLiveRangeID = SIOptimizeVGPRLiveRange::ID; + +FunctionPass *llvm::createSIOptimizeVGPRLiveRangePass() { + return new SIOptimizeVGPRLiveRange(); +} + +bool SIOptimizeVGPRLiveRange::runOnMachineFunction(MachineFunction &MF) { + + const GCNSubtarget &ST = MF.getSubtarget(); + TII = ST.getInstrInfo(); + TRI = &TII->getRegisterInfo(); + MDT = &getAnalysis(); + Loops = &getAnalysis(); + LV = &getAnalysis(); + MRI = &MF.getRegInfo(); + + if (skipFunction(MF.getFunction())) + return false; + + bool MadeChange = false; + + // TODO: we need to think about the order of visiting the blocks to get + // optimal result for nesting if-else cases. 
+ for (MachineBasicBlock &MBB : MF) { + for (auto &MI : MBB.terminators()) { + // Detect the if-else blocks + if (MI.getOpcode() == AMDGPU::SI_IF) { + MachineBasicBlock *IfTarget = MI.getOperand(2).getMBB(); + auto *Endif = getElseTarget(IfTarget); + if (!Endif) + continue; + + SmallSetVector ElseBlocks; + SmallVector CandidateRegs; + + LLVM_DEBUG(dbgs() << "Checking IF-ELSE-ENDIF: " + << printMBBReference(MBB) << ' ' + << printMBBReference(*IfTarget) << ' ' + << printMBBReference(*Endif) << '\n'); + + // Collect all the blocks in the ELSE region + collectElseRegionBlocks(IfTarget, Endif, ElseBlocks); + + // Collect the registers can be optimized + collectCandidateRegisters(&MBB, IfTarget, Endif, ElseBlocks, + CandidateRegs); + MadeChange |= !CandidateRegs.empty(); + // Now we are safe to optimize. + for (auto Reg : CandidateRegs) + optimizeLiveRange(Reg, &MBB, IfTarget, Endif, ElseBlocks); + } + } + } + + return MadeChange; +} diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll @@ -17,147 +17,149 @@ ; CHECK-NEXT: s_cbranch_execz BB0_2 ; CHECK-NEXT: ; %bb.1: ; CHECK-NEXT: v_ashrrev_i32_e32 v4, 31, v3 -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v2, v4 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; CHECK-NEXT: v_addc_u32_e32 v3, vcc, v3, v4, vcc -; CHECK-NEXT: v_xor_b32_e32 v5, v5, v4 +; CHECK-NEXT: v_xor_b32_e32 v2, v2, v4 ; CHECK-NEXT: v_xor_b32_e32 v3, v3, v4 -; CHECK-NEXT: v_cvt_f32_u32_e32 v6, v5 -; CHECK-NEXT: v_cvt_f32_u32_e32 v7, v3 -; CHECK-NEXT: v_ashrrev_i32_e32 v8, 31, v1 -; CHECK-NEXT: v_mac_f32_e32 v6, 0x4f800000, v7 -; CHECK-NEXT: v_rcp_iflag_f32_e32 v6, v6 -; CHECK-NEXT: v_add_i32_e32 v7, vcc, v0, v8 -; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v1, v8, vcc -; CHECK-NEXT: v_sub_i32_e32 v10, vcc, 0, v5 -; CHECK-NEXT: v_mul_f32_e32 v6, 0x5f7ffffc, v6 -; CHECK-NEXT: v_mul_f32_e32 v9, 0x2f800000, v6 -; 
CHECK-NEXT: v_trunc_f32_e32 v9, v9 -; CHECK-NEXT: v_mac_f32_e32 v6, 0xcf800000, v9 +; CHECK-NEXT: v_cvt_f32_u32_e32 v5, v2 +; CHECK-NEXT: v_cvt_f32_u32_e32 v6, v3 +; CHECK-NEXT: v_ashrrev_i32_e32 v7, 31, v1 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v7 +; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v1, v7, vcc +; CHECK-NEXT: v_mac_f32_e32 v5, 0x4f800000, v6 +; CHECK-NEXT: v_rcp_iflag_f32_e32 v5, v5 +; CHECK-NEXT: v_sub_i32_e32 v8, vcc, 0, v2 +; CHECK-NEXT: v_subb_u32_e32 v9, vcc, 0, v3, vcc +; CHECK-NEXT: v_xor_b32_e32 v0, v0, v7 +; CHECK-NEXT: v_mul_f32_e32 v5, 0x5f7ffffc, v5 +; CHECK-NEXT: v_mul_f32_e32 v6, 0x2f800000, v5 +; CHECK-NEXT: v_trunc_f32_e32 v6, v6 +; CHECK-NEXT: v_mac_f32_e32 v5, 0xcf800000, v6 +; CHECK-NEXT: v_cvt_u32_f32_e32 v5, v5 ; CHECK-NEXT: v_cvt_u32_f32_e32 v6, v6 -; CHECK-NEXT: v_cvt_u32_f32_e32 v9, v9 -; CHECK-NEXT: v_subb_u32_e32 v11, vcc, 0, v3, vcc -; CHECK-NEXT: v_xor_b32_e32 v7, v7, v8 -; CHECK-NEXT: v_mul_lo_u32 v12, v11, v6 -; CHECK-NEXT: v_mul_lo_u32 v13, v10, v9 -; CHECK-NEXT: v_mul_hi_u32 v15, v10, v6 -; CHECK-NEXT: v_mul_lo_u32 v14, v10, v6 -; CHECK-NEXT: v_xor_b32_e32 v1, v1, v8 +; CHECK-NEXT: v_xor_b32_e32 v1, v1, v7 +; CHECK-NEXT: v_mul_lo_u32 v10, v9, v5 +; CHECK-NEXT: v_mul_lo_u32 v11, v8, v6 +; CHECK-NEXT: v_mul_hi_u32 v13, v8, v5 +; CHECK-NEXT: v_mul_lo_u32 v12, v8, v5 +; CHECK-NEXT: v_add_i32_e32 v10, vcc, v10, v11 +; CHECK-NEXT: v_add_i32_e32 v10, vcc, v10, v13 +; CHECK-NEXT: v_mul_lo_u32 v11, v6, v12 +; CHECK-NEXT: v_mul_lo_u32 v13, v5, v10 +; CHECK-NEXT: v_mul_hi_u32 v14, v5, v12 +; CHECK-NEXT: v_mul_hi_u32 v12, v6, v12 +; CHECK-NEXT: v_add_i32_e32 v11, vcc, v11, v13 +; CHECK-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v11, vcc, v11, v14 +; CHECK-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; CHECK-NEXT: v_mul_lo_u32 v14, v6, v10 +; CHECK-NEXT: v_add_i32_e32 v11, vcc, v13, v11 +; CHECK-NEXT: v_mul_hi_u32 v13, v5, v10 +; CHECK-NEXT: v_mul_hi_u32 v10, v6, v10 +; CHECK-NEXT: v_add_i32_e32 v12, vcc, v14, v12 +; 
CHECK-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v12, vcc, v12, v13 -; CHECK-NEXT: v_add_i32_e32 v12, vcc, v12, v15 -; CHECK-NEXT: v_mul_lo_u32 v13, v9, v14 -; CHECK-NEXT: v_mul_lo_u32 v15, v6, v12 -; CHECK-NEXT: v_mul_hi_u32 v16, v6, v14 -; CHECK-NEXT: v_mul_hi_u32 v14, v9, v14 -; CHECK-NEXT: v_add_i32_e32 v13, vcc, v13, v15 -; CHECK-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v13, vcc, v13, v16 ; CHECK-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CHECK-NEXT: v_mul_lo_u32 v16, v9, v12 -; CHECK-NEXT: v_add_i32_e32 v13, vcc, v15, v13 -; CHECK-NEXT: v_mul_hi_u32 v15, v6, v12 -; CHECK-NEXT: v_mul_hi_u32 v12, v9, v12 -; CHECK-NEXT: v_add_i32_e32 v14, vcc, v16, v14 -; CHECK-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v14, vcc, v14, v15 -; CHECK-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v15, vcc, v16, v15 ; CHECK-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; CHECK-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v14, vcc, v15, v14 -; CHECK-NEXT: v_add_i32_e32 v12, vcc, v12, v14 -; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v13 -; CHECK-NEXT: v_addc_u32_e64 v13, s[4:5], v9, v12, vcc -; CHECK-NEXT: v_mul_lo_u32 v11, v11, v6 -; CHECK-NEXT: v_mul_lo_u32 v14, v10, v13 -; CHECK-NEXT: v_mul_lo_u32 v15, v10, v6 -; CHECK-NEXT: v_mul_hi_u32 v10, v10, v6 +; CHECK-NEXT: v_add_i32_e32 v11, vcc, v12, v11 +; CHECK-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v12, vcc, v13, v12 +; CHECK-NEXT: v_add_i32_e32 v10, vcc, v10, v12 +; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v11 +; CHECK-NEXT: v_addc_u32_e64 v11, s[4:5], v6, v10, vcc +; CHECK-NEXT: v_mul_lo_u32 v9, v9, v5 +; CHECK-NEXT: v_mul_lo_u32 v12, v8, v11 +; CHECK-NEXT: v_mul_lo_u32 v13, v8, v5 +; CHECK-NEXT: v_mul_hi_u32 v8, v8, v5 +; CHECK-NEXT: v_add_i32_e64 v6, s[4:5], v6, v10 +; CHECK-NEXT: v_add_i32_e64 v9, s[4:5], v9, v12 +; CHECK-NEXT: v_mul_hi_u32 v10, v5, v13 +; CHECK-NEXT: v_add_i32_e64 v8, s[4:5], 
v9, v8 +; CHECK-NEXT: v_mul_lo_u32 v9, v11, v13 +; CHECK-NEXT: v_mul_lo_u32 v12, v5, v8 +; CHECK-NEXT: v_mul_hi_u32 v13, v11, v13 ; CHECK-NEXT: v_add_i32_e64 v9, s[4:5], v9, v12 -; CHECK-NEXT: v_add_i32_e64 v11, s[4:5], v11, v14 -; CHECK-NEXT: v_mul_hi_u32 v12, v6, v15 -; CHECK-NEXT: v_add_i32_e64 v10, s[4:5], v11, v10 -; CHECK-NEXT: v_mul_lo_u32 v11, v13, v15 -; CHECK-NEXT: v_mul_lo_u32 v14, v6, v10 -; CHECK-NEXT: v_mul_hi_u32 v15, v13, v15 -; CHECK-NEXT: v_add_i32_e64 v11, s[4:5], v11, v14 -; CHECK-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5] -; CHECK-NEXT: v_add_i32_e64 v11, s[4:5], v11, v12 -; CHECK-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[4:5] -; CHECK-NEXT: v_mul_lo_u32 v12, v13, v10 -; CHECK-NEXT: v_add_i32_e64 v11, s[4:5], v14, v11 -; CHECK-NEXT: v_mul_hi_u32 v14, v6, v10 -; CHECK-NEXT: v_mul_hi_u32 v10, v13, v10 -; CHECK-NEXT: v_add_i32_e64 v12, s[4:5], v12, v15 -; CHECK-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[4:5] -; CHECK-NEXT: v_add_i32_e64 v12, s[4:5], v12, v14 -; CHECK-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5] -; CHECK-NEXT: v_add_i32_e64 v14, s[4:5], v15, v14 -; CHECK-NEXT: v_add_i32_e64 v11, s[4:5], v12, v11 ; CHECK-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] -; CHECK-NEXT: v_add_i32_e64 v12, s[4:5], v14, v12 +; CHECK-NEXT: v_add_i32_e64 v9, s[4:5], v9, v10 +; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[4:5] +; CHECK-NEXT: v_mul_lo_u32 v10, v11, v8 +; CHECK-NEXT: v_add_i32_e64 v9, s[4:5], v12, v9 +; CHECK-NEXT: v_mul_hi_u32 v12, v5, v8 +; CHECK-NEXT: v_mul_hi_u32 v8, v11, v8 +; CHECK-NEXT: v_add_i32_e64 v10, s[4:5], v10, v13 +; CHECK-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5] ; CHECK-NEXT: v_add_i32_e64 v10, s[4:5], v10, v12 -; CHECK-NEXT: v_addc_u32_e32 v9, vcc, v9, v10, vcc -; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v11 -; CHECK-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; CHECK-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] +; CHECK-NEXT: v_add_i32_e64 v12, s[4:5], v13, v12 +; CHECK-NEXT: v_add_i32_e64 v9, s[4:5], v10, v9 +; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, 
s[4:5] +; CHECK-NEXT: v_add_i32_e64 v10, s[4:5], v12, v10 +; CHECK-NEXT: v_add_i32_e64 v8, s[4:5], v8, v10 +; CHECK-NEXT: v_addc_u32_e32 v6, vcc, v6, v8, vcc +; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v9 +; CHECK-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc +; CHECK-NEXT: v_mul_lo_u32 v8, v1, v5 +; CHECK-NEXT: v_mul_lo_u32 v9, v0, v6 +; CHECK-NEXT: v_mul_hi_u32 v10, v0, v5 +; CHECK-NEXT: v_mul_hi_u32 v5, v1, v5 +; CHECK-NEXT: v_add_i32_e32 v8, vcc, v8, v9 +; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v8, vcc, v8, v10 +; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; CHECK-NEXT: v_mul_lo_u32 v10, v1, v6 -; CHECK-NEXT: v_mul_lo_u32 v11, v7, v9 -; CHECK-NEXT: v_mul_hi_u32 v12, v7, v6 +; CHECK-NEXT: v_add_i32_e32 v8, vcc, v9, v8 +; CHECK-NEXT: v_mul_hi_u32 v9, v0, v6 ; CHECK-NEXT: v_mul_hi_u32 v6, v1, v6 -; CHECK-NEXT: v_add_i32_e32 v10, vcc, v10, v11 -; CHECK-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v10, vcc, v10, v12 +; CHECK-NEXT: v_add_i32_e32 v5, vcc, v10, v5 ; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CHECK-NEXT: v_mul_lo_u32 v12, v1, v9 -; CHECK-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; CHECK-NEXT: v_mul_hi_u32 v11, v7, v9 -; CHECK-NEXT: v_mul_hi_u32 v9, v1, v9 -; CHECK-NEXT: v_add_i32_e32 v6, vcc, v12, v6 -; CHECK-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v11 -; CHECK-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v11, vcc, v12, v11 -; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v10 -; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; CHECK-NEXT: v_add_i32_e32 v9, vcc, v9, v10 -; CHECK-NEXT: v_mul_lo_u32 v10, v3, v6 -; CHECK-NEXT: v_mul_lo_u32 v11, v5, v9 -; CHECK-NEXT: v_mul_hi_u32 v13, v5, v6 -; CHECK-NEXT: v_mul_lo_u32 v12, v5, v6 -; CHECK-NEXT: v_add_i32_e32 v10, vcc, v10, v11 -; CHECK-NEXT: v_add_i32_e32 v10, vcc, v10, v13 -; CHECK-NEXT: v_sub_i32_e32 v7, vcc, v7, v12 -; CHECK-NEXT: v_subb_u32_e64 
v11, s[4:5], v1, v10, vcc -; CHECK-NEXT: v_sub_i32_e64 v1, s[4:5], v1, v10 -; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v11, v3 +; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v9 +; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v9, vcc, v10, v9 +; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v8 +; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v8, vcc, v9, v8 +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v8 +; CHECK-NEXT: v_mul_lo_u32 v8, v3, v5 +; CHECK-NEXT: v_mul_lo_u32 v9, v2, v6 +; CHECK-NEXT: v_mul_hi_u32 v11, v2, v5 +; CHECK-NEXT: v_mul_lo_u32 v10, v2, v5 +; CHECK-NEXT: v_add_i32_e32 v8, vcc, v8, v9 +; CHECK-NEXT: v_add_i32_e32 v8, vcc, v8, v11 +; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v10 +; CHECK-NEXT: v_subb_u32_e64 v9, s[4:5], v1, v8, vcc +; CHECK-NEXT: v_sub_i32_e64 v1, s[4:5], v1, v8 +; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v3 ; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc -; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5] -; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v7, v5 -; CHECK-NEXT: v_sub_i32_e32 v7, vcc, v7, v5 +; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5] +; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v2 +; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 ; CHECK-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc -; CHECK-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[4:5] -; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], v11, v3 -; CHECK-NEXT: v_add_i32_e32 v11, vcc, 1, v6 -; CHECK-NEXT: v_cndmask_b32_e64 v10, v10, v12, s[4:5] -; CHECK-NEXT: v_addc_u32_e32 v12, vcc, 0, v9, vcc +; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5] +; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], v9, v3 +; CHECK-NEXT: v_add_i32_e32 v9, vcc, 1, v5 +; CHECK-NEXT: v_cndmask_b32_e64 v8, v8, v10, s[4:5] +; CHECK-NEXT: v_addc_u32_e32 v10, vcc, 0, v6, vcc ; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v1, v3 -; CHECK-NEXT: v_cndmask_b32_e64 v13, 0, -1, vcc -; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v7, v5 -; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc +; CHECK-NEXT: v_cndmask_b32_e64 v11, 0, 
-1, vcc +; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2 +; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc ; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3 -; CHECK-NEXT: v_cndmask_b32_e32 v1, v13, v5, vcc -; CHECK-NEXT: v_add_i32_e32 v3, vcc, 1, v11 -; CHECK-NEXT: v_addc_u32_e32 v5, vcc, 0, v12, vcc -; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; CHECK-NEXT: v_cndmask_b32_e32 v1, v11, v3, vcc -; CHECK-NEXT: v_cndmask_b32_e32 v3, v12, v5, vcc -; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; CHECK-NEXT: v_cndmask_b32_e32 v0, v11, v0, vcc +; CHECK-NEXT: v_add_i32_e32 v1, vcc, 1, v9 +; CHECK-NEXT: v_addc_u32_e32 v2, vcc, 0, v10, vcc +; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; CHECK-NEXT: v_cndmask_b32_e32 v0, v9, v1, vcc +; CHECK-NEXT: v_cndmask_b32_e32 v1, v10, v2, vcc +; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; CHECK-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc +; CHECK-NEXT: v_xor_b32_e32 v2, v7, v4 ; CHECK-NEXT: v_cndmask_b32_e32 v1, v6, v1, vcc -; CHECK-NEXT: v_xor_b32_e32 v5, v8, v4 -; CHECK-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc -; CHECK-NEXT: v_xor_b32_e32 v1, v1, v5 -; CHECK-NEXT: v_xor_b32_e32 v3, v3, v5 -; CHECK-NEXT: v_sub_i32_e32 v4, vcc, v1, v5 -; CHECK-NEXT: v_subb_u32_e32 v5, vcc, v3, v5, vcc +; CHECK-NEXT: v_xor_b32_e32 v0, v0, v2 +; CHECK-NEXT: v_sub_i32_e32 v4, vcc, v0, v2 +; CHECK-NEXT: v_xor_b32_e32 v1, v1, v2 +; CHECK-NEXT: v_subb_u32_e32 v5, vcc, v1, v2, vcc +; CHECK-NEXT: ; implicit-def: $vgpr0 +; CHECK-NEXT: ; implicit-def: $vgpr2 ; CHECK-NEXT: BB0_2: ; %Flow ; CHECK-NEXT: s_or_saveexec_b64 s[6:7], s[6:7] ; CHECK-NEXT: s_xor_b64 exec, exec, s[6:7] @@ -702,146 +704,148 @@ ; CGP-NEXT: ; %bb.1: ; CGP-NEXT: v_ashrrev_i32_e32 v0, 31, v5 ; CGP-NEXT: v_add_i32_e32 v1, vcc, v4, v0 -; CGP-NEXT: v_addc_u32_e32 v5, vcc, v5, v0, vcc +; CGP-NEXT: v_addc_u32_e32 v4, vcc, v5, v0, vcc ; CGP-NEXT: v_xor_b32_e32 v1, v1, v0 -; CGP-NEXT: v_xor_b32_e32 v5, v5, v0 -; CGP-NEXT: v_cvt_f32_u32_e32 v10, v1 -; CGP-NEXT: v_cvt_f32_u32_e32 v11, v5 -; CGP-NEXT: v_ashrrev_i32_e32 v12, 31, 
v9 -; CGP-NEXT: v_mac_f32_e32 v10, 0x4f800000, v11 -; CGP-NEXT: v_rcp_iflag_f32_e32 v10, v10 -; CGP-NEXT: v_add_i32_e32 v11, vcc, v8, v12 -; CGP-NEXT: v_addc_u32_e32 v9, vcc, v9, v12, vcc -; CGP-NEXT: v_sub_i32_e32 v14, vcc, 0, v1 -; CGP-NEXT: v_mul_f32_e32 v10, 0x5f7ffffc, v10 -; CGP-NEXT: v_mul_f32_e32 v13, 0x2f800000, v10 -; CGP-NEXT: v_trunc_f32_e32 v13, v13 -; CGP-NEXT: v_mac_f32_e32 v10, 0xcf800000, v13 +; CGP-NEXT: v_xor_b32_e32 v4, v4, v0 +; CGP-NEXT: v_cvt_f32_u32_e32 v5, v1 +; CGP-NEXT: v_cvt_f32_u32_e32 v10, v4 +; CGP-NEXT: v_ashrrev_i32_e32 v11, 31, v9 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v11 +; CGP-NEXT: v_addc_u32_e32 v9, vcc, v9, v11, vcc +; CGP-NEXT: v_mac_f32_e32 v5, 0x4f800000, v10 +; CGP-NEXT: v_rcp_iflag_f32_e32 v5, v5 +; CGP-NEXT: v_sub_i32_e32 v12, vcc, 0, v1 +; CGP-NEXT: v_subb_u32_e32 v13, vcc, 0, v4, vcc +; CGP-NEXT: v_xor_b32_e32 v8, v8, v11 +; CGP-NEXT: v_mul_f32_e32 v5, 0x5f7ffffc, v5 +; CGP-NEXT: v_mul_f32_e32 v10, 0x2f800000, v5 +; CGP-NEXT: v_trunc_f32_e32 v10, v10 +; CGP-NEXT: v_mac_f32_e32 v5, 0xcf800000, v10 +; CGP-NEXT: v_cvt_u32_f32_e32 v5, v5 ; CGP-NEXT: v_cvt_u32_f32_e32 v10, v10 -; CGP-NEXT: v_cvt_u32_f32_e32 v13, v13 -; CGP-NEXT: v_subb_u32_e32 v15, vcc, 0, v5, vcc -; CGP-NEXT: v_xor_b32_e32 v11, v11, v12 -; CGP-NEXT: v_mul_lo_u32 v16, v15, v10 -; CGP-NEXT: v_mul_lo_u32 v17, v14, v13 -; CGP-NEXT: v_mul_hi_u32 v19, v14, v10 -; CGP-NEXT: v_mul_lo_u32 v18, v14, v10 -; CGP-NEXT: v_xor_b32_e32 v9, v9, v12 +; CGP-NEXT: v_xor_b32_e32 v9, v9, v11 +; CGP-NEXT: v_mul_lo_u32 v14, v13, v5 +; CGP-NEXT: v_mul_lo_u32 v15, v12, v10 +; CGP-NEXT: v_mul_hi_u32 v17, v12, v5 +; CGP-NEXT: v_mul_lo_u32 v16, v12, v5 +; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v15 +; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v17 +; CGP-NEXT: v_mul_lo_u32 v15, v10, v16 +; CGP-NEXT: v_mul_lo_u32 v17, v5, v14 +; CGP-NEXT: v_mul_hi_u32 v18, v5, v16 +; CGP-NEXT: v_mul_hi_u32 v16, v10, v16 +; CGP-NEXT: v_add_i32_e32 v15, vcc, v15, v17 +; CGP-NEXT: v_cndmask_b32_e64 v17, 0, 
1, vcc +; CGP-NEXT: v_add_i32_e32 v15, vcc, v15, v18 +; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc +; CGP-NEXT: v_mul_lo_u32 v18, v10, v14 +; CGP-NEXT: v_add_i32_e32 v15, vcc, v17, v15 +; CGP-NEXT: v_mul_hi_u32 v17, v5, v14 +; CGP-NEXT: v_mul_hi_u32 v14, v10, v14 +; CGP-NEXT: v_add_i32_e32 v16, vcc, v18, v16 +; CGP-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v16, vcc, v16, v17 -; CGP-NEXT: v_add_i32_e32 v16, vcc, v16, v19 -; CGP-NEXT: v_mul_lo_u32 v17, v13, v18 -; CGP-NEXT: v_mul_lo_u32 v19, v10, v16 -; CGP-NEXT: v_mul_hi_u32 v20, v10, v18 -; CGP-NEXT: v_mul_hi_u32 v18, v13, v18 -; CGP-NEXT: v_add_i32_e32 v17, vcc, v17, v19 -; CGP-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v17, vcc, v17, v20 ; CGP-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v20, v13, v16 -; CGP-NEXT: v_add_i32_e32 v17, vcc, v19, v17 -; CGP-NEXT: v_mul_hi_u32 v19, v10, v16 -; CGP-NEXT: v_mul_hi_u32 v16, v13, v16 -; CGP-NEXT: v_add_i32_e32 v18, vcc, v20, v18 -; CGP-NEXT: v_cndmask_b32_e64 v20, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v18, vcc, v18, v19 -; CGP-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v19, vcc, v20, v19 ; CGP-NEXT: v_add_i32_e32 v17, vcc, v18, v17 -; CGP-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v18, vcc, v19, v18 -; CGP-NEXT: v_add_i32_e32 v16, vcc, v16, v18 -; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v17 -; CGP-NEXT: v_addc_u32_e64 v17, s[4:5], v13, v16, vcc -; CGP-NEXT: v_mul_lo_u32 v15, v15, v10 -; CGP-NEXT: v_mul_lo_u32 v18, v14, v17 -; CGP-NEXT: v_mul_lo_u32 v19, v14, v10 -; CGP-NEXT: v_mul_hi_u32 v14, v14, v10 +; CGP-NEXT: v_add_i32_e32 v15, vcc, v16, v15 +; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v16, vcc, v17, v16 +; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v16 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v15 +; CGP-NEXT: v_addc_u32_e64 v15, s[4:5], v10, v14, vcc +; CGP-NEXT: v_mul_lo_u32 v13, v13, v5 +; CGP-NEXT: v_mul_lo_u32 v16, v12, v15 +; 
CGP-NEXT: v_mul_lo_u32 v17, v12, v5 +; CGP-NEXT: v_mul_hi_u32 v12, v12, v5 +; CGP-NEXT: v_add_i32_e64 v10, s[4:5], v10, v14 +; CGP-NEXT: v_add_i32_e64 v13, s[4:5], v13, v16 +; CGP-NEXT: v_mul_hi_u32 v14, v5, v17 +; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v13, v12 +; CGP-NEXT: v_mul_lo_u32 v13, v15, v17 +; CGP-NEXT: v_mul_lo_u32 v16, v5, v12 +; CGP-NEXT: v_mul_hi_u32 v17, v15, v17 ; CGP-NEXT: v_add_i32_e64 v13, s[4:5], v13, v16 -; CGP-NEXT: v_add_i32_e64 v15, s[4:5], v15, v18 -; CGP-NEXT: v_mul_hi_u32 v16, v10, v19 -; CGP-NEXT: v_add_i32_e64 v14, s[4:5], v15, v14 -; CGP-NEXT: v_mul_lo_u32 v15, v17, v19 -; CGP-NEXT: v_mul_lo_u32 v18, v10, v14 -; CGP-NEXT: v_mul_hi_u32 v19, v17, v19 -; CGP-NEXT: v_add_i32_e64 v15, s[4:5], v15, v18 -; CGP-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v15, s[4:5], v15, v16 -; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[4:5] -; CGP-NEXT: v_mul_lo_u32 v16, v17, v14 -; CGP-NEXT: v_add_i32_e64 v15, s[4:5], v18, v15 -; CGP-NEXT: v_mul_hi_u32 v18, v10, v14 -; CGP-NEXT: v_mul_hi_u32 v14, v17, v14 -; CGP-NEXT: v_add_i32_e64 v16, s[4:5], v16, v19 -; CGP-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v16, s[4:5], v16, v18 -; CGP-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v18, s[4:5], v19, v18 -; CGP-NEXT: v_add_i32_e64 v15, s[4:5], v16, v15 ; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v16, s[4:5], v18, v16 +; CGP-NEXT: v_add_i32_e64 v13, s[4:5], v13, v14 +; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5] +; CGP-NEXT: v_mul_lo_u32 v14, v15, v12 +; CGP-NEXT: v_add_i32_e64 v13, s[4:5], v16, v13 +; CGP-NEXT: v_mul_hi_u32 v16, v5, v12 +; CGP-NEXT: v_mul_hi_u32 v12, v15, v12 +; CGP-NEXT: v_add_i32_e64 v14, s[4:5], v14, v17 +; CGP-NEXT: v_cndmask_b32_e64 v17, 0, 1, s[4:5] ; CGP-NEXT: v_add_i32_e64 v14, s[4:5], v14, v16 -; CGP-NEXT: v_addc_u32_e32 v13, vcc, v13, v14, vcc -; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v15 -; CGP-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, 
vcc +; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[4:5] +; CGP-NEXT: v_add_i32_e64 v16, s[4:5], v17, v16 +; CGP-NEXT: v_add_i32_e64 v13, s[4:5], v14, v13 +; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5] +; CGP-NEXT: v_add_i32_e64 v14, s[4:5], v16, v14 +; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v12, v14 +; CGP-NEXT: v_addc_u32_e32 v10, vcc, v10, v12, vcc +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v13 +; CGP-NEXT: v_addc_u32_e32 v10, vcc, 0, v10, vcc +; CGP-NEXT: v_mul_lo_u32 v12, v9, v5 +; CGP-NEXT: v_mul_lo_u32 v13, v8, v10 +; CGP-NEXT: v_mul_hi_u32 v14, v8, v5 +; CGP-NEXT: v_mul_hi_u32 v5, v9, v5 +; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v13 +; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v14 +; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc ; CGP-NEXT: v_mul_lo_u32 v14, v9, v10 -; CGP-NEXT: v_mul_lo_u32 v15, v11, v13 -; CGP-NEXT: v_mul_hi_u32 v16, v11, v10 +; CGP-NEXT: v_add_i32_e32 v12, vcc, v13, v12 +; CGP-NEXT: v_mul_hi_u32 v13, v8, v10 ; CGP-NEXT: v_mul_hi_u32 v10, v9, v10 -; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v15 -; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v16 -; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v16, v9, v13 -; CGP-NEXT: v_add_i32_e32 v14, vcc, v15, v14 -; CGP-NEXT: v_mul_hi_u32 v15, v11, v13 -; CGP-NEXT: v_mul_hi_u32 v13, v9, v13 -; CGP-NEXT: v_add_i32_e32 v10, vcc, v16, v10 -; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v15 -; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v15, vcc, v16, v15 -; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v14 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v14, v5 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v14, vcc, v15, v14 -; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v14 -; CGP-NEXT: v_mul_lo_u32 v14, v5, v10 -; CGP-NEXT: v_mul_lo_u32 v15, v1, v13 -; CGP-NEXT: v_mul_hi_u32 v17, v1, v10 -; CGP-NEXT: v_mul_lo_u32 v16, v1, v10 -; CGP-NEXT: 
v_add_i32_e32 v14, vcc, v14, v15 -; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v17 -; CGP-NEXT: v_sub_i32_e32 v11, vcc, v11, v16 -; CGP-NEXT: v_subb_u32_e64 v15, s[4:5], v9, v14, vcc -; CGP-NEXT: v_sub_i32_e64 v9, s[4:5], v9, v14 -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v15, v5 -; CGP-NEXT: v_subb_u32_e32 v9, vcc, v9, v5, vcc -; CGP-NEXT: v_cndmask_b32_e64 v14, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v11, v1 -; CGP-NEXT: v_sub_i32_e32 v11, vcc, v11, v1 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v13 +; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v12 +; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v12, vcc, v13, v12 +; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v12 +; CGP-NEXT: v_mul_lo_u32 v12, v4, v5 +; CGP-NEXT: v_mul_lo_u32 v13, v1, v10 +; CGP-NEXT: v_mul_hi_u32 v15, v1, v5 +; CGP-NEXT: v_mul_lo_u32 v14, v1, v5 +; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v13 +; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v15 +; CGP-NEXT: v_sub_i32_e32 v8, vcc, v8, v14 +; CGP-NEXT: v_subb_u32_e64 v13, s[4:5], v9, v12, vcc +; CGP-NEXT: v_sub_i32_e64 v9, s[4:5], v9, v12 +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v13, v4 +; CGP-NEXT: v_subb_u32_e32 v9, vcc, v9, v4, vcc +; CGP-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[4:5] +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v1 +; CGP-NEXT: v_sub_i32_e32 v8, vcc, v8, v1 ; CGP-NEXT: v_subbrev_u32_e32 v9, vcc, 0, v9, vcc -; CGP-NEXT: v_cndmask_b32_e64 v16, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v15, v5 -; CGP-NEXT: v_add_i32_e32 v15, vcc, 1, v10 -; CGP-NEXT: v_cndmask_b32_e64 v14, v14, v16, s[4:5] -; CGP-NEXT: v_addc_u32_e32 v16, vcc, 0, v13, vcc -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v9, v5 -; CGP-NEXT: v_cndmask_b32_e64 v17, 0, -1, vcc -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v11, v1 +; CGP-NEXT: v_cndmask_b32_e64 v14, 0, -1, s[4:5] +; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v13, v4 +; CGP-NEXT: v_add_i32_e32 v13, vcc, 1, v5 +; CGP-NEXT: v_cndmask_b32_e64 
v12, v12, v14, s[4:5] +; CGP-NEXT: v_addc_u32_e32 v14, vcc, 0, v10, vcc +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v9, v4 +; CGP-NEXT: v_cndmask_b32_e64 v15, 0, -1, vcc +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v8, v1 ; CGP-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc -; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v9, v5 -; CGP-NEXT: v_cndmask_b32_e32 v1, v17, v1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, 1, v15 -; CGP-NEXT: v_addc_u32_e32 v9, vcc, 0, v16, vcc +; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v9, v4 +; CGP-NEXT: v_cndmask_b32_e32 v1, v15, v1, vcc +; CGP-NEXT: v_add_i32_e32 v4, vcc, 1, v13 +; CGP-NEXT: v_addc_u32_e32 v8, vcc, 0, v14, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; CGP-NEXT: v_cndmask_b32_e32 v1, v15, v5, vcc -; CGP-NEXT: v_cndmask_b32_e32 v5, v16, v9, vcc -; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 -; CGP-NEXT: v_cndmask_b32_e32 v1, v10, v1, vcc -; CGP-NEXT: v_xor_b32_e32 v9, v12, v0 -; CGP-NEXT: v_cndmask_b32_e32 v5, v13, v5, vcc -; CGP-NEXT: v_xor_b32_e32 v0, v1, v9 -; CGP-NEXT: v_xor_b32_e32 v1, v5, v9 -; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v9 -; CGP-NEXT: v_subb_u32_e32 v1, vcc, v1, v9, vcc +; CGP-NEXT: v_cndmask_b32_e32 v1, v13, v4, vcc +; CGP-NEXT: v_cndmask_b32_e32 v4, v14, v8, vcc +; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; CGP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; CGP-NEXT: v_xor_b32_e32 v5, v11, v0 +; CGP-NEXT: v_cndmask_b32_e32 v4, v10, v4, vcc +; CGP-NEXT: v_xor_b32_e32 v0, v1, v5 +; CGP-NEXT: v_xor_b32_e32 v1, v4, v5 +; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v5 +; CGP-NEXT: v_subb_u32_e32 v1, vcc, v1, v5, vcc +; CGP-NEXT: ; implicit-def: $vgpr8 +; CGP-NEXT: ; implicit-def: $vgpr4 ; CGP-NEXT: BB2_2: ; %Flow2 ; CGP-NEXT: s_or_saveexec_b64 s[6:7], s[6:7] ; CGP-NEXT: s_xor_b64 exec, exec, s[6:7] @@ -879,146 +883,148 @@ ; CGP-NEXT: ; %bb.5: ; CGP-NEXT: v_ashrrev_i32_e32 v4, 31, v7 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v4 -; CGP-NEXT: v_addc_u32_e32 v7, vcc, v7, v4, vcc +; CGP-NEXT: v_addc_u32_e32 v6, vcc, v7, v4, vcc ; CGP-NEXT: v_xor_b32_e32 v5, v5, v4 -; CGP-NEXT: 
v_xor_b32_e32 v7, v7, v4 -; CGP-NEXT: v_cvt_f32_u32_e32 v8, v5 -; CGP-NEXT: v_cvt_f32_u32_e32 v9, v7 -; CGP-NEXT: v_ashrrev_i32_e32 v10, 31, v3 -; CGP-NEXT: v_mac_f32_e32 v8, 0x4f800000, v9 -; CGP-NEXT: v_rcp_iflag_f32_e32 v8, v8 -; CGP-NEXT: v_add_i32_e32 v9, vcc, v2, v10 -; CGP-NEXT: v_addc_u32_e32 v3, vcc, v3, v10, vcc -; CGP-NEXT: v_sub_i32_e32 v12, vcc, 0, v5 -; CGP-NEXT: v_mul_f32_e32 v8, 0x5f7ffffc, v8 -; CGP-NEXT: v_mul_f32_e32 v11, 0x2f800000, v8 -; CGP-NEXT: v_trunc_f32_e32 v11, v11 -; CGP-NEXT: v_mac_f32_e32 v8, 0xcf800000, v11 +; CGP-NEXT: v_xor_b32_e32 v6, v6, v4 +; CGP-NEXT: v_cvt_f32_u32_e32 v7, v5 +; CGP-NEXT: v_cvt_f32_u32_e32 v8, v6 +; CGP-NEXT: v_ashrrev_i32_e32 v9, 31, v3 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v9 +; CGP-NEXT: v_addc_u32_e32 v3, vcc, v3, v9, vcc +; CGP-NEXT: v_mac_f32_e32 v7, 0x4f800000, v8 +; CGP-NEXT: v_rcp_iflag_f32_e32 v7, v7 +; CGP-NEXT: v_sub_i32_e32 v10, vcc, 0, v5 +; CGP-NEXT: v_subb_u32_e32 v11, vcc, 0, v6, vcc +; CGP-NEXT: v_xor_b32_e32 v2, v2, v9 +; CGP-NEXT: v_mul_f32_e32 v7, 0x5f7ffffc, v7 +; CGP-NEXT: v_mul_f32_e32 v8, 0x2f800000, v7 +; CGP-NEXT: v_trunc_f32_e32 v8, v8 +; CGP-NEXT: v_mac_f32_e32 v7, 0xcf800000, v8 +; CGP-NEXT: v_cvt_u32_f32_e32 v7, v7 ; CGP-NEXT: v_cvt_u32_f32_e32 v8, v8 -; CGP-NEXT: v_cvt_u32_f32_e32 v11, v11 -; CGP-NEXT: v_subb_u32_e32 v13, vcc, 0, v7, vcc -; CGP-NEXT: v_xor_b32_e32 v9, v9, v10 -; CGP-NEXT: v_mul_lo_u32 v14, v13, v8 -; CGP-NEXT: v_mul_lo_u32 v15, v12, v11 -; CGP-NEXT: v_mul_hi_u32 v17, v12, v8 -; CGP-NEXT: v_mul_lo_u32 v16, v12, v8 -; CGP-NEXT: v_xor_b32_e32 v3, v3, v10 +; CGP-NEXT: v_xor_b32_e32 v3, v3, v9 +; CGP-NEXT: v_mul_lo_u32 v12, v11, v7 +; CGP-NEXT: v_mul_lo_u32 v13, v10, v8 +; CGP-NEXT: v_mul_hi_u32 v15, v10, v7 +; CGP-NEXT: v_mul_lo_u32 v14, v10, v7 +; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v13 +; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v15 +; CGP-NEXT: v_mul_lo_u32 v13, v8, v14 +; CGP-NEXT: v_mul_lo_u32 v15, v7, v12 +; CGP-NEXT: v_mul_hi_u32 v16, v7, v14 +; CGP-NEXT: 
v_mul_hi_u32 v14, v8, v14 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v15 +; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v16 +; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; CGP-NEXT: v_mul_lo_u32 v16, v8, v12 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v15, v13 +; CGP-NEXT: v_mul_hi_u32 v15, v7, v12 +; CGP-NEXT: v_mul_hi_u32 v12, v8, v12 +; CGP-NEXT: v_add_i32_e32 v14, vcc, v16, v14 +; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v15 -; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v17 -; CGP-NEXT: v_mul_lo_u32 v15, v11, v16 -; CGP-NEXT: v_mul_lo_u32 v17, v8, v14 -; CGP-NEXT: v_mul_hi_u32 v18, v8, v16 -; CGP-NEXT: v_mul_hi_u32 v16, v11, v16 -; CGP-NEXT: v_add_i32_e32 v15, vcc, v15, v17 -; CGP-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v15, vcc, v15, v18 ; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v18, v11, v14 -; CGP-NEXT: v_add_i32_e32 v15, vcc, v17, v15 -; CGP-NEXT: v_mul_hi_u32 v17, v8, v14 -; CGP-NEXT: v_mul_hi_u32 v14, v11, v14 -; CGP-NEXT: v_add_i32_e32 v16, vcc, v18, v16 -; CGP-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v16, vcc, v16, v17 -; CGP-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v17, vcc, v18, v17 ; CGP-NEXT: v_add_i32_e32 v15, vcc, v16, v15 -; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v16, vcc, v17, v16 -; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v16 -; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v15 -; CGP-NEXT: v_addc_u32_e64 v15, s[4:5], v11, v14, vcc -; CGP-NEXT: v_mul_lo_u32 v13, v13, v8 -; CGP-NEXT: v_mul_lo_u32 v16, v12, v15 -; CGP-NEXT: v_mul_lo_u32 v17, v12, v8 -; CGP-NEXT: v_mul_hi_u32 v12, v12, v8 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13 +; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v14, vcc, v15, v14 +; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v14 +; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v13 +; CGP-NEXT: v_addc_u32_e64 v13, s[4:5], 
v8, v12, vcc +; CGP-NEXT: v_mul_lo_u32 v11, v11, v7 +; CGP-NEXT: v_mul_lo_u32 v14, v10, v13 +; CGP-NEXT: v_mul_lo_u32 v15, v10, v7 +; CGP-NEXT: v_mul_hi_u32 v10, v10, v7 +; CGP-NEXT: v_add_i32_e64 v8, s[4:5], v8, v12 +; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v11, v14 +; CGP-NEXT: v_mul_hi_u32 v12, v7, v15 +; CGP-NEXT: v_add_i32_e64 v10, s[4:5], v11, v10 +; CGP-NEXT: v_mul_lo_u32 v11, v13, v15 +; CGP-NEXT: v_mul_lo_u32 v14, v7, v10 +; CGP-NEXT: v_mul_hi_u32 v15, v13, v15 ; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v11, v14 -; CGP-NEXT: v_add_i32_e64 v13, s[4:5], v13, v16 -; CGP-NEXT: v_mul_hi_u32 v14, v8, v17 -; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v13, v12 -; CGP-NEXT: v_mul_lo_u32 v13, v15, v17 -; CGP-NEXT: v_mul_lo_u32 v16, v8, v12 -; CGP-NEXT: v_mul_hi_u32 v17, v15, v17 -; CGP-NEXT: v_add_i32_e64 v13, s[4:5], v13, v16 -; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v13, s[4:5], v13, v14 -; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5] -; CGP-NEXT: v_mul_lo_u32 v14, v15, v12 -; CGP-NEXT: v_add_i32_e64 v13, s[4:5], v16, v13 -; CGP-NEXT: v_mul_hi_u32 v16, v8, v12 -; CGP-NEXT: v_mul_hi_u32 v12, v15, v12 -; CGP-NEXT: v_add_i32_e64 v14, s[4:5], v14, v17 -; CGP-NEXT: v_cndmask_b32_e64 v17, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v14, s[4:5], v14, v16 -; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v16, s[4:5], v17, v16 -; CGP-NEXT: v_add_i32_e64 v13, s[4:5], v14, v13 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v14, s[4:5], v16, v14 +; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v11, v12 +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[4:5] +; CGP-NEXT: v_mul_lo_u32 v12, v13, v10 +; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v14, v11 +; CGP-NEXT: v_mul_hi_u32 v14, v7, v10 +; CGP-NEXT: v_mul_hi_u32 v10, v13, v10 +; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v12, v15 +; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[4:5] ; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v12, v14 -; CGP-NEXT: v_addc_u32_e32 v11, vcc, v11, v12, vcc -; 
CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v13 -; CGP-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5] +; CGP-NEXT: v_add_i32_e64 v14, s[4:5], v15, v14 +; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v12, v11 +; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] +; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v14, v12 +; CGP-NEXT: v_add_i32_e64 v10, s[4:5], v10, v12 +; CGP-NEXT: v_addc_u32_e32 v8, vcc, v8, v10, vcc +; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v11 +; CGP-NEXT: v_addc_u32_e32 v8, vcc, 0, v8, vcc +; CGP-NEXT: v_mul_lo_u32 v10, v3, v7 +; CGP-NEXT: v_mul_lo_u32 v11, v2, v8 +; CGP-NEXT: v_mul_hi_u32 v12, v2, v7 +; CGP-NEXT: v_mul_hi_u32 v7, v3, v7 +; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v11 +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v12 +; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; CGP-NEXT: v_mul_lo_u32 v12, v3, v8 -; CGP-NEXT: v_mul_lo_u32 v13, v9, v11 -; CGP-NEXT: v_mul_hi_u32 v14, v9, v8 +; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10 +; CGP-NEXT: v_mul_hi_u32 v11, v2, v8 ; CGP-NEXT: v_mul_hi_u32 v8, v3, v8 -; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v13 -; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v14 -; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v14, v3, v11 -; CGP-NEXT: v_add_i32_e32 v12, vcc, v13, v12 -; CGP-NEXT: v_mul_hi_u32 v13, v9, v11 -; CGP-NEXT: v_mul_hi_u32 v11, v3, v11 -; CGP-NEXT: v_add_i32_e32 v8, vcc, v14, v8 -; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v13 -; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v12 +; CGP-NEXT: v_add_i32_e32 v7, vcc, v12, v7 ; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v12, vcc, v13, v12 -; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v12 -; CGP-NEXT: v_mul_lo_u32 v12, v7, v8 -; CGP-NEXT: v_mul_lo_u32 v13, v5, v11 -; CGP-NEXT: v_mul_hi_u32 
v15, v5, v8 -; CGP-NEXT: v_mul_lo_u32 v14, v5, v8 -; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v13 -; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v15 -; CGP-NEXT: v_sub_i32_e32 v9, vcc, v9, v14 -; CGP-NEXT: v_subb_u32_e64 v13, s[4:5], v3, v12, vcc -; CGP-NEXT: v_sub_i32_e64 v3, s[4:5], v3, v12 -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v13, v7 -; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v7, vcc -; CGP-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v5 -; CGP-NEXT: v_sub_i32_e32 v9, vcc, v9, v5 +; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v11 +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v11, vcc, v12, v11 +; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v10 +; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v10 +; CGP-NEXT: v_mul_lo_u32 v10, v6, v7 +; CGP-NEXT: v_mul_lo_u32 v11, v5, v8 +; CGP-NEXT: v_mul_hi_u32 v13, v5, v7 +; CGP-NEXT: v_mul_lo_u32 v12, v5, v7 +; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v11 +; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v13 +; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v12 +; CGP-NEXT: v_subb_u32_e64 v11, s[4:5], v3, v10, vcc +; CGP-NEXT: v_sub_i32_e64 v3, s[4:5], v3, v10 +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v11, v6 +; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v6, vcc +; CGP-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5] +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v5 +; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v5 ; CGP-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc -; CGP-NEXT: v_cndmask_b32_e64 v14, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v13, v7 -; CGP-NEXT: v_add_i32_e32 v13, vcc, 1, v8 -; CGP-NEXT: v_cndmask_b32_e64 v12, v12, v14, s[4:5] -; CGP-NEXT: v_addc_u32_e32 v14, vcc, 0, v11, vcc -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v3, v7 -; CGP-NEXT: v_cndmask_b32_e64 v15, 0, -1, vcc -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v9, v5 -; CGP-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc -; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 -; CGP-NEXT: v_cndmask_b32_e32 
v3, v15, v5, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, 1, v13 -; CGP-NEXT: v_addc_u32_e32 v7, vcc, 0, v14, vcc -; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 -; CGP-NEXT: v_cndmask_b32_e32 v3, v13, v5, vcc -; CGP-NEXT: v_cndmask_b32_e32 v5, v14, v7, vcc -; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; CGP-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[4:5] +; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v11, v6 +; CGP-NEXT: v_add_i32_e32 v11, vcc, 1, v7 +; CGP-NEXT: v_cndmask_b32_e64 v10, v10, v12, s[4:5] +; CGP-NEXT: v_addc_u32_e32 v12, vcc, 0, v8, vcc +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v3, v6 +; CGP-NEXT: v_cndmask_b32_e64 v13, 0, -1, vcc +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v2, v5 +; CGP-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc +; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 +; CGP-NEXT: v_cndmask_b32_e32 v2, v13, v2, vcc +; CGP-NEXT: v_add_i32_e32 v3, vcc, 1, v11 +; CGP-NEXT: v_addc_u32_e32 v5, vcc, 0, v12, vcc +; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; CGP-NEXT: v_cndmask_b32_e32 v2, v11, v3, vcc +; CGP-NEXT: v_cndmask_b32_e32 v3, v12, v5, vcc +; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; CGP-NEXT: v_cndmask_b32_e32 v2, v7, v2, vcc +; CGP-NEXT: v_xor_b32_e32 v5, v9, v4 ; CGP-NEXT: v_cndmask_b32_e32 v3, v8, v3, vcc -; CGP-NEXT: v_xor_b32_e32 v7, v10, v4 -; CGP-NEXT: v_cndmask_b32_e32 v5, v11, v5, vcc -; CGP-NEXT: v_xor_b32_e32 v3, v3, v7 -; CGP-NEXT: v_xor_b32_e32 v5, v5, v7 -; CGP-NEXT: v_sub_i32_e32 v4, vcc, v3, v7 -; CGP-NEXT: v_subb_u32_e32 v5, vcc, v5, v7, vcc +; CGP-NEXT: v_xor_b32_e32 v2, v2, v5 +; CGP-NEXT: v_sub_i32_e32 v4, vcc, v2, v5 +; CGP-NEXT: v_xor_b32_e32 v3, v3, v5 +; CGP-NEXT: v_subb_u32_e32 v5, vcc, v3, v5, vcc +; CGP-NEXT: ; implicit-def: $vgpr2 +; CGP-NEXT: ; implicit-def: $vgpr6 ; CGP-NEXT: BB2_6: ; %Flow ; CGP-NEXT: s_or_saveexec_b64 s[6:7], s[6:7] ; CGP-NEXT: s_xor_b64 exec, exec, s[6:7] @@ -2516,146 +2522,148 @@ ; CHECK-NEXT: ; %bb.1: ; CHECK-NEXT: v_ashrrev_i32_e32 v2, 31, v5 ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v4, v2 -; CHECK-NEXT: v_addc_u32_e32 v5, vcc, v5, v2, vcc +; 
CHECK-NEXT: v_addc_u32_e32 v4, vcc, v5, v2, vcc ; CHECK-NEXT: v_xor_b32_e32 v3, v3, v2 -; CHECK-NEXT: v_xor_b32_e32 v5, v5, v2 -; CHECK-NEXT: v_cvt_f32_u32_e32 v6, v3 -; CHECK-NEXT: v_cvt_f32_u32_e32 v7, v5 -; CHECK-NEXT: v_ashrrev_i32_e32 v8, 31, v1 -; CHECK-NEXT: v_mac_f32_e32 v6, 0x4f800000, v7 -; CHECK-NEXT: v_rcp_iflag_f32_e32 v6, v6 -; CHECK-NEXT: v_add_i32_e32 v7, vcc, v0, v8 -; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v1, v8, vcc -; CHECK-NEXT: v_sub_i32_e32 v10, vcc, 0, v3 -; CHECK-NEXT: v_mul_f32_e32 v6, 0x5f7ffffc, v6 -; CHECK-NEXT: v_mul_f32_e32 v9, 0x2f800000, v6 -; CHECK-NEXT: v_trunc_f32_e32 v9, v9 -; CHECK-NEXT: v_mac_f32_e32 v6, 0xcf800000, v9 +; CHECK-NEXT: v_xor_b32_e32 v4, v4, v2 +; CHECK-NEXT: v_cvt_f32_u32_e32 v5, v3 +; CHECK-NEXT: v_cvt_f32_u32_e32 v6, v4 +; CHECK-NEXT: v_ashrrev_i32_e32 v7, 31, v1 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v7 +; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v1, v7, vcc +; CHECK-NEXT: v_mac_f32_e32 v5, 0x4f800000, v6 +; CHECK-NEXT: v_rcp_iflag_f32_e32 v5, v5 +; CHECK-NEXT: v_sub_i32_e32 v8, vcc, 0, v3 +; CHECK-NEXT: v_subb_u32_e32 v9, vcc, 0, v4, vcc +; CHECK-NEXT: v_xor_b32_e32 v0, v0, v7 +; CHECK-NEXT: v_mul_f32_e32 v5, 0x5f7ffffc, v5 +; CHECK-NEXT: v_mul_f32_e32 v6, 0x2f800000, v5 +; CHECK-NEXT: v_trunc_f32_e32 v6, v6 +; CHECK-NEXT: v_mac_f32_e32 v5, 0xcf800000, v6 +; CHECK-NEXT: v_cvt_u32_f32_e32 v5, v5 ; CHECK-NEXT: v_cvt_u32_f32_e32 v6, v6 -; CHECK-NEXT: v_cvt_u32_f32_e32 v9, v9 -; CHECK-NEXT: v_subb_u32_e32 v11, vcc, 0, v5, vcc -; CHECK-NEXT: v_xor_b32_e32 v7, v7, v8 -; CHECK-NEXT: v_mul_lo_u32 v12, v11, v6 -; CHECK-NEXT: v_mul_lo_u32 v13, v10, v9 -; CHECK-NEXT: v_mul_hi_u32 v15, v10, v6 -; CHECK-NEXT: v_mul_lo_u32 v14, v10, v6 -; CHECK-NEXT: v_xor_b32_e32 v1, v1, v8 +; CHECK-NEXT: v_xor_b32_e32 v1, v1, v7 +; CHECK-NEXT: v_mul_lo_u32 v10, v9, v5 +; CHECK-NEXT: v_mul_lo_u32 v11, v8, v6 +; CHECK-NEXT: v_mul_hi_u32 v13, v8, v5 +; CHECK-NEXT: v_mul_lo_u32 v12, v8, v5 +; CHECK-NEXT: v_add_i32_e32 v10, vcc, v10, v11 +; 
CHECK-NEXT: v_add_i32_e32 v10, vcc, v10, v13 +; CHECK-NEXT: v_mul_lo_u32 v11, v6, v12 +; CHECK-NEXT: v_mul_lo_u32 v13, v5, v10 +; CHECK-NEXT: v_mul_hi_u32 v14, v5, v12 +; CHECK-NEXT: v_mul_hi_u32 v12, v6, v12 +; CHECK-NEXT: v_add_i32_e32 v11, vcc, v11, v13 +; CHECK-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v11, vcc, v11, v14 +; CHECK-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; CHECK-NEXT: v_mul_lo_u32 v14, v6, v10 +; CHECK-NEXT: v_add_i32_e32 v11, vcc, v13, v11 +; CHECK-NEXT: v_mul_hi_u32 v13, v5, v10 +; CHECK-NEXT: v_mul_hi_u32 v10, v6, v10 +; CHECK-NEXT: v_add_i32_e32 v12, vcc, v14, v12 +; CHECK-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v12, vcc, v12, v13 -; CHECK-NEXT: v_add_i32_e32 v12, vcc, v12, v15 -; CHECK-NEXT: v_mul_lo_u32 v13, v9, v14 -; CHECK-NEXT: v_mul_lo_u32 v15, v6, v12 -; CHECK-NEXT: v_mul_hi_u32 v16, v6, v14 -; CHECK-NEXT: v_mul_hi_u32 v14, v9, v14 -; CHECK-NEXT: v_add_i32_e32 v13, vcc, v13, v15 -; CHECK-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v13, vcc, v13, v16 ; CHECK-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CHECK-NEXT: v_mul_lo_u32 v16, v9, v12 -; CHECK-NEXT: v_add_i32_e32 v13, vcc, v15, v13 -; CHECK-NEXT: v_mul_hi_u32 v15, v6, v12 -; CHECK-NEXT: v_mul_hi_u32 v12, v9, v12 -; CHECK-NEXT: v_add_i32_e32 v14, vcc, v16, v14 -; CHECK-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v14, vcc, v14, v15 -; CHECK-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v15, vcc, v16, v15 ; CHECK-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; CHECK-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v14, vcc, v15, v14 -; CHECK-NEXT: v_add_i32_e32 v12, vcc, v12, v14 -; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v13 -; CHECK-NEXT: v_addc_u32_e64 v13, s[4:5], v9, v12, vcc -; CHECK-NEXT: v_mul_lo_u32 v11, v11, v6 -; CHECK-NEXT: v_mul_lo_u32 v14, v10, v13 -; CHECK-NEXT: v_mul_lo_u32 v15, v10, v6 -; CHECK-NEXT: v_mul_hi_u32 v10, v10, v6 +; 
CHECK-NEXT: v_add_i32_e32 v11, vcc, v12, v11 +; CHECK-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v12, vcc, v13, v12 +; CHECK-NEXT: v_add_i32_e32 v10, vcc, v10, v12 +; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v11 +; CHECK-NEXT: v_addc_u32_e64 v11, s[4:5], v6, v10, vcc +; CHECK-NEXT: v_mul_lo_u32 v9, v9, v5 +; CHECK-NEXT: v_mul_lo_u32 v12, v8, v11 +; CHECK-NEXT: v_mul_lo_u32 v13, v8, v5 +; CHECK-NEXT: v_mul_hi_u32 v8, v8, v5 +; CHECK-NEXT: v_add_i32_e64 v6, s[4:5], v6, v10 +; CHECK-NEXT: v_add_i32_e64 v9, s[4:5], v9, v12 +; CHECK-NEXT: v_mul_hi_u32 v10, v5, v13 +; CHECK-NEXT: v_add_i32_e64 v8, s[4:5], v9, v8 +; CHECK-NEXT: v_mul_lo_u32 v9, v11, v13 +; CHECK-NEXT: v_mul_lo_u32 v12, v5, v8 +; CHECK-NEXT: v_mul_hi_u32 v13, v11, v13 ; CHECK-NEXT: v_add_i32_e64 v9, s[4:5], v9, v12 -; CHECK-NEXT: v_add_i32_e64 v11, s[4:5], v11, v14 -; CHECK-NEXT: v_mul_hi_u32 v12, v6, v15 -; CHECK-NEXT: v_add_i32_e64 v10, s[4:5], v11, v10 -; CHECK-NEXT: v_mul_lo_u32 v11, v13, v15 -; CHECK-NEXT: v_mul_lo_u32 v14, v6, v10 -; CHECK-NEXT: v_mul_hi_u32 v15, v13, v15 -; CHECK-NEXT: v_add_i32_e64 v11, s[4:5], v11, v14 -; CHECK-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5] -; CHECK-NEXT: v_add_i32_e64 v11, s[4:5], v11, v12 -; CHECK-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[4:5] -; CHECK-NEXT: v_mul_lo_u32 v12, v13, v10 -; CHECK-NEXT: v_add_i32_e64 v11, s[4:5], v14, v11 -; CHECK-NEXT: v_mul_hi_u32 v14, v6, v10 -; CHECK-NEXT: v_mul_hi_u32 v10, v13, v10 -; CHECK-NEXT: v_add_i32_e64 v12, s[4:5], v12, v15 -; CHECK-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[4:5] -; CHECK-NEXT: v_add_i32_e64 v12, s[4:5], v12, v14 -; CHECK-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5] -; CHECK-NEXT: v_add_i32_e64 v14, s[4:5], v15, v14 -; CHECK-NEXT: v_add_i32_e64 v11, s[4:5], v12, v11 ; CHECK-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] -; CHECK-NEXT: v_add_i32_e64 v12, s[4:5], v14, v12 +; CHECK-NEXT: v_add_i32_e64 v9, s[4:5], v9, v10 +; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[4:5] +; CHECK-NEXT: v_mul_lo_u32 v10, v11, 
v8 +; CHECK-NEXT: v_add_i32_e64 v9, s[4:5], v12, v9 +; CHECK-NEXT: v_mul_hi_u32 v12, v5, v8 +; CHECK-NEXT: v_mul_hi_u32 v8, v11, v8 +; CHECK-NEXT: v_add_i32_e64 v10, s[4:5], v10, v13 +; CHECK-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5] ; CHECK-NEXT: v_add_i32_e64 v10, s[4:5], v10, v12 -; CHECK-NEXT: v_addc_u32_e32 v9, vcc, v9, v10, vcc -; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v11 -; CHECK-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; CHECK-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] +; CHECK-NEXT: v_add_i32_e64 v12, s[4:5], v13, v12 +; CHECK-NEXT: v_add_i32_e64 v9, s[4:5], v10, v9 +; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[4:5] +; CHECK-NEXT: v_add_i32_e64 v10, s[4:5], v12, v10 +; CHECK-NEXT: v_add_i32_e64 v8, s[4:5], v8, v10 +; CHECK-NEXT: v_addc_u32_e32 v6, vcc, v6, v8, vcc +; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v9 +; CHECK-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc +; CHECK-NEXT: v_mul_lo_u32 v8, v1, v5 +; CHECK-NEXT: v_mul_lo_u32 v9, v0, v6 +; CHECK-NEXT: v_mul_hi_u32 v10, v0, v5 +; CHECK-NEXT: v_mul_hi_u32 v5, v1, v5 +; CHECK-NEXT: v_add_i32_e32 v8, vcc, v8, v9 +; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v8, vcc, v8, v10 +; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; CHECK-NEXT: v_mul_lo_u32 v10, v1, v6 -; CHECK-NEXT: v_mul_lo_u32 v11, v7, v9 -; CHECK-NEXT: v_mul_hi_u32 v12, v7, v6 +; CHECK-NEXT: v_add_i32_e32 v8, vcc, v9, v8 +; CHECK-NEXT: v_mul_hi_u32 v9, v0, v6 ; CHECK-NEXT: v_mul_hi_u32 v6, v1, v6 -; CHECK-NEXT: v_add_i32_e32 v10, vcc, v10, v11 -; CHECK-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v10, vcc, v10, v12 +; CHECK-NEXT: v_add_i32_e32 v5, vcc, v10, v5 ; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CHECK-NEXT: v_mul_lo_u32 v12, v1, v9 -; CHECK-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; CHECK-NEXT: v_mul_hi_u32 v11, v7, v9 -; CHECK-NEXT: v_mul_hi_u32 v9, v1, v9 -; CHECK-NEXT: v_add_i32_e32 v6, vcc, v12, v6 -; CHECK-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v6, vcc, 
v6, v11 -; CHECK-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v11, vcc, v12, v11 -; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v10 -; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; CHECK-NEXT: v_add_i32_e32 v9, vcc, v9, v10 -; CHECK-NEXT: v_mul_lo_u32 v10, v5, v6 -; CHECK-NEXT: v_mul_lo_u32 v11, v3, v9 -; CHECK-NEXT: v_mul_hi_u32 v13, v3, v6 -; CHECK-NEXT: v_mul_lo_u32 v12, v3, v6 -; CHECK-NEXT: v_add_i32_e32 v10, vcc, v10, v11 -; CHECK-NEXT: v_add_i32_e32 v10, vcc, v10, v13 -; CHECK-NEXT: v_sub_i32_e32 v7, vcc, v7, v12 -; CHECK-NEXT: v_subb_u32_e64 v11, s[4:5], v1, v10, vcc -; CHECK-NEXT: v_sub_i32_e64 v1, s[4:5], v1, v10 -; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v11, v5 -; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v5, vcc -; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5] -; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v7, v3 -; CHECK-NEXT: v_sub_i32_e32 v7, vcc, v7, v3 +; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v9 +; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v9, vcc, v10, v9 +; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v8 +; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v8, vcc, v9, v8 +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v8 +; CHECK-NEXT: v_mul_lo_u32 v8, v4, v5 +; CHECK-NEXT: v_mul_lo_u32 v9, v3, v6 +; CHECK-NEXT: v_mul_hi_u32 v11, v3, v5 +; CHECK-NEXT: v_mul_lo_u32 v10, v3, v5 +; CHECK-NEXT: v_add_i32_e32 v8, vcc, v8, v9 +; CHECK-NEXT: v_add_i32_e32 v8, vcc, v8, v11 +; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v10 +; CHECK-NEXT: v_subb_u32_e64 v9, s[4:5], v1, v8, vcc +; CHECK-NEXT: v_sub_i32_e64 v1, s[4:5], v1, v8 +; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v4 +; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v4, vcc +; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5] +; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v3 +; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v3 ; CHECK-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc -; CHECK-NEXT: v_cndmask_b32_e64 v12, 0, -1, 
s[4:5] -; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], v11, v5 -; CHECK-NEXT: v_add_i32_e32 v11, vcc, 1, v6 -; CHECK-NEXT: v_cndmask_b32_e64 v10, v10, v12, s[4:5] -; CHECK-NEXT: v_addc_u32_e32 v12, vcc, 0, v9, vcc -; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v1, v5 -; CHECK-NEXT: v_cndmask_b32_e64 v13, 0, -1, vcc -; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v7, v3 -; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc -; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5 -; CHECK-NEXT: v_cndmask_b32_e32 v1, v13, v3, vcc -; CHECK-NEXT: v_add_i32_e32 v3, vcc, 1, v11 -; CHECK-NEXT: v_addc_u32_e32 v5, vcc, 0, v12, vcc -; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; CHECK-NEXT: v_cndmask_b32_e32 v1, v11, v3, vcc -; CHECK-NEXT: v_cndmask_b32_e32 v3, v12, v5, vcc -; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5] +; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], v9, v4 +; CHECK-NEXT: v_add_i32_e32 v9, vcc, 1, v5 +; CHECK-NEXT: v_cndmask_b32_e64 v8, v8, v10, s[4:5] +; CHECK-NEXT: v_addc_u32_e32 v10, vcc, 0, v6, vcc +; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v1, v4 +; CHECK-NEXT: v_cndmask_b32_e64 v11, 0, -1, vcc +; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v0, v3 +; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc +; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, v1, v4 +; CHECK-NEXT: v_cndmask_b32_e32 v0, v11, v0, vcc +; CHECK-NEXT: v_add_i32_e32 v1, vcc, 1, v9 +; CHECK-NEXT: v_addc_u32_e32 v3, vcc, 0, v10, vcc +; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; CHECK-NEXT: v_cndmask_b32_e32 v0, v9, v1, vcc +; CHECK-NEXT: v_cndmask_b32_e32 v1, v10, v3, vcc +; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; CHECK-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc +; CHECK-NEXT: v_xor_b32_e32 v3, v7, v2 ; CHECK-NEXT: v_cndmask_b32_e32 v1, v6, v1, vcc -; CHECK-NEXT: v_xor_b32_e32 v5, v8, v2 -; CHECK-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc -; CHECK-NEXT: v_xor_b32_e32 v1, v1, v5 -; CHECK-NEXT: v_xor_b32_e32 v3, v3, v5 -; CHECK-NEXT: v_sub_i32_e32 v2, vcc, v1, v5 -; CHECK-NEXT: v_subb_u32_e32 v3, vcc, v3, v5, vcc +; CHECK-NEXT: 
v_xor_b32_e32 v0, v0, v3 +; CHECK-NEXT: v_sub_i32_e32 v2, vcc, v0, v3 +; CHECK-NEXT: v_xor_b32_e32 v1, v1, v3 +; CHECK-NEXT: v_subb_u32_e32 v3, vcc, v1, v3, vcc +; CHECK-NEXT: ; implicit-def: $vgpr0 +; CHECK-NEXT: ; implicit-def: $vgpr4_vgpr5 ; CHECK-NEXT: BB7_2: ; %Flow ; CHECK-NEXT: s_or_saveexec_b64 s[6:7], s[6:7] ; CHECK-NEXT: s_xor_b64 exec, exec, s[6:7] @@ -3008,142 +3016,144 @@ ; CGP-NEXT: v_xor_b32_e32 v1, v1, v0 ; CGP-NEXT: v_xor_b32_e32 v4, v4, v0 ; CGP-NEXT: v_cvt_f32_u32_e32 v6, v1 -; CGP-NEXT: v_cvt_f32_u32_e32 v11, v4 -; CGP-NEXT: v_ashrrev_i32_e32 v12, 31, v7 -; CGP-NEXT: v_mac_f32_e32 v6, 0x4f800000, v11 +; CGP-NEXT: v_cvt_f32_u32_e32 v10, v4 +; CGP-NEXT: v_ashrrev_i32_e32 v11, 31, v7 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v11 +; CGP-NEXT: v_addc_u32_e32 v7, vcc, v7, v11, vcc +; CGP-NEXT: v_mac_f32_e32 v6, 0x4f800000, v10 ; CGP-NEXT: v_rcp_iflag_f32_e32 v6, v6 -; CGP-NEXT: v_add_i32_e32 v11, vcc, v5, v12 -; CGP-NEXT: v_addc_u32_e32 v7, vcc, v7, v12, vcc -; CGP-NEXT: v_sub_i32_e32 v14, vcc, 0, v1 +; CGP-NEXT: v_sub_i32_e32 v12, vcc, 0, v1 +; CGP-NEXT: v_subb_u32_e32 v13, vcc, 0, v4, vcc +; CGP-NEXT: v_xor_b32_e32 v5, v5, v11 ; CGP-NEXT: v_mul_f32_e32 v6, 0x5f7ffffc, v6 -; CGP-NEXT: v_mul_f32_e32 v13, 0x2f800000, v6 -; CGP-NEXT: v_trunc_f32_e32 v13, v13 -; CGP-NEXT: v_mac_f32_e32 v6, 0xcf800000, v13 +; CGP-NEXT: v_mul_f32_e32 v10, 0x2f800000, v6 +; CGP-NEXT: v_trunc_f32_e32 v10, v10 +; CGP-NEXT: v_mac_f32_e32 v6, 0xcf800000, v10 ; CGP-NEXT: v_cvt_u32_f32_e32 v6, v6 -; CGP-NEXT: v_cvt_u32_f32_e32 v13, v13 -; CGP-NEXT: v_subb_u32_e32 v15, vcc, 0, v4, vcc -; CGP-NEXT: v_xor_b32_e32 v11, v11, v12 -; CGP-NEXT: v_mul_lo_u32 v16, v15, v6 -; CGP-NEXT: v_mul_lo_u32 v17, v14, v13 -; CGP-NEXT: v_mul_hi_u32 v19, v14, v6 -; CGP-NEXT: v_mul_lo_u32 v18, v14, v6 -; CGP-NEXT: v_xor_b32_e32 v7, v7, v12 +; CGP-NEXT: v_cvt_u32_f32_e32 v10, v10 +; CGP-NEXT: v_xor_b32_e32 v7, v7, v11 +; CGP-NEXT: v_mul_lo_u32 v14, v13, v6 +; CGP-NEXT: v_mul_lo_u32 v15, v12, v10 +; 
CGP-NEXT: v_mul_hi_u32 v17, v12, v6 +; CGP-NEXT: v_mul_lo_u32 v16, v12, v6 +; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v15 +; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v17 +; CGP-NEXT: v_mul_lo_u32 v15, v10, v16 +; CGP-NEXT: v_mul_lo_u32 v17, v6, v14 +; CGP-NEXT: v_mul_hi_u32 v18, v6, v16 +; CGP-NEXT: v_mul_hi_u32 v16, v10, v16 +; CGP-NEXT: v_add_i32_e32 v15, vcc, v15, v17 +; CGP-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v15, vcc, v15, v18 +; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc +; CGP-NEXT: v_mul_lo_u32 v18, v10, v14 +; CGP-NEXT: v_add_i32_e32 v15, vcc, v17, v15 +; CGP-NEXT: v_mul_hi_u32 v17, v6, v14 +; CGP-NEXT: v_mul_hi_u32 v14, v10, v14 +; CGP-NEXT: v_add_i32_e32 v16, vcc, v18, v16 +; CGP-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v16, vcc, v16, v17 -; CGP-NEXT: v_add_i32_e32 v16, vcc, v16, v19 -; CGP-NEXT: v_mul_lo_u32 v17, v13, v18 -; CGP-NEXT: v_mul_lo_u32 v19, v6, v16 -; CGP-NEXT: v_mul_hi_u32 v20, v6, v18 -; CGP-NEXT: v_mul_hi_u32 v18, v13, v18 -; CGP-NEXT: v_add_i32_e32 v17, vcc, v17, v19 -; CGP-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v17, vcc, v17, v20 ; CGP-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v20, v13, v16 -; CGP-NEXT: v_add_i32_e32 v17, vcc, v19, v17 -; CGP-NEXT: v_mul_hi_u32 v19, v6, v16 -; CGP-NEXT: v_mul_hi_u32 v16, v13, v16 -; CGP-NEXT: v_add_i32_e32 v18, vcc, v20, v18 -; CGP-NEXT: v_cndmask_b32_e64 v20, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v18, vcc, v18, v19 -; CGP-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v19, vcc, v20, v19 ; CGP-NEXT: v_add_i32_e32 v17, vcc, v18, v17 -; CGP-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v18, vcc, v19, v18 -; CGP-NEXT: v_add_i32_e32 v16, vcc, v16, v18 -; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v17 -; CGP-NEXT: v_addc_u32_e64 v17, s[4:5], v13, v16, vcc -; CGP-NEXT: v_mul_lo_u32 v15, v15, v6 -; CGP-NEXT: v_mul_lo_u32 v18, v14, v17 -; CGP-NEXT: v_mul_lo_u32 v19, v14, v6 -; 
CGP-NEXT: v_mul_hi_u32 v14, v14, v6 +; CGP-NEXT: v_add_i32_e32 v15, vcc, v16, v15 +; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v16, vcc, v17, v16 +; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v16 +; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v15 +; CGP-NEXT: v_addc_u32_e64 v15, s[4:5], v10, v14, vcc +; CGP-NEXT: v_mul_lo_u32 v13, v13, v6 +; CGP-NEXT: v_mul_lo_u32 v16, v12, v15 +; CGP-NEXT: v_mul_lo_u32 v17, v12, v6 +; CGP-NEXT: v_mul_hi_u32 v12, v12, v6 +; CGP-NEXT: v_add_i32_e64 v10, s[4:5], v10, v14 +; CGP-NEXT: v_add_i32_e64 v13, s[4:5], v13, v16 +; CGP-NEXT: v_mul_hi_u32 v14, v6, v17 +; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v13, v12 +; CGP-NEXT: v_mul_lo_u32 v13, v15, v17 +; CGP-NEXT: v_mul_lo_u32 v16, v6, v12 +; CGP-NEXT: v_mul_hi_u32 v17, v15, v17 ; CGP-NEXT: v_add_i32_e64 v13, s[4:5], v13, v16 -; CGP-NEXT: v_add_i32_e64 v15, s[4:5], v15, v18 -; CGP-NEXT: v_mul_hi_u32 v16, v6, v19 -; CGP-NEXT: v_add_i32_e64 v14, s[4:5], v15, v14 -; CGP-NEXT: v_mul_lo_u32 v15, v17, v19 -; CGP-NEXT: v_mul_lo_u32 v18, v6, v14 -; CGP-NEXT: v_mul_hi_u32 v19, v17, v19 -; CGP-NEXT: v_add_i32_e64 v15, s[4:5], v15, v18 -; CGP-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v15, s[4:5], v15, v16 -; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[4:5] -; CGP-NEXT: v_mul_lo_u32 v16, v17, v14 -; CGP-NEXT: v_add_i32_e64 v15, s[4:5], v18, v15 -; CGP-NEXT: v_mul_hi_u32 v18, v6, v14 -; CGP-NEXT: v_mul_hi_u32 v14, v17, v14 -; CGP-NEXT: v_add_i32_e64 v16, s[4:5], v16, v19 -; CGP-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v16, s[4:5], v16, v18 -; CGP-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v18, s[4:5], v19, v18 -; CGP-NEXT: v_add_i32_e64 v15, s[4:5], v16, v15 ; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v16, s[4:5], v18, v16 +; CGP-NEXT: v_add_i32_e64 v13, s[4:5], v13, v14 +; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5] +; CGP-NEXT: v_mul_lo_u32 v14, v15, v12 +; CGP-NEXT: 
v_add_i32_e64 v13, s[4:5], v16, v13 +; CGP-NEXT: v_mul_hi_u32 v16, v6, v12 +; CGP-NEXT: v_mul_hi_u32 v12, v15, v12 +; CGP-NEXT: v_add_i32_e64 v14, s[4:5], v14, v17 +; CGP-NEXT: v_cndmask_b32_e64 v17, 0, 1, s[4:5] ; CGP-NEXT: v_add_i32_e64 v14, s[4:5], v14, v16 -; CGP-NEXT: v_addc_u32_e32 v13, vcc, v13, v14, vcc -; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v15 -; CGP-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc -; CGP-NEXT: v_mul_lo_u32 v14, v7, v6 -; CGP-NEXT: v_mul_lo_u32 v15, v11, v13 -; CGP-NEXT: v_mul_hi_u32 v16, v11, v6 +; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[4:5] +; CGP-NEXT: v_add_i32_e64 v16, s[4:5], v17, v16 +; CGP-NEXT: v_add_i32_e64 v13, s[4:5], v14, v13 +; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5] +; CGP-NEXT: v_add_i32_e64 v14, s[4:5], v16, v14 +; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v12, v14 +; CGP-NEXT: v_addc_u32_e32 v10, vcc, v10, v12, vcc +; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v13 +; CGP-NEXT: v_addc_u32_e32 v10, vcc, 0, v10, vcc +; CGP-NEXT: v_mul_lo_u32 v12, v7, v6 +; CGP-NEXT: v_mul_lo_u32 v13, v5, v10 +; CGP-NEXT: v_mul_hi_u32 v14, v5, v6 ; CGP-NEXT: v_mul_hi_u32 v6, v7, v6 -; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v15 -; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v16 -; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v16, v7, v13 -; CGP-NEXT: v_add_i32_e32 v14, vcc, v15, v14 -; CGP-NEXT: v_mul_hi_u32 v15, v11, v13 -; CGP-NEXT: v_mul_hi_u32 v13, v7, v13 -; CGP-NEXT: v_add_i32_e32 v6, vcc, v16, v6 -; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v15 -; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v15, vcc, v16, v15 -; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v14 +; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v13 +; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v14 +; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; CGP-NEXT: v_mul_lo_u32 v14, v7, v10 +; CGP-NEXT: v_add_i32_e32 v12, vcc, v13, 
v12 +; CGP-NEXT: v_mul_hi_u32 v13, v5, v10 +; CGP-NEXT: v_mul_hi_u32 v10, v7, v10 +; CGP-NEXT: v_add_i32_e32 v6, vcc, v14, v6 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v14, vcc, v15, v14 -; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v14 -; CGP-NEXT: v_mul_lo_u32 v14, v4, v6 -; CGP-NEXT: v_mul_lo_u32 v15, v1, v13 -; CGP-NEXT: v_mul_hi_u32 v17, v1, v6 -; CGP-NEXT: v_mul_lo_u32 v16, v1, v6 -; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v15 -; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v17 -; CGP-NEXT: v_sub_i32_e32 v11, vcc, v11, v16 -; CGP-NEXT: v_subb_u32_e64 v15, s[4:5], v7, v14, vcc -; CGP-NEXT: v_sub_i32_e64 v7, s[4:5], v7, v14 -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v15, v4 +; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v13 +; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13 +; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v12 +; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v12, vcc, v13, v12 +; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v12 +; CGP-NEXT: v_mul_lo_u32 v12, v4, v6 +; CGP-NEXT: v_mul_lo_u32 v13, v1, v10 +; CGP-NEXT: v_mul_hi_u32 v15, v1, v6 +; CGP-NEXT: v_mul_lo_u32 v14, v1, v6 +; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v13 +; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v15 +; CGP-NEXT: v_sub_i32_e32 v5, vcc, v5, v14 +; CGP-NEXT: v_subb_u32_e64 v13, s[4:5], v7, v12, vcc +; CGP-NEXT: v_sub_i32_e64 v7, s[4:5], v7, v12 +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v13, v4 ; CGP-NEXT: v_subb_u32_e32 v7, vcc, v7, v4, vcc -; CGP-NEXT: v_cndmask_b32_e64 v14, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v11, v1 -; CGP-NEXT: v_sub_i32_e32 v11, vcc, v11, v1 +; CGP-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[4:5] +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v5, v1 +; CGP-NEXT: v_sub_i32_e32 v5, vcc, v5, v1 ; CGP-NEXT: v_subbrev_u32_e32 v7, vcc, 0, v7, vcc -; CGP-NEXT: v_cndmask_b32_e64 v16, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v15, v4 -; CGP-NEXT: v_add_i32_e32 v15, vcc, 1, v6 -; CGP-NEXT: 
v_cndmask_b32_e64 v14, v14, v16, s[4:5] -; CGP-NEXT: v_addc_u32_e32 v16, vcc, 0, v13, vcc +; CGP-NEXT: v_cndmask_b32_e64 v14, 0, -1, s[4:5] +; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v13, v4 +; CGP-NEXT: v_add_i32_e32 v13, vcc, 1, v6 +; CGP-NEXT: v_cndmask_b32_e64 v12, v12, v14, s[4:5] +; CGP-NEXT: v_addc_u32_e32 v14, vcc, 0, v10, vcc ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v7, v4 -; CGP-NEXT: v_cndmask_b32_e64 v17, 0, -1, vcc -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v11, v1 +; CGP-NEXT: v_cndmask_b32_e64 v15, 0, -1, vcc +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v5, v1 ; CGP-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc ; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v7, v4 -; CGP-NEXT: v_cndmask_b32_e32 v1, v17, v1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, 1, v15 -; CGP-NEXT: v_addc_u32_e32 v7, vcc, 0, v16, vcc +; CGP-NEXT: v_cndmask_b32_e32 v1, v15, v1, vcc +; CGP-NEXT: v_add_i32_e32 v4, vcc, 1, v13 +; CGP-NEXT: v_addc_u32_e32 v5, vcc, 0, v14, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; CGP-NEXT: v_cndmask_b32_e32 v1, v15, v4, vcc -; CGP-NEXT: v_cndmask_b32_e32 v4, v16, v7, vcc -; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; CGP-NEXT: v_cndmask_b32_e32 v1, v13, v4, vcc +; CGP-NEXT: v_cndmask_b32_e32 v4, v14, v5, vcc +; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 ; CGP-NEXT: v_cndmask_b32_e32 v1, v6, v1, vcc -; CGP-NEXT: v_xor_b32_e32 v6, v12, v0 -; CGP-NEXT: v_cndmask_b32_e32 v4, v13, v4, vcc -; CGP-NEXT: v_xor_b32_e32 v0, v1, v6 -; CGP-NEXT: v_xor_b32_e32 v1, v4, v6 -; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v6 -; CGP-NEXT: v_subb_u32_e32 v1, vcc, v1, v6, vcc +; CGP-NEXT: v_xor_b32_e32 v5, v11, v0 +; CGP-NEXT: v_cndmask_b32_e32 v4, v10, v4, vcc +; CGP-NEXT: v_xor_b32_e32 v0, v1, v5 +; CGP-NEXT: v_xor_b32_e32 v1, v4, v5 +; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v5 +; CGP-NEXT: v_subb_u32_e32 v1, vcc, v1, v5, vcc +; CGP-NEXT: ; implicit-def: $vgpr5 +; CGP-NEXT: ; implicit-def: $vgpr10_vgpr11 ; CGP-NEXT: BB8_2: ; %Flow2 ; CGP-NEXT: s_or_saveexec_b64 s[6:7], s[6:7] ; CGP-NEXT: s_xor_b64 exec, exec, s[6:7] @@ -3185,142 
+3195,144 @@ ; CGP-NEXT: v_xor_b32_e32 v5, v5, v4 ; CGP-NEXT: v_xor_b32_e32 v6, v6, v4 ; CGP-NEXT: v_cvt_f32_u32_e32 v7, v5 -; CGP-NEXT: v_cvt_f32_u32_e32 v9, v6 -; CGP-NEXT: v_ashrrev_i32_e32 v10, 31, v3 -; CGP-NEXT: v_mac_f32_e32 v7, 0x4f800000, v9 +; CGP-NEXT: v_cvt_f32_u32_e32 v8, v6 +; CGP-NEXT: v_ashrrev_i32_e32 v9, 31, v3 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v9 +; CGP-NEXT: v_addc_u32_e32 v3, vcc, v3, v9, vcc +; CGP-NEXT: v_mac_f32_e32 v7, 0x4f800000, v8 ; CGP-NEXT: v_rcp_iflag_f32_e32 v7, v7 -; CGP-NEXT: v_add_i32_e32 v9, vcc, v2, v10 -; CGP-NEXT: v_addc_u32_e32 v3, vcc, v3, v10, vcc -; CGP-NEXT: v_sub_i32_e32 v12, vcc, 0, v5 +; CGP-NEXT: v_sub_i32_e32 v10, vcc, 0, v5 +; CGP-NEXT: v_subb_u32_e32 v11, vcc, 0, v6, vcc +; CGP-NEXT: v_xor_b32_e32 v2, v2, v9 ; CGP-NEXT: v_mul_f32_e32 v7, 0x5f7ffffc, v7 -; CGP-NEXT: v_mul_f32_e32 v11, 0x2f800000, v7 -; CGP-NEXT: v_trunc_f32_e32 v11, v11 -; CGP-NEXT: v_mac_f32_e32 v7, 0xcf800000, v11 +; CGP-NEXT: v_mul_f32_e32 v8, 0x2f800000, v7 +; CGP-NEXT: v_trunc_f32_e32 v8, v8 +; CGP-NEXT: v_mac_f32_e32 v7, 0xcf800000, v8 ; CGP-NEXT: v_cvt_u32_f32_e32 v7, v7 -; CGP-NEXT: v_cvt_u32_f32_e32 v11, v11 -; CGP-NEXT: v_subb_u32_e32 v13, vcc, 0, v6, vcc -; CGP-NEXT: v_xor_b32_e32 v9, v9, v10 -; CGP-NEXT: v_mul_lo_u32 v14, v13, v7 -; CGP-NEXT: v_mul_lo_u32 v15, v12, v11 -; CGP-NEXT: v_mul_hi_u32 v17, v12, v7 -; CGP-NEXT: v_mul_lo_u32 v16, v12, v7 -; CGP-NEXT: v_xor_b32_e32 v3, v3, v10 +; CGP-NEXT: v_cvt_u32_f32_e32 v8, v8 +; CGP-NEXT: v_xor_b32_e32 v3, v3, v9 +; CGP-NEXT: v_mul_lo_u32 v12, v11, v7 +; CGP-NEXT: v_mul_lo_u32 v13, v10, v8 +; CGP-NEXT: v_mul_hi_u32 v15, v10, v7 +; CGP-NEXT: v_mul_lo_u32 v14, v10, v7 +; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v13 +; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v15 +; CGP-NEXT: v_mul_lo_u32 v13, v8, v14 +; CGP-NEXT: v_mul_lo_u32 v15, v7, v12 +; CGP-NEXT: v_mul_hi_u32 v16, v7, v14 +; CGP-NEXT: v_mul_hi_u32 v14, v8, v14 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v15 +; CGP-NEXT: v_cndmask_b32_e64 
v15, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v16 +; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; CGP-NEXT: v_mul_lo_u32 v16, v8, v12 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v15, v13 +; CGP-NEXT: v_mul_hi_u32 v15, v7, v12 +; CGP-NEXT: v_mul_hi_u32 v12, v8, v12 +; CGP-NEXT: v_add_i32_e32 v14, vcc, v16, v14 +; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v15 -; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v17 -; CGP-NEXT: v_mul_lo_u32 v15, v11, v16 -; CGP-NEXT: v_mul_lo_u32 v17, v7, v14 -; CGP-NEXT: v_mul_hi_u32 v18, v7, v16 -; CGP-NEXT: v_mul_hi_u32 v16, v11, v16 -; CGP-NEXT: v_add_i32_e32 v15, vcc, v15, v17 -; CGP-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v15, vcc, v15, v18 ; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v18, v11, v14 -; CGP-NEXT: v_add_i32_e32 v15, vcc, v17, v15 -; CGP-NEXT: v_mul_hi_u32 v17, v7, v14 -; CGP-NEXT: v_mul_hi_u32 v14, v11, v14 -; CGP-NEXT: v_add_i32_e32 v16, vcc, v18, v16 -; CGP-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v16, vcc, v16, v17 -; CGP-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v17, vcc, v18, v17 ; CGP-NEXT: v_add_i32_e32 v15, vcc, v16, v15 -; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v16, vcc, v17, v16 -; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v16 -; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v15 -; CGP-NEXT: v_addc_u32_e64 v15, s[4:5], v11, v14, vcc -; CGP-NEXT: v_mul_lo_u32 v13, v13, v7 -; CGP-NEXT: v_mul_lo_u32 v16, v12, v15 -; CGP-NEXT: v_mul_lo_u32 v17, v12, v7 -; CGP-NEXT: v_mul_hi_u32 v12, v12, v7 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13 +; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v14, vcc, v15, v14 +; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v14 +; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v13 +; CGP-NEXT: v_addc_u32_e64 v13, s[4:5], v8, v12, vcc +; CGP-NEXT: v_mul_lo_u32 v11, v11, v7 +; CGP-NEXT: v_mul_lo_u32 v14, v10, v13 +; 
CGP-NEXT: v_mul_lo_u32 v15, v10, v7 +; CGP-NEXT: v_mul_hi_u32 v10, v10, v7 +; CGP-NEXT: v_add_i32_e64 v8, s[4:5], v8, v12 +; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v11, v14 +; CGP-NEXT: v_mul_hi_u32 v12, v7, v15 +; CGP-NEXT: v_add_i32_e64 v10, s[4:5], v11, v10 +; CGP-NEXT: v_mul_lo_u32 v11, v13, v15 +; CGP-NEXT: v_mul_lo_u32 v14, v7, v10 +; CGP-NEXT: v_mul_hi_u32 v15, v13, v15 ; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v11, v14 -; CGP-NEXT: v_add_i32_e64 v13, s[4:5], v13, v16 -; CGP-NEXT: v_mul_hi_u32 v14, v7, v17 -; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v13, v12 -; CGP-NEXT: v_mul_lo_u32 v13, v15, v17 -; CGP-NEXT: v_mul_lo_u32 v16, v7, v12 -; CGP-NEXT: v_mul_hi_u32 v17, v15, v17 -; CGP-NEXT: v_add_i32_e64 v13, s[4:5], v13, v16 -; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v13, s[4:5], v13, v14 -; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5] -; CGP-NEXT: v_mul_lo_u32 v14, v15, v12 -; CGP-NEXT: v_add_i32_e64 v13, s[4:5], v16, v13 -; CGP-NEXT: v_mul_hi_u32 v16, v7, v12 -; CGP-NEXT: v_mul_hi_u32 v12, v15, v12 -; CGP-NEXT: v_add_i32_e64 v14, s[4:5], v14, v17 -; CGP-NEXT: v_cndmask_b32_e64 v17, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v14, s[4:5], v14, v16 -; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v16, s[4:5], v17, v16 -; CGP-NEXT: v_add_i32_e64 v13, s[4:5], v14, v13 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v14, s[4:5], v16, v14 +; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v11, v12 +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[4:5] +; CGP-NEXT: v_mul_lo_u32 v12, v13, v10 +; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v14, v11 +; CGP-NEXT: v_mul_hi_u32 v14, v7, v10 +; CGP-NEXT: v_mul_hi_u32 v10, v13, v10 +; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v12, v15 +; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[4:5] ; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v12, v14 -; CGP-NEXT: v_addc_u32_e32 v11, vcc, v11, v12, vcc -; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v13 -; CGP-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc -; 
CGP-NEXT: v_mul_lo_u32 v12, v3, v7 -; CGP-NEXT: v_mul_lo_u32 v13, v9, v11 -; CGP-NEXT: v_mul_hi_u32 v14, v9, v7 +; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5] +; CGP-NEXT: v_add_i32_e64 v14, s[4:5], v15, v14 +; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v12, v11 +; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] +; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v14, v12 +; CGP-NEXT: v_add_i32_e64 v10, s[4:5], v10, v12 +; CGP-NEXT: v_addc_u32_e32 v8, vcc, v8, v10, vcc +; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v11 +; CGP-NEXT: v_addc_u32_e32 v8, vcc, 0, v8, vcc +; CGP-NEXT: v_mul_lo_u32 v10, v3, v7 +; CGP-NEXT: v_mul_lo_u32 v11, v2, v8 +; CGP-NEXT: v_mul_hi_u32 v12, v2, v7 ; CGP-NEXT: v_mul_hi_u32 v7, v3, v7 -; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v13 -; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v14 -; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v14, v3, v11 -; CGP-NEXT: v_add_i32_e32 v12, vcc, v13, v12 -; CGP-NEXT: v_mul_hi_u32 v13, v9, v11 -; CGP-NEXT: v_mul_hi_u32 v11, v3, v11 -; CGP-NEXT: v_add_i32_e32 v7, vcc, v14, v7 -; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v13 -; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v12 +; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v11 +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v12 +; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; CGP-NEXT: v_mul_lo_u32 v12, v3, v8 +; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10 +; CGP-NEXT: v_mul_hi_u32 v11, v2, v8 +; CGP-NEXT: v_mul_hi_u32 v8, v3, v8 +; CGP-NEXT: v_add_i32_e32 v7, vcc, v12, v7 ; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v12, vcc, v13, v12 -; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v12 -; CGP-NEXT: v_mul_lo_u32 v12, v6, v7 -; CGP-NEXT: v_mul_lo_u32 v13, v5, v11 -; CGP-NEXT: v_mul_hi_u32 v15, v5, v7 -; CGP-NEXT: v_mul_lo_u32 v14, v5, v7 -; 
CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v13 -; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v15 -; CGP-NEXT: v_sub_i32_e32 v9, vcc, v9, v14 -; CGP-NEXT: v_subb_u32_e64 v13, s[4:5], v3, v12, vcc -; CGP-NEXT: v_sub_i32_e64 v3, s[4:5], v3, v12 -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v13, v6 +; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v11 +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v11, vcc, v12, v11 +; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v10 +; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v10 +; CGP-NEXT: v_mul_lo_u32 v10, v6, v7 +; CGP-NEXT: v_mul_lo_u32 v11, v5, v8 +; CGP-NEXT: v_mul_hi_u32 v13, v5, v7 +; CGP-NEXT: v_mul_lo_u32 v12, v5, v7 +; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v11 +; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v13 +; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v12 +; CGP-NEXT: v_subb_u32_e64 v11, s[4:5], v3, v10, vcc +; CGP-NEXT: v_sub_i32_e64 v3, s[4:5], v3, v10 +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v11, v6 ; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v6, vcc -; CGP-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v5 -; CGP-NEXT: v_sub_i32_e32 v9, vcc, v9, v5 +; CGP-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5] +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v5 +; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v5 ; CGP-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc -; CGP-NEXT: v_cndmask_b32_e64 v14, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v13, v6 -; CGP-NEXT: v_add_i32_e32 v13, vcc, 1, v7 -; CGP-NEXT: v_cndmask_b32_e64 v12, v12, v14, s[4:5] -; CGP-NEXT: v_addc_u32_e32 v14, vcc, 0, v11, vcc +; CGP-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[4:5] +; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v11, v6 +; CGP-NEXT: v_add_i32_e32 v11, vcc, 1, v7 +; CGP-NEXT: v_cndmask_b32_e64 v10, v10, v12, s[4:5] +; CGP-NEXT: v_addc_u32_e32 v12, vcc, 0, v8, vcc ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v3, v6 -; CGP-NEXT: v_cndmask_b32_e64 v15, 0, -1, vcc -; CGP-NEXT: 
v_cmp_ge_u32_e32 vcc, v9, v5 -; CGP-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc +; CGP-NEXT: v_cndmask_b32_e64 v13, 0, -1, vcc +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v2, v5 +; CGP-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc ; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 -; CGP-NEXT: v_cndmask_b32_e32 v3, v15, v5, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, 1, v13 -; CGP-NEXT: v_addc_u32_e32 v6, vcc, 0, v14, vcc -; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 -; CGP-NEXT: v_cndmask_b32_e32 v3, v13, v5, vcc -; CGP-NEXT: v_cndmask_b32_e32 v5, v14, v6, vcc -; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 -; CGP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc -; CGP-NEXT: v_xor_b32_e32 v6, v10, v4 -; CGP-NEXT: v_cndmask_b32_e32 v5, v11, v5, vcc -; CGP-NEXT: v_xor_b32_e32 v3, v3, v6 -; CGP-NEXT: v_xor_b32_e32 v5, v5, v6 -; CGP-NEXT: v_sub_i32_e32 v4, vcc, v3, v6 -; CGP-NEXT: v_subb_u32_e32 v5, vcc, v5, v6, vcc +; CGP-NEXT: v_cndmask_b32_e32 v2, v13, v2, vcc +; CGP-NEXT: v_add_i32_e32 v3, vcc, 1, v11 +; CGP-NEXT: v_addc_u32_e32 v5, vcc, 0, v12, vcc +; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; CGP-NEXT: v_cndmask_b32_e32 v2, v11, v3, vcc +; CGP-NEXT: v_cndmask_b32_e32 v3, v12, v5, vcc +; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; CGP-NEXT: v_cndmask_b32_e32 v2, v7, v2, vcc +; CGP-NEXT: v_xor_b32_e32 v5, v9, v4 +; CGP-NEXT: v_cndmask_b32_e32 v3, v8, v3, vcc +; CGP-NEXT: v_xor_b32_e32 v2, v2, v5 +; CGP-NEXT: v_sub_i32_e32 v4, vcc, v2, v5 +; CGP-NEXT: v_xor_b32_e32 v3, v3, v5 +; CGP-NEXT: v_subb_u32_e32 v5, vcc, v3, v5, vcc +; CGP-NEXT: ; implicit-def: $vgpr2 +; CGP-NEXT: ; implicit-def: $vgpr8_vgpr9 ; CGP-NEXT: BB8_6: ; %Flow ; CGP-NEXT: s_or_saveexec_b64 s[6:7], s[6:7] ; CGP-NEXT: s_xor_b64 exec, exec, s[6:7] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll @@ -17,145 +17,147 @@ ; CHECK-NEXT: s_cbranch_execz BB0_2 ; CHECK-NEXT: ; %bb.1: ; CHECK-NEXT: 
v_ashrrev_i32_e32 v4, 31, v3 -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v2, v4 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; CHECK-NEXT: v_addc_u32_e32 v3, vcc, v3, v4, vcc ; CHECK-NEXT: v_xor_b32_e32 v3, v3, v4 -; CHECK-NEXT: v_xor_b32_e32 v5, v5, v4 -; CHECK-NEXT: v_cvt_f32_u32_e32 v4, v5 -; CHECK-NEXT: v_cvt_f32_u32_e32 v6, v3 -; CHECK-NEXT: v_ashrrev_i32_e32 v7, 31, v1 -; CHECK-NEXT: v_mac_f32_e32 v4, 0x4f800000, v6 +; CHECK-NEXT: v_xor_b32_e32 v2, v2, v4 +; CHECK-NEXT: v_cvt_f32_u32_e32 v4, v2 +; CHECK-NEXT: v_cvt_f32_u32_e32 v5, v3 +; CHECK-NEXT: v_ashrrev_i32_e32 v6, 31, v1 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v6 +; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v1, v6, vcc +; CHECK-NEXT: v_mac_f32_e32 v4, 0x4f800000, v5 ; CHECK-NEXT: v_rcp_iflag_f32_e32 v4, v4 -; CHECK-NEXT: v_add_i32_e32 v6, vcc, v0, v7 -; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v1, v7, vcc -; CHECK-NEXT: v_sub_i32_e32 v9, vcc, 0, v5 +; CHECK-NEXT: v_sub_i32_e32 v7, vcc, 0, v2 +; CHECK-NEXT: v_subb_u32_e32 v8, vcc, 0, v3, vcc +; CHECK-NEXT: v_xor_b32_e32 v0, v0, v6 ; CHECK-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 -; CHECK-NEXT: v_mul_f32_e32 v8, 0x2f800000, v4 -; CHECK-NEXT: v_trunc_f32_e32 v8, v8 -; CHECK-NEXT: v_mac_f32_e32 v4, 0xcf800000, v8 +; CHECK-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4 +; CHECK-NEXT: v_trunc_f32_e32 v5, v5 +; CHECK-NEXT: v_mac_f32_e32 v4, 0xcf800000, v5 ; CHECK-NEXT: v_cvt_u32_f32_e32 v4, v4 -; CHECK-NEXT: v_cvt_u32_f32_e32 v8, v8 -; CHECK-NEXT: v_subb_u32_e32 v10, vcc, 0, v3, vcc -; CHECK-NEXT: v_xor_b32_e32 v6, v6, v7 -; CHECK-NEXT: v_mul_lo_u32 v11, v10, v4 -; CHECK-NEXT: v_mul_lo_u32 v12, v9, v8 -; CHECK-NEXT: v_mul_hi_u32 v14, v9, v4 -; CHECK-NEXT: v_mul_lo_u32 v13, v9, v4 -; CHECK-NEXT: v_xor_b32_e32 v1, v1, v7 +; CHECK-NEXT: v_cvt_u32_f32_e32 v5, v5 +; CHECK-NEXT: v_xor_b32_e32 v1, v1, v6 +; CHECK-NEXT: v_mul_lo_u32 v9, v8, v4 +; CHECK-NEXT: v_mul_lo_u32 v10, v7, v5 +; CHECK-NEXT: v_mul_hi_u32 v12, v7, v4 +; CHECK-NEXT: v_mul_lo_u32 v11, v7, v4 +; CHECK-NEXT: v_add_i32_e32 
v9, vcc, v9, v10 +; CHECK-NEXT: v_add_i32_e32 v9, vcc, v9, v12 +; CHECK-NEXT: v_mul_lo_u32 v10, v5, v11 +; CHECK-NEXT: v_mul_lo_u32 v12, v4, v9 +; CHECK-NEXT: v_mul_hi_u32 v13, v4, v11 +; CHECK-NEXT: v_mul_hi_u32 v11, v5, v11 +; CHECK-NEXT: v_add_i32_e32 v10, vcc, v10, v12 +; CHECK-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v10, vcc, v10, v13 +; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; CHECK-NEXT: v_mul_lo_u32 v13, v5, v9 +; CHECK-NEXT: v_add_i32_e32 v10, vcc, v12, v10 +; CHECK-NEXT: v_mul_hi_u32 v12, v4, v9 +; CHECK-NEXT: v_mul_hi_u32 v9, v5, v9 +; CHECK-NEXT: v_add_i32_e32 v11, vcc, v13, v11 +; CHECK-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v11, vcc, v11, v12 -; CHECK-NEXT: v_add_i32_e32 v11, vcc, v11, v14 -; CHECK-NEXT: v_mul_lo_u32 v12, v8, v13 -; CHECK-NEXT: v_mul_lo_u32 v14, v4, v11 -; CHECK-NEXT: v_mul_hi_u32 v15, v4, v13 -; CHECK-NEXT: v_mul_hi_u32 v13, v8, v13 -; CHECK-NEXT: v_add_i32_e32 v12, vcc, v12, v14 -; CHECK-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v12, vcc, v12, v15 ; CHECK-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; CHECK-NEXT: v_mul_lo_u32 v15, v8, v11 -; CHECK-NEXT: v_add_i32_e32 v12, vcc, v14, v12 -; CHECK-NEXT: v_mul_hi_u32 v14, v4, v11 -; CHECK-NEXT: v_mul_hi_u32 v11, v8, v11 -; CHECK-NEXT: v_add_i32_e32 v13, vcc, v15, v13 -; CHECK-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v13, vcc, v13, v14 -; CHECK-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v14, vcc, v15, v14 ; CHECK-NEXT: v_add_i32_e32 v12, vcc, v13, v12 -; CHECK-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; CHECK-NEXT: v_add_i32_e32 v11, vcc, v11, v13 -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v12 -; CHECK-NEXT: v_addc_u32_e64 v12, s[4:5], v8, v11, vcc -; CHECK-NEXT: v_mul_lo_u32 v10, v10, v4 -; CHECK-NEXT: v_mul_lo_u32 v13, v9, v12 -; CHECK-NEXT: v_mul_lo_u32 v14, v9, v4 -; CHECK-NEXT: v_mul_hi_u32 v9, v9, v4 +; 
CHECK-NEXT: v_add_i32_e32 v10, vcc, v11, v10 +; CHECK-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v11, vcc, v12, v11 +; CHECK-NEXT: v_add_i32_e32 v9, vcc, v9, v11 +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v10 +; CHECK-NEXT: v_addc_u32_e64 v10, s[4:5], v5, v9, vcc +; CHECK-NEXT: v_mul_lo_u32 v8, v8, v4 +; CHECK-NEXT: v_mul_lo_u32 v11, v7, v10 +; CHECK-NEXT: v_mul_lo_u32 v12, v7, v4 +; CHECK-NEXT: v_mul_hi_u32 v7, v7, v4 +; CHECK-NEXT: v_add_i32_e64 v5, s[4:5], v5, v9 +; CHECK-NEXT: v_add_i32_e64 v8, s[4:5], v8, v11 +; CHECK-NEXT: v_mul_hi_u32 v9, v4, v12 +; CHECK-NEXT: v_add_i32_e64 v7, s[4:5], v8, v7 +; CHECK-NEXT: v_mul_lo_u32 v8, v10, v12 +; CHECK-NEXT: v_mul_lo_u32 v11, v4, v7 +; CHECK-NEXT: v_mul_hi_u32 v12, v10, v12 ; CHECK-NEXT: v_add_i32_e64 v8, s[4:5], v8, v11 -; CHECK-NEXT: v_add_i32_e64 v10, s[4:5], v10, v13 -; CHECK-NEXT: v_mul_hi_u32 v11, v4, v14 -; CHECK-NEXT: v_add_i32_e64 v9, s[4:5], v10, v9 -; CHECK-NEXT: v_mul_lo_u32 v10, v12, v14 -; CHECK-NEXT: v_mul_lo_u32 v13, v4, v9 -; CHECK-NEXT: v_mul_hi_u32 v14, v12, v14 -; CHECK-NEXT: v_add_i32_e64 v10, s[4:5], v10, v13 -; CHECK-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5] -; CHECK-NEXT: v_add_i32_e64 v10, s[4:5], v10, v11 -; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[4:5] -; CHECK-NEXT: v_mul_lo_u32 v11, v12, v9 -; CHECK-NEXT: v_add_i32_e64 v10, s[4:5], v13, v10 -; CHECK-NEXT: v_mul_hi_u32 v13, v4, v9 -; CHECK-NEXT: v_mul_hi_u32 v9, v12, v9 -; CHECK-NEXT: v_add_i32_e64 v11, s[4:5], v11, v14 -; CHECK-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5] -; CHECK-NEXT: v_add_i32_e64 v11, s[4:5], v11, v13 -; CHECK-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5] -; CHECK-NEXT: v_add_i32_e64 v13, s[4:5], v14, v13 -; CHECK-NEXT: v_add_i32_e64 v10, s[4:5], v11, v10 ; CHECK-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[4:5] -; CHECK-NEXT: v_add_i32_e64 v11, s[4:5], v13, v11 +; CHECK-NEXT: v_add_i32_e64 v8, s[4:5], v8, v9 +; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[4:5] +; CHECK-NEXT: v_mul_lo_u32 v9, v10, v7 +; 
CHECK-NEXT: v_add_i32_e64 v8, s[4:5], v11, v8 +; CHECK-NEXT: v_mul_hi_u32 v11, v4, v7 +; CHECK-NEXT: v_mul_hi_u32 v7, v10, v7 +; CHECK-NEXT: v_add_i32_e64 v9, s[4:5], v9, v12 +; CHECK-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] ; CHECK-NEXT: v_add_i32_e64 v9, s[4:5], v9, v11 -; CHECK-NEXT: v_addc_u32_e32 v8, vcc, v8, v9, vcc -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v10 -; CHECK-NEXT: v_addc_u32_e32 v8, vcc, 0, v8, vcc -; CHECK-NEXT: v_mul_lo_u32 v9, v1, v4 -; CHECK-NEXT: v_mul_lo_u32 v10, v6, v8 -; CHECK-NEXT: v_mul_hi_u32 v11, v6, v4 +; CHECK-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[4:5] +; CHECK-NEXT: v_add_i32_e64 v11, s[4:5], v12, v11 +; CHECK-NEXT: v_add_i32_e64 v8, s[4:5], v9, v8 +; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[4:5] +; CHECK-NEXT: v_add_i32_e64 v9, s[4:5], v11, v9 +; CHECK-NEXT: v_add_i32_e64 v7, s[4:5], v7, v9 +; CHECK-NEXT: v_addc_u32_e32 v5, vcc, v5, v7, vcc +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v8 +; CHECK-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; CHECK-NEXT: v_mul_lo_u32 v7, v1, v4 +; CHECK-NEXT: v_mul_lo_u32 v8, v0, v5 +; CHECK-NEXT: v_mul_hi_u32 v9, v0, v4 ; CHECK-NEXT: v_mul_hi_u32 v4, v1, v4 -; CHECK-NEXT: v_add_i32_e32 v9, vcc, v9, v10 -; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v9, vcc, v9, v11 -; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; CHECK-NEXT: v_mul_lo_u32 v11, v1, v8 -; CHECK-NEXT: v_add_i32_e32 v9, vcc, v10, v9 -; CHECK-NEXT: v_mul_hi_u32 v10, v6, v8 -; CHECK-NEXT: v_mul_hi_u32 v8, v1, v8 -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v11, v4 -; CHECK-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v10 -; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v9 +; CHECK-NEXT: v_add_i32_e32 v7, vcc, v7, v8 +; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v7, vcc, v7, v9 +; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; CHECK-NEXT: v_mul_lo_u32 v9, v1, v5 +; 
CHECK-NEXT: v_add_i32_e32 v7, vcc, v8, v7 +; CHECK-NEXT: v_mul_hi_u32 v8, v0, v5 +; CHECK-NEXT: v_mul_hi_u32 v5, v1, v5 +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v9, v4 ; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v9, vcc, v10, v9 -; CHECK-NEXT: v_add_i32_e32 v8, vcc, v8, v9 -; CHECK-NEXT: v_mul_lo_u32 v9, v3, v4 -; CHECK-NEXT: v_mul_lo_u32 v8, v5, v8 -; CHECK-NEXT: v_mul_lo_u32 v10, v5, v4 -; CHECK-NEXT: v_mul_hi_u32 v4, v5, v4 +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v8 +; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v8, vcc, v9, v8 -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v8, v4 -; CHECK-NEXT: v_sub_i32_e32 v6, vcc, v6, v10 -; CHECK-NEXT: v_subb_u32_e64 v8, s[4:5], v1, v4, vcc +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v7 +; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v7, vcc, v8, v7 +; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v7 +; CHECK-NEXT: v_mul_lo_u32 v7, v3, v4 +; CHECK-NEXT: v_mul_lo_u32 v5, v2, v5 +; CHECK-NEXT: v_mul_lo_u32 v8, v2, v4 +; CHECK-NEXT: v_mul_hi_u32 v4, v2, v4 +; CHECK-NEXT: v_add_i32_e32 v5, vcc, v7, v5 +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v5, v4 +; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v8 +; CHECK-NEXT: v_subb_u32_e64 v5, s[4:5], v1, v4, vcc ; CHECK-NEXT: v_sub_i32_e64 v1, s[4:5], v1, v4 -; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v3 +; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v5, v3 ; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[4:5] -; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v6, v5 -; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5] -; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], v8, v3 +; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v2 +; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5] +; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], v5, v3 ; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc -; CHECK-NEXT: v_cndmask_b32_e64 v4, v4, v9, s[4:5] -; CHECK-NEXT: v_sub_i32_e32 v9, vcc, v6, v5 -; CHECK-NEXT: v_subbrev_u32_e64 v10, s[4:5], 0, v1, vcc -; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v10, v3 
-; CHECK-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5] -; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v5 +; CHECK-NEXT: v_cndmask_b32_e64 v4, v4, v7, s[4:5] +; CHECK-NEXT: v_sub_i32_e32 v7, vcc, v0, v2 +; CHECK-NEXT: v_subbrev_u32_e64 v8, s[4:5], 0, v1, vcc +; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v3 +; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5] +; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v7, v2 ; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc -; CHECK-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[4:5] -; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], v10, v3 -; CHECK-NEXT: v_sub_i32_e32 v3, vcc, v9, v5 -; CHECK-NEXT: v_cndmask_b32_e64 v11, v11, v12, s[4:5] +; CHECK-NEXT: v_sub_i32_e32 v2, vcc, v7, v2 +; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5] +; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], v8, v3 +; CHECK-NEXT: v_cndmask_b32_e64 v9, v9, v10, s[4:5] ; CHECK-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc -; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 -; CHECK-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc -; CHECK-NEXT: v_cndmask_b32_e32 v1, v10, v1, vcc -; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; CHECK-NEXT: v_cndmask_b32_e32 v3, v6, v3, vcc +; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9 +; CHECK-NEXT: v_cndmask_b32_e32 v2, v7, v2, vcc ; CHECK-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc -; CHECK-NEXT: v_xor_b32_e32 v3, v3, v7 -; CHECK-NEXT: v_xor_b32_e32 v1, v1, v7 -; CHECK-NEXT: v_sub_i32_e32 v4, vcc, v3, v7 -; CHECK-NEXT: v_subb_u32_e32 v5, vcc, v1, v7, vcc +; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; CHECK-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; CHECK-NEXT: v_xor_b32_e32 v0, v0, v6 +; CHECK-NEXT: v_sub_i32_e32 v4, vcc, v0, v6 +; CHECK-NEXT: v_xor_b32_e32 v1, v1, v6 +; CHECK-NEXT: v_subb_u32_e32 v5, vcc, v1, v6, vcc +; CHECK-NEXT: ; implicit-def: $vgpr0 +; CHECK-NEXT: ; implicit-def: $vgpr2 ; CHECK-NEXT: BB0_2: ; %Flow ; CHECK-NEXT: s_or_saveexec_b64 s[4:5], s[6:7] ; CHECK-NEXT: s_xor_b64 exec, exec, s[4:5] @@ -690,144 +692,146 @@ ; CGP-NEXT: ; 
%bb.1: ; CGP-NEXT: v_ashrrev_i32_e32 v0, 31, v5 ; CGP-NEXT: v_add_i32_e32 v1, vcc, v4, v0 -; CGP-NEXT: v_addc_u32_e32 v5, vcc, v5, v0, vcc +; CGP-NEXT: v_addc_u32_e32 v4, vcc, v5, v0, vcc ; CGP-NEXT: v_xor_b32_e32 v1, v1, v0 -; CGP-NEXT: v_xor_b32_e32 v0, v5, v0 -; CGP-NEXT: v_cvt_f32_u32_e32 v5, v1 -; CGP-NEXT: v_cvt_f32_u32_e32 v10, v0 -; CGP-NEXT: v_ashrrev_i32_e32 v11, 31, v9 -; CGP-NEXT: v_mac_f32_e32 v5, 0x4f800000, v10 -; CGP-NEXT: v_rcp_iflag_f32_e32 v5, v5 -; CGP-NEXT: v_add_i32_e32 v10, vcc, v8, v11 -; CGP-NEXT: v_addc_u32_e32 v9, vcc, v9, v11, vcc -; CGP-NEXT: v_sub_i32_e32 v13, vcc, 0, v1 -; CGP-NEXT: v_mul_f32_e32 v5, 0x5f7ffffc, v5 -; CGP-NEXT: v_mul_f32_e32 v12, 0x2f800000, v5 -; CGP-NEXT: v_trunc_f32_e32 v12, v12 -; CGP-NEXT: v_mac_f32_e32 v5, 0xcf800000, v12 -; CGP-NEXT: v_cvt_u32_f32_e32 v5, v5 -; CGP-NEXT: v_cvt_u32_f32_e32 v12, v12 -; CGP-NEXT: v_subb_u32_e32 v14, vcc, 0, v0, vcc -; CGP-NEXT: v_xor_b32_e32 v10, v10, v11 -; CGP-NEXT: v_mul_lo_u32 v15, v14, v5 -; CGP-NEXT: v_mul_lo_u32 v16, v13, v12 -; CGP-NEXT: v_mul_hi_u32 v18, v13, v5 -; CGP-NEXT: v_mul_lo_u32 v17, v13, v5 -; CGP-NEXT: v_xor_b32_e32 v9, v9, v11 +; CGP-NEXT: v_xor_b32_e32 v0, v4, v0 +; CGP-NEXT: v_cvt_f32_u32_e32 v4, v1 +; CGP-NEXT: v_cvt_f32_u32_e32 v5, v0 +; CGP-NEXT: v_ashrrev_i32_e32 v10, 31, v9 +; CGP-NEXT: v_mac_f32_e32 v4, 0x4f800000, v5 +; CGP-NEXT: v_rcp_iflag_f32_e32 v4, v4 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v8, v10 +; CGP-NEXT: v_addc_u32_e32 v8, vcc, v9, v10, vcc +; CGP-NEXT: v_sub_i32_e32 v11, vcc, 0, v1 +; CGP-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 +; CGP-NEXT: v_mul_f32_e32 v9, 0x2f800000, v4 +; CGP-NEXT: v_trunc_f32_e32 v9, v9 +; CGP-NEXT: v_mac_f32_e32 v4, 0xcf800000, v9 +; CGP-NEXT: v_cvt_u32_f32_e32 v4, v4 +; CGP-NEXT: v_cvt_u32_f32_e32 v9, v9 +; CGP-NEXT: v_subb_u32_e32 v12, vcc, 0, v0, vcc +; CGP-NEXT: v_xor_b32_e32 v5, v5, v10 +; CGP-NEXT: v_mul_lo_u32 v13, v12, v4 +; CGP-NEXT: v_mul_lo_u32 v14, v11, v9 +; CGP-NEXT: v_mul_hi_u32 v16, v11, v4 +; CGP-NEXT: 
v_mul_lo_u32 v15, v11, v4 +; CGP-NEXT: v_xor_b32_e32 v8, v8, v10 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v14 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v16 +; CGP-NEXT: v_mul_lo_u32 v14, v9, v15 +; CGP-NEXT: v_mul_lo_u32 v16, v4, v13 +; CGP-NEXT: v_mul_hi_u32 v17, v4, v15 +; CGP-NEXT: v_mul_hi_u32 v15, v9, v15 +; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v16 +; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v17 +; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; CGP-NEXT: v_mul_lo_u32 v17, v9, v13 +; CGP-NEXT: v_add_i32_e32 v14, vcc, v16, v14 +; CGP-NEXT: v_mul_hi_u32 v16, v4, v13 +; CGP-NEXT: v_mul_hi_u32 v13, v9, v13 +; CGP-NEXT: v_add_i32_e32 v15, vcc, v17, v15 +; CGP-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v15, vcc, v15, v16 -; CGP-NEXT: v_add_i32_e32 v15, vcc, v15, v18 -; CGP-NEXT: v_mul_lo_u32 v16, v12, v17 -; CGP-NEXT: v_mul_lo_u32 v18, v5, v15 -; CGP-NEXT: v_mul_hi_u32 v19, v5, v17 -; CGP-NEXT: v_mul_hi_u32 v17, v12, v17 -; CGP-NEXT: v_add_i32_e32 v16, vcc, v16, v18 -; CGP-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v16, vcc, v16, v19 ; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v19, v12, v15 -; CGP-NEXT: v_add_i32_e32 v16, vcc, v18, v16 -; CGP-NEXT: v_mul_hi_u32 v18, v5, v15 -; CGP-NEXT: v_mul_hi_u32 v15, v12, v15 -; CGP-NEXT: v_add_i32_e32 v17, vcc, v19, v17 -; CGP-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v17, vcc, v17, v18 -; CGP-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v18, vcc, v19, v18 ; CGP-NEXT: v_add_i32_e32 v16, vcc, v17, v16 -; CGP-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v17, vcc, v18, v17 -; CGP-NEXT: v_add_i32_e32 v15, vcc, v15, v17 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v16 -; CGP-NEXT: v_addc_u32_e64 v16, s[4:5], v12, v15, vcc -; CGP-NEXT: v_mul_lo_u32 v14, v14, v5 -; CGP-NEXT: v_mul_lo_u32 v17, v13, v16 -; CGP-NEXT: v_mul_lo_u32 v18, v13, v5 -; CGP-NEXT: 
v_mul_hi_u32 v13, v13, v5 +; CGP-NEXT: v_add_i32_e32 v14, vcc, v15, v14 +; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v15, vcc, v16, v15 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v15 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v14 +; CGP-NEXT: v_addc_u32_e64 v14, s[4:5], v9, v13, vcc +; CGP-NEXT: v_mul_lo_u32 v12, v12, v4 +; CGP-NEXT: v_mul_lo_u32 v15, v11, v14 +; CGP-NEXT: v_mul_lo_u32 v16, v11, v4 +; CGP-NEXT: v_mul_hi_u32 v11, v11, v4 +; CGP-NEXT: v_add_i32_e64 v9, s[4:5], v9, v13 +; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v12, v15 +; CGP-NEXT: v_mul_hi_u32 v13, v4, v16 +; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v12, v11 +; CGP-NEXT: v_mul_lo_u32 v12, v14, v16 +; CGP-NEXT: v_mul_lo_u32 v15, v4, v11 +; CGP-NEXT: v_mul_hi_u32 v16, v14, v16 ; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v12, v15 -; CGP-NEXT: v_add_i32_e64 v14, s[4:5], v14, v17 -; CGP-NEXT: v_mul_hi_u32 v15, v5, v18 -; CGP-NEXT: v_add_i32_e64 v13, s[4:5], v14, v13 -; CGP-NEXT: v_mul_lo_u32 v14, v16, v18 -; CGP-NEXT: v_mul_lo_u32 v17, v5, v13 -; CGP-NEXT: v_mul_hi_u32 v18, v16, v18 -; CGP-NEXT: v_add_i32_e64 v14, s[4:5], v14, v17 -; CGP-NEXT: v_cndmask_b32_e64 v17, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v14, s[4:5], v14, v15 -; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5] -; CGP-NEXT: v_mul_lo_u32 v15, v16, v13 -; CGP-NEXT: v_add_i32_e64 v14, s[4:5], v17, v14 -; CGP-NEXT: v_mul_hi_u32 v17, v5, v13 -; CGP-NEXT: v_mul_hi_u32 v13, v16, v13 -; CGP-NEXT: v_add_i32_e64 v15, s[4:5], v15, v18 -; CGP-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v15, s[4:5], v15, v17 -; CGP-NEXT: v_cndmask_b32_e64 v17, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v17, s[4:5], v18, v17 -; CGP-NEXT: v_add_i32_e64 v14, s[4:5], v15, v14 ; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v15, s[4:5], v17, v15 +; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v12, v13 +; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] +; CGP-NEXT: v_mul_lo_u32 v13, v14, v11 +; CGP-NEXT: v_add_i32_e64 v12, 
s[4:5], v15, v12 +; CGP-NEXT: v_mul_hi_u32 v15, v4, v11 +; CGP-NEXT: v_mul_hi_u32 v11, v14, v11 +; CGP-NEXT: v_add_i32_e64 v13, s[4:5], v13, v16 +; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[4:5] ; CGP-NEXT: v_add_i32_e64 v13, s[4:5], v13, v15 -; CGP-NEXT: v_addc_u32_e32 v12, vcc, v12, v13, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v14 -; CGP-NEXT: v_addc_u32_e32 v12, vcc, 0, v12, vcc -; CGP-NEXT: v_mul_lo_u32 v13, v9, v5 -; CGP-NEXT: v_mul_lo_u32 v14, v10, v12 -; CGP-NEXT: v_mul_hi_u32 v15, v10, v5 -; CGP-NEXT: v_mul_hi_u32 v5, v9, v5 -; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v14 -; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v15 -; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v15, v9, v12 -; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; CGP-NEXT: v_mul_hi_u32 v14, v10, v12 -; CGP-NEXT: v_mul_hi_u32 v12, v9, v12 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v15, v5 -; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v14 -; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v14, vcc, v15, v14 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v13 +; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[4:5] +; CGP-NEXT: v_add_i32_e64 v15, s[4:5], v16, v15 +; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v13, v12 +; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5] +; CGP-NEXT: v_add_i32_e64 v13, s[4:5], v15, v13 +; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v11, v13 +; CGP-NEXT: v_addc_u32_e32 v9, vcc, v9, v11, vcc +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v12 +; CGP-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; CGP-NEXT: v_mul_lo_u32 v11, v8, v4 +; CGP-NEXT: v_mul_lo_u32 v12, v5, v9 +; CGP-NEXT: v_mul_hi_u32 v13, v5, v4 +; CGP-NEXT: v_mul_hi_u32 v4, v8, v4 +; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v12 +; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v13 +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; CGP-NEXT: v_mul_lo_u32 v13, v8, v9 +; CGP-NEXT: v_add_i32_e32 
v11, vcc, v12, v11 +; CGP-NEXT: v_mul_hi_u32 v12, v5, v9 +; CGP-NEXT: v_mul_hi_u32 v9, v8, v9 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v13, v4 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v13 -; CGP-NEXT: v_mul_lo_u32 v13, v0, v5 -; CGP-NEXT: v_mul_lo_u32 v12, v1, v12 -; CGP-NEXT: v_mul_lo_u32 v14, v1, v5 -; CGP-NEXT: v_mul_hi_u32 v5, v1, v5 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v12 +; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v12, vcc, v13, v12 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v12, v5 -; CGP-NEXT: v_sub_i32_e32 v10, vcc, v10, v14 -; CGP-NEXT: v_subb_u32_e64 v12, s[4:5], v9, v5, vcc -; CGP-NEXT: v_sub_i32_e64 v5, s[4:5], v9, v5 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v11 +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v11, vcc, v12, v11 +; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v11 +; CGP-NEXT: v_mul_lo_u32 v11, v0, v4 +; CGP-NEXT: v_mul_lo_u32 v9, v1, v9 +; CGP-NEXT: v_mul_lo_u32 v12, v1, v4 +; CGP-NEXT: v_mul_hi_u32 v4, v1, v4 +; CGP-NEXT: v_add_i32_e32 v9, vcc, v11, v9 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v9, v4 +; CGP-NEXT: v_sub_i32_e32 v5, vcc, v5, v12 +; CGP-NEXT: v_subb_u32_e64 v9, s[4:5], v8, v4, vcc +; CGP-NEXT: v_sub_i32_e64 v4, s[4:5], v8, v4 +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v0 +; CGP-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5] +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v5, v1 +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5] +; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v9, v0 +; CGP-NEXT: v_subb_u32_e32 v4, vcc, v4, v0, vcc +; CGP-NEXT: v_cndmask_b32_e64 v8, v8, v11, s[4:5] +; CGP-NEXT: v_sub_i32_e32 v11, vcc, v5, v1 +; CGP-NEXT: v_subbrev_u32_e64 v12, s[4:5], 0, v4, vcc ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v12, v0 -; CGP-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v10, v1 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[4:5] +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v11, v1 +; CGP-NEXT: 
v_cndmask_b32_e64 v14, 0, -1, s[4:5] ; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v12, v0 -; CGP-NEXT: v_subb_u32_e32 v5, vcc, v5, v0, vcc -; CGP-NEXT: v_cndmask_b32_e64 v9, v9, v13, s[4:5] -; CGP-NEXT: v_sub_i32_e32 v13, vcc, v10, v1 -; CGP-NEXT: v_subbrev_u32_e64 v14, s[4:5], 0, v5, vcc -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v14, v0 -; CGP-NEXT: v_cndmask_b32_e64 v15, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v13, v1 -; CGP-NEXT: v_cndmask_b32_e64 v16, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v14, v0 -; CGP-NEXT: v_subb_u32_e32 v0, vcc, v5, v0, vcc -; CGP-NEXT: v_sub_i32_e32 v1, vcc, v13, v1 -; CGP-NEXT: v_cndmask_b32_e64 v15, v15, v16, s[4:5] +; CGP-NEXT: v_subb_u32_e32 v0, vcc, v4, v0, vcc +; CGP-NEXT: v_sub_i32_e32 v1, vcc, v11, v1 +; CGP-NEXT: v_cndmask_b32_e64 v13, v13, v14, s[4:5] ; CGP-NEXT: v_subbrev_u32_e32 v0, vcc, 0, v0, vcc -; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v15 -; CGP-NEXT: v_cndmask_b32_e32 v1, v13, v1, vcc -; CGP-NEXT: v_cndmask_b32_e32 v0, v14, v0, vcc -; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9 -; CGP-NEXT: v_cndmask_b32_e32 v1, v10, v1, vcc +; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; CGP-NEXT: v_cndmask_b32_e32 v1, v11, v1, vcc ; CGP-NEXT: v_cndmask_b32_e32 v0, v12, v0, vcc -; CGP-NEXT: v_xor_b32_e32 v1, v1, v11 -; CGP-NEXT: v_xor_b32_e32 v5, v0, v11 -; CGP-NEXT: v_sub_i32_e32 v0, vcc, v1, v11 -; CGP-NEXT: v_subb_u32_e32 v1, vcc, v5, v11, vcc +; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; CGP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; CGP-NEXT: v_cndmask_b32_e32 v0, v9, v0, vcc +; CGP-NEXT: v_xor_b32_e32 v1, v1, v10 +; CGP-NEXT: v_xor_b32_e32 v4, v0, v10 +; CGP-NEXT: v_sub_i32_e32 v0, vcc, v1, v10 +; CGP-NEXT: v_subb_u32_e32 v1, vcc, v4, v10, vcc +; CGP-NEXT: ; implicit-def: $vgpr8 +; CGP-NEXT: ; implicit-def: $vgpr4 ; CGP-NEXT: BB2_2: ; %Flow2 ; CGP-NEXT: s_or_saveexec_b64 s[4:5], s[6:7] ; CGP-NEXT: s_xor_b64 exec, exec, s[4:5] @@ -863,144 +867,146 @@ ; CGP-NEXT: ; %bb.5: ; CGP-NEXT: v_ashrrev_i32_e32 v4, 31, v7 ; CGP-NEXT: 
v_add_i32_e32 v5, vcc, v6, v4 -; CGP-NEXT: v_addc_u32_e32 v7, vcc, v7, v4, vcc +; CGP-NEXT: v_addc_u32_e32 v6, vcc, v7, v4, vcc ; CGP-NEXT: v_xor_b32_e32 v5, v5, v4 -; CGP-NEXT: v_xor_b32_e32 v4, v7, v4 -; CGP-NEXT: v_cvt_f32_u32_e32 v7, v5 -; CGP-NEXT: v_cvt_f32_u32_e32 v8, v4 -; CGP-NEXT: v_ashrrev_i32_e32 v9, 31, v3 -; CGP-NEXT: v_mac_f32_e32 v7, 0x4f800000, v8 -; CGP-NEXT: v_rcp_iflag_f32_e32 v7, v7 -; CGP-NEXT: v_add_i32_e32 v8, vcc, v2, v9 -; CGP-NEXT: v_addc_u32_e32 v3, vcc, v3, v9, vcc -; CGP-NEXT: v_sub_i32_e32 v11, vcc, 0, v5 -; CGP-NEXT: v_mul_f32_e32 v7, 0x5f7ffffc, v7 -; CGP-NEXT: v_mul_f32_e32 v10, 0x2f800000, v7 -; CGP-NEXT: v_trunc_f32_e32 v10, v10 -; CGP-NEXT: v_mac_f32_e32 v7, 0xcf800000, v10 +; CGP-NEXT: v_xor_b32_e32 v4, v6, v4 +; CGP-NEXT: v_cvt_f32_u32_e32 v6, v5 +; CGP-NEXT: v_cvt_f32_u32_e32 v7, v4 +; CGP-NEXT: v_ashrrev_i32_e32 v8, 31, v3 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v8 +; CGP-NEXT: v_addc_u32_e32 v3, vcc, v3, v8, vcc +; CGP-NEXT: v_mac_f32_e32 v6, 0x4f800000, v7 +; CGP-NEXT: v_rcp_iflag_f32_e32 v6, v6 +; CGP-NEXT: v_sub_i32_e32 v9, vcc, 0, v5 +; CGP-NEXT: v_subb_u32_e32 v10, vcc, 0, v4, vcc +; CGP-NEXT: v_xor_b32_e32 v2, v2, v8 +; CGP-NEXT: v_mul_f32_e32 v6, 0x5f7ffffc, v6 +; CGP-NEXT: v_mul_f32_e32 v7, 0x2f800000, v6 +; CGP-NEXT: v_trunc_f32_e32 v7, v7 +; CGP-NEXT: v_mac_f32_e32 v6, 0xcf800000, v7 +; CGP-NEXT: v_cvt_u32_f32_e32 v6, v6 ; CGP-NEXT: v_cvt_u32_f32_e32 v7, v7 -; CGP-NEXT: v_cvt_u32_f32_e32 v10, v10 -; CGP-NEXT: v_subb_u32_e32 v12, vcc, 0, v4, vcc -; CGP-NEXT: v_xor_b32_e32 v8, v8, v9 -; CGP-NEXT: v_mul_lo_u32 v13, v12, v7 -; CGP-NEXT: v_mul_lo_u32 v14, v11, v10 -; CGP-NEXT: v_mul_hi_u32 v16, v11, v7 -; CGP-NEXT: v_mul_lo_u32 v15, v11, v7 -; CGP-NEXT: v_xor_b32_e32 v3, v3, v9 +; CGP-NEXT: v_xor_b32_e32 v3, v3, v8 +; CGP-NEXT: v_mul_lo_u32 v11, v10, v6 +; CGP-NEXT: v_mul_lo_u32 v12, v9, v7 +; CGP-NEXT: v_mul_hi_u32 v14, v9, v6 +; CGP-NEXT: v_mul_lo_u32 v13, v9, v6 +; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v12 +; 
CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v14 +; CGP-NEXT: v_mul_lo_u32 v12, v7, v13 +; CGP-NEXT: v_mul_lo_u32 v14, v6, v11 +; CGP-NEXT: v_mul_hi_u32 v15, v6, v13 +; CGP-NEXT: v_mul_hi_u32 v13, v7, v13 +; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v14 +; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v15 +; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; CGP-NEXT: v_mul_lo_u32 v15, v7, v11 +; CGP-NEXT: v_add_i32_e32 v12, vcc, v14, v12 +; CGP-NEXT: v_mul_hi_u32 v14, v6, v11 +; CGP-NEXT: v_mul_hi_u32 v11, v7, v11 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v15, v13 +; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v14 -; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v16 -; CGP-NEXT: v_mul_lo_u32 v14, v10, v15 -; CGP-NEXT: v_mul_lo_u32 v16, v7, v13 -; CGP-NEXT: v_mul_hi_u32 v17, v7, v15 -; CGP-NEXT: v_mul_hi_u32 v15, v10, v15 -; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v16 -; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v17 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v17, v10, v13 -; CGP-NEXT: v_add_i32_e32 v14, vcc, v16, v14 -; CGP-NEXT: v_mul_hi_u32 v16, v7, v13 -; CGP-NEXT: v_mul_hi_u32 v13, v10, v13 -; CGP-NEXT: v_add_i32_e32 v15, vcc, v17, v15 -; CGP-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v15, vcc, v15, v16 -; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v16, vcc, v17, v16 ; CGP-NEXT: v_add_i32_e32 v14, vcc, v15, v14 -; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v15, vcc, v16, v15 -; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v15 -; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v14 -; CGP-NEXT: v_addc_u32_e64 v14, s[4:5], v10, v13, vcc -; CGP-NEXT: v_mul_lo_u32 v12, v12, v7 -; CGP-NEXT: v_mul_lo_u32 v15, v11, v14 -; CGP-NEXT: v_mul_lo_u32 v16, v11, v7 -; CGP-NEXT: v_mul_hi_u32 v11, v11, v7 +; CGP-NEXT: v_add_i32_e32 v12, vcc, v13, v12 +; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; 
CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13 +; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v13 +; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v12 +; CGP-NEXT: v_addc_u32_e64 v12, s[4:5], v7, v11, vcc +; CGP-NEXT: v_mul_lo_u32 v10, v10, v6 +; CGP-NEXT: v_mul_lo_u32 v13, v9, v12 +; CGP-NEXT: v_mul_lo_u32 v14, v9, v6 +; CGP-NEXT: v_mul_hi_u32 v9, v9, v6 +; CGP-NEXT: v_add_i32_e64 v7, s[4:5], v7, v11 +; CGP-NEXT: v_add_i32_e64 v10, s[4:5], v10, v13 +; CGP-NEXT: v_mul_hi_u32 v11, v6, v14 +; CGP-NEXT: v_add_i32_e64 v9, s[4:5], v10, v9 +; CGP-NEXT: v_mul_lo_u32 v10, v12, v14 +; CGP-NEXT: v_mul_lo_u32 v13, v6, v9 +; CGP-NEXT: v_mul_hi_u32 v14, v12, v14 ; CGP-NEXT: v_add_i32_e64 v10, s[4:5], v10, v13 -; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v12, v15 -; CGP-NEXT: v_mul_hi_u32 v13, v7, v16 -; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v12, v11 -; CGP-NEXT: v_mul_lo_u32 v12, v14, v16 -; CGP-NEXT: v_mul_lo_u32 v15, v7, v11 -; CGP-NEXT: v_mul_hi_u32 v16, v14, v16 -; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v12, v15 -; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v12, v13 -; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] -; CGP-NEXT: v_mul_lo_u32 v13, v14, v11 -; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v15, v12 -; CGP-NEXT: v_mul_hi_u32 v15, v7, v11 -; CGP-NEXT: v_mul_hi_u32 v11, v14, v11 -; CGP-NEXT: v_add_i32_e64 v13, s[4:5], v13, v16 -; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v13, s[4:5], v13, v15 -; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v15, s[4:5], v16, v15 -; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v13, v12 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v13, s[4:5], v15, v13 +; CGP-NEXT: v_add_i32_e64 v10, s[4:5], v10, v11 +; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[4:5] +; CGP-NEXT: v_mul_lo_u32 v11, v12, v9 +; CGP-NEXT: v_add_i32_e64 v10, s[4:5], v13, v10 +; CGP-NEXT: v_mul_hi_u32 v13, v6, v9 +; CGP-NEXT: v_mul_hi_u32 v9, v12, v9 +; CGP-NEXT: v_add_i32_e64 v11, 
s[4:5], v11, v14 +; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5] ; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v11, v13 -; CGP-NEXT: v_addc_u32_e32 v10, vcc, v10, v11, vcc -; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v12 -; CGP-NEXT: v_addc_u32_e32 v10, vcc, 0, v10, vcc +; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5] +; CGP-NEXT: v_add_i32_e64 v13, s[4:5], v14, v13 +; CGP-NEXT: v_add_i32_e64 v10, s[4:5], v11, v10 +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[4:5] +; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v13, v11 +; CGP-NEXT: v_add_i32_e64 v9, s[4:5], v9, v11 +; CGP-NEXT: v_addc_u32_e32 v7, vcc, v7, v9, vcc +; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v10 +; CGP-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; CGP-NEXT: v_mul_lo_u32 v9, v3, v6 +; CGP-NEXT: v_mul_lo_u32 v10, v2, v7 +; CGP-NEXT: v_mul_hi_u32 v11, v2, v6 +; CGP-NEXT: v_mul_hi_u32 v6, v3, v6 +; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v10 +; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v11 +; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; CGP-NEXT: v_mul_lo_u32 v11, v3, v7 -; CGP-NEXT: v_mul_lo_u32 v12, v8, v10 -; CGP-NEXT: v_mul_hi_u32 v13, v8, v7 +; CGP-NEXT: v_add_i32_e32 v9, vcc, v10, v9 +; CGP-NEXT: v_mul_hi_u32 v10, v2, v7 ; CGP-NEXT: v_mul_hi_u32 v7, v3, v7 -; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v12 -; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v13 -; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v13, v3, v10 -; CGP-NEXT: v_add_i32_e32 v11, vcc, v12, v11 -; CGP-NEXT: v_mul_hi_u32 v12, v8, v10 -; CGP-NEXT: v_mul_hi_u32 v10, v3, v10 -; CGP-NEXT: v_add_i32_e32 v7, vcc, v13, v7 -; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v12 -; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v12, vcc, v13, v12 -; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v11 +; CGP-NEXT: v_add_i32_e32 v6, vcc, v11, v6 ; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v11, vcc, 
v12, v11 -; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v11 -; CGP-NEXT: v_mul_lo_u32 v11, v4, v7 -; CGP-NEXT: v_mul_lo_u32 v10, v5, v10 -; CGP-NEXT: v_mul_lo_u32 v12, v5, v7 -; CGP-NEXT: v_mul_hi_u32 v7, v5, v7 +; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v10 +; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; CGP-NEXT: v_add_i32_e32 v7, vcc, v10, v7 -; CGP-NEXT: v_sub_i32_e32 v8, vcc, v8, v12 -; CGP-NEXT: v_subb_u32_e64 v10, s[4:5], v3, v7, vcc -; CGP-NEXT: v_sub_i32_e64 v3, s[4:5], v3, v7 +; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v9 +; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v9, vcc, v10, v9 +; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v9 +; CGP-NEXT: v_mul_lo_u32 v9, v4, v6 +; CGP-NEXT: v_mul_lo_u32 v7, v5, v7 +; CGP-NEXT: v_mul_lo_u32 v10, v5, v6 +; CGP-NEXT: v_mul_hi_u32 v6, v5, v6 +; CGP-NEXT: v_add_i32_e32 v7, vcc, v9, v7 +; CGP-NEXT: v_add_i32_e32 v6, vcc, v7, v6 +; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v10 +; CGP-NEXT: v_subb_u32_e64 v7, s[4:5], v3, v6, vcc +; CGP-NEXT: v_sub_i32_e64 v3, s[4:5], v3, v6 +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v7, v4 +; CGP-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5] +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v5 +; CGP-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5] +; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v7, v4 +; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v4, vcc +; CGP-NEXT: v_cndmask_b32_e64 v6, v6, v9, s[4:5] +; CGP-NEXT: v_sub_i32_e32 v9, vcc, v2, v5 +; CGP-NEXT: v_subbrev_u32_e64 v10, s[4:5], 0, v3, vcc ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v10, v4 -; CGP-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v5 ; CGP-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v10, v4 +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v5 ; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v4, vcc -; CGP-NEXT: v_cndmask_b32_e64 v7, v7, v11, s[4:5] -; CGP-NEXT: v_sub_i32_e32 v11, vcc, v8, v5 -; CGP-NEXT: v_subbrev_u32_e64 v12, s[4:5], 0, v3, vcc -; 
CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v12, v4 -; CGP-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v11, v5 -; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v4, vcc -; CGP-NEXT: v_cndmask_b32_e64 v14, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v12, v4 -; CGP-NEXT: v_sub_i32_e32 v4, vcc, v11, v5 -; CGP-NEXT: v_cndmask_b32_e64 v13, v13, v14, s[4:5] +; CGP-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[4:5] +; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v10, v4 +; CGP-NEXT: v_sub_i32_e32 v4, vcc, v9, v5 +; CGP-NEXT: v_cndmask_b32_e64 v11, v11, v12, s[4:5] ; CGP-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc -; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 -; CGP-NEXT: v_cndmask_b32_e32 v4, v11, v4, vcc -; CGP-NEXT: v_cndmask_b32_e32 v3, v12, v3, vcc -; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 -; CGP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc +; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 +; CGP-NEXT: v_cndmask_b32_e32 v4, v9, v4, vcc ; CGP-NEXT: v_cndmask_b32_e32 v3, v10, v3, vcc -; CGP-NEXT: v_xor_b32_e32 v4, v4, v9 -; CGP-NEXT: v_xor_b32_e32 v3, v3, v9 -; CGP-NEXT: v_sub_i32_e32 v4, vcc, v4, v9 -; CGP-NEXT: v_subb_u32_e32 v5, vcc, v3, v9, vcc +; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; CGP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc +; CGP-NEXT: v_xor_b32_e32 v2, v2, v8 +; CGP-NEXT: v_sub_i32_e32 v4, vcc, v2, v8 +; CGP-NEXT: v_xor_b32_e32 v3, v3, v8 +; CGP-NEXT: v_subb_u32_e32 v5, vcc, v3, v8, vcc +; CGP-NEXT: ; implicit-def: $vgpr2 +; CGP-NEXT: ; implicit-def: $vgpr6 ; CGP-NEXT: BB2_6: ; %Flow ; CGP-NEXT: s_or_saveexec_b64 s[4:5], s[6:7] ; CGP-NEXT: s_xor_b64 exec, exec, s[4:5] @@ -2480,144 +2486,146 @@ ; CHECK-NEXT: ; %bb.1: ; CHECK-NEXT: v_ashrrev_i32_e32 v2, 31, v5 ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v4, v2 -; CHECK-NEXT: v_addc_u32_e32 v5, vcc, v5, v2, vcc +; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v5, v2, vcc ; CHECK-NEXT: v_xor_b32_e32 v3, v3, v2 -; CHECK-NEXT: v_xor_b32_e32 v2, v5, v2 -; CHECK-NEXT: v_cvt_f32_u32_e32 v5, v3 
-; CHECK-NEXT: v_cvt_f32_u32_e32 v6, v2 -; CHECK-NEXT: v_ashrrev_i32_e32 v7, 31, v1 -; CHECK-NEXT: v_mac_f32_e32 v5, 0x4f800000, v6 -; CHECK-NEXT: v_rcp_iflag_f32_e32 v5, v5 -; CHECK-NEXT: v_add_i32_e32 v6, vcc, v0, v7 -; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v1, v7, vcc -; CHECK-NEXT: v_sub_i32_e32 v9, vcc, 0, v3 -; CHECK-NEXT: v_mul_f32_e32 v5, 0x5f7ffffc, v5 -; CHECK-NEXT: v_mul_f32_e32 v8, 0x2f800000, v5 -; CHECK-NEXT: v_trunc_f32_e32 v8, v8 -; CHECK-NEXT: v_mac_f32_e32 v5, 0xcf800000, v8 +; CHECK-NEXT: v_xor_b32_e32 v2, v4, v2 +; CHECK-NEXT: v_cvt_f32_u32_e32 v4, v3 +; CHECK-NEXT: v_cvt_f32_u32_e32 v5, v2 +; CHECK-NEXT: v_ashrrev_i32_e32 v6, 31, v1 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v6 +; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v1, v6, vcc +; CHECK-NEXT: v_mac_f32_e32 v4, 0x4f800000, v5 +; CHECK-NEXT: v_rcp_iflag_f32_e32 v4, v4 +; CHECK-NEXT: v_sub_i32_e32 v7, vcc, 0, v3 +; CHECK-NEXT: v_subb_u32_e32 v8, vcc, 0, v2, vcc +; CHECK-NEXT: v_xor_b32_e32 v0, v0, v6 +; CHECK-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 +; CHECK-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4 +; CHECK-NEXT: v_trunc_f32_e32 v5, v5 +; CHECK-NEXT: v_mac_f32_e32 v4, 0xcf800000, v5 +; CHECK-NEXT: v_cvt_u32_f32_e32 v4, v4 ; CHECK-NEXT: v_cvt_u32_f32_e32 v5, v5 -; CHECK-NEXT: v_cvt_u32_f32_e32 v8, v8 -; CHECK-NEXT: v_subb_u32_e32 v10, vcc, 0, v2, vcc -; CHECK-NEXT: v_xor_b32_e32 v6, v6, v7 -; CHECK-NEXT: v_mul_lo_u32 v11, v10, v5 -; CHECK-NEXT: v_mul_lo_u32 v12, v9, v8 -; CHECK-NEXT: v_mul_hi_u32 v14, v9, v5 -; CHECK-NEXT: v_mul_lo_u32 v13, v9, v5 -; CHECK-NEXT: v_xor_b32_e32 v1, v1, v7 +; CHECK-NEXT: v_xor_b32_e32 v1, v1, v6 +; CHECK-NEXT: v_mul_lo_u32 v9, v8, v4 +; CHECK-NEXT: v_mul_lo_u32 v10, v7, v5 +; CHECK-NEXT: v_mul_hi_u32 v12, v7, v4 +; CHECK-NEXT: v_mul_lo_u32 v11, v7, v4 +; CHECK-NEXT: v_add_i32_e32 v9, vcc, v9, v10 +; CHECK-NEXT: v_add_i32_e32 v9, vcc, v9, v12 +; CHECK-NEXT: v_mul_lo_u32 v10, v5, v11 +; CHECK-NEXT: v_mul_lo_u32 v12, v4, v9 +; CHECK-NEXT: v_mul_hi_u32 v13, v4, v11 +; 
CHECK-NEXT: v_mul_hi_u32 v11, v5, v11 +; CHECK-NEXT: v_add_i32_e32 v10, vcc, v10, v12 +; CHECK-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v10, vcc, v10, v13 +; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; CHECK-NEXT: v_mul_lo_u32 v13, v5, v9 +; CHECK-NEXT: v_add_i32_e32 v10, vcc, v12, v10 +; CHECK-NEXT: v_mul_hi_u32 v12, v4, v9 +; CHECK-NEXT: v_mul_hi_u32 v9, v5, v9 +; CHECK-NEXT: v_add_i32_e32 v11, vcc, v13, v11 +; CHECK-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v11, vcc, v11, v12 -; CHECK-NEXT: v_add_i32_e32 v11, vcc, v11, v14 -; CHECK-NEXT: v_mul_lo_u32 v12, v8, v13 -; CHECK-NEXT: v_mul_lo_u32 v14, v5, v11 -; CHECK-NEXT: v_mul_hi_u32 v15, v5, v13 -; CHECK-NEXT: v_mul_hi_u32 v13, v8, v13 -; CHECK-NEXT: v_add_i32_e32 v12, vcc, v12, v14 -; CHECK-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v12, vcc, v12, v15 ; CHECK-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; CHECK-NEXT: v_mul_lo_u32 v15, v8, v11 -; CHECK-NEXT: v_add_i32_e32 v12, vcc, v14, v12 -; CHECK-NEXT: v_mul_hi_u32 v14, v5, v11 -; CHECK-NEXT: v_mul_hi_u32 v11, v8, v11 -; CHECK-NEXT: v_add_i32_e32 v13, vcc, v15, v13 -; CHECK-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v13, vcc, v13, v14 -; CHECK-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v14, vcc, v15, v14 ; CHECK-NEXT: v_add_i32_e32 v12, vcc, v13, v12 -; CHECK-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; CHECK-NEXT: v_add_i32_e32 v11, vcc, v11, v13 -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v12 -; CHECK-NEXT: v_addc_u32_e64 v12, s[4:5], v8, v11, vcc -; CHECK-NEXT: v_mul_lo_u32 v10, v10, v5 -; CHECK-NEXT: v_mul_lo_u32 v13, v9, v12 -; CHECK-NEXT: v_mul_lo_u32 v14, v9, v5 -; CHECK-NEXT: v_mul_hi_u32 v9, v9, v5 +; CHECK-NEXT: v_add_i32_e32 v10, vcc, v11, v10 +; CHECK-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v11, vcc, v12, v11 +; CHECK-NEXT: v_add_i32_e32 v9, vcc, v9, v11 
+; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v10 +; CHECK-NEXT: v_addc_u32_e64 v10, s[4:5], v5, v9, vcc +; CHECK-NEXT: v_mul_lo_u32 v8, v8, v4 +; CHECK-NEXT: v_mul_lo_u32 v11, v7, v10 +; CHECK-NEXT: v_mul_lo_u32 v12, v7, v4 +; CHECK-NEXT: v_mul_hi_u32 v7, v7, v4 +; CHECK-NEXT: v_add_i32_e64 v5, s[4:5], v5, v9 +; CHECK-NEXT: v_add_i32_e64 v8, s[4:5], v8, v11 +; CHECK-NEXT: v_mul_hi_u32 v9, v4, v12 +; CHECK-NEXT: v_add_i32_e64 v7, s[4:5], v8, v7 +; CHECK-NEXT: v_mul_lo_u32 v8, v10, v12 +; CHECK-NEXT: v_mul_lo_u32 v11, v4, v7 +; CHECK-NEXT: v_mul_hi_u32 v12, v10, v12 ; CHECK-NEXT: v_add_i32_e64 v8, s[4:5], v8, v11 -; CHECK-NEXT: v_add_i32_e64 v10, s[4:5], v10, v13 -; CHECK-NEXT: v_mul_hi_u32 v11, v5, v14 -; CHECK-NEXT: v_add_i32_e64 v9, s[4:5], v10, v9 -; CHECK-NEXT: v_mul_lo_u32 v10, v12, v14 -; CHECK-NEXT: v_mul_lo_u32 v13, v5, v9 -; CHECK-NEXT: v_mul_hi_u32 v14, v12, v14 -; CHECK-NEXT: v_add_i32_e64 v10, s[4:5], v10, v13 -; CHECK-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5] -; CHECK-NEXT: v_add_i32_e64 v10, s[4:5], v10, v11 -; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[4:5] -; CHECK-NEXT: v_mul_lo_u32 v11, v12, v9 -; CHECK-NEXT: v_add_i32_e64 v10, s[4:5], v13, v10 -; CHECK-NEXT: v_mul_hi_u32 v13, v5, v9 -; CHECK-NEXT: v_mul_hi_u32 v9, v12, v9 -; CHECK-NEXT: v_add_i32_e64 v11, s[4:5], v11, v14 -; CHECK-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5] -; CHECK-NEXT: v_add_i32_e64 v11, s[4:5], v11, v13 -; CHECK-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5] -; CHECK-NEXT: v_add_i32_e64 v13, s[4:5], v14, v13 -; CHECK-NEXT: v_add_i32_e64 v10, s[4:5], v11, v10 ; CHECK-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[4:5] -; CHECK-NEXT: v_add_i32_e64 v11, s[4:5], v13, v11 +; CHECK-NEXT: v_add_i32_e64 v8, s[4:5], v8, v9 +; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[4:5] +; CHECK-NEXT: v_mul_lo_u32 v9, v10, v7 +; CHECK-NEXT: v_add_i32_e64 v8, s[4:5], v11, v8 +; CHECK-NEXT: v_mul_hi_u32 v11, v4, v7 +; CHECK-NEXT: v_mul_hi_u32 v7, v10, v7 +; CHECK-NEXT: v_add_i32_e64 v9, s[4:5], v9, v12 +; CHECK-NEXT: 
v_cndmask_b32_e64 v12, 0, 1, s[4:5] ; CHECK-NEXT: v_add_i32_e64 v9, s[4:5], v9, v11 -; CHECK-NEXT: v_addc_u32_e32 v8, vcc, v8, v9, vcc -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v10 -; CHECK-NEXT: v_addc_u32_e32 v8, vcc, 0, v8, vcc +; CHECK-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[4:5] +; CHECK-NEXT: v_add_i32_e64 v11, s[4:5], v12, v11 +; CHECK-NEXT: v_add_i32_e64 v8, s[4:5], v9, v8 +; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[4:5] +; CHECK-NEXT: v_add_i32_e64 v9, s[4:5], v11, v9 +; CHECK-NEXT: v_add_i32_e64 v7, s[4:5], v7, v9 +; CHECK-NEXT: v_addc_u32_e32 v5, vcc, v5, v7, vcc +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v8 +; CHECK-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; CHECK-NEXT: v_mul_lo_u32 v7, v1, v4 +; CHECK-NEXT: v_mul_lo_u32 v8, v0, v5 +; CHECK-NEXT: v_mul_hi_u32 v9, v0, v4 +; CHECK-NEXT: v_mul_hi_u32 v4, v1, v4 +; CHECK-NEXT: v_add_i32_e32 v7, vcc, v7, v8 +; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v7, vcc, v7, v9 +; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; CHECK-NEXT: v_mul_lo_u32 v9, v1, v5 -; CHECK-NEXT: v_mul_lo_u32 v10, v6, v8 -; CHECK-NEXT: v_mul_hi_u32 v11, v6, v5 +; CHECK-NEXT: v_add_i32_e32 v7, vcc, v8, v7 +; CHECK-NEXT: v_mul_hi_u32 v8, v0, v5 ; CHECK-NEXT: v_mul_hi_u32 v5, v1, v5 -; CHECK-NEXT: v_add_i32_e32 v9, vcc, v9, v10 -; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v9, vcc, v9, v11 -; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; CHECK-NEXT: v_mul_lo_u32 v11, v1, v8 -; CHECK-NEXT: v_add_i32_e32 v9, vcc, v10, v9 -; CHECK-NEXT: v_mul_hi_u32 v10, v6, v8 -; CHECK-NEXT: v_mul_hi_u32 v8, v1, v8 -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v11, v5 -; CHECK-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v10 -; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v9 +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v9, v4 ; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; CHECK-NEXT: 
v_add_i32_e32 v9, vcc, v10, v9 -; CHECK-NEXT: v_add_i32_e32 v8, vcc, v8, v9 -; CHECK-NEXT: v_mul_lo_u32 v9, v2, v5 -; CHECK-NEXT: v_mul_lo_u32 v8, v3, v8 -; CHECK-NEXT: v_mul_lo_u32 v10, v3, v5 -; CHECK-NEXT: v_mul_hi_u32 v5, v3, v5 +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v8 +; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v8, vcc, v9, v8 -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v8, v5 -; CHECK-NEXT: v_sub_i32_e32 v6, vcc, v6, v10 -; CHECK-NEXT: v_subb_u32_e64 v8, s[4:5], v1, v5, vcc -; CHECK-NEXT: v_sub_i32_e64 v1, s[4:5], v1, v5 +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v7 +; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v7, vcc, v8, v7 +; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v7 +; CHECK-NEXT: v_mul_lo_u32 v7, v2, v4 +; CHECK-NEXT: v_mul_lo_u32 v5, v3, v5 +; CHECK-NEXT: v_mul_lo_u32 v8, v3, v4 +; CHECK-NEXT: v_mul_hi_u32 v4, v3, v4 +; CHECK-NEXT: v_add_i32_e32 v5, vcc, v7, v5 +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v5, v4 +; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v8 +; CHECK-NEXT: v_subb_u32_e64 v5, s[4:5], v1, v4, vcc +; CHECK-NEXT: v_sub_i32_e64 v1, s[4:5], v1, v4 +; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v5, v2 +; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[4:5] +; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v3 +; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5] +; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], v5, v2 +; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v2, vcc +; CHECK-NEXT: v_cndmask_b32_e64 v4, v4, v7, s[4:5] +; CHECK-NEXT: v_sub_i32_e32 v7, vcc, v0, v3 +; CHECK-NEXT: v_subbrev_u32_e64 v8, s[4:5], 0, v1, vcc ; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v2 -; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[4:5] -; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v6, v3 ; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5] -; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], v8, v2 +; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v7, v3 ; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v2, vcc -; CHECK-NEXT: v_cndmask_b32_e64 v5, v5, v9, s[4:5] -; CHECK-NEXT: 
v_sub_i32_e32 v9, vcc, v6, v3 -; CHECK-NEXT: v_subbrev_u32_e64 v10, s[4:5], 0, v1, vcc -; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v10, v2 -; CHECK-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5] -; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v3 -; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v2, vcc -; CHECK-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[4:5] -; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], v10, v2 -; CHECK-NEXT: v_sub_i32_e32 v2, vcc, v9, v3 -; CHECK-NEXT: v_cndmask_b32_e64 v11, v11, v12, s[4:5] +; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5] +; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], v8, v2 +; CHECK-NEXT: v_sub_i32_e32 v2, vcc, v7, v3 +; CHECK-NEXT: v_cndmask_b32_e64 v9, v9, v10, s[4:5] ; CHECK-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc -; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 -; CHECK-NEXT: v_cndmask_b32_e32 v2, v9, v2, vcc -; CHECK-NEXT: v_cndmask_b32_e32 v1, v10, v1, vcc -; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 -; CHECK-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc +; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9 +; CHECK-NEXT: v_cndmask_b32_e32 v2, v7, v2, vcc ; CHECK-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc -; CHECK-NEXT: v_xor_b32_e32 v2, v2, v7 -; CHECK-NEXT: v_xor_b32_e32 v1, v1, v7 -; CHECK-NEXT: v_sub_i32_e32 v2, vcc, v2, v7 -; CHECK-NEXT: v_subb_u32_e32 v3, vcc, v1, v7, vcc +; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; CHECK-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; CHECK-NEXT: v_xor_b32_e32 v0, v0, v6 +; CHECK-NEXT: v_sub_i32_e32 v2, vcc, v0, v6 +; CHECK-NEXT: v_xor_b32_e32 v1, v1, v6 +; CHECK-NEXT: v_subb_u32_e32 v3, vcc, v1, v6, vcc +; CHECK-NEXT: ; implicit-def: $vgpr0 +; CHECK-NEXT: ; implicit-def: $vgpr4_vgpr5 ; CHECK-NEXT: BB7_2: ; %Flow ; CHECK-NEXT: s_or_saveexec_b64 s[4:5], s[6:7] ; CHECK-NEXT: s_xor_b64 exec, exec, s[4:5] @@ -2965,139 +2973,141 @@ ; CGP-NEXT: v_xor_b32_e32 v0, v4, v0 ; CGP-NEXT: v_cvt_f32_u32_e32 v4, v1 ; CGP-NEXT: v_cvt_f32_u32_e32 v6, v0 -; CGP-NEXT: v_ashrrev_i32_e32 v11, 31, v7 +; CGP-NEXT: 
v_ashrrev_i32_e32 v10, 31, v7 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v10 +; CGP-NEXT: v_xor_b32_e32 v5, v5, v10 ; CGP-NEXT: v_mac_f32_e32 v4, 0x4f800000, v6 ; CGP-NEXT: v_rcp_iflag_f32_e32 v4, v4 -; CGP-NEXT: v_add_i32_e32 v6, vcc, v5, v11 -; CGP-NEXT: v_addc_u32_e32 v7, vcc, v7, v11, vcc -; CGP-NEXT: v_sub_i32_e32 v13, vcc, 0, v1 +; CGP-NEXT: v_addc_u32_e32 v6, vcc, v7, v10, vcc +; CGP-NEXT: v_sub_i32_e32 v11, vcc, 0, v1 +; CGP-NEXT: v_subb_u32_e32 v12, vcc, 0, v0, vcc ; CGP-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 -; CGP-NEXT: v_mul_f32_e32 v12, 0x2f800000, v4 -; CGP-NEXT: v_trunc_f32_e32 v12, v12 -; CGP-NEXT: v_mac_f32_e32 v4, 0xcf800000, v12 +; CGP-NEXT: v_mul_f32_e32 v7, 0x2f800000, v4 +; CGP-NEXT: v_trunc_f32_e32 v7, v7 +; CGP-NEXT: v_mac_f32_e32 v4, 0xcf800000, v7 ; CGP-NEXT: v_cvt_u32_f32_e32 v4, v4 -; CGP-NEXT: v_cvt_u32_f32_e32 v12, v12 -; CGP-NEXT: v_subb_u32_e32 v14, vcc, 0, v0, vcc -; CGP-NEXT: v_xor_b32_e32 v6, v6, v11 -; CGP-NEXT: v_mul_lo_u32 v15, v14, v4 -; CGP-NEXT: v_mul_lo_u32 v16, v13, v12 -; CGP-NEXT: v_mul_hi_u32 v18, v13, v4 -; CGP-NEXT: v_mul_lo_u32 v17, v13, v4 -; CGP-NEXT: v_xor_b32_e32 v7, v7, v11 +; CGP-NEXT: v_cvt_u32_f32_e32 v7, v7 +; CGP-NEXT: v_xor_b32_e32 v6, v6, v10 +; CGP-NEXT: v_mul_lo_u32 v13, v12, v4 +; CGP-NEXT: v_mul_lo_u32 v14, v11, v7 +; CGP-NEXT: v_mul_hi_u32 v16, v11, v4 +; CGP-NEXT: v_mul_lo_u32 v15, v11, v4 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v14 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v16 +; CGP-NEXT: v_mul_lo_u32 v14, v7, v15 +; CGP-NEXT: v_mul_lo_u32 v16, v4, v13 +; CGP-NEXT: v_mul_hi_u32 v17, v4, v15 +; CGP-NEXT: v_mul_hi_u32 v15, v7, v15 +; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v16 +; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v17 +; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; CGP-NEXT: v_mul_lo_u32 v17, v7, v13 +; CGP-NEXT: v_add_i32_e32 v14, vcc, v16, v14 +; CGP-NEXT: v_mul_hi_u32 v16, v4, v13 +; CGP-NEXT: v_mul_hi_u32 v13, v7, v13 +; CGP-NEXT: v_add_i32_e32 
v15, vcc, v17, v15 +; CGP-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v15, vcc, v15, v16 -; CGP-NEXT: v_add_i32_e32 v15, vcc, v15, v18 -; CGP-NEXT: v_mul_lo_u32 v16, v12, v17 -; CGP-NEXT: v_mul_lo_u32 v18, v4, v15 -; CGP-NEXT: v_mul_hi_u32 v19, v4, v17 -; CGP-NEXT: v_mul_hi_u32 v17, v12, v17 -; CGP-NEXT: v_add_i32_e32 v16, vcc, v16, v18 -; CGP-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v16, vcc, v16, v19 ; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v19, v12, v15 -; CGP-NEXT: v_add_i32_e32 v16, vcc, v18, v16 -; CGP-NEXT: v_mul_hi_u32 v18, v4, v15 -; CGP-NEXT: v_mul_hi_u32 v15, v12, v15 -; CGP-NEXT: v_add_i32_e32 v17, vcc, v19, v17 -; CGP-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v17, vcc, v17, v18 -; CGP-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v18, vcc, v19, v18 ; CGP-NEXT: v_add_i32_e32 v16, vcc, v17, v16 -; CGP-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v17, vcc, v18, v17 -; CGP-NEXT: v_add_i32_e32 v15, vcc, v15, v17 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v16 -; CGP-NEXT: v_addc_u32_e64 v16, s[4:5], v12, v15, vcc -; CGP-NEXT: v_mul_lo_u32 v14, v14, v4 -; CGP-NEXT: v_mul_lo_u32 v17, v13, v16 -; CGP-NEXT: v_mul_lo_u32 v18, v13, v4 -; CGP-NEXT: v_mul_hi_u32 v13, v13, v4 +; CGP-NEXT: v_add_i32_e32 v14, vcc, v15, v14 +; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v15, vcc, v16, v15 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v15 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v14 +; CGP-NEXT: v_addc_u32_e64 v14, s[4:5], v7, v13, vcc +; CGP-NEXT: v_mul_lo_u32 v12, v12, v4 +; CGP-NEXT: v_mul_lo_u32 v15, v11, v14 +; CGP-NEXT: v_mul_lo_u32 v16, v11, v4 +; CGP-NEXT: v_mul_hi_u32 v11, v11, v4 +; CGP-NEXT: v_add_i32_e64 v7, s[4:5], v7, v13 +; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v12, v15 +; CGP-NEXT: v_mul_hi_u32 v13, v4, v16 +; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v12, v11 +; CGP-NEXT: v_mul_lo_u32 v12, v14, v16 +; 
CGP-NEXT: v_mul_lo_u32 v15, v4, v11 +; CGP-NEXT: v_mul_hi_u32 v16, v14, v16 ; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v12, v15 -; CGP-NEXT: v_add_i32_e64 v14, s[4:5], v14, v17 -; CGP-NEXT: v_mul_hi_u32 v15, v4, v18 -; CGP-NEXT: v_add_i32_e64 v13, s[4:5], v14, v13 -; CGP-NEXT: v_mul_lo_u32 v14, v16, v18 -; CGP-NEXT: v_mul_lo_u32 v17, v4, v13 -; CGP-NEXT: v_mul_hi_u32 v18, v16, v18 -; CGP-NEXT: v_add_i32_e64 v14, s[4:5], v14, v17 -; CGP-NEXT: v_cndmask_b32_e64 v17, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v14, s[4:5], v14, v15 -; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5] -; CGP-NEXT: v_mul_lo_u32 v15, v16, v13 -; CGP-NEXT: v_add_i32_e64 v14, s[4:5], v17, v14 -; CGP-NEXT: v_mul_hi_u32 v17, v4, v13 -; CGP-NEXT: v_mul_hi_u32 v13, v16, v13 -; CGP-NEXT: v_add_i32_e64 v15, s[4:5], v15, v18 -; CGP-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v15, s[4:5], v15, v17 -; CGP-NEXT: v_cndmask_b32_e64 v17, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v17, s[4:5], v18, v17 -; CGP-NEXT: v_add_i32_e64 v14, s[4:5], v15, v14 ; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v15, s[4:5], v17, v15 +; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v12, v13 +; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] +; CGP-NEXT: v_mul_lo_u32 v13, v14, v11 +; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v15, v12 +; CGP-NEXT: v_mul_hi_u32 v15, v4, v11 +; CGP-NEXT: v_mul_hi_u32 v11, v14, v11 +; CGP-NEXT: v_add_i32_e64 v13, s[4:5], v13, v16 +; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[4:5] ; CGP-NEXT: v_add_i32_e64 v13, s[4:5], v13, v15 -; CGP-NEXT: v_addc_u32_e32 v12, vcc, v12, v13, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v14 -; CGP-NEXT: v_addc_u32_e32 v12, vcc, 0, v12, vcc -; CGP-NEXT: v_mul_lo_u32 v13, v7, v4 -; CGP-NEXT: v_mul_lo_u32 v14, v6, v12 -; CGP-NEXT: v_mul_hi_u32 v15, v6, v4 -; CGP-NEXT: v_mul_hi_u32 v4, v7, v4 -; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v14 -; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v15 -; CGP-NEXT: 
v_cndmask_b32_e64 v13, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v15, v7, v12 -; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; CGP-NEXT: v_mul_hi_u32 v14, v6, v12 -; CGP-NEXT: v_mul_hi_u32 v12, v7, v12 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v15, v4 -; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v14 -; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v14, vcc, v15, v14 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v13 +; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[4:5] +; CGP-NEXT: v_add_i32_e64 v15, s[4:5], v16, v15 +; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v13, v12 +; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5] +; CGP-NEXT: v_add_i32_e64 v13, s[4:5], v15, v13 +; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v11, v13 +; CGP-NEXT: v_addc_u32_e32 v7, vcc, v7, v11, vcc +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v12 +; CGP-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; CGP-NEXT: v_mul_lo_u32 v11, v6, v4 +; CGP-NEXT: v_mul_lo_u32 v12, v5, v7 +; CGP-NEXT: v_mul_hi_u32 v13, v5, v4 +; CGP-NEXT: v_mul_hi_u32 v4, v6, v4 +; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v12 +; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v13 +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; CGP-NEXT: v_mul_lo_u32 v13, v6, v7 +; CGP-NEXT: v_add_i32_e32 v11, vcc, v12, v11 +; CGP-NEXT: v_mul_hi_u32 v12, v5, v7 +; CGP-NEXT: v_mul_hi_u32 v7, v6, v7 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v13, v4 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v13 -; CGP-NEXT: v_mul_lo_u32 v13, v0, v4 -; CGP-NEXT: v_mul_lo_u32 v12, v1, v12 -; CGP-NEXT: v_mul_lo_u32 v14, v1, v4 -; CGP-NEXT: v_mul_hi_u32 v4, v1, v4 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v12 +; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v12, vcc, v13, v12 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v12, v4 -; CGP-NEXT: v_sub_i32_e32 v6, vcc, v6, v14 -; CGP-NEXT: v_subb_u32_e64 v12, s[4:5], v7, 
v4, vcc -; CGP-NEXT: v_sub_i32_e64 v4, s[4:5], v7, v4 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v11 +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v11, vcc, v12, v11 +; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v11 +; CGP-NEXT: v_mul_lo_u32 v11, v0, v4 +; CGP-NEXT: v_mul_lo_u32 v7, v1, v7 +; CGP-NEXT: v_mul_lo_u32 v12, v1, v4 +; CGP-NEXT: v_mul_hi_u32 v4, v1, v4 +; CGP-NEXT: v_add_i32_e32 v7, vcc, v11, v7 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v7, v4 +; CGP-NEXT: v_sub_i32_e32 v5, vcc, v5, v12 +; CGP-NEXT: v_subb_u32_e64 v7, s[4:5], v6, v4, vcc +; CGP-NEXT: v_sub_i32_e64 v4, s[4:5], v6, v4 +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v7, v0 +; CGP-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5] +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v5, v1 +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5] +; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v7, v0 +; CGP-NEXT: v_subb_u32_e32 v4, vcc, v4, v0, vcc +; CGP-NEXT: v_cndmask_b32_e64 v6, v6, v11, s[4:5] +; CGP-NEXT: v_sub_i32_e32 v11, vcc, v5, v1 +; CGP-NEXT: v_subbrev_u32_e64 v12, s[4:5], 0, v4, vcc ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v12, v0 -; CGP-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v6, v1 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[4:5] +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v11, v1 +; CGP-NEXT: v_cndmask_b32_e64 v14, 0, -1, s[4:5] ; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v12, v0 -; CGP-NEXT: v_subb_u32_e32 v4, vcc, v4, v0, vcc -; CGP-NEXT: v_cndmask_b32_e64 v7, v7, v13, s[4:5] -; CGP-NEXT: v_sub_i32_e32 v13, vcc, v6, v1 -; CGP-NEXT: v_subbrev_u32_e64 v14, s[4:5], 0, v4, vcc -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v14, v0 -; CGP-NEXT: v_cndmask_b32_e64 v15, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v13, v1 -; CGP-NEXT: v_cndmask_b32_e64 v16, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v14, v0 ; CGP-NEXT: v_subb_u32_e32 v0, vcc, v4, v0, vcc -; CGP-NEXT: v_sub_i32_e32 v1, vcc, v13, v1 -; CGP-NEXT: v_cndmask_b32_e64 v15, v15, v16, s[4:5] +; CGP-NEXT: v_sub_i32_e32 v1, vcc, v11, 
v1 +; CGP-NEXT: v_cndmask_b32_e64 v13, v13, v14, s[4:5] ; CGP-NEXT: v_subbrev_u32_e32 v0, vcc, 0, v0, vcc -; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v15 -; CGP-NEXT: v_cndmask_b32_e32 v1, v13, v1, vcc -; CGP-NEXT: v_cndmask_b32_e32 v0, v14, v0, vcc -; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 -; CGP-NEXT: v_cndmask_b32_e32 v1, v6, v1, vcc +; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; CGP-NEXT: v_cndmask_b32_e32 v1, v11, v1, vcc ; CGP-NEXT: v_cndmask_b32_e32 v0, v12, v0, vcc -; CGP-NEXT: v_xor_b32_e32 v1, v1, v11 -; CGP-NEXT: v_xor_b32_e32 v4, v0, v11 -; CGP-NEXT: v_sub_i32_e32 v0, vcc, v1, v11 -; CGP-NEXT: v_subb_u32_e32 v1, vcc, v4, v11, vcc +; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; CGP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; CGP-NEXT: v_cndmask_b32_e32 v0, v7, v0, vcc +; CGP-NEXT: v_xor_b32_e32 v1, v1, v10 +; CGP-NEXT: v_xor_b32_e32 v4, v0, v10 +; CGP-NEXT: v_sub_i32_e32 v0, vcc, v1, v10 +; CGP-NEXT: v_subb_u32_e32 v1, vcc, v4, v10, vcc +; CGP-NEXT: ; implicit-def: $vgpr5 +; CGP-NEXT: ; implicit-def: $vgpr10_vgpr11 ; CGP-NEXT: BB8_2: ; %Flow2 ; CGP-NEXT: s_or_saveexec_b64 s[4:5], s[6:7] ; CGP-NEXT: s_xor_b64 exec, exec, s[4:5] @@ -3138,139 +3148,141 @@ ; CGP-NEXT: v_xor_b32_e32 v4, v6, v4 ; CGP-NEXT: v_cvt_f32_u32_e32 v6, v5 ; CGP-NEXT: v_cvt_f32_u32_e32 v7, v4 -; CGP-NEXT: v_ashrrev_i32_e32 v9, 31, v3 +; CGP-NEXT: v_ashrrev_i32_e32 v8, 31, v3 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v8 +; CGP-NEXT: v_addc_u32_e32 v3, vcc, v3, v8, vcc ; CGP-NEXT: v_mac_f32_e32 v6, 0x4f800000, v7 ; CGP-NEXT: v_rcp_iflag_f32_e32 v6, v6 -; CGP-NEXT: v_add_i32_e32 v7, vcc, v2, v9 -; CGP-NEXT: v_addc_u32_e32 v3, vcc, v3, v9, vcc -; CGP-NEXT: v_sub_i32_e32 v11, vcc, 0, v5 +; CGP-NEXT: v_sub_i32_e32 v9, vcc, 0, v5 +; CGP-NEXT: v_subb_u32_e32 v10, vcc, 0, v4, vcc +; CGP-NEXT: v_xor_b32_e32 v2, v2, v8 ; CGP-NEXT: v_mul_f32_e32 v6, 0x5f7ffffc, v6 -; CGP-NEXT: v_mul_f32_e32 v10, 0x2f800000, v6 -; CGP-NEXT: v_trunc_f32_e32 v10, v10 -; CGP-NEXT: v_mac_f32_e32 v6, 0xcf800000, v10 +; CGP-NEXT: 
v_mul_f32_e32 v7, 0x2f800000, v6 +; CGP-NEXT: v_trunc_f32_e32 v7, v7 +; CGP-NEXT: v_mac_f32_e32 v6, 0xcf800000, v7 ; CGP-NEXT: v_cvt_u32_f32_e32 v6, v6 -; CGP-NEXT: v_cvt_u32_f32_e32 v10, v10 -; CGP-NEXT: v_subb_u32_e32 v12, vcc, 0, v4, vcc -; CGP-NEXT: v_xor_b32_e32 v7, v7, v9 -; CGP-NEXT: v_mul_lo_u32 v13, v12, v6 -; CGP-NEXT: v_mul_lo_u32 v14, v11, v10 -; CGP-NEXT: v_mul_hi_u32 v16, v11, v6 -; CGP-NEXT: v_mul_lo_u32 v15, v11, v6 -; CGP-NEXT: v_xor_b32_e32 v3, v3, v9 +; CGP-NEXT: v_cvt_u32_f32_e32 v7, v7 +; CGP-NEXT: v_xor_b32_e32 v3, v3, v8 +; CGP-NEXT: v_mul_lo_u32 v11, v10, v6 +; CGP-NEXT: v_mul_lo_u32 v12, v9, v7 +; CGP-NEXT: v_mul_hi_u32 v14, v9, v6 +; CGP-NEXT: v_mul_lo_u32 v13, v9, v6 +; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v12 +; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v14 +; CGP-NEXT: v_mul_lo_u32 v12, v7, v13 +; CGP-NEXT: v_mul_lo_u32 v14, v6, v11 +; CGP-NEXT: v_mul_hi_u32 v15, v6, v13 +; CGP-NEXT: v_mul_hi_u32 v13, v7, v13 +; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v14 +; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v15 +; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; CGP-NEXT: v_mul_lo_u32 v15, v7, v11 +; CGP-NEXT: v_add_i32_e32 v12, vcc, v14, v12 +; CGP-NEXT: v_mul_hi_u32 v14, v6, v11 +; CGP-NEXT: v_mul_hi_u32 v11, v7, v11 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v15, v13 +; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v14 -; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v16 -; CGP-NEXT: v_mul_lo_u32 v14, v10, v15 -; CGP-NEXT: v_mul_lo_u32 v16, v6, v13 -; CGP-NEXT: v_mul_hi_u32 v17, v6, v15 -; CGP-NEXT: v_mul_hi_u32 v15, v10, v15 -; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v16 -; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v17 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v17, v10, v13 -; CGP-NEXT: v_add_i32_e32 v14, vcc, v16, v14 -; CGP-NEXT: v_mul_hi_u32 v16, v6, v13 -; CGP-NEXT: v_mul_hi_u32 v13, v10, v13 -; CGP-NEXT: 
v_add_i32_e32 v15, vcc, v17, v15 -; CGP-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v15, vcc, v15, v16 -; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v16, vcc, v17, v16 ; CGP-NEXT: v_add_i32_e32 v14, vcc, v15, v14 -; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v15, vcc, v16, v15 -; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v15 -; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v14 -; CGP-NEXT: v_addc_u32_e64 v14, s[4:5], v10, v13, vcc -; CGP-NEXT: v_mul_lo_u32 v12, v12, v6 -; CGP-NEXT: v_mul_lo_u32 v15, v11, v14 -; CGP-NEXT: v_mul_lo_u32 v16, v11, v6 -; CGP-NEXT: v_mul_hi_u32 v11, v11, v6 +; CGP-NEXT: v_add_i32_e32 v12, vcc, v13, v12 +; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13 +; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v13 +; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v12 +; CGP-NEXT: v_addc_u32_e64 v12, s[4:5], v7, v11, vcc +; CGP-NEXT: v_mul_lo_u32 v10, v10, v6 +; CGP-NEXT: v_mul_lo_u32 v13, v9, v12 +; CGP-NEXT: v_mul_lo_u32 v14, v9, v6 +; CGP-NEXT: v_mul_hi_u32 v9, v9, v6 +; CGP-NEXT: v_add_i32_e64 v7, s[4:5], v7, v11 +; CGP-NEXT: v_add_i32_e64 v10, s[4:5], v10, v13 +; CGP-NEXT: v_mul_hi_u32 v11, v6, v14 +; CGP-NEXT: v_add_i32_e64 v9, s[4:5], v10, v9 +; CGP-NEXT: v_mul_lo_u32 v10, v12, v14 +; CGP-NEXT: v_mul_lo_u32 v13, v6, v9 +; CGP-NEXT: v_mul_hi_u32 v14, v12, v14 ; CGP-NEXT: v_add_i32_e64 v10, s[4:5], v10, v13 -; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v12, v15 -; CGP-NEXT: v_mul_hi_u32 v13, v6, v16 -; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v12, v11 -; CGP-NEXT: v_mul_lo_u32 v12, v14, v16 -; CGP-NEXT: v_mul_lo_u32 v15, v6, v11 -; CGP-NEXT: v_mul_hi_u32 v16, v14, v16 -; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v12, v15 -; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v12, v13 -; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] -; CGP-NEXT: v_mul_lo_u32 v13, v14, v11 -; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v15, v12 -; CGP-NEXT: 
v_mul_hi_u32 v15, v6, v11 -; CGP-NEXT: v_mul_hi_u32 v11, v14, v11 -; CGP-NEXT: v_add_i32_e64 v13, s[4:5], v13, v16 -; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v13, s[4:5], v13, v15 -; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v15, s[4:5], v16, v15 -; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v13, v12 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v13, s[4:5], v15, v13 +; CGP-NEXT: v_add_i32_e64 v10, s[4:5], v10, v11 +; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[4:5] +; CGP-NEXT: v_mul_lo_u32 v11, v12, v9 +; CGP-NEXT: v_add_i32_e64 v10, s[4:5], v13, v10 +; CGP-NEXT: v_mul_hi_u32 v13, v6, v9 +; CGP-NEXT: v_mul_hi_u32 v9, v12, v9 +; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v11, v14 +; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5] ; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v11, v13 -; CGP-NEXT: v_addc_u32_e32 v10, vcc, v10, v11, vcc -; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v12 -; CGP-NEXT: v_addc_u32_e32 v10, vcc, 0, v10, vcc -; CGP-NEXT: v_mul_lo_u32 v11, v3, v6 -; CGP-NEXT: v_mul_lo_u32 v12, v7, v10 -; CGP-NEXT: v_mul_hi_u32 v13, v7, v6 +; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5] +; CGP-NEXT: v_add_i32_e64 v13, s[4:5], v14, v13 +; CGP-NEXT: v_add_i32_e64 v10, s[4:5], v11, v10 +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[4:5] +; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v13, v11 +; CGP-NEXT: v_add_i32_e64 v9, s[4:5], v9, v11 +; CGP-NEXT: v_addc_u32_e32 v7, vcc, v7, v9, vcc +; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v10 +; CGP-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; CGP-NEXT: v_mul_lo_u32 v9, v3, v6 +; CGP-NEXT: v_mul_lo_u32 v10, v2, v7 +; CGP-NEXT: v_mul_hi_u32 v11, v2, v6 ; CGP-NEXT: v_mul_hi_u32 v6, v3, v6 -; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v12 -; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v13 -; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v13, v3, v10 -; CGP-NEXT: v_add_i32_e32 v11, vcc, v12, v11 -; CGP-NEXT: 
v_mul_hi_u32 v12, v7, v10 -; CGP-NEXT: v_mul_hi_u32 v10, v3, v10 -; CGP-NEXT: v_add_i32_e32 v6, vcc, v13, v6 -; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v12 -; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v12, vcc, v13, v12 -; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v11 +; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v10 +; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v11 +; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; CGP-NEXT: v_mul_lo_u32 v11, v3, v7 +; CGP-NEXT: v_add_i32_e32 v9, vcc, v10, v9 +; CGP-NEXT: v_mul_hi_u32 v10, v2, v7 +; CGP-NEXT: v_mul_hi_u32 v7, v3, v7 +; CGP-NEXT: v_add_i32_e32 v6, vcc, v11, v6 ; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v11, vcc, v12, v11 -; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v11 -; CGP-NEXT: v_mul_lo_u32 v11, v4, v6 -; CGP-NEXT: v_mul_lo_u32 v10, v5, v10 -; CGP-NEXT: v_mul_lo_u32 v12, v5, v6 -; CGP-NEXT: v_mul_hi_u32 v6, v5, v6 +; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v10 +; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; CGP-NEXT: v_add_i32_e32 v6, vcc, v10, v6 -; CGP-NEXT: v_sub_i32_e32 v7, vcc, v7, v12 -; CGP-NEXT: v_subb_u32_e64 v10, s[4:5], v3, v6, vcc +; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v9 +; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v9, vcc, v10, v9 +; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v9 +; CGP-NEXT: v_mul_lo_u32 v9, v4, v6 +; CGP-NEXT: v_mul_lo_u32 v7, v5, v7 +; CGP-NEXT: v_mul_lo_u32 v10, v5, v6 +; CGP-NEXT: v_mul_hi_u32 v6, v5, v6 +; CGP-NEXT: v_add_i32_e32 v7, vcc, v9, v7 +; CGP-NEXT: v_add_i32_e32 v6, vcc, v7, v6 +; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v10 +; CGP-NEXT: v_subb_u32_e64 v7, s[4:5], v3, v6, vcc ; CGP-NEXT: v_sub_i32_e64 v3, s[4:5], v3, v6 -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v10, v4 +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v7, v4 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_ge_u32_e64 
s[4:5], v7, v5 -; CGP-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v10, v4 +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v5 +; CGP-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5] +; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v7, v4 ; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v4, vcc -; CGP-NEXT: v_cndmask_b32_e64 v6, v6, v11, s[4:5] -; CGP-NEXT: v_sub_i32_e32 v11, vcc, v7, v5 -; CGP-NEXT: v_subbrev_u32_e64 v12, s[4:5], 0, v3, vcc -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v12, v4 -; CGP-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v11, v5 +; CGP-NEXT: v_cndmask_b32_e64 v6, v6, v9, s[4:5] +; CGP-NEXT: v_sub_i32_e32 v9, vcc, v2, v5 +; CGP-NEXT: v_subbrev_u32_e64 v10, s[4:5], 0, v3, vcc +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v10, v4 +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5] +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v5 ; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v4, vcc -; CGP-NEXT: v_cndmask_b32_e64 v14, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v12, v4 -; CGP-NEXT: v_sub_i32_e32 v4, vcc, v11, v5 -; CGP-NEXT: v_cndmask_b32_e64 v13, v13, v14, s[4:5] +; CGP-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[4:5] +; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v10, v4 +; CGP-NEXT: v_sub_i32_e32 v4, vcc, v9, v5 +; CGP-NEXT: v_cndmask_b32_e64 v11, v11, v12, s[4:5] ; CGP-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc -; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 -; CGP-NEXT: v_cndmask_b32_e32 v4, v11, v4, vcc -; CGP-NEXT: v_cndmask_b32_e32 v3, v12, v3, vcc -; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 -; CGP-NEXT: v_cndmask_b32_e32 v4, v7, v4, vcc +; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 +; CGP-NEXT: v_cndmask_b32_e32 v4, v9, v4, vcc ; CGP-NEXT: v_cndmask_b32_e32 v3, v10, v3, vcc -; CGP-NEXT: v_xor_b32_e32 v4, v4, v9 -; CGP-NEXT: v_xor_b32_e32 v3, v3, v9 -; CGP-NEXT: v_sub_i32_e32 v4, vcc, v4, v9 -; CGP-NEXT: v_subb_u32_e32 v5, vcc, v3, v9, vcc +; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; CGP-NEXT: 
v_cndmask_b32_e32 v3, v7, v3, vcc +; CGP-NEXT: v_xor_b32_e32 v2, v2, v8 +; CGP-NEXT: v_sub_i32_e32 v4, vcc, v2, v8 +; CGP-NEXT: v_xor_b32_e32 v3, v3, v8 +; CGP-NEXT: v_subb_u32_e32 v5, vcc, v3, v8, vcc +; CGP-NEXT: ; implicit-def: $vgpr2 +; CGP-NEXT: ; implicit-def: $vgpr8_vgpr9 ; CGP-NEXT: BB8_6: ; %Flow ; CGP-NEXT: s_or_saveexec_b64 s[4:5], s[6:7] ; CGP-NEXT: s_xor_b64 exec, exec, s[4:5] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll @@ -119,30 +119,32 @@ ; CHECK-NEXT: v_add_i32_e32 v8, vcc, 1, v10 ; CHECK-NEXT: v_addc_u32_e32 v12, vcc, 0, v11, vcc ; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v9 -; CHECK-NEXT: v_sub_i32_e32 v7, vcc, v0, v7 -; CHECK-NEXT: v_subb_u32_e64 v9, s[4:5], v1, v6, vcc +; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v7 +; CHECK-NEXT: v_subb_u32_e64 v7, s[4:5], v1, v6, vcc ; CHECK-NEXT: v_sub_i32_e64 v1, s[4:5], v1, v6 -; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v7, v2 +; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v2 ; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5] -; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v3 -; CHECK-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[4:5] +; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v7, v3 +; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5] ; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc -; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, v9, v3 -; CHECK-NEXT: v_cndmask_b32_e32 v6, v13, v6, vcc -; CHECK-NEXT: v_sub_i32_e32 v7, vcc, v7, v2 +; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3 +; CHECK-NEXT: v_cndmask_b32_e32 v6, v9, v6, vcc +; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 ; CHECK-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc -; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v7, v2 -; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc +; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2 +; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc ; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v1, v3 -; CHECK-NEXT: v_cndmask_b32_e64 
v9, 0, -1, vcc +; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc ; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3 -; CHECK-NEXT: v_cndmask_b32_e32 v1, v9, v7, vcc -; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; CHECK-NEXT: v_cndmask_b32_e32 v1, v10, v8, vcc -; CHECK-NEXT: v_cndmask_b32_e32 v3, v11, v12, vcc +; CHECK-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; CHECK-NEXT: v_cndmask_b32_e32 v0, v10, v8, vcc +; CHECK-NEXT: v_cndmask_b32_e32 v1, v11, v12, vcc ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 -; CHECK-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc -; CHECK-NEXT: v_cndmask_b32_e32 v5, v5, v3, vcc +; CHECK-NEXT: v_cndmask_b32_e32 v4, v4, v0, vcc +; CHECK-NEXT: v_cndmask_b32_e32 v5, v5, v1, vcc +; CHECK-NEXT: ; implicit-def: $vgpr0 +; CHECK-NEXT: ; implicit-def: $vgpr2 ; CHECK-NEXT: BB0_2: ; %Flow ; CHECK-NEXT: s_or_saveexec_b64 s[6:7], s[6:7] ; CHECK-NEXT: s_xor_b64 exec, exec, s[6:7] @@ -739,30 +741,32 @@ ; CGP-NEXT: v_add_i32_e32 v12, vcc, 1, v14 ; CGP-NEXT: v_addc_u32_e32 v16, vcc, 0, v15, vcc ; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v13 -; CGP-NEXT: v_sub_i32_e32 v11, vcc, v8, v11 -; CGP-NEXT: v_subb_u32_e64 v13, s[4:5], v9, v10, vcc +; CGP-NEXT: v_sub_i32_e32 v8, vcc, v8, v11 +; CGP-NEXT: v_subb_u32_e64 v11, s[4:5], v9, v10, vcc ; CGP-NEXT: v_sub_i32_e64 v9, s[4:5], v9, v10 -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v11, v4 +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v4 ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v13, v5 -; CGP-NEXT: v_cndmask_b32_e64 v17, 0, -1, s[4:5] +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v11, v5 +; CGP-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[4:5] ; CGP-NEXT: v_subb_u32_e32 v9, vcc, v9, v5, vcc -; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v13, v5 -; CGP-NEXT: v_cndmask_b32_e32 v10, v17, v10, vcc -; CGP-NEXT: v_sub_i32_e32 v11, vcc, v11, v4 +; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v11, v5 +; CGP-NEXT: v_cndmask_b32_e32 v10, v13, v10, vcc +; CGP-NEXT: v_sub_i32_e32 v8, vcc, v8, v4 ; CGP-NEXT: 
v_subbrev_u32_e32 v9, vcc, 0, v9, vcc -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v11, v4 -; CGP-NEXT: v_cndmask_b32_e64 v11, 0, -1, vcc +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v8, v4 +; CGP-NEXT: v_cndmask_b32_e64 v4, 0, -1, vcc ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v9, v5 -; CGP-NEXT: v_cndmask_b32_e64 v13, 0, -1, vcc +; CGP-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc ; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v9, v5 -; CGP-NEXT: v_cndmask_b32_e32 v5, v13, v11, vcc -; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 -; CGP-NEXT: v_cndmask_b32_e32 v5, v14, v12, vcc -; CGP-NEXT: v_cndmask_b32_e32 v9, v15, v16, vcc +; CGP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc +; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; CGP-NEXT: v_cndmask_b32_e32 v4, v14, v12, vcc +; CGP-NEXT: v_cndmask_b32_e32 v5, v15, v16, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 -; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc -; CGP-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc +; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc +; CGP-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc +; CGP-NEXT: ; implicit-def: $vgpr8 +; CGP-NEXT: ; implicit-def: $vgpr4 ; CGP-NEXT: BB2_2: ; %Flow2 ; CGP-NEXT: s_or_saveexec_b64 s[6:7], s[6:7] ; CGP-NEXT: s_xor_b64 exec, exec, s[6:7] @@ -901,30 +905,32 @@ ; CGP-NEXT: v_add_i32_e32 v10, vcc, 1, v12 ; CGP-NEXT: v_addc_u32_e32 v14, vcc, 0, v13, vcc ; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v11 -; CGP-NEXT: v_sub_i32_e32 v9, vcc, v2, v9 -; CGP-NEXT: v_subb_u32_e64 v11, s[4:5], v3, v8, vcc +; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v9 +; CGP-NEXT: v_subb_u32_e64 v9, s[4:5], v3, v8, vcc ; CGP-NEXT: v_sub_i32_e64 v3, s[4:5], v3, v8 -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v6 +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v6 ; CGP-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v11, v7 -; CGP-NEXT: v_cndmask_b32_e64 v15, 0, -1, s[4:5] +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v7 +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5] ; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v7, vcc -; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v11, v7 -; 
CGP-NEXT: v_cndmask_b32_e32 v8, v15, v8, vcc -; CGP-NEXT: v_sub_i32_e32 v9, vcc, v9, v6 +; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v9, v7 +; CGP-NEXT: v_cndmask_b32_e32 v8, v11, v8, vcc +; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v6 ; CGP-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v9, v6 -; CGP-NEXT: v_cndmask_b32_e64 v9, 0, -1, vcc +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v2, v6 +; CGP-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v3, v7 -; CGP-NEXT: v_cndmask_b32_e64 v11, 0, -1, vcc +; CGP-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc ; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 -; CGP-NEXT: v_cndmask_b32_e32 v3, v11, v9, vcc -; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 -; CGP-NEXT: v_cndmask_b32_e32 v3, v12, v10, vcc -; CGP-NEXT: v_cndmask_b32_e32 v7, v13, v14, vcc +; CGP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc +; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; CGP-NEXT: v_cndmask_b32_e32 v2, v12, v10, vcc +; CGP-NEXT: v_cndmask_b32_e32 v3, v13, v14, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; CGP-NEXT: v_cndmask_b32_e32 v4, v4, v3, vcc -; CGP-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc +; CGP-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc +; CGP-NEXT: v_cndmask_b32_e32 v5, v5, v3, vcc +; CGP-NEXT: ; implicit-def: $vgpr2 +; CGP-NEXT: ; implicit-def: $vgpr6 ; CGP-NEXT: BB2_6: ; %Flow ; CGP-NEXT: s_or_saveexec_b64 s[6:7], s[6:7] ; CGP-NEXT: s_xor_b64 exec, exec, s[6:7] @@ -2399,30 +2405,32 @@ ; CHECK-NEXT: v_add_i32_e32 v8, vcc, 1, v10 ; CHECK-NEXT: v_addc_u32_e32 v12, vcc, 0, v11, vcc ; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v9 -; CHECK-NEXT: v_sub_i32_e32 v7, vcc, v0, v7 -; CHECK-NEXT: v_subb_u32_e64 v9, s[4:5], v1, v6, vcc +; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v7 +; CHECK-NEXT: v_subb_u32_e64 v7, s[4:5], v1, v6, vcc ; CHECK-NEXT: v_sub_i32_e64 v1, s[4:5], v1, v6 -; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v7, v4 +; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v4 ; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5] -; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], 
v9, v5 -; CHECK-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[4:5] +; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v7, v5 +; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5] ; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v5, vcc -; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, v9, v5 -; CHECK-NEXT: v_cndmask_b32_e32 v6, v13, v6, vcc -; CHECK-NEXT: v_sub_i32_e32 v7, vcc, v7, v4 +; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, v7, v5 +; CHECK-NEXT: v_cndmask_b32_e32 v6, v9, v6, vcc +; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v4 ; CHECK-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc -; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v7, v4 -; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc +; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v0, v4 +; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc ; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v1, v5 -; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, -1, vcc +; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, -1, vcc ; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5 -; CHECK-NEXT: v_cndmask_b32_e32 v1, v9, v7, vcc -; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; CHECK-NEXT: v_cndmask_b32_e32 v1, v10, v8, vcc -; CHECK-NEXT: v_cndmask_b32_e32 v5, v11, v12, vcc +; CHECK-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc +; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; CHECK-NEXT: v_cndmask_b32_e32 v0, v10, v8, vcc +; CHECK-NEXT: v_cndmask_b32_e32 v1, v11, v12, vcc ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 -; CHECK-NEXT: v_cndmask_b32_e32 v2, v2, v1, vcc -; CHECK-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc +; CHECK-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc +; CHECK-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc +; CHECK-NEXT: ; implicit-def: $vgpr0 +; CHECK-NEXT: ; implicit-def: $vgpr4_vgpr5 ; CHECK-NEXT: BB7_2: ; %Flow ; CHECK-NEXT: s_or_saveexec_b64 s[6:7], s[6:7] ; CHECK-NEXT: s_xor_b64 exec, exec, s[6:7] @@ -2842,30 +2850,32 @@ ; CGP-NEXT: v_add_i32_e32 v12, vcc, 1, v14 ; CGP-NEXT: v_addc_u32_e32 v16, vcc, 0, v15, vcc ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v13 -; CGP-NEXT: v_sub_i32_e32 v6, vcc, v5, v6 -; CGP-NEXT: v_subb_u32_e64 v13, s[4:5], v7, v4, vcc +; CGP-NEXT: 
v_sub_i32_e32 v5, vcc, v5, v6 +; CGP-NEXT: v_subb_u32_e64 v6, s[4:5], v7, v4, vcc ; CGP-NEXT: v_sub_i32_e64 v4, s[4:5], v7, v4 -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v6, v10 +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v5, v10 ; CGP-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v13, v11 -; CGP-NEXT: v_cndmask_b32_e64 v17, 0, -1, s[4:5] +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v6, v11 +; CGP-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[4:5] ; CGP-NEXT: v_subb_u32_e32 v4, vcc, v4, v11, vcc -; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v13, v11 -; CGP-NEXT: v_cndmask_b32_e32 v7, v17, v7, vcc -; CGP-NEXT: v_sub_i32_e32 v6, vcc, v6, v10 +; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v6, v11 +; CGP-NEXT: v_cndmask_b32_e32 v6, v13, v7, vcc +; CGP-NEXT: v_sub_i32_e32 v5, vcc, v5, v10 ; CGP-NEXT: v_subbrev_u32_e32 v4, vcc, 0, v4, vcc -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v6, v10 -; CGP-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v5, v10 +; CGP-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v4, v11 -; CGP-NEXT: v_cndmask_b32_e64 v13, 0, -1, vcc +; CGP-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc ; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v4, v11 -; CGP-NEXT: v_cndmask_b32_e32 v4, v13, v6, vcc +; CGP-NEXT: v_cndmask_b32_e32 v4, v7, v5, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 ; CGP-NEXT: v_cndmask_b32_e32 v4, v14, v12, vcc -; CGP-NEXT: v_cndmask_b32_e32 v6, v15, v16, vcc -; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 +; CGP-NEXT: v_cndmask_b32_e32 v5, v15, v16, vcc +; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 ; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc -; CGP-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc +; CGP-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc +; CGP-NEXT: ; implicit-def: $vgpr5 +; CGP-NEXT: ; implicit-def: $vgpr10_vgpr11 ; CGP-NEXT: BB8_2: ; %Flow2 ; CGP-NEXT: s_or_saveexec_b64 s[6:7], s[6:7] ; CGP-NEXT: s_xor_b64 exec, exec, s[6:7] @@ -3004,30 +3014,32 @@ ; CGP-NEXT: v_add_i32_e32 v10, vcc, 1, v12 ; CGP-NEXT: v_addc_u32_e32 v14, vcc, 0, v13, vcc ; 
CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v11 -; CGP-NEXT: v_sub_i32_e32 v7, vcc, v2, v7 -; CGP-NEXT: v_subb_u32_e64 v11, s[4:5], v3, v6, vcc +; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v7 +; CGP-NEXT: v_subb_u32_e64 v7, s[4:5], v3, v6, vcc ; CGP-NEXT: v_sub_i32_e64 v3, s[4:5], v3, v6 -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v7, v8 +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v8 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v11, v9 -; CGP-NEXT: v_cndmask_b32_e64 v15, 0, -1, s[4:5] +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v7, v9 +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5] ; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v9, vcc -; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v11, v9 -; CGP-NEXT: v_cndmask_b32_e32 v6, v15, v6, vcc -; CGP-NEXT: v_sub_i32_e32 v7, vcc, v7, v8 +; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v7, v9 +; CGP-NEXT: v_cndmask_b32_e32 v6, v11, v6, vcc +; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v8 ; CGP-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v7, v8 -; CGP-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v2, v8 +; CGP-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v3, v9 -; CGP-NEXT: v_cndmask_b32_e64 v11, 0, -1, vcc +; CGP-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc ; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v3, v9 -; CGP-NEXT: v_cndmask_b32_e32 v3, v11, v7, vcc -; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 -; CGP-NEXT: v_cndmask_b32_e32 v3, v12, v10, vcc -; CGP-NEXT: v_cndmask_b32_e32 v7, v13, v14, vcc +; CGP-NEXT: v_cndmask_b32_e32 v2, v7, v2, vcc +; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; CGP-NEXT: v_cndmask_b32_e32 v2, v12, v10, vcc +; CGP-NEXT: v_cndmask_b32_e32 v3, v13, v14, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 -; CGP-NEXT: v_cndmask_b32_e32 v4, v4, v3, vcc -; CGP-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc +; CGP-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc +; CGP-NEXT: v_cndmask_b32_e32 v5, v5, v3, vcc +; CGP-NEXT: ; implicit-def: $vgpr2 +; CGP-NEXT: ; implicit-def: 
$vgpr8_vgpr9 ; CGP-NEXT: BB8_6: ; %Flow ; CGP-NEXT: s_or_saveexec_b64 s[6:7], s[6:7] ; CGP-NEXT: s_xor_b64 exec, exec, s[6:7] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll @@ -115,33 +115,35 @@ ; CHECK-NEXT: v_mul_lo_u32 v5, v2, v5 ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v8, v5 ; CHECK-NEXT: v_add_i32_e32 v4, vcc, v5, v4 -; CHECK-NEXT: v_sub_i32_e32 v5, vcc, v0, v7 -; CHECK-NEXT: v_subb_u32_e64 v6, s[4:5], v1, v4, vcc +; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v7 +; CHECK-NEXT: v_subb_u32_e64 v5, s[4:5], v1, v4, vcc ; CHECK-NEXT: v_sub_i32_e64 v1, s[4:5], v1, v4 -; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v5, v2 +; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v2 ; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[4:5] -; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v6, v3 -; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5] +; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v5, v3 +; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5] ; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc -; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, v6, v3 -; CHECK-NEXT: v_cndmask_b32_e32 v4, v7, v4, vcc -; CHECK-NEXT: v_sub_i32_e32 v7, vcc, v5, v2 -; CHECK-NEXT: v_subbrev_u32_e64 v8, s[4:5], 0, v1, vcc -; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v7, v2 -; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5] +; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, v5, v3 +; CHECK-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc +; CHECK-NEXT: v_sub_i32_e32 v6, vcc, v0, v2 +; CHECK-NEXT: v_subbrev_u32_e64 v7, s[4:5], 0, v1, vcc +; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v6, v2 +; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5] ; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc -; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v8, v3 -; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, -1, vcc -; CHECK-NEXT: v_sub_i32_e32 v11, vcc, v7, v2 +; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v7, v3 +; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, -1, vcc +; 
CHECK-NEXT: v_sub_i32_e32 v2, vcc, v6, v2 ; CHECK-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc -; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, v8, v3 -; CHECK-NEXT: v_cndmask_b32_e32 v3, v10, v9, vcc +; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3 +; CHECK-NEXT: v_cndmask_b32_e32 v3, v9, v8, vcc ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 -; CHECK-NEXT: v_cndmask_b32_e32 v3, v7, v11, vcc -; CHECK-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc +; CHECK-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc +; CHECK-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; CHECK-NEXT: v_cndmask_b32_e32 v4, v5, v3, vcc -; CHECK-NEXT: v_cndmask_b32_e32 v5, v6, v1, vcc +; CHECK-NEXT: v_cndmask_b32_e32 v4, v0, v2, vcc +; CHECK-NEXT: v_cndmask_b32_e32 v5, v5, v1, vcc +; CHECK-NEXT: ; implicit-def: $vgpr0 +; CHECK-NEXT: ; implicit-def: $vgpr2 ; CHECK-NEXT: BB0_2: ; %Flow ; CHECK-NEXT: s_or_saveexec_b64 s[4:5], s[6:7] ; CHECK-NEXT: s_xor_b64 exec, exec, s[4:5] @@ -728,32 +730,34 @@ ; CGP-NEXT: v_add_i32_e32 v1, vcc, v12, v1 ; CGP-NEXT: v_add_i32_e32 v0, vcc, v1, v0 ; CGP-NEXT: v_sub_i32_e32 v1, vcc, v8, v11 -; CGP-NEXT: v_subb_u32_e64 v10, s[4:5], v9, v0, vcc +; CGP-NEXT: v_subb_u32_e64 v8, s[4:5], v9, v0, vcc ; CGP-NEXT: v_sub_i32_e64 v0, s[4:5], v9, v0 ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v1, v4 ; CGP-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v10, v5 -; CGP-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5] +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v5 +; CGP-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5] ; CGP-NEXT: v_subb_u32_e32 v0, vcc, v0, v5, vcc -; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v10, v5 -; CGP-NEXT: v_cndmask_b32_e32 v9, v11, v9, vcc -; CGP-NEXT: v_sub_i32_e32 v11, vcc, v1, v4 -; CGP-NEXT: v_subbrev_u32_e64 v12, s[4:5], 0, v0, vcc -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v11, v4 -; CGP-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[4:5] +; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v8, v5 +; CGP-NEXT: v_cndmask_b32_e32 v9, v10, v9, vcc +; CGP-NEXT: v_sub_i32_e32 v10, vcc, 
v1, v4 +; CGP-NEXT: v_subbrev_u32_e64 v11, s[4:5], 0, v0, vcc +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v10, v4 +; CGP-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[4:5] ; CGP-NEXT: v_subb_u32_e32 v0, vcc, v0, v5, vcc -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v12, v5 -; CGP-NEXT: v_cndmask_b32_e64 v14, 0, -1, vcc -; CGP-NEXT: v_sub_i32_e32 v15, vcc, v11, v4 +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v11, v5 +; CGP-NEXT: v_cndmask_b32_e64 v13, 0, -1, vcc +; CGP-NEXT: v_sub_i32_e32 v4, vcc, v10, v4 ; CGP-NEXT: v_subbrev_u32_e32 v0, vcc, 0, v0, vcc -; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v12, v5 -; CGP-NEXT: v_cndmask_b32_e32 v5, v14, v13, vcc +; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v11, v5 +; CGP-NEXT: v_cndmask_b32_e32 v5, v13, v12, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 -; CGP-NEXT: v_cndmask_b32_e32 v5, v11, v15, vcc -; CGP-NEXT: v_cndmask_b32_e32 v11, v12, v0, vcc +; CGP-NEXT: v_cndmask_b32_e32 v4, v10, v4, vcc +; CGP-NEXT: v_cndmask_b32_e32 v5, v11, v0, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9 -; CGP-NEXT: v_cndmask_b32_e32 v0, v1, v5, vcc -; CGP-NEXT: v_cndmask_b32_e32 v1, v10, v11, vcc +; CGP-NEXT: v_cndmask_b32_e32 v0, v1, v4, vcc +; CGP-NEXT: v_cndmask_b32_e32 v1, v8, v5, vcc +; CGP-NEXT: ; implicit-def: $vgpr8 +; CGP-NEXT: ; implicit-def: $vgpr4 ; CGP-NEXT: BB2_2: ; %Flow2 ; CGP-NEXT: s_or_saveexec_b64 s[4:5], s[6:7] ; CGP-NEXT: s_xor_b64 exec, exec, s[4:5] @@ -886,33 +890,35 @@ ; CGP-NEXT: v_mul_lo_u32 v5, v6, v5 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v10, v5 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v5, v4 -; CGP-NEXT: v_sub_i32_e32 v5, vcc, v2, v9 -; CGP-NEXT: v_subb_u32_e64 v8, s[4:5], v3, v4, vcc +; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v9 +; CGP-NEXT: v_subb_u32_e64 v5, s[4:5], v3, v4, vcc ; CGP-NEXT: v_sub_i32_e64 v3, s[4:5], v3, v4 -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v5, v6 +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v6 ; CGP-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v7 -; CGP-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5] +; CGP-NEXT: v_cmp_ge_u32_e64 
s[4:5], v5, v7 +; CGP-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5] ; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v7, vcc -; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 -; CGP-NEXT: v_cndmask_b32_e32 v4, v9, v4, vcc -; CGP-NEXT: v_sub_i32_e32 v9, vcc, v5, v6 -; CGP-NEXT: v_subbrev_u32_e64 v10, s[4:5], 0, v3, vcc -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v6 -; CGP-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5] +; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; CGP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc +; CGP-NEXT: v_sub_i32_e32 v8, vcc, v2, v6 +; CGP-NEXT: v_subbrev_u32_e64 v9, s[4:5], 0, v3, vcc +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v6 +; CGP-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5] ; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v7, vcc -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v10, v7 -; CGP-NEXT: v_cndmask_b32_e64 v12, 0, -1, vcc -; CGP-NEXT: v_sub_i32_e32 v13, vcc, v9, v6 +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v9, v7 +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, -1, vcc +; CGP-NEXT: v_sub_i32_e32 v6, vcc, v8, v6 ; CGP-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc -; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v10, v7 -; CGP-NEXT: v_cndmask_b32_e32 v7, v12, v11, vcc +; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v9, v7 +; CGP-NEXT: v_cndmask_b32_e32 v7, v11, v10, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 -; CGP-NEXT: v_cndmask_b32_e32 v7, v9, v13, vcc -; CGP-NEXT: v_cndmask_b32_e32 v3, v10, v3, vcc +; CGP-NEXT: v_cndmask_b32_e32 v6, v8, v6, vcc +; CGP-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; CGP-NEXT: v_cndmask_b32_e32 v4, v5, v7, vcc -; CGP-NEXT: v_cndmask_b32_e32 v5, v8, v3, vcc +; CGP-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; CGP-NEXT: v_cndmask_b32_e32 v5, v5, v3, vcc +; CGP-NEXT: ; implicit-def: $vgpr2 +; CGP-NEXT: ; implicit-def: $vgpr6 ; CGP-NEXT: BB2_6: ; %Flow ; CGP-NEXT: s_or_saveexec_b64 s[4:5], s[6:7] ; CGP-NEXT: s_xor_b64 exec, exec, s[4:5] @@ -1755,33 +1761,35 @@ ; CHECK-NEXT: v_mul_lo_u32 v3, v4, v3 ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v8, v3 ; CHECK-NEXT: v_add_i32_e32 
v2, vcc, v3, v2 -; CHECK-NEXT: v_sub_i32_e32 v3, vcc, v0, v7 -; CHECK-NEXT: v_subb_u32_e64 v6, s[4:5], v1, v2, vcc +; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v7 +; CHECK-NEXT: v_subb_u32_e64 v3, s[4:5], v1, v2, vcc ; CHECK-NEXT: v_sub_i32_e64 v1, s[4:5], v1, v2 -; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v3, v4 +; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v4 ; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[4:5] -; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v6, v5 -; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5] +; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v3, v5 +; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5] ; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v5, vcc -; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 -; CHECK-NEXT: v_cndmask_b32_e32 v2, v7, v2, vcc -; CHECK-NEXT: v_sub_i32_e32 v7, vcc, v3, v4 -; CHECK-NEXT: v_subbrev_u32_e64 v8, s[4:5], 0, v1, vcc -; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v7, v4 -; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5] +; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; CHECK-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc +; CHECK-NEXT: v_sub_i32_e32 v6, vcc, v0, v4 +; CHECK-NEXT: v_subbrev_u32_e64 v7, s[4:5], 0, v1, vcc +; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v6, v4 +; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5] ; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v5, vcc -; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v8, v5 -; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, -1, vcc -; CHECK-NEXT: v_sub_i32_e32 v11, vcc, v7, v4 +; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v7, v5 +; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, -1, vcc +; CHECK-NEXT: v_sub_i32_e32 v4, vcc, v6, v4 ; CHECK-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc -; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, v8, v5 -; CHECK-NEXT: v_cndmask_b32_e32 v5, v10, v9, vcc +; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, v7, v5 +; CHECK-NEXT: v_cndmask_b32_e32 v5, v9, v8, vcc ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 -; CHECK-NEXT: v_cndmask_b32_e32 v5, v7, v11, vcc -; CHECK-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc +; CHECK-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc +; 
CHECK-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; CHECK-NEXT: v_cndmask_b32_e32 v2, v3, v5, vcc -; CHECK-NEXT: v_cndmask_b32_e32 v3, v6, v1, vcc +; CHECK-NEXT: v_cndmask_b32_e32 v2, v0, v4, vcc +; CHECK-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc +; CHECK-NEXT: ; implicit-def: $vgpr0 +; CHECK-NEXT: ; implicit-def: $vgpr4_vgpr5 ; CHECK-NEXT: BB7_2: ; %Flow ; CHECK-NEXT: s_or_saveexec_b64 s[4:5], s[6:7] ; CHECK-NEXT: s_xor_b64 exec, exec, s[4:5] @@ -2197,29 +2205,31 @@ ; CGP-NEXT: v_subb_u32_e64 v4, s[4:5], v7, v0, vcc ; CGP-NEXT: v_sub_i32_e64 v0, s[4:5], v7, v0 ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v1, v10 -; CGP-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5] +; CGP-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[4:5] ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v11 -; CGP-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5] +; CGP-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5] ; CGP-NEXT: v_subb_u32_e32 v0, vcc, v0, v11, vcc ; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v4, v11 -; CGP-NEXT: v_cndmask_b32_e32 v6, v7, v6, vcc -; CGP-NEXT: v_sub_i32_e32 v7, vcc, v1, v10 -; CGP-NEXT: v_subbrev_u32_e64 v12, s[4:5], 0, v0, vcc -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v7, v10 -; CGP-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[4:5] +; CGP-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc +; CGP-NEXT: v_sub_i32_e32 v6, vcc, v1, v10 +; CGP-NEXT: v_subbrev_u32_e64 v7, s[4:5], 0, v0, vcc +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v6, v10 +; CGP-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[4:5] ; CGP-NEXT: v_subb_u32_e32 v0, vcc, v0, v11, vcc -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v12, v11 -; CGP-NEXT: v_cndmask_b32_e64 v14, 0, -1, vcc -; CGP-NEXT: v_sub_i32_e32 v15, vcc, v7, v10 +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v7, v11 +; CGP-NEXT: v_cndmask_b32_e64 v13, 0, -1, vcc +; CGP-NEXT: v_sub_i32_e32 v10, vcc, v6, v10 ; CGP-NEXT: v_subbrev_u32_e32 v0, vcc, 0, v0, vcc -; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v12, v11 -; CGP-NEXT: v_cndmask_b32_e32 v11, v14, v13, vcc +; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v7, v11 +; CGP-NEXT: 
v_cndmask_b32_e32 v11, v13, v12, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 -; CGP-NEXT: v_cndmask_b32_e32 v7, v7, v15, vcc -; CGP-NEXT: v_cndmask_b32_e32 v11, v12, v0, vcc -; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 -; CGP-NEXT: v_cndmask_b32_e32 v0, v1, v7, vcc -; CGP-NEXT: v_cndmask_b32_e32 v1, v4, v11, vcc +; CGP-NEXT: v_cndmask_b32_e32 v6, v6, v10, vcc +; CGP-NEXT: v_cndmask_b32_e32 v7, v7, v0, vcc +; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 +; CGP-NEXT: v_cndmask_b32_e32 v0, v1, v6, vcc +; CGP-NEXT: v_cndmask_b32_e32 v1, v4, v7, vcc +; CGP-NEXT: ; implicit-def: $vgpr5 +; CGP-NEXT: ; implicit-def: $vgpr10_vgpr11 ; CGP-NEXT: BB8_2: ; %Flow2 ; CGP-NEXT: s_or_saveexec_b64 s[4:5], s[6:7] ; CGP-NEXT: s_xor_b64 exec, exec, s[4:5] @@ -2352,33 +2362,35 @@ ; CGP-NEXT: v_mul_lo_u32 v5, v8, v5 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v10, v5 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v5, v4 -; CGP-NEXT: v_sub_i32_e32 v5, vcc, v2, v7 -; CGP-NEXT: v_subb_u32_e64 v6, s[4:5], v3, v4, vcc +; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v7 +; CGP-NEXT: v_subb_u32_e64 v5, s[4:5], v3, v4, vcc ; CGP-NEXT: v_sub_i32_e64 v3, s[4:5], v3, v4 -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v5, v8 +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v8 ; CGP-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v6, v9 -; CGP-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5] +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v5, v9 +; CGP-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5] ; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v9, vcc -; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v6, v9 -; CGP-NEXT: v_cndmask_b32_e32 v4, v7, v4, vcc -; CGP-NEXT: v_sub_i32_e32 v7, vcc, v5, v8 -; CGP-NEXT: v_subbrev_u32_e64 v10, s[4:5], 0, v3, vcc -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v7, v8 -; CGP-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5] +; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v5, v9 +; CGP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc +; CGP-NEXT: v_sub_i32_e32 v6, vcc, v2, v8 +; CGP-NEXT: v_subbrev_u32_e64 v7, s[4:5], 0, v3, vcc +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v6, 
v8 +; CGP-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5] ; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v9, vcc -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v10, v9 -; CGP-NEXT: v_cndmask_b32_e64 v12, 0, -1, vcc -; CGP-NEXT: v_sub_i32_e32 v13, vcc, v7, v8 +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v7, v9 +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, -1, vcc +; CGP-NEXT: v_sub_i32_e32 v8, vcc, v6, v8 ; CGP-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc -; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v10, v9 -; CGP-NEXT: v_cndmask_b32_e32 v9, v12, v11, vcc +; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v7, v9 +; CGP-NEXT: v_cndmask_b32_e32 v9, v11, v10, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9 -; CGP-NEXT: v_cndmask_b32_e32 v7, v7, v13, vcc -; CGP-NEXT: v_cndmask_b32_e32 v3, v10, v3, vcc +; CGP-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc +; CGP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; CGP-NEXT: v_cndmask_b32_e32 v4, v5, v7, vcc -; CGP-NEXT: v_cndmask_b32_e32 v5, v6, v3, vcc +; CGP-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; CGP-NEXT: v_cndmask_b32_e32 v5, v5, v3, vcc +; CGP-NEXT: ; implicit-def: $vgpr2 +; CGP-NEXT: ; implicit-def: $vgpr8_vgpr9 ; CGP-NEXT: BB8_6: ; %Flow ; CGP-NEXT: s_or_saveexec_b64 s[4:5], s[6:7] ; CGP-NEXT: s_xor_b64 exec, exec, s[4:5] diff --git a/llvm/test/CodeGen/AMDGPU/bypass-div.ll b/llvm/test/CodeGen/AMDGPU/bypass-div.ll --- a/llvm/test/CodeGen/AMDGPU/bypass-div.ll +++ b/llvm/test/CodeGen/AMDGPU/bypass-div.ll @@ -8,135 +8,136 @@ ; GFX9-LABEL: sdiv64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v5, v3 -; GFX9-NEXT: v_or_b32_e32 v4, v1, v5 -; GFX9-NEXT: v_mov_b32_e32 v3, 0 -; GFX9-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[3:4] -; GFX9-NEXT: ; implicit-def: $vgpr3_vgpr4 +; GFX9-NEXT: v_or_b32_e32 v5, v1, v3 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX9-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[6:7], exec, s[4:5] ; 
GFX9-NEXT: s_cbranch_execz BB0_2 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: v_ashrrev_i32_e32 v3, 31, v5 -; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v2, v3 -; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v5, v3, vcc -; GFX9-NEXT: v_xor_b32_e32 v4, v4, v3 -; GFX9-NEXT: v_xor_b32_e32 v5, v5, v3 -; GFX9-NEXT: v_cvt_f32_u32_e32 v6, v4 -; GFX9-NEXT: v_cvt_f32_u32_e32 v7, v5 -; GFX9-NEXT: v_sub_co_u32_e32 v8, vcc, 0, v4 -; GFX9-NEXT: v_subb_co_u32_e32 v9, vcc, 0, v5, vcc -; GFX9-NEXT: v_mov_b32_e32 v16, 0 -; GFX9-NEXT: v_mac_f32_e32 v6, 0x4f800000, v7 -; GFX9-NEXT: v_rcp_f32_e32 v6, v6 +; GFX9-NEXT: v_ashrrev_i32_e32 v4, 31, v3 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v4, vcc +; GFX9-NEXT: v_xor_b32_e32 v2, v2, v4 +; GFX9-NEXT: v_xor_b32_e32 v3, v3, v4 +; GFX9-NEXT: v_cvt_f32_u32_e32 v5, v2 +; GFX9-NEXT: v_cvt_f32_u32_e32 v6, v3 +; GFX9-NEXT: v_sub_co_u32_e32 v7, vcc, 0, v2 +; GFX9-NEXT: v_subb_co_u32_e32 v8, vcc, 0, v3, vcc ; GFX9-NEXT: v_mov_b32_e32 v15, 0 -; GFX9-NEXT: v_mul_f32_e32 v6, 0x5f7ffffc, v6 -; GFX9-NEXT: v_mul_f32_e32 v7, 0x2f800000, v6 -; GFX9-NEXT: v_trunc_f32_e32 v7, v7 -; GFX9-NEXT: v_mac_f32_e32 v6, 0xcf800000, v7 +; GFX9-NEXT: v_mac_f32_e32 v5, 0x4f800000, v6 +; GFX9-NEXT: v_rcp_f32_e32 v5, v5 +; GFX9-NEXT: v_mov_b32_e32 v14, 0 +; GFX9-NEXT: v_mul_f32_e32 v5, 0x5f7ffffc, v5 +; GFX9-NEXT: v_mul_f32_e32 v6, 0x2f800000, v5 +; GFX9-NEXT: v_trunc_f32_e32 v6, v6 +; GFX9-NEXT: v_mac_f32_e32 v5, 0xcf800000, v6 +; GFX9-NEXT: v_cvt_u32_f32_e32 v5, v5 ; GFX9-NEXT: v_cvt_u32_f32_e32 v6, v6 -; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v7 -; GFX9-NEXT: v_mul_lo_u32 v10, v9, v6 -; GFX9-NEXT: v_mul_hi_u32 v11, v8, v6 -; GFX9-NEXT: v_mul_lo_u32 v12, v8, v7 -; GFX9-NEXT: v_mul_lo_u32 v13, v8, v6 -; GFX9-NEXT: v_add3_u32 v10, v11, v12, v10 -; GFX9-NEXT: v_mul_lo_u32 v12, v6, v10 -; GFX9-NEXT: v_mul_hi_u32 v14, v6, v13 -; GFX9-NEXT: v_mul_hi_u32 v11, v6, v10 -; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, v14, v12 -; GFX9-NEXT: v_mul_lo_u32 v14, v7, v13 
-; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, v16, v11, vcc -; GFX9-NEXT: v_mul_hi_u32 v13, v7, v13 -; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, v12, v14 -; GFX9-NEXT: v_mul_hi_u32 v12, v7, v10 -; GFX9-NEXT: v_mul_lo_u32 v10, v7, v10 -; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, v11, v13, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v12, vcc, v12, v15, vcc -; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, v11, v10 -; GFX9-NEXT: v_add_co_u32_e64 v6, s[4:5], v6, v10 -; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, v16, v12, vcc -; GFX9-NEXT: v_addc_co_u32_e64 v10, vcc, v7, v11, s[4:5] -; GFX9-NEXT: v_mul_lo_u32 v12, v8, v10 -; GFX9-NEXT: v_mul_hi_u32 v13, v8, v6 -; GFX9-NEXT: v_mul_lo_u32 v9, v9, v6 -; GFX9-NEXT: v_mul_lo_u32 v8, v8, v6 -; GFX9-NEXT: v_add_u32_e32 v7, v7, v11 -; GFX9-NEXT: v_add3_u32 v9, v13, v12, v9 -; GFX9-NEXT: v_mul_lo_u32 v12, v6, v9 -; GFX9-NEXT: v_mul_hi_u32 v13, v6, v8 -; GFX9-NEXT: v_mul_hi_u32 v14, v6, v9 -; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, v13, v12 -; GFX9-NEXT: v_mul_hi_u32 v13, v10, v8 -; GFX9-NEXT: v_mul_lo_u32 v8, v10, v8 -; GFX9-NEXT: v_addc_co_u32_e32 v14, vcc, v16, v14, vcc -; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v12, v8 -; GFX9-NEXT: v_mul_hi_u32 v8, v10, v9 -; GFX9-NEXT: v_mul_lo_u32 v9, v10, v9 -; GFX9-NEXT: v_addc_co_u32_e32 v12, vcc, v14, v13, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, v8, v15, vcc -; GFX9-NEXT: v_add_co_u32_e32 v9, vcc, v12, v9 -; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, v16, v8, vcc -; GFX9-NEXT: v_addc_co_u32_e64 v7, vcc, v7, v8, s[4:5] -; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v6, v9 -; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v7, vcc -; GFX9-NEXT: v_ashrrev_i32_e32 v8, 31, v1 -; GFX9-NEXT: v_add_co_u32_e32 v9, vcc, v0, v8 -; GFX9-NEXT: v_xor_b32_e32 v9, v9, v8 -; GFX9-NEXT: v_mul_lo_u32 v10, v9, v7 -; GFX9-NEXT: v_mul_hi_u32 v11, v9, v6 +; GFX9-NEXT: v_mul_lo_u32 v9, v8, v5 +; GFX9-NEXT: v_mul_hi_u32 v10, v7, v5 +; GFX9-NEXT: v_mul_lo_u32 v11, v7, v6 +; GFX9-NEXT: v_mul_lo_u32 v12, v7, v5 +; GFX9-NEXT: v_add3_u32 v9, v10, v11, v9 +; 
GFX9-NEXT: v_mul_lo_u32 v11, v5, v9 +; GFX9-NEXT: v_mul_hi_u32 v13, v5, v12 +; GFX9-NEXT: v_mul_hi_u32 v10, v5, v9 +; GFX9-NEXT: v_mul_hi_u32 v16, v6, v9 +; GFX9-NEXT: v_mul_lo_u32 v9, v6, v9 +; GFX9-NEXT: v_add_co_u32_e32 v11, vcc, v13, v11 +; GFX9-NEXT: v_mul_lo_u32 v13, v6, v12 +; GFX9-NEXT: v_mul_hi_u32 v12, v6, v12 +; GFX9-NEXT: v_addc_co_u32_e32 v10, vcc, v15, v10, vcc +; GFX9-NEXT: v_add_co_u32_e32 v11, vcc, v11, v13 +; GFX9-NEXT: v_addc_co_u32_e32 v10, vcc, v10, v12, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, v16, v14, vcc +; GFX9-NEXT: v_add_co_u32_e32 v9, vcc, v10, v9 +; GFX9-NEXT: v_add_co_u32_e64 v5, s[4:5], v5, v9 +; GFX9-NEXT: v_addc_co_u32_e32 v10, vcc, v15, v11, vcc +; GFX9-NEXT: v_addc_co_u32_e64 v9, vcc, v6, v10, s[4:5] +; GFX9-NEXT: v_mul_lo_u32 v11, v7, v9 +; GFX9-NEXT: v_mul_hi_u32 v12, v7, v5 +; GFX9-NEXT: v_mul_lo_u32 v8, v8, v5 +; GFX9-NEXT: v_mul_lo_u32 v7, v7, v5 +; GFX9-NEXT: v_add_u32_e32 v6, v6, v10 +; GFX9-NEXT: v_add3_u32 v8, v12, v11, v8 +; GFX9-NEXT: v_mul_lo_u32 v13, v5, v8 +; GFX9-NEXT: v_mul_hi_u32 v16, v5, v7 +; GFX9-NEXT: v_mul_hi_u32 v17, v5, v8 ; GFX9-NEXT: v_mul_hi_u32 v12, v9, v7 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v8, vcc -; GFX9-NEXT: v_xor_b32_e32 v1, v1, v8 -; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, v11, v10 -; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, v16, v12, vcc -; GFX9-NEXT: v_mul_lo_u32 v12, v1, v6 -; GFX9-NEXT: v_mul_hi_u32 v6, v1, v6 -; GFX9-NEXT: v_mul_hi_u32 v13, v1, v7 -; GFX9-NEXT: v_mul_lo_u32 v7, v1, v7 -; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, v10, v12 -; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, v11, v6, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v10, vcc, v13, v15, vcc -; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v6, v7 -; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, v16, v10, vcc -; GFX9-NEXT: v_mul_lo_u32 v10, v5, v6 -; GFX9-NEXT: v_mul_lo_u32 v11, v4, v7 -; GFX9-NEXT: v_mul_hi_u32 v12, v4, v6 -; GFX9-NEXT: v_mul_lo_u32 v13, v4, v6 -; GFX9-NEXT: v_add3_u32 v10, v12, v11, v10 -; GFX9-NEXT: v_sub_u32_e32 v11, v1, v10 -; 
GFX9-NEXT: v_sub_co_u32_e32 v9, vcc, v9, v13 -; GFX9-NEXT: v_subb_co_u32_e64 v11, s[4:5], v11, v5, vcc -; GFX9-NEXT: v_sub_co_u32_e64 v12, s[4:5], v9, v4 -; GFX9-NEXT: v_subbrev_co_u32_e64 v11, s[4:5], 0, v11, s[4:5] -; GFX9-NEXT: v_cmp_ge_u32_e64 s[4:5], v11, v5 -; GFX9-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[4:5] -; GFX9-NEXT: v_cmp_ge_u32_e64 s[4:5], v12, v4 -; GFX9-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[4:5] -; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], v11, v5 -; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v10, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v11, v13, v12, s[4:5] -; GFX9-NEXT: v_add_co_u32_e64 v12, s[4:5], 2, v6 -; GFX9-NEXT: v_addc_co_u32_e64 v13, s[4:5], 0, v7, s[4:5] -; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v1, v5 -; GFX9-NEXT: v_add_co_u32_e64 v14, s[4:5], 1, v6 -; GFX9-NEXT: v_cndmask_b32_e64 v10, 0, -1, vcc -; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v9, v4 -; GFX9-NEXT: v_addc_co_u32_e64 v15, s[4:5], 0, v7, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, -1, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5 -; GFX9-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v11 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v10, v4, vcc -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; GFX9-NEXT: v_cndmask_b32_e64 v4, v14, v12, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v11, v15, v13, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc -; GFX9-NEXT: v_xor_b32_e32 v5, v8, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v11, vcc -; GFX9-NEXT: v_xor_b32_e32 v3, v4, v5 -; GFX9-NEXT: v_xor_b32_e32 v1, v1, v5 -; GFX9-NEXT: v_sub_co_u32_e32 v3, vcc, v3, v5 -; GFX9-NEXT: v_subb_co_u32_e32 v4, vcc, v1, v5, vcc +; GFX9-NEXT: v_mul_lo_u32 v7, v9, v7 +; GFX9-NEXT: v_add_co_u32_e32 v13, vcc, v16, v13 +; GFX9-NEXT: v_mul_hi_u32 v11, v9, v8 +; GFX9-NEXT: v_addc_co_u32_e32 v16, vcc, v15, v17, vcc +; GFX9-NEXT: v_mul_lo_u32 v8, v9, v8 +; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v13, v7 +; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, v16, v12, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, v11, v14, vcc +; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v7, v8 +; 
GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, v15, v9, vcc +; GFX9-NEXT: v_addc_co_u32_e64 v6, vcc, v6, v8, s[4:5] +; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v5, v7 +; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v6, vcc +; GFX9-NEXT: v_ashrrev_i32_e32 v7, 31, v1 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v7 +; GFX9-NEXT: v_xor_b32_e32 v0, v0, v7 +; GFX9-NEXT: v_mul_lo_u32 v8, v0, v6 +; GFX9-NEXT: v_mul_hi_u32 v9, v0, v5 +; GFX9-NEXT: v_mul_hi_u32 v10, v0, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v7, vcc +; GFX9-NEXT: v_xor_b32_e32 v1, v1, v7 +; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v9, v8 +; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, v15, v10, vcc +; GFX9-NEXT: v_mul_lo_u32 v10, v1, v5 +; GFX9-NEXT: v_mul_hi_u32 v5, v1, v5 +; GFX9-NEXT: v_mul_hi_u32 v11, v1, v6 +; GFX9-NEXT: v_mul_lo_u32 v6, v1, v6 +; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v8, v10 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v9, v5, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, v11, v14, vcc +; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v5, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, v15, v8, vcc +; GFX9-NEXT: v_mul_lo_u32 v8, v3, v5 +; GFX9-NEXT: v_mul_lo_u32 v9, v2, v6 +; GFX9-NEXT: v_mul_hi_u32 v10, v2, v5 +; GFX9-NEXT: v_mul_lo_u32 v11, v2, v5 +; GFX9-NEXT: v_add3_u32 v8, v10, v9, v8 +; GFX9-NEXT: v_sub_u32_e32 v9, v1, v8 +; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v11 +; GFX9-NEXT: v_subb_co_u32_e64 v9, s[4:5], v9, v3, vcc +; GFX9-NEXT: v_sub_co_u32_e64 v10, s[4:5], v0, v2 +; GFX9-NEXT: v_subbrev_co_u32_e64 v9, s[4:5], 0, v9, s[4:5] +; GFX9-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v3 +; GFX9-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5] +; GFX9-NEXT: v_cmp_ge_u32_e64 s[4:5], v10, v2 +; GFX9-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5] +; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], v9, v3 +; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v8, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v9, v11, v10, s[4:5] +; GFX9-NEXT: v_add_co_u32_e64 v10, s[4:5], 2, v5 +; GFX9-NEXT: v_addc_co_u32_e64 v11, s[4:5], 0, v6, s[4:5] +; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, 
v1, v3 +; GFX9-NEXT: v_add_co_u32_e64 v12, s[4:5], 1, v5 +; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc +; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2 +; GFX9-NEXT: v_addc_co_u32_e64 v13, s[4:5], 0, v6, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3 +; GFX9-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v9 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GFX9-NEXT: v_cndmask_b32_e64 v1, v12, v10, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v9, v13, v11, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; GFX9-NEXT: v_xor_b32_e32 v2, v7, v4 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v9, vcc +; GFX9-NEXT: v_xor_b32_e32 v1, v1, v2 +; GFX9-NEXT: v_xor_b32_e32 v0, v0, v2 +; GFX9-NEXT: v_sub_co_u32_e32 v4, vcc, v1, v2 +; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v0, v2, vcc +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX9-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX9-NEXT: BB0_2: ; %Flow ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], s[6:7] ; GFX9-NEXT: s_xor_b64 exec, exec, s[4:5] @@ -144,6 +145,7 @@ ; GFX9-NEXT: ; %bb.3: ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, v2 ; GFX9-NEXT: v_sub_u32_e32 v3, 0, v2 +; GFX9-NEXT: v_mov_b32_e32 v5, 0 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1 ; GFX9-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 @@ -156,16 +158,15 @@ ; GFX9-NEXT: v_sub_u32_e32 v0, v0, v3 ; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2 ; GFX9-NEXT: v_sub_u32_e32 v3, v0, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc ; GFX9-NEXT: v_add_u32_e32 v3, 1, v1 ; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v1, v3, vcc -; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v1, v3, vcc ; GFX9-NEXT: BB0_4: ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: v_mov_b32_e32 v0, v3 -; GFX9-NEXT: v_mov_b32_e32 v1, v4 +; GFX9-NEXT: v_mov_b32_e32 v0, v4 +; GFX9-NEXT: 
v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] %d = sdiv i64 %a, %b ret i64 %d @@ -261,33 +262,35 @@ ; GFX9-NEXT: v_mul_lo_u32 v9, v2, v4 ; GFX9-NEXT: v_add3_u32 v6, v8, v7, v6 ; GFX9-NEXT: v_sub_u32_e32 v7, v1, v6 -; GFX9-NEXT: v_sub_co_u32_e32 v8, vcc, v0, v9 +; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v9 ; GFX9-NEXT: v_subb_co_u32_e64 v7, s[4:5], v7, v3, vcc -; GFX9-NEXT: v_sub_co_u32_e64 v9, s[4:5], v8, v2 +; GFX9-NEXT: v_sub_co_u32_e64 v8, s[4:5], v0, v2 ; GFX9-NEXT: v_subbrev_co_u32_e64 v7, s[4:5], 0, v7, s[4:5] ; GFX9-NEXT: v_cmp_ge_u32_e64 s[4:5], v7, v3 -; GFX9-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5] -; GFX9-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v2 ; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5] +; GFX9-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v2 +; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5] ; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], v7, v3 ; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v6, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v7, v10, v9, s[4:5] -; GFX9-NEXT: v_add_co_u32_e64 v9, s[4:5], 2, v4 -; GFX9-NEXT: v_addc_co_u32_e64 v10, s[4:5], 0, v5, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v7, v9, v8, s[4:5] +; GFX9-NEXT: v_add_co_u32_e64 v8, s[4:5], 2, v4 +; GFX9-NEXT: v_addc_co_u32_e64 v9, s[4:5], 0, v5, s[4:5] ; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v1, v3 -; GFX9-NEXT: v_add_co_u32_e64 v11, s[4:5], 1, v4 +; GFX9-NEXT: v_add_co_u32_e64 v10, s[4:5], 1, v4 ; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc -; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v8, v2 -; GFX9-NEXT: v_addc_co_u32_e64 v12, s[4:5], 0, v5, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc +; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2 +; GFX9-NEXT: v_addc_co_u32_e64 v11, s[4:5], 0, v5, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3 ; GFX9-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v7 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v8, vcc -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; GFX9-NEXT: v_cndmask_b32_e64 v7, v12, v10, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v1, v11, v9, 
s[4:5] +; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GFX9-NEXT: v_cndmask_b32_e64 v7, v11, v9, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v0, v10, v8, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v0, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX9-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX9-NEXT: BB1_2: ; %Flow ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], s[6:7] ; GFX9-NEXT: s_xor_b64 exec, exec, s[4:5] @@ -326,133 +329,134 @@ ; GFX9-LABEL: srem64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v5, v3 -; GFX9-NEXT: v_or_b32_e32 v4, v1, v5 -; GFX9-NEXT: v_mov_b32_e32 v3, 0 -; GFX9-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[3:4] -; GFX9-NEXT: ; implicit-def: $vgpr3_vgpr4 +; GFX9-NEXT: v_or_b32_e32 v5, v1, v3 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX9-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[8:9], exec, s[4:5] ; GFX9-NEXT: s_cbranch_execz BB2_2 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: v_ashrrev_i32_e32 v3, 31, v5 -; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v2, v3 -; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v5, v3, vcc -; GFX9-NEXT: v_xor_b32_e32 v5, v5, v3 -; GFX9-NEXT: v_xor_b32_e32 v3, v4, v3 -; GFX9-NEXT: v_cvt_f32_u32_e32 v4, v3 -; GFX9-NEXT: v_cvt_f32_u32_e32 v6, v5 -; GFX9-NEXT: v_sub_co_u32_e32 v7, vcc, 0, v3 -; GFX9-NEXT: v_subb_co_u32_e32 v8, vcc, 0, v5, vcc -; GFX9-NEXT: v_mov_b32_e32 v15, 0 -; GFX9-NEXT: v_mac_f32_e32 v4, 0x4f800000, v6 -; GFX9-NEXT: v_rcp_f32_e32 v4, v4 +; GFX9-NEXT: v_ashrrev_i32_e32 v4, 31, v3 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v4, vcc +; GFX9-NEXT: v_xor_b32_e32 v3, v3, v4 +; GFX9-NEXT: v_xor_b32_e32 v2, v2, v4 +; GFX9-NEXT: v_cvt_f32_u32_e32 v4, v2 +; GFX9-NEXT: v_cvt_f32_u32_e32 v5, v3 +; GFX9-NEXT: 
v_sub_co_u32_e32 v6, vcc, 0, v2 +; GFX9-NEXT: v_subb_co_u32_e32 v7, vcc, 0, v3, vcc ; GFX9-NEXT: v_mov_b32_e32 v14, 0 +; GFX9-NEXT: v_mac_f32_e32 v4, 0x4f800000, v5 +; GFX9-NEXT: v_rcp_f32_e32 v4, v4 +; GFX9-NEXT: v_mov_b32_e32 v13, 0 ; GFX9-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 -; GFX9-NEXT: v_mul_f32_e32 v6, 0x2f800000, v4 -; GFX9-NEXT: v_trunc_f32_e32 v6, v6 -; GFX9-NEXT: v_mac_f32_e32 v4, 0xcf800000, v6 +; GFX9-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4 +; GFX9-NEXT: v_trunc_f32_e32 v5, v5 +; GFX9-NEXT: v_mac_f32_e32 v4, 0xcf800000, v5 ; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v4 -; GFX9-NEXT: v_cvt_u32_f32_e32 v6, v6 -; GFX9-NEXT: v_mul_lo_u32 v9, v8, v4 -; GFX9-NEXT: v_mul_hi_u32 v10, v7, v4 -; GFX9-NEXT: v_mul_lo_u32 v11, v7, v6 -; GFX9-NEXT: v_mul_lo_u32 v12, v7, v4 -; GFX9-NEXT: v_add3_u32 v9, v10, v11, v9 -; GFX9-NEXT: v_mul_lo_u32 v11, v4, v9 -; GFX9-NEXT: v_mul_hi_u32 v13, v4, v12 -; GFX9-NEXT: v_mul_hi_u32 v10, v4, v9 -; GFX9-NEXT: v_mul_hi_u32 v16, v6, v9 -; GFX9-NEXT: v_mul_lo_u32 v9, v6, v9 -; GFX9-NEXT: v_add_co_u32_e32 v11, vcc, v13, v11 -; GFX9-NEXT: v_mul_lo_u32 v13, v6, v12 -; GFX9-NEXT: v_mul_hi_u32 v12, v6, v12 -; GFX9-NEXT: v_addc_co_u32_e32 v10, vcc, v15, v10, vcc -; GFX9-NEXT: v_add_co_u32_e32 v11, vcc, v11, v13 -; GFX9-NEXT: v_addc_co_u32_e32 v10, vcc, v10, v12, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, v16, v14, vcc -; GFX9-NEXT: v_add_co_u32_e32 v9, vcc, v10, v9 -; GFX9-NEXT: v_add_co_u32_e64 v4, s[4:5], v4, v9 -; GFX9-NEXT: v_addc_co_u32_e32 v10, vcc, v15, v11, vcc -; GFX9-NEXT: v_addc_co_u32_e64 v9, vcc, v6, v10, s[4:5] -; GFX9-NEXT: v_mul_lo_u32 v11, v7, v9 -; GFX9-NEXT: v_mul_hi_u32 v12, v7, v4 -; GFX9-NEXT: v_mul_lo_u32 v8, v8, v4 +; GFX9-NEXT: v_cvt_u32_f32_e32 v5, v5 +; GFX9-NEXT: v_mul_lo_u32 v8, v7, v4 +; GFX9-NEXT: v_mul_hi_u32 v9, v6, v4 +; GFX9-NEXT: v_mul_lo_u32 v10, v6, v5 +; GFX9-NEXT: v_mul_lo_u32 v11, v6, v4 +; GFX9-NEXT: v_add3_u32 v8, v9, v10, v8 +; GFX9-NEXT: v_mul_lo_u32 v10, v4, v8 +; GFX9-NEXT: v_mul_hi_u32 v12, v4, v11 
+; GFX9-NEXT: v_mul_hi_u32 v9, v4, v8 +; GFX9-NEXT: v_mul_hi_u32 v15, v5, v8 +; GFX9-NEXT: v_mul_lo_u32 v8, v5, v8 +; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, v12, v10 +; GFX9-NEXT: v_mul_lo_u32 v12, v5, v11 +; GFX9-NEXT: v_mul_hi_u32 v11, v5, v11 +; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, v14, v9, vcc +; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, v10, v12 +; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, v9, v11, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v10, vcc, v15, v13, vcc +; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v9, v8 +; GFX9-NEXT: v_add_co_u32_e64 v4, s[4:5], v4, v8 +; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, v14, v10, vcc +; GFX9-NEXT: v_addc_co_u32_e64 v8, vcc, v5, v9, s[4:5] +; GFX9-NEXT: v_mul_lo_u32 v10, v6, v8 +; GFX9-NEXT: v_mul_hi_u32 v11, v6, v4 ; GFX9-NEXT: v_mul_lo_u32 v7, v7, v4 -; GFX9-NEXT: v_add_u32_e32 v6, v6, v10 -; GFX9-NEXT: v_add3_u32 v8, v12, v11, v8 -; GFX9-NEXT: v_mul_lo_u32 v11, v4, v8 -; GFX9-NEXT: v_mul_hi_u32 v12, v4, v7 -; GFX9-NEXT: v_mul_hi_u32 v16, v4, v8 -; GFX9-NEXT: v_mul_hi_u32 v13, v9, v8 -; GFX9-NEXT: v_mul_lo_u32 v8, v9, v8 -; GFX9-NEXT: v_add_co_u32_e32 v11, vcc, v12, v11 -; GFX9-NEXT: v_mul_hi_u32 v12, v9, v7 -; GFX9-NEXT: v_mul_lo_u32 v7, v9, v7 -; GFX9-NEXT: v_addc_co_u32_e32 v16, vcc, v15, v16, vcc -; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v11, v7 -; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, v16, v12, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, v13, v14, vcc -; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v7, v8 -; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, v15, v9, vcc -; GFX9-NEXT: v_addc_co_u32_e64 v6, vcc, v6, v8, s[4:5] -; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v7 -; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v6, vcc -; GFX9-NEXT: v_ashrrev_i32_e32 v7, 31, v1 -; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v0, v7 -; GFX9-NEXT: v_xor_b32_e32 v8, v8, v7 -; GFX9-NEXT: v_mul_lo_u32 v9, v8, v6 -; GFX9-NEXT: v_mul_hi_u32 v10, v8, v4 +; GFX9-NEXT: v_mul_lo_u32 v6, v6, v4 +; GFX9-NEXT: v_add_u32_e32 v5, v5, v9 +; GFX9-NEXT: v_add3_u32 v7, v11, v10, v7 +; GFX9-NEXT: 
v_mul_lo_u32 v12, v4, v7 +; GFX9-NEXT: v_mul_hi_u32 v15, v4, v6 +; GFX9-NEXT: v_mul_hi_u32 v16, v4, v7 ; GFX9-NEXT: v_mul_hi_u32 v11, v8, v6 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v7, vcc -; GFX9-NEXT: v_xor_b32_e32 v1, v1, v7 -; GFX9-NEXT: v_add_co_u32_e32 v9, vcc, v10, v9 -; GFX9-NEXT: v_addc_co_u32_e32 v10, vcc, v15, v11, vcc -; GFX9-NEXT: v_mul_lo_u32 v11, v1, v4 -; GFX9-NEXT: v_mul_hi_u32 v4, v1, v4 -; GFX9-NEXT: v_mul_hi_u32 v12, v1, v6 -; GFX9-NEXT: v_mul_lo_u32 v6, v1, v6 -; GFX9-NEXT: v_add_co_u32_e32 v9, vcc, v9, v11 -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v10, v4, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, v12, v14, vcc +; GFX9-NEXT: v_mul_lo_u32 v6, v8, v6 +; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, v15, v12 +; GFX9-NEXT: v_mul_hi_u32 v10, v8, v7 +; GFX9-NEXT: v_addc_co_u32_e32 v15, vcc, v14, v16, vcc +; GFX9-NEXT: v_mul_lo_u32 v7, v8, v7 +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v12, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, v15, v11, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, v10, v13, vcc +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v6, v7 +; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, v14, v8, vcc +; GFX9-NEXT: v_addc_co_u32_e64 v5, vcc, v5, v7, s[4:5] ; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v6 -; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, v15, v9, vcc -; GFX9-NEXT: v_mul_lo_u32 v9, v5, v4 -; GFX9-NEXT: v_mul_hi_u32 v10, v3, v4 -; GFX9-NEXT: v_mul_lo_u32 v6, v3, v6 -; GFX9-NEXT: v_mul_lo_u32 v4, v3, v4 -; GFX9-NEXT: v_add3_u32 v6, v10, v6, v9 -; GFX9-NEXT: v_sub_u32_e32 v9, v1, v6 -; GFX9-NEXT: v_sub_co_u32_e32 v4, vcc, v8, v4 -; GFX9-NEXT: v_subb_co_u32_e64 v8, s[4:5], v9, v5, vcc -; GFX9-NEXT: v_sub_co_u32_e64 v9, s[4:5], v4, v3 -; GFX9-NEXT: v_subbrev_co_u32_e64 v10, s[6:7], 0, v8, s[4:5] -; GFX9-NEXT: v_cmp_ge_u32_e64 s[6:7], v10, v5 -; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v6, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[6:7] -; GFX9-NEXT: v_cmp_ge_u32_e64 s[6:7], v9, v3 -; GFX9-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[6:7] -; GFX9-NEXT: 
v_cmp_eq_u32_e64 s[6:7], v10, v5 -; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v1, v5 -; GFX9-NEXT: v_subb_co_u32_e64 v8, s[4:5], v8, v5, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v11, v11, v12, s[6:7] -; GFX9-NEXT: v_sub_co_u32_e64 v12, s[4:5], v9, v3 -; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc -; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v4, v3 -; GFX9-NEXT: v_subbrev_co_u32_e64 v8, s[4:5], 0, v8, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v6, v3, vcc -; GFX9-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v11 -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 -; GFX9-NEXT: v_cndmask_b32_e64 v3, v9, v12, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v8, v10, v8, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc -; GFX9-NEXT: v_xor_b32_e32 v3, v3, v7 -; GFX9-NEXT: v_xor_b32_e32 v1, v1, v7 -; GFX9-NEXT: v_sub_co_u32_e32 v3, vcc, v3, v7 -; GFX9-NEXT: v_subb_co_u32_e32 v4, vcc, v1, v7, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc +; GFX9-NEXT: v_ashrrev_i32_e32 v6, 31, v1 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v6 +; GFX9-NEXT: v_xor_b32_e32 v0, v0, v6 +; GFX9-NEXT: v_mul_lo_u32 v7, v0, v5 +; GFX9-NEXT: v_mul_hi_u32 v8, v0, v4 +; GFX9-NEXT: v_mul_hi_u32 v9, v0, v5 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v6, vcc +; GFX9-NEXT: v_xor_b32_e32 v1, v1, v6 +; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v8, v7 +; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, v14, v9, vcc +; GFX9-NEXT: v_mul_lo_u32 v9, v1, v4 +; GFX9-NEXT: v_mul_hi_u32 v4, v1, v4 +; GFX9-NEXT: v_mul_hi_u32 v10, v1, v5 +; GFX9-NEXT: v_mul_lo_u32 v5, v1, v5 +; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v7, v9 +; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v8, v4, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, v10, v13, vcc +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v5 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v14, v7, vcc +; GFX9-NEXT: v_mul_lo_u32 v7, v3, v4 +; GFX9-NEXT: v_mul_hi_u32 v8, v2, v4 +; GFX9-NEXT: v_mul_lo_u32 v5, 
v2, v5 +; GFX9-NEXT: v_mul_lo_u32 v4, v2, v4 +; GFX9-NEXT: v_add3_u32 v5, v8, v5, v7 +; GFX9-NEXT: v_sub_u32_e32 v7, v1, v5 +; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v4 +; GFX9-NEXT: v_subb_co_u32_e64 v4, s[4:5], v7, v3, vcc +; GFX9-NEXT: v_sub_co_u32_e64 v7, s[4:5], v0, v2 +; GFX9-NEXT: v_subbrev_co_u32_e64 v8, s[6:7], 0, v4, s[4:5] +; GFX9-NEXT: v_cmp_ge_u32_e64 s[6:7], v8, v3 +; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v5, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[6:7] +; GFX9-NEXT: v_cmp_ge_u32_e64 s[6:7], v7, v2 +; GFX9-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[6:7] +; GFX9-NEXT: v_cmp_eq_u32_e64 s[6:7], v8, v3 +; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v1, v3 +; GFX9-NEXT: v_subb_co_u32_e64 v4, s[4:5], v4, v3, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v9, v9, v10, s[6:7] +; GFX9-NEXT: v_sub_co_u32_e64 v10, s[4:5], v7, v2 +; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc +; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2 +; GFX9-NEXT: v_subbrev_co_u32_e64 v4, s[4:5], 0, v4, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc +; GFX9-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v9 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; GFX9-NEXT: v_cndmask_b32_e64 v2, v7, v10, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v4, v8, v4, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc +; GFX9-NEXT: v_xor_b32_e32 v0, v0, v6 +; GFX9-NEXT: v_xor_b32_e32 v1, v1, v6 +; GFX9-NEXT: v_sub_co_u32_e32 v4, vcc, v0, v6 +; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v1, v6, vcc +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX9-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX9-NEXT: BB2_2: ; %Flow ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], s[8:9] ; GFX9-NEXT: s_xor_b64 exec, exec, s[4:5] @@ -460,7 +464,7 @@ ; GFX9-NEXT: ; %bb.3: ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, v2 ; GFX9-NEXT: v_sub_u32_e32 v3, 0, v2 -; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: v_mov_b32_e32 v5, 0 ; GFX9-NEXT: 
v_rcp_iflag_f32_e32 v1, v1 ; GFX9-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 @@ -475,11 +479,11 @@ ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; GFX9-NEXT: v_sub_u32_e32 v1, v0, v2 ; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, v0, v1, vcc ; GFX9-NEXT: BB2_4: ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: v_mov_b32_e32 v0, v3 -; GFX9-NEXT: v_mov_b32_e32 v1, v4 +; GFX9-NEXT: v_mov_b32_e32 v0, v4 +; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] %d = srem i64 %a, %b ret i64 %d @@ -575,32 +579,34 @@ ; GFX9-NEXT: v_mul_lo_u32 v4, v2, v4 ; GFX9-NEXT: v_add3_u32 v5, v7, v5, v6 ; GFX9-NEXT: v_sub_u32_e32 v6, v1, v5 -; GFX9-NEXT: v_sub_co_u32_e32 v4, vcc, v0, v4 -; GFX9-NEXT: v_subb_co_u32_e64 v6, s[4:5], v6, v3, vcc -; GFX9-NEXT: v_sub_co_u32_e64 v7, s[4:5], v4, v2 -; GFX9-NEXT: v_subbrev_co_u32_e64 v8, s[6:7], 0, v6, s[4:5] -; GFX9-NEXT: v_cmp_ge_u32_e64 s[6:7], v8, v3 -; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[6:7] -; GFX9-NEXT: v_cmp_ge_u32_e64 s[6:7], v7, v2 -; GFX9-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[6:7] -; GFX9-NEXT: v_cmp_eq_u32_e64 s[6:7], v8, v3 -; GFX9-NEXT: v_subb_co_u32_e64 v6, s[4:5], v6, v3, s[4:5] +; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v4 +; GFX9-NEXT: v_subb_co_u32_e64 v4, s[4:5], v6, v3, vcc +; GFX9-NEXT: v_sub_co_u32_e64 v6, s[4:5], v0, v2 +; GFX9-NEXT: v_subbrev_co_u32_e64 v7, s[6:7], 0, v4, s[4:5] +; GFX9-NEXT: v_cmp_ge_u32_e64 s[6:7], v7, v3 ; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v5, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v9, v9, v10, s[6:7] -; GFX9-NEXT: v_sub_co_u32_e64 v10, s[4:5], v7, v2 -; GFX9-NEXT: v_subbrev_co_u32_e64 v6, s[4:5], 0, v6, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[6:7] +; GFX9-NEXT: v_cmp_ge_u32_e64 s[6:7], v6, v2 +; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[6:7] +; GFX9-NEXT: v_cmp_eq_u32_e64 s[6:7], v7, v3 ; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v1, v3 -; GFX9-NEXT: 
v_cmp_ne_u32_e64 s[4:5], 0, v9 +; GFX9-NEXT: v_subb_co_u32_e64 v4, s[4:5], v4, v3, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v8, v8, v9, s[6:7] +; GFX9-NEXT: v_sub_co_u32_e64 v9, s[4:5], v6, v2 ; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc -; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v4, v2 -; GFX9-NEXT: v_cndmask_b32_e64 v6, v8, v6, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc +; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2 +; GFX9-NEXT: v_subbrev_co_u32_e64 v4, s[4:5], 0, v4, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v8, vcc -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v5, v1, v6, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v1, v7, v10, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc +; GFX9-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v8 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; GFX9-NEXT: v_cndmask_b32_e64 v4, v7, v4, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e32 v5, v1, v4, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v1, v6, v9, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e32 v4, v0, v1, vcc +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX9-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX9-NEXT: BB3_2: ; %Flow ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], s[8:9] ; GFX9-NEXT: s_xor_b64 exec, exec, s[4:5] @@ -762,147 +768,148 @@ ; GFX9-LABEL: sdivrem64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v7, v3 -; GFX9-NEXT: v_or_b32_e32 v4, v1, v7 -; GFX9-NEXT: v_mov_b32_e32 v3, 0 -; GFX9-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[3:4] -; GFX9-NEXT: ; implicit-def: $vgpr5_vgpr6 -; GFX9-NEXT: ; implicit-def: $vgpr3_vgpr4 +; GFX9-NEXT: v_or_b32_e32 v5, v1, v3 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX9-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GFX9-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[10:11], exec, s[4:5] ; 
GFX9-NEXT: s_cbranch_execz BB8_2 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: v_ashrrev_i32_e32 v3, 31, v7 -; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v2, v3 -; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v7, v3, vcc -; GFX9-NEXT: v_xor_b32_e32 v6, v4, v3 -; GFX9-NEXT: v_xor_b32_e32 v5, v5, v3 -; GFX9-NEXT: v_cvt_f32_u32_e32 v4, v6 -; GFX9-NEXT: v_cvt_f32_u32_e32 v7, v5 -; GFX9-NEXT: v_sub_co_u32_e32 v8, vcc, 0, v6 -; GFX9-NEXT: v_subb_co_u32_e32 v9, vcc, 0, v5, vcc -; GFX9-NEXT: v_mov_b32_e32 v16, 0 -; GFX9-NEXT: v_mac_f32_e32 v4, 0x4f800000, v7 -; GFX9-NEXT: v_rcp_f32_e32 v4, v4 +; GFX9-NEXT: v_ashrrev_i32_e32 v4, 31, v3 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v4, vcc +; GFX9-NEXT: v_xor_b32_e32 v2, v2, v4 +; GFX9-NEXT: v_xor_b32_e32 v3, v3, v4 +; GFX9-NEXT: v_cvt_f32_u32_e32 v5, v2 +; GFX9-NEXT: v_cvt_f32_u32_e32 v6, v3 +; GFX9-NEXT: v_sub_co_u32_e32 v7, vcc, 0, v2 +; GFX9-NEXT: v_subb_co_u32_e32 v8, vcc, 0, v3, vcc ; GFX9-NEXT: v_mov_b32_e32 v15, 0 -; GFX9-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 -; GFX9-NEXT: v_mul_f32_e32 v7, 0x2f800000, v4 -; GFX9-NEXT: v_trunc_f32_e32 v7, v7 -; GFX9-NEXT: v_mac_f32_e32 v4, 0xcf800000, v7 -; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v4 -; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v7 -; GFX9-NEXT: v_mul_lo_u32 v10, v9, v4 -; GFX9-NEXT: v_mul_hi_u32 v11, v8, v4 -; GFX9-NEXT: v_mul_lo_u32 v12, v8, v7 -; GFX9-NEXT: v_mul_lo_u32 v13, v8, v4 -; GFX9-NEXT: v_add3_u32 v10, v11, v12, v10 -; GFX9-NEXT: v_mul_lo_u32 v12, v4, v10 -; GFX9-NEXT: v_mul_hi_u32 v14, v4, v13 -; GFX9-NEXT: v_mul_hi_u32 v11, v4, v10 -; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, v14, v12 -; GFX9-NEXT: v_mul_lo_u32 v14, v7, v13 -; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, v16, v11, vcc -; GFX9-NEXT: v_mul_hi_u32 v13, v7, v13 -; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, v12, v14 -; GFX9-NEXT: v_mul_hi_u32 v12, v7, v10 -; GFX9-NEXT: v_mul_lo_u32 v10, v7, v10 -; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, v11, v13, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v12, vcc, v12, v15, 
vcc -; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, v11, v10 -; GFX9-NEXT: v_add_co_u32_e64 v4, s[4:5], v4, v10 -; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, v16, v12, vcc -; GFX9-NEXT: v_addc_co_u32_e64 v10, vcc, v7, v11, s[4:5] -; GFX9-NEXT: v_mul_lo_u32 v12, v8, v10 -; GFX9-NEXT: v_mul_hi_u32 v13, v8, v4 -; GFX9-NEXT: v_mul_lo_u32 v9, v9, v4 -; GFX9-NEXT: v_mul_lo_u32 v8, v8, v4 -; GFX9-NEXT: v_add_u32_e32 v7, v7, v11 -; GFX9-NEXT: v_add3_u32 v9, v13, v12, v9 -; GFX9-NEXT: v_mul_lo_u32 v12, v4, v9 -; GFX9-NEXT: v_mul_hi_u32 v13, v4, v8 -; GFX9-NEXT: v_mul_hi_u32 v14, v4, v9 -; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, v13, v12 -; GFX9-NEXT: v_mul_hi_u32 v13, v10, v8 -; GFX9-NEXT: v_mul_lo_u32 v8, v10, v8 -; GFX9-NEXT: v_addc_co_u32_e32 v14, vcc, v16, v14, vcc -; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v12, v8 -; GFX9-NEXT: v_mul_hi_u32 v8, v10, v9 -; GFX9-NEXT: v_mul_lo_u32 v9, v10, v9 -; GFX9-NEXT: v_addc_co_u32_e32 v12, vcc, v14, v13, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, v8, v15, vcc -; GFX9-NEXT: v_add_co_u32_e32 v9, vcc, v12, v9 -; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, v16, v8, vcc -; GFX9-NEXT: v_addc_co_u32_e64 v7, vcc, v7, v8, s[4:5] -; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v9 -; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v7, vcc -; GFX9-NEXT: v_ashrrev_i32_e32 v8, 31, v1 -; GFX9-NEXT: v_add_co_u32_e32 v9, vcc, v0, v8 -; GFX9-NEXT: v_xor_b32_e32 v9, v9, v8 -; GFX9-NEXT: v_mul_lo_u32 v10, v9, v7 -; GFX9-NEXT: v_mul_hi_u32 v11, v9, v4 +; GFX9-NEXT: v_mac_f32_e32 v5, 0x4f800000, v6 +; GFX9-NEXT: v_rcp_f32_e32 v5, v5 +; GFX9-NEXT: v_mov_b32_e32 v14, 0 +; GFX9-NEXT: v_mul_f32_e32 v5, 0x5f7ffffc, v5 +; GFX9-NEXT: v_mul_f32_e32 v6, 0x2f800000, v5 +; GFX9-NEXT: v_trunc_f32_e32 v6, v6 +; GFX9-NEXT: v_mac_f32_e32 v5, 0xcf800000, v6 +; GFX9-NEXT: v_cvt_u32_f32_e32 v5, v5 +; GFX9-NEXT: v_cvt_u32_f32_e32 v6, v6 +; GFX9-NEXT: v_mul_lo_u32 v9, v8, v5 +; GFX9-NEXT: v_mul_hi_u32 v10, v7, v5 +; GFX9-NEXT: v_mul_lo_u32 v11, v7, v6 +; GFX9-NEXT: v_mul_lo_u32 v12, v7, v5 +; GFX9-NEXT: 
v_add3_u32 v9, v10, v11, v9 +; GFX9-NEXT: v_mul_lo_u32 v11, v5, v9 +; GFX9-NEXT: v_mul_hi_u32 v13, v5, v12 +; GFX9-NEXT: v_mul_hi_u32 v10, v5, v9 +; GFX9-NEXT: v_mul_hi_u32 v16, v6, v9 +; GFX9-NEXT: v_mul_lo_u32 v9, v6, v9 +; GFX9-NEXT: v_add_co_u32_e32 v11, vcc, v13, v11 +; GFX9-NEXT: v_mul_lo_u32 v13, v6, v12 +; GFX9-NEXT: v_mul_hi_u32 v12, v6, v12 +; GFX9-NEXT: v_addc_co_u32_e32 v10, vcc, v15, v10, vcc +; GFX9-NEXT: v_add_co_u32_e32 v11, vcc, v11, v13 +; GFX9-NEXT: v_addc_co_u32_e32 v10, vcc, v10, v12, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, v16, v14, vcc +; GFX9-NEXT: v_add_co_u32_e32 v9, vcc, v10, v9 +; GFX9-NEXT: v_add_co_u32_e64 v5, s[4:5], v5, v9 +; GFX9-NEXT: v_addc_co_u32_e32 v10, vcc, v15, v11, vcc +; GFX9-NEXT: v_addc_co_u32_e64 v9, vcc, v6, v10, s[4:5] +; GFX9-NEXT: v_mul_lo_u32 v11, v7, v9 +; GFX9-NEXT: v_mul_hi_u32 v12, v7, v5 +; GFX9-NEXT: v_mul_lo_u32 v8, v8, v5 +; GFX9-NEXT: v_mul_lo_u32 v7, v7, v5 +; GFX9-NEXT: v_add_u32_e32 v6, v6, v10 +; GFX9-NEXT: v_add3_u32 v8, v12, v11, v8 +; GFX9-NEXT: v_mul_lo_u32 v13, v5, v8 +; GFX9-NEXT: v_mul_hi_u32 v16, v5, v7 +; GFX9-NEXT: v_mul_hi_u32 v17, v5, v8 ; GFX9-NEXT: v_mul_hi_u32 v12, v9, v7 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v8, vcc -; GFX9-NEXT: v_xor_b32_e32 v1, v1, v8 -; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, v11, v10 -; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, v16, v12, vcc -; GFX9-NEXT: v_mul_lo_u32 v12, v1, v4 -; GFX9-NEXT: v_mul_hi_u32 v4, v1, v4 -; GFX9-NEXT: v_mul_hi_u32 v13, v1, v7 -; GFX9-NEXT: v_mul_lo_u32 v7, v1, v7 -; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, v10, v12 -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v11, v4, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v10, vcc, v13, v15, vcc -; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v7 -; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, v16, v10, vcc -; GFX9-NEXT: v_mul_lo_u32 v10, v5, v4 -; GFX9-NEXT: v_mul_lo_u32 v11, v6, v7 -; GFX9-NEXT: v_mul_hi_u32 v12, v6, v4 -; GFX9-NEXT: v_mul_lo_u32 v13, v6, v4 -; GFX9-NEXT: v_add3_u32 v10, v12, v11, v10 -; 
GFX9-NEXT: v_sub_u32_e32 v11, v1, v10 -; GFX9-NEXT: v_sub_co_u32_e32 v9, vcc, v9, v13 -; GFX9-NEXT: v_subb_co_u32_e64 v11, s[4:5], v11, v5, vcc -; GFX9-NEXT: v_sub_co_u32_e64 v12, s[4:5], v9, v6 -; GFX9-NEXT: v_subbrev_co_u32_e64 v13, s[6:7], 0, v11, s[4:5] -; GFX9-NEXT: v_cmp_ge_u32_e64 s[6:7], v13, v5 -; GFX9-NEXT: v_cndmask_b32_e64 v14, 0, -1, s[6:7] -; GFX9-NEXT: v_cmp_ge_u32_e64 s[6:7], v12, v6 -; GFX9-NEXT: v_cndmask_b32_e64 v15, 0, -1, s[6:7] -; GFX9-NEXT: v_cmp_eq_u32_e64 s[6:7], v13, v5 -; GFX9-NEXT: v_cndmask_b32_e64 v14, v14, v15, s[6:7] -; GFX9-NEXT: v_add_co_u32_e64 v15, s[6:7], 2, v4 -; GFX9-NEXT: v_addc_co_u32_e64 v16, s[6:7], 0, v7, s[6:7] -; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v10, vcc -; GFX9-NEXT: v_add_co_u32_e64 v17, s[6:7], 1, v4 -; GFX9-NEXT: v_addc_co_u32_e64 v18, s[6:7], 0, v7, s[6:7] -; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v1, v5 -; GFX9-NEXT: v_cmp_ne_u32_e64 s[6:7], 0, v14 -; GFX9-NEXT: v_cndmask_b32_e64 v10, 0, -1, vcc -; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v9, v6 -; GFX9-NEXT: v_cndmask_b32_e64 v14, v18, v16, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v16, 0, -1, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5 -; GFX9-NEXT: v_subb_co_u32_e64 v5, s[4:5], v11, v5, s[4:5] -; GFX9-NEXT: v_sub_co_u32_e64 v6, s[4:5], v12, v6 -; GFX9-NEXT: v_cndmask_b32_e32 v10, v10, v16, vcc -; GFX9-NEXT: v_subbrev_co_u32_e64 v5, s[4:5], 0, v5, s[4:5] -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 -; GFX9-NEXT: v_cndmask_b32_e64 v5, v13, v5, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v5, v12, v6, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v10, v17, v15, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e32 v5, v9, v5, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v10, vcc -; GFX9-NEXT: v_xor_b32_e32 v10, v8, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v7, v7, v14, vcc -; GFX9-NEXT: v_xor_b32_e32 v3, v4, v10 -; GFX9-NEXT: v_xor_b32_e32 v5, v5, v8 -; GFX9-NEXT: v_xor_b32_e32 v7, v7, v10 -; GFX9-NEXT: v_sub_co_u32_e64 v3, s[8:9], v3, v10 -; GFX9-NEXT: 
v_xor_b32_e32 v1, v1, v8 -; GFX9-NEXT: v_sub_co_u32_e32 v5, vcc, v5, v8 -; GFX9-NEXT: v_subb_co_u32_e64 v4, s[8:9], v7, v10, s[8:9] -; GFX9-NEXT: v_subb_co_u32_e32 v6, vcc, v1, v8, vcc +; GFX9-NEXT: v_mul_lo_u32 v7, v9, v7 +; GFX9-NEXT: v_add_co_u32_e32 v13, vcc, v16, v13 +; GFX9-NEXT: v_mul_hi_u32 v11, v9, v8 +; GFX9-NEXT: v_addc_co_u32_e32 v16, vcc, v15, v17, vcc +; GFX9-NEXT: v_mul_lo_u32 v8, v9, v8 +; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v13, v7 +; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, v16, v12, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, v11, v14, vcc +; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v7, v8 +; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, v15, v9, vcc +; GFX9-NEXT: v_addc_co_u32_e64 v6, vcc, v6, v8, s[4:5] +; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v5, v7 +; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v6, vcc +; GFX9-NEXT: v_ashrrev_i32_e32 v7, 31, v1 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v7 +; GFX9-NEXT: v_xor_b32_e32 v0, v0, v7 +; GFX9-NEXT: v_mul_lo_u32 v8, v0, v6 +; GFX9-NEXT: v_mul_hi_u32 v9, v0, v5 +; GFX9-NEXT: v_mul_hi_u32 v10, v0, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v7, vcc +; GFX9-NEXT: v_xor_b32_e32 v1, v1, v7 +; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v9, v8 +; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, v15, v10, vcc +; GFX9-NEXT: v_mul_lo_u32 v10, v1, v5 +; GFX9-NEXT: v_mul_hi_u32 v5, v1, v5 +; GFX9-NEXT: v_mul_hi_u32 v11, v1, v6 +; GFX9-NEXT: v_mul_lo_u32 v6, v1, v6 +; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v8, v10 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v9, v5, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, v11, v14, vcc +; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v5, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, v15, v8, vcc +; GFX9-NEXT: v_mul_lo_u32 v8, v3, v5 +; GFX9-NEXT: v_mul_lo_u32 v9, v2, v6 +; GFX9-NEXT: v_mul_hi_u32 v10, v2, v5 +; GFX9-NEXT: v_mul_lo_u32 v11, v2, v5 +; GFX9-NEXT: v_add3_u32 v8, v10, v9, v8 +; GFX9-NEXT: v_sub_u32_e32 v9, v1, v8 +; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v11 +; GFX9-NEXT: v_subb_co_u32_e64 v9, s[4:5], v9, 
v3, vcc +; GFX9-NEXT: v_sub_co_u32_e64 v10, s[4:5], v0, v2 +; GFX9-NEXT: v_subbrev_co_u32_e64 v11, s[6:7], 0, v9, s[4:5] +; GFX9-NEXT: v_cmp_ge_u32_e64 s[6:7], v11, v3 +; GFX9-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[6:7] +; GFX9-NEXT: v_cmp_ge_u32_e64 s[6:7], v10, v2 +; GFX9-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[6:7] +; GFX9-NEXT: v_cmp_eq_u32_e64 s[6:7], v11, v3 +; GFX9-NEXT: v_cndmask_b32_e64 v12, v12, v13, s[6:7] +; GFX9-NEXT: v_add_co_u32_e64 v13, s[6:7], 2, v5 +; GFX9-NEXT: v_addc_co_u32_e64 v14, s[6:7], 0, v6, s[6:7] +; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v8, vcc +; GFX9-NEXT: v_add_co_u32_e64 v15, s[6:7], 1, v5 +; GFX9-NEXT: v_addc_co_u32_e64 v16, s[6:7], 0, v6, s[6:7] +; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v1, v3 +; GFX9-NEXT: v_cmp_ne_u32_e64 s[6:7], 0, v12 +; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc +; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2 +; GFX9-NEXT: v_cndmask_b32_e64 v12, v16, v14, s[6:7] +; GFX9-NEXT: v_cndmask_b32_e64 v14, 0, -1, vcc +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v8, v8, v14, vcc +; GFX9-NEXT: v_subb_co_u32_e64 v3, s[4:5], v9, v3, s[4:5] +; GFX9-NEXT: v_sub_co_u32_e64 v2, s[4:5], v10, v2 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; GFX9-NEXT: v_cndmask_b32_e64 v8, v15, v13, s[6:7] +; GFX9-NEXT: v_subbrev_co_u32_e64 v3, s[4:5], 0, v3, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v8, vcc +; GFX9-NEXT: v_xor_b32_e32 v8, v7, v4 +; GFX9-NEXT: v_cndmask_b32_e64 v2, v10, v2, s[6:7] +; GFX9-NEXT: v_cndmask_b32_e32 v6, v6, v12, vcc +; GFX9-NEXT: v_xor_b32_e32 v4, v5, v8 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v3, v11, v3, s[6:7] +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GFX9-NEXT: v_xor_b32_e32 v6, v6, v8 +; GFX9-NEXT: v_sub_co_u32_e64 v4, s[8:9], v4, v8 +; GFX9-NEXT: v_xor_b32_e32 v0, v0, v7 +; GFX9-NEXT: v_subb_co_u32_e64 v5, s[8:9], v6, v8, s[8:9] +; GFX9-NEXT: v_xor_b32_e32 v1, v1, v7 +; GFX9-NEXT: v_sub_co_u32_e32 v6, vcc, v0, v7 +; GFX9-NEXT: 
v_subb_co_u32_e32 v7, vcc, v1, v7, vcc +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX9-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX9-NEXT: BB8_2: ; %Flow ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], s[10:11] ; GFX9-NEXT: s_xor_b64 exec, exec, s[4:5] @@ -910,7 +917,8 @@ ; GFX9-NEXT: ; %bb.3: ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, v2 ; GFX9-NEXT: v_sub_u32_e32 v3, 0, v2 -; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: v_mov_b32_e32 v5, 0 +; GFX9-NEXT: v_mov_b32_e32 v7, v5 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1 ; GFX9-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 @@ -919,24 +927,23 @@ ; GFX9-NEXT: v_add_u32_e32 v1, v1, v3 ; GFX9-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX9-NEXT: v_mul_lo_u32 v3, v1, v2 -; GFX9-NEXT: v_add_u32_e32 v5, 1, v1 +; GFX9-NEXT: v_add_u32_e32 v4, 1, v1 ; GFX9-NEXT: v_sub_u32_e32 v0, v0, v3 ; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2 ; GFX9-NEXT: v_sub_u32_e32 v3, v0, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc ; GFX9-NEXT: v_sub_u32_e32 v3, v0, v2 ; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2 -; GFX9-NEXT: v_add_u32_e32 v6, 1, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v5, v0, v3, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v3, v1, v6, vcc -; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: v_add_u32_e32 v4, 1, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v6, v0, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, v1, v4, vcc ; GFX9-NEXT: BB8_4: ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: v_mov_b32_e32 v0, v3 -; GFX9-NEXT: v_mov_b32_e32 v1, v4 -; GFX9-NEXT: v_mov_b32_e32 v2, v5 -; GFX9-NEXT: v_mov_b32_e32 v3, v6 +; GFX9-NEXT: v_mov_b32_e32 v0, v4 +; GFX9-NEXT: v_mov_b32_e32 v1, v5 +; GFX9-NEXT: v_mov_b32_e32 v2, v6 +; GFX9-NEXT: v_mov_b32_e32 v3, v7 ; GFX9-NEXT: s_setpc_b64 s[30:31] %d = sdiv i64 %a, %b %r = srem i64 %a, %b @@ -1036,40 +1043,42 @@ ; GFX9-NEXT: v_mul_lo_u32 v9, v2, v4 ; GFX9-NEXT: v_add3_u32 v6, v8, v7, v6 ; GFX9-NEXT: v_sub_u32_e32 v7, v1, v6 -; 
GFX9-NEXT: v_sub_co_u32_e32 v8, vcc, v0, v9 +; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v9 ; GFX9-NEXT: v_subb_co_u32_e64 v7, s[4:5], v7, v3, vcc -; GFX9-NEXT: v_sub_co_u32_e64 v9, s[4:5], v8, v2 -; GFX9-NEXT: v_subbrev_co_u32_e64 v10, s[6:7], 0, v7, s[4:5] -; GFX9-NEXT: v_cmp_ge_u32_e64 s[6:7], v10, v3 +; GFX9-NEXT: v_sub_co_u32_e64 v8, s[4:5], v0, v2 +; GFX9-NEXT: v_subbrev_co_u32_e64 v9, s[6:7], 0, v7, s[4:5] +; GFX9-NEXT: v_cmp_ge_u32_e64 s[6:7], v9, v3 +; GFX9-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[6:7] +; GFX9-NEXT: v_cmp_ge_u32_e64 s[6:7], v8, v2 ; GFX9-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[6:7] -; GFX9-NEXT: v_cmp_ge_u32_e64 s[6:7], v9, v2 -; GFX9-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[6:7] -; GFX9-NEXT: v_cmp_eq_u32_e64 s[6:7], v10, v3 -; GFX9-NEXT: v_cndmask_b32_e64 v11, v11, v12, s[6:7] -; GFX9-NEXT: v_add_co_u32_e64 v12, s[6:7], 2, v4 -; GFX9-NEXT: v_addc_co_u32_e64 v13, s[6:7], 0, v5, s[6:7] +; GFX9-NEXT: v_cmp_eq_u32_e64 s[6:7], v9, v3 +; GFX9-NEXT: v_cndmask_b32_e64 v10, v10, v11, s[6:7] +; GFX9-NEXT: v_add_co_u32_e64 v11, s[6:7], 2, v4 +; GFX9-NEXT: v_addc_co_u32_e64 v12, s[6:7], 0, v5, s[6:7] ; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v6, vcc -; GFX9-NEXT: v_add_co_u32_e64 v14, s[6:7], 1, v4 -; GFX9-NEXT: v_addc_co_u32_e64 v15, s[6:7], 0, v5, s[6:7] +; GFX9-NEXT: v_add_co_u32_e64 v13, s[6:7], 1, v4 +; GFX9-NEXT: v_addc_co_u32_e64 v14, s[6:7], 0, v5, s[6:7] ; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v1, v3 -; GFX9-NEXT: v_cmp_ne_u32_e64 s[6:7], 0, v11 +; GFX9-NEXT: v_cmp_ne_u32_e64 s[6:7], 0, v10 ; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc -; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v8, v2 -; GFX9-NEXT: v_cndmask_b32_e64 v11, v15, v13, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v13, 0, -1, vcc +; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2 +; GFX9-NEXT: v_cndmask_b32_e64 v10, v14, v12, s[6:7] +; GFX9-NEXT: v_cndmask_b32_e64 v12, 0, -1, vcc ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v6, v6, v13, vcc -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 
-; GFX9-NEXT: v_cndmask_b32_e64 v6, v14, v12, s[6:7] ; GFX9-NEXT: v_subb_co_u32_e64 v3, s[4:5], v7, v3, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc -; GFX9-NEXT: v_sub_co_u32_e64 v6, s[4:5], v9, v2 +; GFX9-NEXT: v_sub_co_u32_e64 v2, s[4:5], v8, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v6, v6, v12, vcc ; GFX9-NEXT: v_subbrev_co_u32_e64 v3, s[4:5], 0, v3, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v3, v10, v3, s[6:7] +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; GFX9-NEXT: v_cndmask_b32_e64 v3, v9, v3, s[6:7] +; GFX9-NEXT: v_cndmask_b32_e64 v6, v13, v11, s[6:7] ; GFX9-NEXT: v_cndmask_b32_e32 v7, v1, v3, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v1, v9, v6, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v11, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v6, v8, v1, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v1, v8, v2, s[6:7] +; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v6, v0, v1, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v10, vcc +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX9-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX9-NEXT: BB9_2: ; %Flow ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], s[8:9] ; GFX9-NEXT: s_xor_b64 exec, exec, s[4:5] diff --git a/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll b/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll --- a/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll +++ b/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll @@ -86,7 +86,7 @@ ; GCN-NEXT: s_cbranch_execz [[THEN_INNER:BB[0-9_]+]] ; GCN-NEXT: ; %bb.{{[0-9]+}}: ; GCN: store_dword -; GCN-NEXT: {{^}}[[THEN_INNER]]: +; GCN: {{^}}[[THEN_INNER]]: ; GCN-NEXT: s_or_saveexec_b64 [[SAVEEXEC_INNER3:s\[[0-9:]+\]]], [[SAVEEXEC_INNER2]] ; GCN-NEXT: s_xor_b64 exec, exec, [[SAVEEXEC_INNER3]] ; GCN-NEXT: s_cbranch_execz [[ENDIF_OUTER]] @@ -136,7 +136,7 @@ ; GCN: store_dword ; GCN-NEXT: {{^}}[[THEN_OUTER_FLOW]]: ; GCN-NEXT: s_or_b64 exec, exec, [[SAVEEXEC_INNER_IF_OUTER_ELSE]] -; GCN-NEXT: {{^}}[[THEN_OUTER]]: +; GCN: {{^}}[[THEN_OUTER]]: ; GCN-NEXT: s_or_saveexec_b64 [[SAVEEXEC_OUTER3:s\[[0-9:]+\]]], 
[[SAVEEXEC_OUTER2]] ; GCN-NEXT: s_xor_b64 exec, exec, [[SAVEEXEC_OUTER3]] ; GCN-NEXT: s_cbranch_execz [[ENDIF_OUTER:BB[0-9_]+]] diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll --- a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll +++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll @@ -330,6 +330,8 @@ ; GCN-O1-NEXT: Process Implicit Definitions ; GCN-O1-NEXT: Remove unreachable machine basic blocks ; GCN-O1-NEXT: Live Variable Analysis +; GCN-O1-NEXT: MachineDominator Tree Construction +; GCN-O1-NEXT: SI Optimize VGPR LiveRange ; GCN-O1-NEXT: Eliminate PHI nodes for register allocation ; GCN-O1-NEXT: SI Lower control flow pseudo instructions ; GCN-O1-NEXT: Two-Address instruction pass @@ -610,6 +612,7 @@ ; GCN-O1-OPTS-NEXT: Process Implicit Definitions ; GCN-O1-OPTS-NEXT: Remove unreachable machine basic blocks ; GCN-O1-OPTS-NEXT: Live Variable Analysis +; GCN-O1-OPTS-NEXT: SI Optimize VGPR LiveRange ; GCN-O1-OPTS-NEXT: Eliminate PHI nodes for register allocation ; GCN-O1-OPTS-NEXT: SI Lower control flow pseudo instructions ; GCN-O1-OPTS-NEXT: Two-Address instruction pass @@ -890,6 +893,7 @@ ; GCN-O2-NEXT: Process Implicit Definitions ; GCN-O2-NEXT: Remove unreachable machine basic blocks ; GCN-O2-NEXT: Live Variable Analysis +; GCN-O2-NEXT: SI Optimize VGPR LiveRange ; GCN-O2-NEXT: Eliminate PHI nodes for register allocation ; GCN-O2-NEXT: SI Lower control flow pseudo instructions ; GCN-O2-NEXT: Two-Address instruction pass @@ -1184,6 +1188,7 @@ ; GCN-O3-NEXT: Process Implicit Definitions ; GCN-O3-NEXT: Remove unreachable machine basic blocks ; GCN-O3-NEXT: Live Variable Analysis +; GCN-O3-NEXT: SI Optimize VGPR LiveRange ; GCN-O3-NEXT: Eliminate PHI nodes for register allocation ; GCN-O3-NEXT: SI Lower control flow pseudo instructions ; GCN-O3-NEXT: Two-Address instruction pass diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-control-flow.ll b/llvm/test/CodeGen/AMDGPU/sgpr-control-flow.ll --- 
a/llvm/test/CodeGen/AMDGPU/sgpr-control-flow.ll +++ b/llvm/test/CodeGen/AMDGPU/sgpr-control-flow.ll @@ -164,15 +164,16 @@ ; SI-NEXT: s_cbranch_execz BB3_2 ; SI-NEXT: ; %bb.1: ; %else ; SI-NEXT: s_mov_b32 s11, 0xf000 -; SI-NEXT: v_lshlrev_b32_e32 v1, 2, v0 -; SI-NEXT: v_mov_b32_e32 v2, 0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_dword v1, v[1:2], s[8:11], 0 addr64 +; SI-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 ; SI-NEXT: s_andn2_b64 s[0:1], s[0:1], exec ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cmp_gt_i32_e32 vcc, 0, v1 +; SI-NEXT: v_cmp_gt_i32_e32 vcc, 0, v0 ; SI-NEXT: s_and_b64 s[8:9], vcc, exec ; SI-NEXT: s_or_b64 s[0:1], s[0:1], s[8:9] +; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: BB3_2: ; %Flow ; SI-NEXT: s_or_saveexec_b64 s[2:3], s[2:3] ; SI-NEXT: s_xor_b64 exec, exec, s[2:3] diff --git a/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll b/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll --- a/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll +++ b/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll @@ -1160,6 +1160,8 @@ ; SI-NEXT: s_cbranch_execz BB14_3 ; SI-NEXT: ; %bb.1: ; %kill ; SI-NEXT: s_andn2_b64 s[0:1], s[0:1], exec +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: s_cbranch_scc0 BB14_6 ; SI-NEXT: ; %bb.2: ; %kill ; SI-NEXT: s_mov_b64 exec, 0 @@ -1197,6 +1199,8 @@ ; GFX10-WAVE64-NEXT: s_cbranch_execz BB14_3 ; GFX10-WAVE64-NEXT: ; %bb.1: ; %kill ; GFX10-WAVE64-NEXT: s_andn2_b64 s[0:1], s[0:1], exec +; GFX10-WAVE64-NEXT: ; implicit-def: $vgpr0 +; GFX10-WAVE64-NEXT: ; implicit-def: $vgpr1 ; GFX10-WAVE64-NEXT: s_cbranch_scc0 BB14_6 ; GFX10-WAVE64-NEXT: ; %bb.2: ; %kill ; GFX10-WAVE64-NEXT: s_mov_b64 exec, 0 @@ -1234,6 +1238,8 @@ ; GFX10-WAVE32-NEXT: s_cbranch_execz BB14_3 ; GFX10-WAVE32-NEXT: ; %bb.1: ; %kill ; GFX10-WAVE32-NEXT: s_andn2_b32 s0, s0, exec_lo +; GFX10-WAVE32-NEXT: ; implicit-def: $vgpr0 +; GFX10-WAVE32-NEXT: ; implicit-def: $vgpr1 ; 
GFX10-WAVE32-NEXT: s_cbranch_scc0 BB14_6 ; GFX10-WAVE32-NEXT: ; %bb.2: ; %kill ; GFX10-WAVE32-NEXT: s_mov_b32 exec_lo, 0 diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll b/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll @@ -0,0 +1,190 @@ +; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +; RUN: llc -march=amdgcn -mcpu=tonga -amdgpu-opt-vgpr-liverange=true -stop-after=si-opt-vgpr-liverange -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s + +; a normal if-else +define amdgpu_ps float @else1(i32 %z, float %v) #0 { + ; SI-LABEL: name: else1 + ; SI: bb.0.main_body: + ; SI: successors: %bb.3(0x40000000), %bb.1(0x40000000) + ; SI: liveins: $vgpr0, $vgpr1 + ; SI: [[COPY:%[0-9]+]]:vgpr_32 = COPY killed $vgpr1 + ; SI: [[COPY1:%[0-9]+]]:vgpr_32 = COPY killed $vgpr0 + ; SI: [[V_CMP_GT_I32_e64_:%[0-9]+]]:sreg_64 = V_CMP_GT_I32_e64 6, killed [[COPY1]], implicit $exec + ; SI: [[SI_IF:%[0-9]+]]:sreg_64 = SI_IF killed [[V_CMP_GT_I32_e64_]], %bb.1, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; SI: S_BRANCH %bb.3 + ; SI: bb.1.Flow: + ; SI: successors: %bb.2(0x40000000), %bb.4(0x40000000) + ; SI: [[PHI:%[0-9]+]]:vgpr_32 = PHI undef %13:vgpr_32, %bb.0, %4, %bb.3 + ; SI: [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[COPY]], %bb.0, undef %15:vgpr_32, %bb.3 + ; SI: [[SI_ELSE:%[0-9]+]]:sreg_64 = SI_ELSE killed [[SI_IF]], %bb.4, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; SI: S_BRANCH %bb.2 + ; SI: bb.2.if: + ; SI: successors: %bb.4(0x80000000) + ; SI: %3:vgpr_32 = nofpexcept V_ADD_F32_e32 killed [[PHI1]], [[PHI1]], implicit $mode, implicit $exec + ; SI: S_BRANCH %bb.4 + ; SI: bb.3.else: + ; SI: successors: %bb.1(0x80000000) + ; SI: %4:vgpr_32 = nofpexcept V_MUL_F32_e32 1077936128, killed [[COPY]], implicit $mode, implicit $exec + ; SI: S_BRANCH %bb.1 + ; SI: bb.4.end: + ; SI: [[PHI2:%[0-9]+]]:vgpr_32 = PHI [[PHI]], %bb.1, %3, 
%bb.2 + ; SI: SI_END_CF killed [[SI_ELSE]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; SI: $vgpr0 = COPY killed [[PHI2]] + ; SI: SI_RETURN_TO_EPILOG killed $vgpr0 +main_body: + %cc = icmp sgt i32 %z, 5 + br i1 %cc, label %if, label %else + +if: + %v.if = fmul float %v, 2.0 + br label %end + +else: + %v.else = fmul float %v, 3.0 + br label %end + +end: + %r = phi float [ %v.if, %if ], [ %v.else, %else ] + ret float %r +} + + +; %v was used after if-else +define amdgpu_ps float @else2(i32 %z, float %v) #0 { + ; SI-LABEL: name: else2 + ; SI: bb.0.main_body: + ; SI: successors: %bb.3(0x40000000), %bb.1(0x40000000) + ; SI: liveins: $vgpr0, $vgpr1 + ; SI: [[COPY:%[0-9]+]]:vgpr_32 = COPY killed $vgpr1 + ; SI: [[COPY1:%[0-9]+]]:vgpr_32 = COPY killed $vgpr0 + ; SI: [[V_CMP_GT_I32_e64_:%[0-9]+]]:sreg_64 = V_CMP_GT_I32_e64 6, killed [[COPY1]], implicit $exec + ; SI: [[SI_IF:%[0-9]+]]:sreg_64 = SI_IF killed [[V_CMP_GT_I32_e64_]], %bb.1, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; SI: S_BRANCH %bb.3 + ; SI: bb.1.Flow: + ; SI: successors: %bb.2(0x40000000), %bb.4(0x40000000) + ; SI: [[PHI:%[0-9]+]]:vgpr_32 = PHI undef %15:vgpr_32, %bb.0, %4, %bb.3 + ; SI: [[SI_ELSE:%[0-9]+]]:sreg_64 = SI_ELSE killed [[SI_IF]], %bb.4, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; SI: S_BRANCH %bb.2 + ; SI: bb.2.if: + ; SI: successors: %bb.4(0x80000000) + ; SI: %3:vgpr_32 = nofpexcept V_ADD_F32_e32 killed [[COPY]], [[COPY]], implicit $mode, implicit $exec + ; SI: S_BRANCH %bb.4 + ; SI: bb.3.else: + ; SI: successors: %bb.1(0x80000000) + ; SI: %4:vgpr_32 = nofpexcept V_MUL_F32_e32 1077936128, [[COPY]], implicit $mode, implicit $exec + ; SI: S_BRANCH %bb.1 + ; SI: bb.4.end: + ; SI: [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[COPY]], %bb.1, %3, %bb.2 + ; SI: [[PHI2:%[0-9]+]]:vgpr_32 = PHI [[PHI]], %bb.1, %3, %bb.2 + ; SI: SI_END_CF killed [[SI_ELSE]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; SI: %14:vgpr_32 = 
nofpexcept V_ADD_F32_e32 killed [[PHI1]], killed [[PHI2]], implicit $mode, implicit $exec + ; SI: $vgpr0 = COPY killed %14 + ; SI: SI_RETURN_TO_EPILOG killed $vgpr0 +main_body: + %cc = icmp sgt i32 %z, 5 + br i1 %cc, label %if, label %else + +if: + %v.if = fmul float %v, 2.0 + br label %end + +else: + %v.else = fmul float %v, 3.0 + br label %end + +end: + %r0 = phi float [ %v.if, %if ], [ %v, %else ] + %r1 = phi float [ %v.if, %if ], [ %v.else, %else ] + %r2 = fadd float %r0, %r1 + ret float %r2 +} + +; if-else inside loop, %x can be optimized, but %v cannot be. +define amdgpu_ps float @else3(i32 %z, float %v, i32 inreg %bound, i32 %x0) #0 { + ; SI-LABEL: name: else3 + ; SI: bb.0.entry: + ; SI: successors: %bb.1(0x80000000) + ; SI: liveins: $vgpr0, $vgpr1, $sgpr0, $vgpr2 + ; SI: [[COPY:%[0-9]+]]:vgpr_32 = COPY killed $vgpr2 + ; SI: [[COPY1:%[0-9]+]]:sgpr_32 = COPY killed $sgpr0 + ; SI: [[COPY2:%[0-9]+]]:vgpr_32 = COPY killed $vgpr1 + ; SI: [[COPY3:%[0-9]+]]:vgpr_32 = COPY killed $vgpr0 + ; SI: [[V_CMP_GT_I32_e64_:%[0-9]+]]:sreg_64 = V_CMP_GT_I32_e64 6, killed [[COPY3]], implicit $exec + ; SI: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; SI: bb.1.for.body: + ; SI: successors: %bb.4(0x40000000), %bb.2(0x40000000) + ; SI: [[PHI:%[0-9]+]]:sreg_32 = PHI [[S_MOV_B32_]], %bb.0, %14, %bb.5 + ; SI: [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[COPY]], %bb.0, %13, %bb.5 + ; SI: [[SI_IF:%[0-9]+]]:sreg_64 = SI_IF [[V_CMP_GT_I32_e64_]], %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; SI: S_BRANCH %bb.4 + ; SI: bb.2.Flow: + ; SI: successors: %bb.3(0x40000000), %bb.5(0x40000000) + ; SI: [[PHI2:%[0-9]+]]:vgpr_32 = PHI undef %36:vgpr_32, %bb.1, %10, %bb.4 + ; SI: [[PHI3:%[0-9]+]]:vgpr_32 = PHI undef %37:vgpr_32, %bb.1, %9, %bb.4 + ; SI: [[PHI4:%[0-9]+]]:vgpr_32 = PHI [[PHI1]], %bb.1, undef %40:vgpr_32, %bb.4 + ; SI: [[SI_ELSE:%[0-9]+]]:sreg_64 = SI_ELSE killed [[SI_IF]], %bb.5, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; SI: S_BRANCH %bb.3 
+ ; SI: bb.3.if: + ; SI: successors: %bb.5(0x80000000) + ; SI: %7:vgpr_32 = nofpexcept V_MUL_F32_e32 [[PHI]], [[COPY2]], implicit $mode, implicit $exec + ; SI: %8:vgpr_32, dead %32:sreg_64 = V_ADD_CO_U32_e64 1, killed [[PHI4]], 0, implicit $exec + ; SI: S_BRANCH %bb.5 + ; SI: bb.4.else: + ; SI: successors: %bb.2(0x80000000) + ; SI: %9:vgpr_32 = nofpexcept V_MUL_F32_e32 [[COPY2]], [[PHI1]], implicit $mode, implicit $exec + ; SI: [[V_MUL_LO_U32_e64_:%[0-9]+]]:vgpr_32 = V_MUL_LO_U32_e64 killed [[PHI1]], 3, implicit $exec + ; SI: [[COPY4:%[0-9]+]]:vgpr_32 = COPY killed [[V_MUL_LO_U32_e64_]] + ; SI: S_BRANCH %bb.2 + ; SI: bb.5.if.end: + ; SI: successors: %bb.6(0x04000000), %bb.1(0x7c000000) + ; SI: [[PHI5:%[0-9]+]]:vgpr_32 = PHI [[PHI3]], %bb.2, %7, %bb.3 + ; SI: [[PHI6:%[0-9]+]]:vgpr_32 = PHI [[PHI2]], %bb.2, %8, %bb.3 + ; SI: SI_END_CF killed [[SI_ELSE]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; SI: %13:vgpr_32, dead %34:sreg_64 = V_ADD_CO_U32_e64 1, [[PHI6]], 0, implicit $exec + ; SI: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 killed [[PHI]], 1, implicit-def dead $scc + ; SI: S_CMP_LT_I32 [[S_ADD_I32_]], [[COPY1]], implicit-def $scc + ; SI: S_CBRANCH_SCC1 %bb.1, implicit killed $scc + ; SI: S_BRANCH %bb.6 + ; SI: bb.6.for.end: + ; SI: %35:vgpr_32 = nofpexcept V_ADD_F32_e32 killed [[PHI6]], killed [[PHI5]], implicit $mode, implicit $exec + ; SI: $vgpr0 = COPY killed %35 + ; SI: SI_RETURN_TO_EPILOG killed $vgpr0 +entry: +; %break = icmp sgt i32 %bound, 0 +; br i1 %break, label %for.body, label %for.end + br label %for.body + +for.body: + %i = phi i32 [ 0, %entry ], [ %inc, %if.end ] + %x = phi i32 [ %x0, %entry ], [ %xinc, %if.end ] + %cc = icmp sgt i32 %z, 5 + br i1 %cc, label %if, label %else + +if: + %i.tmp = bitcast i32 %i to float + %v.if = fmul float %v, %i.tmp + %x.if = add i32 %x, 1 + br label %if.end + +else: + %x.tmp = bitcast i32 %x to float + %v.else = fmul float %v, %x.tmp + %x.else = mul i32 %x, 3 + br label %if.end + +if.end: + 
%v.endif = phi float [ %v.if, %if ], [ %v.else, %else ] + %x.endif = phi i32 [ %x.if, %if ], [ %x.else, %else ] + + %xinc = add i32 %x.endif, 1 + %inc = add i32 %i, 1 + %cond = icmp slt i32 %inc, %bound + br i1 %cond, label %for.body, label %for.end + +for.end: + %x_float = bitcast i32 %x.endif to float + %r = fadd float %x_float, %v.endif + ret float %r +} + +attributes #0 = { nounwind } diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-liverange.ll b/llvm/test/CodeGen/AMDGPU/vgpr-liverange.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/vgpr-liverange.ll @@ -0,0 +1,156 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -march=amdgcn -mcpu=tonga -amdgpu-opt-vgpr-liverange=true -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s + +; a normal if-else +define amdgpu_ps float @else1(i32 %z, float %v) #0 { +; SI-LABEL: else1: +; SI: ; %bb.0: ; %main_body +; SI-NEXT: v_cmp_gt_i32_e32 vcc, 6, v0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: s_and_saveexec_b64 s[0:1], vcc +; SI-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; SI-NEXT: ; %bb.1: ; %else +; SI-NEXT: v_mul_f32_e32 v0, 0x40400000, v1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; %bb.2: ; %Flow +; SI-NEXT: s_or_saveexec_b64 s[0:1], s[0:1] +; SI-NEXT: s_xor_b64 exec, exec, s[0:1] +; SI-NEXT: ; %bb.3: ; %if +; SI-NEXT: v_add_f32_e32 v0, v1, v1 +; SI-NEXT: ; %bb.4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[0:1] +; SI-NEXT: ; return to shader part epilog +main_body: + %cc = icmp sgt i32 %z, 5 + br i1 %cc, label %if, label %else + +if: + %v.if = fmul float %v, 2.0 + br label %end + +else: + %v.else = fmul float %v, 3.0 + br label %end + +end: + %r = phi float [ %v.if, %if ], [ %v.else, %else ] + ret float %r +} + + +; %v was used after if-else +define amdgpu_ps float @else2(i32 %z, float %v) #0 { +; SI-LABEL: else2: +; SI: ; %bb.0: ; %main_body +; SI-NEXT: v_cmp_gt_i32_e32 vcc, 6, v0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: s_and_saveexec_b64 s[0:1], 
vcc +; SI-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; SI-NEXT: ; %bb.1: ; %else +; SI-NEXT: v_mul_f32_e32 v0, 0x40400000, v1 +; SI-NEXT: ; %bb.2: ; %Flow +; SI-NEXT: s_or_saveexec_b64 s[0:1], s[0:1] +; SI-NEXT: s_xor_b64 exec, exec, s[0:1] +; SI-NEXT: ; %bb.3: ; %if +; SI-NEXT: v_add_f32_e32 v1, v1, v1 +; SI-NEXT: v_mov_b32_e32 v0, v1 +; SI-NEXT: ; %bb.4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[0:1] +; SI-NEXT: v_add_f32_e32 v0, v1, v0 +; SI-NEXT: ; return to shader part epilog +main_body: + %cc = icmp sgt i32 %z, 5 + br i1 %cc, label %if, label %else + +if: + %v.if = fmul float %v, 2.0 + br label %end + +else: + %v.else = fmul float %v, 3.0 + br label %end + +end: + %r0 = phi float [ %v.if, %if ], [ %v, %else ] + %r1 = phi float [ %v.if, %if ], [ %v.else, %else ] + %r2 = fadd float %r0, %r1 + ret float %r2 +} + +; if-else inside loop, %x can be optimized, but %v cannot be. +define amdgpu_ps float @else3(i32 %z, float %v, i32 inreg %bound, i32 %x0) #0 { +; SI-LABEL: else3: +; SI: ; %bb.0: ; %entry +; SI-NEXT: v_cmp_gt_i32_e32 vcc, 6, v0 +; SI-NEXT: s_mov_b32 s1, 0 +; SI-NEXT: s_branch BB2_2 +; SI-NEXT: BB2_1: ; %if.end +; SI-NEXT: ; in Loop: Header=BB2_2 Depth=1 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_add_i32 s1, s1, 1 +; SI-NEXT: s_cmp_lt_i32 s1, s0 +; SI-NEXT: v_add_u32_e64 v2, s[2:3], 1, v0 +; SI-NEXT: s_cbranch_scc0 BB2_6 +; SI-NEXT: BB2_2: ; %for.body +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: s_and_saveexec_b64 s[2:3], vcc +; SI-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; SI-NEXT: ; %bb.3: ; %else +; SI-NEXT: ; in Loop: Header=BB2_2 Depth=1 +; SI-NEXT: v_mul_lo_u32 v0, v2, 3 +; SI-NEXT: v_mul_f32_e32 v3, v1, v2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; %bb.4: ; %Flow +; SI-NEXT: ; in Loop: Header=BB2_2 Depth=1 +; SI-NEXT: s_or_saveexec_b64 s[4:5], s[2:3] +; SI-NEXT: s_xor_b64 exec, exec, s[4:5] +; SI-NEXT: s_cbranch_execz BB2_1 +; SI-NEXT: ; %bb.5: ; %if +; 
SI-NEXT: ; in Loop: Header=BB2_2 Depth=1 +; SI-NEXT: v_mul_f32_e32 v3, s1, v1 +; SI-NEXT: v_add_u32_e64 v0, s[2:3], 1, v2 +; SI-NEXT: s_branch BB2_1 +; SI-NEXT: BB2_6: ; %for.end +; SI-NEXT: v_add_f32_e32 v0, v0, v3 +; SI-NEXT: ; return to shader part epilog +entry: +; %break = icmp sgt i32 %bound, 0 +; br i1 %break, label %for.body, label %for.end + br label %for.body + +for.body: + %i = phi i32 [ 0, %entry ], [ %inc, %if.end ] + %x = phi i32 [ %x0, %entry ], [ %xinc, %if.end ] + %cc = icmp sgt i32 %z, 5 + br i1 %cc, label %if, label %else + +if: + %i.tmp = bitcast i32 %i to float + %v.if = fmul float %v, %i.tmp + %x.if = add i32 %x, 1 + br label %if.end + +else: + %x.tmp = bitcast i32 %x to float + %v.else = fmul float %v, %x.tmp + %x.else = mul i32 %x, 3 + br label %if.end + +if.end: + %v.endif = phi float [ %v.if, %if ], [ %v.else, %else ] + %x.endif = phi i32 [ %x.if, %if ], [ %x.else, %else ] + + %xinc = add i32 %x.endif, 1 + %inc = add i32 %i, 1 + %cond = icmp slt i32 %inc, %bound + br i1 %cond, label %for.body, label %for.end + +for.end: + %x_float = bitcast i32 %x.endif to float + %r = fadd float %x_float, %v.endif + ret float %r +} + +attributes #0 = { nounwind }