diff --git a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp --- a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp +++ b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp @@ -67,6 +67,7 @@ #include "AMDGPU.h" #include "GCNSubtarget.h" #include "MCTargetDesc/AMDGPUMCTargetDesc.h" +#include "llvm/ADT/SetOperations.h" #include "llvm/CodeGen/MachineDominators.h" #include "llvm/InitializePasses.h" #include "llvm/Target/TargetMachine.h" @@ -81,9 +82,9 @@ cl::init(true)); namespace { - class SIFixSGPRCopies : public MachineFunctionPass { MachineDominatorTree *MDT; + unsigned NextVGPRToSGPRCopyID; public: static char ID; @@ -92,9 +93,16 @@ const SIRegisterInfo *TRI; const SIInstrInfo *TII; - SIFixSGPRCopies() : MachineFunctionPass(ID) {} + SIFixSGPRCopies() : MachineFunctionPass(ID), NextVGPRToSGPRCopyID(0) {} bool runOnMachineFunction(MachineFunction &MF) override; + unsigned getNextVGPRToSGPRCopyId() { return ++NextVGPRToSGPRCopyID; } + void lowerVGPR2SGPRCopies(MachineFunction &MF); + // Handles copies whose source register is: + // 1. Physical register + // 2. AGPR + // 3. Defined by an instruction that merely moves an immediate + bool lowerSpecialCase(MachineInstr &MI); MachineBasicBlock *processPHINode(MachineInstr &MI); @@ -569,6 +577,14 @@ TII = ST.getInstrInfo(); MDT = &getAnalysis(); + // We have to lower VGPR to SGPR copies before the main loop + // because the REG_SEQUENCE and PHI lowering in the main loop + // converts the def-use chains to VALU and closes the opportunities + // for keeping them scalar. + // TODO: REG_SEQUENCE and PHIs are semantically copies. The next patch + // addresses their lowering and unifies the processing in one main loop. 
+ lowerVGPR2SGPRCopies(MF); + for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); BI != BE; ++BI) { MachineBasicBlock *MBB = &*BI; @@ -640,42 +656,7 @@ continue; } - if (isVGPRToSGPRCopy(SrcRC, DstRC, *TRI)) { - Register SrcReg = MI.getOperand(1).getReg(); - if (!SrcReg.isVirtual()) { - MachineBasicBlock *NewBB = TII->moveToVALU(MI, MDT); - if (NewBB && NewBB != MBB) { - MBB = NewBB; - E = MBB->end(); - BI = MachineFunction::iterator(MBB); - BE = MF.end(); - } - assert((!NewBB || NewBB == I->getParent()) && - "moveToVALU did not return the right basic block"); - break; - } - - MachineInstr *DefMI = MRI->getVRegDef(SrcReg); - unsigned SMovOp; - int64_t Imm; - // If we are just copying an immediate, we can replace the copy with - // s_mov_b32. - if (isSafeToFoldImmIntoCopy(&MI, DefMI, TII, SMovOp, Imm)) { - MI.getOperand(1).ChangeToImmediate(Imm); - MI.addImplicitDefUseOperands(MF); - MI.setDesc(TII->get(SMovOp)); - break; - } - MachineBasicBlock *NewBB = TII->moveToVALU(MI, MDT); - if (NewBB && NewBB != MBB) { - MBB = NewBB; - E = MBB->end(); - BI = MachineFunction::iterator(MBB); - BE = MF.end(); - } - assert((!NewBB || NewBB == I->getParent()) && - "moveToVALU did not return the right basic block"); - } else if (isSGPRToVGPRCopy(SrcRC, DstRC, *TRI)) { + if (isSGPRToVGPRCopy(SrcRC, DstRC, *TRI)) { tryChangeVGPRtoSGPRinCopy(MI, TRI, TII); } @@ -916,3 +897,269 @@ } return CreatedBB; } + +bool SIFixSGPRCopies::lowerSpecialCase(MachineInstr &MI) { + MachineBasicBlock *MBB = MI.getParent(); + const TargetRegisterClass *SrcRC, *DstRC; + std::tie(SrcRC, DstRC) = getCopyRegClasses(MI, *TRI, *MRI); + + // Returning true indicates that no further processing is needed + if (!isVGPRToSGPRCopy(SrcRC, DstRC, *TRI)) + return true; + + Register SrcReg = MI.getOperand(1).getReg(); + if (!SrcReg.isVirtual() || TRI->isAGPR(*MRI, SrcReg)) { + TII->moveToVALU(MI, MDT); + return true; + } + + unsigned SMovOp; + int64_t Imm; + // If we are just copying an immediate, we can 
replace the copy with + // s_mov_b32. + if (isSafeToFoldImmIntoCopy(&MI, MRI->getVRegDef(SrcReg), TII, SMovOp, Imm)) { + MI.getOperand(1).ChangeToImmediate(Imm); + MI.addImplicitDefUseOperands(*MBB->getParent()); + MI.setDesc(TII->get(SMovOp)); + return true; + } + return false; +} + +class V2SCopyInfo { +public: + // VGPR to SGPR copy being processed + MachineInstr *Copy; + // All SALU instructions reachable from this copy in the SSA graph + DenseSet SChain; + // Number of SGPR to VGPR copies that are used to put the SALU computation + // results back to VALU. + unsigned NumSVCopies; + + unsigned Score; + // Actual count of v_readfirstlane_b32 instructions + // which need to be inserted to keep SChain SALU + unsigned NumReadfirstlanes; + // Current score state. Used to speed up selecting V2SCopyInfos for processing + bool NeedToBeConvertedToVALU = false; + // Unique ID. Used as a key for mapping to keep permanent order. + unsigned ID; + + // Count of other VGPR to SGPR copies that contribute to the + // current copy's SChain + unsigned SiblingPenalty = 0; + SetVector Siblings; + V2SCopyInfo() : Copy(nullptr), ID(0){}; + V2SCopyInfo(unsigned Id, MachineInstr *C, unsigned Width) + : Copy(C), NumSVCopies(0), NumReadfirstlanes(Width / 32), ID(Id){}; +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + void dump() { + dbgs() << ID << " : " << *Copy << "\n\tS:" << SChain.size() + << "\n\tSV:" << NumSVCopies << "\n\tSP: " << SiblingPenalty + << "\nScore: " << Score << "\n"; + } +#endif +}; + +void SIFixSGPRCopies::lowerVGPR2SGPRCopies(MachineFunction &MF) { + + DenseMap Copies; + DenseMap> SiblingPenalty; + + // The main function that computes the VGPR to SGPR copy score + // and determines how the copy is lowered: v_readfirstlane_b32 or moveToVALU + auto needToBeConvertedToVALU = [&](V2SCopyInfo *I) -> bool { + if (I->SChain.empty()) + return true; + I->Siblings = SiblingPenalty[*std::max_element( + I->SChain.begin(), I->SChain.end(), + [&](MachineInstr *A, MachineInstr *B) -> bool { + 
return SiblingPenalty[A].size() < SiblingPenalty[B].size(); + })]; + I->Siblings.remove_if([&](unsigned ID) { return ID == I->ID; }); + // The loop below computes the number of other VGPR to SGPR copies + // which contribute to the current copy's SALU chain. We assume that all the + // copies with the same source virtual register will be squashed to one by + // regalloc. We also take care of copies of different subregs of the + // same register. + SmallSet, 4> SrcRegs; + for (auto J : I->Siblings) { + auto InfoIt = Copies.find(J); + if (InfoIt != Copies.end()) { + MachineInstr *SiblingCopy = InfoIt->getSecond().Copy; + if (SiblingCopy->isImplicitDef()) + // the COPY has already been MoveToVALUed + continue; + + SrcRegs.insert(std::make_pair(SiblingCopy->getOperand(1).getReg(), + SiblingCopy->getOperand(1).getSubReg())); + } + } + I->SiblingPenalty = SrcRegs.size(); + + unsigned Penalty = + I->NumSVCopies + I->SiblingPenalty + I->NumReadfirstlanes; + unsigned Profit = I->SChain.size(); + I->Score = Penalty > Profit ? 
0 : Profit - Penalty; + I->NeedToBeConvertedToVALU = I->Score < 3; + return I->NeedToBeConvertedToVALU; + }; + + auto needProcessing = [](MachineInstr &MI) -> bool { + switch (MI.getOpcode()) { + case AMDGPU::COPY: + case AMDGPU::WQM: + case AMDGPU::STRICT_WQM: + case AMDGPU::SOFT_WQM: + case AMDGPU::STRICT_WWM: + return true; + default: + return false; + } + }; + + for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); BI != BE; + ++BI) { + MachineBasicBlock *MBB = &*BI; + for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end(); I != E; + ++I) { + MachineInstr &MI = *I; + if (!needProcessing(MI)) + continue; + if (lowerSpecialCase(MI)) + continue; + + // Compute the COPY width to pass to the V2SCopyInfo constructor + Register DstReg = MI.getOperand(0).getReg(); + + const TargetRegisterClass *DstRC = TRI->getRegClassForReg(*MRI, DstReg); + + V2SCopyInfo In(getNextVGPRToSGPRCopyId(), &MI, + TRI->getRegSizeInBits(*DstRC)); + + SmallVector AnalysisWorklist; + // Needed because the SSA is not a tree but a graph and may have + // forks and joins, so we must not walk the same path twice. + DenseSet Visited; + AnalysisWorklist.push_back(&MI); + while (!AnalysisWorklist.empty()) { + + MachineInstr *Inst = AnalysisWorklist.pop_back_val(); + + if (!Visited.insert(Inst).second) + continue; + + // Copies and REG_SEQUENCE do not contribute to the final assembly, + // so skip them but take care of the SGPR to VGPR copies bookkeeping. 
+ if (Inst->isCopy() || Inst->isRegSequence()) { + if (TRI->isVGPR(*MRI, Inst->getOperand(0).getReg())) { + if (!Inst->isCopy() || + !tryChangeVGPRtoSGPRinCopy(*Inst, TRI, TII)) { + In.NumSVCopies++; + continue; + } + } + } + + SiblingPenalty[Inst].insert(In.ID); + + SmallVector Users; + if ((TII->isSALU(*Inst) && Inst->isCompare()) || + (Inst->isCopy() && Inst->getOperand(0).getReg() == AMDGPU::SCC)) { + auto I = Inst->getIterator(); + auto E = Inst->getParent()->end(); + while (++I != E && !I->findRegisterDefOperand(AMDGPU::SCC)) { + if (I->readsRegister(AMDGPU::SCC)) + Users.push_back(&*I); + } + } else if (Inst->getNumExplicitDefs() != 0) { + Register Reg = Inst->getOperand(0).getReg(); + if (TRI->isSGPRReg(*MRI, Reg)) + for (auto &U : MRI->use_instructions(Reg)) + Users.push_back(&U); + } + for (auto U : Users) { + if (TII->isSALU(*U)) + In.SChain.insert(U); + AnalysisWorklist.push_back(U); + } + } + Copies[In.ID] = In; + } + } + + SmallVector LoweringWorklist; + for (auto &C : Copies) { + if (needToBeConvertedToVALU(&C.second)) + LoweringWorklist.push_back(C.second.ID); + } + + while (!LoweringWorklist.empty()) { + unsigned CurID = LoweringWorklist.pop_back_val(); + auto CurInfoIt = Copies.find(CurID); + if (CurInfoIt != Copies.end()) { + V2SCopyInfo C = CurInfoIt->getSecond(); + LLVM_DEBUG(dbgs() << "Processing ...\n"; C.dump()); + for (auto S : C.Siblings) { + auto SibInfoIt = Copies.find(S); + if (SibInfoIt != Copies.end()) { + V2SCopyInfo &SI = SibInfoIt->getSecond(); + LLVM_DEBUG(dbgs() << "Sibling:\n"; SI.dump()); + if (!SI.NeedToBeConvertedToVALU) { + set_subtract(SI.SChain, C.SChain); + if (needToBeConvertedToVALU(&SI)) + LoweringWorklist.push_back(SI.ID); + } + SI.Siblings.remove_if([&](unsigned ID) { return ID == C.ID; }); + } + } + LLVM_DEBUG(dbgs() << "V2S copy " << *C.Copy + << " is being turned to VALU\n"); + Copies.erase(C.ID); + TII->moveToVALU(*C.Copy, MDT); + } + } + + // Now do the actual lowering of the copies still left in Copies + for (auto C : Copies) { + MachineInstr *MI = 
C.second.Copy; + MachineBasicBlock *MBB = MI->getParent(); + // We decided to turn this V2S copy into v_readfirstlane_b32: + // emit the replacement instruction(s) and erase the original copy. + LLVM_DEBUG(dbgs() << "V2S copy " << *MI + << " is being turned to v_readfirstlane_b32" + << " Score: " << C.second.Score << "\n"); + Register DstReg = MI->getOperand(0).getReg(); + Register SrcReg = MI->getOperand(1).getReg(); + unsigned SubReg = MI->getOperand(1).getSubReg(); + const TargetRegisterClass *SrcRC = TRI->getRegClassForReg(*MRI, SrcReg); + SrcRC = TRI->getSubRegClass(SrcRC, SubReg); + size_t SrcSize = TRI->getRegSizeInBits(*SrcRC); + if (SrcSize == 16) { + // HACK to handle possible 16bit VGPR source + auto MIB = BuildMI(*MBB, MI, MI->getDebugLoc(), + TII->get(AMDGPU::V_READFIRSTLANE_B32), DstReg); + MIB.addReg(SrcReg, 0, AMDGPU::NoSubRegister); + } else if (SrcSize == 32) { + auto MIB = BuildMI(*MBB, MI, MI->getDebugLoc(), + TII->get(AMDGPU::V_READFIRSTLANE_B32), DstReg); + MIB.addReg(SrcReg, 0, SubReg); + } else { + auto Result = BuildMI(*MBB, MI, MI->getDebugLoc(), + TII->get(AMDGPU::REG_SEQUENCE), DstReg); + int N = TRI->getRegSizeInBits(*SrcRC) / 32; + for (int i = 0; i < N; i++) { + Register PartialSrc = TII->buildExtractSubReg( + Result, *MRI, MI->getOperand(1), SrcRC, + TRI->getSubRegFromChannel(i), &AMDGPU::VGPR_32RegClass); + Register PartialDst = + MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); + BuildMI(*MBB, *Result, Result->getDebugLoc(), + TII->get(AMDGPU::V_READFIRSTLANE_B32), PartialDst) + .addReg(PartialSrc); + Result.addReg(PartialDst).addImm(TRI->getSubRegFromChannel(i)); + } + } + MI->eraseFromParent(); + } +} diff --git a/llvm/test/CodeGen/AMDGPU/add3.ll b/llvm/test/CodeGen/AMDGPU/add3.ll --- a/llvm/test/CodeGen/AMDGPU/add3.ll +++ b/llvm/test/CodeGen/AMDGPU/add3.ll @@ -223,7 +223,7 @@ ; VI-NEXT: v_mov_b32_e32 v2, 0x40400000 ; VI-NEXT: v_add_f32_e32 v2, s4, v2 ; VI-NEXT: v_add_u32_e32 v0, vcc, v1, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, v2, v0 
+; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; VI-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: add3_uniform_vgpr: diff --git a/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll b/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll --- a/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll +++ b/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll @@ -521,175 +521,168 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg2, i64 %arg3, <2 x half> %arg4, <2 x half> %arg5) #3 { ; GFX908-LABEL: introduced_copy_to_sgpr: ; GFX908: ; %bb.0: ; %bb -; GFX908-NEXT: global_load_ushort v0, v[0:1], off glc +; GFX908-NEXT: global_load_ushort v24, v[0:1], off glc ; GFX908-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX908-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x10 -; GFX908-NEXT: s_load_dword s7, s[4:5], 0x18 -; GFX908-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 -; GFX908-NEXT: s_mov_b32 s6, 0 +; GFX908-NEXT: s_load_dword s8, s[4:5], 0x18 +; GFX908-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX908-NEXT: v_mov_b32_e32 v1, 0 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_cvt_f32_u32_e32 v1, s1 +; GFX908-NEXT: v_cvt_f32_u32_e32 v0, s1 ; GFX908-NEXT: s_sub_i32 s4, 0, s1 -; GFX908-NEXT: s_lshl_b64 s[10:11], s[2:3], 5 -; GFX908-NEXT: s_or_b32 s10, s10, 28 -; GFX908-NEXT: v_rcp_iflag_f32_e32 v1, v1 -; GFX908-NEXT: v_mov_b32_e32 v35, s10 -; GFX908-NEXT: s_lshr_b32 s12, s7, 16 -; GFX908-NEXT: v_mov_b32_e32 v10, s11 -; GFX908-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 -; GFX908-NEXT: v_cvt_u32_f32_e32 v2, v1 -; GFX908-NEXT: v_cvt_f32_f16_e32 v26, s7 -; GFX908-NEXT: v_cvt_f32_f16_e32 v27, s12 -; GFX908-NEXT: v_accvgpr_write_b32 a0, v35 -; GFX908-NEXT: v_mul_lo_u32 v1, s4, v2 -; GFX908-NEXT: v_accvgpr_write_b32 a1, v10 -; GFX908-NEXT: s_lshl_b64 s[4:5], s[8:9], 5 -; GFX908-NEXT: v_mul_hi_u32 v3, v2, v1 -; GFX908-NEXT: v_mov_b32_e32 v1, 0 -; GFX908-NEXT: v_add_u32_e32 v2, v2, v3 -; GFX908-NEXT: v_mul_hi_u32 v4, s0, v2 -; GFX908-NEXT: 
v_mul_lo_u32 v5, v4, s1 -; GFX908-NEXT: v_add_u32_e32 v6, 1, v4 -; GFX908-NEXT: v_sub_u32_e32 v5, s0, v5 -; GFX908-NEXT: v_cmp_le_u32_e32 vcc, s1, v5 -; GFX908-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc -; GFX908-NEXT: v_subrev_u32_e32 v6, s1, v5 -; GFX908-NEXT: v_cndmask_b32_e32 v5, v5, v6, vcc -; GFX908-NEXT: v_add_u32_e32 v7, 1, v4 -; GFX908-NEXT: v_cmp_le_u32_e32 vcc, s1, v5 +; GFX908-NEXT: s_lshr_b32 s11, s8, 16 +; GFX908-NEXT: v_cvt_f32_f16_e32 v25, s8 +; GFX908-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX908-NEXT: s_lshl_b64 s[8:9], s[2:3], 5 +; GFX908-NEXT: v_cvt_f32_f16_e32 v26, s11 +; GFX908-NEXT: s_or_b32 s8, s8, 28 +; GFX908-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 +; GFX908-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GFX908-NEXT: v_mov_b32_e32 v7, s3 +; GFX908-NEXT: s_mov_b32 s10, 0 +; GFX908-NEXT: v_mov_b32_e32 v6, s2 +; GFX908-NEXT: v_mul_lo_u32 v2, s4, v0 +; GFX908-NEXT: s_lshl_b64 s[4:5], s[6:7], 5 +; GFX908-NEXT: v_mul_hi_u32 v2, v0, v2 +; GFX908-NEXT: v_add_u32_e32 v0, v0, v2 +; GFX908-NEXT: v_mul_hi_u32 v0, s0, v0 +; GFX908-NEXT: v_mov_b32_e32 v2, s8 +; GFX908-NEXT: v_mov_b32_e32 v3, s9 +; GFX908-NEXT: v_mul_lo_u32 v4, v0, s1 +; GFX908-NEXT: v_add_u32_e32 v5, 1, v0 +; GFX908-NEXT: v_sub_u32_e32 v4, s0, v4 +; GFX908-NEXT: v_cmp_le_u32_e32 vcc, s1, v4 +; GFX908-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc +; GFX908-NEXT: v_subrev_u32_e32 v5, s1, v4 +; GFX908-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc +; GFX908-NEXT: v_add_u32_e32 v5, 1, v0 +; GFX908-NEXT: v_cmp_le_u32_e32 vcc, s1, v4 +; GFX908-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc +; GFX908-NEXT: v_lshlrev_b64 v[4:5], 5, v[0:1] ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_and_b32_e32 v28, 0xffff, v0 -; GFX908-NEXT: v_cndmask_b32_e32 v0, v4, v7, vcc -; GFX908-NEXT: v_mul_lo_u32 v8, s9, v28 -; GFX908-NEXT: v_mul_hi_u32 v9, s8, v28 -; GFX908-NEXT: v_lshlrev_b64 v[2:3], 5, v[0:1] -; GFX908-NEXT: v_mul_lo_u32 v6, s8, v28 -; GFX908-NEXT: v_add_u32_e32 v7, v9, v8 -; GFX908-NEXT: v_accvgpr_write_b32 a2, v2 -; GFX908-NEXT: 
v_accvgpr_write_b32 a3, v3 -; GFX908-NEXT: v_lshlrev_b64 v[6:7], 5, v[6:7] -; GFX908-NEXT: v_mov_b32_e32 v9, s3 -; GFX908-NEXT: v_mov_b32_e32 v8, s2 +; GFX908-NEXT: v_readfirstlane_b32 s0, v24 +; GFX908-NEXT: s_and_b32 s0, 0xffff, s0 +; GFX908-NEXT: s_mul_i32 s1, s7, s0 +; GFX908-NEXT: s_mul_hi_u32 s7, s6, s0 +; GFX908-NEXT: s_mul_i32 s0, s6, s0 +; GFX908-NEXT: s_add_i32 s1, s7, s1 +; GFX908-NEXT: s_lshl_b64 s[6:7], s[0:1], 5 ; GFX908-NEXT: s_branch .LBB3_2 ; GFX908-NEXT: .LBB3_1: ; %bb12 ; GFX908-NEXT: ; in Loop: Header=BB3_2 Depth=1 -; GFX908-NEXT: v_add_co_u32_e32 v8, vcc, v8, v0 -; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a1 -; GFX908-NEXT: v_accvgpr_read_b32 v5, a3 -; GFX908-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v9, vcc -; GFX908-NEXT: v_accvgpr_read_b32 v2, a0 -; GFX908-NEXT: v_accvgpr_read_b32 v4, a2 +; GFX908-NEXT: v_add_co_u32_e32 v6, vcc, v6, v0 +; GFX908-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v7, vcc ; GFX908-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 ; GFX908-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v5, vcc -; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: v_accvgpr_write_b32 a0, v2 -; GFX908-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX908-NEXT: .LBB3_2: ; %bb9 ; GFX908-NEXT: ; =>This Loop Header: Depth=1 ; GFX908-NEXT: ; Child Loop BB3_5 Depth 2 ; GFX908-NEXT: s_cbranch_scc0 .LBB3_1 ; GFX908-NEXT: ; %bb.3: ; %bb14 ; GFX908-NEXT: ; in Loop: Header=BB3_2 Depth=1 -; GFX908-NEXT: v_mov_b32_e32 v10, 0 -; GFX908-NEXT: v_mov_b32_e32 v11, 0 -; GFX908-NEXT: global_load_dwordx2 v[10:11], v[10:11], off -; GFX908-NEXT: s_mov_b32 s7, s6 -; GFX908-NEXT: v_cmp_gt_i64_e64 s[0:1], 0, v[8:9] -; GFX908-NEXT: v_accvgpr_read_b32 v13, a1 -; GFX908-NEXT: v_mov_b32_e32 v15, s7 -; GFX908-NEXT: v_mov_b32_e32 v17, s7 -; GFX908-NEXT: v_accvgpr_read_b32 v12, a0 -; GFX908-NEXT: v_mov_b32_e32 v14, s6 -; GFX908-NEXT: v_mov_b32_e32 v16, s6 +; GFX908-NEXT: v_mov_b32_e32 v8, 0 +; GFX908-NEXT: v_mov_b32_e32 v9, 0 +; GFX908-NEXT: global_load_dwordx2 v[8:9], v[8:9], off +; GFX908-NEXT: s_mov_b32 
s11, s10 +; GFX908-NEXT: v_mov_b32_e32 v13, s11 +; GFX908-NEXT: v_mov_b32_e32 v15, s11 +; GFX908-NEXT: v_mov_b32_e32 v17, s11 +; GFX908-NEXT: v_mov_b32_e32 v12, s10 +; GFX908-NEXT: v_mov_b32_e32 v14, s10 +; GFX908-NEXT: v_mov_b32_e32 v16, s10 +; GFX908-NEXT: v_cmp_gt_i64_e64 s[0:1], 0, v[6:7] +; GFX908-NEXT: v_mov_b32_e32 v11, v3 +; GFX908-NEXT: v_mov_b32_e32 v19, v13 +; GFX908-NEXT: v_mov_b32_e32 v10, v2 +; GFX908-NEXT: v_mov_b32_e32 v18, v12 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_add_co_u32_e32 v20, vcc, 1, v10 -; GFX908-NEXT: v_addc_co_u32_e32 v18, vcc, 0, v11, vcc -; GFX908-NEXT: v_mul_lo_u32 v21, s4, v18 -; GFX908-NEXT: v_mul_hi_u32 v22, s4, v20 -; GFX908-NEXT: v_mul_lo_u32 v23, s5, v20 -; GFX908-NEXT: v_mul_lo_u32 v29, s4, v20 -; GFX908-NEXT: v_mov_b32_e32 v19, s7 -; GFX908-NEXT: v_add_u32_e32 v20, v22, v21 -; GFX908-NEXT: v_add_u32_e32 v30, v20, v23 -; GFX908-NEXT: v_mov_b32_e32 v21, s7 -; GFX908-NEXT: v_mov_b32_e32 v18, s6 -; GFX908-NEXT: v_mov_b32_e32 v20, s6 +; GFX908-NEXT: v_readfirstlane_b32 s2, v8 +; GFX908-NEXT: v_readfirstlane_b32 s3, v9 +; GFX908-NEXT: s_add_u32 s2, s2, 1 +; GFX908-NEXT: s_addc_u32 s3, s3, 0 +; GFX908-NEXT: s_mul_hi_u32 s9, s4, s2 +; GFX908-NEXT: s_mul_i32 s11, s5, s2 +; GFX908-NEXT: s_mul_i32 s8, s4, s2 +; GFX908-NEXT: s_mul_i32 s2, s4, s3 +; GFX908-NEXT: s_add_i32 s2, s9, s2 +; GFX908-NEXT: s_add_i32 s9, s2, s11 ; GFX908-NEXT: s_branch .LBB3_5 ; GFX908-NEXT: .LBB3_4: ; %bb58 ; GFX908-NEXT: ; in Loop: Header=BB3_5 Depth=2 -; GFX908-NEXT: v_add_co_u32_e32 v10, vcc, v10, v28 -; GFX908-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v11, vcc -; GFX908-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[10:11] -; GFX908-NEXT: v_add_co_u32_e64 v12, s[2:3], v12, v6 -; GFX908-NEXT: v_addc_co_u32_e64 v13, s[2:3], v13, v7, s[2:3] +; GFX908-NEXT: v_add_co_u32_sdwa v8, vcc, v8, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX908-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v9, vcc +; GFX908-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[8:9] +; 
GFX908-NEXT: v_mov_b32_e32 v20, s7 +; GFX908-NEXT: v_add_co_u32_e64 v10, s[2:3], s6, v10 +; GFX908-NEXT: v_addc_co_u32_e64 v11, s[2:3], v11, v20, s[2:3] ; GFX908-NEXT: s_cbranch_vccz .LBB3_1 ; GFX908-NEXT: .LBB3_5: ; %bb16 ; GFX908-NEXT: ; Parent Loop BB3_2 Depth=1 ; GFX908-NEXT: ; => This Inner Loop Header: Depth=2 -; GFX908-NEXT: v_add_co_u32_e32 v22, vcc, v12, v29 -; GFX908-NEXT: v_addc_co_u32_e32 v23, vcc, v13, v30, vcc -; GFX908-NEXT: global_load_dword v32, v[22:23], off offset:-12 glc +; GFX908-NEXT: v_mov_b32_e32 v21, s9 +; GFX908-NEXT: v_add_co_u32_e32 v20, vcc, s8, v10 +; GFX908-NEXT: v_addc_co_u32_e32 v21, vcc, v11, v21, vcc +; GFX908-NEXT: global_load_dword v28, v[20:21], off offset:-12 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: global_load_dword v31, v[22:23], off offset:-8 glc +; GFX908-NEXT: global_load_dword v27, v[20:21], off offset:-8 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: global_load_dword v24, v[22:23], off offset:-4 glc +; GFX908-NEXT: global_load_dword v22, v[20:21], off offset:-4 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: global_load_dword v22, v[22:23], off glc +; GFX908-NEXT: global_load_dword v20, v[20:21], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: ds_read_b64 v[22:23], v1 -; GFX908-NEXT: ds_read_b64 v[24:25], v0 +; GFX908-NEXT: ds_read_b64 v[20:21], v1 +; GFX908-NEXT: ds_read_b64 v[22:23], v0 ; GFX908-NEXT: s_and_b64 vcc, exec, s[0:1] ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: s_cbranch_vccnz .LBB3_4 ; GFX908-NEXT: ; %bb.6: ; %bb51 ; GFX908-NEXT: ; in Loop: Header=BB3_5 Depth=2 -; GFX908-NEXT: v_cvt_f32_f16_sdwa v33, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX908-NEXT: v_cvt_f32_f16_e32 v32, v32 -; GFX908-NEXT: v_cvt_f32_f16_sdwa v34, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX908-NEXT: v_cvt_f32_f16_e32 v31, v31 -; GFX908-NEXT: v_add_f32_e32 v4, v26, v22 -; GFX908-NEXT: v_add_f32_e32 v5, v27, v23 -; GFX908-NEXT: v_add_f32_e32 v2, 0, v22 -; 
GFX908-NEXT: v_add_f32_e32 v3, 0, v23 -; GFX908-NEXT: v_add_f32_e32 v25, v33, v25 -; GFX908-NEXT: v_add_f32_e32 v24, v32, v24 -; GFX908-NEXT: v_add_f32_e32 v23, v34, v23 -; GFX908-NEXT: v_add_f32_e32 v22, v31, v22 -; GFX908-NEXT: v_add_f32_e32 v15, v15, v5 -; GFX908-NEXT: v_add_f32_e32 v14, v14, v4 -; GFX908-NEXT: v_add_f32_e32 v17, v17, v3 -; GFX908-NEXT: v_add_f32_e32 v16, v16, v2 -; GFX908-NEXT: v_add_f32_e32 v18, v18, v24 -; GFX908-NEXT: v_add_f32_e32 v19, v19, v25 -; GFX908-NEXT: v_add_f32_e32 v20, v20, v22 -; GFX908-NEXT: v_add_f32_e32 v21, v21, v23 +; GFX908-NEXT: v_cvt_f32_f16_sdwa v29, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX908-NEXT: v_cvt_f32_f16_e32 v28, v28 +; GFX908-NEXT: v_cvt_f32_f16_sdwa v30, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX908-NEXT: v_cvt_f32_f16_e32 v27, v27 +; GFX908-NEXT: v_add_f32_e32 v31, v25, v20 +; GFX908-NEXT: v_add_f32_e32 v32, v26, v21 +; GFX908-NEXT: v_add_f32_e32 v33, 0, v20 +; GFX908-NEXT: v_add_f32_e32 v34, 0, v21 +; GFX908-NEXT: v_add_f32_e32 v23, v29, v23 +; GFX908-NEXT: v_add_f32_e32 v22, v28, v22 +; GFX908-NEXT: v_add_f32_e32 v21, v30, v21 +; GFX908-NEXT: v_add_f32_e32 v20, v27, v20 +; GFX908-NEXT: v_add_f32_e32 v13, v13, v32 +; GFX908-NEXT: v_add_f32_e32 v12, v12, v31 +; GFX908-NEXT: v_add_f32_e32 v15, v15, v34 +; GFX908-NEXT: v_add_f32_e32 v14, v14, v33 +; GFX908-NEXT: v_add_f32_e32 v16, v16, v22 +; GFX908-NEXT: v_add_f32_e32 v17, v17, v23 +; GFX908-NEXT: v_add_f32_e32 v18, v18, v20 +; GFX908-NEXT: v_add_f32_e32 v19, v19, v21 ; GFX908-NEXT: s_branch .LBB3_4 ; ; GFX90A-LABEL: introduced_copy_to_sgpr: ; GFX90A: ; %bb.0: ; %bb -; GFX90A-NEXT: global_load_ushort v10, v[0:1], off glc +; GFX90A-NEXT: global_load_ushort v28, v[0:1], off glc ; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x8 -; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x10 +; GFX90A-NEXT: s_load_dwordx2 s[10:11], s[4:5], 0x10 ; GFX90A-NEXT: s_load_dword s2, s[4:5], 
0x18 ; GFX90A-NEXT: v_mov_b32_e32 v1, 0 -; GFX90A-NEXT: s_mov_b32 s4, 0 +; GFX90A-NEXT: s_mov_b32 s8, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_cvt_f32_u32_e32 v0, s7 -; GFX90A-NEXT: s_sub_i32 s5, 0, s7 +; GFX90A-NEXT: s_sub_i32 s9, 0, s7 +; GFX90A-NEXT: s_lshl_b64 s[4:5], s[10:11], 5 +; GFX90A-NEXT: s_or_b32 s4, s4, 28 +; GFX90A-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX90A-NEXT: s_lshr_b32 s12, s2, 16 +; GFX90A-NEXT: v_pk_mov_b32 v[6:7], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NEXT: v_cvt_f32_f16_e32 v2, s2 -; GFX90A-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX90A-NEXT: v_cvt_f32_f16_e32 v3, s12 -; GFX90A-NEXT: s_lshl_b64 s[10:11], s[8:9], 5 -; GFX90A-NEXT: s_or_b32 s10, s10, 28 ; GFX90A-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX90A-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GFX90A-NEXT: v_cvt_f32_f16_e32 v3, s12 ; GFX90A-NEXT: s_lshl_b64 s[2:3], s[0:1], 5 -; GFX90A-NEXT: v_pk_mov_b32 v[4:5], s[8:9], s[8:9] op_sel:[0,1] -; GFX90A-NEXT: v_pk_mov_b32 v[6:7], s[10:11], s[10:11] op_sel:[0,1] -; GFX90A-NEXT: v_mul_lo_u32 v8, s5, v0 +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], s[10:11], s[10:11] op_sel:[0,1] +; GFX90A-NEXT: v_mul_lo_u32 v8, s9, v0 ; GFX90A-NEXT: v_mul_hi_u32 v8, v0, v8 ; GFX90A-NEXT: v_add_u32_e32 v0, v0, v8 ; GFX90A-NEXT: v_mul_hi_u32 v0, s6, v0 @@ -704,14 +697,15 @@ ; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s7, v8 ; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v9, vcc ; GFX90A-NEXT: v_lshlrev_b64 v[8:9], 5, v[0:1] +; GFX90A-NEXT: v_pk_mov_b32 v[10:11], 0, 0 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_and_b32_e32 v30, 0xffff, v10 -; GFX90A-NEXT: v_mul_lo_u32 v11, s1, v30 -; GFX90A-NEXT: v_mul_hi_u32 v12, s0, v30 -; GFX90A-NEXT: v_mul_lo_u32 v10, s0, v30 -; GFX90A-NEXT: v_add_u32_e32 v11, v12, v11 -; GFX90A-NEXT: v_lshlrev_b64 v[10:11], 5, v[10:11] -; GFX90A-NEXT: v_pk_mov_b32 v[12:13], 0, 0 +; GFX90A-NEXT: v_readfirstlane_b32 s4, v28 +; GFX90A-NEXT: s_and_b32 s4, 0xffff, s4 +; GFX90A-NEXT: s_mul_i32 s1, s1, s4 +; GFX90A-NEXT: s_mul_hi_u32 s5, s0, s4 +; GFX90A-NEXT: 
s_mul_i32 s0, s0, s4 +; GFX90A-NEXT: s_add_i32 s1, s5, s1 +; GFX90A-NEXT: s_lshl_b64 s[4:5], s[0:1], 5 ; GFX90A-NEXT: s_branch .LBB3_2 ; GFX90A-NEXT: .LBB3_1: ; %bb12 ; GFX90A-NEXT: ; in Loop: Header=BB3_2 Depth=1 @@ -725,66 +719,70 @@ ; GFX90A-NEXT: s_cbranch_scc0 .LBB3_1 ; GFX90A-NEXT: ; %bb.3: ; %bb14 ; GFX90A-NEXT: ; in Loop: Header=BB3_2 Depth=1 -; GFX90A-NEXT: global_load_dwordx2 v[14:15], v[12:13], off -; GFX90A-NEXT: s_mov_b32 s5, s4 +; GFX90A-NEXT: global_load_dwordx2 v[12:13], v[10:11], off +; GFX90A-NEXT: s_mov_b32 s9, s8 +; GFX90A-NEXT: v_pk_mov_b32 v[16:17], s[8:9], s[8:9] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[18:19], s[8:9], s[8:9] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[20:21], s[8:9], s[8:9] op_sel:[0,1] ; GFX90A-NEXT: v_cmp_gt_i64_e64 s[0:1], 0, v[4:5] -; GFX90A-NEXT: v_pk_mov_b32 v[16:17], v[6:7], v[6:7] op_sel:[0,1] -; GFX90A-NEXT: v_pk_mov_b32 v[18:19], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NEXT: v_pk_mov_b32 v[20:21], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NEXT: v_pk_mov_b32 v[22:23], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[14:15], v[6:7], v[6:7] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[22:23], v[16:17], v[16:17] op_sel:[0,1] ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v24, vcc, 1, v14 -; GFX90A-NEXT: v_addc_co_u32_e32 v25, vcc, 0, v15, vcc -; GFX90A-NEXT: v_mul_lo_u32 v25, s2, v25 -; GFX90A-NEXT: v_mul_hi_u32 v26, s2, v24 -; GFX90A-NEXT: v_mul_lo_u32 v27, s3, v24 -; GFX90A-NEXT: v_mul_lo_u32 v31, s2, v24 -; GFX90A-NEXT: v_add_u32_e32 v24, v26, v25 -; GFX90A-NEXT: v_add_u32_e32 v32, v24, v27 -; GFX90A-NEXT: v_pk_mov_b32 v[24:25], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NEXT: v_readfirstlane_b32 s6, v12 +; GFX90A-NEXT: v_readfirstlane_b32 s7, v13 +; GFX90A-NEXT: s_add_u32 s6, s6, 1 +; GFX90A-NEXT: s_addc_u32 s7, s7, 0 +; GFX90A-NEXT: s_mul_hi_u32 s9, s2, s6 +; GFX90A-NEXT: s_mul_i32 s7, s2, s7 +; GFX90A-NEXT: s_mul_i32 s10, s3, s6 +; GFX90A-NEXT: s_add_i32 s7, s9, s7 +; GFX90A-NEXT: s_mul_i32 
s6, s2, s6 +; GFX90A-NEXT: s_add_i32 s7, s7, s10 ; GFX90A-NEXT: s_branch .LBB3_5 ; GFX90A-NEXT: .LBB3_4: ; %bb58 ; GFX90A-NEXT: ; in Loop: Header=BB3_5 Depth=2 -; GFX90A-NEXT: v_add_co_u32_e32 v14, vcc, v14, v30 -; GFX90A-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v15, vcc -; GFX90A-NEXT: v_add_co_u32_e32 v16, vcc, v16, v10 -; GFX90A-NEXT: v_addc_co_u32_e32 v17, vcc, v17, v11, vcc -; GFX90A-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[14:15] +; GFX90A-NEXT: v_add_co_u32_sdwa v12, vcc, v12, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX90A-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v13, vcc +; GFX90A-NEXT: v_mov_b32_e32 v24, s5 +; GFX90A-NEXT: v_add_co_u32_e32 v14, vcc, s4, v14 +; GFX90A-NEXT: v_addc_co_u32_e32 v15, vcc, v15, v24, vcc +; GFX90A-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[12:13] ; GFX90A-NEXT: s_cbranch_vccz .LBB3_1 ; GFX90A-NEXT: .LBB3_5: ; %bb16 ; GFX90A-NEXT: ; Parent Loop BB3_2 Depth=1 ; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2 -; GFX90A-NEXT: v_add_co_u32_e32 v26, vcc, v16, v31 -; GFX90A-NEXT: v_addc_co_u32_e32 v27, vcc, v17, v32, vcc -; GFX90A-NEXT: global_load_dword v34, v[26:27], off offset:-12 glc +; GFX90A-NEXT: v_mov_b32_e32 v25, s7 +; GFX90A-NEXT: v_add_co_u32_e32 v24, vcc, s6, v14 +; GFX90A-NEXT: v_addc_co_u32_e32 v25, vcc, v15, v25, vcc +; GFX90A-NEXT: global_load_dword v30, v[24:25], off offset:-12 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: global_load_dword v33, v[26:27], off offset:-8 glc +; GFX90A-NEXT: global_load_dword v29, v[24:25], off offset:-8 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: global_load_dword v28, v[26:27], off offset:-4 glc +; GFX90A-NEXT: global_load_dword v26, v[24:25], off offset:-4 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: global_load_dword v28, v[26:27], off glc +; GFX90A-NEXT: global_load_dword v26, v[24:25], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: ; kill: killed $vgpr26 killed $vgpr27 -; GFX90A-NEXT: ds_read_b64 v[26:27], v1 +; GFX90A-NEXT: ; kill: 
killed $vgpr24 killed $vgpr25 +; GFX90A-NEXT: ds_read_b64 v[24:25], v1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: ds_read_b64 v[28:29], v0 +; GFX90A-NEXT: ds_read_b64 v[26:27], v0 ; GFX90A-NEXT: s_and_b64 vcc, exec, s[0:1] ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_cbranch_vccnz .LBB3_4 ; GFX90A-NEXT: ; %bb.6: ; %bb51 ; GFX90A-NEXT: ; in Loop: Header=BB3_5 Depth=2 -; GFX90A-NEXT: v_cvt_f32_f16_sdwa v35, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX90A-NEXT: v_cvt_f32_f16_e32 v34, v34 -; GFX90A-NEXT: v_cvt_f32_f16_sdwa v37, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX90A-NEXT: v_cvt_f32_f16_e32 v36, v33 -; GFX90A-NEXT: v_pk_add_f32 v[38:39], v[2:3], v[26:27] -; GFX90A-NEXT: v_pk_add_f32 v[40:41], v[26:27], 0 op_sel_hi:[1,0] -; GFX90A-NEXT: v_pk_add_f32 v[28:29], v[34:35], v[28:29] -; GFX90A-NEXT: v_pk_add_f32 v[26:27], v[36:37], v[26:27] -; GFX90A-NEXT: v_pk_add_f32 v[18:19], v[18:19], v[38:39] -; GFX90A-NEXT: v_pk_add_f32 v[20:21], v[20:21], v[40:41] -; GFX90A-NEXT: v_pk_add_f32 v[22:23], v[22:23], v[28:29] -; GFX90A-NEXT: v_pk_add_f32 v[24:25], v[24:25], v[26:27] +; GFX90A-NEXT: v_cvt_f32_f16_sdwa v31, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX90A-NEXT: v_cvt_f32_f16_e32 v30, v30 +; GFX90A-NEXT: v_cvt_f32_f16_sdwa v33, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX90A-NEXT: v_cvt_f32_f16_e32 v32, v29 +; GFX90A-NEXT: v_pk_add_f32 v[34:35], v[2:3], v[24:25] +; GFX90A-NEXT: v_pk_add_f32 v[36:37], v[24:25], 0 op_sel_hi:[1,0] +; GFX90A-NEXT: v_pk_add_f32 v[26:27], v[30:31], v[26:27] +; GFX90A-NEXT: v_pk_add_f32 v[24:25], v[32:33], v[24:25] +; GFX90A-NEXT: v_pk_add_f32 v[16:17], v[16:17], v[34:35] +; GFX90A-NEXT: v_pk_add_f32 v[18:19], v[18:19], v[36:37] +; GFX90A-NEXT: v_pk_add_f32 v[20:21], v[20:21], v[26:27] +; GFX90A-NEXT: v_pk_add_f32 v[22:23], v[22:23], v[24:25] ; GFX90A-NEXT: s_branch .LBB3_4 bb: %i = load volatile i16, i16 addrspace(4)* undef, align 2 diff --git 
a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll --- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll @@ -51,7 +51,7 @@ ; GFX6-NEXT: v_mul_lo_u32 v1, s4, v0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 -; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GFX6-NEXT: v_mul_hi_u32 v0, s2, v0 ; GFX6-NEXT: v_mul_lo_u32 v1, v0, s3 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, 1, v0 @@ -145,7 +145,7 @@ ; GFX6-NEXT: v_mul_lo_u32 v1, s2, v0 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 -; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GFX6-NEXT: v_mul_hi_u32 v0, s4, v0 ; GFX6-NEXT: v_mul_lo_u32 v0, v0, s5 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 @@ -161,27 +161,28 @@ ; GFX9-LABEL: urem_i32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3 ; GFX9-NEXT: s_sub_i32 s4, 0, s3 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX9-NEXT: v_mul_lo_u32 v1, s4, v0 -; GFX9-NEXT: v_mul_hi_u32 v1, v0, v1 -; GFX9-NEXT: v_add_u32_e32 v0, v0, v1 -; GFX9-NEXT: v_mul_hi_u32 v0, s2, v0 -; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: v_mul_lo_u32 v0, v0, s3 -; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0 -; GFX9-NEXT: v_subrev_u32_e32 v2, s3, v0 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s3, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX9-NEXT: v_subrev_u32_e32 v2, s3, v0 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s3, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_readfirstlane_b32 s5, v0 +; 
GFX9-NEXT: s_mul_i32 s4, s4, s5 +; GFX9-NEXT: s_mul_hi_u32 s4, s5, s4 +; GFX9-NEXT: s_add_i32 s5, s5, s4 +; GFX9-NEXT: s_mul_hi_u32 s4, s2, s5 +; GFX9-NEXT: s_mul_i32 s4, s4, s3 +; GFX9-NEXT: s_sub_i32 s2, s2, s4 +; GFX9-NEXT: s_sub_i32 s4, s2, s3 +; GFX9-NEXT: s_cmp_ge_u32 s2, s3 +; GFX9-NEXT: s_cselect_b32 s2, s4, s2 +; GFX9-NEXT: s_sub_i32 s4, s2, s3 +; GFX9-NEXT: s_cmp_ge_u32 s2, s3 +; GFX9-NEXT: s_cselect_b32 s2, s4, s2 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: global_store_dword v1, v0, s[0:1] ; GFX9-NEXT: s_endpgm %r = urem i32 %x, %y @@ -372,7 +373,7 @@ ; GFX6-NEXT: v_mul_lo_u32 v1, s3, v0 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 -; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GFX6-NEXT: v_mul_hi_u32 v0, s6, v0 ; GFX6-NEXT: v_mul_lo_u32 v0, v0, s4 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s6, v0 @@ -390,35 +391,36 @@ ; GFX9-LABEL: srem_i32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_ashr_i32 s4, s3, 31 ; GFX9-NEXT: s_add_i32 s3, s3, s4 ; GFX9-NEXT: s_xor_b32 s3, s3, s4 ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3 -; GFX9-NEXT: s_sub_i32 s4, 0, s3 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_sub_i32 s5, 0, s3 +; GFX9-NEXT: s_ashr_i32 s4, s2, 31 +; GFX9-NEXT: s_add_i32 s2, s2, s4 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX9-NEXT: s_xor_b32 s2, s2, s4 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX9-NEXT: v_mul_lo_u32 v1, s4, v0 -; GFX9-NEXT: s_ashr_i32 s4, s2, 31 -; GFX9-NEXT: s_add_i32 s2, s2, s4 +; GFX9-NEXT: v_readfirstlane_b32 s6, v0 +; GFX9-NEXT: s_mul_i32 s5, s5, s6 +; GFX9-NEXT: s_mul_hi_u32 s5, s6, s5 +; GFX9-NEXT: s_add_i32 s6, s6, s5 +; GFX9-NEXT: s_mul_hi_u32 s5, s2, s6 +; GFX9-NEXT: s_mul_i32 s5, s5, s3 +; GFX9-NEXT: s_sub_i32 s2, s2, s5 +; GFX9-NEXT: s_sub_i32 s5, 
s2, s3 +; GFX9-NEXT: s_cmp_ge_u32 s2, s3 +; GFX9-NEXT: s_cselect_b32 s2, s5, s2 +; GFX9-NEXT: s_sub_i32 s5, s2, s3 +; GFX9-NEXT: s_cmp_ge_u32 s2, s3 +; GFX9-NEXT: s_cselect_b32 s2, s5, s2 ; GFX9-NEXT: s_xor_b32 s2, s2, s4 -; GFX9-NEXT: v_mul_hi_u32 v1, v0, v1 -; GFX9-NEXT: v_add_u32_e32 v0, v0, v1 -; GFX9-NEXT: v_mul_hi_u32 v0, s2, v0 -; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: v_mul_lo_u32 v0, v0, s3 -; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0 -; GFX9-NEXT: v_subrev_u32_e32 v2, s3, v0 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s3, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX9-NEXT: v_subrev_u32_e32 v2, s3, v0 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s3, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX9-NEXT: v_xor_b32_e32 v0, s4, v0 -; GFX9-NEXT: v_subrev_u32_e32 v0, s4, v0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_sub_i32 s2, s2, s4 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: global_store_dword v1, v0, s[0:1] ; GFX9-NEXT: s_endpgm %r = srem i32 %x, %y @@ -697,7 +699,7 @@ ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| ; GFX6-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc ; GFX6-NEXT: s_mov_b32 s3, 0xf000 -; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, v2, v0 ; GFX6-NEXT: v_mul_lo_u32 v0, v0, s2 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 @@ -919,7 +921,7 @@ ; GFX6-NEXT: v_cvt_i32_f32_e32 v2, v2 ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| ; GFX6-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc -; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, v2, v0 ; GFX6-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; @@ -1198,9 +1200,9 @@ ; GFX6-NEXT: s_sub_i32 s2, 0, s10 ; GFX6-NEXT: v_mul_hi_u32 v2, v0, v2 ; GFX6-NEXT: v_mul_hi_u32 v3, v1, v3 -; GFX6-NEXT: v_add_i32_e32 v0, vcc, v2, v0 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GFX6-NEXT: v_mul_hi_u32 v0, s4, v0 -; GFX6-NEXT: v_add_i32_e32 v1, vcc, v3, v1 +; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v3 ; 
GFX6-NEXT: v_mul_hi_u32 v1, s5, v1 ; GFX6-NEXT: v_mul_lo_u32 v2, v0, s8 ; GFX6-NEXT: v_add_i32_e32 v3, vcc, 1, v0 @@ -1226,7 +1228,7 @@ ; GFX6-NEXT: v_mul_hi_u32 v4, v2, v4 ; GFX6-NEXT: v_add_i32_e32 v5, vcc, 1, v1 ; GFX6-NEXT: s_sub_i32 s0, 0, s11 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v4, v2 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v6 ; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s9, v3 ; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc @@ -1522,7 +1524,7 @@ ; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc ; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v5 ; GFX6-NEXT: s_sub_i32 s4, 0, s11 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; GFX6-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v4 ; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v3 ; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, s9, v1 @@ -1564,68 +1566,76 @@ ; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s10 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1 -; GFX9-NEXT: s_sub_i32 s3, 0, s9 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX9-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 -; GFX9-NEXT: v_mul_lo_u32 v3, s2, v0 +; GFX9-NEXT: v_readfirstlane_b32 s3, v0 +; GFX9-NEXT: s_mul_i32 s2, s2, s3 +; GFX9-NEXT: s_mul_hi_u32 s2, s3, s2 +; GFX9-NEXT: s_add_i32 s3, s3, s2 +; GFX9-NEXT: s_mul_hi_u32 s2, s4, s3 +; GFX9-NEXT: s_mul_i32 s2, s2, s8 +; GFX9-NEXT: s_sub_i32 s2, s4, s2 +; GFX9-NEXT: s_sub_i32 s3, s2, s8 +; GFX9-NEXT: s_cmp_ge_u32 s2, s8 +; GFX9-NEXT: s_cselect_b32 s2, s3, s2 +; GFX9-NEXT: s_sub_i32 s3, s2, s8 +; GFX9-NEXT: s_cmp_ge_u32 s2, s8 +; GFX9-NEXT: v_readfirstlane_b32 s12, v1 +; GFX9-NEXT: s_cselect_b32 s2, s3, s2 +; GFX9-NEXT: s_sub_i32 s3, 0, s9 +; GFX9-NEXT: s_mul_i32 s3, s3, s12 +; GFX9-NEXT: s_mul_hi_u32 s3, s12, s3 +; GFX9-NEXT: s_add_i32 s12, s12, s3 +; GFX9-NEXT: s_mul_hi_u32 s3, s5, s12 +; 
GFX9-NEXT: s_mul_i32 s3, s3, s9 +; GFX9-NEXT: s_sub_i32 s3, s5, s3 +; GFX9-NEXT: s_sub_i32 s4, s3, s9 ; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2 -; GFX9-NEXT: v_mul_lo_u32 v5, s3, v1 -; GFX9-NEXT: s_sub_i32 s2, 0, s10 -; GFX9-NEXT: v_mul_hi_u32 v3, v0, v3 -; GFX9-NEXT: v_mul_hi_u32 v5, v1, v5 -; GFX9-NEXT: v_add_u32_e32 v0, v0, v3 -; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s11 -; GFX9-NEXT: v_add_u32_e32 v1, v1, v5 -; GFX9-NEXT: v_mul_lo_u32 v5, s2, v2 -; GFX9-NEXT: s_sub_i32 s2, 0, s11 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v3 -; GFX9-NEXT: v_mul_hi_u32 v0, s4, v0 -; GFX9-NEXT: v_mul_hi_u32 v5, v2, v5 -; GFX9-NEXT: v_mul_hi_u32 v1, s5, v1 -; GFX9-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 -; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v3 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v5 -; GFX9-NEXT: v_mul_lo_u32 v0, v0, s8 -; GFX9-NEXT: v_mul_hi_u32 v2, s6, v2 -; GFX9-NEXT: v_mul_lo_u32 v5, s2, v3 -; GFX9-NEXT: v_mul_lo_u32 v1, v1, s9 -; GFX9-NEXT: v_sub_u32_e32 v0, s4, v0 -; GFX9-NEXT: v_subrev_u32_e32 v6, s8, v0 -; GFX9-NEXT: v_mul_hi_u32 v5, v3, v5 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s8, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc -; GFX9-NEXT: v_mul_lo_u32 v2, v2, s10 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v5 -; GFX9-NEXT: v_mul_hi_u32 v3, s7, v3 -; GFX9-NEXT: v_sub_u32_e32 v1, s5, v1 -; GFX9-NEXT: v_subrev_u32_e32 v6, s8, v0 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s8, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc -; GFX9-NEXT: v_subrev_u32_e32 v6, s9, v1 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s9, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc -; GFX9-NEXT: v_mul_lo_u32 v3, v3, s11 -; GFX9-NEXT: v_subrev_u32_e32 v6, s9, v1 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s9, v1 -; GFX9-NEXT: v_sub_u32_e32 v2, s6, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc -; GFX9-NEXT: v_subrev_u32_e32 v5, s10, v2 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s10, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc -; GFX9-NEXT: v_subrev_u32_e32 v5, s10, v2 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s10, v2 -; GFX9-NEXT: 
v_sub_u32_e32 v3, s7, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc -; GFX9-NEXT: v_subrev_u32_e32 v5, s11, v3 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s11, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc -; GFX9-NEXT: v_subrev_u32_e32 v5, s11, v3 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s11, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc +; GFX9-NEXT: s_cmp_ge_u32 s3, s9 +; GFX9-NEXT: s_cselect_b32 s3, s4, s3 +; GFX9-NEXT: s_sub_i32 s4, s3, s9 +; GFX9-NEXT: s_cmp_ge_u32 s3, s9 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s11 +; GFX9-NEXT: s_cselect_b32 s3, s4, s3 +; GFX9-NEXT: s_sub_i32 s4, 0, s10 +; GFX9-NEXT: v_readfirstlane_b32 s5, v2 +; GFX9-NEXT: s_mul_i32 s4, s4, s5 +; GFX9-NEXT: s_mul_hi_u32 s4, s5, s4 +; GFX9-NEXT: s_add_i32 s5, s5, s4 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX9-NEXT: s_mul_hi_u32 s4, s6, s5 +; GFX9-NEXT: s_mul_i32 s4, s4, s10 +; GFX9-NEXT: s_sub_i32 s4, s6, s4 +; GFX9-NEXT: s_sub_i32 s5, s4, s10 +; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 +; GFX9-NEXT: s_cmp_ge_u32 s4, s10 +; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GFX9-NEXT: s_cselect_b32 s4, s5, s4 +; GFX9-NEXT: s_sub_i32 s5, s4, s10 +; GFX9-NEXT: s_cmp_ge_u32 s4, s10 +; GFX9-NEXT: s_cselect_b32 s4, s5, s4 +; GFX9-NEXT: s_sub_i32 s5, 0, s11 +; GFX9-NEXT: v_readfirstlane_b32 s6, v0 +; GFX9-NEXT: s_mul_i32 s5, s5, s6 +; GFX9-NEXT: s_mul_hi_u32 s5, s6, s5 +; GFX9-NEXT: s_add_i32 s6, s6, s5 +; GFX9-NEXT: s_mul_hi_u32 s5, s7, s6 +; GFX9-NEXT: s_mul_i32 s5, s5, s11 +; GFX9-NEXT: s_sub_i32 s5, s7, s5 +; GFX9-NEXT: s_sub_i32 s6, s5, s11 +; GFX9-NEXT: s_cmp_ge_u32 s5, s11 +; GFX9-NEXT: s_cselect_b32 s5, s6, s5 +; GFX9-NEXT: s_sub_i32 s6, s5, s11 +; GFX9-NEXT: s_cmp_ge_u32 s5, s11 +; GFX9-NEXT: s_cselect_b32 s5, s6, s5 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_mov_b32_e32 v3, s5 ; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX9-NEXT: s_endpgm %r = urem <4 x i32> %x, %y @@ -1831,7 +1841,7 @@ ; GFX6-NEXT: 
v_mul_hi_u32 v2, v0, v2 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX6-NEXT: s_sub_i32 s0, 0, s9 -; GFX6-NEXT: v_add_i32_e32 v0, vcc, v2, v0 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GFX6-NEXT: v_mul_hi_u32 v0, s1, v0 ; GFX6-NEXT: v_mul_lo_u32 v2, s0, v1 ; GFX6-NEXT: v_mul_lo_u32 v3, v0, s3 @@ -1903,7 +1913,7 @@ ; GFX6-NEXT: v_mul_hi_u32 v5, v4, v5 ; GFX6-NEXT: v_add_i32_e32 v6, vcc, 1, v2 ; GFX6-NEXT: s_xor_b32 s2, s0, s2 -; GFX6-NEXT: v_add_i32_e32 v4, vcc, v5, v4 +; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v5 ; GFX6-NEXT: v_mul_hi_u32 v4, s1, v4 ; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s4, v3 ; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc @@ -2323,105 +2333,113 @@ ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_ashr_i32 s2, s8, 31 -; GFX9-NEXT: s_add_i32 s8, s8, s2 -; GFX9-NEXT: s_xor_b32 s2, s8, s2 +; GFX9-NEXT: s_add_i32 s3, s8, s2 +; GFX9-NEXT: s_xor_b32 s2, s3, s2 ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s2 -; GFX9-NEXT: s_ashr_i32 s3, s9, 31 -; GFX9-NEXT: s_add_i32 s8, s9, s3 -; GFX9-NEXT: s_sub_i32 s12, 0, s2 +; GFX9-NEXT: s_sub_i32 s8, 0, s2 +; GFX9-NEXT: s_ashr_i32 s3, s4, 31 +; GFX9-NEXT: s_add_i32 s4, s4, s3 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX9-NEXT: s_xor_b32 s3, s8, s3 -; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s3 -; GFX9-NEXT: s_ashr_i32 s8, s4, 31 +; GFX9-NEXT: s_xor_b32 s4, s4, s3 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1 -; GFX9-NEXT: s_add_i32 s4, s4, s8 -; GFX9-NEXT: s_xor_b32 s4, s4, s8 -; GFX9-NEXT: v_mul_lo_u32 v2, s12, v0 -; GFX9-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 -; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX9-NEXT: s_sub_i32 s12, 0, s3 -; GFX9-NEXT: v_mul_hi_u32 v2, v0, v2 -; GFX9-NEXT: s_ashr_i32 s9, s5, 31 -; GFX9-NEXT: v_mul_lo_u32 v3, s12, v1 -; GFX9-NEXT: s_ashr_i32 s12, s10, 31 -; GFX9-NEXT: v_add_u32_e32 v0, v0, v2 -; GFX9-NEXT: v_mul_hi_u32 v0, s4, v0 -; GFX9-NEXT: v_mul_hi_u32 v3, v1, v3 -; GFX9-NEXT: 
s_add_i32 s5, s5, s9 -; GFX9-NEXT: s_xor_b32 s5, s5, s9 -; GFX9-NEXT: v_mul_lo_u32 v0, v0, s2 -; GFX9-NEXT: v_add_u32_e32 v1, v1, v3 -; GFX9-NEXT: v_mul_hi_u32 v1, s5, v1 -; GFX9-NEXT: v_sub_u32_e32 v0, s4, v0 -; GFX9-NEXT: v_subrev_u32_e32 v2, s2, v0 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s2, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX9-NEXT: v_subrev_u32_e32 v2, s2, v0 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s2, v0 -; GFX9-NEXT: s_add_i32 s2, s10, s12 -; GFX9-NEXT: s_xor_b32 s2, s2, s12 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s2 -; GFX9-NEXT: v_mul_lo_u32 v1, v1, s3 -; GFX9-NEXT: v_xor_b32_e32 v0, s8, v0 -; GFX9-NEXT: v_subrev_u32_e32 v0, s8, v0 +; GFX9-NEXT: v_readfirstlane_b32 s12, v0 +; GFX9-NEXT: s_mul_i32 s8, s8, s12 +; GFX9-NEXT: s_mul_hi_u32 s8, s12, s8 +; GFX9-NEXT: s_add_i32 s12, s12, s8 +; GFX9-NEXT: s_mul_hi_u32 s8, s4, s12 +; GFX9-NEXT: s_mul_i32 s8, s8, s2 +; GFX9-NEXT: s_sub_i32 s4, s4, s8 +; GFX9-NEXT: s_sub_i32 s8, s4, s2 +; GFX9-NEXT: s_cmp_ge_u32 s4, s2 +; GFX9-NEXT: s_cselect_b32 s4, s8, s4 +; GFX9-NEXT: s_sub_i32 s8, s4, s2 +; GFX9-NEXT: s_cmp_ge_u32 s4, s2 +; GFX9-NEXT: s_cselect_b32 s2, s8, s4 +; GFX9-NEXT: s_ashr_i32 s4, s9, 31 +; GFX9-NEXT: s_add_i32 s8, s9, s4 +; GFX9-NEXT: s_xor_b32 s4, s8, s4 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s4 +; GFX9-NEXT: s_ashr_i32 s8, s5, 31 +; GFX9-NEXT: s_xor_b32 s2, s2, s3 +; GFX9-NEXT: s_add_i32 s5, s5, s8 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX9-NEXT: s_sub_i32 s2, s2, s3 +; GFX9-NEXT: s_xor_b32 s3, s5, s8 +; GFX9-NEXT: s_sub_i32 s5, 0, s4 +; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 +; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GFX9-NEXT: v_readfirstlane_b32 s9, v0 +; GFX9-NEXT: s_mul_i32 s5, s5, s9 +; GFX9-NEXT: s_mul_hi_u32 s5, s9, s5 +; GFX9-NEXT: s_add_i32 s9, s9, s5 +; GFX9-NEXT: s_mul_hi_u32 s5, s3, s9 +; GFX9-NEXT: s_mul_i32 s5, s5, s4 +; GFX9-NEXT: s_sub_i32 s3, s3, s5 +; GFX9-NEXT: s_sub_i32 s5, s3, s4 +; GFX9-NEXT: s_cmp_ge_u32 s3, s4 +; 
GFX9-NEXT: s_cselect_b32 s3, s5, s3 +; GFX9-NEXT: s_sub_i32 s5, s3, s4 +; GFX9-NEXT: s_cmp_ge_u32 s3, s4 +; GFX9-NEXT: s_cselect_b32 s3, s5, s3 +; GFX9-NEXT: s_ashr_i32 s4, s10, 31 +; GFX9-NEXT: s_add_i32 s5, s10, s4 +; GFX9-NEXT: s_xor_b32 s4, s5, s4 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s4 +; GFX9-NEXT: s_xor_b32 s3, s3, s8 +; GFX9-NEXT: s_sub_i32 s3, s3, s8 +; GFX9-NEXT: s_sub_i32 s8, 0, s4 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX9-NEXT: s_ashr_i32 s5, s6, 31 +; GFX9-NEXT: s_add_i32 s6, s6, s5 +; GFX9-NEXT: s_xor_b32 s6, s6, s5 +; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 +; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_readfirstlane_b32 s9, v0 +; GFX9-NEXT: s_mul_i32 s8, s8, s9 +; GFX9-NEXT: s_mul_hi_u32 s8, s9, s8 +; GFX9-NEXT: s_add_i32 s9, s9, s8 +; GFX9-NEXT: s_mul_hi_u32 s8, s6, s9 +; GFX9-NEXT: s_mul_i32 s8, s8, s4 +; GFX9-NEXT: s_sub_i32 s6, s6, s8 +; GFX9-NEXT: s_sub_i32 s8, s6, s4 +; GFX9-NEXT: s_cmp_ge_u32 s6, s4 +; GFX9-NEXT: s_cselect_b32 s6, s8, s6 +; GFX9-NEXT: s_sub_i32 s8, s6, s4 +; GFX9-NEXT: s_cmp_ge_u32 s6, s4 +; GFX9-NEXT: s_cselect_b32 s4, s8, s6 +; GFX9-NEXT: s_ashr_i32 s6, s11, 31 +; GFX9-NEXT: s_add_i32 s8, s11, s6 +; GFX9-NEXT: s_xor_b32 s6, s8, s6 +; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s6 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: s_ashr_i32 s2, s7, 31 +; GFX9-NEXT: s_xor_b32 s3, s4, s5 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v2 -; GFX9-NEXT: v_sub_u32_e32 v1, s5, v1 -; GFX9-NEXT: v_subrev_u32_e32 v3, s3, v1 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s3, v1 +; GFX9-NEXT: s_add_i32 s4, s7, s2 +; GFX9-NEXT: s_sub_i32 s3, s3, s5 +; GFX9-NEXT: s_sub_i32 s5, 0, s6 ; GFX9-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 ; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GFX9-NEXT: v_subrev_u32_e32 v3, s3, v1 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s3, v1 -; GFX9-NEXT: s_sub_i32 s3, 0, s2 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GFX9-NEXT: v_mul_lo_u32 v3, s3, v2 -; 
GFX9-NEXT: s_ashr_i32 s3, s11, 31 -; GFX9-NEXT: s_add_i32 s4, s11, s3 -; GFX9-NEXT: s_xor_b32 s3, s4, s3 -; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s3 -; GFX9-NEXT: v_mul_hi_u32 v3, v2, v3 -; GFX9-NEXT: s_ashr_i32 s4, s6, 31 -; GFX9-NEXT: s_add_i32 s5, s6, s4 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v5 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v3 -; GFX9-NEXT: s_xor_b32 s5, s5, s4 -; GFX9-NEXT: v_mul_hi_u32 v2, s5, v2 -; GFX9-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v5 -; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v3 -; GFX9-NEXT: s_sub_i32 s6, 0, s3 -; GFX9-NEXT: v_mul_lo_u32 v2, v2, s2 -; GFX9-NEXT: v_xor_b32_e32 v1, s9, v1 -; GFX9-NEXT: v_mul_lo_u32 v5, s6, v3 -; GFX9-NEXT: v_subrev_u32_e32 v1, s9, v1 -; GFX9-NEXT: v_sub_u32_e32 v2, s5, v2 -; GFX9-NEXT: s_ashr_i32 s5, s7, 31 -; GFX9-NEXT: v_mul_hi_u32 v5, v3, v5 -; GFX9-NEXT: s_add_i32 s6, s7, s5 -; GFX9-NEXT: s_xor_b32 s6, s6, s5 -; GFX9-NEXT: v_subrev_u32_e32 v6, s2, v2 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v5 -; GFX9-NEXT: v_mul_hi_u32 v3, s6, v3 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc -; GFX9-NEXT: v_subrev_u32_e32 v5, s2, v2 -; GFX9-NEXT: v_mul_lo_u32 v3, v3, s3 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc -; GFX9-NEXT: v_xor_b32_e32 v2, s4, v2 -; GFX9-NEXT: v_sub_u32_e32 v3, s6, v3 -; GFX9-NEXT: v_subrev_u32_e32 v5, s3, v3 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s3, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc -; GFX9-NEXT: v_subrev_u32_e32 v5, s3, v3 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s3, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc -; GFX9-NEXT: v_xor_b32_e32 v3, s5, v3 -; GFX9-NEXT: v_subrev_u32_e32 v2, s4, v2 -; GFX9-NEXT: v_subrev_u32_e32 v3, s5, v3 +; GFX9-NEXT: s_xor_b32 s4, s4, s2 +; GFX9-NEXT: v_readfirstlane_b32 s7, v2 +; GFX9-NEXT: s_mul_i32 s5, s5, s7 +; GFX9-NEXT: s_mul_hi_u32 s5, s7, s5 +; GFX9-NEXT: s_add_i32 s7, s7, s5 +; GFX9-NEXT: s_mul_hi_u32 s5, s4, s7 +; GFX9-NEXT: s_mul_i32 s5, s5, s6 +; GFX9-NEXT: s_sub_i32 s4, 
s4, s5 +; GFX9-NEXT: s_sub_i32 s5, s4, s6 +; GFX9-NEXT: s_cmp_ge_u32 s4, s6 +; GFX9-NEXT: s_cselect_b32 s4, s5, s4 +; GFX9-NEXT: s_sub_i32 s5, s4, s6 +; GFX9-NEXT: s_cmp_ge_u32 s4, s6 +; GFX9-NEXT: s_cselect_b32 s4, s5, s4 +; GFX9-NEXT: s_xor_b32 s4, s4, s2 +; GFX9-NEXT: s_sub_i32 s2, s4, s2 +; GFX9-NEXT: v_mov_b32_e32 v2, s3 +; GFX9-NEXT: v_mov_b32_e32 v3, s2 ; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX9-NEXT: s_endpgm %r = srem <4 x i32> %x, %y @@ -2770,7 +2788,7 @@ ; GFX6-NEXT: v_rcp_iflag_f32_e32 v5, v3 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 ; GFX6-NEXT: s_lshr_b32 s4, s7, 16 -; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v1, v2 +; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, v2, v1 ; GFX6-NEXT: v_mul_f32_e32 v1, v4, v5 ; GFX6-NEXT: v_cvt_f32_u32_e32 v5, s4 ; GFX6-NEXT: s_lshr_b32 s6, s5, 16 @@ -2995,7 +3013,7 @@ ; GFX6-NEXT: v_mov_b32_e32 v3, s8 ; GFX6-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc ; GFX6-NEXT: s_ashr_i32 s4, s4, 16 -; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, v2, v0 ; GFX6-NEXT: v_cvt_f32_i32_e32 v2, s4 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v3, v1 ; GFX6-NEXT: s_xor_b32 s4, s4, s6 @@ -3263,7 +3281,7 @@ ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| ; GFX6-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc ; GFX6-NEXT: v_mov_b32_e32 v1, s4 -; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, v2, v0 ; GFX6-NEXT: v_mov_b32_e32 v2, s6 ; GFX6-NEXT: v_alignbit_b32 v2, s7, v2, 16 ; GFX6-NEXT: v_bfe_i32 v3, v2, 0, 16 @@ -3283,7 +3301,7 @@ ; GFX6-NEXT: v_or_b32_e32 v3, 1, v3 ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v6|, |v4| ; GFX6-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc -; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v5 +; GFX6-NEXT: v_add_i32_e32 v3, vcc, v5, v3 ; GFX6-NEXT: s_sext_i32_i16 s4, s7 ; GFX6-NEXT: v_mul_lo_u32 v2, v3, v2 ; GFX6-NEXT: v_cvt_f32_i32_e32 v3, s4 @@ -3617,7 +3635,7 @@ ; GFX6-NEXT: v_cvt_i32_f32_e32 v2, v2 ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| ; GFX6-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc -; 
GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, v2, v0 ; GFX6-NEXT: v_and_b32_e32 v0, 7, v0 ; GFX6-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm @@ -3701,7 +3719,7 @@ ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| ; GFX6-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc ; GFX6-NEXT: s_lshr_b32 s3, s4, 8 -; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, v2, v0 ; GFX6-NEXT: v_mul_lo_u32 v0, v0, s3 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -4191,7 +4209,7 @@ ; GFX6-NEXT: v_mov_b32_e32 v3, s8 ; GFX6-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc ; GFX6-NEXT: s_ashr_i32 s4, s4, 16 -; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, v2, v0 ; GFX6-NEXT: v_cvt_f32_i32_e32 v2, s4 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v3, v1 ; GFX6-NEXT: s_xor_b32 s4, s4, s6 @@ -4207,7 +4225,7 @@ ; GFX6-NEXT: v_cvt_f32_i32_e32 v2, s4 ; GFX6-NEXT: v_cndmask_b32_e32 v1, 0, v4, vcc ; GFX6-NEXT: s_sext_i32_i16 s5, s5 -; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v3 +; GFX6-NEXT: v_add_i32_e32 v1, vcc, v3, v1 ; GFX6-NEXT: v_cvt_f32_i32_e32 v3, s5 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v2 ; GFX6-NEXT: s_xor_b32 s4, s5, s4 @@ -4435,7 +4453,7 @@ ; GFX6-NEXT: v_mov_b32_e32 v6, s4 ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, |v4| ; GFX6-NEXT: v_cndmask_b32_e32 v3, 0, v6, vcc -; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v5 +; GFX6-NEXT: v_add_i32_e32 v3, vcc, v5, v3 ; GFX6-NEXT: v_mul_lo_u32 v3, v3, s7 ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 @@ -4821,7 +4839,7 @@ ; GFX6-NEXT: v_mul_lo_u32 v2, v3, v2 ; GFX6-NEXT: s_lshr_b32 s3, s2, 15 ; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s3, v1 -; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, v2, v0 +; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 ; GFX6-NEXT: v_and_b32_e32 v3, 0x7fff, v3 ; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], 30 ; GFX6-NEXT: v_and_b32_e32 v2, 0x7fff, v6 @@ -5008,7 +5026,7 @@ ; GFX6-NEXT: v_mov_b32_e32 v5, s1 ; GFX6-NEXT: 
v_cndmask_b32_e32 v2, 0, v5, vcc ; GFX6-NEXT: s_bfe_i32 s1, s2, 0xf000f -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v4, v2 ; GFX6-NEXT: v_cvt_f32_i32_e32 v4, s1 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v5, v3 ; GFX6-NEXT: s_xor_b32 s0, s1, s0 @@ -5024,7 +5042,7 @@ ; GFX6-NEXT: v_mov_b32_e32 v6, s0 ; GFX6-NEXT: v_cndmask_b32_e32 v3, 0, v6, vcc ; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 15 -; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v5 +; GFX6-NEXT: v_add_i32_e32 v3, vcc, v5, v3 ; GFX6-NEXT: v_cvt_f32_i32_e32 v5, v0 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v6, v4 ; GFX6-NEXT: v_xor_b32_e32 v0, v0, v1 @@ -5233,7 +5251,7 @@ ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v5|, |v4| ; GFX6-NEXT: v_cndmask_b32_e32 v4, 0, v7, vcc ; GFX6-NEXT: v_and_b32_e32 v3, 0x7fff, v2 -; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v6 +; GFX6-NEXT: v_add_i32_e32 v4, vcc, v6, v4 ; GFX6-NEXT: v_mul_lo_u32 v4, v4, s0 ; GFX6-NEXT: s_bfe_i32 s0, s0, 0xf000f ; GFX6-NEXT: v_cvt_f32_i32_e32 v5, s0 @@ -5256,7 +5274,7 @@ ; GFX6-NEXT: v_and_b32_e32 v1, 0x7fff, v0 ; GFX6-NEXT: v_cndmask_b32_e32 v5, 0, v8, vcc ; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 15 -; GFX6-NEXT: v_add_i32_e32 v5, vcc, v5, v7 +; GFX6-NEXT: v_add_i32_e32 v5, vcc, v7, v5 ; GFX6-NEXT: v_cvt_f32_i32_e32 v7, v0 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v8, v6 ; GFX6-NEXT: v_xor_b32_e32 v0, v0, v2 @@ -5764,7 +5782,7 @@ ; GFX6-NEXT: v_mul_hi_u32 v0, s4, v0 ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s4, v0 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, 1, v1 -; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0 ; GFX6-NEXT: v_lshrrev_b32_e32 v0, 20, v0 ; GFX6-NEXT: v_mul_lo_u32 v0, v0, s2 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -5998,7 +6016,7 @@ ; GFX6-NEXT: v_mul_hi_u32 v3, v1, v3 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v2, v0 ; GFX6-NEXT: v_mul_hi_u32 v0, s4, v0 -; GFX6-NEXT: v_add_i32_e32 v1, vcc, v3, v1 +; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v3 ; GFX6-NEXT: v_mul_hi_u32 v1, s5, v1 ; GFX6-NEXT: v_mul_lo_u32 v0, v0, s6 ; GFX6-NEXT: v_mul_lo_u32 v1, v1, 
s7 @@ -6022,46 +6040,49 @@ ; GFX9-LABEL: urem_v2i32_pow2_shl_denom: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b32 s3, 0x1000, s6 -; GFX9-NEXT: s_lshl_b32 s2, 0x1000, s7 ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3 +; GFX9-NEXT: s_lshl_b32 s2, 0x1000, s7 ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s2 ; GFX9-NEXT: s_sub_i32 s6, 0, s3 -; GFX9-NEXT: s_sub_i32 s7, 0, s2 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 -; GFX9-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GFX9-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX9-NEXT: v_mul_lo_u32 v2, s6, v0 -; GFX9-NEXT: v_mul_lo_u32 v3, s7, v1 -; GFX9-NEXT: v_mul_hi_u32 v2, v0, v2 -; GFX9-NEXT: v_mul_hi_u32 v3, v1, v3 -; GFX9-NEXT: v_add_u32_e32 v0, v0, v2 -; GFX9-NEXT: v_add_u32_e32 v1, v1, v3 -; GFX9-NEXT: v_mul_hi_u32 v0, s4, v0 -; GFX9-NEXT: v_mul_hi_u32 v1, s5, v1 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: v_mul_lo_u32 v0, v0, s3 -; GFX9-NEXT: v_mul_lo_u32 v1, v1, s2 -; GFX9-NEXT: v_sub_u32_e32 v0, s4, v0 -; GFX9-NEXT: v_sub_u32_e32 v1, s5, v1 -; GFX9-NEXT: v_subrev_u32_e32 v3, s3, v0 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s3, v0 -; GFX9-NEXT: v_subrev_u32_e32 v4, s2, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s2, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc -; GFX9-NEXT: v_subrev_u32_e32 v3, s3, v0 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s3, v0 -; GFX9-NEXT: v_subrev_u32_e32 v4, s2, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s2, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc -; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_readfirstlane_b32 s7, v0 +; GFX9-NEXT: s_mul_i32 s6, s6, s7 +; 
GFX9-NEXT: s_mul_hi_u32 s6, s7, s6 +; GFX9-NEXT: s_add_i32 s7, s7, s6 +; GFX9-NEXT: s_mul_hi_u32 s6, s4, s7 +; GFX9-NEXT: s_mul_i32 s6, s6, s3 +; GFX9-NEXT: s_sub_i32 s4, s4, s6 +; GFX9-NEXT: s_sub_i32 s6, s4, s3 +; GFX9-NEXT: s_cmp_ge_u32 s4, s3 +; GFX9-NEXT: s_cselect_b32 s4, s6, s4 +; GFX9-NEXT: s_sub_i32 s6, s4, s3 +; GFX9-NEXT: s_cmp_ge_u32 s4, s3 +; GFX9-NEXT: v_readfirstlane_b32 s8, v1 +; GFX9-NEXT: s_cselect_b32 s3, s6, s4 +; GFX9-NEXT: s_sub_i32 s4, 0, s2 +; GFX9-NEXT: s_mul_i32 s4, s4, s8 +; GFX9-NEXT: s_mul_hi_u32 s4, s8, s4 +; GFX9-NEXT: s_add_i32 s8, s8, s4 +; GFX9-NEXT: s_mul_hi_u32 s4, s5, s8 +; GFX9-NEXT: s_mul_i32 s4, s4, s2 +; GFX9-NEXT: s_sub_i32 s4, s5, s4 +; GFX9-NEXT: s_sub_i32 s5, s4, s2 +; GFX9-NEXT: s_cmp_ge_u32 s4, s2 +; GFX9-NEXT: s_cselect_b32 s4, s5, s4 +; GFX9-NEXT: s_sub_i32 s5, s4, s2 +; GFX9-NEXT: s_cmp_ge_u32 s4, s2 +; GFX9-NEXT: s_cselect_b32 s2, s5, s4 +; GFX9-NEXT: v_mov_b32_e32 v0, s3 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm %shl.y = shl <2 x i32> , %y @@ -6179,7 +6200,7 @@ ; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX6-NEXT: s_xor_b32 s1, s1, s0 ; GFX6-NEXT: s_xor_b32 s2, s0, s8 -; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GFX6-NEXT: v_mul_hi_u32 v0, s1, v0 ; GFX6-NEXT: v_mul_lo_u32 v1, v0, s3 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, 1, v0 @@ -6483,7 +6504,7 @@ ; GFX6-NEXT: s_ashr_i32 s0, s9, 31 ; GFX6-NEXT: s_add_i32 s1, s9, s0 ; GFX6-NEXT: s_xor_b32 s1, s1, s0 -; GFX6-NEXT: v_add_i32_e32 v1, vcc, v2, v1 +; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v2 ; GFX6-NEXT: v_mul_hi_u32 v1, s1, v1 ; GFX6-NEXT: v_add_i32_e32 v4, vcc, 1, v0 ; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s2, v3 @@ -6588,18 +6609,19 @@ ; GFX6: ; %bb.0: ; GFX6-NEXT: s_load_dword s4, s[0:1], 0xb ; GFX6-NEXT: v_mov_b32_e32 v0, 0xd9528441 -; GFX6-NEXT: s_mov_b32 s2, 0x12d8fb ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: 
s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mul_hi_i32 v0, s4, v0 -; GFX6-NEXT: v_add_i32_e32 v0, vcc, s4, v0 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 31, v0 -; GFX6-NEXT: v_ashrrev_i32_e32 v0, 20, v0 -; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0 -; GFX6-NEXT: v_mul_lo_u32 v0, v0, s2 -; GFX6-NEXT: s_mov_b32 s2, -1 -; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 +; GFX6-NEXT: v_readfirstlane_b32 s5, v0 +; GFX6-NEXT: s_add_i32 s5, s5, s4 +; GFX6-NEXT: s_lshr_b32 s6, s5, 31 +; GFX6-NEXT: s_ashr_i32 s5, s5, 20 +; GFX6-NEXT: s_add_i32 s5, s5, s6 +; GFX6-NEXT: s_mul_i32 s5, s5, 0x12d8fb +; GFX6-NEXT: s_sub_i32 s4, s4, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; @@ -6693,7 +6715,7 @@ ; GFX6-NEXT: v_mul_lo_u32 v1, s3, v0 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 -; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GFX6-NEXT: v_mul_hi_u32 v0, s6, v0 ; GFX6-NEXT: v_mul_lo_u32 v0, v0, s4 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s6, v0 @@ -6711,36 +6733,37 @@ ; GFX9-LABEL: srem_i32_pow2_shl_denom: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b32 s3, 0x1000, s3 ; GFX9-NEXT: s_ashr_i32 s4, s3, 31 ; GFX9-NEXT: s_add_i32 s3, s3, s4 ; GFX9-NEXT: s_xor_b32 s3, s3, s4 ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3 -; GFX9-NEXT: s_sub_i32 s4, 0, s3 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_sub_i32 s5, 0, s3 +; GFX9-NEXT: s_ashr_i32 s4, s2, 31 +; GFX9-NEXT: s_add_i32 s2, s2, s4 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX9-NEXT: s_xor_b32 s2, s2, s4 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX9-NEXT: v_mul_lo_u32 v1, s4, v0 -; GFX9-NEXT: s_ashr_i32 s4, s2, 31 -; GFX9-NEXT: s_add_i32 s2, s2, s4 +; GFX9-NEXT: v_readfirstlane_b32 
s6, v0 +; GFX9-NEXT: s_mul_i32 s5, s5, s6 +; GFX9-NEXT: s_mul_hi_u32 s5, s6, s5 +; GFX9-NEXT: s_add_i32 s6, s6, s5 +; GFX9-NEXT: s_mul_hi_u32 s5, s2, s6 +; GFX9-NEXT: s_mul_i32 s5, s5, s3 +; GFX9-NEXT: s_sub_i32 s2, s2, s5 +; GFX9-NEXT: s_sub_i32 s5, s2, s3 +; GFX9-NEXT: s_cmp_ge_u32 s2, s3 +; GFX9-NEXT: s_cselect_b32 s2, s5, s2 +; GFX9-NEXT: s_sub_i32 s5, s2, s3 +; GFX9-NEXT: s_cmp_ge_u32 s2, s3 +; GFX9-NEXT: s_cselect_b32 s2, s5, s2 ; GFX9-NEXT: s_xor_b32 s2, s2, s4 -; GFX9-NEXT: v_mul_hi_u32 v1, v0, v1 -; GFX9-NEXT: v_add_u32_e32 v0, v0, v1 -; GFX9-NEXT: v_mul_hi_u32 v0, s2, v0 -; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: v_mul_lo_u32 v0, v0, s3 -; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0 -; GFX9-NEXT: v_subrev_u32_e32 v2, s3, v0 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s3, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX9-NEXT: v_subrev_u32_e32 v2, s3, v0 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s3, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX9-NEXT: v_xor_b32_e32 v0, s4, v0 -; GFX9-NEXT: v_subrev_u32_e32 v0, s4, v0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_sub_i32 s2, s2, s4 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: global_store_dword v1, v0, s[0:1] ; GFX9-NEXT: s_endpgm %shl.y = shl i32 4096, %y @@ -6919,7 +6942,7 @@ ; GFX6-NEXT: s_sub_i32 s9, 0, s7 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 -; GFX6-NEXT: v_add_i32_e32 v0, vcc, v2, v0 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GFX6-NEXT: v_mul_hi_u32 v0, s4, v0 ; GFX6-NEXT: v_mul_lo_u32 v2, s9, v1 ; GFX6-NEXT: s_ashr_i32 s9, s5, 31 @@ -6954,62 +6977,65 @@ ; GFX9-LABEL: srem_v2i32_pow2_shl_denom: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b32 s3, 0x1000, s6 -; GFX9-NEXT: s_ashr_i32 s6, s3, 31 -; GFX9-NEXT: s_add_i32 s3, s3, s6 -; GFX9-NEXT: s_lshl_b32 s2, 0x1000, s7 -; GFX9-NEXT: s_xor_b32 s3, s3, s6 -; 
GFX9-NEXT: s_ashr_i32 s7, s2, 31 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3 -; GFX9-NEXT: s_add_i32 s2, s2, s7 -; GFX9-NEXT: s_xor_b32 s2, s2, s7 -; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s2 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX9-NEXT: s_sub_i32 s8, 0, s3 +; GFX9-NEXT: s_lshl_b32 s2, 0x1000, s6 +; GFX9-NEXT: s_ashr_i32 s3, s2, 31 +; GFX9-NEXT: s_add_i32 s2, s2, s3 +; GFX9-NEXT: s_xor_b32 s2, s2, s3 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s2 +; GFX9-NEXT: s_lshl_b32 s3, 0x1000, s7 +; GFX9-NEXT: s_sub_i32 s7, 0, s2 ; GFX9-NEXT: s_ashr_i32 s6, s4, 31 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1 -; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 -; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX9-NEXT: s_add_i32 s4, s4, s6 -; GFX9-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 -; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX9-NEXT: v_mul_lo_u32 v2, s8, v0 -; GFX9-NEXT: s_sub_i32 s8, 0, s2 ; GFX9-NEXT: s_xor_b32 s4, s4, s6 -; GFX9-NEXT: v_mul_lo_u32 v3, s8, v1 -; GFX9-NEXT: v_mul_hi_u32 v2, v0, v2 -; GFX9-NEXT: s_ashr_i32 s7, s5, 31 -; GFX9-NEXT: s_add_i32 s5, s5, s7 -; GFX9-NEXT: v_mul_hi_u32 v3, v1, v3 -; GFX9-NEXT: v_add_u32_e32 v0, v0, v2 -; GFX9-NEXT: v_mul_hi_u32 v0, s4, v0 -; GFX9-NEXT: s_xor_b32 s5, s5, s7 -; GFX9-NEXT: v_add_u32_e32 v1, v1, v3 -; GFX9-NEXT: v_mul_hi_u32 v1, s5, v1 -; GFX9-NEXT: v_mul_lo_u32 v0, v0, s3 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: v_mul_lo_u32 v1, v1, s2 -; GFX9-NEXT: v_sub_u32_e32 v0, s4, v0 -; GFX9-NEXT: v_subrev_u32_e32 v3, s3, v0 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s3, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX9-NEXT: v_subrev_u32_e32 v3, s3, v0 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s3, v0 -; GFX9-NEXT: v_sub_u32_e32 v1, s5, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX9-NEXT: v_subrev_u32_e32 v3, s2, v1 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s2, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GFX9-NEXT: v_subrev_u32_e32 v3, s2, v1 -; 
GFX9-NEXT: v_cmp_le_u32_e32 vcc, s2, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GFX9-NEXT: v_xor_b32_e32 v0, s6, v0 -; GFX9-NEXT: v_xor_b32_e32 v1, s7, v1 -; GFX9-NEXT: v_subrev_u32_e32 v0, s6, v0 -; GFX9-NEXT: v_subrev_u32_e32 v1, s7, v1 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 +; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GFX9-NEXT: v_readfirstlane_b32 s8, v0 +; GFX9-NEXT: s_mul_i32 s7, s7, s8 +; GFX9-NEXT: s_mul_hi_u32 s7, s8, s7 +; GFX9-NEXT: s_add_i32 s8, s8, s7 +; GFX9-NEXT: s_mul_hi_u32 s7, s4, s8 +; GFX9-NEXT: s_mul_i32 s7, s7, s2 +; GFX9-NEXT: s_sub_i32 s4, s4, s7 +; GFX9-NEXT: s_sub_i32 s7, s4, s2 +; GFX9-NEXT: s_cmp_ge_u32 s4, s2 +; GFX9-NEXT: s_cselect_b32 s4, s7, s4 +; GFX9-NEXT: s_sub_i32 s7, s4, s2 +; GFX9-NEXT: s_cmp_ge_u32 s4, s2 +; GFX9-NEXT: s_cselect_b32 s2, s7, s4 +; GFX9-NEXT: s_ashr_i32 s4, s3, 31 +; GFX9-NEXT: s_add_i32 s3, s3, s4 +; GFX9-NEXT: s_xor_b32 s3, s3, s4 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3 +; GFX9-NEXT: s_xor_b32 s2, s2, s6 +; GFX9-NEXT: s_sub_i32 s2, s2, s6 +; GFX9-NEXT: s_sub_i32 s6, 0, s3 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX9-NEXT: s_ashr_i32 s4, s5, 31 +; GFX9-NEXT: s_add_i32 s5, s5, s4 +; GFX9-NEXT: s_xor_b32 s5, s5, s4 +; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 +; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GFX9-NEXT: v_readfirstlane_b32 s7, v0 +; GFX9-NEXT: s_mul_i32 s6, s6, s7 +; GFX9-NEXT: s_mul_hi_u32 s6, s7, s6 +; GFX9-NEXT: s_add_i32 s7, s7, s6 +; GFX9-NEXT: s_mul_hi_u32 s6, s5, s7 +; GFX9-NEXT: s_mul_i32 s6, s6, s3 +; GFX9-NEXT: s_sub_i32 s5, s5, s6 +; GFX9-NEXT: s_sub_i32 s6, s5, s3 +; GFX9-NEXT: s_cmp_ge_u32 s5, s3 +; GFX9-NEXT: s_cselect_b32 s5, s6, s5 +; GFX9-NEXT: s_sub_i32 s6, s5, s3 +; GFX9-NEXT: s_cmp_ge_u32 s5, s3 +; GFX9-NEXT: s_cselect_b32 s3, s6, s5 +; GFX9-NEXT: s_xor_b32 s3, s3, s4 +; GFX9-NEXT: s_sub_i32 s3, s3, s4 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; 
GFX9-NEXT: s_endpgm %shl.y = shl <2 x i32> , %y @@ -7046,8 +7072,8 @@ ; GFX6-NEXT: v_mul_lo_u32 v4, v1, s5 ; GFX6-NEXT: v_mul_lo_u32 v5, v0, s5 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v4, v2 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; GFX6-NEXT: v_mul_lo_u32 v3, v0, v2 ; GFX6-NEXT: v_mul_hi_u32 v4, v0, v5 ; GFX6-NEXT: v_mul_hi_u32 v6, v0, v2 @@ -7070,7 +7096,7 @@ ; GFX6-NEXT: v_mul_lo_u32 v4, v1, s5 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s4, s0 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; GFX6-NEXT: v_mul_lo_u32 v3, v0, s5 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; GFX6-NEXT: v_mul_lo_u32 v4, v0, v2 @@ -7151,116 +7177,130 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; GFX9-NEXT: v_madmk_f32 v0, v1, 0x438f8000, v0 ; GFX9-NEXT: v_rcp_f32_e32 v0, v0 -; GFX9-NEXT: s_movk_i32 s2, 0xfee0 -; GFX9-NEXT: s_mov_b32 s3, 0x68958c89 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; GFX9-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 ; GFX9-NEXT: v_trunc_f32_e32 v1, v1 ; GFX9-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX9-NEXT: v_mul_lo_u32 v2, v0, s2 -; GFX9-NEXT: v_mul_hi_u32 v3, v0, s3 -; GFX9-NEXT: v_mul_lo_u32 v5, v1, s3 -; GFX9-NEXT: v_mul_lo_u32 v4, v0, s3 -; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v5 -; GFX9-NEXT: v_mul_hi_u32 v3, v0, v4 -; GFX9-NEXT: v_mul_lo_u32 v5, v0, v2 -; GFX9-NEXT: v_mul_hi_u32 v6, v0, v2 -; GFX9-NEXT: v_mul_hi_u32 v7, v1, v2 -; GFX9-NEXT: v_mul_lo_u32 v2, v1, v2 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 -; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v6, vcc -; GFX9-NEXT: v_mul_lo_u32 v6, v1, v4 -; GFX9-NEXT: v_mul_hi_u32 v4, v1, v4 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v6 -; 
GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v5, v4, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v7, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v4, vcc -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc -; GFX9-NEXT: v_mul_lo_u32 v2, v0, s2 -; GFX9-NEXT: v_mul_hi_u32 v3, v0, s3 -; GFX9-NEXT: v_mul_lo_u32 v4, v1, s3 -; GFX9-NEXT: v_mul_lo_u32 v5, v0, s3 -; GFX9-NEXT: s_movk_i32 s2, 0x11f -; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v4 -; GFX9-NEXT: v_mul_lo_u32 v3, v0, v2 -; GFX9-NEXT: v_mul_hi_u32 v4, v0, v5 -; GFX9-NEXT: v_mul_hi_u32 v6, v0, v2 -; GFX9-NEXT: v_mul_hi_u32 v7, v1, v2 -; GFX9-NEXT: v_mul_lo_u32 v2, v1, v2 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v4, v3 -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v6, vcc -; GFX9-NEXT: v_mul_lo_u32 v6, v1, v5 -; GFX9-NEXT: v_mul_hi_u32 v5, v1, v5 -; GFX9-NEXT: s_mov_b32 s3, 0x976a7377 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v6 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v4, v5, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v7, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v4, vcc -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-NEXT: s_mul_i32 s1, s0, 0xfffffee0 +; GFX9-NEXT: s_mul_hi_u32 s2, s0, 0x68958c89 +; GFX9-NEXT: s_add_i32 s1, s2, s1 +; GFX9-NEXT: v_readfirstlane_b32 s2, v1 +; GFX9-NEXT: s_mul_i32 s3, s2, 0x68958c89 +; GFX9-NEXT: s_add_i32 s1, s1, s3 +; GFX9-NEXT: s_mul_i32 s9, s0, 0x68958c89 +; GFX9-NEXT: s_mul_hi_u32 s3, s0, s1 +; GFX9-NEXT: s_mul_i32 s8, s0, s1 +; GFX9-NEXT: s_mul_hi_u32 s0, s0, s9 +; GFX9-NEXT: s_add_u32 s0, s0, s8 +; GFX9-NEXT: s_addc_u32 s3, 0, s3 +; GFX9-NEXT: s_mul_hi_u32 s10, s2, s9 +; GFX9-NEXT: s_mul_i32 s9, s2, s9 +; GFX9-NEXT: s_add_u32 s0, s0, s9 +; GFX9-NEXT: s_mul_hi_u32 s8, s2, s1 +; GFX9-NEXT: 
s_addc_u32 s0, s3, s10 +; GFX9-NEXT: s_addc_u32 s3, s8, 0 +; GFX9-NEXT: s_mul_i32 s1, s2, s1 +; GFX9-NEXT: s_add_u32 s0, s0, s1 +; GFX9-NEXT: s_addc_u32 s1, 0, s3 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 +; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX9-NEXT: s_addc_u32 s0, s2, s1 +; GFX9-NEXT: v_readfirstlane_b32 s2, v0 +; GFX9-NEXT: s_mul_i32 s3, s2, 0xfffffee0 +; GFX9-NEXT: s_mul_hi_u32 s8, s2, 0x68958c89 +; GFX9-NEXT: s_mul_i32 s1, s0, 0x68958c89 +; GFX9-NEXT: s_add_i32 s3, s8, s3 +; GFX9-NEXT: s_add_i32 s3, s3, s1 +; GFX9-NEXT: s_mul_i32 s9, s2, 0x68958c89 +; GFX9-NEXT: s_mul_hi_u32 s1, s2, s3 +; GFX9-NEXT: s_mul_i32 s8, s2, s3 +; GFX9-NEXT: s_mul_hi_u32 s2, s2, s9 +; GFX9-NEXT: s_add_u32 s2, s2, s8 +; GFX9-NEXT: s_addc_u32 s1, 0, s1 +; GFX9-NEXT: s_mul_hi_u32 s10, s0, s9 +; GFX9-NEXT: s_mul_i32 s9, s0, s9 +; GFX9-NEXT: s_add_u32 s2, s2, s9 +; GFX9-NEXT: s_mul_hi_u32 s8, s0, s3 +; GFX9-NEXT: s_addc_u32 s1, s1, s10 +; GFX9-NEXT: s_addc_u32 s2, s8, 0 +; GFX9-NEXT: s_mul_i32 s3, s0, s3 +; GFX9-NEXT: s_add_u32 s1, s1, s3 +; GFX9-NEXT: s_addc_u32 s2, 0, s2 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s1, v0 +; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX9-NEXT: s_addc_u32 s0, s0, s2 +; GFX9-NEXT: v_readfirstlane_b32 s3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mul_lo_u32 v2, s6, v1 -; GFX9-NEXT: v_mul_hi_u32 v3, s6, v0 -; GFX9-NEXT: v_mul_hi_u32 v4, s6, v1 -; GFX9-NEXT: v_mul_hi_u32 v5, s7, v1 -; GFX9-NEXT: v_mul_lo_u32 v1, s7, v1 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v4, vcc -; GFX9-NEXT: v_mul_lo_u32 v4, s7, v0 -; GFX9-NEXT: v_mul_hi_u32 v0, s7, v0 -; GFX9-NEXT: v_mov_b32_e32 v6, 0x11f -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, v3, v0, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v5, vcc -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v2, vcc -; GFX9-NEXT: v_mul_lo_u32 v2, v0, s2 -; GFX9-NEXT: v_mul_hi_u32 v3, v0, s3 -; 
GFX9-NEXT: v_mul_lo_u32 v4, v1, s3 -; GFX9-NEXT: v_mov_b32_e32 v5, 0 -; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 -; GFX9-NEXT: v_mul_lo_u32 v3, v0, s3 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v4 -; GFX9-NEXT: v_sub_u32_e32 v4, s7, v2 -; GFX9-NEXT: v_sub_co_u32_e32 v3, vcc, s6, v3 -; GFX9-NEXT: v_subb_co_u32_e64 v4, s[0:1], v4, v6, vcc -; GFX9-NEXT: v_subrev_co_u32_e64 v6, s[0:1], s3, v3 -; GFX9-NEXT: v_subbrev_co_u32_e64 v4, s[0:1], 0, v4, s[0:1] -; GFX9-NEXT: s_movk_i32 s3, 0x11e -; GFX9-NEXT: v_cmp_lt_u32_e64 s[0:1], s3, v4 -; GFX9-NEXT: s_mov_b32 s6, 0x976a7376 -; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_lt_u32_e64 s[0:1], s6, v6 -; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s2, v4 -; GFX9-NEXT: v_cndmask_b32_e64 v4, v7, v6, s[0:1] -; GFX9-NEXT: v_add_co_u32_e64 v6, s[0:1], 2, v0 -; GFX9-NEXT: v_addc_co_u32_e64 v7, s[0:1], 0, v1, s[0:1] -; GFX9-NEXT: v_add_co_u32_e64 v8, s[0:1], 1, v0 -; GFX9-NEXT: v_addc_co_u32_e64 v9, s[0:1], 0, v1, s[0:1] -; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v4 -; GFX9-NEXT: v_cndmask_b32_e64 v4, v9, v7, s[0:1] -; GFX9-NEXT: v_mov_b32_e32 v7, s7 -; GFX9-NEXT: v_subb_co_u32_e32 v2, vcc, v7, v2, vcc -; GFX9-NEXT: v_cmp_lt_u32_e32 vcc, s3, v2 -; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc -; GFX9-NEXT: v_cmp_lt_u32_e32 vcc, s6, v3 -; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, s2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v7, v3, vcc -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GFX9-NEXT: v_cndmask_b32_e64 v2, v8, v6, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX9-NEXT: global_store_dwordx2 v5, v[0:1], s[4:5] +; GFX9-NEXT: s_mul_i32 s2, s6, s0 +; GFX9-NEXT: s_mul_hi_u32 s8, s6, s3 +; GFX9-NEXT: s_mul_hi_u32 s1, s6, s0 +; GFX9-NEXT: s_add_u32 s2, s8, s2 +; GFX9-NEXT: s_addc_u32 s1, 0, s1 +; GFX9-NEXT: s_mul_hi_u32 s9, s7, s3 +; GFX9-NEXT: s_mul_i32 s3, s7, s3 +; GFX9-NEXT: s_add_u32 s2, 
s2, s3 +; GFX9-NEXT: s_mul_hi_u32 s8, s7, s0 +; GFX9-NEXT: s_addc_u32 s1, s1, s9 +; GFX9-NEXT: s_addc_u32 s2, s8, 0 +; GFX9-NEXT: s_mul_i32 s0, s7, s0 +; GFX9-NEXT: s_add_u32 s3, s1, s0 +; GFX9-NEXT: s_addc_u32 s2, 0, s2 +; GFX9-NEXT: s_mul_i32 s0, s3, 0x11f +; GFX9-NEXT: s_mul_hi_u32 s8, s3, 0x976a7377 +; GFX9-NEXT: s_add_i32 s0, s8, s0 +; GFX9-NEXT: s_mul_i32 s8, s2, 0x976a7377 +; GFX9-NEXT: s_mul_i32 s9, s3, 0x976a7377 +; GFX9-NEXT: s_add_i32 s8, s0, s8 +; GFX9-NEXT: v_mov_b32_e32 v0, s9 +; GFX9-NEXT: s_sub_i32 s0, s7, s8 +; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s6, v0 +; GFX9-NEXT: s_mov_b32 s1, 0x976a7377 +; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX9-NEXT: s_subb_u32 s6, s0, 0x11f +; GFX9-NEXT: v_subrev_co_u32_e64 v1, s[0:1], s1, v0 +; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX9-NEXT: s_subb_u32 s6, s6, 0 +; GFX9-NEXT: s_cmpk_gt_u32 s6, 0x11e +; GFX9-NEXT: s_mov_b32 s10, 0x976a7376 +; GFX9-NEXT: s_cselect_b32 s9, -1, 0 +; GFX9-NEXT: v_cmp_lt_u32_e64 s[0:1], s10, v1 +; GFX9-NEXT: s_cmpk_eq_i32 s6, 0x11f +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v3, s9 +; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GFX9-NEXT: s_add_u32 s6, s3, 2 +; GFX9-NEXT: v_cndmask_b32_e64 v1, v3, v1, s[0:1] +; GFX9-NEXT: s_addc_u32 s0, s2, 0 +; GFX9-NEXT: s_add_u32 s9, s3, 1 +; GFX9-NEXT: s_addc_u32 s1, s2, 0 +; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX9-NEXT: s_subb_u32 s7, s7, s8 +; GFX9-NEXT: s_cmpk_gt_u32 s7, 0x11e +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_mov_b32_e32 v4, s0 +; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v1 +; GFX9-NEXT: s_cselect_b32 s8, -1, 0 +; GFX9-NEXT: v_cmp_lt_u32_e32 vcc, s10, v0 +; GFX9-NEXT: s_cmpk_eq_i32 s7, 0x11f +; GFX9-NEXT: v_cndmask_b32_e64 v1, v3, v4, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc +; GFX9-NEXT: v_mov_b32_e32 v3, s8 +; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc +; GFX9-NEXT: v_mov_b32_e32 v3, s2 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GFX9-NEXT: 
v_cndmask_b32_e32 v1, v3, v1, vcc +; GFX9-NEXT: v_mov_b32_e32 v0, s9 +; GFX9-NEXT: v_mov_b32_e32 v3, s6 +; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v3, s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX9-NEXT: s_endpgm %r = udiv i64 %x, 1235195949943 store i64 %r, i64 addrspace(1)* %out @@ -7421,8 +7461,8 @@ ; GFX6-NEXT: v_mul_hi_u32 v2, v0, s6 ; GFX6-NEXT: v_mul_lo_u32 v4, v1, s6 ; GFX6-NEXT: v_mul_lo_u32 v3, v0, s6 -; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, v0, v2 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v4, v2 +; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v0 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; GFX6-NEXT: v_mul_hi_u32 v5, v0, v3 ; GFX6-NEXT: v_mul_lo_u32 v4, v0, v2 ; GFX6-NEXT: v_mul_hi_u32 v6, v0, v2 @@ -7443,8 +7483,8 @@ ; GFX6-NEXT: v_mul_lo_u32 v3, v1, s6 ; GFX6-NEXT: v_mul_lo_u32 v4, v0, s6 ; GFX6-NEXT: s_mov_b32 s6, -1 -; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, v0, v2 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 +; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v0 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GFX6-NEXT: v_mul_lo_u32 v3, v0, v2 ; GFX6-NEXT: v_mul_hi_u32 v5, v0, v4 ; GFX6-NEXT: v_mul_hi_u32 v6, v0, v2 @@ -7702,8 +7742,8 @@ ; GFX6-NEXT: v_mul_lo_u32 v4, v1, s3 ; GFX6-NEXT: v_mul_lo_u32 v5, v0, s3 ; GFX6-NEXT: s_mov_b32 s12, 0x9761f7c9 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v4, v2 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; GFX6-NEXT: v_mul_lo_u32 v3, v0, v2 ; GFX6-NEXT: v_mul_hi_u32 v4, v0, v5 ; GFX6-NEXT: v_mul_hi_u32 v6, v0, v2 @@ -7727,7 +7767,7 @@ ; GFX6-NEXT: v_mul_lo_u32 v4, v1, s3 ; GFX6-NEXT: s_mov_b32 s11, 0xf000 ; GFX6-NEXT: s_mov_b32 s10, -1 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; GFX6-NEXT: v_mul_lo_u32 v3, v0, s3 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; GFX6-NEXT: v_mul_lo_u32 v4, v0, v2 @@ -7764,8 +7804,8 @@ ; 
GFX6-NEXT: v_mul_hi_u32 v3, v0, s12 ; GFX6-NEXT: v_mul_lo_u32 v1, v1, s12 ; GFX6-NEXT: v_mul_lo_u32 v0, v0, s12 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GFX6-NEXT: v_add_i32_e32 v1, vcc, v2, v1 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 +; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v2 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s7, v1 ; GFX6-NEXT: v_mov_b32_e32 v3, 0x11f ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s6, v0 @@ -7805,115 +7845,128 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; GFX9-NEXT: v_madmk_f32 v0, v1, 0x438f8000, v0 ; GFX9-NEXT: v_rcp_f32_e32 v0, v0 -; GFX9-NEXT: s_movk_i32 s2, 0xfee0 -; GFX9-NEXT: s_mov_b32 s3, 0x689e0837 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b32 s12, 0x9761f7c8 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; GFX9-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 ; GFX9-NEXT: v_trunc_f32_e32 v1, v1 ; GFX9-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX9-NEXT: s_movk_i32 s8, 0x11f -; GFX9-NEXT: s_mov_b32 s9, 0x9761f7c9 -; GFX9-NEXT: v_mul_lo_u32 v2, v0, s2 -; GFX9-NEXT: v_mul_hi_u32 v3, v0, s3 -; GFX9-NEXT: v_mul_lo_u32 v5, v1, s3 -; GFX9-NEXT: v_mul_lo_u32 v4, v0, s3 -; GFX9-NEXT: s_mov_b32 s10, 0x9761f7c8 -; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v5 -; GFX9-NEXT: v_mul_hi_u32 v3, v0, v4 -; GFX9-NEXT: v_mul_lo_u32 v5, v0, v2 -; GFX9-NEXT: v_mul_hi_u32 v6, v0, v2 -; GFX9-NEXT: v_mul_hi_u32 v7, v1, v2 -; GFX9-NEXT: v_mul_lo_u32 v2, v1, v2 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 -; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v6, vcc -; GFX9-NEXT: v_mul_lo_u32 v6, v1, v4 -; GFX9-NEXT: v_mul_hi_u32 v4, v1, v4 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v6 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v5, v4, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v7, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v4, vcc -; GFX9-NEXT: v_add_co_u32_e32 
v0, vcc, v0, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc -; GFX9-NEXT: v_mul_lo_u32 v2, v0, s2 -; GFX9-NEXT: v_mul_hi_u32 v3, v0, s3 -; GFX9-NEXT: v_mul_lo_u32 v4, v1, s3 -; GFX9-NEXT: v_mul_lo_u32 v5, v0, s3 -; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v4 -; GFX9-NEXT: v_mul_lo_u32 v3, v0, v2 -; GFX9-NEXT: v_mul_hi_u32 v4, v0, v5 -; GFX9-NEXT: v_mul_hi_u32 v6, v0, v2 -; GFX9-NEXT: v_mul_hi_u32 v7, v1, v2 -; GFX9-NEXT: v_mul_lo_u32 v2, v1, v2 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v4, v3 -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v6, vcc -; GFX9-NEXT: v_mul_lo_u32 v6, v1, v5 -; GFX9-NEXT: v_mul_hi_u32 v5, v1, v5 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v6 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v4, v5, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v7, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v4, vcc -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-NEXT: s_mul_i32 s1, s0, 0xfffffee0 +; GFX9-NEXT: s_mul_hi_u32 s2, s0, 0x689e0837 +; GFX9-NEXT: s_add_i32 s1, s2, s1 +; GFX9-NEXT: v_readfirstlane_b32 s2, v1 +; GFX9-NEXT: s_mul_i32 s3, s2, 0x689e0837 +; GFX9-NEXT: s_add_i32 s1, s1, s3 +; GFX9-NEXT: s_mul_i32 s9, s0, 0x689e0837 +; GFX9-NEXT: s_mul_hi_u32 s3, s0, s1 +; GFX9-NEXT: s_mul_i32 s8, s0, s1 +; GFX9-NEXT: s_mul_hi_u32 s0, s0, s9 +; GFX9-NEXT: s_add_u32 s0, s0, s8 +; GFX9-NEXT: s_addc_u32 s3, 0, s3 +; GFX9-NEXT: s_mul_hi_u32 s10, s2, s9 +; GFX9-NEXT: s_mul_i32 s9, s2, s9 +; GFX9-NEXT: s_add_u32 s0, s0, s9 +; GFX9-NEXT: s_mul_hi_u32 s8, s2, s1 +; GFX9-NEXT: s_addc_u32 s0, s3, s10 +; GFX9-NEXT: s_addc_u32 s3, s8, 0 +; GFX9-NEXT: s_mul_i32 s1, s2, s1 +; GFX9-NEXT: s_add_u32 s0, s0, s1 +; GFX9-NEXT: s_addc_u32 s1, 0, s3 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 +; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX9-NEXT: s_addc_u32 s0, s2, s1 +; GFX9-NEXT: v_readfirstlane_b32 
s2, v0 +; GFX9-NEXT: s_mul_i32 s3, s2, 0xfffffee0 +; GFX9-NEXT: s_mul_hi_u32 s8, s2, 0x689e0837 +; GFX9-NEXT: s_mul_i32 s1, s0, 0x689e0837 +; GFX9-NEXT: s_add_i32 s3, s8, s3 +; GFX9-NEXT: s_add_i32 s3, s3, s1 +; GFX9-NEXT: s_mul_i32 s9, s2, 0x689e0837 +; GFX9-NEXT: s_mul_hi_u32 s1, s2, s3 +; GFX9-NEXT: s_mul_i32 s8, s2, s3 +; GFX9-NEXT: s_mul_hi_u32 s2, s2, s9 +; GFX9-NEXT: s_add_u32 s2, s2, s8 +; GFX9-NEXT: s_addc_u32 s1, 0, s1 +; GFX9-NEXT: s_mul_hi_u32 s10, s0, s9 +; GFX9-NEXT: s_mul_i32 s9, s0, s9 +; GFX9-NEXT: s_add_u32 s2, s2, s9 +; GFX9-NEXT: s_mul_hi_u32 s8, s0, s3 +; GFX9-NEXT: s_addc_u32 s1, s1, s10 +; GFX9-NEXT: s_addc_u32 s2, s8, 0 +; GFX9-NEXT: s_mul_i32 s3, s0, s3 +; GFX9-NEXT: s_add_u32 s1, s1, s3 +; GFX9-NEXT: s_addc_u32 s2, 0, s2 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s1, v0 +; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX9-NEXT: s_addc_u32 s0, s0, s2 +; GFX9-NEXT: v_readfirstlane_b32 s3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mul_lo_u32 v2, s6, v1 -; GFX9-NEXT: v_mul_hi_u32 v3, s6, v0 -; GFX9-NEXT: v_mul_hi_u32 v4, s6, v1 -; GFX9-NEXT: v_mul_hi_u32 v5, s7, v1 -; GFX9-NEXT: v_mul_lo_u32 v1, s7, v1 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v4, vcc -; GFX9-NEXT: v_mul_lo_u32 v4, s7, v0 -; GFX9-NEXT: v_mul_hi_u32 v0, s7, v0 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, v3, v0, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v5, vcc -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v2, vcc -; GFX9-NEXT: v_mul_lo_u32 v2, v0, s8 -; GFX9-NEXT: v_mul_hi_u32 v3, v0, s9 -; GFX9-NEXT: v_mul_lo_u32 v1, v1, s9 -; GFX9-NEXT: v_mul_lo_u32 v0, v0, s9 -; GFX9-NEXT: v_mov_b32_e32 v4, 0 -; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v1, v2, v1 -; GFX9-NEXT: v_sub_u32_e32 v2, s7, v1 -; GFX9-NEXT: v_mov_b32_e32 v3, 0x11f +; GFX9-NEXT: s_mul_i32 s2, s6, s0 +; GFX9-NEXT: s_mul_hi_u32 s8, s6, s3 +; GFX9-NEXT: 
s_mul_hi_u32 s1, s6, s0 +; GFX9-NEXT: s_add_u32 s2, s8, s2 +; GFX9-NEXT: s_addc_u32 s1, 0, s1 +; GFX9-NEXT: s_mul_hi_u32 s9, s7, s3 +; GFX9-NEXT: s_mul_i32 s3, s7, s3 +; GFX9-NEXT: s_add_u32 s2, s2, s3 +; GFX9-NEXT: s_mul_hi_u32 s8, s7, s0 +; GFX9-NEXT: s_addc_u32 s1, s1, s9 +; GFX9-NEXT: s_addc_u32 s2, s8, 0 +; GFX9-NEXT: s_mul_i32 s0, s7, s0 +; GFX9-NEXT: s_add_u32 s0, s1, s0 +; GFX9-NEXT: s_addc_u32 s1, 0, s2 +; GFX9-NEXT: s_mul_i32 s2, s0, 0x11f +; GFX9-NEXT: s_mul_hi_u32 s3, s0, 0x9761f7c9 +; GFX9-NEXT: s_add_i32 s2, s3, s2 +; GFX9-NEXT: s_mul_i32 s1, s1, 0x9761f7c9 +; GFX9-NEXT: s_mul_i32 s0, s0, 0x9761f7c9 +; GFX9-NEXT: s_add_i32 s9, s2, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: s_sub_i32 s1, s7, s9 ; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s6, v0 -; GFX9-NEXT: v_subb_co_u32_e64 v2, s[0:1], v2, v3, vcc -; GFX9-NEXT: v_subrev_co_u32_e64 v5, s[0:1], s9, v0 -; GFX9-NEXT: v_subbrev_co_u32_e64 v6, s[2:3], 0, v2, s[0:1] -; GFX9-NEXT: s_movk_i32 s6, 0x11e -; GFX9-NEXT: v_cmp_lt_u32_e64 s[2:3], s6, v6 -; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[2:3] -; GFX9-NEXT: v_cmp_lt_u32_e64 s[2:3], s10, v5 -; GFX9-NEXT: v_subb_co_u32_e64 v2, s[0:1], v2, v3, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[2:3] -; GFX9-NEXT: v_cmp_eq_u32_e64 s[2:3], s8, v6 -; GFX9-NEXT: v_subrev_co_u32_e64 v3, s[0:1], s9, v5 -; GFX9-NEXT: v_cndmask_b32_e64 v7, v7, v8, s[2:3] -; GFX9-NEXT: v_subbrev_co_u32_e64 v2, s[0:1], 0, v2, s[0:1] -; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v7 -; GFX9-NEXT: v_cndmask_b32_e64 v2, v6, v2, s[0:1] -; GFX9-NEXT: v_mov_b32_e32 v6, s7 -; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v6, v1, vcc -; GFX9-NEXT: v_cmp_lt_u32_e32 vcc, s6, v1 -; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc -; GFX9-NEXT: v_cmp_lt_u32_e32 vcc, s10, v0 -; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, s8, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v6, v6, v7, vcc -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GFX9-NEXT: 
v_cndmask_b32_e64 v2, v5, v3, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[4:5] +; GFX9-NEXT: s_mov_b32 s8, 0x9761f7c9 +; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX9-NEXT: s_subb_u32 s6, s1, 0x11f +; GFX9-NEXT: v_subrev_co_u32_e64 v3, s[0:1], s8, v0 +; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX9-NEXT: s_subb_u32 s10, s6, 0 +; GFX9-NEXT: s_cmpk_gt_u32 s10, 0x11e +; GFX9-NEXT: s_cselect_b32 s11, -1, 0 +; GFX9-NEXT: v_cmp_lt_u32_e64 s[2:3], s12, v3 +; GFX9-NEXT: s_cmpk_eq_i32 s10, 0x11f +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v4, s11 +; GFX9-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX9-NEXT: v_cndmask_b32_e64 v1, v4, v1, s[2:3] +; GFX9-NEXT: s_subb_u32 s2, s6, 0x11f +; GFX9-NEXT: v_subrev_co_u32_e64 v4, s[0:1], s8, v3 +; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX9-NEXT: s_subb_u32 s0, s2, 0 +; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX9-NEXT: s_subb_u32 s2, s7, s9 +; GFX9-NEXT: s_cmpk_gt_u32 s2, 0x11e +; GFX9-NEXT: v_mov_b32_e32 v5, s10 +; GFX9-NEXT: v_mov_b32_e32 v6, s0 +; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v1 +; GFX9-NEXT: s_cselect_b32 s3, -1, 0 +; GFX9-NEXT: v_cmp_lt_u32_e32 vcc, s12, v0 +; GFX9-NEXT: s_cmpk_eq_i32 s2, 0x11f +; GFX9-NEXT: v_cndmask_b32_e64 v1, v5, v6, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc +; GFX9-NEXT: v_mov_b32_e32 v6, s3 +; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 +; GFX9-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc +; GFX9-NEXT: v_mov_b32_e32 v6, s2 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 +; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v4, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v1, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX9-NEXT: s_endpgm %r = urem i64 %x, 1235195393993 store i64 %r, i64 addrspace(1)* %out @@ -8138,8 +8191,8 @@ ; GFX6-NEXT: v_mul_lo_u32 v4, v0, s5 ; GFX6-NEXT: s_mov_b32 s9, s8 ; GFX6-NEXT: s_addc_u32 s3, s3, s8 -; GFX6-NEXT: 
v_add_i32_e32 v2, vcc, v3, v2 -; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, v0, v2 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 +; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v0 ; GFX6-NEXT: v_mul_hi_u32 v3, v0, v4 ; GFX6-NEXT: v_mul_lo_u32 v5, v0, v2 ; GFX6-NEXT: v_mul_hi_u32 v6, v0, v2 @@ -8162,7 +8215,7 @@ ; GFX6-NEXT: v_mul_hi_u32 v3, v0, s5 ; GFX6-NEXT: s_mov_b32 s0, 0x12d8fb ; GFX6-NEXT: s_mov_b32 s6, -1 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; GFX6-NEXT: v_mul_lo_u32 v3, v0, s5 ; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, v0, v2 ; GFX6-NEXT: v_mul_lo_u32 v6, v0, v2 @@ -8434,8 +8487,8 @@ ; GFX6-NEXT: v_mul_lo_u32 v5, s5, v0 ; GFX6-NEXT: v_mul_lo_u32 v4, s4, v0 ; GFX6-NEXT: s_addc_u32 s3, s3, s12 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v5 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v5, v2 ; GFX6-NEXT: v_mul_hi_u32 v3, v0, v4 ; GFX6-NEXT: v_mul_lo_u32 v5, v0, v2 ; GFX6-NEXT: v_mul_hi_u32 v6, v0, v2 @@ -8495,9 +8548,9 @@ ; GFX6-NEXT: v_mul_hi_u32 v3, s10, v0 ; GFX6-NEXT: v_mul_lo_u32 v4, s11, v0 ; GFX6-NEXT: v_mov_b32_e32 v5, s11 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; GFX6-NEXT: v_mul_lo_u32 v3, s10, v0 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v4, v2 ; GFX6-NEXT: v_sub_i32_e32 v4, vcc, s3, v2 ; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s2, v3 ; GFX6-NEXT: v_subb_u32_e64 v4, s[0:1], v4, v5, vcc @@ -8549,125 +8602,143 @@ ; GFX9-NEXT: s_xor_b64 s[8:9], s[4:5], s[2:3] ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s8 ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s9 -; GFX9-NEXT: s_sub_u32 s10, 0, s8 -; GFX9-NEXT: s_subb_u32 s4, 0, s9 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_sub_u32 s0, 0, s8 +; GFX9-NEXT: s_subb_u32 s1, 0, s9 ; GFX9-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 -; GFX9-NEXT: v_rcp_f32_e32 v0, v0 -; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 -; GFX9-NEXT: 
v_mul_f32_e32 v1, 0x2f800000, v0 -; GFX9-NEXT: v_trunc_f32_e32 v1, v1 -; GFX9-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 +; GFX9-NEXT: v_rcp_f32_e32 v1, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_mul_f32_e32 v1, 0x5f7ffffc, v1 +; GFX9-NEXT: v_mul_f32_e32 v2, 0x2f800000, v1 +; GFX9-NEXT: v_trunc_f32_e32 v2, v2 +; GFX9-NEXT: v_mac_f32_e32 v1, 0xcf800000, v2 +; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX9-NEXT: v_mul_lo_u32 v2, s10, v1 -; GFX9-NEXT: v_mul_hi_u32 v3, s10, v0 -; GFX9-NEXT: v_mul_lo_u32 v5, s4, v0 -; GFX9-NEXT: v_mul_lo_u32 v4, s10, v0 -; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v5 -; GFX9-NEXT: v_mul_hi_u32 v3, v0, v4 -; GFX9-NEXT: v_mul_lo_u32 v5, v0, v2 -; GFX9-NEXT: v_mul_hi_u32 v7, v0, v2 -; GFX9-NEXT: v_mul_hi_u32 v6, v1, v4 -; GFX9-NEXT: v_mul_lo_u32 v4, v1, v4 -; GFX9-NEXT: v_mul_hi_u32 v8, v1, v2 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 -; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v7, vcc -; GFX9-NEXT: v_mul_lo_u32 v2, v1, v2 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v5, v6, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v8, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v4, vcc -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc -; GFX9-NEXT: v_mul_lo_u32 v2, s10, v1 -; GFX9-NEXT: v_mul_hi_u32 v3, s10, v0 -; GFX9-NEXT: v_mul_lo_u32 v4, s4, v0 -; GFX9-NEXT: v_mul_lo_u32 v5, s10, v0 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v4 -; GFX9-NEXT: v_mul_lo_u32 v6, v0, v2 -; GFX9-NEXT: v_mul_hi_u32 v7, v0, v5 -; GFX9-NEXT: v_mul_hi_u32 v8, v0, v2 -; GFX9-NEXT: v_mul_hi_u32 v4, v1, v5 -; GFX9-NEXT: v_mul_lo_u32 v5, v1, v5 -; GFX9-NEXT: v_mul_hi_u32 v3, v1, v2 -; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v7, v6 -; 
GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v8, vcc -; GFX9-NEXT: v_mul_lo_u32 v2, v1, v2 -; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v6, v5 -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v7, v4, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v4, v2 +; GFX9-NEXT: v_readfirstlane_b32 s10, v2 +; GFX9-NEXT: v_readfirstlane_b32 s11, v1 +; GFX9-NEXT: s_mul_i32 s12, s0, s10 +; GFX9-NEXT: s_mul_hi_u32 s14, s0, s11 +; GFX9-NEXT: s_mul_i32 s13, s1, s11 +; GFX9-NEXT: s_add_i32 s12, s14, s12 +; GFX9-NEXT: s_add_i32 s12, s12, s13 +; GFX9-NEXT: s_mul_i32 s15, s0, s11 +; GFX9-NEXT: s_mul_hi_u32 s13, s11, s12 +; GFX9-NEXT: s_mul_i32 s14, s11, s12 +; GFX9-NEXT: s_mul_hi_u32 s11, s11, s15 +; GFX9-NEXT: s_add_u32 s11, s11, s14 +; GFX9-NEXT: s_addc_u32 s13, 0, s13 +; GFX9-NEXT: s_mul_hi_u32 s16, s10, s15 +; GFX9-NEXT: s_mul_i32 s15, s10, s15 +; GFX9-NEXT: s_add_u32 s11, s11, s15 +; GFX9-NEXT: s_mul_hi_u32 s14, s10, s12 +; GFX9-NEXT: s_addc_u32 s11, s13, s16 +; GFX9-NEXT: s_addc_u32 s13, s14, 0 +; GFX9-NEXT: s_mul_i32 s12, s10, s12 +; GFX9-NEXT: s_add_u32 s11, s11, s12 +; GFX9-NEXT: s_addc_u32 s12, 0, s13 +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, s11, v1 +; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX9-NEXT: s_addc_u32 s10, s10, s12 +; GFX9-NEXT: v_readfirstlane_b32 s12, v1 +; GFX9-NEXT: s_mul_i32 s11, s0, s10 +; GFX9-NEXT: s_mul_hi_u32 s13, s0, s12 +; GFX9-NEXT: s_add_i32 s11, s13, s11 +; GFX9-NEXT: s_mul_i32 s1, s1, s12 +; GFX9-NEXT: s_add_i32 s11, s11, s1 +; GFX9-NEXT: s_mul_i32 s0, s0, s12 +; GFX9-NEXT: s_mul_hi_u32 s13, s10, s0 +; GFX9-NEXT: s_mul_i32 s14, s10, s0 +; GFX9-NEXT: s_mul_i32 s16, s12, s11 +; GFX9-NEXT: s_mul_hi_u32 s0, s12, s0 +; GFX9-NEXT: s_mul_hi_u32 s15, s12, s11 +; GFX9-NEXT: s_add_u32 s0, s0, s16 +; GFX9-NEXT: s_addc_u32 s12, 0, s15 +; GFX9-NEXT: s_add_u32 s0, s0, s14 +; GFX9-NEXT: s_mul_hi_u32 s1, s10, s11 +; GFX9-NEXT: s_addc_u32 s0, s12, s13 +; GFX9-NEXT: s_addc_u32 s1, s1, 0 +; GFX9-NEXT: s_mul_i32 s11, s10, s11 +; GFX9-NEXT: 
s_add_u32 s0, s0, s11 +; GFX9-NEXT: s_addc_u32 s1, 0, s1 +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, s0, v1 +; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX9-NEXT: s_addc_u32 s12, s10, s1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_ashr_i32 s10, s7, 31 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_add_u32 s0, s6, s10 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 ; GFX9-NEXT: s_mov_b32 s11, s10 ; GFX9-NEXT: s_addc_u32 s1, s7, s10 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc ; GFX9-NEXT: s_xor_b64 s[6:7], s[0:1], s[10:11] -; GFX9-NEXT: v_mul_lo_u32 v2, s6, v1 -; GFX9-NEXT: v_mul_hi_u32 v3, s6, v0 -; GFX9-NEXT: v_mul_hi_u32 v4, s6, v1 -; GFX9-NEXT: v_mul_hi_u32 v5, s7, v1 -; GFX9-NEXT: v_mul_lo_u32 v1, s7, v1 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v4, vcc -; GFX9-NEXT: v_mul_lo_u32 v4, s7, v0 -; GFX9-NEXT: v_mul_hi_u32 v0, s7, v0 -; GFX9-NEXT: v_mov_b32_e32 v6, s9 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, v3, v0, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v5, vcc -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v2, vcc -; GFX9-NEXT: v_mul_lo_u32 v2, s8, v1 -; GFX9-NEXT: v_mul_hi_u32 v3, s8, v0 -; GFX9-NEXT: v_mul_lo_u32 v4, s9, v0 -; GFX9-NEXT: v_mov_b32_e32 v5, 0 -; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 -; GFX9-NEXT: v_mul_lo_u32 v3, s8, v0 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v4 -; GFX9-NEXT: v_sub_u32_e32 v4, s7, v2 -; GFX9-NEXT: v_sub_co_u32_e32 v3, vcc, s6, v3 -; GFX9-NEXT: v_subb_co_u32_e64 v4, s[0:1], v4, v6, vcc -; GFX9-NEXT: v_subrev_co_u32_e64 v6, s[0:1], s8, v3 -; GFX9-NEXT: v_subbrev_co_u32_e64 v4, s[0:1], 0, v4, s[0:1] -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v4 -; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v6 -; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v4 -; GFX9-NEXT: v_cndmask_b32_e64 v4, v7, 
v6, s[0:1] -; GFX9-NEXT: v_add_co_u32_e64 v6, s[0:1], 2, v0 -; GFX9-NEXT: v_addc_co_u32_e64 v7, s[0:1], 0, v1, s[0:1] -; GFX9-NEXT: v_add_co_u32_e64 v8, s[0:1], 1, v0 -; GFX9-NEXT: v_addc_co_u32_e64 v9, s[0:1], 0, v1, s[0:1] -; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v4 -; GFX9-NEXT: v_cndmask_b32_e64 v4, v9, v7, s[0:1] -; GFX9-NEXT: v_mov_b32_e32 v7, s7 -; GFX9-NEXT: v_subb_co_u32_e32 v2, vcc, v7, v2, vcc -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s9, v2 -; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s8, v3 -; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, s9, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v7, v3, vcc -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GFX9-NEXT: v_cndmask_b32_e64 v2, v8, v6, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX9-NEXT: v_readfirstlane_b32 s13, v1 +; GFX9-NEXT: s_mul_i32 s1, s6, s12 +; GFX9-NEXT: s_mul_hi_u32 s14, s6, s13 +; GFX9-NEXT: s_mul_hi_u32 s0, s6, s12 +; GFX9-NEXT: s_add_u32 s1, s14, s1 +; GFX9-NEXT: s_addc_u32 s0, 0, s0 +; GFX9-NEXT: s_mul_hi_u32 s15, s7, s13 +; GFX9-NEXT: s_mul_i32 s13, s7, s13 +; GFX9-NEXT: s_add_u32 s1, s1, s13 +; GFX9-NEXT: s_mul_hi_u32 s14, s7, s12 +; GFX9-NEXT: s_addc_u32 s0, s0, s15 +; GFX9-NEXT: s_addc_u32 s1, s14, 0 +; GFX9-NEXT: s_mul_i32 s12, s7, s12 +; GFX9-NEXT: s_add_u32 s12, s0, s12 +; GFX9-NEXT: s_addc_u32 s13, 0, s1 +; GFX9-NEXT: s_mul_i32 s0, s8, s13 +; GFX9-NEXT: s_mul_hi_u32 s1, s8, s12 +; GFX9-NEXT: s_add_i32 s0, s1, s0 +; GFX9-NEXT: s_mul_i32 s1, s9, s12 +; GFX9-NEXT: s_add_i32 s14, s0, s1 +; GFX9-NEXT: s_mul_i32 s1, s8, s12 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: s_sub_i32 s0, s7, s14 +; GFX9-NEXT: v_sub_co_u32_e32 v1, vcc, s6, v1 +; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX9-NEXT: s_subb_u32 s6, s0, s9 +; GFX9-NEXT: v_subrev_co_u32_e64 v2, s[0:1], s8, v1 +; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX9-NEXT: s_subb_u32 s6, s6, 0 +; GFX9-NEXT: s_cmp_ge_u32 s6, s9 +; GFX9-NEXT: s_cselect_b32 s15, -1, 0 +; 
GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v2 +; GFX9-NEXT: s_cmp_eq_u32 s6, s9 +; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v3, s15 +; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GFX9-NEXT: s_add_u32 s6, s12, 2 +; GFX9-NEXT: v_cndmask_b32_e64 v2, v3, v2, s[0:1] +; GFX9-NEXT: s_addc_u32 s0, s13, 0 +; GFX9-NEXT: s_add_u32 s15, s12, 1 +; GFX9-NEXT: s_addc_u32 s1, s13, 0 +; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX9-NEXT: s_subb_u32 s7, s7, s14 +; GFX9-NEXT: s_cmp_ge_u32 s7, s9 +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_mov_b32_e32 v4, s0 +; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v2 +; GFX9-NEXT: s_cselect_b32 s14, -1, 0 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s8, v1 +; GFX9-NEXT: s_cmp_eq_u32 s7, s9 +; GFX9-NEXT: v_cndmask_b32_e64 v2, v3, v4, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc +; GFX9-NEXT: v_mov_b32_e32 v3, s14 +; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GFX9-NEXT: v_mov_b32_e32 v3, s13 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc +; GFX9-NEXT: v_mov_b32_e32 v2, s15 +; GFX9-NEXT: v_mov_b32_e32 v3, s6 +; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v3, s12 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc ; GFX9-NEXT: s_xor_b64 s[0:1], s[10:11], s[2:3] -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc -; GFX9-NEXT: v_xor_b32_e32 v0, s0, v0 -; GFX9-NEXT: v_xor_b32_e32 v1, s1, v1 -; GFX9-NEXT: v_mov_b32_e32 v2, s1 -; GFX9-NEXT: v_subrev_co_u32_e32 v0, vcc, s0, v0 -; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v2, vcc -; GFX9-NEXT: global_store_dwordx2 v5, v[0:1], s[4:5] +; GFX9-NEXT: v_xor_b32_e32 v2, s0, v2 +; GFX9-NEXT: v_xor_b32_e32 v3, s1, v1 +; GFX9-NEXT: v_mov_b32_e32 v4, s1 +; GFX9-NEXT: v_subrev_co_u32_e32 v1, vcc, s0, v2 +; GFX9-NEXT: v_subb_co_u32_e32 v2, vcc, v3, v4, vcc +; GFX9-NEXT: global_store_dwordx2 v0, v[1:2], s[4:5] ; GFX9-NEXT: s_endpgm %shl.y = shl i64 4096, %y %r = sdiv i64 %x, %shl.y @@ 
-8772,9 +8843,9 @@ ; GFX6-NEXT: s_add_u32 s0, s0, s8 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_ashr_i64 s[8:9], s[0:1], 12 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GFX6-NEXT: v_mul_lo_u32 v3, v0, s6 -; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, v0, v2 +; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v0 ; GFX6-NEXT: v_mul_lo_u32 v4, v0, v2 ; GFX6-NEXT: v_mul_hi_u32 v5, v0, v3 ; GFX6-NEXT: v_mul_hi_u32 v6, v0, v2 @@ -8798,7 +8869,7 @@ ; GFX6-NEXT: s_mov_b32 s11, s10 ; GFX6-NEXT: s_addc_u32 s1, s3, s10 ; GFX6-NEXT: s_xor_b64 s[0:1], s[0:1], s[10:11] -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; GFX6-NEXT: v_mul_lo_u32 v3, v0, s6 ; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, v0, v2 ; GFX6-NEXT: v_mul_lo_u32 v6, v0, v2 @@ -9046,8 +9117,8 @@ ; GFX6-NEXT: v_mul_lo_u32 v5, s11, v0 ; GFX6-NEXT: v_mul_lo_u32 v4, s10, v0 ; GFX6-NEXT: s_xor_b64 s[14:15], s[16:17], s[14:15] -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v5 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v5, v2 ; GFX6-NEXT: v_mul_hi_u32 v3, v0, v4 ; GFX6-NEXT: v_mul_lo_u32 v5, v0, v2 ; GFX6-NEXT: v_mul_hi_u32 v7, v0, v2 @@ -9106,9 +9177,9 @@ ; GFX6-NEXT: v_mul_hi_u32 v3, s12, v0 ; GFX6-NEXT: v_mul_lo_u32 v4, s13, v0 ; GFX6-NEXT: v_mov_b32_e32 v5, s13 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; GFX6-NEXT: v_mul_lo_u32 v3, s12, v0 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v4, v2 ; GFX6-NEXT: v_sub_i32_e32 v4, vcc, s5, v2 ; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s4, v3 ; GFX6-NEXT: v_subb_u32_e64 v4, s[0:1], v4, v5, vcc @@ -9159,9 +9230,9 @@ ; GFX6-NEXT: s_subb_u32 s1, 0, s3 ; GFX6-NEXT: v_mul_lo_u32 v6, s1, v3 ; GFX6-NEXT: s_ashr_i32 s12, s7, 31 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v5 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v5, v2 ; GFX6-NEXT: v_mul_lo_u32 v5, s0, v3 -; GFX6-NEXT: 
v_add_i32_e32 v2, vcc, v2, v6 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v6, v2 ; GFX6-NEXT: v_mul_lo_u32 v6, v3, v2 ; GFX6-NEXT: v_mul_hi_u32 v7, v3, v5 ; GFX6-NEXT: v_mul_hi_u32 v8, v3, v2 @@ -9273,257 +9344,293 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 ; GFX9-NEXT: s_mov_b64 s[2:3], 0x1000 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b64 s[10:11], s[2:3], s10 ; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], s8 -; GFX9-NEXT: s_ashr_i32 s12, s3, 31 -; GFX9-NEXT: s_add_u32 s2, s2, s12 -; GFX9-NEXT: s_mov_b32 s13, s12 -; GFX9-NEXT: s_addc_u32 s3, s3, s12 -; GFX9-NEXT: s_xor_b64 s[8:9], s[2:3], s[12:13] -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s8 -; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s9 -; GFX9-NEXT: s_sub_u32 s2, 0, s8 -; GFX9-NEXT: s_subb_u32 s3, 0, s9 -; GFX9-NEXT: s_ashr_i32 s14, s5, 31 -; GFX9-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 -; GFX9-NEXT: v_rcp_f32_e32 v0, v0 -; GFX9-NEXT: s_mov_b32 s15, s14 +; GFX9-NEXT: s_ashr_i32 s8, s3, 31 +; GFX9-NEXT: s_add_u32 s2, s2, s8 +; GFX9-NEXT: s_mov_b32 s9, s8 +; GFX9-NEXT: s_addc_u32 s3, s3, s8 +; GFX9-NEXT: s_xor_b64 s[12:13], s[2:3], s[8:9] +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s12 +; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s13 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_sub_u32 s0, 0, s12 +; GFX9-NEXT: s_subb_u32 s1, 0, s13 +; GFX9-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 +; GFX9-NEXT: v_rcp_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; GFX9-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 ; GFX9-NEXT: v_trunc_f32_e32 v1, v1 ; GFX9-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX9-NEXT: v_mul_lo_u32 v2, s2, v1 -; GFX9-NEXT: v_mul_hi_u32 v3, s2, v0 -; GFX9-NEXT: v_mul_lo_u32 v5, s3, v0 -; GFX9-NEXT: v_mul_lo_u32 v4, s2, v0 -; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v5 -; GFX9-NEXT: v_mul_hi_u32 v3, v0, v4 -; GFX9-NEXT: v_mul_lo_u32 v5, v0, v2 -; 
GFX9-NEXT: v_mul_hi_u32 v7, v0, v2 -; GFX9-NEXT: v_mul_hi_u32 v6, v1, v4 -; GFX9-NEXT: v_mul_lo_u32 v4, v1, v4 -; GFX9-NEXT: v_mul_hi_u32 v8, v1, v2 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 -; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v7, vcc -; GFX9-NEXT: v_mul_lo_u32 v2, v1, v2 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v5, v6, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v8, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v4, vcc -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc -; GFX9-NEXT: v_mul_lo_u32 v2, s2, v1 -; GFX9-NEXT: v_mul_hi_u32 v3, s2, v0 -; GFX9-NEXT: v_mul_lo_u32 v4, s3, v0 -; GFX9-NEXT: v_mul_lo_u32 v5, s2, v0 -; GFX9-NEXT: s_add_u32 s2, s4, s14 -; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v4 -; GFX9-NEXT: v_mul_lo_u32 v6, v0, v2 -; GFX9-NEXT: v_mul_hi_u32 v7, v0, v5 -; GFX9-NEXT: v_mul_hi_u32 v8, v0, v2 -; GFX9-NEXT: v_mul_hi_u32 v4, v1, v5 -; GFX9-NEXT: v_mul_lo_u32 v5, v1, v5 -; GFX9-NEXT: v_mul_hi_u32 v3, v1, v2 -; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v7, v6 -; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v8, vcc -; GFX9-NEXT: v_mul_lo_u32 v2, v1, v2 -; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v6, v5 -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v7, v4, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v4, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 -; GFX9-NEXT: s_addc_u32 s3, s5, s14 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc -; GFX9-NEXT: s_xor_b64 s[4:5], s[2:3], s[14:15] -; GFX9-NEXT: v_mul_lo_u32 v2, s4, v1 -; GFX9-NEXT: v_mul_hi_u32 v3, s4, v0 -; GFX9-NEXT: v_mul_hi_u32 v4, s4, v1 -; GFX9-NEXT: v_mul_hi_u32 v5, s5, v1 -; GFX9-NEXT: v_mul_lo_u32 v1, s5, v1 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v4, vcc -; 
GFX9-NEXT: v_mul_lo_u32 v4, s5, v0 -; GFX9-NEXT: v_mul_hi_u32 v0, s5, v0 -; GFX9-NEXT: v_mov_b32_e32 v6, s9 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, v3, v0, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v5, vcc -; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v0, v1 -; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v2, vcc -; GFX9-NEXT: v_mul_lo_u32 v3, s8, v2 -; GFX9-NEXT: v_mul_hi_u32 v4, s8, v1 -; GFX9-NEXT: v_mul_lo_u32 v5, s9, v1 -; GFX9-NEXT: s_xor_b64 s[12:13], s[14:15], s[12:13] -; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: v_add_u32_e32 v3, v4, v3 -; GFX9-NEXT: v_mul_lo_u32 v4, s8, v1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v5 -; GFX9-NEXT: v_sub_u32_e32 v5, s5, v3 -; GFX9-NEXT: v_sub_co_u32_e32 v4, vcc, s4, v4 -; GFX9-NEXT: v_subb_co_u32_e64 v5, s[0:1], v5, v6, vcc -; GFX9-NEXT: v_subrev_co_u32_e64 v6, s[0:1], s8, v4 -; GFX9-NEXT: v_subbrev_co_u32_e64 v5, s[0:1], 0, v5, s[0:1] -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v5 -; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v6 -; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v5 -; GFX9-NEXT: v_cndmask_b32_e64 v5, v7, v6, s[0:1] -; GFX9-NEXT: v_add_co_u32_e64 v6, s[0:1], 2, v1 -; GFX9-NEXT: v_addc_co_u32_e64 v7, s[0:1], 0, v2, s[0:1] -; GFX9-NEXT: v_add_co_u32_e64 v8, s[0:1], 1, v1 -; GFX9-NEXT: v_addc_co_u32_e64 v9, s[0:1], 0, v2, s[0:1] +; GFX9-NEXT: v_readfirstlane_b32 s14, v1 +; GFX9-NEXT: v_readfirstlane_b32 s15, v0 +; GFX9-NEXT: s_mul_i32 s16, s0, s14 +; GFX9-NEXT: s_mul_hi_u32 s18, s0, s15 +; GFX9-NEXT: s_mul_i32 s17, s1, s15 +; GFX9-NEXT: s_add_i32 s16, s18, s16 +; GFX9-NEXT: s_add_i32 s16, s16, s17 +; GFX9-NEXT: s_mul_i32 s19, s0, s15 +; GFX9-NEXT: s_mul_hi_u32 s17, s15, s16 +; GFX9-NEXT: s_mul_i32 s18, s15, s16 +; GFX9-NEXT: s_mul_hi_u32 s15, s15, s19 +; GFX9-NEXT: s_add_u32 s15, s15, s18 +; GFX9-NEXT: s_addc_u32 s17, 0, s17 +; 
GFX9-NEXT: s_mul_hi_u32 s20, s14, s19 +; GFX9-NEXT: s_mul_i32 s19, s14, s19 +; GFX9-NEXT: s_add_u32 s15, s15, s19 +; GFX9-NEXT: s_mul_hi_u32 s18, s14, s16 +; GFX9-NEXT: s_addc_u32 s15, s17, s20 +; GFX9-NEXT: s_addc_u32 s17, s18, 0 +; GFX9-NEXT: s_mul_i32 s16, s14, s16 +; GFX9-NEXT: s_add_u32 s15, s15, s16 +; GFX9-NEXT: s_addc_u32 s16, 0, s17 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s15, v0 +; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX9-NEXT: s_addc_u32 s14, s14, s16 +; GFX9-NEXT: v_readfirstlane_b32 s16, v0 +; GFX9-NEXT: s_mul_i32 s15, s0, s14 +; GFX9-NEXT: s_mul_hi_u32 s17, s0, s16 +; GFX9-NEXT: s_add_i32 s15, s17, s15 +; GFX9-NEXT: s_mul_i32 s1, s1, s16 +; GFX9-NEXT: s_add_i32 s15, s15, s1 +; GFX9-NEXT: s_mul_i32 s0, s0, s16 +; GFX9-NEXT: s_mul_hi_u32 s17, s14, s0 +; GFX9-NEXT: s_mul_i32 s18, s14, s0 +; GFX9-NEXT: s_mul_i32 s20, s16, s15 +; GFX9-NEXT: s_mul_hi_u32 s0, s16, s0 +; GFX9-NEXT: s_mul_hi_u32 s19, s16, s15 +; GFX9-NEXT: s_add_u32 s0, s0, s20 +; GFX9-NEXT: s_addc_u32 s16, 0, s19 +; GFX9-NEXT: s_add_u32 s0, s0, s18 +; GFX9-NEXT: s_mul_hi_u32 s1, s14, s15 +; GFX9-NEXT: s_addc_u32 s0, s16, s17 +; GFX9-NEXT: s_addc_u32 s1, s1, 0 +; GFX9-NEXT: s_mul_i32 s15, s14, s15 +; GFX9-NEXT: s_add_u32 s0, s0, s15 +; GFX9-NEXT: s_addc_u32 s1, 0, s1 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 +; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX9-NEXT: s_addc_u32 s16, s14, s1 +; GFX9-NEXT: s_ashr_i32 s14, s5, 31 +; GFX9-NEXT: s_add_u32 s0, s4, s14 +; GFX9-NEXT: s_mov_b32 s15, s14 +; GFX9-NEXT: s_addc_u32 s1, s5, s14 +; GFX9-NEXT: s_xor_b64 s[4:5], s[0:1], s[14:15] +; GFX9-NEXT: v_readfirstlane_b32 s17, v0 +; GFX9-NEXT: s_mul_i32 s1, s4, s16 +; GFX9-NEXT: s_mul_hi_u32 s18, s4, s17 +; GFX9-NEXT: s_mul_hi_u32 s0, s4, s16 +; GFX9-NEXT: s_add_u32 s1, s18, s1 +; GFX9-NEXT: s_addc_u32 s0, 0, s0 +; GFX9-NEXT: s_mul_hi_u32 s19, s5, s17 +; GFX9-NEXT: s_mul_i32 s17, s5, s17 +; GFX9-NEXT: s_add_u32 s1, s1, s17 +; GFX9-NEXT: s_mul_hi_u32 s18, s5, s16 +; GFX9-NEXT: s_addc_u32 s0, s0, s19 +; GFX9-NEXT: 
s_addc_u32 s1, s18, 0 +; GFX9-NEXT: s_mul_i32 s16, s5, s16 +; GFX9-NEXT: s_add_u32 s16, s0, s16 +; GFX9-NEXT: s_addc_u32 s17, 0, s1 +; GFX9-NEXT: s_mul_i32 s0, s12, s17 +; GFX9-NEXT: s_mul_hi_u32 s1, s12, s16 +; GFX9-NEXT: s_add_i32 s0, s1, s0 +; GFX9-NEXT: s_mul_i32 s1, s13, s16 +; GFX9-NEXT: s_add_i32 s18, s0, s1 +; GFX9-NEXT: s_mul_i32 s1, s12, s16 +; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: s_sub_i32 s0, s5, s18 +; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s4, v0 +; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX9-NEXT: s_subb_u32 s4, s0, s13 +; GFX9-NEXT: v_subrev_co_u32_e64 v1, s[0:1], s12, v0 +; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX9-NEXT: s_subb_u32 s4, s4, 0 +; GFX9-NEXT: s_cmp_ge_u32 s4, s13 +; GFX9-NEXT: s_cselect_b32 s19, -1, 0 +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s12, v1 +; GFX9-NEXT: s_cmp_eq_u32 s4, s13 +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v2, s19 +; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GFX9-NEXT: s_add_u32 s4, s16, 2 +; GFX9-NEXT: v_cndmask_b32_e64 v1, v2, v1, s[0:1] +; GFX9-NEXT: s_addc_u32 s0, s17, 0 +; GFX9-NEXT: s_add_u32 s19, s16, 1 +; GFX9-NEXT: s_addc_u32 s1, s17, 0 +; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX9-NEXT: s_subb_u32 s5, s5, s18 +; GFX9-NEXT: s_cmp_ge_u32 s5, s13 +; GFX9-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-NEXT: v_mov_b32_e32 v3, s0 +; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v1 +; GFX9-NEXT: s_cselect_b32 s18, -1, 0 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s12, v0 +; GFX9-NEXT: s_cmp_eq_u32 s5, s13 +; GFX9-NEXT: v_cndmask_b32_e64 v1, v2, v3, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GFX9-NEXT: v_mov_b32_e32 v2, s17 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; GFX9-NEXT: v_mov_b32_e32 v1, s19 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[0:1] +; GFX9-NEXT: s_xor_b64 s[0:1], s[14:15], 
s[8:9] ; GFX9-NEXT: s_ashr_i32 s4, s11, 31 -; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v5 -; GFX9-NEXT: s_add_u32 s10, s10, s4 -; GFX9-NEXT: v_cndmask_b32_e64 v5, v9, v7, s[0:1] -; GFX9-NEXT: v_mov_b32_e32 v7, s5 +; GFX9-NEXT: s_add_u32 s8, s10, s4 ; GFX9-NEXT: s_mov_b32 s5, s4 -; GFX9-NEXT: s_addc_u32 s11, s11, s4 -; GFX9-NEXT: s_xor_b64 s[10:11], s[10:11], s[4:5] -; GFX9-NEXT: v_cvt_f32_u32_e32 v9, s10 -; GFX9-NEXT: v_cvt_f32_u32_e32 v10, s11 -; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v7, v3, vcc -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s9, v3 -; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s8, v4 -; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, -1, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, s9, v3 -; GFX9-NEXT: v_mac_f32_e32 v9, 0x4f800000, v10 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v4, vcc -; GFX9-NEXT: v_rcp_f32_e32 v4, v9 -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v3, v8, v6, s[0:1] -; GFX9-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 -; GFX9-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4 -; GFX9-NEXT: v_trunc_f32_e32 v5, v5 -; GFX9-NEXT: v_mac_f32_e32 v4, 0xcf800000, v5 -; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v4 -; GFX9-NEXT: v_cvt_u32_f32_e32 v5, v5 -; GFX9-NEXT: s_sub_u32 s0, 0, s10 -; GFX9-NEXT: s_subb_u32 s1, 0, s11 -; GFX9-NEXT: v_mul_hi_u32 v6, s0, v4 -; GFX9-NEXT: v_mul_lo_u32 v7, s0, v5 -; GFX9-NEXT: v_mul_lo_u32 v8, s1, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GFX9-NEXT: v_mul_lo_u32 v3, s0, v4 -; GFX9-NEXT: v_add_u32_e32 v6, v6, v7 -; GFX9-NEXT: v_add_u32_e32 v6, v6, v8 -; GFX9-NEXT: v_mul_lo_u32 v7, v4, v6 -; GFX9-NEXT: v_mul_hi_u32 v8, v4, v3 -; GFX9-NEXT: v_mul_hi_u32 v9, v4, v6 -; GFX9-NEXT: v_mul_hi_u32 v10, v5, v6 -; GFX9-NEXT: v_mul_lo_u32 v6, v5, v6 -; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v8, v7 -; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, 0, v9, vcc -; GFX9-NEXT: v_mul_lo_u32 v9, v5, v3 -; GFX9-NEXT: v_mul_hi_u32 v3, v5, v3 -; GFX9-NEXT: s_ashr_i32 s8, s7, 
31 -; GFX9-NEXT: s_mov_b32 s9, s8 -; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v7, v9 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v8, v3, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v10, vcc -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v6 -; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v7, vcc -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v4, v3 -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v5, v6, vcc -; GFX9-NEXT: v_mul_lo_u32 v5, s0, v4 -; GFX9-NEXT: v_mul_hi_u32 v6, s0, v3 -; GFX9-NEXT: v_mul_lo_u32 v7, s1, v3 -; GFX9-NEXT: v_mul_lo_u32 v8, s0, v3 -; GFX9-NEXT: s_add_u32 s0, s6, s8 -; GFX9-NEXT: v_add_u32_e32 v5, v6, v5 -; GFX9-NEXT: v_add_u32_e32 v5, v5, v7 -; GFX9-NEXT: v_mul_lo_u32 v9, v3, v5 -; GFX9-NEXT: v_mul_hi_u32 v10, v3, v8 -; GFX9-NEXT: v_mul_hi_u32 v11, v3, v5 -; GFX9-NEXT: v_mul_hi_u32 v7, v4, v8 -; GFX9-NEXT: v_mul_lo_u32 v8, v4, v8 -; GFX9-NEXT: v_mul_hi_u32 v6, v4, v5 -; GFX9-NEXT: v_add_co_u32_e32 v9, vcc, v10, v9 -; GFX9-NEXT: v_addc_co_u32_e32 v10, vcc, 0, v11, vcc -; GFX9-NEXT: v_mul_lo_u32 v5, v4, v5 -; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v9, v8 -; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, v10, v7, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v6, vcc -; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v7, v5 -; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v6, vcc -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 -; GFX9-NEXT: s_addc_u32 s1, s7, s8 -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v6, vcc -; GFX9-NEXT: s_xor_b64 s[6:7], s[0:1], s[8:9] -; GFX9-NEXT: v_mul_lo_u32 v5, s6, v4 -; GFX9-NEXT: v_mul_hi_u32 v6, s6, v3 -; GFX9-NEXT: v_mul_hi_u32 v8, s6, v4 -; GFX9-NEXT: v_mul_hi_u32 v9, s7, v4 -; GFX9-NEXT: v_mul_lo_u32 v4, s7, v4 -; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v6, v5 -; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v8, vcc -; GFX9-NEXT: v_mul_lo_u32 v8, s7, v3 -; GFX9-NEXT: v_mul_hi_u32 v3, s7, v3 -; GFX9-NEXT: v_xor_b32_e32 v1, s12, v1 -; GFX9-NEXT: v_xor_b32_e32 v2, s13, v2 -; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v5, v8 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v6, v3, 
vcc -; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v9, vcc -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v5, vcc -; GFX9-NEXT: v_mul_lo_u32 v5, s10, v4 -; GFX9-NEXT: v_mul_hi_u32 v6, s10, v3 -; GFX9-NEXT: v_mul_lo_u32 v8, s11, v3 -; GFX9-NEXT: v_mov_b32_e32 v7, s13 -; GFX9-NEXT: v_subrev_co_u32_e32 v1, vcc, s12, v1 -; GFX9-NEXT: v_add_u32_e32 v5, v6, v5 -; GFX9-NEXT: v_mul_lo_u32 v6, s10, v3 -; GFX9-NEXT: v_subb_co_u32_e32 v2, vcc, v2, v7, vcc -; GFX9-NEXT: v_add_u32_e32 v5, v5, v8 -; GFX9-NEXT: v_sub_u32_e32 v7, s7, v5 -; GFX9-NEXT: v_mov_b32_e32 v8, s11 -; GFX9-NEXT: v_sub_co_u32_e32 v6, vcc, s6, v6 -; GFX9-NEXT: v_subb_co_u32_e64 v7, s[0:1], v7, v8, vcc -; GFX9-NEXT: v_subrev_co_u32_e64 v8, s[0:1], s10, v6 -; GFX9-NEXT: v_subbrev_co_u32_e64 v7, s[0:1], 0, v7, s[0:1] -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s11, v7 -; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s10, v8 -; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s11, v7 -; GFX9-NEXT: v_cndmask_b32_e64 v7, v9, v8, s[0:1] -; GFX9-NEXT: v_add_co_u32_e64 v8, s[0:1], 2, v3 -; GFX9-NEXT: v_addc_co_u32_e64 v9, s[0:1], 0, v4, s[0:1] -; GFX9-NEXT: v_add_co_u32_e64 v10, s[0:1], 1, v3 -; GFX9-NEXT: v_addc_co_u32_e64 v11, s[0:1], 0, v4, s[0:1] -; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v7 -; GFX9-NEXT: v_cndmask_b32_e64 v7, v11, v9, s[0:1] -; GFX9-NEXT: v_mov_b32_e32 v9, s7 -; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v9, v5, vcc -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s11, v5 -; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, -1, vcc -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s10, v6 -; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, s11, v5 -; GFX9-NEXT: v_cndmask_b32_e32 v5, v9, v6, vcc -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 -; GFX9-NEXT: v_cndmask_b32_e64 v5, v10, v8, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc -; GFX9-NEXT: s_xor_b64 s[0:1], s[8:9], s[4:5] -; GFX9-NEXT: 
v_cndmask_b32_e32 v4, v4, v7, vcc -; GFX9-NEXT: v_xor_b32_e32 v3, s0, v3 -; GFX9-NEXT: v_xor_b32_e32 v4, s1, v4 +; GFX9-NEXT: s_addc_u32 s9, s11, s4 +; GFX9-NEXT: v_mov_b32_e32 v2, s16 +; GFX9-NEXT: s_xor_b64 s[8:9], s[8:9], s[4:5] +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc +; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s8 +; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s9 +; GFX9-NEXT: v_xor_b32_e32 v1, s0, v1 +; GFX9-NEXT: v_xor_b32_e32 v5, s1, v0 +; GFX9-NEXT: v_subrev_co_u32_e32 v0, vcc, s0, v1 +; GFX9-NEXT: v_mac_f32_e32 v2, 0x4f800000, v3 +; GFX9-NEXT: v_rcp_f32_e32 v2, v2 +; GFX9-NEXT: s_sub_u32 s0, 0, s8 +; GFX9-NEXT: v_mov_b32_e32 v6, s1 +; GFX9-NEXT: s_subb_u32 s1, 0, s9 +; GFX9-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2 +; GFX9-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2 +; GFX9-NEXT: v_trunc_f32_e32 v3, v3 +; GFX9-NEXT: v_mac_f32_e32 v2, 0xcf800000, v3 +; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2 +; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v3 +; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v5, v6, vcc +; GFX9-NEXT: v_readfirstlane_b32 s10, v2 +; GFX9-NEXT: v_readfirstlane_b32 s13, v3 +; GFX9-NEXT: s_mul_hi_u32 s12, s0, s10 +; GFX9-NEXT: s_mul_i32 s14, s0, s13 +; GFX9-NEXT: s_mul_i32 s11, s1, s10 +; GFX9-NEXT: s_add_i32 s12, s12, s14 +; GFX9-NEXT: s_add_i32 s12, s12, s11 +; GFX9-NEXT: s_mul_i32 s15, s0, s10 +; GFX9-NEXT: s_mul_hi_u32 s11, s10, s12 +; GFX9-NEXT: s_mul_i32 s14, s10, s12 +; GFX9-NEXT: s_mul_hi_u32 s10, s10, s15 +; GFX9-NEXT: s_add_u32 s10, s10, s14 +; GFX9-NEXT: s_addc_u32 s11, 0, s11 +; GFX9-NEXT: s_mul_hi_u32 s16, s13, s15 +; GFX9-NEXT: s_mul_i32 s15, s13, s15 +; GFX9-NEXT: s_add_u32 s10, s10, s15 +; GFX9-NEXT: s_mul_hi_u32 s14, s13, s12 +; GFX9-NEXT: s_addc_u32 s10, s11, s16 +; GFX9-NEXT: s_addc_u32 s11, s14, 0 +; GFX9-NEXT: s_mul_i32 s12, s13, s12 +; GFX9-NEXT: s_add_u32 s10, s10, s12 +; GFX9-NEXT: s_addc_u32 s11, 0, s11 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s10, v2 +; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX9-NEXT: s_addc_u32 s10, s13, s11 +; GFX9-NEXT: v_readfirstlane_b32 s12, v2 +; 
GFX9-NEXT: s_mul_i32 s11, s0, s10 +; GFX9-NEXT: s_mul_hi_u32 s13, s0, s12 +; GFX9-NEXT: s_add_i32 s11, s13, s11 +; GFX9-NEXT: s_mul_i32 s1, s1, s12 +; GFX9-NEXT: s_add_i32 s11, s11, s1 +; GFX9-NEXT: s_mul_i32 s0, s0, s12 +; GFX9-NEXT: s_mul_hi_u32 s13, s10, s0 +; GFX9-NEXT: s_mul_i32 s14, s10, s0 +; GFX9-NEXT: s_mul_i32 s16, s12, s11 +; GFX9-NEXT: s_mul_hi_u32 s0, s12, s0 +; GFX9-NEXT: s_mul_hi_u32 s15, s12, s11 +; GFX9-NEXT: s_add_u32 s0, s0, s16 +; GFX9-NEXT: s_addc_u32 s12, 0, s15 +; GFX9-NEXT: s_add_u32 s0, s0, s14 +; GFX9-NEXT: s_mul_hi_u32 s1, s10, s11 +; GFX9-NEXT: s_addc_u32 s0, s12, s13 +; GFX9-NEXT: s_addc_u32 s1, s1, 0 +; GFX9-NEXT: s_mul_i32 s11, s10, s11 +; GFX9-NEXT: s_add_u32 s0, s0, s11 +; GFX9-NEXT: s_addc_u32 s1, 0, s1 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 +; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX9-NEXT: s_addc_u32 s12, s10, s1 +; GFX9-NEXT: s_ashr_i32 s10, s7, 31 +; GFX9-NEXT: s_add_u32 s0, s6, s10 +; GFX9-NEXT: s_mov_b32 s11, s10 +; GFX9-NEXT: s_addc_u32 s1, s7, s10 +; GFX9-NEXT: s_xor_b64 s[6:7], s[0:1], s[10:11] +; GFX9-NEXT: v_readfirstlane_b32 s13, v2 +; GFX9-NEXT: s_mul_i32 s1, s6, s12 +; GFX9-NEXT: s_mul_hi_u32 s14, s6, s13 +; GFX9-NEXT: s_mul_hi_u32 s0, s6, s12 +; GFX9-NEXT: s_add_u32 s1, s14, s1 +; GFX9-NEXT: s_addc_u32 s0, 0, s0 +; GFX9-NEXT: s_mul_hi_u32 s15, s7, s13 +; GFX9-NEXT: s_mul_i32 s13, s7, s13 +; GFX9-NEXT: s_add_u32 s1, s1, s13 +; GFX9-NEXT: s_mul_hi_u32 s14, s7, s12 +; GFX9-NEXT: s_addc_u32 s0, s0, s15 +; GFX9-NEXT: s_addc_u32 s1, s14, 0 +; GFX9-NEXT: s_mul_i32 s12, s7, s12 +; GFX9-NEXT: s_add_u32 s12, s0, s12 +; GFX9-NEXT: s_addc_u32 s13, 0, s1 +; GFX9-NEXT: s_mul_i32 s0, s8, s13 +; GFX9-NEXT: s_mul_hi_u32 s1, s8, s12 +; GFX9-NEXT: s_add_i32 s0, s1, s0 +; GFX9-NEXT: s_mul_i32 s1, s9, s12 +; GFX9-NEXT: s_add_i32 s14, s0, s1 +; GFX9-NEXT: s_mul_i32 s1, s8, s12 +; GFX9-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-NEXT: s_sub_i32 s0, s7, s14 +; GFX9-NEXT: v_sub_co_u32_e32 v2, vcc, s6, v2 +; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 +; 
GFX9-NEXT: s_subb_u32 s6, s0, s9 +; GFX9-NEXT: v_subrev_co_u32_e64 v3, s[0:1], s8, v2 +; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX9-NEXT: s_subb_u32 s6, s6, 0 +; GFX9-NEXT: s_cmp_ge_u32 s6, s9 +; GFX9-NEXT: s_cselect_b32 s15, -1, 0 +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v3 +; GFX9-NEXT: s_cmp_eq_u32 s6, s9 +; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v5, s15 +; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GFX9-NEXT: s_add_u32 s6, s12, 2 +; GFX9-NEXT: v_cndmask_b32_e64 v3, v5, v3, s[0:1] +; GFX9-NEXT: s_addc_u32 s0, s13, 0 +; GFX9-NEXT: s_add_u32 s15, s12, 1 +; GFX9-NEXT: s_addc_u32 s1, s13, 0 +; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX9-NEXT: s_subb_u32 s7, s7, s14 +; GFX9-NEXT: s_cmp_ge_u32 s7, s9 ; GFX9-NEXT: v_mov_b32_e32 v5, s1 -; GFX9-NEXT: v_subrev_co_u32_e32 v3, vcc, s0, v3 -; GFX9-NEXT: v_subb_co_u32_e32 v4, vcc, v4, v5, vcc +; GFX9-NEXT: v_mov_b32_e32 v6, s0 +; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v3 +; GFX9-NEXT: s_cselect_b32 s14, -1, 0 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s8, v2 +; GFX9-NEXT: s_cmp_eq_u32 s7, s9 +; GFX9-NEXT: v_cndmask_b32_e64 v3, v5, v6, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc +; GFX9-NEXT: v_mov_b32_e32 v5, s14 +; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc +; GFX9-NEXT: v_mov_b32_e32 v5, s13 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v5, v3, vcc +; GFX9-NEXT: v_mov_b32_e32 v3, s15 +; GFX9-NEXT: v_mov_b32_e32 v5, s6 +; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v5, s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v5, s12 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; GFX9-NEXT: s_xor_b64 s[0:1], s[10:11], s[4:5] +; GFX9-NEXT: v_xor_b32_e32 v3, s0, v3 +; GFX9-NEXT: v_xor_b32_e32 v5, s1, v2 +; GFX9-NEXT: v_mov_b32_e32 v6, s1 +; GFX9-NEXT: v_subrev_co_u32_e32 v2, vcc, s0, v3 +; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v5, v6, vcc ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_store_dwordx4 v0, v[1:4], s[2:3] +; GFX9-NEXT: 
global_store_dwordx4 v4, v[0:3], s[2:3] ; GFX9-NEXT: s_endpgm %shl.y = shl <2 x i64> , %y %r = sdiv <2 x i64> %x, %shl.y @@ -9559,8 +9666,8 @@ ; GFX6-NEXT: v_mul_lo_u32 v4, v0, s4 ; GFX6-NEXT: s_mov_b32 s9, s8 ; GFX6-NEXT: s_addc_u32 s3, s3, s8 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, v0, v2 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 +; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v0 ; GFX6-NEXT: v_mul_hi_u32 v3, v0, v4 ; GFX6-NEXT: v_mul_lo_u32 v5, v0, v2 ; GFX6-NEXT: v_mul_hi_u32 v6, v0, v2 @@ -9653,113 +9760,127 @@ ; ; GFX9-LABEL: srem_i64_oddk_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v0, 0x4f800000 -; GFX9-NEXT: v_madak_f32 v0, 0, v0, 0x4996c7d8 +; GFX9-NEXT: v_mov_b32_e32 v0, 0x4996c7d8 +; GFX9-NEXT: v_mov_b32_e32 v1, 0x4f800000 +; GFX9-NEXT: v_mac_f32_e32 v0, 0, v1 ; GFX9-NEXT: v_rcp_f32_e32 v0, v0 -; GFX9-NEXT: s_mov_b32 s2, 0xffed2705 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; GFX9-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 ; GFX9-NEXT: v_trunc_f32_e32 v1, v1 ; GFX9-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX9-NEXT: v_mul_lo_u32 v2, v1, s2 -; GFX9-NEXT: v_mul_hi_u32 v3, v0, s2 -; GFX9-NEXT: v_mul_lo_u32 v4, v0, s2 -; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 -; GFX9-NEXT: v_sub_u32_e32 v2, v2, v0 -; GFX9-NEXT: v_mul_hi_u32 v3, v0, v4 -; GFX9-NEXT: v_mul_lo_u32 v6, v0, v2 -; GFX9-NEXT: v_mul_hi_u32 v7, v0, v2 -; GFX9-NEXT: v_mul_lo_u32 v5, v1, v4 -; GFX9-NEXT: v_mul_hi_u32 v4, v1, v4 -; GFX9-NEXT: v_mul_hi_u32 v8, v1, v2 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v6 -; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v7, vcc -; GFX9-NEXT: v_mul_lo_u32 v2, v1, v2 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v6, v4, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v8, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 -; 
GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v4, vcc -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc -; GFX9-NEXT: v_mul_lo_u32 v2, v1, s2 -; GFX9-NEXT: v_mul_hi_u32 v3, v0, s2 -; GFX9-NEXT: v_mul_lo_u32 v4, v0, s2 +; GFX9-NEXT: v_readfirstlane_b32 s0, v1 +; GFX9-NEXT: v_readfirstlane_b32 s1, v0 +; GFX9-NEXT: s_mul_hi_u32 s2, s1, 0xffed2705 +; GFX9-NEXT: s_mul_i32 s3, s0, 0xffed2705 +; GFX9-NEXT: s_add_i32 s2, s2, s3 +; GFX9-NEXT: s_sub_i32 s2, s2, s1 +; GFX9-NEXT: s_mul_i32 s9, s1, 0xffed2705 +; GFX9-NEXT: s_mul_hi_u32 s3, s1, s2 +; GFX9-NEXT: s_mul_i32 s8, s1, s2 +; GFX9-NEXT: s_mul_hi_u32 s1, s1, s9 +; GFX9-NEXT: s_add_u32 s1, s1, s8 +; GFX9-NEXT: s_addc_u32 s3, 0, s3 +; GFX9-NEXT: s_mul_hi_u32 s10, s0, s9 +; GFX9-NEXT: s_mul_i32 s9, s0, s9 +; GFX9-NEXT: s_add_u32 s1, s1, s9 +; GFX9-NEXT: s_mul_hi_u32 s8, s0, s2 +; GFX9-NEXT: s_addc_u32 s1, s3, s10 +; GFX9-NEXT: s_addc_u32 s3, s8, 0 +; GFX9-NEXT: s_mul_i32 s2, s0, s2 +; GFX9-NEXT: s_add_u32 s1, s1, s2 +; GFX9-NEXT: s_addc_u32 s2, 0, s3 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s1, v0 +; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX9-NEXT: s_addc_u32 s0, s0, s2 +; GFX9-NEXT: v_readfirstlane_b32 s2, v0 +; GFX9-NEXT: s_mul_i32 s1, s0, 0xffed2705 +; GFX9-NEXT: s_mul_hi_u32 s3, s2, 0xffed2705 +; GFX9-NEXT: s_add_i32 s3, s3, s1 +; GFX9-NEXT: s_sub_i32 s1, s3, s2 +; GFX9-NEXT: s_mul_i32 s8, s2, 0xffed2705 +; GFX9-NEXT: s_mul_hi_u32 s11, s2, s1 +; GFX9-NEXT: s_mul_i32 s12, s2, s1 +; GFX9-NEXT: s_mul_hi_u32 s2, s2, s8 +; GFX9-NEXT: s_add_u32 s2, s2, s12 +; GFX9-NEXT: s_mul_hi_u32 s9, s0, s8 +; GFX9-NEXT: s_mul_i32 s10, s0, s8 +; GFX9-NEXT: s_addc_u32 s8, 0, s11 +; GFX9-NEXT: s_add_u32 s2, s2, s10 +; GFX9-NEXT: s_mul_hi_u32 s3, s0, s1 +; GFX9-NEXT: s_addc_u32 s2, s8, s9 +; GFX9-NEXT: s_addc_u32 s3, s3, 0 +; GFX9-NEXT: s_mul_i32 s1, s0, s1 +; GFX9-NEXT: s_add_u32 s1, s2, s1 +; GFX9-NEXT: s_addc_u32 s2, 0, s3 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s1, v0 +; GFX9-NEXT: s_cmp_lg_u64 vcc, 
0 +; GFX9-NEXT: s_addc_u32 s8, s0, s2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_ashr_i32 s2, s7, 31 ; GFX9-NEXT: s_add_u32 s0, s6, s2 -; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 -; GFX9-NEXT: v_sub_u32_e32 v2, v2, v0 -; GFX9-NEXT: v_mul_lo_u32 v6, v0, v2 -; GFX9-NEXT: v_mul_hi_u32 v7, v0, v4 -; GFX9-NEXT: v_mul_hi_u32 v8, v0, v2 -; GFX9-NEXT: v_mul_hi_u32 v5, v1, v4 -; GFX9-NEXT: v_mul_lo_u32 v4, v1, v4 -; GFX9-NEXT: v_mul_hi_u32 v3, v1, v2 -; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v7, v6 -; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v8, vcc -; GFX9-NEXT: v_mul_lo_u32 v2, v1, v2 -; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v6, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v7, v5, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v4, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 ; GFX9-NEXT: s_mov_b32 s3, s2 ; GFX9-NEXT: s_addc_u32 s1, s7, s2 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc ; GFX9-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3] -; GFX9-NEXT: v_mul_lo_u32 v2, s0, v1 -; GFX9-NEXT: v_mul_hi_u32 v3, s0, v0 -; GFX9-NEXT: v_mul_hi_u32 v4, s0, v1 -; GFX9-NEXT: v_mul_hi_u32 v5, s1, v1 -; GFX9-NEXT: v_mul_lo_u32 v1, s1, v1 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v4, vcc -; GFX9-NEXT: v_mul_lo_u32 v4, s1, v0 -; GFX9-NEXT: v_mul_hi_u32 v0, s1, v0 -; GFX9-NEXT: s_mov_b32 s3, 0x12d8fb -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, v3, v0, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v5, vcc -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v2, vcc -; GFX9-NEXT: v_mul_lo_u32 v1, v1, s3 -; GFX9-NEXT: v_mul_hi_u32 v2, v0, s3 -; GFX9-NEXT: v_mul_lo_u32 v0, v0, s3 -; GFX9-NEXT: v_mov_b32_e32 v3, 0 -; GFX9-NEXT: v_add_u32_e32 v1, v2, v1 -; GFX9-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-NEXT: v_readfirstlane_b32 s7, v0 +; GFX9-NEXT: s_mul_i32 s6, s0, s8 +; 
GFX9-NEXT: s_mul_hi_u32 s9, s0, s7 +; GFX9-NEXT: s_mul_hi_u32 s3, s0, s8 +; GFX9-NEXT: s_add_u32 s6, s9, s6 +; GFX9-NEXT: s_addc_u32 s3, 0, s3 +; GFX9-NEXT: s_mul_hi_u32 s10, s1, s7 +; GFX9-NEXT: s_mul_i32 s7, s1, s7 +; GFX9-NEXT: s_add_u32 s6, s6, s7 +; GFX9-NEXT: s_mul_hi_u32 s9, s1, s8 +; GFX9-NEXT: s_addc_u32 s3, s3, s10 +; GFX9-NEXT: s_addc_u32 s6, s9, 0 +; GFX9-NEXT: s_mul_i32 s7, s1, s8 +; GFX9-NEXT: s_add_u32 s3, s3, s7 +; GFX9-NEXT: s_addc_u32 s6, 0, s6 +; GFX9-NEXT: s_mul_hi_u32 s8, s3, 0x12d8fb +; GFX9-NEXT: s_mul_i32 s3, s3, 0x12d8fb +; GFX9-NEXT: s_mul_i32 s6, s6, 0x12d8fb +; GFX9-NEXT: v_mov_b32_e32 v0, s3 +; GFX9-NEXT: s_add_i32 s8, s8, s6 ; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s0, v0 -; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v2, v1, vcc -; GFX9-NEXT: v_subrev_co_u32_e32 v2, vcc, s3, v0 -; GFX9-NEXT: v_subbrev_co_u32_e32 v4, vcc, 0, v1, vcc -; GFX9-NEXT: v_subrev_co_u32_e32 v5, vcc, s3, v2 -; GFX9-NEXT: v_subbrev_co_u32_e32 v6, vcc, 0, v4, vcc -; GFX9-NEXT: s_mov_b32 s0, 0x12d8fa -; GFX9-NEXT: v_cmp_lt_u32_e32 vcc, s0, v2 -; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v7, -1, v7, vcc -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 -; GFX9-NEXT: v_cmp_lt_u32_e64 s[0:1], s0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v1 -; GFX9-NEXT: v_cndmask_b32_e64 v6, -1, v6, s[0:1] -; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v6 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v4, s[0:1] +; GFX9-NEXT: s_mov_b32 s7, 0x12d8fb +; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX9-NEXT: s_subb_u32 s3, s1, s8 +; GFX9-NEXT: v_subrev_co_u32_e32 v1, vcc, s7, v0 +; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX9-NEXT: s_subb_u32 s0, s3, 0 +; GFX9-NEXT: v_subrev_co_u32_e32 v3, vcc, s7, v1 +; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX9-NEXT: 
s_subb_u32 s1, s0, 0 +; GFX9-NEXT: s_mov_b32 s6, 0x12d8fa +; GFX9-NEXT: v_cmp_lt_u32_e32 vcc, s6, v1 +; GFX9-NEXT: s_cmp_eq_u32 s0, 0 +; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, -1, vcc +; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 +; GFX9-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GFX9-NEXT: v_mov_b32_e32 v5, s0 +; GFX9-NEXT: v_mov_b32_e32 v6, s1 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; GFX9-NEXT: v_cmp_lt_u32_e64 s[0:1], s6, v0 +; GFX9-NEXT: s_cmp_eq_u32 s3, 0 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[0:1] +; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GFX9-NEXT: v_cndmask_b32_e64 v5, -1, v5, s[0:1] +; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v5 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GFX9-NEXT: v_mov_b32_e32 v6, s3 +; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v4, v6, v4, s[0:1] ; GFX9-NEXT: v_xor_b32_e32 v0, s2, v0 -; GFX9-NEXT: v_xor_b32_e32 v1, s2, v1 -; GFX9-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NEXT: v_xor_b32_e32 v1, s2, v4 +; GFX9-NEXT: v_mov_b32_e32 v3, s2 ; GFX9-NEXT: v_subrev_co_u32_e32 v0, vcc, s2, v0 -; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v2, vcc -; GFX9-NEXT: global_store_dwordx2 v3, v[0:1], s[4:5] +; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v3, vcc +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX9-NEXT: s_endpgm %r = srem i64 %x, 1235195 store i64 %r, i64 addrspace(1)* %out @@ -9855,8 +9976,8 @@ ; GFX6-NEXT: v_mul_lo_u32 v5, s5, v0 ; GFX6-NEXT: v_mul_lo_u32 v4, s4, v0 ; GFX6-NEXT: s_addc_u32 s3, s3, s10 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v5 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v5, v2 ; GFX6-NEXT: v_mul_hi_u32 v3, v0, v4 ; GFX6-NEXT: v_mul_lo_u32 v5, v0, v2 ; GFX6-NEXT: v_mul_hi_u32 v6, v0, v2 @@ -9968,123 +10089,140 @@ ; GFX9-NEXT: s_xor_b64 s[8:9], s[2:3], s[4:5] ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s8 ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s9 -; GFX9-NEXT: 
s_sub_u32 s2, 0, s8 -; GFX9-NEXT: s_subb_u32 s3, 0, s9 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_sub_u32 s0, 0, s8 +; GFX9-NEXT: s_subb_u32 s1, 0, s9 ; GFX9-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 -; GFX9-NEXT: v_rcp_f32_e32 v0, v0 +; GFX9-NEXT: v_rcp_f32_e32 v1, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_mul_f32_e32 v1, 0x5f7ffffc, v1 +; GFX9-NEXT: v_mul_f32_e32 v2, 0x2f800000, v1 +; GFX9-NEXT: v_trunc_f32_e32 v2, v2 +; GFX9-NEXT: v_mac_f32_e32 v1, 0xcf800000, v2 +; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2 +; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 +; GFX9-NEXT: v_readfirstlane_b32 s2, v2 +; GFX9-NEXT: v_readfirstlane_b32 s3, v1 +; GFX9-NEXT: s_mul_i32 s10, s0, s2 +; GFX9-NEXT: s_mul_hi_u32 s12, s0, s3 +; GFX9-NEXT: s_mul_i32 s11, s1, s3 +; GFX9-NEXT: s_add_i32 s10, s12, s10 +; GFX9-NEXT: s_add_i32 s10, s10, s11 +; GFX9-NEXT: s_mul_i32 s13, s0, s3 +; GFX9-NEXT: s_mul_hi_u32 s11, s3, s10 +; GFX9-NEXT: s_mul_i32 s12, s3, s10 +; GFX9-NEXT: s_mul_hi_u32 s3, s3, s13 +; GFX9-NEXT: s_add_u32 s3, s3, s12 +; GFX9-NEXT: s_addc_u32 s11, 0, s11 +; GFX9-NEXT: s_mul_hi_u32 s14, s2, s13 +; GFX9-NEXT: s_mul_i32 s13, s2, s13 +; GFX9-NEXT: s_add_u32 s3, s3, s13 +; GFX9-NEXT: s_mul_hi_u32 s12, s2, s10 +; GFX9-NEXT: s_addc_u32 s3, s11, s14 +; GFX9-NEXT: s_addc_u32 s11, s12, 0 +; GFX9-NEXT: s_mul_i32 s10, s2, s10 +; GFX9-NEXT: s_add_u32 s3, s3, s10 +; GFX9-NEXT: s_addc_u32 s10, 0, s11 +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, s3, v1 +; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX9-NEXT: s_addc_u32 s2, s2, s10 +; GFX9-NEXT: v_readfirstlane_b32 s10, v1 +; GFX9-NEXT: s_mul_i32 s3, s0, s2 +; GFX9-NEXT: s_mul_hi_u32 s11, s0, s10 +; GFX9-NEXT: s_add_i32 s3, s11, s3 +; GFX9-NEXT: s_mul_i32 s1, s1, s10 +; GFX9-NEXT: s_add_i32 s3, s3, s1 +; GFX9-NEXT: s_mul_i32 s0, s0, s10 +; GFX9-NEXT: s_mul_hi_u32 s11, s2, s0 +; GFX9-NEXT: s_mul_i32 s12, s2, s0 +; GFX9-NEXT: s_mul_i32 s14, s10, s3 +; GFX9-NEXT: s_mul_hi_u32 s0, s10, s0 +; GFX9-NEXT: s_mul_hi_u32 s13, s10, s3 +; GFX9-NEXT: 
s_add_u32 s0, s0, s14 +; GFX9-NEXT: s_addc_u32 s10, 0, s13 +; GFX9-NEXT: s_add_u32 s0, s0, s12 +; GFX9-NEXT: s_mul_hi_u32 s1, s2, s3 +; GFX9-NEXT: s_addc_u32 s0, s10, s11 +; GFX9-NEXT: s_addc_u32 s1, s1, 0 +; GFX9-NEXT: s_mul_i32 s3, s2, s3 +; GFX9-NEXT: s_add_u32 s0, s0, s3 +; GFX9-NEXT: s_addc_u32 s1, 0, s1 +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, s0, v1 +; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX9-NEXT: s_addc_u32 s2, s2, s1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_ashr_i32 s10, s7, 31 -; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 -; GFX9-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 -; GFX9-NEXT: v_trunc_f32_e32 v1, v1 -; GFX9-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 -; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: s_add_u32 s0, s6, s10 ; GFX9-NEXT: s_mov_b32 s11, s10 -; GFX9-NEXT: v_mul_lo_u32 v2, s2, v1 -; GFX9-NEXT: v_mul_hi_u32 v3, s2, v0 -; GFX9-NEXT: v_mul_lo_u32 v5, s3, v0 -; GFX9-NEXT: v_mul_lo_u32 v4, s2, v0 ; GFX9-NEXT: s_addc_u32 s1, s7, s10 -; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v5 -; GFX9-NEXT: v_mul_hi_u32 v3, v0, v4 -; GFX9-NEXT: v_mul_lo_u32 v5, v0, v2 -; GFX9-NEXT: v_mul_hi_u32 v7, v0, v2 -; GFX9-NEXT: v_mul_hi_u32 v6, v1, v4 -; GFX9-NEXT: v_mul_lo_u32 v4, v1, v4 -; GFX9-NEXT: v_mul_hi_u32 v8, v1, v2 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 -; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v7, vcc -; GFX9-NEXT: v_mul_lo_u32 v2, v1, v2 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v5, v6, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v8, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v4, vcc -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc -; GFX9-NEXT: v_mul_lo_u32 v2, s2, v1 -; GFX9-NEXT: v_mul_hi_u32 v3, s2, v0 -; GFX9-NEXT: v_mul_lo_u32 v4, s3, v0 -; GFX9-NEXT: v_mul_lo_u32 v5, s2, v0 ; GFX9-NEXT: s_xor_b64 s[6:7], s[0:1], 
s[10:11] -; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v4 -; GFX9-NEXT: v_mul_lo_u32 v6, v0, v2 -; GFX9-NEXT: v_mul_hi_u32 v7, v0, v5 -; GFX9-NEXT: v_mul_hi_u32 v8, v0, v2 -; GFX9-NEXT: v_mul_hi_u32 v4, v1, v5 -; GFX9-NEXT: v_mul_lo_u32 v5, v1, v5 -; GFX9-NEXT: v_mul_hi_u32 v3, v1, v2 -; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v7, v6 -; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v8, vcc -; GFX9-NEXT: v_mul_lo_u32 v2, v1, v2 -; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v6, v5 -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v7, v4, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v4, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc -; GFX9-NEXT: v_mul_lo_u32 v2, s6, v1 -; GFX9-NEXT: v_mul_hi_u32 v3, s6, v0 -; GFX9-NEXT: v_mul_hi_u32 v4, s6, v1 -; GFX9-NEXT: v_mul_hi_u32 v5, s7, v1 -; GFX9-NEXT: v_mul_lo_u32 v1, s7, v1 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v4, vcc -; GFX9-NEXT: v_mul_lo_u32 v4, s7, v0 -; GFX9-NEXT: v_mul_hi_u32 v0, s7, v0 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, v3, v0, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v5, vcc -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v2, vcc -; GFX9-NEXT: v_mul_lo_u32 v1, s8, v1 -; GFX9-NEXT: v_mul_hi_u32 v2, s8, v0 -; GFX9-NEXT: v_mul_lo_u32 v3, s9, v0 -; GFX9-NEXT: v_mul_lo_u32 v0, s8, v0 -; GFX9-NEXT: v_mov_b32_e32 v4, 0 -; GFX9-NEXT: v_add_u32_e32 v1, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v1, v1, v3 -; GFX9-NEXT: v_sub_u32_e32 v2, s7, v1 -; GFX9-NEXT: v_mov_b32_e32 v3, s9 -; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s6, v0 -; GFX9-NEXT: v_subb_co_u32_e64 v2, s[0:1], v2, v3, vcc -; GFX9-NEXT: v_subrev_co_u32_e64 v5, s[0:1], s8, v0 -; GFX9-NEXT: v_subbrev_co_u32_e64 v6, s[2:3], 0, v2, s[0:1] -; GFX9-NEXT: 
v_cmp_le_u32_e64 s[2:3], s9, v6 -; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[2:3] -; GFX9-NEXT: v_cmp_le_u32_e64 s[2:3], s8, v5 -; GFX9-NEXT: v_subb_co_u32_e64 v2, s[0:1], v2, v3, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[2:3] -; GFX9-NEXT: v_cmp_eq_u32_e64 s[2:3], s9, v6 -; GFX9-NEXT: v_subrev_co_u32_e64 v3, s[0:1], s8, v5 -; GFX9-NEXT: v_cndmask_b32_e64 v7, v7, v8, s[2:3] -; GFX9-NEXT: v_subbrev_co_u32_e64 v2, s[0:1], 0, v2, s[0:1] -; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v7 -; GFX9-NEXT: v_cndmask_b32_e64 v2, v6, v2, s[0:1] -; GFX9-NEXT: v_mov_b32_e32 v6, s7 -; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v6, v1, vcc -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s9, v1 -; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s8, v0 -; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, s9, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v6, v6, v7, vcc -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; GFX9-NEXT: v_readfirstlane_b32 s3, v1 +; GFX9-NEXT: s_mul_i32 s1, s6, s2 +; GFX9-NEXT: s_mul_hi_u32 s11, s6, s3 +; GFX9-NEXT: s_mul_hi_u32 s0, s6, s2 +; GFX9-NEXT: s_add_u32 s1, s11, s1 +; GFX9-NEXT: s_addc_u32 s0, 0, s0 +; GFX9-NEXT: s_mul_hi_u32 s12, s7, s3 +; GFX9-NEXT: s_mul_i32 s3, s7, s3 +; GFX9-NEXT: s_add_u32 s1, s1, s3 +; GFX9-NEXT: s_mul_hi_u32 s11, s7, s2 +; GFX9-NEXT: s_addc_u32 s0, s0, s12 +; GFX9-NEXT: s_addc_u32 s1, s11, 0 +; GFX9-NEXT: s_mul_i32 s2, s7, s2 +; GFX9-NEXT: s_add_u32 s0, s0, s2 +; GFX9-NEXT: s_addc_u32 s1, 0, s1 +; GFX9-NEXT: s_mul_i32 s1, s8, s1 +; GFX9-NEXT: s_mul_hi_u32 s2, s8, s0 +; GFX9-NEXT: s_add_i32 s1, s2, s1 +; GFX9-NEXT: s_mul_i32 s2, s9, s0 +; GFX9-NEXT: s_mul_i32 s0, s8, s0 +; GFX9-NEXT: s_add_i32 s11, s1, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: s_sub_i32 s1, s7, s11 +; GFX9-NEXT: v_sub_co_u32_e32 v1, vcc, s6, v1 +; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX9-NEXT: s_subb_u32 s6, s1, s9 +; GFX9-NEXT: v_subrev_co_u32_e64 v2, s[0:1], s8, v1 +; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 +; 
GFX9-NEXT: s_subb_u32 s12, s6, 0 +; GFX9-NEXT: s_cmp_ge_u32 s12, s9 +; GFX9-NEXT: s_cselect_b32 s13, -1, 0 +; GFX9-NEXT: v_cmp_le_u32_e64 s[2:3], s8, v2 +; GFX9-NEXT: s_cmp_eq_u32 s12, s9 +; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v4, s13 +; GFX9-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX9-NEXT: v_cndmask_b32_e64 v3, v4, v3, s[2:3] +; GFX9-NEXT: s_subb_u32 s2, s6, s9 +; GFX9-NEXT: v_subrev_co_u32_e64 v4, s[0:1], s8, v2 +; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX9-NEXT: s_subb_u32 s0, s2, 0 +; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX9-NEXT: s_subb_u32 s2, s7, s11 +; GFX9-NEXT: s_cmp_ge_u32 s2, s9 +; GFX9-NEXT: v_mov_b32_e32 v5, s12 +; GFX9-NEXT: v_mov_b32_e32 v6, s0 +; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v3 +; GFX9-NEXT: s_cselect_b32 s3, -1, 0 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s8, v1 +; GFX9-NEXT: s_cmp_eq_u32 s2, s9 +; GFX9-NEXT: v_cndmask_b32_e64 v3, v5, v6, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc +; GFX9-NEXT: v_mov_b32_e32 v6, s3 +; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 +; GFX9-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 +; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v4, s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v6, s2 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v2, v5, v3, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX9-NEXT: v_xor_b32_e32 v0, s10, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v6, v3, vcc ; GFX9-NEXT: v_xor_b32_e32 v1, s10, v1 -; GFX9-NEXT: v_mov_b32_e32 v2, s10 -; GFX9-NEXT: v_subrev_co_u32_e32 v0, vcc, s10, v0 -; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v2, vcc -; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[4:5] +; GFX9-NEXT: v_xor_b32_e32 v2, s10, v3 +; GFX9-NEXT: v_mov_b32_e32 v3, s10 +; GFX9-NEXT: v_subrev_co_u32_e32 v1, vcc, s10, v1 +; GFX9-NEXT: v_subb_co_u32_e32 v2, vcc, v2, v3, vcc +; GFX9-NEXT: global_store_dwordx2 v0, v[1:2], s[4:5] ; GFX9-NEXT: s_endpgm %shl.y = shl i64 4096, %y 
%r = srem i64 %x, %shl.y @@ -10212,8 +10350,8 @@ ; GFX6-NEXT: v_mul_lo_u32 v5, s3, v0 ; GFX6-NEXT: v_mul_lo_u32 v4, s2, v0 ; GFX6-NEXT: s_mov_b32 s10, -1 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v5 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v5, v2 ; GFX6-NEXT: v_mul_hi_u32 v3, v0, v4 ; GFX6-NEXT: v_mul_lo_u32 v5, v0, v2 ; GFX6-NEXT: v_mul_hi_u32 v7, v0, v2 @@ -10322,9 +10460,9 @@ ; GFX6-NEXT: s_subb_u32 s1, 0, s5 ; GFX6-NEXT: v_mul_lo_u32 v6, s1, v3 ; GFX6-NEXT: s_ashr_i32 s14, s7, 31 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v5 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v5, v2 ; GFX6-NEXT: v_mul_lo_u32 v5, s0, v3 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v6 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v6, v2 ; GFX6-NEXT: v_mul_lo_u32 v6, v3, v2 ; GFX6-NEXT: v_mul_hi_u32 v7, v3, v5 ; GFX6-NEXT: v_mul_hi_u32 v8, v3, v2 @@ -10390,8 +10528,8 @@ ; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s12, v0 ; GFX6-NEXT: v_mul_lo_u32 v2, s4, v2 ; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v1, v6, vcc -; GFX6-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v5 +; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v4 +; GFX6-NEXT: v_add_i32_e32 v3, vcc, v5, v3 ; GFX6-NEXT: v_sub_i32_e32 v4, vcc, s7, v3 ; GFX6-NEXT: v_mov_b32_e32 v5, s5 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s6, v2 @@ -10434,6 +10572,7 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 ; GFX9-NEXT: s_mov_b64 s[2:3], 0x1000 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b64 s[10:11], s[2:3], s10 ; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], s8 @@ -10444,243 +10583,276 @@ ; GFX9-NEXT: s_xor_b64 s[12:13], s[2:3], s[8:9] ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s12 ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s13 -; GFX9-NEXT: s_sub_u32 s2, 0, s12 -; GFX9-NEXT: s_subb_u32 s3, 0, s13 -; GFX9-NEXT: s_ashr_i32 s8, s5, 31 +; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24 +; GFX9-NEXT: s_sub_u32 s0, 0, s12 +; GFX9-NEXT: 
s_subb_u32 s1, 0, s13 ; GFX9-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 ; GFX9-NEXT: v_rcp_f32_e32 v0, v0 -; GFX9-NEXT: s_mov_b32 s9, s8 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; GFX9-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 ; GFX9-NEXT: v_trunc_f32_e32 v1, v1 ; GFX9-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX9-NEXT: v_mul_lo_u32 v2, s2, v1 -; GFX9-NEXT: v_mul_hi_u32 v3, s2, v0 -; GFX9-NEXT: v_mul_lo_u32 v5, s3, v0 -; GFX9-NEXT: v_mul_lo_u32 v4, s2, v0 -; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v5 -; GFX9-NEXT: v_mul_hi_u32 v3, v0, v4 -; GFX9-NEXT: v_mul_lo_u32 v5, v0, v2 -; GFX9-NEXT: v_mul_hi_u32 v7, v0, v2 -; GFX9-NEXT: v_mul_hi_u32 v6, v1, v4 -; GFX9-NEXT: v_mul_lo_u32 v4, v1, v4 -; GFX9-NEXT: v_mul_hi_u32 v8, v1, v2 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 -; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v7, vcc -; GFX9-NEXT: v_mul_lo_u32 v2, v1, v2 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v5, v6, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v8, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v4, vcc -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc -; GFX9-NEXT: v_mul_lo_u32 v2, s2, v1 -; GFX9-NEXT: v_mul_hi_u32 v3, s2, v0 -; GFX9-NEXT: v_mul_lo_u32 v4, s3, v0 -; GFX9-NEXT: v_mul_lo_u32 v5, s2, v0 -; GFX9-NEXT: s_add_u32 s2, s4, s8 -; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v4 -; GFX9-NEXT: v_mul_lo_u32 v6, v0, v2 -; GFX9-NEXT: v_mul_hi_u32 v7, v0, v5 -; GFX9-NEXT: v_mul_hi_u32 v8, v0, v2 -; GFX9-NEXT: v_mul_hi_u32 v4, v1, v5 -; GFX9-NEXT: v_mul_lo_u32 v5, v1, v5 -; GFX9-NEXT: v_mul_hi_u32 v3, v1, v2 -; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v7, v6 -; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v8, vcc -; GFX9-NEXT: v_mul_lo_u32 v2, v1, v2 -; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, 
v6, v5 -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v7, v4, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v4, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 -; GFX9-NEXT: s_addc_u32 s3, s5, s8 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc -; GFX9-NEXT: s_xor_b64 s[14:15], s[2:3], s[8:9] -; GFX9-NEXT: v_mul_lo_u32 v2, s14, v1 -; GFX9-NEXT: v_mul_hi_u32 v3, s14, v0 -; GFX9-NEXT: v_mul_hi_u32 v4, s14, v1 -; GFX9-NEXT: v_mul_hi_u32 v5, s15, v1 -; GFX9-NEXT: v_mul_lo_u32 v1, s15, v1 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v4, vcc -; GFX9-NEXT: v_mul_lo_u32 v4, s15, v0 -; GFX9-NEXT: v_mul_hi_u32 v0, s15, v0 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, v3, v0, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v5, vcc -; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v0, v1 -; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v2, vcc -; GFX9-NEXT: v_mul_lo_u32 v2, s12, v0 -; GFX9-NEXT: v_mul_hi_u32 v3, s12, v1 -; GFX9-NEXT: v_mul_lo_u32 v4, s13, v1 -; GFX9-NEXT: v_mul_lo_u32 v1, s12, v1 -; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v4 -; GFX9-NEXT: v_sub_u32_e32 v3, s15, v2 -; GFX9-NEXT: v_mov_b32_e32 v4, s13 -; GFX9-NEXT: v_sub_co_u32_e32 v1, vcc, s14, v1 -; GFX9-NEXT: v_subb_co_u32_e64 v3, s[0:1], v3, v4, vcc -; GFX9-NEXT: v_subrev_co_u32_e64 v5, s[0:1], s12, v1 -; GFX9-NEXT: v_subbrev_co_u32_e64 v6, s[2:3], 0, v3, s[0:1] -; GFX9-NEXT: v_cmp_le_u32_e64 s[2:3], s13, v6 -; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[2:3] -; GFX9-NEXT: v_cmp_le_u32_e64 s[2:3], s12, v5 -; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[2:3] -; GFX9-NEXT: v_cmp_eq_u32_e64 s[2:3], s13, v6 -; GFX9-NEXT: v_cndmask_b32_e64 v7, v7, v8, s[2:3] -; GFX9-NEXT: s_ashr_i32 s2, s11, 31 -; GFX9-NEXT: v_subb_co_u32_e64 v3, 
s[0:1], v3, v4, s[0:1] -; GFX9-NEXT: s_add_u32 s10, s10, s2 -; GFX9-NEXT: v_subrev_co_u32_e64 v4, s[0:1], s12, v5 -; GFX9-NEXT: s_mov_b32 s3, s2 -; GFX9-NEXT: s_addc_u32 s11, s11, s2 -; GFX9-NEXT: v_subbrev_co_u32_e64 v3, s[0:1], 0, v3, s[0:1] -; GFX9-NEXT: s_xor_b64 s[10:11], s[10:11], s[2:3] -; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v7 -; GFX9-NEXT: v_cvt_f32_u32_e32 v7, s10 -; GFX9-NEXT: v_cvt_f32_u32_e32 v8, s11 -; GFX9-NEXT: v_cndmask_b32_e64 v3, v6, v3, s[0:1] -; GFX9-NEXT: v_mov_b32_e32 v6, s15 -; GFX9-NEXT: v_subb_co_u32_e32 v2, vcc, v6, v2, vcc -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s13, v2 -; GFX9-NEXT: v_mac_f32_e32 v7, 0x4f800000, v8 -; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s12, v1 -; GFX9-NEXT: v_rcp_f32_e32 v7, v7 -; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, -1, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, s13, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v6, v6, v9, vcc -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; GFX9-NEXT: v_readfirstlane_b32 s2, v1 +; GFX9-NEXT: v_readfirstlane_b32 s3, v0 +; GFX9-NEXT: s_mul_i32 s14, s0, s2 +; GFX9-NEXT: s_mul_hi_u32 s16, s0, s3 +; GFX9-NEXT: s_mul_i32 s15, s1, s3 +; GFX9-NEXT: s_add_i32 s14, s16, s14 +; GFX9-NEXT: s_add_i32 s14, s14, s15 +; GFX9-NEXT: s_mul_i32 s17, s0, s3 +; GFX9-NEXT: s_mul_hi_u32 s15, s3, s14 +; GFX9-NEXT: s_mul_i32 s16, s3, s14 +; GFX9-NEXT: s_mul_hi_u32 s3, s3, s17 +; GFX9-NEXT: s_add_u32 s3, s3, s16 +; GFX9-NEXT: s_addc_u32 s15, 0, s15 +; GFX9-NEXT: s_mul_hi_u32 s18, s2, s17 +; GFX9-NEXT: s_mul_i32 s17, s2, s17 +; GFX9-NEXT: s_add_u32 s3, s3, s17 +; GFX9-NEXT: s_mul_hi_u32 s16, s2, s14 +; GFX9-NEXT: s_addc_u32 s3, s15, s18 +; GFX9-NEXT: s_addc_u32 s15, s16, 0 +; GFX9-NEXT: s_mul_i32 s14, s2, s14 +; GFX9-NEXT: s_add_u32 s3, s3, s14 +; GFX9-NEXT: s_addc_u32 s14, 0, s15 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s3, v0 +; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX9-NEXT: s_addc_u32 s2, s2, s14 +; GFX9-NEXT: v_readfirstlane_b32 s14, v0 +; GFX9-NEXT: s_mul_i32 s3, s0, s2 +; GFX9-NEXT: 
s_mul_hi_u32 s15, s0, s14 +; GFX9-NEXT: s_add_i32 s3, s15, s3 +; GFX9-NEXT: s_mul_i32 s1, s1, s14 +; GFX9-NEXT: s_add_i32 s3, s3, s1 +; GFX9-NEXT: s_mul_i32 s0, s0, s14 +; GFX9-NEXT: s_mul_hi_u32 s15, s2, s0 +; GFX9-NEXT: s_mul_i32 s16, s2, s0 +; GFX9-NEXT: s_mul_i32 s18, s14, s3 +; GFX9-NEXT: s_mul_hi_u32 s0, s14, s0 +; GFX9-NEXT: s_mul_hi_u32 s17, s14, s3 +; GFX9-NEXT: s_add_u32 s0, s0, s18 +; GFX9-NEXT: s_addc_u32 s14, 0, s17 +; GFX9-NEXT: s_add_u32 s0, s0, s16 +; GFX9-NEXT: s_mul_hi_u32 s1, s2, s3 +; GFX9-NEXT: s_addc_u32 s0, s14, s15 +; GFX9-NEXT: s_addc_u32 s1, s1, 0 +; GFX9-NEXT: s_mul_i32 s3, s2, s3 +; GFX9-NEXT: s_add_u32 s0, s0, s3 +; GFX9-NEXT: s_addc_u32 s1, 0, s1 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 +; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX9-NEXT: s_addc_u32 s2, s2, s1 +; GFX9-NEXT: s_ashr_i32 s14, s5, 31 +; GFX9-NEXT: s_add_u32 s0, s4, s14 +; GFX9-NEXT: s_mov_b32 s15, s14 +; GFX9-NEXT: s_addc_u32 s1, s5, s14 +; GFX9-NEXT: s_xor_b64 s[4:5], s[0:1], s[14:15] +; GFX9-NEXT: v_readfirstlane_b32 s3, v0 +; GFX9-NEXT: s_mul_i32 s1, s4, s2 +; GFX9-NEXT: s_mul_hi_u32 s15, s4, s3 +; GFX9-NEXT: s_mul_hi_u32 s0, s4, s2 +; GFX9-NEXT: s_add_u32 s1, s15, s1 +; GFX9-NEXT: s_addc_u32 s0, 0, s0 +; GFX9-NEXT: s_mul_hi_u32 s16, s5, s3 +; GFX9-NEXT: s_mul_i32 s3, s5, s3 +; GFX9-NEXT: s_add_u32 s1, s1, s3 +; GFX9-NEXT: s_mul_hi_u32 s15, s5, s2 +; GFX9-NEXT: s_addc_u32 s0, s0, s16 +; GFX9-NEXT: s_addc_u32 s1, s15, 0 +; GFX9-NEXT: s_mul_i32 s2, s5, s2 +; GFX9-NEXT: s_add_u32 s0, s0, s2 +; GFX9-NEXT: s_addc_u32 s1, 0, s1 +; GFX9-NEXT: s_mul_i32 s1, s12, s1 +; GFX9-NEXT: s_mul_hi_u32 s2, s12, s0 +; GFX9-NEXT: s_add_i32 s1, s2, s1 +; GFX9-NEXT: s_mul_i32 s2, s13, s0 +; GFX9-NEXT: s_mul_i32 s0, s12, s0 +; GFX9-NEXT: s_add_i32 s15, s1, s2 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: s_sub_i32 s1, s5, s15 +; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s4, v0 +; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX9-NEXT: s_subb_u32 s4, s1, s13 +; GFX9-NEXT: v_subrev_co_u32_e64 v1, s[0:1], 
s12, v0 +; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX9-NEXT: s_subb_u32 s16, s4, 0 +; GFX9-NEXT: s_cmp_ge_u32 s16, s13 +; GFX9-NEXT: s_cselect_b32 s17, -1, 0 +; GFX9-NEXT: v_cmp_le_u32_e64 s[2:3], s12, v1 +; GFX9-NEXT: s_cmp_eq_u32 s16, s13 +; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v3, s17 +; GFX9-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX9-NEXT: v_cndmask_b32_e64 v2, v3, v2, s[2:3] +; GFX9-NEXT: s_subb_u32 s2, s4, s13 +; GFX9-NEXT: v_subrev_co_u32_e64 v3, s[0:1], s12, v1 +; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX9-NEXT: s_subb_u32 s0, s2, 0 +; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX9-NEXT: s_subb_u32 s2, s5, s15 +; GFX9-NEXT: s_cmp_ge_u32 s2, s13 +; GFX9-NEXT: v_mov_b32_e32 v5, s16 +; GFX9-NEXT: v_mov_b32_e32 v6, s0 +; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v2 +; GFX9-NEXT: s_cselect_b32 s3, -1, 0 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s12, v0 +; GFX9-NEXT: s_cmp_eq_u32 s2, s13 +; GFX9-NEXT: v_cndmask_b32_e64 v2, v5, v6, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc +; GFX9-NEXT: v_mov_b32_e32 v6, s3 +; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 +; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] +; GFX9-NEXT: s_ashr_i32 s0, s11, 31 +; GFX9-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc +; GFX9-NEXT: v_mov_b32_e32 v6, s2 +; GFX9-NEXT: s_add_u32 s2, s10, s0 +; GFX9-NEXT: s_mov_b32 s1, s0 +; GFX9-NEXT: s_addc_u32 s3, s11, s0 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 +; GFX9-NEXT: s_xor_b64 s[4:5], s[2:3], s[0:1] +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s4 +; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s5 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc +; GFX9-NEXT: v_xor_b32_e32 v0, s14, v0 +; GFX9-NEXT: v_xor_b32_e32 v2, s14, v2 +; GFX9-NEXT: v_mac_f32_e32 v1, 0x4f800000, v3 +; GFX9-NEXT: v_rcp_f32_e32 v3, v1 +; GFX9-NEXT: v_mov_b32_e32 v5, s14 +; GFX9-NEXT: v_subrev_co_u32_e32 v0, vcc, s14, v0 +; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v2, v5, vcc +; GFX9-NEXT: v_mul_f32_e32 
v2, 0x5f7ffffc, v3 +; GFX9-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2 +; GFX9-NEXT: v_trunc_f32_e32 v3, v3 +; GFX9-NEXT: v_mac_f32_e32 v2, 0xcf800000, v3 +; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2 +; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v3 +; GFX9-NEXT: s_sub_u32 s0, 0, s4 +; GFX9-NEXT: s_subb_u32 s1, 0, s5 +; GFX9-NEXT: v_readfirstlane_b32 s2, v2 +; GFX9-NEXT: v_readfirstlane_b32 s11, v3 +; GFX9-NEXT: s_mul_hi_u32 s10, s0, s2 +; GFX9-NEXT: s_mul_i32 s12, s0, s11 +; GFX9-NEXT: s_mul_i32 s3, s1, s2 +; GFX9-NEXT: s_add_i32 s10, s10, s12 +; GFX9-NEXT: s_add_i32 s10, s10, s3 +; GFX9-NEXT: s_mul_i32 s13, s0, s2 +; GFX9-NEXT: s_mul_hi_u32 s3, s2, s10 +; GFX9-NEXT: s_mul_i32 s12, s2, s10 +; GFX9-NEXT: s_mul_hi_u32 s2, s2, s13 +; GFX9-NEXT: s_add_u32 s2, s2, s12 +; GFX9-NEXT: s_addc_u32 s3, 0, s3 +; GFX9-NEXT: s_mul_hi_u32 s14, s11, s13 +; GFX9-NEXT: s_mul_i32 s13, s11, s13 +; GFX9-NEXT: s_add_u32 s2, s2, s13 +; GFX9-NEXT: s_mul_hi_u32 s12, s11, s10 +; GFX9-NEXT: s_addc_u32 s2, s3, s14 +; GFX9-NEXT: s_addc_u32 s3, s12, 0 +; GFX9-NEXT: s_mul_i32 s10, s11, s10 +; GFX9-NEXT: s_add_u32 s2, s2, s10 +; GFX9-NEXT: s_addc_u32 s3, 0, s3 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v2 +; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX9-NEXT: s_addc_u32 s2, s11, s3 +; GFX9-NEXT: v_readfirstlane_b32 s10, v2 +; GFX9-NEXT: s_mul_i32 s3, s0, s2 +; GFX9-NEXT: s_mul_hi_u32 s11, s0, s10 +; GFX9-NEXT: s_add_i32 s3, s11, s3 +; GFX9-NEXT: s_mul_i32 s1, s1, s10 +; GFX9-NEXT: s_add_i32 s3, s3, s1 +; GFX9-NEXT: s_mul_i32 s0, s0, s10 +; GFX9-NEXT: s_mul_hi_u32 s11, s2, s0 +; GFX9-NEXT: s_mul_i32 s12, s2, s0 +; GFX9-NEXT: s_mul_i32 s14, s10, s3 +; GFX9-NEXT: s_mul_hi_u32 s0, s10, s0 +; GFX9-NEXT: s_mul_hi_u32 s13, s10, s3 +; GFX9-NEXT: s_add_u32 s0, s0, s14 +; GFX9-NEXT: s_addc_u32 s10, 0, s13 +; GFX9-NEXT: s_add_u32 s0, s0, s12 +; GFX9-NEXT: s_mul_hi_u32 s1, s2, s3 +; GFX9-NEXT: s_addc_u32 s0, s10, s11 +; GFX9-NEXT: s_addc_u32 s1, s1, 0 +; GFX9-NEXT: s_mul_i32 s3, s2, s3 +; GFX9-NEXT: s_add_u32 s0, s0, s3 +; GFX9-NEXT: 
s_addc_u32 s1, 0, s1 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 +; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX9-NEXT: s_addc_u32 s2, s2, s1 +; GFX9-NEXT: s_ashr_i32 s10, s7, 31 +; GFX9-NEXT: s_add_u32 s0, s6, s10 +; GFX9-NEXT: s_mov_b32 s11, s10 +; GFX9-NEXT: s_addc_u32 s1, s7, s10 +; GFX9-NEXT: s_xor_b64 s[6:7], s[0:1], s[10:11] +; GFX9-NEXT: v_readfirstlane_b32 s3, v2 +; GFX9-NEXT: s_mul_i32 s1, s6, s2 +; GFX9-NEXT: s_mul_hi_u32 s11, s6, s3 +; GFX9-NEXT: s_mul_hi_u32 s0, s6, s2 +; GFX9-NEXT: s_add_u32 s1, s11, s1 +; GFX9-NEXT: s_addc_u32 s0, 0, s0 +; GFX9-NEXT: s_mul_hi_u32 s12, s7, s3 +; GFX9-NEXT: s_mul_i32 s3, s7, s3 +; GFX9-NEXT: s_add_u32 s1, s1, s3 +; GFX9-NEXT: s_mul_hi_u32 s11, s7, s2 +; GFX9-NEXT: s_addc_u32 s0, s0, s12 +; GFX9-NEXT: s_addc_u32 s1, s11, 0 +; GFX9-NEXT: s_mul_i32 s2, s7, s2 +; GFX9-NEXT: s_add_u32 s0, s0, s2 +; GFX9-NEXT: s_addc_u32 s1, 0, s1 +; GFX9-NEXT: s_mul_i32 s1, s4, s1 +; GFX9-NEXT: s_mul_hi_u32 s2, s4, s0 +; GFX9-NEXT: s_add_i32 s1, s2, s1 +; GFX9-NEXT: s_mul_i32 s2, s5, s0 +; GFX9-NEXT: s_mul_i32 s0, s4, s0 +; GFX9-NEXT: s_add_i32 s11, s1, s2 +; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: s_sub_i32 s1, s7, s11 +; GFX9-NEXT: v_sub_co_u32_e32 v2, vcc, s6, v2 +; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX9-NEXT: s_subb_u32 s6, s1, s5 +; GFX9-NEXT: v_subrev_co_u32_e64 v3, s[0:1], s4, v2 +; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX9-NEXT: s_subb_u32 s12, s6, 0 +; GFX9-NEXT: s_cmp_ge_u32 s12, s5 +; GFX9-NEXT: s_cselect_b32 s13, -1, 0 +; GFX9-NEXT: v_cmp_le_u32_e64 s[2:3], s4, v3 +; GFX9-NEXT: s_cmp_eq_u32 s12, s5 +; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v6, s13 +; GFX9-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX9-NEXT: v_cndmask_b32_e64 v5, v6, v5, s[2:3] +; GFX9-NEXT: s_subb_u32 s2, s6, s5 +; GFX9-NEXT: v_subrev_co_u32_e64 v6, s[0:1], s4, v3 +; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX9-NEXT: s_subb_u32 s0, s2, 0 +; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX9-NEXT: s_subb_u32 
s2, s7, s11 +; GFX9-NEXT: s_cmp_ge_u32 s2, s5 +; GFX9-NEXT: v_mov_b32_e32 v7, s12 +; GFX9-NEXT: v_mov_b32_e32 v8, s0 +; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v5 +; GFX9-NEXT: s_cselect_b32 s3, -1, 0 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s4, v2 +; GFX9-NEXT: s_cmp_eq_u32 s2, s5 +; GFX9-NEXT: v_cndmask_b32_e64 v5, v7, v8, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc +; GFX9-NEXT: v_mov_b32_e32 v8, s3 +; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 +; GFX9-NEXT: v_cndmask_b32_e32 v7, v8, v7, vcc +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 +; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v6, s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v8, s2 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v3, v5, v4, s[0:1] -; GFX9-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v7 -; GFX9-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4 -; GFX9-NEXT: v_trunc_f32_e32 v5, v5 -; GFX9-NEXT: v_mac_f32_e32 v4, 0xcf800000, v5 -; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v4 -; GFX9-NEXT: v_cvt_u32_f32_e32 v5, v5 -; GFX9-NEXT: s_sub_u32 s0, 0, s10 -; GFX9-NEXT: s_subb_u32 s1, 0, s11 -; GFX9-NEXT: v_mul_hi_u32 v6, s0, v4 -; GFX9-NEXT: v_mul_lo_u32 v7, s0, v5 -; GFX9-NEXT: v_mul_lo_u32 v8, s1, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GFX9-NEXT: v_mul_lo_u32 v3, s0, v4 -; GFX9-NEXT: v_add_u32_e32 v6, v6, v7 -; GFX9-NEXT: v_add_u32_e32 v6, v6, v8 -; GFX9-NEXT: v_mul_lo_u32 v7, v4, v6 -; GFX9-NEXT: v_mul_hi_u32 v8, v4, v3 -; GFX9-NEXT: v_mul_hi_u32 v9, v4, v6 -; GFX9-NEXT: v_mul_hi_u32 v10, v5, v6 -; GFX9-NEXT: v_mul_lo_u32 v6, v5, v6 -; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v8, v7 -; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, 0, v9, vcc -; GFX9-NEXT: v_mul_lo_u32 v9, v5, v3 -; GFX9-NEXT: v_mul_hi_u32 v3, v5, v3 -; GFX9-NEXT: s_ashr_i32 s12, s7, 31 -; GFX9-NEXT: s_mov_b32 s13, s12 -; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v7, v9 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v8, v3, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v10, vcc -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v6 -; GFX9-NEXT: v_addc_co_u32_e32 v6, 
vcc, 0, v7, vcc -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v4, v3 -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v5, v6, vcc -; GFX9-NEXT: v_mul_lo_u32 v5, s0, v4 -; GFX9-NEXT: v_mul_hi_u32 v6, s0, v3 -; GFX9-NEXT: v_mul_lo_u32 v7, s1, v3 -; GFX9-NEXT: v_mul_lo_u32 v8, s0, v3 -; GFX9-NEXT: s_add_u32 s0, s6, s12 -; GFX9-NEXT: v_add_u32_e32 v5, v6, v5 -; GFX9-NEXT: v_add_u32_e32 v5, v5, v7 -; GFX9-NEXT: v_mul_lo_u32 v9, v3, v5 -; GFX9-NEXT: v_mul_hi_u32 v10, v3, v8 -; GFX9-NEXT: v_mul_hi_u32 v11, v3, v5 -; GFX9-NEXT: v_mul_hi_u32 v7, v4, v8 -; GFX9-NEXT: v_mul_lo_u32 v8, v4, v8 -; GFX9-NEXT: v_mul_hi_u32 v6, v4, v5 -; GFX9-NEXT: v_add_co_u32_e32 v9, vcc, v10, v9 -; GFX9-NEXT: v_addc_co_u32_e32 v10, vcc, 0, v11, vcc -; GFX9-NEXT: v_mul_lo_u32 v5, v4, v5 -; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v9, v8 -; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, v10, v7, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v6, vcc -; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v7, v5 -; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v6, vcc -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 -; GFX9-NEXT: s_addc_u32 s1, s7, s12 -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v6, vcc -; GFX9-NEXT: s_xor_b64 s[6:7], s[0:1], s[12:13] -; GFX9-NEXT: v_mul_lo_u32 v5, s6, v4 -; GFX9-NEXT: v_mul_hi_u32 v6, s6, v3 -; GFX9-NEXT: v_mul_hi_u32 v8, s6, v4 -; GFX9-NEXT: v_mul_hi_u32 v9, s7, v4 -; GFX9-NEXT: v_mul_lo_u32 v4, s7, v4 -; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v6, v5 -; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v8, vcc -; GFX9-NEXT: v_mul_lo_u32 v8, s7, v3 -; GFX9-NEXT: v_mul_hi_u32 v3, s7, v3 -; GFX9-NEXT: v_xor_b32_e32 v1, s8, v1 -; GFX9-NEXT: v_xor_b32_e32 v2, s8, v2 -; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v5, v8 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v6, v3, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v9, vcc -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v5, vcc -; GFX9-NEXT: v_mul_lo_u32 v4, s10, v4 -; GFX9-NEXT: v_mul_hi_u32 v5, s10, v3 -; GFX9-NEXT: v_mul_lo_u32 v6, s11, 
v3 -; GFX9-NEXT: v_mul_lo_u32 v3, s10, v3 -; GFX9-NEXT: v_mov_b32_e32 v7, s8 -; GFX9-NEXT: v_subrev_co_u32_e32 v1, vcc, s8, v1 -; GFX9-NEXT: v_add_u32_e32 v4, v5, v4 -; GFX9-NEXT: v_subb_co_u32_e32 v2, vcc, v2, v7, vcc -; GFX9-NEXT: v_add_u32_e32 v4, v4, v6 -; GFX9-NEXT: v_sub_u32_e32 v5, s7, v4 -; GFX9-NEXT: v_mov_b32_e32 v6, s11 -; GFX9-NEXT: v_sub_co_u32_e32 v3, vcc, s6, v3 -; GFX9-NEXT: v_subb_co_u32_e64 v5, s[0:1], v5, v6, vcc -; GFX9-NEXT: v_subrev_co_u32_e64 v7, s[0:1], s10, v3 -; GFX9-NEXT: v_subbrev_co_u32_e64 v8, s[2:3], 0, v5, s[0:1] -; GFX9-NEXT: v_cmp_le_u32_e64 s[2:3], s11, v8 -; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[2:3] -; GFX9-NEXT: v_cmp_le_u32_e64 s[2:3], s10, v7 -; GFX9-NEXT: v_subb_co_u32_e64 v5, s[0:1], v5, v6, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[2:3] -; GFX9-NEXT: v_cmp_eq_u32_e64 s[2:3], s11, v8 -; GFX9-NEXT: v_subrev_co_u32_e64 v6, s[0:1], s10, v7 -; GFX9-NEXT: v_cndmask_b32_e64 v9, v9, v10, s[2:3] -; GFX9-NEXT: v_subbrev_co_u32_e64 v5, s[0:1], 0, v5, s[0:1] -; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v9 -; GFX9-NEXT: v_cndmask_b32_e64 v5, v8, v5, s[0:1] -; GFX9-NEXT: v_mov_b32_e32 v8, s7 -; GFX9-NEXT: v_subb_co_u32_e32 v4, vcc, v8, v4, vcc -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s11, v4 -; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s10, v3 -; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, -1, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, s11, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v8, v8, v9, vcc -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v5, v7, v6, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc -; GFX9-NEXT: v_xor_b32_e32 v3, s12, v3 -; GFX9-NEXT: v_xor_b32_e32 v4, s12, v4 -; GFX9-NEXT: v_mov_b32_e32 v5, s12 -; GFX9-NEXT: v_subrev_co_u32_e32 v3, vcc, s12, v3 -; GFX9-NEXT: v_subb_co_u32_e32 v4, vcc, v4, v5, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v5, v8, v5, vcc +; GFX9-NEXT: v_xor_b32_e32 v2, s10, v2 +; GFX9-NEXT: 
v_xor_b32_e32 v3, s10, v5 +; GFX9-NEXT: v_mov_b32_e32 v5, s10 +; GFX9-NEXT: v_subrev_co_u32_e32 v2, vcc, s10, v2 +; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v3, v5, vcc ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_store_dwordx4 v0, v[1:4], s[4:5] +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[8:9] ; GFX9-NEXT: s_endpgm %shl.y = shl <2 x i64> , %y %r = srem <2 x i64> %x, %shl.y diff --git a/llvm/test/CodeGen/AMDGPU/cf-loop-on-constant.ll b/llvm/test/CodeGen/AMDGPU/cf-loop-on-constant.ll --- a/llvm/test/CodeGen/AMDGPU/cf-loop-on-constant.ll +++ b/llvm/test/CodeGen/AMDGPU/cf-loop-on-constant.ll @@ -1,14 +1,73 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s -; RUN: llc -march=amdgcn -verify-machineinstrs -O0 < %s +; RUN: llc -march=amdgcn -verify-machineinstrs -O0 < %s | FileCheck -check-prefix=GCN_DBG %s -; GCN-LABEL: {{^}}test_loop: -; GCN: s_and_b64 s[0:1], exec, -1 -; GCN: [[LABEL:.LBB[0-9]+_[0-9]+]]: ; %for.body{{$}} -; GCN: ds_read_b32 -; GCN: ds_write_b32 -; GCN: s_cbranch_vccnz [[LABEL]] -; GCN: s_endpgm define amdgpu_kernel void @test_loop(float addrspace(3)* %ptr, i32 %n) nounwind { +; GCN-LABEL: test_loop: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_load_dword s2, s[0:1], 0xa +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_cmp_eq_u32 s2, -1 +; GCN-NEXT: s_cbranch_scc1 .LBB0_3 +; GCN-NEXT: ; %bb.1: ; %for.body.preheader +; GCN-NEXT: s_load_dword s0, s[0:1], 0x9 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_addk_i32 s0, 0x80 +; GCN-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NEXT: s_and_b64 s[0:1], exec, -1 +; GCN-NEXT: s_mov_b32 m0, -1 +; GCN-NEXT: .LBB0_2: ; %for.body +; GCN-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN-NEXT: ds_read_b32 v1, v0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GCN-NEXT: ds_write_b32 v0, v1 +; GCN-NEXT: v_add_i32_e32 v0, vcc, 4, v0 +; GCN-NEXT: s_mov_b64 vcc, s[0:1] +; GCN-NEXT: 
s_cbranch_vccnz .LBB0_2 +; GCN-NEXT: .LBB0_3: ; %for.exit +; GCN-NEXT: s_endpgm +; +; GCN_DBG-LABEL: test_loop: +; GCN_DBG: ; %bb.0: ; %entry +; GCN_DBG-NEXT: s_load_dword s2, s[0:1], 0x9 +; GCN_DBG-NEXT: s_waitcnt lgkmcnt(0) +; GCN_DBG-NEXT: v_writelane_b32 v2, s2, 0 +; GCN_DBG-NEXT: s_load_dword s1, s[0:1], 0xa +; GCN_DBG-NEXT: s_mov_b32 s0, 0 +; GCN_DBG-NEXT: s_mov_b32 s2, -1 +; GCN_DBG-NEXT: s_waitcnt lgkmcnt(0) +; GCN_DBG-NEXT: s_cmp_lg_u32 s1, s2 +; GCN_DBG-NEXT: v_writelane_b32 v2, s0, 1 +; GCN_DBG-NEXT: s_cbranch_scc1 .LBB0_2 +; GCN_DBG-NEXT: ; %bb.1: ; %for.exit +; GCN_DBG-NEXT: s_endpgm +; GCN_DBG-NEXT: .LBB0_2: ; %for.body +; GCN_DBG-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN_DBG-NEXT: v_readlane_b32 s0, v2, 1 +; GCN_DBG-NEXT: v_readlane_b32 s2, v2, 0 +; GCN_DBG-NEXT: s_mov_b32 s1, 2 +; GCN_DBG-NEXT: s_lshl_b32 s1, s0, s1 +; GCN_DBG-NEXT: s_add_i32 s1, s1, s2 +; GCN_DBG-NEXT: s_mov_b32 s2, 0x80 +; GCN_DBG-NEXT: s_add_i32 s1, s1, s2 +; GCN_DBG-NEXT: s_mov_b32 m0, -1 +; GCN_DBG-NEXT: v_mov_b32_e32 v0, s1 +; GCN_DBG-NEXT: ds_read_b32 v0, v0 +; GCN_DBG-NEXT: s_mov_b32 s2, 1.0 +; GCN_DBG-NEXT: s_waitcnt lgkmcnt(0) +; GCN_DBG-NEXT: v_add_f32_e64 v1, v0, s2 +; GCN_DBG-NEXT: s_mov_b32 m0, -1 +; GCN_DBG-NEXT: v_mov_b32_e32 v0, s1 +; GCN_DBG-NEXT: ds_write_b32 v0, v1 +; GCN_DBG-NEXT: s_mov_b32 s1, 1 +; GCN_DBG-NEXT: s_add_i32 s0, s0, s1 +; GCN_DBG-NEXT: s_mov_b64 s[2:3], -1 +; GCN_DBG-NEXT: s_and_b64 vcc, exec, s[2:3] +; GCN_DBG-NEXT: v_writelane_b32 v2, s0, 1 +; GCN_DBG-NEXT: s_cbranch_vccnz .LBB0_2 +; GCN_DBG-NEXT: ; %bb.3: ; %DummyReturnBlock +; GCN_DBG-NEXT: s_endpgm entry: %cmp = icmp eq i32 %n, -1 br i1 %cmp, label %for.exit, label %for.body @@ -27,12 +86,58 @@ br label %for.body } -; GCN-LABEL: @loop_const_true -; GCN: [[LABEL:.LBB[0-9]+_[0-9]+]]: -; GCN: ds_read_b32 -; GCN: ds_write_b32 -; GCN: s_branch [[LABEL]] define amdgpu_kernel void @loop_const_true(float addrspace(3)* %ptr, i32 %n) nounwind { +; GCN-LABEL: loop_const_true: +; GCN: ; %bb.0: ; 
%entry +; GCN-NEXT: s_load_dword s0, s[0:1], 0x9 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_addk_i32 s0, 0x80 +; GCN-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NEXT: s_mov_b32 m0, -1 +; GCN-NEXT: .LBB1_1: ; %for.body +; GCN-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN-NEXT: ds_read_b32 v1, v0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GCN-NEXT: ds_write_b32 v0, v1 +; GCN-NEXT: v_add_i32_e32 v0, vcc, 4, v0 +; GCN-NEXT: s_branch .LBB1_1 +; +; GCN_DBG-LABEL: loop_const_true: +; GCN_DBG: ; %bb.0: ; %entry +; GCN_DBG-NEXT: s_load_dword s0, s[0:1], 0x9 +; GCN_DBG-NEXT: s_waitcnt lgkmcnt(0) +; GCN_DBG-NEXT: v_writelane_b32 v2, s0, 0 +; GCN_DBG-NEXT: s_mov_b32 s0, 0 +; GCN_DBG-NEXT: v_writelane_b32 v2, s0, 1 +; GCN_DBG-NEXT: s_branch .LBB1_2 +; GCN_DBG-NEXT: .LBB1_1: ; %for.exit +; GCN_DBG-NEXT: s_endpgm +; GCN_DBG-NEXT: .LBB1_2: ; %for.body +; GCN_DBG-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN_DBG-NEXT: v_readlane_b32 s0, v2, 1 +; GCN_DBG-NEXT: v_readlane_b32 s2, v2, 0 +; GCN_DBG-NEXT: s_mov_b32 s1, 2 +; GCN_DBG-NEXT: s_lshl_b32 s1, s0, s1 +; GCN_DBG-NEXT: s_add_i32 s1, s1, s2 +; GCN_DBG-NEXT: s_mov_b32 s2, 0x80 +; GCN_DBG-NEXT: s_add_i32 s1, s1, s2 +; GCN_DBG-NEXT: s_mov_b32 m0, -1 +; GCN_DBG-NEXT: v_mov_b32_e32 v0, s1 +; GCN_DBG-NEXT: ds_read_b32 v0, v0 +; GCN_DBG-NEXT: s_mov_b32 s2, 1.0 +; GCN_DBG-NEXT: s_waitcnt lgkmcnt(0) +; GCN_DBG-NEXT: v_add_f32_e64 v1, v0, s2 +; GCN_DBG-NEXT: s_mov_b32 m0, -1 +; GCN_DBG-NEXT: v_mov_b32_e32 v0, s1 +; GCN_DBG-NEXT: ds_write_b32 v0, v1 +; GCN_DBG-NEXT: s_mov_b32 s1, 1 +; GCN_DBG-NEXT: s_add_i32 s0, s0, s1 +; GCN_DBG-NEXT: s_mov_b64 s[2:3], 0 +; GCN_DBG-NEXT: s_and_b64 vcc, exec, s[2:3] +; GCN_DBG-NEXT: v_writelane_b32 v2, s0, 1 +; GCN_DBG-NEXT: s_cbranch_vccnz .LBB1_1 +; GCN_DBG-NEXT: s_branch .LBB1_2 entry: br label %for.body @@ -50,10 +155,54 @@ br i1 true, label %for.body, label %for.exit } -; GCN-LABEL: {{^}}loop_const_false: -; GCN-NOT: s_branch -; GCN: s_endpgm define amdgpu_kernel void 
@loop_const_false(float addrspace(3)* %ptr, i32 %n) nounwind { +; GCN-LABEL: loop_const_false: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_load_dword s0, s[0:1], 0x9 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NEXT: s_mov_b32 m0, -1 +; GCN-NEXT: ds_read_b32 v1, v0 offset:128 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GCN-NEXT: ds_write_b32 v0, v1 offset:128 +; GCN-NEXT: s_endpgm +; +; GCN_DBG-LABEL: loop_const_false: +; GCN_DBG: ; %bb.0: ; %entry +; GCN_DBG-NEXT: s_load_dword s0, s[0:1], 0x9 +; GCN_DBG-NEXT: s_waitcnt lgkmcnt(0) +; GCN_DBG-NEXT: v_writelane_b32 v2, s0, 0 +; GCN_DBG-NEXT: s_mov_b32 s0, 0 +; GCN_DBG-NEXT: v_writelane_b32 v2, s0, 1 +; GCN_DBG-NEXT: s_branch .LBB2_2 +; GCN_DBG-NEXT: .LBB2_1: ; %for.exit +; GCN_DBG-NEXT: s_endpgm +; GCN_DBG-NEXT: .LBB2_2: ; %for.body +; GCN_DBG-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN_DBG-NEXT: v_readlane_b32 s0, v2, 1 +; GCN_DBG-NEXT: v_readlane_b32 s2, v2, 0 +; GCN_DBG-NEXT: s_mov_b32 s1, 2 +; GCN_DBG-NEXT: s_lshl_b32 s1, s0, s1 +; GCN_DBG-NEXT: s_add_i32 s1, s1, s2 +; GCN_DBG-NEXT: s_mov_b32 s2, 0x80 +; GCN_DBG-NEXT: s_add_i32 s1, s1, s2 +; GCN_DBG-NEXT: s_mov_b32 m0, -1 +; GCN_DBG-NEXT: v_mov_b32_e32 v0, s1 +; GCN_DBG-NEXT: ds_read_b32 v0, v0 +; GCN_DBG-NEXT: s_mov_b32 s2, 1.0 +; GCN_DBG-NEXT: s_waitcnt lgkmcnt(0) +; GCN_DBG-NEXT: v_add_f32_e64 v1, v0, s2 +; GCN_DBG-NEXT: s_mov_b32 m0, -1 +; GCN_DBG-NEXT: v_mov_b32_e32 v0, s1 +; GCN_DBG-NEXT: ds_write_b32 v0, v1 +; GCN_DBG-NEXT: s_mov_b32 s1, 1 +; GCN_DBG-NEXT: s_add_i32 s0, s0, s1 +; GCN_DBG-NEXT: s_mov_b64 s[2:3], -1 +; GCN_DBG-NEXT: s_and_b64 vcc, exec, s[2:3] +; GCN_DBG-NEXT: v_writelane_b32 v2, s0, 1 +; GCN_DBG-NEXT: s_cbranch_vccnz .LBB2_1 +; GCN_DBG-NEXT: s_branch .LBB2_2 entry: br label %for.body @@ -72,10 +221,52 @@ br i1 false, label %for.body, label %for.exit } -; GCN-LABEL: {{^}}loop_const_undef: -; GCN-NOT: s_branch -; GCN: s_endpgm define amdgpu_kernel void @loop_const_undef(float 
addrspace(3)* %ptr, i32 %n) nounwind { +; GCN-LABEL: loop_const_undef: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_load_dword s0, s[0:1], 0x9 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NEXT: s_mov_b32 m0, -1 +; GCN-NEXT: ds_read_b32 v1, v0 offset:128 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GCN-NEXT: ds_write_b32 v0, v1 offset:128 +; GCN-NEXT: s_endpgm +; +; GCN_DBG-LABEL: loop_const_undef: +; GCN_DBG: ; %bb.0: ; %entry +; GCN_DBG-NEXT: s_load_dword s0, s[0:1], 0x9 +; GCN_DBG-NEXT: s_waitcnt lgkmcnt(0) +; GCN_DBG-NEXT: v_writelane_b32 v2, s0, 0 +; GCN_DBG-NEXT: s_mov_b32 s0, 0 +; GCN_DBG-NEXT: v_writelane_b32 v2, s0, 1 +; GCN_DBG-NEXT: s_branch .LBB3_2 +; GCN_DBG-NEXT: .LBB3_1: ; %for.exit +; GCN_DBG-NEXT: s_endpgm +; GCN_DBG-NEXT: .LBB3_2: ; %for.body +; GCN_DBG-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN_DBG-NEXT: v_readlane_b32 s0, v2, 1 +; GCN_DBG-NEXT: v_readlane_b32 s2, v2, 0 +; GCN_DBG-NEXT: s_mov_b32 s1, 2 +; GCN_DBG-NEXT: s_lshl_b32 s1, s0, s1 +; GCN_DBG-NEXT: s_add_i32 s1, s1, s2 +; GCN_DBG-NEXT: s_mov_b32 s2, 0x80 +; GCN_DBG-NEXT: s_add_i32 s1, s1, s2 +; GCN_DBG-NEXT: s_mov_b32 m0, -1 +; GCN_DBG-NEXT: v_mov_b32_e32 v0, s1 +; GCN_DBG-NEXT: ds_read_b32 v0, v0 +; GCN_DBG-NEXT: s_mov_b32 s2, 1.0 +; GCN_DBG-NEXT: s_waitcnt lgkmcnt(0) +; GCN_DBG-NEXT: v_add_f32_e64 v1, v0, s2 +; GCN_DBG-NEXT: s_mov_b32 m0, -1 +; GCN_DBG-NEXT: v_mov_b32_e32 v0, s1 +; GCN_DBG-NEXT: ds_write_b32 v0, v1 +; GCN_DBG-NEXT: s_mov_b32 s1, 1 +; GCN_DBG-NEXT: s_add_i32 s0, s0, s1 +; GCN_DBG-NEXT: v_writelane_b32 v2, s0, 1 +; GCN_DBG-NEXT: s_cbranch_scc1 .LBB3_1 +; GCN_DBG-NEXT: s_branch .LBB3_2 entry: br label %for.body @@ -94,18 +285,81 @@ br i1 undef, label %for.body, label %for.exit } -; GCN-LABEL: {{^}}loop_arg_0: -; GCN: v_and_b32_e32 v{{[0-9]+}}, 1, v{{[0-9]+}} -; GCN: v_cmp_eq_u32{{[^,]*}}, 1, -; GCN: s_add_i32 s2, s0, 0x80 - -; GCN: [[LOOPBB:.LBB[0-9]+_[0-9]+]] -; GCN: _add_i32_e32 v0, vcc, 4, v0 - -; GCN: 
s_cbranch_{{vccz|vccnz}} [[LOOPBB]] -; GCN-NEXT: ; %bb.2 -; GCN-NEXT: s_endpgm define amdgpu_kernel void @loop_arg_0(float addrspace(3)* %ptr, i32 %n) nounwind { +; GCN-LABEL: loop_arg_0: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: s_mov_b32 m0, -1 +; GCN-NEXT: ds_read_u8 v0, v0 +; GCN-NEXT: s_load_dword s2, s[0:1], 0x9 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_readfirstlane_b32 s0, v0 +; GCN-NEXT: s_bitcmp1_b32 s0, 0 +; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GCN-NEXT: s_addk_i32 s2, 0x80 +; GCN-NEXT: s_xor_b64 s[0:1], s[0:1], -1 +; GCN-NEXT: v_mov_b32_e32 v0, s2 +; GCN-NEXT: s_and_b64 s[0:1], exec, s[0:1] +; GCN-NEXT: .LBB4_1: ; %for.body +; GCN-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN-NEXT: ds_read_b32 v1, v0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GCN-NEXT: ds_write_b32 v0, v1 +; GCN-NEXT: v_add_i32_e32 v0, vcc, 4, v0 +; GCN-NEXT: s_mov_b64 vcc, s[0:1] +; GCN-NEXT: s_cbranch_vccz .LBB4_1 +; GCN-NEXT: ; %bb.2: ; %for.exit +; GCN-NEXT: s_endpgm +; +; GCN_DBG-LABEL: loop_arg_0: +; GCN_DBG: ; %bb.0: ; %entry +; GCN_DBG-NEXT: s_load_dword s0, s[0:1], 0x9 +; GCN_DBG-NEXT: s_waitcnt lgkmcnt(0) +; GCN_DBG-NEXT: v_writelane_b32 v2, s0, 0 +; GCN_DBG-NEXT: v_mov_b32_e32 v0, 0 +; GCN_DBG-NEXT: s_mov_b32 m0, -1 +; GCN_DBG-NEXT: ds_read_u8 v0, v0 +; GCN_DBG-NEXT: s_waitcnt lgkmcnt(0) +; GCN_DBG-NEXT: v_readfirstlane_b32 s0, v0 +; GCN_DBG-NEXT: s_and_b32 s0, 1, s0 +; GCN_DBG-NEXT: s_cmp_eq_u32 s0, 1 +; GCN_DBG-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GCN_DBG-NEXT: s_mov_b64 s[2:3], -1 +; GCN_DBG-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3] +; GCN_DBG-NEXT: v_writelane_b32 v2, s0, 1 +; GCN_DBG-NEXT: v_writelane_b32 v2, s1, 2 +; GCN_DBG-NEXT: s_mov_b32 s0, 0 +; GCN_DBG-NEXT: v_writelane_b32 v2, s0, 3 +; GCN_DBG-NEXT: s_branch .LBB4_2 +; GCN_DBG-NEXT: .LBB4_1: ; %for.exit +; GCN_DBG-NEXT: s_endpgm +; GCN_DBG-NEXT: .LBB4_2: ; %for.body +; GCN_DBG-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN_DBG-NEXT: 
v_readlane_b32 s0, v2, 3 +; GCN_DBG-NEXT: v_readlane_b32 s2, v2, 1 +; GCN_DBG-NEXT: v_readlane_b32 s3, v2, 2 +; GCN_DBG-NEXT: v_readlane_b32 s4, v2, 0 +; GCN_DBG-NEXT: s_mov_b32 s1, 2 +; GCN_DBG-NEXT: s_lshl_b32 s1, s0, s1 +; GCN_DBG-NEXT: s_add_i32 s1, s1, s4 +; GCN_DBG-NEXT: s_mov_b32 s4, 0x80 +; GCN_DBG-NEXT: s_add_i32 s1, s1, s4 +; GCN_DBG-NEXT: s_mov_b32 m0, -1 +; GCN_DBG-NEXT: v_mov_b32_e32 v0, s1 +; GCN_DBG-NEXT: ds_read_b32 v0, v0 +; GCN_DBG-NEXT: s_mov_b32 s4, 1.0 +; GCN_DBG-NEXT: s_waitcnt lgkmcnt(0) +; GCN_DBG-NEXT: v_add_f32_e64 v1, v0, s4 +; GCN_DBG-NEXT: s_mov_b32 m0, -1 +; GCN_DBG-NEXT: v_mov_b32_e32 v0, s1 +; GCN_DBG-NEXT: ds_write_b32 v0, v1 +; GCN_DBG-NEXT: s_mov_b32 s1, 1 +; GCN_DBG-NEXT: s_add_i32 s0, s0, s1 +; GCN_DBG-NEXT: s_and_b64 vcc, exec, s[2:3] +; GCN_DBG-NEXT: v_writelane_b32 v2, s0, 3 +; GCN_DBG-NEXT: s_cbranch_vccnz .LBB4_1 +; GCN_DBG-NEXT: s_branch .LBB4_2 entry: %cond = load volatile i1, i1 addrspace(3)* null br label %for.body diff --git a/llvm/test/CodeGen/AMDGPU/idiv-licm.ll b/llvm/test/CodeGen/AMDGPU/idiv-licm.ll --- a/llvm/test/CodeGen/AMDGPU/idiv-licm.ll +++ b/llvm/test/CodeGen/AMDGPU/idiv-licm.ll @@ -9,35 +9,41 @@ ; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c ; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s4 ; GFX9-NEXT: s_sub_i32 s5, 0, s4 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX9-NEXT: v_mul_lo_u32 v1, s5, v0 -; GFX9-NEXT: v_mul_hi_u32 v1, v0, v1 -; GFX9-NEXT: v_add_u32_e32 v0, v0, v1 -; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: .LBB0_1: ; %bb3 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_mul_lo_u32 v2, s3, v0 -; GFX9-NEXT: v_mul_hi_u32 v3, s2, v0 -; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 -; GFX9-NEXT: v_mul_lo_u32 v3, s5, v2 -; GFX9-NEXT: v_not_b32_e32 v5, v2 -; GFX9-NEXT: 
v_mul_lo_u32 v5, s4, v5 -; GFX9-NEXT: v_add_u32_e32 v4, 1, v2 -; GFX9-NEXT: v_add_u32_e32 v3, s2, v3 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s4, v3 +; GFX9-NEXT: v_readfirstlane_b32 s6, v0 +; GFX9-NEXT: s_mul_i32 s7, s5, s6 +; GFX9-NEXT: s_mul_hi_u32 s7, s6, s7 +; GFX9-NEXT: s_add_i32 s6, s6, s7 +; GFX9-NEXT: s_mul_i32 s7, s3, s6 +; GFX9-NEXT: s_mul_hi_u32 s6, s2, s6 +; GFX9-NEXT: s_add_i32 s6, s6, s7 +; GFX9-NEXT: s_mul_i32 s7, s5, s6 +; GFX9-NEXT: s_add_i32 s7, s2, s7 +; GFX9-NEXT: s_cmp_ge_u32 s7, s4 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s7 +; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 +; GFX9-NEXT: s_add_i32 s7, s6, 1 +; GFX9-NEXT: s_not_b32 s6, s6 +; GFX9-NEXT: s_mul_i32 s6, s4, s6 +; GFX9-NEXT: v_mov_b32_e32 v4, s7 +; GFX9-NEXT: s_add_i32 s6, s2, s6 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; GFX9-NEXT: v_add_u32_e32 v4, s2, v5 +; GFX9-NEXT: v_mov_b32_e32 v4, s6 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc +; GFX9-NEXT: v_add_u32_e32 v5, 1, v2 ; GFX9-NEXT: s_add_u32 s2, s2, 1 -; GFX9-NEXT: v_add_u32_e32 v4, 1, v2 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s4, v3 ; GFX9-NEXT: s_addc_u32 s3, s3, 0 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc ; GFX9-NEXT: global_store_dword v1, v2, s[0:1] ; GFX9-NEXT: s_add_u32 s0, s0, 4 ; GFX9-NEXT: s_addc_u32 s1, s1, 0 @@ -49,34 +55,38 @@ ; GFX10-LABEL: udiv32_invariant_denom: ; GFX10: ; %bb.0: ; %bb ; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX10-NEXT: s_mov_b64 s[2:3], 0 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-NEXT: s_mov_b64 s[2:3], 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s4 ; GFX10-NEXT: s_sub_i32 s5, 0, s4 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX10-NEXT: v_mul_lo_u32 v1, s5, v0 -; GFX10-NEXT: v_mul_hi_u32 v1, v0, v1 -; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v1 -; GFX10-NEXT: 
v_mov_b32_e32 v1, 0 ; GFX10-NEXT: .LBB0_1: ; %bb3 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_mul_lo_u32 v2, s3, v0 -; GFX10-NEXT: v_mul_hi_u32 v3, s2, v0 -; GFX10-NEXT: v_add_nc_u32_e32 v2, v3, v2 -; GFX10-NEXT: v_not_b32_e32 v3, v2 -; GFX10-NEXT: v_mul_lo_u32 v4, s5, v2 -; GFX10-NEXT: v_add_nc_u32_e32 v5, 1, v2 -; GFX10-NEXT: v_mul_lo_u32 v3, s4, v3 -; GFX10-NEXT: v_add_nc_u32_e32 v4, s2, v4 -; GFX10-NEXT: v_add_nc_u32_e32 v3, s2, v3 -; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s4, v4 +; GFX10-NEXT: v_readfirstlane_b32 s6, v0 +; GFX10-NEXT: s_mul_i32 s7, s5, s6 +; GFX10-NEXT: s_mul_hi_u32 s7, s6, s7 +; GFX10-NEXT: s_add_i32 s6, s6, s7 +; GFX10-NEXT: s_mul_i32 s7, s3, s6 +; GFX10-NEXT: s_mul_hi_u32 s6, s2, s6 +; GFX10-NEXT: s_add_i32 s6, s6, s7 +; GFX10-NEXT: s_mul_i32 s7, s5, s6 +; GFX10-NEXT: s_add_i32 s7, s2, s7 +; GFX10-NEXT: s_cmp_ge_u32 s7, s4 +; GFX10-NEXT: s_cselect_b32 vcc_lo, -1, 0 +; GFX10-NEXT: s_add_i32 s8, s6, 1 +; GFX10-NEXT: s_not_b32 s9, s6 +; GFX10-NEXT: v_mov_b32_e32 v2, s8 +; GFX10-NEXT: s_mul_i32 s8, s4, s9 +; GFX10-NEXT: s_add_i32 s8, s2, s8 ; GFX10-NEXT: s_add_u32 s2, s2, 1 +; GFX10-NEXT: v_mov_b32_e32 v3, s8 +; GFX10-NEXT: v_cndmask_b32_e32 v2, s6, v2, vcc_lo ; GFX10-NEXT: s_addc_u32 s3, s3, 0 -; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v3, s7, v3, vcc_lo ; GFX10-NEXT: v_add_nc_u32_e32 v4, 1, v2 ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s4, v3 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc_lo @@ -165,33 +175,35 @@ ; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c ; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s4 ; GFX9-NEXT: s_sub_i32 s5, 0, s4 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX9-NEXT: v_mul_lo_u32 v1, s5, v0 
-; GFX9-NEXT: v_mul_hi_u32 v1, v0, v1 -; GFX9-NEXT: v_add_u32_e32 v0, v0, v1 -; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: .LBB1_1: ; %bb3 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_mul_lo_u32 v2, s3, v0 -; GFX9-NEXT: v_mul_hi_u32 v3, s2, v0 -; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 -; GFX9-NEXT: v_mul_lo_u32 v3, s5, v2 -; GFX9-NEXT: v_not_b32_e32 v2, v2 -; GFX9-NEXT: v_mul_lo_u32 v2, s4, v2 -; GFX9-NEXT: v_add_u32_e32 v3, s2, v3 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s4, v3 -; GFX9-NEXT: v_add_u32_e32 v2, s2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc +; GFX9-NEXT: v_readfirstlane_b32 s6, v0 +; GFX9-NEXT: s_mul_i32 s7, s5, s6 +; GFX9-NEXT: s_mul_hi_u32 s7, s6, s7 +; GFX9-NEXT: s_add_i32 s6, s6, s7 +; GFX9-NEXT: s_mul_i32 s7, s3, s6 +; GFX9-NEXT: s_mul_hi_u32 s6, s2, s6 +; GFX9-NEXT: s_add_i32 s6, s6, s7 +; GFX9-NEXT: s_mul_i32 s7, s5, s6 +; GFX9-NEXT: s_not_b32 s6, s6 +; GFX9-NEXT: s_mul_i32 s6, s4, s6 +; GFX9-NEXT: s_add_i32 s7, s2, s7 +; GFX9-NEXT: s_add_i32 s6, s2, s6 +; GFX9-NEXT: s_cmp_ge_u32 s7, s4 +; GFX9-NEXT: s_cselect_b32 s6, s6, s7 +; GFX9-NEXT: s_sub_i32 s7, s6, s4 +; GFX9-NEXT: s_cmp_ge_u32 s6, s4 +; GFX9-NEXT: s_cselect_b32 s6, s7, s6 ; GFX9-NEXT: s_add_u32 s2, s2, 1 -; GFX9-NEXT: v_subrev_u32_e32 v3, s4, v2 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s4, v2 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 ; GFX9-NEXT: s_addc_u32 s3, s3, 0 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc ; GFX9-NEXT: global_store_dword v1, v2, s[0:1] ; GFX9-NEXT: s_add_u32 s0, s0, 4 ; GFX9-NEXT: s_addc_u32 s1, s1, 0 @@ -203,35 +215,37 @@ ; GFX10-LABEL: urem32_invariant_denom: ; GFX10: ; %bb.0: ; %bb ; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX10-NEXT: s_mov_b64 s[2:3], 0 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-NEXT: s_mov_b64 s[2:3], 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s4 ; GFX10-NEXT: s_sub_i32 s5, 0, s4 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX10-NEXT: 
v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX10-NEXT: v_mul_lo_u32 v1, s5, v0 -; GFX10-NEXT: v_mul_hi_u32 v1, v0, v1 -; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v1 -; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: .LBB1_1: ; %bb3 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_mul_lo_u32 v2, s3, v0 -; GFX10-NEXT: v_mul_hi_u32 v3, s2, v0 -; GFX10-NEXT: v_add_nc_u32_e32 v2, v3, v2 -; GFX10-NEXT: v_not_b32_e32 v3, v2 -; GFX10-NEXT: v_mul_lo_u32 v2, s5, v2 -; GFX10-NEXT: v_mul_lo_u32 v3, s4, v3 -; GFX10-NEXT: v_add_nc_u32_e32 v2, s2, v2 -; GFX10-NEXT: v_add_nc_u32_e32 v3, s2, v3 -; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s4, v2 +; GFX10-NEXT: v_readfirstlane_b32 s6, v0 +; GFX10-NEXT: s_mul_i32 s7, s5, s6 +; GFX10-NEXT: s_mul_hi_u32 s7, s6, s7 +; GFX10-NEXT: s_add_i32 s6, s6, s7 +; GFX10-NEXT: s_mul_i32 s7, s3, s6 +; GFX10-NEXT: s_mul_hi_u32 s6, s2, s6 +; GFX10-NEXT: s_add_i32 s6, s6, s7 +; GFX10-NEXT: s_not_b32 s7, s6 +; GFX10-NEXT: s_mul_i32 s6, s5, s6 +; GFX10-NEXT: s_mul_i32 s7, s4, s7 +; GFX10-NEXT: s_add_i32 s6, s2, s6 +; GFX10-NEXT: s_add_i32 s7, s2, s7 +; GFX10-NEXT: s_cmp_ge_u32 s6, s4 +; GFX10-NEXT: s_cselect_b32 s6, s7, s6 +; GFX10-NEXT: s_sub_i32 s7, s6, s4 +; GFX10-NEXT: s_cmp_ge_u32 s6, s4 +; GFX10-NEXT: s_cselect_b32 s6, s7, s6 ; GFX10-NEXT: s_add_u32 s2, s2, 1 +; GFX10-NEXT: v_mov_b32_e32 v2, s6 ; GFX10-NEXT: s_addc_u32 s3, s3, 0 -; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo -; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s4, v2 -; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s4, v2 -; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo ; GFX10-NEXT: global_store_dword v1, v2, s[0:1] ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_add_u32 s0, s0, 4 @@ -355,38 +369,41 @@ ; GFX10-LABEL: sdiv32_invariant_denom: ; GFX10: ; %bb.0: ; %bb ; GFX10-NEXT: s_load_dword s3, s[0:1], 0x2c +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: s_waitcnt 
lgkmcnt(0) ; GFX10-NEXT: s_ashr_i32 s2, s3, 31 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: s_add_i32 s3, s3, s2 ; GFX10-NEXT: s_xor_b32 s3, s3, s2 ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s3 -; GFX10-NEXT: s_sub_i32 s4, 0, s3 +; GFX10-NEXT: s_sub_i32 s5, 0, s3 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX10-NEXT: v_mul_lo_u32 v1, s4, v0 -; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: v_mul_hi_u32 v1, v0, v1 -; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v1 -; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: .LBB2_1: ; %bb3 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_mul_hi_u32 v2, s4, v0 -; GFX10-NEXT: v_mul_lo_u32 v3, v2, s3 -; GFX10-NEXT: v_add_nc_u32_e32 v4, 1, v2 -; GFX10-NEXT: v_sub_nc_u32_e32 v3, s4, v3 +; GFX10-NEXT: v_readfirstlane_b32 s6, v0 +; GFX10-NEXT: s_mul_i32 s7, s5, s6 +; GFX10-NEXT: s_mul_hi_u32 s7, s6, s7 +; GFX10-NEXT: s_add_i32 s6, s6, s7 +; GFX10-NEXT: s_mul_hi_u32 s6, s4, s6 +; GFX10-NEXT: s_mul_i32 s7, s6, s3 +; GFX10-NEXT: s_sub_i32 s7, s4, s7 +; GFX10-NEXT: s_cmp_ge_u32 s7, s3 +; GFX10-NEXT: s_cselect_b32 vcc_lo, -1, 0 +; GFX10-NEXT: s_add_i32 s8, s6, 1 ; GFX10-NEXT: s_add_i32 s4, s4, 1 -; GFX10-NEXT: v_subrev_nc_u32_e32 v5, s3, v3 -; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s3, v3 -; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc_lo +; GFX10-NEXT: v_mov_b32_e32 v2, s8 +; GFX10-NEXT: s_sub_i32 s8, s7, s3 +; GFX10-NEXT: v_mov_b32_e32 v3, s8 +; GFX10-NEXT: v_cndmask_b32_e32 v2, s6, v2, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v3, s7, v3, vcc_lo ; GFX10-NEXT: v_add_nc_u32_e32 v4, 1, v2 ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s3, v3 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc_lo ; GFX10-NEXT: v_xor_b32_e32 v2, s2, v2 ; GFX10-NEXT: v_subrev_nc_u32_e32 v2, s2, v2 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_store_dword v1, v2, s[0:1] ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; 
GFX10-NEXT: s_add_u32 s0, s0, 4 @@ -470,34 +487,35 @@ ; GFX9-LABEL: srem32_invariant_denom: ; GFX9: ; %bb.0: ; %bb ; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_ashr_i32 s3, s2, 31 ; GFX9-NEXT: s_add_i32 s2, s2, s3 ; GFX9-NEXT: s_xor_b32 s2, s2, s3 ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s2 -; GFX9-NEXT: s_sub_i32 s3, 0, s2 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b32 s3, 0 +; GFX9-NEXT: s_sub_i32 s4, 0, s2 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX9-NEXT: v_mul_lo_u32 v1, s3, v0 -; GFX9-NEXT: s_mov_b32 s3, 0 -; GFX9-NEXT: v_mul_hi_u32 v1, v0, v1 -; GFX9-NEXT: v_add_u32_e32 v0, v0, v1 -; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: .LBB3_1: ; %bb3 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_mul_hi_u32 v2, s3, v0 -; GFX9-NEXT: v_mul_lo_u32 v2, v2, s2 -; GFX9-NEXT: v_sub_u32_e32 v2, s3, v2 -; GFX9-NEXT: v_subrev_u32_e32 v3, s2, v2 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc -; GFX9-NEXT: v_subrev_u32_e32 v3, s2, v2 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s2, v2 +; GFX9-NEXT: v_readfirstlane_b32 s5, v0 +; GFX9-NEXT: s_mul_i32 s6, s4, s5 +; GFX9-NEXT: s_mul_hi_u32 s6, s5, s6 +; GFX9-NEXT: s_add_i32 s5, s5, s6 +; GFX9-NEXT: s_mul_hi_u32 s5, s3, s5 +; GFX9-NEXT: s_mul_i32 s5, s5, s2 +; GFX9-NEXT: s_sub_i32 s5, s3, s5 +; GFX9-NEXT: s_sub_i32 s6, s5, s2 +; GFX9-NEXT: s_cmp_ge_u32 s5, s2 +; GFX9-NEXT: s_cselect_b32 s5, s6, s5 +; GFX9-NEXT: s_sub_i32 s6, s5, s2 +; GFX9-NEXT: s_cmp_ge_u32 s5, s2 +; GFX9-NEXT: s_cselect_b32 s5, s6, s5 ; GFX9-NEXT: s_add_i32 s3, s3, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc -; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v2, s5 ; GFX9-NEXT: global_store_dword v1, v2, s[0:1] ; GFX9-NEXT: s_add_u32 s0, s0, 4 ; GFX9-NEXT: 
s_addc_u32 s1, s1, 0 @@ -509,34 +527,35 @@ ; GFX10-LABEL: srem32_invariant_denom: ; GFX10: ; %bb.0: ; %bb ; GFX10-NEXT: s_load_dword s2, s[0:1], 0x2c +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_ashr_i32 s3, s2, 31 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: s_add_i32 s2, s2, s3 ; GFX10-NEXT: s_xor_b32 s2, s2, s3 +; GFX10-NEXT: s_mov_b32 s3, 0 ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s2 -; GFX10-NEXT: s_sub_i32 s3, 0, s2 +; GFX10-NEXT: s_sub_i32 s4, 0, s2 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX10-NEXT: v_mul_lo_u32 v1, s3, v0 -; GFX10-NEXT: s_mov_b32 s3, 0 -; GFX10-NEXT: v_mul_hi_u32 v1, v0, v1 -; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v1 -; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: .LBB3_1: ; %bb3 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_mul_hi_u32 v2, s3, v0 -; GFX10-NEXT: v_mul_lo_u32 v2, v2, s2 -; GFX10-NEXT: v_sub_nc_u32_e32 v2, s3, v2 +; GFX10-NEXT: v_readfirstlane_b32 s5, v0 +; GFX10-NEXT: s_mul_i32 s6, s4, s5 +; GFX10-NEXT: s_mul_hi_u32 s6, s5, s6 +; GFX10-NEXT: s_add_i32 s5, s5, s6 +; GFX10-NEXT: s_mul_hi_u32 s5, s3, s5 +; GFX10-NEXT: s_mul_i32 s5, s5, s2 +; GFX10-NEXT: s_sub_i32 s5, s3, s5 +; GFX10-NEXT: s_sub_i32 s6, s5, s2 +; GFX10-NEXT: s_cmp_ge_u32 s5, s2 +; GFX10-NEXT: s_cselect_b32 s5, s6, s5 +; GFX10-NEXT: s_sub_i32 s6, s5, s2 +; GFX10-NEXT: s_cmp_ge_u32 s5, s2 +; GFX10-NEXT: s_cselect_b32 s5, s6, s5 ; GFX10-NEXT: s_add_i32 s3, s3, 1 -; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s2, v2 -; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s2, v2 -; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo -; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s2, v2 -; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s2, v2 -; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo -; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v2, s5 ; GFX10-NEXT: global_store_dword 
v1, v2, s[0:1] ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_add_u32 s0, s0, 4 @@ -615,32 +634,33 @@ ; GFX9-LABEL: udiv16_invariant_denom: ; GFX9: ; %bb.0: ; %bb ; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c -; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: s_movk_i32 s4, 0x400 -; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: s_mov_b32 s5, 0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_movk_i32 s6, 0x400 +; GFX9-NEXT: s_mov_b32 s7, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_and_b32 s2, 0xffff, s2 -; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s2 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s2 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v2 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v0 ; GFX9-NEXT: .LBB4_1: ; %bb3 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v4 -; GFX9-NEXT: v_cvt_f32_u32_e32 v8, v0 -; GFX9-NEXT: v_lshlrev_b64 v[5:6], 1, v[0:1] +; GFX9-NEXT: s_and_b32 s4, 0xffff, s7 +; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s4 +; GFX9-NEXT: v_add_u16_e64 v3, s7, 1 +; GFX9-NEXT: v_readfirstlane_b32 s7, v3 +; GFX9-NEXT: v_cmp_eq_u16_e32 vcc, s6, v3 +; GFX9-NEXT: v_mul_f32_e32 v3, v4, v1 +; GFX9-NEXT: v_trunc_f32_e32 v3, v3 +; GFX9-NEXT: v_cvt_u32_f32_e32 v5, v3 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[4:5], 1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v7, s3 -; GFX9-NEXT: v_add_co_u32_e64 v5, s[0:1], s2, v5 -; GFX9-NEXT: v_mul_f32_e32 v0, v8, v3 -; GFX9-NEXT: v_trunc_f32_e32 v0, v0 -; GFX9-NEXT: v_addc_co_u32_e64 v6, s[0:1], v7, v6, s[0:1] -; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v0 -; GFX9-NEXT: v_add_u16_e32 v4, 1, v4 -; GFX9-NEXT: v_mad_f32 v0, -v0, v2, v8 -; GFX9-NEXT: v_cmp_eq_u16_e32 vcc, s4, v4 -; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v0|, v2 -; GFX9-NEXT: v_addc_co_u32_e64 v0, s[0:1], 0, v7, s[0:1] -; GFX9-NEXT: global_store_short v[5:6], v0, off +; GFX9-NEXT: s_add_u32 s8, s2, s0 +; GFX9-NEXT: v_mad_f32 v3, -v3, v0, v4 +; GFX9-NEXT: s_addc_u32 s9, s3, s1 +; GFX9-NEXT: 
v_cmp_ge_f32_e64 s[0:1], |v3|, v0 +; GFX9-NEXT: v_addc_co_u32_e64 v3, s[0:1], 0, v5, s[0:1] +; GFX9-NEXT: global_store_short v2, v3, s[8:9] ; GFX9-NEXT: s_cbranch_vccz .LBB4_1 ; GFX9-NEXT: ; %bb.2: ; %bb2 ; GFX9-NEXT: s_endpgm @@ -650,29 +670,31 @@ ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: v_mov_b32_e32 v4, 0 +; GFX10-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-NEXT: s_mov_b32 s1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_and_b32 s0, 0xffff, s4 -; GFX10-NEXT: v_cvt_f32_u32_e32 v2, s0 -; GFX10-NEXT: v_rcp_iflag_f32_e32 v3, v2 +; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s0 +; GFX10-NEXT: v_rcp_iflag_f32_e32 v1, v0 ; GFX10-NEXT: .LBB4_1: ; %bb3 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v4 -; GFX10-NEXT: v_add_nc_u16 v4, v4, 1 -; GFX10-NEXT: v_cvt_f32_u32_e32 v7, v0 -; GFX10-NEXT: v_lshlrev_b64 v[5:6], 1, v[0:1] -; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x400, v4 -; GFX10-NEXT: v_mul_f32_e32 v0, v7, v3 -; GFX10-NEXT: v_add_co_u32 v5, s0, s2, v5 -; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s0, s3, v6, s0 -; GFX10-NEXT: v_trunc_f32_e32 v0, v0 +; GFX10-NEXT: s_and_b32 s0, 0xffff, s4 +; GFX10-NEXT: v_add_nc_u16 v3, s4, 1 +; GFX10-NEXT: v_cvt_f32_u32_e32 v4, s0 +; GFX10-NEXT: s_lshl_b64 s[4:5], s[0:1], 1 +; GFX10-NEXT: s_add_u32 s6, s2, s4 +; GFX10-NEXT: v_readfirstlane_b32 s4, v3 +; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x400, v3 +; GFX10-NEXT: v_mul_f32_e32 v3, v4, v1 +; GFX10-NEXT: s_addc_u32 s7, s3, s5 ; GFX10-NEXT: s_and_b32 vcc_lo, exec_lo, vcc_lo -; GFX10-NEXT: v_mad_f32 v7, -v0, v2, v7 -; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX10-NEXT: v_cmp_ge_f32_e64 s0, |v7|, v2 -; GFX10-NEXT: v_add_co_ci_u32_e64 v0, s0, 0, v0, s0 -; GFX10-NEXT: global_store_short v[5:6], v0, off +; GFX10-NEXT: v_trunc_f32_e32 v3, v3 +; GFX10-NEXT: v_mad_f32 v4, -v3, v0, v4 +; GFX10-NEXT: 
v_cvt_u32_f32_e32 v3, v3 +; GFX10-NEXT: v_cmp_ge_f32_e64 s0, |v4|, v0 +; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, 0, v3, s0 +; GFX10-NEXT: global_store_short v2, v3, s[6:7] ; GFX10-NEXT: s_cbranch_vccz .LBB4_1 ; GFX10-NEXT: ; %bb.2: ; %bb2 ; GFX10-NEXT: s_endpgm @@ -862,37 +884,38 @@ ; GFX9-LABEL: sdiv16_invariant_denom: ; GFX9: ; %bb.0: ; %bb ; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c -; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: s_mov_b32 s3, 0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_movk_i32 s5, 0x400 -; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_sext_i32_i16 s4, s2 -; GFX9-NEXT: v_cvt_f32_i32_e32 v2, s4 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v2 +; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s4 +; GFX9-NEXT: s_mov_b32 s6, 0 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v0 ; GFX9-NEXT: .LBB6_1: ; %bb3 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_bfe_i32 v5, v4, 0, 16 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v4 -; GFX9-NEXT: v_cvt_f32_i32_e32 v9, v5 -; GFX9-NEXT: v_xor_b32_e32 v8, s4, v5 -; GFX9-NEXT: v_lshlrev_b64 v[5:6], 1, v[0:1] -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v7, s3 -; GFX9-NEXT: v_add_co_u32_e64 v5, s[0:1], s2, v5 -; GFX9-NEXT: v_addc_co_u32_e64 v6, s[0:1], v7, v6, s[0:1] -; GFX9-NEXT: v_mul_f32_e32 v7, v9, v3 -; GFX9-NEXT: v_trunc_f32_e32 v7, v7 -; GFX9-NEXT: v_ashrrev_i32_e32 v0, 30, v8 -; GFX9-NEXT: v_cvt_i32_f32_e32 v8, v7 -; GFX9-NEXT: v_mad_f32 v7, -v7, v2, v9 -; GFX9-NEXT: v_add_u16_e32 v4, 1, v4 -; GFX9-NEXT: v_or_b32_e32 v0, 1, v0 -; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v7|, |v2| -; GFX9-NEXT: v_cmp_eq_u16_e32 vcc, s5, v4 -; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, v0, s[0:1] -; GFX9-NEXT: v_add_u32_e32 v0, v8, v0 -; GFX9-NEXT: global_store_short v[5:6], v0, off +; GFX9-NEXT: s_sext_i32_i16 s2, s6 +; GFX9-NEXT: v_cvt_f32_i32_e32 v4, s2 +; GFX9-NEXT: s_xor_b32 s7, s2, s4 +; 
GFX9-NEXT: s_ashr_i32 s2, s7, 30 +; GFX9-NEXT: s_or_b32 s2, s2, 1 +; GFX9-NEXT: v_mul_f32_e32 v5, v4, v1 +; GFX9-NEXT: v_trunc_f32_e32 v5, v5 +; GFX9-NEXT: v_mad_f32 v4, -v5, v0, v4 +; GFX9-NEXT: v_cmp_ge_f32_e64 s[8:9], |v4|, |v0| +; GFX9-NEXT: v_cvt_i32_f32_e32 v5, v5 +; GFX9-NEXT: s_and_b64 s[8:9], s[8:9], exec +; GFX9-NEXT: s_cselect_b32 s7, s2, 0 +; GFX9-NEXT: s_and_b32 s2, s6, 0xffff +; GFX9-NEXT: v_add_u16_e64 v3, s6, 1 +; GFX9-NEXT: s_lshl_b64 s[8:9], s[2:3], 1 +; GFX9-NEXT: v_cmp_eq_u16_e32 vcc, s5, v3 +; GFX9-NEXT: s_add_u32 s8, s0, s8 +; GFX9-NEXT: v_readfirstlane_b32 s6, v3 +; GFX9-NEXT: v_add_u32_e32 v3, s7, v5 +; GFX9-NEXT: s_addc_u32 s9, s1, s9 +; GFX9-NEXT: global_store_short v2, v3, s[8:9] ; GFX9-NEXT: s_cbranch_vccz .LBB6_1 ; GFX9-NEXT: ; %bb.2: ; %bb2 ; GFX9-NEXT: s_endpgm @@ -902,34 +925,36 @@ ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: v_mov_b32_e32 v4, 0 +; GFX10-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-NEXT: s_mov_b32 s1, 0 +; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_sext_i32_i16 s4, s4 -; GFX10-NEXT: v_cvt_f32_i32_e32 v2, s4 -; GFX10-NEXT: v_rcp_iflag_f32_e32 v3, v2 +; GFX10-NEXT: v_cvt_f32_i32_e32 v0, s4 +; GFX10-NEXT: v_rcp_iflag_f32_e32 v1, v0 ; GFX10-NEXT: .LBB6_1: ; %bb3 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_bfe_i32 v5, v4, 0, 16 -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v4 -; GFX10-NEXT: v_add_nc_u16 v4, v4, 1 -; GFX10-NEXT: v_cvt_f32_i32_e32 v7, v5 -; GFX10-NEXT: v_xor_b32_e32 v8, s4, v5 -; GFX10-NEXT: v_lshlrev_b64 v[5:6], 1, v[0:1] -; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x400, v4 -; GFX10-NEXT: v_mul_f32_e32 v0, v7, v3 -; GFX10-NEXT: v_ashrrev_i32_e32 v8, 30, v8 -; GFX10-NEXT: v_add_co_u32 v5, s0, s2, v5 -; GFX10-NEXT: v_trunc_f32_e32 v0, v0 -; GFX10-NEXT: v_or_b32_e32 v8, 1, v8 -; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s0, s3, v6, s0 -; GFX10-NEXT: 
s_and_b32 vcc_lo, exec_lo, vcc_lo -; GFX10-NEXT: v_mad_f32 v7, -v0, v2, v7 -; GFX10-NEXT: v_cvt_i32_f32_e32 v0, v0 -; GFX10-NEXT: v_cmp_ge_f32_e64 s1, |v7|, |v2| -; GFX10-NEXT: v_cndmask_b32_e64 v7, 0, v8, s1 -; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v7 -; GFX10-NEXT: global_store_short v[5:6], v0, off +; GFX10-NEXT: s_sext_i32_i16 s0, s5 +; GFX10-NEXT: v_add_nc_u16 v3, s5, 1 +; GFX10-NEXT: v_cvt_f32_i32_e32 v4, s0 +; GFX10-NEXT: s_xor_b32 s0, s0, s4 +; GFX10-NEXT: s_ashr_i32 s0, s0, 30 +; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x400, v3 +; GFX10-NEXT: v_mul_f32_e32 v5, v4, v1 +; GFX10-NEXT: s_or_b32 s0, s0, 1 +; GFX10-NEXT: v_trunc_f32_e32 v5, v5 +; GFX10-NEXT: v_mad_f32 v4, -v5, v0, v4 +; GFX10-NEXT: v_cmp_ge_f32_e64 s6, |v4|, |v0| +; GFX10-NEXT: v_cvt_i32_f32_e32 v4, v5 +; GFX10-NEXT: s_and_b32 s6, s6, exec_lo +; GFX10-NEXT: s_cselect_b32 s6, s0, 0 +; GFX10-NEXT: s_and_b32 s0, s5, 0xffff +; GFX10-NEXT: v_readfirstlane_b32 s5, v3 +; GFX10-NEXT: v_add_nc_u32_e32 v3, s6, v4 +; GFX10-NEXT: s_lshl_b64 s[6:7], s[0:1], 1 +; GFX10-NEXT: s_add_u32 s6, s2, s6 +; GFX10-NEXT: s_addc_u32 s7, s3, s7 +; GFX10-NEXT: global_store_short v2, v3, s[6:7] ; GFX10-NEXT: s_cbranch_vccz .LBB6_1 ; GFX10-NEXT: ; %bb.2: ; %bb2 ; GFX10-NEXT: s_endpgm @@ -1002,38 +1027,40 @@ ; GFX9-LABEL: srem16_invariant_denom: ; GFX9: ; %bb.0: ; %bb ; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: s_movk_i32 s7, 0x400 -; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: s_mov_b32 s3, 0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_movk_i32 s5, 0x400 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_sext_i32_i16 s6, s2 -; GFX9-NEXT: v_cvt_f32_i32_e32 v2, s6 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v2 +; GFX9-NEXT: s_sext_i32_i16 s4, s2 +; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s4 +; GFX9-NEXT: s_mov_b32 s6, 0 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v0 ; GFX9-NEXT: .LBB7_1: ; %bb3 ; 
GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_bfe_i32 v7, v4, 0, 16 -; GFX9-NEXT: v_cvt_f32_i32_e32 v10, v7 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v4 -; GFX9-NEXT: v_xor_b32_e32 v9, s6, v7 -; GFX9-NEXT: v_lshlrev_b64 v[5:6], 1, v[0:1] -; GFX9-NEXT: v_ashrrev_i32_e32 v0, 30, v9 -; GFX9-NEXT: v_mul_f32_e32 v9, v10, v3 -; GFX9-NEXT: v_trunc_f32_e32 v9, v9 -; GFX9-NEXT: v_cvt_i32_f32_e32 v11, v9 -; GFX9-NEXT: v_mad_f32 v9, -v9, v2, v10 -; GFX9-NEXT: v_or_b32_e32 v0, 1, v0 -; GFX9-NEXT: v_cmp_ge_f32_e64 s[2:3], |v9|, |v2| -; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, v0, s[2:3] -; GFX9-NEXT: v_add_u32_e32 v0, v11, v0 -; GFX9-NEXT: v_mul_lo_u32 v0, v0, s6 -; GFX9-NEXT: v_add_u16_e32 v4, 1, v4 -; GFX9-NEXT: v_mov_b32_e32 v8, s5 -; GFX9-NEXT: v_cmp_eq_u16_e32 vcc, s7, v4 -; GFX9-NEXT: v_add_co_u32_e64 v5, s[0:1], s4, v5 -; GFX9-NEXT: v_addc_co_u32_e64 v6, s[0:1], v8, v6, s[0:1] -; GFX9-NEXT: v_sub_u32_e32 v0, v7, v0 -; GFX9-NEXT: global_store_short v[5:6], v0, off +; GFX9-NEXT: s_sext_i32_i16 s7, s6 +; GFX9-NEXT: v_cvt_f32_i32_e32 v4, s7 +; GFX9-NEXT: s_xor_b32 s2, s7, s4 +; GFX9-NEXT: s_ashr_i32 s2, s2, 30 +; GFX9-NEXT: s_or_b32 s2, s2, 1 +; GFX9-NEXT: v_mul_f32_e32 v5, v4, v1 +; GFX9-NEXT: v_trunc_f32_e32 v5, v5 +; GFX9-NEXT: v_mad_f32 v4, -v5, v0, v4 +; GFX9-NEXT: v_cvt_i32_f32_e32 v5, v5 +; GFX9-NEXT: v_cmp_ge_f32_e64 s[8:9], |v4|, |v0| +; GFX9-NEXT: s_and_b64 s[8:9], s[8:9], exec +; GFX9-NEXT: v_add_u16_e64 v3, s6, 1 +; GFX9-NEXT: s_cselect_b32 s8, s2, 0 +; GFX9-NEXT: v_cmp_eq_u16_e32 vcc, s5, v3 +; GFX9-NEXT: s_and_b32 s2, s6, 0xffff +; GFX9-NEXT: v_readfirstlane_b32 s6, v3 +; GFX9-NEXT: v_add_u32_e32 v3, s8, v5 +; GFX9-NEXT: v_mul_lo_u32 v3, v3, s4 +; GFX9-NEXT: s_lshl_b64 s[8:9], s[2:3], 1 +; GFX9-NEXT: s_add_u32 s8, s0, s8 +; GFX9-NEXT: s_addc_u32 s9, s1, s9 +; GFX9-NEXT: v_sub_u32_e32 v3, s7, v3 +; GFX9-NEXT: global_store_short v2, v3, s[8:9] ; GFX9-NEXT: s_cbranch_vccz .LBB7_1 ; GFX9-NEXT: ; %bb.2: ; %bb2 ; GFX9-NEXT: s_endpgm @@ -1043,35 +1070,38 @@ 
; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: v_mov_b32_e32 v4, 0 +; GFX10-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-NEXT: s_mov_b32 s1, 0 +; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_sext_i32_i16 s1, s4 -; GFX10-NEXT: v_cvt_f32_i32_e32 v2, s1 -; GFX10-NEXT: v_rcp_iflag_f32_e32 v3, v2 +; GFX10-NEXT: s_sext_i32_i16 s4, s4 +; GFX10-NEXT: v_cvt_f32_i32_e32 v0, s4 +; GFX10-NEXT: v_rcp_iflag_f32_e32 v1, v0 ; GFX10-NEXT: .LBB7_1: ; %bb3 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_bfe_i32 v7, v4, 0, 16 -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v4 -; GFX10-NEXT: v_add_nc_u16 v4, v4, 1 -; GFX10-NEXT: v_cvt_f32_i32_e32 v5, v7 -; GFX10-NEXT: v_xor_b32_e32 v6, s1, v7 -; GFX10-NEXT: v_mul_f32_e32 v8, v5, v3 -; GFX10-NEXT: v_ashrrev_i32_e32 v6, 30, v6 -; GFX10-NEXT: v_trunc_f32_e32 v8, v8 -; GFX10-NEXT: v_or_b32_e32 v6, 1, v6 -; GFX10-NEXT: v_mad_f32 v5, -v8, v2, v5 -; GFX10-NEXT: v_cvt_i32_f32_e32 v8, v8 -; GFX10-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v5|, |v2| -; GFX10-NEXT: v_cndmask_b32_e32 v9, 0, v6, vcc_lo -; GFX10-NEXT: v_lshlrev_b64 v[5:6], 1, v[0:1] -; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x400, v4 -; GFX10-NEXT: v_add_nc_u32_e32 v0, v8, v9 -; GFX10-NEXT: v_add_co_u32 v5, s0, s2, v5 -; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s0, s3, v6, s0 -; GFX10-NEXT: v_mul_lo_u32 v0, v0, s1 -; GFX10-NEXT: v_sub_nc_u32_e32 v0, v7, v0 -; GFX10-NEXT: global_store_short v[5:6], v0, off +; GFX10-NEXT: s_sext_i32_i16 s8, s5 +; GFX10-NEXT: v_add_nc_u16 v3, s5, 1 +; GFX10-NEXT: v_cvt_f32_i32_e32 v4, s8 +; GFX10-NEXT: s_xor_b32 s0, s8, s4 +; GFX10-NEXT: s_ashr_i32 s0, s0, 30 +; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x400, v3 +; GFX10-NEXT: v_mul_f32_e32 v5, v4, v1 +; GFX10-NEXT: s_or_b32 s0, s0, 1 +; GFX10-NEXT: v_trunc_f32_e32 v5, v5 +; GFX10-NEXT: v_mad_f32 v4, -v5, v0, v4 +; GFX10-NEXT: v_cmp_ge_f32_e64 s6, |v4|, |v0| +; 
GFX10-NEXT: v_cvt_i32_f32_e32 v4, v5 +; GFX10-NEXT: s_and_b32 s6, s6, exec_lo +; GFX10-NEXT: s_cselect_b32 s6, s0, 0 +; GFX10-NEXT: s_and_b32 s0, s5, 0xffff +; GFX10-NEXT: v_add_nc_u32_e32 v4, s6, v4 +; GFX10-NEXT: v_readfirstlane_b32 s5, v3 +; GFX10-NEXT: s_lshl_b64 s[6:7], s[0:1], 1 +; GFX10-NEXT: s_add_u32 s6, s2, s6 +; GFX10-NEXT: v_mul_lo_u32 v3, v4, s4 +; GFX10-NEXT: s_addc_u32 s7, s3, s7 +; GFX10-NEXT: v_sub_nc_u32_e32 v3, s8, v3 +; GFX10-NEXT: global_store_short v2, v3, s[6:7] ; GFX10-NEXT: s_cbranch_vccz .LBB7_1 ; GFX10-NEXT: ; %bb.2: ; %bb2 ; GFX10-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/multilevel-break.ll b/llvm/test/CodeGen/AMDGPU/multilevel-break.ll --- a/llvm/test/CodeGen/AMDGPU/multilevel-break.ll +++ b/llvm/test/CodeGen/AMDGPU/multilevel-break.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt -S -mtriple=amdgcn-- -structurizecfg -si-annotate-control-flow < %s | FileCheck -check-prefix=OPT %s ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s @@ -177,20 +178,21 @@ ; GCN-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN-NEXT: buffer_load_dword v1, off, s[0:3], 0 glc ; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_readfirstlane_b32 s8, v1 ; GCN-NEXT: s_mov_b64 s[4:5], -1 -; GCN-NEXT: v_cmp_gt_i32_e32 vcc, 1, v1 +; GCN-NEXT: s_cmp_lt_i32 s8, 1 ; GCN-NEXT: s_mov_b64 s[6:7], -1 -; GCN-NEXT: s_cbranch_vccnz .LBB1_6 +; GCN-NEXT: s_cbranch_scc1 .LBB1_6 ; GCN-NEXT: ; %bb.3: ; %LeafBlock1 ; GCN-NEXT: ; in Loop: Header=BB1_2 Depth=1 -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 +; GCN-NEXT: s_cmp_eq_u32 s8, 1 ; GCN-NEXT: s_mov_b64 s[4:5], -1 -; GCN-NEXT: s_cbranch_vccz .LBB1_5 +; GCN-NEXT: s_cbranch_scc0 .LBB1_5 ; GCN-NEXT: ; %bb.4: ; %case1 ; GCN-NEXT: ; in Loop: Header=BB1_2 Depth=1 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], 0 glc +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], 0 glc ; 
GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cmp_ge_i32_e32 vcc, v0, v2 +; GCN-NEXT: v_cmp_ge_i32_e32 vcc, v0, v1 ; GCN-NEXT: s_orn2_b64 s[4:5], vcc, exec ; GCN-NEXT: .LBB1_5: ; %Flow3 ; GCN-NEXT: ; in Loop: Header=BB1_2 Depth=1 @@ -201,8 +203,8 @@ ; GCN-NEXT: s_cbranch_vccz .LBB1_1 ; GCN-NEXT: ; %bb.7: ; %LeafBlock ; GCN-NEXT: ; in Loop: Header=BB1_2 Depth=1 -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GCN-NEXT: s_cbranch_vccz .LBB1_1 +; GCN-NEXT: s_cmp_eq_u32 s8, 0 +; GCN-NEXT: s_cbranch_scc0 .LBB1_1 ; GCN-NEXT: ; %bb.8: ; %case0 ; GCN-NEXT: ; in Loop: Header=BB1_2 Depth=1 ; GCN-NEXT: buffer_load_dword v1, off, s[0:3], 0 glc diff --git a/llvm/test/CodeGen/AMDGPU/s_add_co_pseudo_lowering.mir b/llvm/test/CodeGen/AMDGPU/s_add_co_pseudo_lowering.mir --- a/llvm/test/CodeGen/AMDGPU/s_add_co_pseudo_lowering.mir +++ b/llvm/test/CodeGen/AMDGPU/s_add_co_pseudo_lowering.mir @@ -14,27 +14,27 @@ ; GCN-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GCN-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; GCN-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr1 - ; GCN-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr2 - ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_32 = COPY [[COPY3]] - ; GCN-NEXT: [[V_MUL_LO_U32_e64_:%[0-9]+]]:vgpr_32 = V_MUL_LO_U32_e64 [[COPY]], [[COPY4]], implicit $exec - ; GCN-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 killed [[V_MUL_LO_U32_e64_]], [[COPY6]], 0, implicit $exec - ; GCN-NEXT: [[S_MUL_HI_U32_:%[0-9]+]]:sreg_32 = S_MUL_HI_U32 [[COPY4]], [[COPY5]] + ; GCN-NEXT: [[DEF1:%[0-9]+]]:sreg_32 = IMPLICIT_DEF + ; GCN-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; GCN-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GCN-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[COPY2]] + ; GCN-NEXT: [[V_MUL_LO_U32_e64_:%[0-9]+]]:vgpr_32 = V_MUL_LO_U32_e64 [[COPY]], [[COPY3]], implicit $exec + ; GCN-NEXT: 
[[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 killed [[V_MUL_LO_U32_e64_]], [[COPY5]], 0, implicit $exec + ; GCN-NEXT: [[S_MUL_HI_U32_:%[0-9]+]]:sreg_32 = S_MUL_HI_U32 [[COPY3]], [[COPY4]] ; GCN-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -614296167 - ; GCN-NEXT: [[V_MUL_LO_U32_e64_1:%[0-9]+]]:vgpr_32 = V_MUL_LO_U32_e64 [[COPY]], [[COPY3]], implicit $exec - ; GCN-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY killed [[S_MOV_B32_]] - ; GCN-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 killed [[V_MUL_LO_U32_e64_1]], [[COPY7]], [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GCN-NEXT: [[V_MUL_HI_U32_e64_:%[0-9]+]]:vgpr_32 = V_MUL_HI_U32_e64 [[COPY4]], [[V_ADDC_U32_e64_]], implicit $exec + ; GCN-NEXT: [[V_MUL_LO_U32_e64_1:%[0-9]+]]:vgpr_32 = V_MUL_LO_U32_e64 [[COPY]], [[COPY2]], implicit $exec + ; GCN-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY killed [[S_MOV_B32_]] + ; GCN-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 killed [[V_MUL_LO_U32_e64_1]], [[COPY6]], [[V_ADD_CO_U32_e64_1]], 0, implicit $exec + ; GCN-NEXT: [[V_MUL_HI_U32_e64_:%[0-9]+]]:vgpr_32 = V_MUL_HI_U32_e64 [[COPY3]], [[V_ADDC_U32_e64_]], implicit $exec ; GCN-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -181084736 ; GCN-NEXT: [[V_MUL_LO_U32_e64_2:%[0-9]+]]:vgpr_32 = V_MUL_LO_U32_e64 [[V_MUL_HI_U32_e64_]], [[S_MOV_B32_1]], implicit $exec - ; GCN-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY killed [[S_MOV_B32_1]] - ; GCN-NEXT: [[V_ADDC_U32_e64_2:%[0-9]+]]:vgpr_32, [[V_ADDC_U32_e64_3:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY8]], killed [[V_MUL_LO_U32_e64_2]], [[V_ADDC_U32_e64_1]], 0, implicit $exec + ; GCN-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY killed [[S_MOV_B32_1]] + ; GCN-NEXT: [[V_ADDC_U32_e64_2:%[0-9]+]]:vgpr_32, [[V_ADDC_U32_e64_3:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY7]], killed [[V_MUL_LO_U32_e64_2]], [[V_ADDC_U32_e64_1]], 0, 
implicit $exec %0:vgpr_32 = COPY $vgpr0 %6:sreg_32 = COPY %0 %1:vgpr_32 = COPY $vgpr1 %2:vgpr_32 = COPY $vgpr2 - %3:sreg_32 = COPY $sgpr0 + %3:sreg_32 = COPY %2 %4:sreg_32 = COPY $sgpr1 %5:sreg_32 = COPY $sgpr2 %20:vgpr_32 = COPY %3 diff --git a/llvm/test/CodeGen/AMDGPU/saddo.ll b/llvm/test/CodeGen/AMDGPU/saddo.ll --- a/llvm/test/CodeGen/AMDGPU/saddo.ll +++ b/llvm/test/CodeGen/AMDGPU/saddo.ll @@ -251,7 +251,7 @@ ; SI-NEXT: s_mov_b32 s4, s2 ; SI-NEXT: s_mov_b32 s5, s3 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v2, vcc, v1, v0 +; SI-NEXT: v_add_i32_e32 v2, vcc, v0, v1 ; SI-NEXT: v_cmp_gt_i32_e32 vcc, 0, v1 ; SI-NEXT: v_cmp_lt_i32_e64 s[0:1], v2, v0 ; SI-NEXT: s_xor_b64 s[0:1], vcc, s[0:1] @@ -275,7 +275,7 @@ ; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v6, vcc, v5, v4 +; VI-NEXT: v_add_u32_e32 v6, vcc, v4, v5 ; VI-NEXT: v_cmp_gt_i32_e32 vcc, 0, v5 ; VI-NEXT: v_cmp_lt_i32_e64 s[0:1], v6, v4 ; VI-NEXT: s_xor_b64 s[0:1], vcc, s[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll b/llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll --- a/llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll +++ b/llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll @@ -104,14 +104,17 @@ ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_and_b32_e32 v1, 0xff00, v0 -; SI-NEXT: v_lshrrev_b32_e32 v2, 8, v0 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_or_b32_e32 v0, v0, v2 +; SI-NEXT: v_readfirstlane_b32 s0, v0 +; SI-NEXT: s_lshl_b32 s1, s0, 8 +; SI-NEXT: s_or_b32 s0, s1, s0 +; SI-NEXT: s_and_b32 s1, s0, 0xff00 +; SI-NEXT: s_lshr_b32 s4, s0, 8 +; SI-NEXT: s_or_b32 s1, s4, s1 +; SI-NEXT: s_lshl_b32 s4, s1, 16 +; SI-NEXT: s_or_b32 s1, s1, s4 +; SI-NEXT: s_or_b32 s0, s0, s4 +; SI-NEXT: 
v_mov_b32_e32 v0, s0 +; SI-NEXT: v_mov_b32_e32 v1, s1 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; @@ -145,14 +148,17 @@ ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_and_b32_e32 v1, 0xff00, v0 -; SI-NEXT: v_lshrrev_b32_e32 v2, 8, v0 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_or_b32_e32 v0, v0, v2 +; SI-NEXT: v_readfirstlane_b32 s0, v0 +; SI-NEXT: s_lshl_b32 s1, s0, 8 +; SI-NEXT: s_or_b32 s0, s1, s0 +; SI-NEXT: s_and_b32 s1, s0, 0xff00 +; SI-NEXT: s_lshr_b32 s4, s0, 8 +; SI-NEXT: s_or_b32 s1, s4, s1 +; SI-NEXT: s_lshl_b32 s4, s1, 16 +; SI-NEXT: s_or_b32 s1, s1, s4 +; SI-NEXT: s_or_b32 s0, s0, s4 +; SI-NEXT: v_mov_b32_e32 v0, s0 +; SI-NEXT: v_mov_b32_e32 v1, s1 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/sdiv.ll b/llvm/test/CodeGen/AMDGPU/sdiv.ll --- a/llvm/test/CodeGen/AMDGPU/sdiv.ll +++ b/llvm/test/CodeGen/AMDGPU/sdiv.ll @@ -42,7 +42,7 @@ ; GCN-NEXT: v_xor_b32_e32 v2, v5, v2 ; GCN-NEXT: v_mul_lo_u32 v4, v4, v3 ; GCN-NEXT: v_mul_hi_u32 v4, v3, v4 -; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3 +; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v4 ; GCN-NEXT: v_mul_hi_u32 v3, v0, v3 ; GCN-NEXT: v_mul_lo_u32 v4, v3, v1 ; GCN-NEXT: v_add_i32_e32 v5, vcc, 1, v3 @@ -55,7 +55,7 @@ ; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v0, v1 ; GCN-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc ; GCN-NEXT: v_xor_b32_e32 v0, v0, v2 -; GCN-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 +; GCN-NEXT: v_subrev_i32_e32 v0, vcc, v2, v0 ; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GCN-NEXT: s_endpgm ; @@ -87,7 +87,7 @@ ; TONGA-NEXT: v_xor_b32_e32 v2, v5, v2 ; TONGA-NEXT: v_mul_lo_u32 v4, v4, v3 ; TONGA-NEXT: v_mul_hi_u32 v4, v3, v4 -; TONGA-NEXT: v_add_u32_e32 v3, vcc, v4, v3 +; TONGA-NEXT: 
v_add_u32_e32 v3, vcc, v3, v4 ; TONGA-NEXT: v_mul_hi_u32 v3, v0, v3 ; TONGA-NEXT: v_mul_lo_u32 v4, v3, v1 ; TONGA-NEXT: v_add_u32_e32 v5, vcc, 1, v3 @@ -100,7 +100,7 @@ ; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v0, v1 ; TONGA-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc ; TONGA-NEXT: v_xor_b32_e32 v0, v0, v2 -; TONGA-NEXT: v_sub_u32_e32 v0, vcc, v0, v2 +; TONGA-NEXT: v_subrev_u32_e32 v0, vcc, v2, v0 ; TONGA-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; TONGA-NEXT: s_endpgm ; @@ -214,7 +214,7 @@ ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GCN-NEXT: v_lshrrev_b32_e32 v1, 30, v1 -; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; GCN-NEXT: v_add_i32_e32 v0, vcc, v1, v0 ; GCN-NEXT: v_ashrrev_i32_e32 v0, 2, v0 ; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GCN-NEXT: s_endpgm @@ -235,7 +235,7 @@ ; TONGA-NEXT: s_waitcnt vmcnt(0) ; TONGA-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; TONGA-NEXT: v_lshrrev_b32_e32 v1, 30, v1 -; TONGA-NEXT: v_add_u32_e32 v0, vcc, v0, v1 +; TONGA-NEXT: v_add_u32_e32 v0, vcc, v1, v0 ; TONGA-NEXT: v_ashrrev_i32_e32 v0, 2, v0 ; TONGA-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; TONGA-NEXT: s_endpgm @@ -433,21 +433,21 @@ ; GCN-NEXT: v_xor_b32_e32 v1, v1, v6 ; GCN-NEXT: v_mul_hi_u32 v6, v7, v11 ; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v5 -; GCN-NEXT: v_add_i32_e32 v5, vcc, v6, v7 +; GCN-NEXT: v_add_i32_e32 v5, vcc, v7, v6 ; GCN-NEXT: v_mul_hi_u32 v4, v0, v4 ; GCN-NEXT: v_mul_hi_u32 v5, v1, v5 ; GCN-NEXT: v_mul_lo_u32 v6, v4, v2 ; GCN-NEXT: v_mul_lo_u32 v10, v5, v3 ; GCN-NEXT: v_add_i32_e32 v7, vcc, 1, v4 ; GCN-NEXT: v_sub_i32_e32 v0, vcc, v0, v6 -; GCN-NEXT: v_sub_i32_e32 v1, vcc, v1, v10 +; GCN-NEXT: v_subrev_i32_e32 v1, vcc, v10, v1 ; GCN-NEXT: v_add_i32_e32 v11, vcc, 1, v5 ; GCN-NEXT: v_cmp_ge_u32_e64 s[0:1], v0, v2 ; GCN-NEXT: v_cmp_ge_u32_e64 s[2:3], v1, v3 ; GCN-NEXT: v_cndmask_b32_e64 v4, v4, v7, s[0:1] -; GCN-NEXT: v_sub_i32_e32 v6, vcc, v0, v2 +; GCN-NEXT: v_subrev_i32_e32 v6, vcc, v2, v0 ; GCN-NEXT: v_cndmask_b32_e64 v5, v5, v11, 
s[2:3] -; GCN-NEXT: v_sub_i32_e32 v7, vcc, v1, v3 +; GCN-NEXT: v_subrev_i32_e32 v7, vcc, v3, v1 ; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v6, s[0:1] ; GCN-NEXT: v_add_i32_e32 v6, vcc, 1, v4 ; GCN-NEXT: v_cndmask_b32_e64 v1, v1, v7, s[2:3] @@ -459,7 +459,7 @@ ; GCN-NEXT: v_xor_b32_e32 v0, v0, v8 ; GCN-NEXT: v_xor_b32_e32 v1, v1, v9 ; GCN-NEXT: v_sub_i32_e32 v0, vcc, v0, v8 -; GCN-NEXT: v_sub_i32_e32 v1, vcc, v1, v9 +; GCN-NEXT: v_subrev_i32_e32 v1, vcc, v9, v1 ; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GCN-NEXT: s_endpgm ; @@ -506,7 +506,7 @@ ; TONGA-NEXT: v_xor_b32_e32 v1, v1, v6 ; TONGA-NEXT: v_mul_hi_u32 v6, v7, v11 ; TONGA-NEXT: v_add_u32_e32 v4, vcc, v4, v5 -; TONGA-NEXT: v_add_u32_e32 v5, vcc, v6, v7 +; TONGA-NEXT: v_add_u32_e32 v5, vcc, v7, v6 ; TONGA-NEXT: v_mul_hi_u32 v4, v0, v4 ; TONGA-NEXT: v_mul_hi_u32 v5, v1, v5 ; TONGA-NEXT: v_mul_lo_u32 v6, v4, v2 @@ -518,9 +518,9 @@ ; TONGA-NEXT: v_cmp_ge_u32_e64 s[0:1], v0, v2 ; TONGA-NEXT: v_cmp_ge_u32_e64 s[2:3], v1, v3 ; TONGA-NEXT: v_cndmask_b32_e64 v4, v4, v7, s[0:1] -; TONGA-NEXT: v_sub_u32_e32 v6, vcc, v0, v2 +; TONGA-NEXT: v_subrev_u32_e32 v6, vcc, v2, v0 ; TONGA-NEXT: v_cndmask_b32_e64 v5, v5, v11, s[2:3] -; TONGA-NEXT: v_sub_u32_e32 v7, vcc, v1, v3 +; TONGA-NEXT: v_subrev_u32_e32 v7, vcc, v3, v1 ; TONGA-NEXT: v_cndmask_b32_e64 v0, v0, v6, s[0:1] ; TONGA-NEXT: v_add_u32_e32 v6, vcc, 1, v4 ; TONGA-NEXT: v_cndmask_b32_e64 v1, v1, v7, s[2:3] @@ -853,7 +853,7 @@ ; GCN-NEXT: v_add_i32_e32 v11, vcc, 1, v8 ; GCN-NEXT: v_cmp_ge_u32_e64 s[0:1], v0, v4 ; GCN-NEXT: v_cndmask_b32_e64 v8, v8, v11, s[0:1] -; GCN-NEXT: v_sub_i32_e32 v11, vcc, v0, v4 +; GCN-NEXT: v_subrev_i32_e32 v11, vcc, v4, v0 ; GCN-NEXT: v_ashrrev_i32_e32 v14, 31, v7 ; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v11, s[0:1] ; GCN-NEXT: v_add_i32_e32 v7, vcc, v14, v7 @@ -865,27 +865,27 @@ ; GCN-NEXT: v_cvt_f32_u32_e32 v12, v4 ; GCN-NEXT: v_mul_hi_u32 v0, v10, v0 ; GCN-NEXT: v_add_i32_e32 v11, vcc, 1, v8 -; GCN-NEXT: v_add_i32_e32 v7, vcc, v7, v9 +; 
GCN-NEXT: v_add_i32_e32 v7, vcc, v9, v7 ; GCN-NEXT: v_mul_hi_u32 v7, v1, v7 ; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v10 ; GCN-NEXT: v_mul_hi_u32 v0, v2, v0 ; GCN-NEXT: v_mul_lo_u32 v10, v7, v5 ; GCN-NEXT: v_rcp_iflag_f32_e32 v12, v12 ; GCN-NEXT: v_sub_i32_e32 v9, vcc, 0, v4 -; GCN-NEXT: v_sub_i32_e32 v1, vcc, v1, v10 +; GCN-NEXT: v_subrev_i32_e32 v1, vcc, v10, v1 ; GCN-NEXT: v_mul_lo_u32 v10, v0, v6 ; GCN-NEXT: v_cmp_ge_u32_e64 s[2:3], v1, v5 ; GCN-NEXT: v_mul_f32_e32 v12, 0x4f7ffffe, v12 ; GCN-NEXT: v_cvt_u32_f32_e32 v12, v12 -; GCN-NEXT: v_sub_i32_e32 v2, vcc, v2, v10 +; GCN-NEXT: v_subrev_i32_e32 v2, vcc, v10, v2 ; GCN-NEXT: v_add_i32_e32 v10, vcc, 1, v7 ; GCN-NEXT: v_cndmask_b32_e64 v7, v7, v10, s[2:3] ; GCN-NEXT: v_add_i32_e32 v10, vcc, 1, v0 ; GCN-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v6 ; GCN-NEXT: v_cndmask_b32_e64 v10, v0, v10, s[4:5] -; GCN-NEXT: v_sub_i32_e32 v0, vcc, v1, v5 +; GCN-NEXT: v_subrev_i32_e32 v0, vcc, v5, v1 ; GCN-NEXT: v_cndmask_b32_e64 v0, v1, v0, s[2:3] -; GCN-NEXT: v_sub_i32_e32 v1, vcc, v2, v6 +; GCN-NEXT: v_subrev_i32_e32 v1, vcc, v6, v2 ; GCN-NEXT: v_cndmask_b32_e64 v2, v2, v1, s[4:5] ; GCN-NEXT: v_add_i32_e32 v1, vcc, 1, v7 ; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v0, v5 @@ -894,7 +894,7 @@ ; GCN-NEXT: v_xor_b32_e32 v1, v8, v15 ; GCN-NEXT: v_xor_b32_e32 v5, v0, v16 ; GCN-NEXT: v_sub_i32_e32 v0, vcc, v1, v15 -; GCN-NEXT: v_sub_i32_e32 v1, vcc, v5, v16 +; GCN-NEXT: v_subrev_i32_e32 v1, vcc, v16, v5 ; GCN-NEXT: v_mul_lo_u32 v5, v9, v12 ; GCN-NEXT: v_ashrrev_i32_e32 v8, 31, v3 ; GCN-NEXT: v_add_i32_e32 v3, vcc, v8, v3 @@ -907,19 +907,19 @@ ; GCN-NEXT: v_cndmask_b32_e32 v2, v10, v7, vcc ; GCN-NEXT: v_xor_b32_e32 v2, v2, v17 ; GCN-NEXT: v_mul_lo_u32 v6, v5, v4 -; GCN-NEXT: v_sub_i32_e32 v2, vcc, v2, v17 +; GCN-NEXT: v_subrev_i32_e32 v2, vcc, v17, v2 ; GCN-NEXT: v_xor_b32_e32 v7, v8, v14 -; GCN-NEXT: v_sub_i32_e32 v3, vcc, v3, v6 +; GCN-NEXT: v_subrev_i32_e32 v3, vcc, v6, v3 ; GCN-NEXT: v_add_i32_e32 v6, vcc, 1, v5 ; GCN-NEXT: v_cmp_ge_u32_e64 s[0:1], 
v3, v4 ; GCN-NEXT: v_cndmask_b32_e64 v5, v5, v6, s[0:1] -; GCN-NEXT: v_sub_i32_e32 v6, vcc, v3, v4 +; GCN-NEXT: v_subrev_i32_e32 v6, vcc, v4, v3 ; GCN-NEXT: v_cndmask_b32_e64 v3, v3, v6, s[0:1] ; GCN-NEXT: v_add_i32_e32 v6, vcc, 1, v5 ; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v3, v4 ; GCN-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc ; GCN-NEXT: v_xor_b32_e32 v3, v3, v7 -; GCN-NEXT: v_sub_i32_e32 v3, vcc, v3, v7 +; GCN-NEXT: v_subrev_i32_e32 v3, vcc, v7, v3 ; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 ; GCN-NEXT: s_endpgm ; @@ -984,7 +984,7 @@ ; TONGA-NEXT: v_add_u32_e32 v11, vcc, 1, v8 ; TONGA-NEXT: v_cmp_ge_u32_e64 s[0:1], v0, v4 ; TONGA-NEXT: v_cndmask_b32_e64 v8, v8, v11, s[0:1] -; TONGA-NEXT: v_sub_u32_e32 v11, vcc, v0, v4 +; TONGA-NEXT: v_subrev_u32_e32 v11, vcc, v4, v0 ; TONGA-NEXT: v_ashrrev_i32_e32 v14, 31, v7 ; TONGA-NEXT: v_cndmask_b32_e64 v0, v0, v11, s[0:1] ; TONGA-NEXT: v_add_u32_e32 v7, vcc, v14, v7 @@ -996,7 +996,7 @@ ; TONGA-NEXT: v_cvt_f32_u32_e32 v12, v4 ; TONGA-NEXT: v_mul_hi_u32 v0, v10, v0 ; TONGA-NEXT: v_add_u32_e32 v11, vcc, 1, v8 -; TONGA-NEXT: v_add_u32_e32 v7, vcc, v7, v9 +; TONGA-NEXT: v_add_u32_e32 v7, vcc, v9, v7 ; TONGA-NEXT: v_mul_hi_u32 v7, v1, v7 ; TONGA-NEXT: v_add_u32_e32 v0, vcc, v0, v10 ; TONGA-NEXT: v_mul_hi_u32 v0, v2, v0 @@ -1008,15 +1008,15 @@ ; TONGA-NEXT: v_cmp_ge_u32_e64 s[2:3], v1, v5 ; TONGA-NEXT: v_mul_f32_e32 v12, 0x4f7ffffe, v12 ; TONGA-NEXT: v_cvt_u32_f32_e32 v12, v12 -; TONGA-NEXT: v_sub_u32_e32 v2, vcc, v2, v10 +; TONGA-NEXT: v_subrev_u32_e32 v2, vcc, v10, v2 ; TONGA-NEXT: v_add_u32_e32 v10, vcc, 1, v7 ; TONGA-NEXT: v_cndmask_b32_e64 v7, v7, v10, s[2:3] ; TONGA-NEXT: v_add_u32_e32 v10, vcc, 1, v0 ; TONGA-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v6 ; TONGA-NEXT: v_cndmask_b32_e64 v10, v0, v10, s[4:5] -; TONGA-NEXT: v_sub_u32_e32 v0, vcc, v1, v5 +; TONGA-NEXT: v_subrev_u32_e32 v0, vcc, v5, v1 ; TONGA-NEXT: v_cndmask_b32_e64 v0, v1, v0, s[2:3] -; TONGA-NEXT: v_sub_u32_e32 v1, vcc, v2, v6 +; TONGA-NEXT: v_subrev_u32_e32 v1, 
vcc, v6, v2 ; TONGA-NEXT: v_cndmask_b32_e64 v2, v2, v1, s[4:5] ; TONGA-NEXT: v_add_u32_e32 v1, vcc, 1, v7 ; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v0, v5 @@ -1040,17 +1040,17 @@ ; TONGA-NEXT: v_mul_lo_u32 v6, v5, v4 ; TONGA-NEXT: v_subrev_u32_e32 v2, vcc, v17, v2 ; TONGA-NEXT: v_xor_b32_e32 v7, v8, v14 -; TONGA-NEXT: v_sub_u32_e32 v3, vcc, v3, v6 +; TONGA-NEXT: v_subrev_u32_e32 v3, vcc, v6, v3 ; TONGA-NEXT: v_add_u32_e32 v6, vcc, 1, v5 ; TONGA-NEXT: v_cmp_ge_u32_e64 s[0:1], v3, v4 ; TONGA-NEXT: v_cndmask_b32_e64 v5, v5, v6, s[0:1] -; TONGA-NEXT: v_sub_u32_e32 v6, vcc, v3, v4 +; TONGA-NEXT: v_subrev_u32_e32 v6, vcc, v4, v3 ; TONGA-NEXT: v_cndmask_b32_e64 v3, v3, v6, s[0:1] ; TONGA-NEXT: v_add_u32_e32 v6, vcc, 1, v5 ; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v3, v4 ; TONGA-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc ; TONGA-NEXT: v_xor_b32_e32 v3, v3, v7 -; TONGA-NEXT: v_subrev_u32_e32 v3, vcc, v7, v3 +; TONGA-NEXT: v_sub_u32_e32 v3, vcc, v3, v7 ; TONGA-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 ; TONGA-NEXT: s_endpgm ; @@ -1515,7 +1515,7 @@ ; TONGA-NEXT: v_cvt_i32_f32_e32 v1, v1 ; TONGA-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, |v2| ; TONGA-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; TONGA-NEXT: v_add_u32_e32 v0, vcc, v1, v0 +; TONGA-NEXT: v_add_u32_e32 v0, vcc, v0, v1 ; TONGA-NEXT: v_bfe_i32 v0, v0, 0, 8 ; TONGA-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; TONGA-NEXT: s_endpgm @@ -1680,7 +1680,7 @@ ; TONGA-NEXT: v_cvt_i32_f32_e32 v1, v1 ; TONGA-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, |v2| ; TONGA-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; TONGA-NEXT: v_add_u32_e32 v0, vcc, v1, v0 +; TONGA-NEXT: v_add_u32_e32 v0, vcc, v0, v1 ; TONGA-NEXT: v_bfe_i32 v0, v0, 0, 23 ; TONGA-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; TONGA-NEXT: s_endpgm @@ -1824,7 +1824,7 @@ ; GCN-NEXT: v_cvt_i32_f32_e32 v2, v2 ; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, |v1| ; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GCN-NEXT: v_add_i32_e32 v0, vcc, v2, v0 ; GCN-NEXT: v_bfe_i32 
v0, v0, 0, 24 ; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GCN-NEXT: s_endpgm @@ -1865,7 +1865,7 @@ ; TONGA-NEXT: v_cvt_i32_f32_e32 v2, v2 ; TONGA-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, |v1| ; TONGA-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; TONGA-NEXT: v_add_u32_e32 v0, vcc, v2, v0 +; TONGA-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; TONGA-NEXT: v_bfe_i32 v0, v0, 0, 24 ; TONGA-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; TONGA-NEXT: s_endpgm @@ -1993,21 +1993,21 @@ ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_bfe_i32 v2, v1, 0, 25 ; GCN-NEXT: v_bfe_i32 v1, v1, 24, 1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v1, v2 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v1 ; GCN-NEXT: v_xor_b32_e32 v2, v2, v1 ; GCN-NEXT: v_cvt_f32_u32_e32 v3, v2 ; GCN-NEXT: v_sub_i32_e32 v4, vcc, 0, v2 ; GCN-NEXT: v_bfe_i32 v5, v0, 0, 25 ; GCN-NEXT: v_rcp_iflag_f32_e32 v3, v3 ; GCN-NEXT: v_bfe_i32 v0, v0, 24, 1 -; GCN-NEXT: v_add_i32_e32 v5, vcc, v0, v5 +; GCN-NEXT: v_add_i32_e32 v5, vcc, v5, v0 ; GCN-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 ; GCN-NEXT: v_cvt_u32_f32_e32 v3, v3 ; GCN-NEXT: v_xor_b32_e32 v5, v5, v0 ; GCN-NEXT: v_xor_b32_e32 v0, v0, v1 ; GCN-NEXT: v_mul_lo_u32 v4, v4, v3 ; GCN-NEXT: v_mul_hi_u32 v4, v3, v4 -; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3 +; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v4 ; GCN-NEXT: v_mul_hi_u32 v3, v5, v3 ; GCN-NEXT: v_mul_lo_u32 v1, v3, v2 ; GCN-NEXT: v_add_i32_e32 v4, vcc, 1, v3 @@ -2020,7 +2020,7 @@ ; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v1, v2 ; GCN-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc ; GCN-NEXT: v_xor_b32_e32 v1, v1, v0 -; GCN-NEXT: v_sub_i32_e32 v0, vcc, v1, v0 +; GCN-NEXT: v_subrev_i32_e32 v0, vcc, v0, v1 ; GCN-NEXT: v_bfe_i32 v0, v0, 0, 25 ; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GCN-NEXT: s_endpgm @@ -2041,21 +2041,21 @@ ; TONGA-NEXT: s_waitcnt vmcnt(0) ; TONGA-NEXT: v_bfe_i32 v2, v1, 0, 25 ; TONGA-NEXT: v_bfe_i32 v1, v1, 24, 1 -; TONGA-NEXT: v_add_u32_e32 v2, vcc, v1, v2 +; TONGA-NEXT: v_add_u32_e32 v2, vcc, v2, v1 ; TONGA-NEXT: v_xor_b32_e32 v2, v2, v1 
; TONGA-NEXT: v_cvt_f32_u32_e32 v3, v2 ; TONGA-NEXT: v_sub_u32_e32 v4, vcc, 0, v2 ; TONGA-NEXT: v_bfe_i32 v5, v0, 0, 25 ; TONGA-NEXT: v_rcp_iflag_f32_e32 v3, v3 ; TONGA-NEXT: v_bfe_i32 v0, v0, 24, 1 -; TONGA-NEXT: v_add_u32_e32 v5, vcc, v0, v5 +; TONGA-NEXT: v_add_u32_e32 v5, vcc, v5, v0 ; TONGA-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 ; TONGA-NEXT: v_cvt_u32_f32_e32 v3, v3 ; TONGA-NEXT: v_xor_b32_e32 v5, v5, v0 ; TONGA-NEXT: v_xor_b32_e32 v0, v0, v1 ; TONGA-NEXT: v_mul_lo_u32 v4, v4, v3 ; TONGA-NEXT: v_mul_hi_u32 v4, v3, v4 -; TONGA-NEXT: v_add_u32_e32 v3, vcc, v4, v3 +; TONGA-NEXT: v_add_u32_e32 v3, vcc, v3, v4 ; TONGA-NEXT: v_mul_hi_u32 v3, v5, v3 ; TONGA-NEXT: v_mul_lo_u32 v1, v3, v2 ; TONGA-NEXT: v_add_u32_e32 v4, vcc, 1, v3 @@ -2232,7 +2232,7 @@ ; GCN-NEXT: v_ashrrev_i32_e32 v3, 12, v3 ; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v4 ; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v5 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v6 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v6, v2 ; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v7 ; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; GCN-NEXT: s_endpgm @@ -2262,10 +2262,10 @@ ; TONGA-NEXT: v_ashrrev_i32_e32 v2, 12, v2 ; TONGA-NEXT: v_lshrrev_b32_e32 v7, 31, v3 ; TONGA-NEXT: v_ashrrev_i32_e32 v3, 12, v3 -; TONGA-NEXT: v_add_u32_e32 v0, vcc, v0, v4 +; TONGA-NEXT: v_add_u32_e32 v0, vcc, v4, v0 ; TONGA-NEXT: v_add_u32_e32 v1, vcc, v1, v5 ; TONGA-NEXT: v_add_u32_e32 v2, vcc, v2, v6 -; TONGA-NEXT: v_add_u32_e32 v3, vcc, v3, v7 +; TONGA-NEXT: v_add_u32_e32 v3, vcc, v7, v3 ; TONGA-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; TONGA-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/sdiv64.ll b/llvm/test/CodeGen/AMDGPU/sdiv64.ll --- a/llvm/test/CodeGen/AMDGPU/sdiv64.ll +++ b/llvm/test/CodeGen/AMDGPU/sdiv64.ll @@ -36,8 +36,8 @@ ; GCN-NEXT: v_mul_lo_u32 v5, s5, v0 ; GCN-NEXT: v_mul_lo_u32 v4, s4, v0 ; GCN-NEXT: s_addc_u32 s3, s3, s12 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v5 +; GCN-NEXT: v_add_i32_e32 
v2, vcc, v2, v3 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v5, v2 ; GCN-NEXT: v_mul_hi_u32 v3, v0, v4 ; GCN-NEXT: v_mul_lo_u32 v5, v0, v2 ; GCN-NEXT: v_mul_hi_u32 v7, v0, v2 @@ -97,9 +97,9 @@ ; GCN-NEXT: v_mul_hi_u32 v3, s10, v0 ; GCN-NEXT: v_mul_lo_u32 v4, s11, v0 ; GCN-NEXT: v_mov_b32_e32 v5, s11 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; GCN-NEXT: v_mul_lo_u32 v3, s10, v0 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v4 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v4, v2 ; GCN-NEXT: v_sub_i32_e32 v4, vcc, s3, v2 ; GCN-NEXT: v_sub_i32_e32 v3, vcc, s2, v3 ; GCN-NEXT: v_subb_u32_e64 v4, s[0:1], v4, v5, vcc @@ -524,7 +524,7 @@ ; GCN-IR-NEXT: v_mov_b32_e32 v3, s4 ; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| ; GCN-IR-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc -; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v2, v0 ; GCN-IR-NEXT: v_bfe_i32 v0, v0, 0, 24 ; GCN-IR-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 @@ -628,7 +628,7 @@ ; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| ; GCN-IR-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc ; GCN-IR-NEXT: s_mov_b32 s5, s1 -; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v2, v0 ; GCN-IR-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GCN-IR-NEXT: s_endpgm @@ -664,7 +664,7 @@ ; GCN-NEXT: v_mov_b32_e32 v3, s4 ; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| ; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc -; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GCN-NEXT: v_add_i32_e32 v0, vcc, v2, v0 ; GCN-NEXT: v_bfe_i32 v0, v0, 0, 31 ; GCN-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 @@ -731,7 +731,7 @@ ; GCN-NEXT: v_mov_b32_e32 v3, s4 ; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| ; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc -; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GCN-NEXT: v_add_i32_e32 v0, vcc, v2, v0 ; GCN-NEXT: 
v_bfe_i32 v0, v0, 0, 23 ; GCN-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 @@ -798,7 +798,7 @@ ; GCN-NEXT: v_mov_b32_e32 v3, s4 ; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| ; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc -; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GCN-NEXT: v_add_i32_e32 v0, vcc, v2, v0 ; GCN-NEXT: v_bfe_i32 v0, v0, 0, 25 ; GCN-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 @@ -926,7 +926,7 @@ ; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, |v2| ; GCN-IR-NEXT: v_cndmask_b32_e32 v2, 0, v5, vcc ; GCN-IR-NEXT: v_bfe_i32 v0, v0, 0, 24 -; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, v2, v4 +; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, v4, v2 ; GCN-IR-NEXT: v_bfe_i32 v2, v2, 0, 24 ; GCN-IR-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GCN-IR-NEXT: v_ashrrev_i32_e32 v3, 31, v2 @@ -1115,8 +1115,8 @@ ; GCN-NEXT: v_mul_hi_u32 v3, s4, v0 ; GCN-NEXT: v_mul_lo_u32 v5, s5, v0 ; GCN-NEXT: v_mul_lo_u32 v4, s4, v0 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v5 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v3 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v5, v2 ; GCN-NEXT: v_mul_hi_u32 v3, v0, v4 ; GCN-NEXT: v_mul_lo_u32 v5, v0, v2 ; GCN-NEXT: v_mul_hi_u32 v7, v0, v2 @@ -1165,7 +1165,7 @@ ; GCN-NEXT: v_addc_u32_e32 v0, vcc, 0, v1, vcc ; GCN-NEXT: v_mul_lo_u32 v1, s3, v0 ; GCN-NEXT: v_mul_hi_u32 v2, s2, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, v2, v1 +; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v2 ; GCN-NEXT: v_mul_lo_u32 v2, s2, v0 ; GCN-NEXT: v_sub_i32_e32 v3, vcc, 0, v1 ; GCN-NEXT: v_sub_i32_e32 v2, vcc, 24, v2 @@ -1800,7 +1800,7 @@ ; GCN-NEXT: v_cvt_i32_f32_e32 v1, v1 ; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, |v0| ; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc -; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; GCN-NEXT: v_add_i32_e32 v0, vcc, v1, v0 ; GCN-NEXT: v_bfe_i32 v0, v0, 0, 24 ; GCN-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 @@ -1827,7 +1827,7 
@@ ; GCN-IR-NEXT: v_cvt_i32_f32_e32 v1, v1 ; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, |v0| ; GCN-IR-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc -; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v1, v0 ; GCN-IR-NEXT: v_bfe_i32 v0, v0, 0, 24 ; GCN-IR-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 @@ -1858,7 +1858,7 @@ ; GCN-NEXT: v_mov_b32_e32 v2, s0 ; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, s8 ; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc -; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; GCN-NEXT: v_add_i32_e32 v0, vcc, v1, v0 ; GCN-NEXT: v_bfe_i32 v0, v0, 0, 24 ; GCN-NEXT: s_mov_b32 s5, s1 ; GCN-NEXT: v_ashrrev_i32_e32 v1, 31, v0 diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll b/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll --- a/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll +++ b/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll @@ -487,7 +487,10 @@ ; GCN-LABEL: {{^}}add_bb_v2i16: ; NOSDWA-NOT: v_add_{{(_co)?}}_u32_sdwa -; VI: v_add_u32_sdwa v{{[0-9]+}}, vcc, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI: v_readfirstlane_b32 [[LO:s[0-9]+]] +; VI: v_readfirstlane_b32 [[HI:s[0-9]+]] +; VI: s_lshr_b32 [[LOSH:s[0-9]+]], [[LO]], 16 +; VI: s_lshr_b32 [[HISH:s[0-9]+]], [[HI]], 16 ; GFX9_10: v_pk_add_u16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} diff --git a/llvm/test/CodeGen/AMDGPU/shl.ll b/llvm/test/CodeGen/AMDGPU/shl.ll --- a/llvm/test/CodeGen/AMDGPU/shl.ll +++ b/llvm/test/CodeGen/AMDGPU/shl.ll @@ -83,9 +83,9 @@ ; SI-NEXT: s_mov_b32 s4, s0 ; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshl_b32_e32 v3, v3, v7 -; SI-NEXT: v_lshl_b32_e32 v2, v2, v6 -; SI-NEXT: v_lshl_b32_e32 v1, v1, v5 +; SI-NEXT: v_lshlrev_b32_e32 v3, v7, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, v6, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, v5, v1 ; SI-NEXT: v_lshl_b32_e32 v0, v0, v4 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; SI-NEXT: s_endpgm diff --git 
a/llvm/test/CodeGen/AMDGPU/sra.ll b/llvm/test/CodeGen/AMDGPU/sra.ll --- a/llvm/test/CodeGen/AMDGPU/sra.ll +++ b/llvm/test/CodeGen/AMDGPU/sra.ll @@ -85,9 +85,9 @@ ; SI-NEXT: s_mov_b32 s4, s0 ; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_ashr_i32_e32 v3, v3, v7 -; SI-NEXT: v_ashr_i32_e32 v2, v2, v6 -; SI-NEXT: v_ashr_i32_e32 v1, v1, v5 +; SI-NEXT: v_ashrrev_i32_e32 v3, v7, v3 +; SI-NEXT: v_ashrrev_i32_e32 v2, v6, v2 +; SI-NEXT: v_ashrrev_i32_e32 v1, v5, v1 ; SI-NEXT: v_ashr_i32_e32 v0, v0, v4 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; SI-NEXT: s_endpgm @@ -159,14 +159,17 @@ ; SI-NEXT: s_mov_b32 s4, s0 ; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_bfe_i32 v2, v0, 0, 16 -; SI-NEXT: v_ashrrev_i32_e32 v0, 16, v0 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; SI-NEXT: v_ashrrev_i32_e32 v0, v3, v0 -; SI-NEXT: v_ashrrev_i32_e32 v1, v1, v2 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_readfirstlane_b32 s0, v0 +; SI-NEXT: v_readfirstlane_b32 s1, v1 +; SI-NEXT: s_sext_i32_i16 s2, s0 +; SI-NEXT: s_ashr_i32 s0, s0, 16 +; SI-NEXT: s_lshr_b32 s3, s1, 16 +; SI-NEXT: s_ashr_i32 s0, s0, s3 +; SI-NEXT: s_ashr_i32 s1, s2, s1 +; SI-NEXT: s_lshl_b32 s0, s0, 16 +; SI-NEXT: s_and_b32 s1, s1, 0xffff +; SI-NEXT: s_or_b32 s0, s1, s0 +; SI-NEXT: v_mov_b32_e32 v0, s0 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; @@ -184,9 +187,18 @@ ; VI-NEXT: s_mov_b32 s4, s0 ; VI-NEXT: s_mov_b32 s5, s1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_ashrrev_i32_sdwa v2, sext(v1), sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 -; VI-NEXT: v_ashrrev_i32_sdwa v0, sext(v1), sext(v0) dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_readfirstlane_b32 s0, v0 +; VI-NEXT: 
v_readfirstlane_b32 s1, v1 +; VI-NEXT: s_ashr_i32 s2, s0, 16 +; VI-NEXT: s_sext_i32_i16 s0, s0 +; VI-NEXT: s_ashr_i32 s3, s1, 16 +; VI-NEXT: s_sext_i32_i16 s1, s1 +; VI-NEXT: s_ashr_i32 s0, s0, s1 +; VI-NEXT: s_ashr_i32 s1, s2, s3 +; VI-NEXT: s_lshl_b32 s1, s1, 16 +; VI-NEXT: s_and_b32 s0, s0, 0xffff +; VI-NEXT: s_or_b32 s0, s0, s1 +; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; @@ -243,22 +255,28 @@ ; SI-NEXT: s_mov_b32 s4, s0 ; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_bfe_i32 v4, v0, 0, 16 -; SI-NEXT: v_ashrrev_i32_e32 v0, 16, v0 -; SI-NEXT: v_bfe_i32 v5, v1, 0, 16 -; SI-NEXT: v_ashrrev_i32_e32 v1, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v3 -; SI-NEXT: v_ashr_i32_e32 v1, v1, v7 -; SI-NEXT: v_ashr_i32_e32 v3, v5, v3 -; SI-NEXT: v_ashr_i32_e32 v0, v0, v6 -; SI-NEXT: v_ashr_i32_e32 v2, v4, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_or_b32_e32 v0, v2, v0 +; SI-NEXT: v_readfirstlane_b32 s0, v3 +; SI-NEXT: v_readfirstlane_b32 s1, v2 +; SI-NEXT: v_readfirstlane_b32 s2, v1 +; SI-NEXT: v_readfirstlane_b32 s3, v0 +; SI-NEXT: s_sext_i32_i16 s8, s3 +; SI-NEXT: s_ashr_i32 s3, s3, 16 +; SI-NEXT: s_sext_i32_i16 s9, s2 +; SI-NEXT: s_ashr_i32 s2, s2, 16 +; SI-NEXT: s_lshr_b32 s10, s1, 16 +; SI-NEXT: s_lshr_b32 s11, s0, 16 +; SI-NEXT: s_ashr_i32 s2, s2, s11 +; SI-NEXT: s_ashr_i32 s0, s9, s0 +; SI-NEXT: s_ashr_i32 s3, s3, s10 +; SI-NEXT: s_ashr_i32 s1, s8, s1 +; SI-NEXT: s_lshl_b32 s2, s2, 16 +; SI-NEXT: s_and_b32 s0, s0, 0xffff +; SI-NEXT: s_lshl_b32 s3, s3, 16 +; SI-NEXT: s_and_b32 s1, s1, 0xffff +; SI-NEXT: s_or_b32 s0, s0, s2 +; SI-NEXT: s_or_b32 s1, s1, s3 +; SI-NEXT: v_mov_b32_e32 v0, s1 +; SI-NEXT: v_mov_b32_e32 v1, s0 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; 
SI-NEXT: s_endpgm ; @@ -276,12 +294,30 @@ ; VI-NEXT: s_mov_b32 s4, s0 ; VI-NEXT: s_mov_b32 s5, s1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_ashrrev_i32_sdwa v4, sext(v2), sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 -; VI-NEXT: v_ashrrev_i32_sdwa v0, sext(v2), sext(v0) dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NEXT: v_ashrrev_i32_sdwa v2, sext(v3), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 -; VI-NEXT: v_ashrrev_i32_sdwa v1, sext(v3), sext(v1) dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_readfirstlane_b32 s0, v2 +; VI-NEXT: v_readfirstlane_b32 s1, v3 +; VI-NEXT: v_readfirstlane_b32 s2, v0 +; VI-NEXT: v_readfirstlane_b32 s3, v1 +; VI-NEXT: s_ashr_i32 s8, s3, 16 +; VI-NEXT: s_sext_i32_i16 s3, s3 +; VI-NEXT: s_ashr_i32 s9, s2, 16 +; VI-NEXT: s_sext_i32_i16 s2, s2 +; VI-NEXT: s_ashr_i32 s10, s1, 16 +; VI-NEXT: s_sext_i32_i16 s1, s1 +; VI-NEXT: s_ashr_i32 s11, s0, 16 +; VI-NEXT: s_sext_i32_i16 s0, s0 +; VI-NEXT: s_ashr_i32 s0, s2, s0 +; VI-NEXT: s_ashr_i32 s2, s9, s11 +; VI-NEXT: s_ashr_i32 s1, s3, s1 +; VI-NEXT: s_ashr_i32 s3, s8, s10 +; VI-NEXT: s_lshl_b32 s3, s3, 16 +; VI-NEXT: s_and_b32 s1, s1, 0xffff +; VI-NEXT: s_lshl_b32 s2, s2, 16 +; VI-NEXT: s_and_b32 s0, s0, 0xffff +; VI-NEXT: s_or_b32 s1, s1, s3 +; VI-NEXT: s_or_b32 s0, s0, s2 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; VI-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/srem64.ll b/llvm/test/CodeGen/AMDGPU/srem64.ll --- a/llvm/test/CodeGen/AMDGPU/srem64.ll +++ b/llvm/test/CodeGen/AMDGPU/srem64.ll @@ -28,8 +28,8 @@ ; GCN-NEXT: v_mul_hi_u32 v3, s0, v0 ; GCN-NEXT: v_mul_lo_u32 v5, s1, v0 
; GCN-NEXT: v_mul_lo_u32 v4, s0, v0 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v5 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v3 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v5, v2 ; GCN-NEXT: v_mul_hi_u32 v3, v0, v4 ; GCN-NEXT: v_mul_lo_u32 v5, v0, v2 ; GCN-NEXT: v_mul_hi_u32 v7, v0, v2 @@ -86,8 +86,8 @@ ; GCN-NEXT: v_mul_hi_u32 v2, s12, v0 ; GCN-NEXT: v_mul_lo_u32 v3, s13, v0 ; GCN-NEXT: v_mul_lo_u32 v0, s12, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, v2, v1 -; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v3 +; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v2 +; GCN-NEXT: v_add_i32_e32 v1, vcc, v3, v1 ; GCN-NEXT: v_sub_i32_e32 v2, vcc, s11, v1 ; GCN-NEXT: v_mov_b32_e32 v3, s13 ; GCN-NEXT: v_sub_i32_e32 v0, vcc, s10, v0 @@ -202,8 +202,8 @@ ; GCN-IR-NEXT: v_mul_lo_u32 v3, s5, v0 ; GCN-IR-NEXT: v_mul_lo_u32 v0, s4, v0 ; GCN-IR-NEXT: s_mov_b32 s11, 0xf000 -; GCN-IR-NEXT: v_add_i32_e32 v1, vcc, v2, v1 -; GCN-IR-NEXT: v_add_i32_e32 v1, vcc, v1, v3 +; GCN-IR-NEXT: v_add_i32_e32 v1, vcc, v1, v2 +; GCN-IR-NEXT: v_add_i32_e32 v1, vcc, v3, v1 ; GCN-IR-NEXT: v_mov_b32_e32 v2, s3 ; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, s2, v0 ; GCN-IR-NEXT: s_mov_b32 s10, -1 @@ -669,7 +669,7 @@ ; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| ; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc ; GCN-NEXT: s_mov_b32 s5, s1 -; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GCN-NEXT: v_add_i32_e32 v0, vcc, v2, v0 ; GCN-NEXT: v_mul_lo_u32 v0, v0, s4 ; GCN-NEXT: s_mov_b32 s4, s0 ; GCN-NEXT: v_sub_i32_e32 v0, vcc, s2, v0 @@ -701,7 +701,7 @@ ; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| ; GCN-IR-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc ; GCN-IR-NEXT: s_mov_b32 s5, s1 -; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v2, v0 ; GCN-IR-NEXT: v_mul_lo_u32 v0, v0, s4 ; GCN-IR-NEXT: s_mov_b32 s4, s0 ; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, s2, v0 @@ -740,7 +740,7 @@ ; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| ; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc ; GCN-NEXT: s_mov_b32 s5, s1 -; 
GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GCN-NEXT: v_add_i32_e32 v0, vcc, v2, v0 ; GCN-NEXT: v_mul_lo_u32 v0, v0, s4 ; GCN-NEXT: s_mov_b32 s4, s0 ; GCN-NEXT: v_sub_i32_e32 v0, vcc, s2, v0 @@ -772,7 +772,7 @@ ; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| ; GCN-IR-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc ; GCN-IR-NEXT: s_mov_b32 s5, s1 -; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v2, v0 ; GCN-IR-NEXT: v_mul_lo_u32 v0, v0, s4 ; GCN-IR-NEXT: s_mov_b32 s4, s0 ; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, s2, v0 @@ -810,7 +810,7 @@ ; GCN-NEXT: v_cvt_i32_f32_e32 v2, v2 ; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| ; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc -; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GCN-NEXT: v_add_i32_e32 v0, vcc, v2, v0 ; GCN-NEXT: v_mul_lo_u32 v0, v0, s4 ; GCN-NEXT: s_mov_b32 s4, s0 ; GCN-NEXT: v_sub_i32_e32 v0, vcc, s3, v0 @@ -839,7 +839,7 @@ ; GCN-IR-NEXT: v_cvt_i32_f32_e32 v2, v2 ; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| ; GCN-IR-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc -; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v2, v0 ; GCN-IR-NEXT: v_mul_lo_u32 v0, v0, s4 ; GCN-IR-NEXT: s_mov_b32 s4, s0 ; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, s3, v0 @@ -889,8 +889,8 @@ ; GCN-NEXT: v_mul_hi_u32 v3, s0, v0 ; GCN-NEXT: v_mul_lo_u32 v5, s1, v0 ; GCN-NEXT: v_mul_lo_u32 v4, s0, v0 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v5 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v3 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v5, v2 ; GCN-NEXT: v_mul_hi_u32 v3, v0, v4 ; GCN-NEXT: v_mul_lo_u32 v5, v0, v2 ; GCN-NEXT: v_mul_hi_u32 v6, v0, v2 @@ -950,8 +950,8 @@ ; GCN-NEXT: v_mul_hi_u32 v2, s12, v0 ; GCN-NEXT: v_mul_lo_u32 v3, s13, v0 ; GCN-NEXT: v_mul_lo_u32 v0, s12, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, v2, v1 -; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v3 +; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v2 +; GCN-NEXT: v_add_i32_e32 v1, vcc, v3, v1 ; GCN-NEXT: v_sub_i32_e32 v2, vcc, 
s15, v1 ; GCN-NEXT: v_mov_b32_e32 v3, s13 ; GCN-NEXT: v_sub_i32_e32 v0, vcc, s14, v0 @@ -1078,13 +1078,15 @@ ; GCN-IR-NEXT: v_mov_b32_e32 v0, s2 ; GCN-IR-NEXT: v_cndmask_b32_e64 v0, v0, 0, s[14:15] ; GCN-IR-NEXT: .LBB8_6: ; %udiv-end -; GCN-IR-NEXT: v_mul_lo_u32 v1, s8, v1 ; GCN-IR-NEXT: v_mul_hi_u32 v2, s8, v0 -; GCN-IR-NEXT: v_mul_lo_u32 v3, s9, v0 -; GCN-IR-NEXT: v_mul_lo_u32 v0, s8, v0 +; GCN-IR-NEXT: v_mul_lo_u32 v1, s8, v1 ; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 -; GCN-IR-NEXT: v_add_i32_e32 v1, vcc, v2, v1 -; GCN-IR-NEXT: v_add_i32_e32 v1, vcc, v1, v3 +; GCN-IR-NEXT: s_mov_b32 s6, -1 +; GCN-IR-NEXT: v_readfirstlane_b32 s10, v2 +; GCN-IR-NEXT: v_mul_lo_u32 v2, s9, v0 +; GCN-IR-NEXT: v_mul_lo_u32 v0, s8, v0 +; GCN-IR-NEXT: v_add_i32_e32 v1, vcc, s10, v1 +; GCN-IR-NEXT: v_add_i32_e32 v1, vcc, v1, v2 ; GCN-IR-NEXT: v_mov_b32_e32 v2, s3 ; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, s2, v0 ; GCN-IR-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc @@ -1092,7 +1094,6 @@ ; GCN-IR-NEXT: v_xor_b32_e32 v1, s1, v1 ; GCN-IR-NEXT: v_mov_b32_e32 v2, s1 ; GCN-IR-NEXT: v_subrev_i32_e32 v0, vcc, s0, v0 -; GCN-IR-NEXT: s_mov_b32 s6, -1 ; GCN-IR-NEXT: v_subb_u32_e32 v1, vcc, v1, v2, vcc ; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GCN-IR-NEXT: s_endpgm @@ -1239,8 +1240,8 @@ ; GCN-IR-NEXT: v_mul_hi_u32 v2, s6, v0 ; GCN-IR-NEXT: v_mul_lo_u32 v3, s7, v0 ; GCN-IR-NEXT: v_mul_lo_u32 v0, s6, v0 -; GCN-IR-NEXT: v_add_i32_e32 v1, vcc, v2, v1 -; GCN-IR-NEXT: v_add_i32_e32 v1, vcc, v1, v3 +; GCN-IR-NEXT: v_add_i32_e32 v1, vcc, v1, v2 +; GCN-IR-NEXT: v_add_i32_e32 v1, vcc, v3, v1 ; GCN-IR-NEXT: v_mov_b32_e32 v2, s5 ; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 ; GCN-IR-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc @@ -1291,8 +1292,8 @@ ; GCN-NEXT: v_mul_hi_u32 v3, s2, v0 ; GCN-NEXT: v_mul_lo_u32 v5, s3, v0 ; GCN-NEXT: v_mul_lo_u32 v4, s2, v0 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v5 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v3 +; GCN-NEXT: v_add_i32_e32 
v2, vcc, v5, v2 ; GCN-NEXT: v_mul_hi_u32 v3, v0, v4 ; GCN-NEXT: v_mul_lo_u32 v5, v0, v2 ; GCN-NEXT: v_mul_hi_u32 v7, v0, v2 @@ -1340,7 +1341,7 @@ ; GCN-NEXT: v_mul_lo_u32 v1, s9, v0 ; GCN-NEXT: v_mul_hi_u32 v2, s8, v0 ; GCN-NEXT: v_mul_lo_u32 v0, s8, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, v2, v1 +; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v2 ; GCN-NEXT: v_sub_i32_e32 v2, vcc, 0, v1 ; GCN-NEXT: v_sub_i32_e32 v0, vcc, 24, v0 ; GCN-NEXT: v_subb_u32_e64 v2, s[0:1], v2, v3, vcc @@ -1447,8 +1448,8 @@ ; GCN-IR-NEXT: v_mul_lo_u32 v3, s5, v0 ; GCN-IR-NEXT: v_mul_lo_u32 v0, s4, v0 ; GCN-IR-NEXT: s_mov_b32 s3, 0xf000 -; GCN-IR-NEXT: v_add_i32_e32 v1, vcc, v2, v1 -; GCN-IR-NEXT: v_add_i32_e32 v1, vcc, v1, v3 +; GCN-IR-NEXT: v_add_i32_e32 v1, vcc, v1, v2 +; GCN-IR-NEXT: v_add_i32_e32 v1, vcc, v3, v1 ; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, 24, v0 ; GCN-IR-NEXT: s_mov_b32 s2, -1 ; GCN-IR-NEXT: v_subb_u32_e32 v1, vcc, 0, v1, vcc @@ -1970,7 +1971,7 @@ ; GCN-NEXT: v_cvt_i32_f32_e32 v1, v1 ; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, |v0| ; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc -; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; GCN-NEXT: v_add_i32_e32 v0, vcc, v1, v0 ; GCN-NEXT: v_mul_lo_u32 v0, v0, s4 ; GCN-NEXT: v_sub_i32_e32 v0, vcc, 24, v0 ; GCN-NEXT: v_bfe_i32 v0, v0, 0, 24 @@ -2030,7 +2031,7 @@ ; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, s4 ; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc ; GCN-NEXT: s_movk_i32 s3, 0x5b7f -; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GCN-NEXT: v_add_i32_e32 v0, vcc, v2, v0 ; GCN-NEXT: v_mul_lo_u32 v0, v0, s3 ; GCN-NEXT: s_mov_b32 s4, s0 ; GCN-NEXT: s_mov_b32 s5, s1 @@ -2059,7 +2060,7 @@ ; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, s4 ; GCN-IR-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc ; GCN-IR-NEXT: s_movk_i32 s3, 0x5b7f -; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v2, v0 ; GCN-IR-NEXT: v_mul_lo_u32 v0, v0, s3 ; GCN-IR-NEXT: s_mov_b32 s4, s0 ; GCN-IR-NEXT: s_mov_b32 s5, s1 diff --git a/llvm/test/CodeGen/AMDGPU/srl.ll 
b/llvm/test/CodeGen/AMDGPU/srl.ll --- a/llvm/test/CodeGen/AMDGPU/srl.ll +++ b/llvm/test/CodeGen/AMDGPU/srl.ll @@ -138,9 +138,9 @@ ; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshr_b32_e32 v3, v3, v7 -; SI-NEXT: v_lshr_b32_e32 v2, v2, v6 -; SI-NEXT: v_lshr_b32_e32 v1, v1, v5 -; SI-NEXT: v_lshr_b32_e32 v0, v0, v4 +; SI-NEXT: v_lshrrev_b32_e32 v2, v6, v2 +; SI-NEXT: v_lshrrev_b32_e32 v1, v5, v1 +; SI-NEXT: v_lshrrev_b32_e32 v0, v4, v0 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/sub.ll b/llvm/test/CodeGen/AMDGPU/sub.ll --- a/llvm/test/CodeGen/AMDGPU/sub.ll +++ b/llvm/test/CodeGen/AMDGPU/sub.ll @@ -47,7 +47,7 @@ ; GCN-LABEL: {{^}}test_sub_v2i32: ; SI: v_sub_i32_e32 v{{[0-9]+, vcc, v[0-9]+, v[0-9]+}} -; SI: v_sub_i32_e32 v{{[0-9]+, vcc, v[0-9]+, v[0-9]+}} +; SI: v_subrev_i32_e32 v{{[0-9]+, vcc, v[0-9]+, v[0-9]+}} ; GFX9: v_sub_u32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} ; GFX9: v_sub_u32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} @@ -61,7 +61,7 @@ } ; GCN-LABEL: {{^}}test_sub_v4i32: -; SI: v_sub_i32_e32 v{{[0-9]+, vcc, v[0-9]+, v[0-9]+}} +; SI: v_subrev_i32_e32 v{{[0-9]+, vcc, v[0-9]+, v[0-9]+}} ; SI: v_sub_i32_e32 v{{[0-9]+, vcc, v[0-9]+, v[0-9]+}} ; SI: v_sub_i32_e32 v{{[0-9]+, vcc, v[0-9]+, v[0-9]+}} ; SI: v_sub_i32_e32 v{{[0-9]+, vcc, v[0-9]+, v[0-9]+}} diff --git a/llvm/test/CodeGen/AMDGPU/udiv.ll b/llvm/test/CodeGen/AMDGPU/udiv.ll --- a/llvm/test/CodeGen/AMDGPU/udiv.ll +++ b/llvm/test/CodeGen/AMDGPU/udiv.ll @@ -27,7 +27,7 @@ ; SI-NEXT: v_cvt_u32_f32_e32 v2, v2 ; SI-NEXT: v_mul_lo_u32 v3, v3, v2 ; SI-NEXT: v_mul_hi_u32 v3, v2, v3 -; SI-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; SI-NEXT: v_mul_hi_u32 v2, v0, v2 ; SI-NEXT: v_mul_lo_u32 v3, v2, v1 ; SI-NEXT: v_add_i32_e32 v4, vcc, 1, v2 @@ -63,7 +63,7 @@ ; VI-NEXT: v_cvt_u32_f32_e32 v2, v2 ; VI-NEXT: v_mul_lo_u32 v3, v3, v2 ; VI-NEXT: v_mul_hi_u32 v3, v2, v3 -; VI-NEXT: v_add_u32_e32 v2, vcc, v3, v2 +; 
VI-NEXT: v_add_u32_e32 v2, vcc, v2, v3 ; VI-NEXT: v_mul_hi_u32 v2, v0, v2 ; VI-NEXT: v_mul_lo_u32 v3, v2, v1 ; VI-NEXT: v_add_u32_e32 v4, vcc, 1, v2 @@ -93,7 +93,7 @@ ; GCN-NEXT: v_cvt_u32_f32_e32 v2, v2 ; GCN-NEXT: v_mul_lo_u32 v3, v3, v2 ; GCN-NEXT: v_mul_hi_u32 v3, v2, v3 -; GCN-NEXT: v_add_u32_e32 v2, vcc, v3, v2 +; GCN-NEXT: v_add_u32_e32 v2, vcc, v2, v3 ; GCN-NEXT: v_mul_hi_u32 v4, v0, v2 ; GCN-NEXT: v_mov_b32_e32 v2, s0 ; GCN-NEXT: v_mov_b32_e32 v3, s1 @@ -193,7 +193,7 @@ ; SI-NEXT: v_mul_lo_u32 v1, s4, v0 ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SI-NEXT: v_mul_hi_u32 v1, v0, v1 -; SI-NEXT: v_add_i32_e32 v0, vcc, v1, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; SI-NEXT: v_mul_hi_u32 v0, s2, v0 ; SI-NEXT: v_mul_lo_u32 v1, v0, s3 ; SI-NEXT: v_add_i32_e32 v2, vcc, 1, v0 @@ -271,25 +271,29 @@ ; GFX1030-LABEL: s_udiv_i32: ; GFX1030: ; %bb.0: ; GFX1030-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX1030-NEXT: v_mov_b32_e32 v3, 0 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030-NEXT: v_cvt_f32_u32_e32 v0, s1 -; GFX1030-NEXT: s_sub_i32 s2, 0, s1 +; GFX1030-NEXT: s_sub_i32 s3, 0, s1 ; GFX1030-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX1030-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX1030-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX1030-NEXT: v_mul_lo_u32 v1, s2, v0 +; GFX1030-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1030-NEXT: s_mul_i32 s3, s3, s2 +; GFX1030-NEXT: s_mul_hi_u32 s3, s2, s3 +; GFX1030-NEXT: s_add_i32 s2, s2, s3 +; GFX1030-NEXT: s_mul_hi_u32 s6, s0, s2 +; GFX1030-NEXT: s_mul_i32 s2, s6, s1 +; GFX1030-NEXT: s_sub_i32 s0, s0, s2 ; GFX1030-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX1030-NEXT: v_mul_hi_u32 v1, v0, v1 -; GFX1030-NEXT: v_add_nc_u32_e32 v0, v0, v1 -; GFX1030-NEXT: v_mul_hi_u32 v0, s0, v0 -; GFX1030-NEXT: v_mul_lo_u32 v1, v0, s1 -; GFX1030-NEXT: v_add_nc_u32_e32 v2, 1, v0 -; GFX1030-NEXT: v_sub_nc_u32_e32 v1, s0, v1 -; GFX1030-NEXT: v_subrev_nc_u32_e32 v3, s1, v1 -; GFX1030-NEXT: v_cmp_le_u32_e32 vcc_lo, s1, v1 -; GFX1030-NEXT: v_cndmask_b32_e32 
v0, v0, v2, vcc_lo -; GFX1030-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo -; GFX1030-NEXT: v_mov_b32_e32 v3, 0 +; GFX1030-NEXT: s_cmp_ge_u32 s0, s1 +; GFX1030-NEXT: s_cselect_b32 vcc_lo, -1, 0 +; GFX1030-NEXT: s_add_i32 s7, s6, 1 +; GFX1030-NEXT: s_sub_i32 s4, s0, s1 +; GFX1030-NEXT: v_mov_b32_e32 v0, s7 +; GFX1030-NEXT: v_mov_b32_e32 v1, s4 +; GFX1030-NEXT: v_cndmask_b32_e32 v0, s6, v0, vcc_lo +; GFX1030-NEXT: v_cndmask_b32_e32 v1, s0, v1, vcc_lo ; GFX1030-NEXT: v_add_nc_u32_e32 v2, 1, v0 ; GFX1030-NEXT: v_cmp_le_u32_e32 vcc_lo, s1, v1 ; GFX1030-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo @@ -368,7 +372,7 @@ ; SI-NEXT: v_mul_lo_u32 v8, v5, v3 ; SI-NEXT: v_add_i32_e32 v7, vcc, 1, v4 ; SI-NEXT: v_subrev_i32_e32 v0, vcc, v6, v0 -; SI-NEXT: v_subrev_i32_e32 v1, vcc, v8, v1 +; SI-NEXT: v_sub_i32_e32 v1, vcc, v1, v8 ; SI-NEXT: v_add_i32_e32 v9, vcc, 1, v5 ; SI-NEXT: v_cmp_ge_u32_e64 s[0:1], v0, v2 ; SI-NEXT: v_cmp_ge_u32_e64 s[2:3], v1, v3 @@ -422,13 +426,13 @@ ; VI-NEXT: v_mul_lo_u32 v6, v4, v2 ; VI-NEXT: v_mul_lo_u32 v8, v5, v3 ; VI-NEXT: v_add_u32_e32 v7, vcc, 1, v4 -; VI-NEXT: v_subrev_u32_e32 v0, vcc, v6, v0 +; VI-NEXT: v_sub_u32_e32 v0, vcc, v0, v6 ; VI-NEXT: v_subrev_u32_e32 v1, vcc, v8, v1 ; VI-NEXT: v_add_u32_e32 v9, vcc, 1, v5 ; VI-NEXT: v_cmp_ge_u32_e64 s[0:1], v0, v2 ; VI-NEXT: v_cmp_ge_u32_e64 s[2:3], v1, v3 ; VI-NEXT: v_cndmask_b32_e64 v4, v4, v7, s[0:1] -; VI-NEXT: v_subrev_u32_e32 v6, vcc, v2, v0 +; VI-NEXT: v_sub_u32_e32 v6, vcc, v0, v2 ; VI-NEXT: v_cndmask_b32_e64 v5, v5, v9, s[2:3] ; VI-NEXT: v_subrev_u32_e32 v7, vcc, v3, v1 ; VI-NEXT: v_cndmask_b32_e64 v0, v0, v6, s[0:1] @@ -473,13 +477,13 @@ ; GCN-NEXT: v_mul_lo_u32 v8, v6, v2 ; GCN-NEXT: v_add_u32_e32 v9, vcc, 1, v6 ; GCN-NEXT: v_mul_lo_u32 v10, v7, v3 -; GCN-NEXT: v_subrev_u32_e32 v0, vcc, v8, v0 +; GCN-NEXT: v_sub_u32_e32 v0, vcc, v0, v8 ; GCN-NEXT: v_add_u32_e32 v11, vcc, 1, v7 ; GCN-NEXT: v_subrev_u32_e32 v1, vcc, v10, v1 ; GCN-NEXT: v_cmp_ge_u32_e64 s[0:1], v0, v2 ; GCN-NEXT: v_cmp_ge_u32_e64 
s[2:3], v1, v3 ; GCN-NEXT: v_cndmask_b32_e64 v6, v6, v9, s[0:1] -; GCN-NEXT: v_subrev_u32_e32 v8, vcc, v2, v0 +; GCN-NEXT: v_sub_u32_e32 v8, vcc, v0, v2 ; GCN-NEXT: v_cndmask_b32_e64 v7, v7, v11, s[2:3] ; GCN-NEXT: v_subrev_u32_e32 v9, vcc, v3, v1 ; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v8, s[0:1] @@ -654,9 +658,9 @@ ; SI-NEXT: v_mul_lo_u32 v16, v10, v2 ; SI-NEXT: v_mul_lo_u32 v18, v11, v3 ; SI-NEXT: v_subrev_i32_e32 v4, vcc, v12, v4 -; SI-NEXT: v_subrev_i32_e32 v5, vcc, v14, v5 -; SI-NEXT: v_subrev_i32_e32 v6, vcc, v16, v6 -; SI-NEXT: v_subrev_i32_e32 v7, vcc, v18, v7 +; SI-NEXT: v_sub_i32_e32 v5, vcc, v5, v14 +; SI-NEXT: v_sub_i32_e32 v6, vcc, v6, v16 +; SI-NEXT: v_sub_i32_e32 v7, vcc, v7, v18 ; SI-NEXT: v_add_i32_e32 v13, vcc, 1, v8 ; SI-NEXT: v_add_i32_e32 v15, vcc, 1, v9 ; SI-NEXT: v_add_i32_e32 v17, vcc, 1, v10 @@ -670,9 +674,9 @@ ; SI-NEXT: v_cndmask_b32_e64 v9, v9, v15, s[2:3] ; SI-NEXT: v_subrev_i32_e32 v13, vcc, v1, v5 ; SI-NEXT: v_cndmask_b32_e64 v10, v10, v17, s[4:5] -; SI-NEXT: v_subrev_i32_e32 v14, vcc, v2, v6 +; SI-NEXT: v_sub_i32_e32 v14, vcc, v6, v2 ; SI-NEXT: v_cndmask_b32_e64 v11, v11, v19, s[6:7] -; SI-NEXT: v_subrev_i32_e32 v15, vcc, v3, v7 +; SI-NEXT: v_sub_i32_e32 v15, vcc, v7, v3 ; SI-NEXT: v_cndmask_b32_e64 v4, v4, v12, s[0:1] ; SI-NEXT: v_add_i32_e32 v12, vcc, 1, v8 ; SI-NEXT: v_cndmask_b32_e64 v5, v5, v13, s[2:3] @@ -736,8 +740,8 @@ ; VI-NEXT: v_mul_hi_u32 v13, v12, v13 ; VI-NEXT: v_mul_hi_u32 v15, v14, v15 ; VI-NEXT: v_add_u32_e32 v8, vcc, v9, v8 -; VI-NEXT: v_add_u32_e32 v9, vcc, v11, v10 -; VI-NEXT: v_add_u32_e32 v10, vcc, v13, v12 +; VI-NEXT: v_add_u32_e32 v9, vcc, v10, v11 +; VI-NEXT: v_add_u32_e32 v10, vcc, v12, v13 ; VI-NEXT: v_add_u32_e32 v11, vcc, v15, v14 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mul_hi_u32 v8, v4, v8 @@ -748,10 +752,10 @@ ; VI-NEXT: v_mul_lo_u32 v14, v9, v1 ; VI-NEXT: v_mul_lo_u32 v16, v10, v2 ; VI-NEXT: v_mul_lo_u32 v18, v11, v3 -; VI-NEXT: v_subrev_u32_e32 v4, vcc, v12, v4 +; VI-NEXT: v_sub_u32_e32 v4, vcc, 
v4, v12 ; VI-NEXT: v_subrev_u32_e32 v5, vcc, v14, v5 ; VI-NEXT: v_subrev_u32_e32 v6, vcc, v16, v6 -; VI-NEXT: v_subrev_u32_e32 v7, vcc, v18, v7 +; VI-NEXT: v_sub_u32_e32 v7, vcc, v7, v18 ; VI-NEXT: v_add_u32_e32 v13, vcc, 1, v8 ; VI-NEXT: v_add_u32_e32 v15, vcc, 1, v9 ; VI-NEXT: v_add_u32_e32 v17, vcc, 1, v10 @@ -831,8 +835,8 @@ ; GCN-NEXT: v_mul_hi_u32 v15, v14, v15 ; GCN-NEXT: v_mul_hi_u32 v17, v16, v17 ; GCN-NEXT: v_add_u32_e32 v10, vcc, v11, v10 -; GCN-NEXT: v_add_u32_e32 v11, vcc, v13, v12 -; GCN-NEXT: v_add_u32_e32 v12, vcc, v15, v14 +; GCN-NEXT: v_add_u32_e32 v11, vcc, v12, v13 +; GCN-NEXT: v_add_u32_e32 v12, vcc, v14, v15 ; GCN-NEXT: v_add_u32_e32 v13, vcc, v17, v16 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_mul_hi_u32 v10, v4, v10 @@ -843,10 +847,10 @@ ; GCN-NEXT: v_mul_lo_u32 v16, v11, v1 ; GCN-NEXT: v_mul_lo_u32 v18, v12, v2 ; GCN-NEXT: v_mul_lo_u32 v19, v13, v3 -; GCN-NEXT: v_subrev_u32_e32 v4, vcc, v14, v4 +; GCN-NEXT: v_sub_u32_e32 v4, vcc, v4, v14 ; GCN-NEXT: v_subrev_u32_e32 v5, vcc, v16, v5 ; GCN-NEXT: v_subrev_u32_e32 v6, vcc, v18, v6 -; GCN-NEXT: v_subrev_u32_e32 v7, vcc, v19, v7 +; GCN-NEXT: v_sub_u32_e32 v7, vcc, v7, v19 ; GCN-NEXT: v_add_u32_e32 v15, vcc, 1, v10 ; GCN-NEXT: v_add_u32_e32 v17, vcc, 1, v11 ; GCN-NEXT: v_add_u32_e32 v14, vcc, 1, v12 @@ -1854,10 +1858,10 @@ ; SI-NEXT: v_mul_hi_u32 v1, v2, v1 ; SI-NEXT: v_mul_lo_u32 v3, v1, v0 ; SI-NEXT: v_add_i32_e32 v4, vcc, 1, v1 -; SI-NEXT: v_subrev_i32_e32 v2, vcc, v3, v2 +; SI-NEXT: v_sub_i32_e32 v2, vcc, v2, v3 ; SI-NEXT: v_cmp_ge_u32_e64 s[0:1], v2, v0 ; SI-NEXT: v_cndmask_b32_e64 v1, v1, v4, s[0:1] -; SI-NEXT: v_subrev_i32_e32 v3, vcc, v0, v2 +; SI-NEXT: v_sub_i32_e32 v3, vcc, v2, v0 ; SI-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1] ; SI-NEXT: v_add_i32_e32 v3, vcc, 1, v1 ; SI-NEXT: v_cmp_ge_u32_e32 vcc, v2, v0 @@ -1897,14 +1901,14 @@ ; VI-NEXT: v_cvt_u32_f32_e32 v1, v1 ; VI-NEXT: v_mul_lo_u32 v4, v4, v1 ; VI-NEXT: v_mul_hi_u32 v4, v1, v4 -; VI-NEXT: v_add_u32_e32 v1, vcc, v4, v1 +; 
VI-NEXT: v_add_u32_e32 v1, vcc, v1, v4 ; VI-NEXT: v_mul_hi_u32 v1, v2, v1 ; VI-NEXT: v_mul_lo_u32 v3, v1, v0 ; VI-NEXT: v_add_u32_e32 v4, vcc, 1, v1 -; VI-NEXT: v_subrev_u32_e32 v2, vcc, v3, v2 +; VI-NEXT: v_sub_u32_e32 v2, vcc, v2, v3 ; VI-NEXT: v_cmp_ge_u32_e64 s[0:1], v2, v0 ; VI-NEXT: v_cndmask_b32_e64 v1, v1, v4, s[0:1] -; VI-NEXT: v_subrev_u32_e32 v3, vcc, v0, v2 +; VI-NEXT: v_sub_u32_e32 v3, vcc, v2, v0 ; VI-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1] ; VI-NEXT: v_add_u32_e32 v3, vcc, 1, v1 ; VI-NEXT: v_cmp_ge_u32_e32 vcc, v2, v0 @@ -1950,16 +1954,16 @@ ; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GCN-NEXT: v_mul_lo_u32 v4, v4, v1 ; GCN-NEXT: v_mul_hi_u32 v4, v1, v4 -; GCN-NEXT: v_add_u32_e32 v0, vcc, v4, v1 +; GCN-NEXT: v_add_u32_e32 v0, vcc, v1, v4 ; GCN-NEXT: v_mul_hi_u32 v4, v2, v0 ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: v_mov_b32_e32 v1, s1 ; GCN-NEXT: v_mul_lo_u32 v5, v4, v3 ; GCN-NEXT: v_add_u32_e32 v6, vcc, 1, v4 -; GCN-NEXT: v_subrev_u32_e32 v2, vcc, v5, v2 +; GCN-NEXT: v_sub_u32_e32 v2, vcc, v2, v5 ; GCN-NEXT: v_cmp_ge_u32_e64 s[0:1], v2, v3 ; GCN-NEXT: v_cndmask_b32_e64 v4, v4, v6, s[0:1] -; GCN-NEXT: v_subrev_u32_e32 v5, vcc, v3, v2 +; GCN-NEXT: v_sub_u32_e32 v5, vcc, v2, v3 ; GCN-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[0:1] ; GCN-NEXT: v_add_u32_e32 v5, vcc, 1, v4 ; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v2, v3 @@ -2351,7 +2355,7 @@ ; SI-NEXT: v_cvt_i32_f32_e32 v1, v1 ; SI-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, |v2| ; SI-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; SI-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, v1, v0 ; SI-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; @@ -2377,7 +2381,7 @@ ; VI-NEXT: v_cvt_i32_f32_e32 v1, v1 ; VI-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, |v2| ; VI-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; VI-NEXT: v_add_u32_e32 v0, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v1 ; VI-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; @@ -2401,7 +2405,7 @@ ; GCN-NEXT: v_cvt_i32_f32_e32 v3, 
v3 ; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v5|, |v4| ; GCN-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc -; GCN-NEXT: v_add_u32_e32 v2, vcc, v3, v2 +; GCN-NEXT: v_add_u32_e32 v2, vcc, v2, v3 ; GCN-NEXT: flat_store_byte v[0:1], v2 ; GCN-NEXT: s_endpgm ; @@ -2597,7 +2601,7 @@ ; VI-NEXT: v_cvt_u32_f32_e32 v7, v3 ; VI-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, s6, 0 ; VI-NEXT: v_mul_lo_u32 v4, v7, s6 -; VI-NEXT: v_subrev_u32_e32 v3, vcc, v6, v3 +; VI-NEXT: v_sub_u32_e32 v3, vcc, v3, v6 ; VI-NEXT: v_add_u32_e32 v8, vcc, v4, v3 ; VI-NEXT: v_mul_hi_u32 v5, v6, v2 ; VI-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v8, 0 @@ -2684,7 +2688,7 @@ ; GCN-NEXT: v_cvt_u32_f32_e32 v7, v3 ; GCN-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, s6, 0 ; GCN-NEXT: v_mul_lo_u32 v4, v7, s6 -; GCN-NEXT: v_subrev_u32_e32 v3, vcc, v6, v3 +; GCN-NEXT: v_sub_u32_e32 v3, vcc, v3, v6 ; GCN-NEXT: v_add_u32_e32 v8, vcc, v4, v3 ; GCN-NEXT: v_mul_hi_u32 v5, v6, v2 ; GCN-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v8, 0 @@ -2765,32 +2769,34 @@ ; GFX1030-NEXT: s_addc_u32 s5, 0, 0 ; GFX1030-NEXT: v_add_co_u32 v2, s4, 0xa9000000, s4 ; GFX1030-NEXT: s_cmpk_lg_u32 s4, 0x0 -; GFX1030-NEXT: s_addc_u32 s4, s5, 0xa7c5 -; GFX1030-NEXT: v_mul_hi_u32 v3, 0xfffe7960, v2 -; GFX1030-NEXT: v_mul_lo_u32 v4, 0xfffe7960, v2 -; GFX1030-NEXT: s_mul_i32 s5, s4, 0xfffe7960 -; GFX1030-NEXT: v_sub_nc_u32_e32 v3, v3, v2 -; GFX1030-NEXT: v_mul_hi_u32 v5, v2, v4 -; GFX1030-NEXT: v_mul_hi_u32 v8, s4, v4 -; GFX1030-NEXT: v_mul_lo_u32 v4, s4, v4 -; GFX1030-NEXT: v_add_nc_u32_e32 v3, s5, v3 -; GFX1030-NEXT: v_mul_lo_u32 v6, v2, v3 -; GFX1030-NEXT: v_mul_hi_u32 v7, v2, v3 -; GFX1030-NEXT: v_mul_hi_u32 v9, s4, v3 -; GFX1030-NEXT: v_mul_lo_u32 v3, s4, v3 -; GFX1030-NEXT: v_add_co_u32 v5, vcc_lo, v5, v6 -; GFX1030-NEXT: v_add_co_ci_u32_e32 v6, vcc_lo, 0, v7, vcc_lo -; GFX1030-NEXT: v_add_co_u32 v4, vcc_lo, v5, v4 -; GFX1030-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v6, v8, vcc_lo -; GFX1030-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v9, vcc_lo -; GFX1030-NEXT: v_add_co_u32 
v3, vcc_lo, v4, v3 -; GFX1030-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v5, vcc_lo -; GFX1030-NEXT: v_add_co_u32 v5, vcc_lo, v2, v3 -; GFX1030-NEXT: v_add_co_ci_u32_e32 v6, vcc_lo, s4, v4, vcc_lo -; GFX1030-NEXT: v_mul_hi_u32 v8, v0, v5 -; GFX1030-NEXT: v_mad_u64_u32 v[4:5], null, v1, v5, 0 -; GFX1030-NEXT: v_mad_u64_u32 v[2:3], null, v0, v6, 0 -; GFX1030-NEXT: v_mad_u64_u32 v[6:7], null, v1, v6, 0 +; GFX1030-NEXT: s_addc_u32 s5, s5, 0xa7c5 +; GFX1030-NEXT: v_readfirstlane_b32 s4, v2 +; GFX1030-NEXT: s_mul_i32 s6, s5, 0xfffe7960 +; GFX1030-NEXT: s_mul_hi_u32 s7, s4, 0xfffe7960 +; GFX1030-NEXT: s_mul_i32 s8, s4, 0xfffe7960 +; GFX1030-NEXT: s_sub_i32 s7, s7, s4 +; GFX1030-NEXT: s_mul_hi_u32 s9, s4, s8 +; GFX1030-NEXT: s_add_i32 s7, s7, s6 +; GFX1030-NEXT: s_mul_hi_u32 s10, s5, s8 +; GFX1030-NEXT: s_mul_i32 s6, s5, s8 +; GFX1030-NEXT: s_mul_hi_u32 s8, s4, s7 +; GFX1030-NEXT: s_mul_i32 s4, s4, s7 +; GFX1030-NEXT: s_mul_hi_u32 s11, s5, s7 +; GFX1030-NEXT: s_add_u32 s4, s9, s4 +; GFX1030-NEXT: s_addc_u32 s8, 0, s8 +; GFX1030-NEXT: s_add_u32 s4, s4, s6 +; GFX1030-NEXT: s_mul_i32 s7, s5, s7 +; GFX1030-NEXT: s_addc_u32 s4, s8, s10 +; GFX1030-NEXT: s_addc_u32 s6, s11, 0 +; GFX1030-NEXT: s_add_u32 s4, s4, s7 +; GFX1030-NEXT: s_addc_u32 s6, 0, s6 +; GFX1030-NEXT: v_add_co_u32 v4, s4, v2, s4 +; GFX1030-NEXT: s_cmpk_lg_u32 s4, 0x0 +; GFX1030-NEXT: s_addc_u32 s4, s5, s6 +; GFX1030-NEXT: v_mul_hi_u32 v8, v0, v4 +; GFX1030-NEXT: v_mad_u64_u32 v[2:3], null, v0, s4, 0 +; GFX1030-NEXT: v_mad_u64_u32 v[4:5], null, v1, v4, 0 +; GFX1030-NEXT: v_mad_u64_u32 v[6:7], null, v1, s4, 0 ; GFX1030-NEXT: v_add_co_u32 v2, vcc_lo, v8, v2 ; GFX1030-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo ; GFX1030-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4 diff --git a/llvm/test/CodeGen/AMDGPU/udiv64.ll b/llvm/test/CodeGen/AMDGPU/udiv64.ll --- a/llvm/test/CodeGen/AMDGPU/udiv64.ll +++ b/llvm/test/CodeGen/AMDGPU/udiv64.ll @@ -26,8 +26,8 @@ ; GCN-NEXT: v_mul_hi_u32 v3, s4, v0 ; GCN-NEXT: v_mul_lo_u32 v5, s5, v0 ; 
GCN-NEXT: v_mul_lo_u32 v4, s4, v0 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v5 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v3 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v5, v2 ; GCN-NEXT: v_mul_hi_u32 v3, v0, v4 ; GCN-NEXT: v_mul_lo_u32 v5, v0, v2 ; GCN-NEXT: v_mul_hi_u32 v7, v0, v2 @@ -86,9 +86,9 @@ ; GCN-NEXT: v_mul_hi_u32 v3, s8, v0 ; GCN-NEXT: v_mul_lo_u32 v4, s9, v0 ; GCN-NEXT: v_mov_b32_e32 v5, s9 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; GCN-NEXT: v_mul_lo_u32 v3, s8, v0 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v4 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v4, v2 ; GCN-NEXT: v_sub_i32_e32 v4, vcc, s3, v2 ; GCN-NEXT: v_sub_i32_e32 v3, vcc, s2, v3 ; GCN-NEXT: v_subb_u32_e64 v4, s[0:1], v4, v5, vcc @@ -704,8 +704,8 @@ ; GCN-NEXT: v_mul_hi_u32 v4, s0, v1 ; GCN-NEXT: v_mul_lo_u32 v5, s1, v1 ; GCN-NEXT: v_mul_lo_u32 v6, s0, v1 -; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v5 +; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v4 +; GCN-NEXT: v_add_i32_e32 v3, vcc, v5, v3 ; GCN-NEXT: v_mul_lo_u32 v4, v1, v3 ; GCN-NEXT: v_mul_hi_u32 v5, v1, v6 ; GCN-NEXT: v_mul_hi_u32 v7, v1, v3 @@ -725,9 +725,9 @@ ; GCN-NEXT: v_mul_lo_u32 v3, s0, v2 ; GCN-NEXT: v_mul_hi_u32 v4, s0, v1 ; GCN-NEXT: v_mul_lo_u32 v5, s1, v1 -; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3 +; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v4 ; GCN-NEXT: v_mul_lo_u32 v4, s0, v1 -; GCN-NEXT: v_add_i32_e32 v3, vcc, v5, v3 +; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v5 ; GCN-NEXT: v_mul_lo_u32 v7, v1, v3 ; GCN-NEXT: v_mul_hi_u32 v8, v1, v4 ; GCN-NEXT: v_mul_hi_u32 v9, v1, v3 @@ -908,8 +908,8 @@ ; GCN-NEXT: v_mul_hi_u32 v3, s4, v0 ; GCN-NEXT: v_mul_lo_u32 v5, s5, v0 ; GCN-NEXT: v_mul_lo_u32 v4, s4, v0 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v5 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v3 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v5, v2 ; GCN-NEXT: v_mul_hi_u32 v3, v0, v4 ; GCN-NEXT: v_mul_lo_u32 v5, 
v0, v2 ; GCN-NEXT: v_mul_hi_u32 v7, v0, v2 @@ -958,7 +958,7 @@ ; GCN-NEXT: v_addc_u32_e32 v0, vcc, 0, v1, vcc ; GCN-NEXT: v_mul_lo_u32 v1, s3, v0 ; GCN-NEXT: v_mul_hi_u32 v2, s2, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, v2, v1 +; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v2 ; GCN-NEXT: v_mul_lo_u32 v2, s2, v0 ; GCN-NEXT: v_sub_i32_e32 v3, vcc, 0, v1 ; GCN-NEXT: v_sub_i32_e32 v2, vcc, 24, v2 @@ -1356,8 +1356,8 @@ ; GCN-NEXT: v_mul_hi_u32 v2, v0, s4 ; GCN-NEXT: v_mul_lo_u32 v4, v1, s4 ; GCN-NEXT: v_mul_lo_u32 v3, v0, s4 -; GCN-NEXT: v_subrev_i32_e32 v2, vcc, v0, v2 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v4, v2 +; GCN-NEXT: v_sub_i32_e32 v2, vcc, v2, v0 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; GCN-NEXT: v_mul_hi_u32 v5, v0, v3 ; GCN-NEXT: v_mul_lo_u32 v4, v0, v2 ; GCN-NEXT: v_mul_hi_u32 v6, v0, v2 @@ -1378,8 +1378,8 @@ ; GCN-NEXT: v_mul_lo_u32 v3, v1, s4 ; GCN-NEXT: v_mul_lo_u32 v4, v0, s4 ; GCN-NEXT: s_mov_b32 s4, s0 -; GCN-NEXT: v_subrev_i32_e32 v2, vcc, v0, v2 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v3 +; GCN-NEXT: v_sub_i32_e32 v2, vcc, v2, v0 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GCN-NEXT: v_mul_lo_u32 v3, v0, v2 ; GCN-NEXT: v_mul_hi_u32 v5, v0, v4 ; GCN-NEXT: v_mul_hi_u32 v6, v0, v2 @@ -1531,8 +1531,8 @@ ; GCN-NEXT: v_mul_hi_u32 v4, v2, s4 ; GCN-NEXT: v_mul_lo_u32 v6, v3, s4 ; GCN-NEXT: v_mul_lo_u32 v5, v2, s4 -; GCN-NEXT: v_subrev_i32_e32 v4, vcc, v2, v4 -; GCN-NEXT: v_add_i32_e32 v4, vcc, v6, v4 +; GCN-NEXT: v_sub_i32_e32 v4, vcc, v4, v2 +; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v6 ; GCN-NEXT: v_mul_hi_u32 v7, v2, v5 ; GCN-NEXT: v_mul_lo_u32 v6, v2, v4 ; GCN-NEXT: v_mul_hi_u32 v8, v2, v4 @@ -1552,7 +1552,7 @@ ; GCN-NEXT: v_mul_hi_u32 v4, v2, s4 ; GCN-NEXT: v_mul_lo_u32 v5, v3, s4 ; GCN-NEXT: v_mul_lo_u32 v6, v2, s4 -; GCN-NEXT: v_subrev_i32_e32 v4, vcc, v2, v4 +; GCN-NEXT: v_sub_i32_e32 v4, vcc, v4, v2 ; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v5 ; GCN-NEXT: v_mul_lo_u32 v5, v2, v4 ; GCN-NEXT: v_mul_hi_u32 v7, v2, v6 diff --git 
a/llvm/test/CodeGen/AMDGPU/udivrem.ll b/llvm/test/CodeGen/AMDGPU/udivrem.ll --- a/llvm/test/CodeGen/AMDGPU/udivrem.ll +++ b/llvm/test/CodeGen/AMDGPU/udivrem.ll @@ -52,7 +52,7 @@ ; GFX6-NEXT: v_mul_lo_u32 v1, s3, v0 ; GFX6-NEXT: s_load_dword s3, s[0:1], 0x1d ; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 -; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mul_hi_u32 v0, s3, v0 ; GFX6-NEXT: v_mul_lo_u32 v1, v0, s2 @@ -86,7 +86,7 @@ ; GFX8-NEXT: v_mul_lo_u32 v1, s2, v0 ; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX8-NEXT: v_mul_hi_u32 v1, v0, v1 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v1, v0 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 ; GFX8-NEXT: v_mul_hi_u32 v2, s7, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 @@ -176,7 +176,7 @@ ; GFX6-NEXT: v_mul_hi_u32 v2, v0, v2 ; GFX6-NEXT: v_mul_hi_u32 v3, v1, v3 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v2, v0 -; GFX6-NEXT: v_add_i32_e32 v1, vcc, v3, v1 +; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v3 ; GFX6-NEXT: v_mul_hi_u32 v0, s4, v0 ; GFX6-NEXT: v_mul_hi_u32 v1, s5, v1 ; GFX6-NEXT: v_mul_lo_u32 v0, v0, s6 @@ -204,41 +204,45 @@ ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s6 -; GFX8-NEXT: v_cvt_f32_u32_e32 v1, s7 ; GFX8-NEXT: s_sub_i32 s2, 0, s6 +; GFX8-NEXT: v_cvt_f32_u32_e32 v2, s7 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX8-NEXT: v_rcp_iflag_f32_e32 v1, v1 +; GFX8-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX8-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 -; GFX8-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX8-NEXT: v_mul_lo_u32 v2, s2, v0 -; GFX8-NEXT: s_sub_i32 s2, 0, s7 -; GFX8-NEXT: v_mul_lo_u32 v3, s2, v1 -; GFX8-NEXT: v_mul_hi_u32 v2, v0, v2 -; GFX8-NEXT: v_mul_hi_u32 v3, v1, v3 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v2, v0 -; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v1 +; GFX8-NEXT: 
v_mul_lo_u32 v1, s2, v0 +; GFX8-NEXT: v_mul_hi_u32 v1, v0, v1 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v1, v0 ; GFX8-NEXT: v_mul_hi_u32 v0, s4, v0 -; GFX8-NEXT: v_mul_hi_u32 v1, s5, v1 -; GFX8-NEXT: v_mul_lo_u32 v0, v0, s6 -; GFX8-NEXT: v_mul_lo_u32 v1, v1, s7 -; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s4, v0 -; GFX8-NEXT: v_sub_u32_e32 v1, vcc, s5, v1 -; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, s6, v0 -; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, s7, v1 -; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s6, v0 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s7, v1 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, s6, v0 -; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s6, v0 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, s7, v1 -; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s7, v1 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GFX8-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v2 +; GFX8-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_readfirstlane_b32 s2, v0 +; GFX8-NEXT: s_mul_i32 s2, s2, s6 +; GFX8-NEXT: s_sub_i32 s2, s4, s2 +; GFX8-NEXT: s_sub_i32 s3, s2, s6 +; GFX8-NEXT: s_cmp_ge_u32 s2, s6 +; GFX8-NEXT: s_cselect_b32 s2, s3, s2 +; GFX8-NEXT: s_sub_i32 s3, s2, s6 +; GFX8-NEXT: s_cmp_ge_u32 s2, s6 +; GFX8-NEXT: s_cselect_b32 s2, s3, s2 +; GFX8-NEXT: s_sub_i32 s3, 0, s7 +; GFX8-NEXT: v_mul_lo_u32 v0, s3, v1 ; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: v_mul_hi_u32 v0, v1, v0 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v1, v0 +; GFX8-NEXT: v_mul_hi_u32 v1, s5, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_readfirstlane_b32 s2, v1 +; GFX8-NEXT: s_mul_i32 s2, s2, s7 +; GFX8-NEXT: s_sub_i32 s2, s5, s2 +; GFX8-NEXT: s_sub_i32 s3, s2, s7 +; GFX8-NEXT: s_cmp_ge_u32 s2, s7 +; GFX8-NEXT: s_cselect_b32 s2, s3, s2 +; GFX8-NEXT: s_sub_i32 s3, s2, s7 +; GFX8-NEXT: s_cmp_ge_u32 s2, s7 +; GFX8-NEXT: s_cselect_b32 s2, s3, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s2 ; GFX8-NEXT: flat_store_dwordx2 
v[2:3], v[0:1] ; GFX8-NEXT: s_endpgm %result0 = udiv <2 x i32> %x, %y @@ -341,7 +345,7 @@ ; GFX6-NEXT: v_mul_hi_u32 v4, v1, v4 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v2, v0 ; GFX6-NEXT: v_mul_hi_u32 v0, s4, v0 -; GFX6-NEXT: v_add_i32_e32 v1, vcc, v4, v1 +; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v4 ; GFX6-NEXT: v_mul_hi_u32 v1, s5, v1 ; GFX6-NEXT: v_mul_lo_u32 v0, v0, s8 ; GFX6-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v3 @@ -374,7 +378,7 @@ ; GFX6-NEXT: v_mul_lo_u32 v2, v2, s10 ; GFX6-NEXT: v_mul_hi_u32 v4, v3, v5 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s6, v2 -; GFX6-NEXT: v_add_i32_e32 v3, vcc, v4, v3 +; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v4 ; GFX6-NEXT: v_mul_hi_u32 v3, s7, v3 ; GFX6-NEXT: v_subrev_i32_e32 v5, vcc, s10, v2 ; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s10, v2 @@ -399,75 +403,83 @@ ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s8 -; GFX8-NEXT: v_cvt_f32_u32_e32 v1, s9 ; GFX8-NEXT: s_sub_i32 s2, 0, s8 -; GFX8-NEXT: s_sub_i32 s3, 0, s9 +; GFX8-NEXT: v_cvt_f32_u32_e32 v2, s9 +; GFX8-NEXT: v_mov_b32_e32 v5, s1 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX8-NEXT: v_rcp_iflag_f32_e32 v1, v1 -; GFX8-NEXT: v_cvt_f32_u32_e32 v3, s10 -; GFX8-NEXT: v_cvt_f32_u32_e32 v5, s11 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX8-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 -; GFX8-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX8-NEXT: v_rcp_iflag_f32_e32 v3, v3 -; GFX8-NEXT: v_mul_lo_u32 v2, s2, v0 -; GFX8-NEXT: s_sub_i32 s2, 0, s10 -; GFX8-NEXT: v_mul_lo_u32 v4, s3, v1 -; GFX8-NEXT: v_mul_hi_u32 v2, v0, v2 -; GFX8-NEXT: v_mul_hi_u32 v4, v1, v4 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v2, v0 +; GFX8-NEXT: v_mul_lo_u32 v1, s2, v0 +; GFX8-NEXT: v_mul_hi_u32 v1, v0, v1 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 ; GFX8-NEXT: v_mul_hi_u32 v0, s4, v0 -; GFX8-NEXT: v_add_u32_e32 v1, vcc, v4, v1 -; GFX8-NEXT: v_mul_hi_u32 
v1, s5, v1 -; GFX8-NEXT: v_mul_lo_u32 v0, v0, s8 -; GFX8-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v3 -; GFX8-NEXT: v_cvt_u32_f32_e32 v2, v2 -; GFX8-NEXT: v_mul_lo_u32 v1, v1, s9 -; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s4, v0 -; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, s8, v0 -; GFX8-NEXT: v_sub_u32_e32 v1, vcc, s5, v1 -; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s8, v0 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, s8, v0 -; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s8, v0 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX8-NEXT: v_mul_lo_u32 v3, s2, v2 -; GFX8-NEXT: v_subrev_u32_e32 v4, vcc, s9, v1 -; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s9, v1 -; GFX8-NEXT: v_mul_hi_u32 v3, v2, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc -; GFX8-NEXT: v_rcp_iflag_f32_e32 v4, v5 -; GFX8-NEXT: s_sub_i32 s2, 0, s11 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2 -; GFX8-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v4 -; GFX8-NEXT: v_cvt_u32_f32_e32 v3, v3 -; GFX8-NEXT: v_subrev_u32_e32 v4, vcc, s9, v1 -; GFX8-NEXT: v_mul_hi_u32 v2, s6, v2 -; GFX8-NEXT: v_mul_lo_u32 v5, s2, v3 -; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s9, v1 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc -; GFX8-NEXT: v_mul_lo_u32 v2, v2, s10 -; GFX8-NEXT: v_mul_hi_u32 v4, v3, v5 -; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s6, v2 -; GFX8-NEXT: v_add_u32_e32 v3, vcc, v4, v3 -; GFX8-NEXT: v_mul_hi_u32 v3, s7, v3 -; GFX8-NEXT: v_subrev_u32_e32 v5, vcc, s10, v2 -; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s10, v2 -; GFX8-NEXT: v_mul_lo_u32 v3, v3, s11 -; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc -; GFX8-NEXT: v_subrev_u32_e32 v4, vcc, s10, v2 -; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s10, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; GFX8-NEXT: v_sub_u32_e32 v3, vcc, s7, v3 -; GFX8-NEXT: v_subrev_u32_e32 v4, vcc, s11, v3 -; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s11, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc -; GFX8-NEXT: v_subrev_u32_e32 v4, vcc, s11, v3 -; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s11, v3 -; GFX8-NEXT: 
v_cndmask_b32_e32 v3, v3, v4, vcc -; GFX8-NEXT: v_mov_b32_e32 v5, s1 -; GFX8-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v2 +; GFX8-NEXT: v_cvt_u32_f32_e32 v1, v1 +; GFX8-NEXT: v_cvt_f32_u32_e32 v2, s10 +; GFX8-NEXT: v_readfirstlane_b32 s2, v0 +; GFX8-NEXT: s_mul_i32 s2, s2, s8 +; GFX8-NEXT: s_sub_i32 s2, s4, s2 +; GFX8-NEXT: s_sub_i32 s3, s2, s8 +; GFX8-NEXT: s_cmp_ge_u32 s2, s8 +; GFX8-NEXT: s_cselect_b32 s2, s3, s2 +; GFX8-NEXT: s_sub_i32 s3, s2, s8 +; GFX8-NEXT: s_cmp_ge_u32 s2, s8 +; GFX8-NEXT: s_cselect_b32 s2, s3, s2 +; GFX8-NEXT: s_sub_i32 s3, 0, s9 +; GFX8-NEXT: v_mul_lo_u32 v0, s3, v1 +; GFX8-NEXT: v_rcp_iflag_f32_e32 v2, v2 +; GFX8-NEXT: v_mul_hi_u32 v0, v1, v0 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 +; GFX8-NEXT: v_mul_hi_u32 v0, s5, v0 +; GFX8-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v2 +; GFX8-NEXT: v_cvt_u32_f32_e32 v1, v1 +; GFX8-NEXT: v_cvt_f32_u32_e32 v2, s11 +; GFX8-NEXT: v_readfirstlane_b32 s3, v0 +; GFX8-NEXT: s_mul_i32 s3, s3, s9 +; GFX8-NEXT: s_sub_i32 s3, s5, s3 +; GFX8-NEXT: s_sub_i32 s4, s3, s9 +; GFX8-NEXT: s_cmp_ge_u32 s3, s9 +; GFX8-NEXT: s_cselect_b32 s3, s4, s3 +; GFX8-NEXT: s_sub_i32 s4, s3, s9 +; GFX8-NEXT: s_cmp_ge_u32 s3, s9 +; GFX8-NEXT: s_cselect_b32 s3, s4, s3 +; GFX8-NEXT: s_sub_i32 s4, 0, s10 +; GFX8-NEXT: v_mul_lo_u32 v0, s4, v1 +; GFX8-NEXT: v_rcp_iflag_f32_e32 v2, v2 +; GFX8-NEXT: v_mul_hi_u32 v0, v1, v0 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v1, v0 +; GFX8-NEXT: v_mul_hi_u32 v0, s6, v0 +; GFX8-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v2 +; GFX8-NEXT: v_cvt_u32_f32_e32 v1, v1 +; GFX8-NEXT: v_readfirstlane_b32 s4, v0 +; GFX8-NEXT: s_mul_i32 s4, s4, s10 +; GFX8-NEXT: s_sub_i32 s4, s6, s4 +; GFX8-NEXT: s_sub_i32 s5, s4, s10 +; GFX8-NEXT: s_cmp_ge_u32 s4, s10 +; GFX8-NEXT: s_cselect_b32 s4, s5, s4 +; GFX8-NEXT: s_sub_i32 s5, s4, s10 +; GFX8-NEXT: s_cmp_ge_u32 s4, s10 +; GFX8-NEXT: s_cselect_b32 s4, s5, s4 +; GFX8-NEXT: s_sub_i32 s5, 0, s11 +; GFX8-NEXT: v_mul_lo_u32 v0, s5, v1 +; GFX8-NEXT: v_mov_b32_e32 v2, 
s4 +; GFX8-NEXT: v_mul_hi_u32 v0, v1, v0 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v1, v0 +; GFX8-NEXT: v_mul_hi_u32 v3, s7, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_readfirstlane_b32 s2, v3 +; GFX8-NEXT: s_mul_i32 s2, s2, s11 +; GFX8-NEXT: s_sub_i32 s2, s7, s2 +; GFX8-NEXT: s_sub_i32 s3, s2, s11 +; GFX8-NEXT: s_cmp_ge_u32 s2, s11 +; GFX8-NEXT: s_cselect_b32 s2, s3, s2 +; GFX8-NEXT: s_sub_i32 s3, s2, s11 +; GFX8-NEXT: s_cmp_ge_u32 s2, s11 +; GFX8-NEXT: s_cselect_b32 s2, s3, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s2 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_endpgm %result0 = udiv <4 x i32> %x, %y diff --git a/llvm/test/CodeGen/AMDGPU/unstructured-cfg-def-use-issue.ll b/llvm/test/CodeGen/AMDGPU/unstructured-cfg-def-use-issue.ll --- a/llvm/test/CodeGen/AMDGPU/unstructured-cfg-def-use-issue.ll +++ b/llvm/test/CodeGen/AMDGPU/unstructured-cfg-def-use-issue.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: llc -mtriple=amdgcn-amdhsa -verify-machineinstrs -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -check-prefix=GCN %s ; RUN: opt -S -si-annotate-control-flow -mtriple=amdgcn-amdhsa -verify-machineinstrs -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -check-prefix=SI-OPT %s @@ -19,10 +20,11 @@ ; GCN-NEXT: flat_load_dword v0, v[0:1] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_cmp_gt_i32_e32 vcc, 21, v0 +; GCN-NEXT: v_readfirstlane_b32 s15, v0 ; GCN-NEXT: s_cbranch_vccz .LBB0_3 ; GCN-NEXT: ; %bb.1: ; %bb4 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 9, v0 -; GCN-NEXT: s_cbranch_vccnz .LBB0_4 +; GCN-NEXT: s_cmp_lg_u32 s15, 9 +; GCN-NEXT: s_cbranch_scc1 .LBB0_4 ; GCN-NEXT: ; %bb.2: ; %bb7 ; GCN-NEXT: s_getpc_b64 s[16:17] ; GCN-NEXT: s_add_u32 s16, s16, wibble@rel32@lo+4 @@ -30,8 +32,8 @@ ; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GCN-NEXT: s_branch .LBB0_7 ; GCN-NEXT: 
.LBB0_3: ; %bb2 -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 21, v0 -; GCN-NEXT: s_cbranch_vccnz .LBB0_6 +; GCN-NEXT: s_cmp_eq_u32 s15, 21 +; GCN-NEXT: s_cbranch_scc1 .LBB0_6 ; GCN-NEXT: .LBB0_4: ; %bb9 ; GCN-NEXT: s_getpc_b64 s[16:17] ; GCN-NEXT: s_add_u32 s16, s16, wibble@rel32@lo+4 diff --git a/llvm/test/CodeGen/AMDGPU/urem.ll b/llvm/test/CodeGen/AMDGPU/urem.ll --- a/llvm/test/CodeGen/AMDGPU/urem.ll +++ b/llvm/test/CodeGen/AMDGPU/urem.ll @@ -21,9 +21,9 @@ ; FUNC-LABEL: {{^}}test_urem_i32_7: ; SI: s_mov_b32 [[MAGIC:s[0-9]+]], 0x24924925 ; SI: v_mul_hi_u32 {{v[0-9]+}}, {{v[0-9]+}}, [[MAGIC]] -; SI: v_subrev_{{[iu]}}32 -; SI: v_mul_lo_u32 ; SI: v_sub_{{[iu]}}32 +; SI: v_mul_lo_u32 +; SI: v_subrev_{{[iu]}}32 ; SI: buffer_store_dword ; SI: s_endpgm define amdgpu_kernel void @test_urem_i32_7(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { diff --git a/llvm/test/CodeGen/AMDGPU/urem64.ll b/llvm/test/CodeGen/AMDGPU/urem64.ll --- a/llvm/test/CodeGen/AMDGPU/urem64.ll +++ b/llvm/test/CodeGen/AMDGPU/urem64.ll @@ -28,8 +28,8 @@ ; GCN-NEXT: v_mul_hi_u32 v3, s0, v0 ; GCN-NEXT: v_mul_lo_u32 v5, s1, v0 ; GCN-NEXT: v_mul_lo_u32 v4, s0, v0 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v5 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v3 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v5, v2 ; GCN-NEXT: v_mul_hi_u32 v3, v0, v4 ; GCN-NEXT: v_mul_lo_u32 v5, v0, v2 ; GCN-NEXT: v_mul_hi_u32 v7, v0, v2 @@ -86,8 +86,8 @@ ; GCN-NEXT: v_mul_hi_u32 v2, s12, v0 ; GCN-NEXT: v_mul_lo_u32 v3, s13, v0 ; GCN-NEXT: v_mul_lo_u32 v0, s12, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, v2, v1 -; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v3 +; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v2 +; GCN-NEXT: v_add_i32_e32 v1, vcc, v3, v1 ; GCN-NEXT: v_sub_i32_e32 v2, vcc, s11, v1 ; GCN-NEXT: v_mov_b32_e32 v3, s13 ; GCN-NEXT: v_sub_i32_e32 v0, vcc, s10, v0 @@ -202,8 +202,8 @@ ; GCN-IR-NEXT: v_mul_lo_u32 v3, s5, v0 ; GCN-IR-NEXT: v_mul_lo_u32 v0, s4, v0 ; GCN-IR-NEXT: s_mov_b32 s11, 0xf000 -; GCN-IR-NEXT: v_add_i32_e32 
v1, vcc, v2, v1 -; GCN-IR-NEXT: v_add_i32_e32 v1, vcc, v1, v3 +; GCN-IR-NEXT: v_add_i32_e32 v1, vcc, v1, v2 +; GCN-IR-NEXT: v_add_i32_e32 v1, vcc, v3, v1 ; GCN-IR-NEXT: v_mov_b32_e32 v2, s3 ; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, s2, v0 ; GCN-IR-NEXT: s_mov_b32 s10, -1 @@ -733,8 +733,8 @@ ; GCN-NEXT: v_mul_hi_u32 v3, s0, v0 ; GCN-NEXT: v_mul_lo_u32 v5, s1, v0 ; GCN-NEXT: v_mul_lo_u32 v4, s0, v0 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v5 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v3 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v5, v2 ; GCN-NEXT: v_mul_hi_u32 v3, v0, v4 ; GCN-NEXT: v_mul_lo_u32 v5, v0, v2 ; GCN-NEXT: v_mul_hi_u32 v7, v0, v2 @@ -782,7 +782,7 @@ ; GCN-NEXT: v_mul_lo_u32 v1, s7, v0 ; GCN-NEXT: v_mul_hi_u32 v2, s6, v0 ; GCN-NEXT: v_mul_lo_u32 v0, s6, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, v2, v1 +; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v2 ; GCN-NEXT: v_sub_i32_e32 v2, vcc, 0, v1 ; GCN-NEXT: v_sub_i32_e32 v0, vcc, 24, v0 ; GCN-NEXT: v_subb_u32_e64 v2, s[0:1], v2, v3, vcc @@ -884,8 +884,8 @@ ; GCN-IR-NEXT: v_mul_lo_u32 v3, s3, v0 ; GCN-IR-NEXT: v_mul_lo_u32 v0, s2, v0 ; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 -; GCN-IR-NEXT: v_add_i32_e32 v1, vcc, v2, v1 -; GCN-IR-NEXT: v_add_i32_e32 v1, vcc, v1, v3 +; GCN-IR-NEXT: v_add_i32_e32 v1, vcc, v1, v2 +; GCN-IR-NEXT: v_add_i32_e32 v1, vcc, v3, v1 ; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, 24, v0 ; GCN-IR-NEXT: s_mov_b32 s6, -1 ; GCN-IR-NEXT: s_mov_b32 s4, s0 @@ -919,8 +919,8 @@ ; GCN-NEXT: v_mul_hi_u32 v2, v0, s4 ; GCN-NEXT: v_mul_lo_u32 v4, v1, s4 ; GCN-NEXT: v_mul_lo_u32 v3, v0, s4 -; GCN-NEXT: v_subrev_i32_e32 v2, vcc, v0, v2 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v4, v2 +; GCN-NEXT: v_sub_i32_e32 v2, vcc, v2, v0 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; GCN-NEXT: v_mul_hi_u32 v5, v0, v3 ; GCN-NEXT: v_mul_lo_u32 v4, v0, v2 ; GCN-NEXT: v_mul_hi_u32 v6, v0, v2 @@ -941,7 +941,7 @@ ; GCN-NEXT: v_mul_lo_u32 v3, v1, s4 ; GCN-NEXT: v_mul_lo_u32 v4, v0, s4 ; GCN-NEXT: s_mov_b32 s4, s0 -; GCN-NEXT: 
v_subrev_i32_e32 v2, vcc, v0, v2 +; GCN-NEXT: v_sub_i32_e32 v2, vcc, v2, v0 ; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; GCN-NEXT: v_mul_lo_u32 v3, v0, v2 ; GCN-NEXT: v_mul_hi_u32 v5, v0, v4 @@ -1071,7 +1071,7 @@ ; GCN-IR-NEXT: v_mul_lo_u32 v0, v0, 24 ; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 ; GCN-IR-NEXT: s_mov_b32 s6, -1 -; GCN-IR-NEXT: v_add_i32_e32 v1, vcc, v2, v1 +; GCN-IR-NEXT: v_add_i32_e32 v1, vcc, v1, v2 ; GCN-IR-NEXT: v_mov_b32_e32 v2, s3 ; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, s2, v0 ; GCN-IR-NEXT: s_mov_b32 s4, s0 diff --git a/llvm/test/CodeGen/AMDGPU/usubo.ll b/llvm/test/CodeGen/AMDGPU/usubo.ll --- a/llvm/test/CodeGen/AMDGPU/usubo.ll +++ b/llvm/test/CodeGen/AMDGPU/usubo.ll @@ -135,7 +135,7 @@ } ; FUNC-LABEL: {{^}}v_usubo_i16: -; SI: v_subrev_i32_e32 +; SI: v_sub_i32_e32 ; SI: v_and_b32 ; SI: v_cmp_ne_u32_e32 diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll b/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll --- a/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll +++ b/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll @@ -234,7 +234,7 @@ ; SI-NEXT: bb.1.Flow: ; SI-NEXT: successors: %bb.2(0x40000000), %bb.10(0x40000000) ; SI-NEXT: {{ $}} - ; SI-NEXT: [[PHI:%[0-9]+]]:vgpr_32 = PHI undef %29:vgpr_32, %bb.0, %4, %bb.9 + ; SI-NEXT: [[PHI:%[0-9]+]]:vgpr_32 = PHI undef %43:vgpr_32, %bb.0, %4, %bb.9 ; SI-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[COPY4]], %bb.0, undef %45:vgpr_32, %bb.9 ; SI-NEXT: [[PHI2:%[0-9]+]]:vgpr_32 = PHI [[COPY3]], %bb.0, undef %47:vgpr_32, %bb.9 ; SI-NEXT: [[PHI3:%[0-9]+]]:vgpr_32 = PHI [[COPY2]], %bb.0, undef %49:vgpr_32, %bb.9 @@ -357,7 +357,7 @@ ; SI-NEXT: bb.1.Flow: ; SI-NEXT: successors: %bb.2(0x40000000), %bb.10(0x40000000) ; SI-NEXT: {{ $}} - ; SI-NEXT: [[PHI:%[0-9]+]]:vgpr_32 = PHI undef %30:vgpr_32, %bb.0, %4, %bb.9 + ; SI-NEXT: [[PHI:%[0-9]+]]:vgpr_32 = PHI undef %44:vgpr_32, %bb.0, %4, %bb.9 ; SI-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[COPY3]], %bb.0, undef %46:vgpr_32, %bb.9 ; SI-NEXT: [[PHI2:%[0-9]+]]:vgpr_32 = PHI [[COPY2]], %bb.0, undef 
%48:vgpr_32, %bb.9 ; SI-NEXT: [[SI_ELSE:%[0-9]+]]:sreg_32 = SI_ELSE killed [[SI_IF]], %bb.10, implicit-def dead $exec, implicit-def dead $scc, implicit $exec @@ -510,7 +510,7 @@ ; SI-NEXT: bb.6.sw.bb18: ; SI-NEXT: successors: %bb.5(0x80000000) ; SI-NEXT: {{ $}} - ; SI-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI undef %39:vgpr_32, %bb.3, [[GLOBAL_LOAD_UBYTE1]], %bb.4 + ; SI-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI undef %37:vgpr_32, %bb.3, [[GLOBAL_LOAD_UBYTE1]], %bb.4 ; SI-NEXT: [[V_MOV_B2:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 0, implicit $exec ; SI-NEXT: GLOBAL_STORE_BYTE killed [[V_MOV_B2]], killed [[PHI1]], 0, 0, implicit $exec :: (store (s8) into `i8 addrspace(1)* null`, addrspace 1) ; SI-NEXT: S_BRANCH %bb.5 @@ -596,27 +596,17 @@ ; SI-NEXT: bb.2: ; SI-NEXT: successors: %bb.3(0x80000000) ; SI-NEXT: {{ $}} - ; SI-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec - ; SI-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec + ; SI-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[GLOBAL_LOAD_DWORDX4_2]].sub0, implicit $exec + ; SI-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[GLOBAL_LOAD_DWORDX4_2]].sub1, implicit $exec ; SI-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 - ; SI-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 killed [[REG_SEQUENCE3]], [[REG_SEQUENCE2]].sub0_sub1, implicit $exec - ; SI-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec - ; SI-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec + ; SI-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 killed [[REG_SEQUENCE3]], [[GLOBAL_LOAD_DWORDX4_2]].sub0_sub1, implicit $exec + ; SI-NEXT: 
[[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[GLOBAL_LOAD_DWORDX4_2]].sub2, implicit $exec + ; SI-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[GLOBAL_LOAD_DWORDX4_2]].sub3, implicit $exec ; SI-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 - ; SI-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 killed [[REG_SEQUENCE4]], [[REG_SEQUENCE2]].sub2_sub3, implicit $exec + ; SI-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 killed [[REG_SEQUENCE4]], [[GLOBAL_LOAD_DWORDX4_2]].sub2_sub3, implicit $exec ; SI-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 killed [[V_CMP_EQ_U64_e64_]], killed [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc - ; SI-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub4, implicit $exec - ; SI-NEXT: [[V_READFIRSTLANE_B32_5:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub5, implicit $exec - ; SI-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_4]], %subreg.sub0, [[V_READFIRSTLANE_B32_5]], %subreg.sub1 - ; SI-NEXT: [[V_CMP_EQ_U64_e64_2:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 killed [[REG_SEQUENCE5]], [[REG_SEQUENCE2]].sub4_sub5, implicit $exec - ; SI-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 killed [[S_AND_B32_]], killed [[V_CMP_EQ_U64_e64_2]], implicit-def dead $scc - ; SI-NEXT: [[V_READFIRSTLANE_B32_6:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub6, implicit $exec - ; SI-NEXT: [[V_READFIRSTLANE_B32_7:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub7, implicit $exec - ; SI-NEXT: [[REG_SEQUENCE6:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_6]], %subreg.sub0, [[V_READFIRSTLANE_B32_7]], %subreg.sub1 - ; SI-NEXT: [[V_CMP_EQ_U64_e64_3:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 killed [[REG_SEQUENCE6]], 
[[REG_SEQUENCE2]].sub6_sub7, implicit $exec - ; SI-NEXT: [[S_AND_B32_2:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 killed [[S_AND_B32_1]], killed [[V_CMP_EQ_U64_e64_3]], implicit-def dead $scc - ; SI-NEXT: [[REG_SEQUENCE7:%[0-9]+]]:sgpr_256 = REG_SEQUENCE killed [[V_READFIRSTLANE_B32_]], %subreg.sub0, killed [[V_READFIRSTLANE_B32_1]], %subreg.sub1, killed [[V_READFIRSTLANE_B32_2]], %subreg.sub2, killed [[V_READFIRSTLANE_B32_3]], %subreg.sub3, killed [[V_READFIRSTLANE_B32_4]], %subreg.sub4, killed [[V_READFIRSTLANE_B32_5]], %subreg.sub5, killed [[V_READFIRSTLANE_B32_6]], %subreg.sub6, killed [[V_READFIRSTLANE_B32_7]], %subreg.sub7 - ; SI-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_2]], implicit-def $exec, implicit-def dead $scc, implicit $exec + ; SI-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[V_READFIRSTLANE_B32_]], %subreg.sub0, killed [[V_READFIRSTLANE_B32_1]], %subreg.sub1, killed [[V_READFIRSTLANE_B32_2]], %subreg.sub2, killed [[V_READFIRSTLANE_B32_3]], %subreg.sub3 + ; SI-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_]], implicit-def $exec, implicit-def dead $scc, implicit $exec ; SI-NEXT: {{ $}} ; SI-NEXT: bb.3: ; SI-NEXT: successors: %bb.4(0x80000000) @@ -626,22 +616,32 @@ ; SI-NEXT: bb.4: ; SI-NEXT: successors: %bb.5(0x80000000) ; SI-NEXT: {{ $}} - ; SI-NEXT: [[V_READFIRSTLANE_B32_8:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[GLOBAL_LOAD_DWORDX4_2]].sub0, implicit $exec - ; SI-NEXT: [[V_READFIRSTLANE_B32_9:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[GLOBAL_LOAD_DWORDX4_2]].sub1, implicit $exec + ; SI-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec + ; SI-NEXT: [[V_READFIRSTLANE_B32_5:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec + ; SI-NEXT: [[REG_SEQUENCE6:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_4]], %subreg.sub0, 
[[V_READFIRSTLANE_B32_5]], %subreg.sub1 + ; SI-NEXT: [[V_CMP_EQ_U64_e64_2:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 killed [[REG_SEQUENCE6]], [[REG_SEQUENCE2]].sub0_sub1, implicit $exec + ; SI-NEXT: [[V_READFIRSTLANE_B32_6:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec + ; SI-NEXT: [[V_READFIRSTLANE_B32_7:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec + ; SI-NEXT: [[REG_SEQUENCE7:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_6]], %subreg.sub0, [[V_READFIRSTLANE_B32_7]], %subreg.sub1 + ; SI-NEXT: [[V_CMP_EQ_U64_e64_3:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 killed [[REG_SEQUENCE7]], [[REG_SEQUENCE2]].sub2_sub3, implicit $exec + ; SI-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 killed [[V_CMP_EQ_U64_e64_2]], killed [[V_CMP_EQ_U64_e64_3]], implicit-def dead $scc + ; SI-NEXT: [[V_READFIRSTLANE_B32_8:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub4, implicit $exec + ; SI-NEXT: [[V_READFIRSTLANE_B32_9:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub5, implicit $exec ; SI-NEXT: [[REG_SEQUENCE8:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_8]], %subreg.sub0, [[V_READFIRSTLANE_B32_9]], %subreg.sub1 - ; SI-NEXT: [[V_CMP_EQ_U64_e64_4:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 killed [[REG_SEQUENCE8]], [[GLOBAL_LOAD_DWORDX4_2]].sub0_sub1, implicit $exec - ; SI-NEXT: [[V_READFIRSTLANE_B32_10:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[GLOBAL_LOAD_DWORDX4_2]].sub2, implicit $exec - ; SI-NEXT: [[V_READFIRSTLANE_B32_11:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[GLOBAL_LOAD_DWORDX4_2]].sub3, implicit $exec + ; SI-NEXT: [[V_CMP_EQ_U64_e64_4:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 killed [[REG_SEQUENCE8]], [[REG_SEQUENCE2]].sub4_sub5, implicit $exec + ; SI-NEXT: [[S_AND_B32_2:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 killed [[S_AND_B32_1]], killed [[V_CMP_EQ_U64_e64_4]], implicit-def dead $scc + ; SI-NEXT: 
[[V_READFIRSTLANE_B32_10:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub6, implicit $exec + ; SI-NEXT: [[V_READFIRSTLANE_B32_11:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub7, implicit $exec ; SI-NEXT: [[REG_SEQUENCE9:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_10]], %subreg.sub0, [[V_READFIRSTLANE_B32_11]], %subreg.sub1 - ; SI-NEXT: [[V_CMP_EQ_U64_e64_5:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 killed [[REG_SEQUENCE9]], [[GLOBAL_LOAD_DWORDX4_2]].sub2_sub3, implicit $exec - ; SI-NEXT: [[S_AND_B32_3:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 killed [[V_CMP_EQ_U64_e64_4]], killed [[V_CMP_EQ_U64_e64_5]], implicit-def dead $scc - ; SI-NEXT: [[REG_SEQUENCE10:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[V_READFIRSTLANE_B32_8]], %subreg.sub0, killed [[V_READFIRSTLANE_B32_9]], %subreg.sub1, killed [[V_READFIRSTLANE_B32_10]], %subreg.sub2, killed [[V_READFIRSTLANE_B32_11]], %subreg.sub3 + ; SI-NEXT: [[V_CMP_EQ_U64_e64_5:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 killed [[REG_SEQUENCE9]], [[REG_SEQUENCE2]].sub6_sub7, implicit $exec + ; SI-NEXT: [[S_AND_B32_3:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 killed [[S_AND_B32_2]], killed [[V_CMP_EQ_U64_e64_5]], implicit-def dead $scc + ; SI-NEXT: [[REG_SEQUENCE10:%[0-9]+]]:sgpr_256 = REG_SEQUENCE killed [[V_READFIRSTLANE_B32_4]], %subreg.sub0, killed [[V_READFIRSTLANE_B32_5]], %subreg.sub1, killed [[V_READFIRSTLANE_B32_6]], %subreg.sub2, killed [[V_READFIRSTLANE_B32_7]], %subreg.sub3, killed [[V_READFIRSTLANE_B32_8]], %subreg.sub4, killed [[V_READFIRSTLANE_B32_9]], %subreg.sub5, killed [[V_READFIRSTLANE_B32_10]], %subreg.sub6, killed [[V_READFIRSTLANE_B32_11]], %subreg.sub7 ; SI-NEXT: [[S_AND_SAVEEXEC_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_3]], implicit-def $exec, implicit-def dead $scc, implicit $exec ; SI-NEXT: {{ $}} ; SI-NEXT: bb.5: ; SI-NEXT: successors: %bb.4(0x40000000), %bb.6(0x40000000) ; SI-NEXT: {{ $}} - ; SI-NEXT: 
[[IMAGE_SAMPLE_V1_V2_gfx10_:%[0-9]+]]:vgpr_32 = IMAGE_SAMPLE_V1_V2_gfx10 undef %27:vreg_64, [[REG_SEQUENCE7]], killed [[REG_SEQUENCE10]], 1, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32) from custom "ImageResource") + ; SI-NEXT: [[IMAGE_SAMPLE_V1_V2_gfx10_:%[0-9]+]]:vgpr_32 = IMAGE_SAMPLE_V1_V2_gfx10 undef %27:vreg_64, killed [[REG_SEQUENCE10]], [[REG_SEQUENCE5]], 1, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32) from custom "ImageResource") ; SI-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, killed [[S_AND_SAVEEXEC_B32_1]], implicit-def dead $scc ; SI-NEXT: SI_WATERFALL_LOOP %bb.4, implicit $exec ; SI-NEXT: {{ $}} diff --git a/llvm/test/CodeGen/AMDGPU/wave32.ll b/llvm/test/CodeGen/AMDGPU/wave32.ll --- a/llvm/test/CodeGen/AMDGPU/wave32.ll +++ b/llvm/test/CodeGen/AMDGPU/wave32.ll @@ -330,24 +330,15 @@ } ; GCN-LABEL: {{^}}test_udiv64: -; GFX1032: v_add_co_u32 v{{[0-9]+}}, vcc_lo, v{{[0-9]+}}, v{{[0-9]+}} -; GFX1032: v_add_co_ci_u32_e32 v{{[0-9]+}}, vcc_lo, 0, v{{[0-9]+}}, vcc_lo -; GFX1032: v_add_co_u32 v{{[0-9]+}}, vcc_lo, v{{[0-9]+}}, v{{[0-9]+}} -; GFX1032: v_add_co_ci_u32_e32 v{{[0-9]+}}, vcc_lo, v{{[0-9]+}}, v{{[0-9]+}}, vcc_lo -; GFX1032: v_add_co_ci_u32_e32 v{{[0-9]+}}, vcc_lo, 0, v{{[0-9]+}}, vcc_lo -; GFX1032: v_add_co_u32 v{{[0-9]+}}, vcc_lo, v{{[0-9]+}}, v{{[0-9]+}} -; GFX1032: v_add_co_ci_u32_e32 v{{[0-9]+}}, vcc_lo, 0, v{{[0-9]+}}, vcc_lo -; GFX1032: v_add_co_u32 v{{[0-9]+}}, vcc_lo, v{{[0-9]+}}, v{{[0-9]+}} -; GFX1032: v_add_co_ci_u32_e32 v{{[0-9]+}}, vcc_lo, v{{[0-9]+}}, v{{[0-9]+}}, vcc_lo -; GFX1064: v_add_co_u32 v{{[0-9]+}}, vcc, v{{[0-9]+}}, v{{[0-9]+}} -; GFX1064: v_add_co_ci_u32_e32 v{{[0-9]+}}, vcc, 0, v{{[0-9]+}}, vcc{{$}} -; GFX1064: v_add_co_u32 v{{[0-9]+}}, vcc, v{{[0-9]+}}, v{{[0-9]+}} -; GFX1064: v_add_co_ci_u32_e32 v{{[0-9]+}}, vcc, v{{[0-9]+}}, v{{[0-9]+}}, vcc{{$}} -; GFX1064: v_add_co_ci_u32_e32 v{{[0-9]+}}, vcc, 0, v{{[0-9]+}}, vcc{{$}} -; GFX1064: v_add_co_u32 v{{[0-9]+}}, vcc, v{{[0-9]+}}, 
v{{[0-9]+}} -; GFX1064: v_add_co_ci_u32_e32 v{{[0-9]+}}, vcc, 0, v{{[0-9]+}}, vcc{{$}} -; GFX1064: v_add_co_u32 v{{[0-9]+}}, vcc, v{{[0-9]+}}, v{{[0-9]+}} -; GFX1064: v_add_co_ci_u32_e32 v{{[0-9]+}}, vcc, v{{[0-9]+}}, v{{[0-9]+}}, vcc{{$}} +; GCN: s_add_u32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}} +; GCN: s_addc_u32 s{{[0-9]+}}, 0, s{{[0-9]+}} +; GCN: s_add_u32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}} +; GCN: s_addc_u32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}} +; GCN: s_addc_u32 s{{[0-9]+}}, s{{[0-9]+}}, 0 +; GCN: s_add_u32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}} +; GCN: s_add_u32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}} +; GCN: s_addc_u32 s{{[0-9]+}}, 0, s{{[0-9]+}} +; GCN: s_addc_u32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}} define amdgpu_kernel void @test_udiv64(i64 addrspace(1)* %arg) #0 { bb: %tmp = getelementptr inbounds i64, i64 addrspace(1)* %arg, i64 1