Index: llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -1061,24 +1061,51 @@
   SDValue RHS = N->getOperand(1);
   SDValue CI = N->getOperand(2);
 
-  unsigned Opc = N->getOpcode() == ISD::ADDCARRY ? AMDGPU::V_ADDC_U32_e64
-                                                 : AMDGPU::V_SUBB_U32_e64;
-  CurDAG->SelectNodeTo(
-      N, Opc, N->getVTList(),
-      {LHS, RHS, CI, CurDAG->getTargetConstant(0, {}, MVT::i1) /*clamp bit*/});
+  if (N->isDivergent()) {
+    unsigned Opc = N->getOpcode() == ISD::ADDCARRY ? AMDGPU::V_ADDC_U32_e64
+                                                   : AMDGPU::V_SUBB_U32_e64;
+    CurDAG->SelectNodeTo(
+        N, Opc, N->getVTList(),
+        {LHS, RHS, CI,
+         CurDAG->getTargetConstant(0, {}, MVT::i1) /*clamp bit*/});
+  } else {
+    unsigned Opc = N->getOpcode() == ISD::ADDCARRY ? AMDGPU::S_ADD_CO_PSEUDO
+                                                   : AMDGPU::S_SUB_CO_PSEUDO;
+    CurDAG->SelectNodeTo(N, Opc, N->getVTList(), {LHS, RHS, CI});
+  }
 }
 
 void AMDGPUDAGToDAGISel::SelectUADDO_USUBO(SDNode *N) {
   // The name of the opcodes are misleading. v_add_i32/v_sub_i32 have unsigned
   // carry out despite the _i32 name. These were renamed in VI to _U32.
   // FIXME: We should probably rename the opcodes here.
-  unsigned Opc = N->getOpcode() == ISD::UADDO ?
-    AMDGPU::V_ADD_I32_e64 : AMDGPU::V_SUB_I32_e64;
+  bool IsAdd = N->getOpcode() == ISD::UADDO;
+  bool IsVALU = N->isDivergent();
+
+  for (SDNode::use_iterator UI = N->use_begin(), E = N->use_end(); UI != E;
+       ++UI)
+    if (UI.getUse().getResNo() == 1) {
+      if ((IsAdd && (UI->getOpcode() != ISD::ADDCARRY)) ||
+          (!IsAdd && (UI->getOpcode() != ISD::SUBCARRY))) {
+        IsVALU = true;
+        break;
+      }
+    }
 
-  CurDAG->SelectNodeTo(
-      N, Opc, N->getVTList(),
-      {N->getOperand(0), N->getOperand(1),
-       CurDAG->getTargetConstant(0, {}, MVT::i1) /*clamp bit*/});
+  if (IsVALU) {
+    unsigned Opc = IsAdd ? AMDGPU::V_ADD_I32_e64 : AMDGPU::V_SUB_I32_e64;
+
+    CurDAG->SelectNodeTo(
+        N, Opc, N->getVTList(),
+        {N->getOperand(0), N->getOperand(1),
+         CurDAG->getTargetConstant(0, {}, MVT::i1) /*clamp bit*/});
+  } else {
+    unsigned Opc = N->getOpcode() == ISD::UADDO ? AMDGPU::S_UADDO_PSEUDO
+                                                : AMDGPU::S_USUBO_PSEUDO;
+
+    CurDAG->SelectNodeTo(N, Opc, N->getVTList(),
+                         {N->getOperand(0), N->getOperand(1)});
+  }
 }
 
 void AMDGPUDAGToDAGISel::SelectFMA_W_CHAIN(SDNode *N) {
Index: llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -915,6 +915,12 @@
   AMDGPUPassConfig::addInstSelector();
   addPass(&SIFixSGPRCopiesID);
   addPass(createSILowerI1CopiesPass());
+  // TODO: We have to add FinalizeISel here to expand
+  // V_ADD/SUB_U64_PSEUDO before SIFixupVectorISel, which expects the
+  // V_ADD/SUB -> V_ADDC/SUBB pairs to already be expanded.
+  // This will be removed as soon as SIFixupVectorISel is changed to
+  // work with V_ADD/SUB_U64_PSEUDO instead.
+  addPass(&FinalizeISelID);
   addPass(createSIFixupVectorISelPass());
   addPass(createSIAddIMGInitPass());
   return false;
Index: llvm/lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -3601,6 +3601,28 @@
   }
 
   switch (MI.getOpcode()) {
+  case AMDGPU::S_UADDO_PSEUDO:
+  case AMDGPU::S_USUBO_PSEUDO: {
+    const DebugLoc &DL = MI.getDebugLoc();
+    MachineOperand &Dest0 = MI.getOperand(0);
+    MachineOperand &Dest1 = MI.getOperand(1);
+    MachineOperand &Src0 = MI.getOperand(2);
+    MachineOperand &Src1 = MI.getOperand(3);
+
+    unsigned Opc = (MI.getOpcode() == AMDGPU::S_UADDO_PSEUDO)
+                       ? AMDGPU::S_ADD_I32
+                       : AMDGPU::S_SUB_I32;
+    BuildMI(*BB, MI, DL, TII->get(Opc), Dest0.getReg())
+        .add(Src0)
+        .add(Src1);
+
+    BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CSELECT_B64), Dest1.getReg())
+        .addImm(1)
+        .addImm(0);
+
+    MI.eraseFromParent();
+    return BB;
+  }
   case AMDGPU::S_ADD_U64_PSEUDO:
   case AMDGPU::S_SUB_U64_PSEUDO: {
     MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
@@ -3648,6 +3670,127 @@
     MI.eraseFromParent();
     return BB;
   }
+  case AMDGPU::V_ADD_U64_PSEUDO:
+  case AMDGPU::V_SUB_U64_PSEUDO: {
+    MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
+    const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
+    const SIRegisterInfo *TRI = ST.getRegisterInfo();
+    const DebugLoc &DL = MI.getDebugLoc();
+
+    bool IsAdd = (MI.getOpcode() == AMDGPU::V_ADD_U64_PSEUDO);
+
+    const auto *CarryRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
+
+    Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+    Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+
+    Register CarryReg = MRI.createVirtualRegister(CarryRC);
+    Register DeadCarryReg = MRI.createVirtualRegister(CarryRC);
+
+    MachineOperand &Dest = MI.getOperand(0);
+    MachineOperand &Src0 = MI.getOperand(1);
+    MachineOperand &Src1 = MI.getOperand(2);
+
+    const TargetRegisterClass *Src0RC = Src0.isReg()
+                                            ? MRI.getRegClass(Src0.getReg())
+                                            : &AMDGPU::VReg_64RegClass;
+    const TargetRegisterClass *Src1RC = Src1.isReg()
+                                            ? MRI.getRegClass(Src1.getReg())
+                                            : &AMDGPU::VReg_64RegClass;
+
+    const TargetRegisterClass *Src0SubRC =
+        TRI->getSubRegClass(Src0RC, AMDGPU::sub0);
+    const TargetRegisterClass *Src1SubRC =
+        TRI->getSubRegClass(Src1RC, AMDGPU::sub1);
+
+    MachineOperand SrcReg0Sub0 = TII->buildExtractSubRegOrImm(
+        MI, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
+    MachineOperand SrcReg1Sub0 = TII->buildExtractSubRegOrImm(
+        MI, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
+
+    MachineOperand SrcReg0Sub1 = TII->buildExtractSubRegOrImm(
+        MI, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
+    MachineOperand SrcReg1Sub1 = TII->buildExtractSubRegOrImm(
+        MI, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
+
+    unsigned LoOpc = IsAdd ? AMDGPU::V_ADD_I32_e64 : AMDGPU::V_SUB_I32_e64;
+    MachineInstr *LoHalf = BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0)
+                               .addReg(CarryReg, RegState::Define)
+                               .add(SrcReg0Sub0)
+                               .add(SrcReg1Sub0)
+                               .addImm(0); // clamp bit
+
+    unsigned HiOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;
+    MachineInstr *HiHalf =
+        BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1)
+            .addReg(DeadCarryReg, RegState::Define | RegState::Dead)
+            .add(SrcReg0Sub1)
+            .add(SrcReg1Sub1)
+            .addReg(CarryReg, RegState::Kill)
+            .addImm(0); // clamp bit
+
+    BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg())
+        .addReg(DestSub0)
+        .addImm(AMDGPU::sub0)
+        .addReg(DestSub1)
+        .addImm(AMDGPU::sub1);
+    TII->legalizeOperands(*LoHalf);
+    TII->legalizeOperands(*HiHalf);
+    MI.eraseFromParent();
+    return BB;
+  }
+  case AMDGPU::S_ADD_CO_PSEUDO:
+  case AMDGPU::S_SUB_CO_PSEUDO: {
+    // This pseudo can only be selected from a uniform add/subcarry node.
+    // All the VGPR operands are therefore assumed to be splat vectors.
+    MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
+    const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
+    const SIRegisterInfo *TRI = ST.getRegisterInfo();
+    MachineBasicBlock::iterator MII = MI;
+    const DebugLoc &DL = MI.getDebugLoc();
+    MachineOperand &Dest = MI.getOperand(0);
+    MachineOperand &Src0 = MI.getOperand(2);
+    MachineOperand &Src1 = MI.getOperand(3);
+    MachineOperand &Src2 = MI.getOperand(4);
+    unsigned Opc = (MI.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO)
+                       ? AMDGPU::S_ADDC_U32
+                       : AMDGPU::S_SUBB_U32;
+    if (Src0.isReg() && TRI->isVectorRegister(MRI, Src0.getReg())) {
+      Register RegOp0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+      BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp0)
+          .addReg(Src0.getReg());
+      Src0.setReg(RegOp0);
+    }
+    if (Src1.isReg() && TRI->isVectorRegister(MRI, Src1.getReg())) {
+      Register RegOp1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+      BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp1)
+          .addReg(Src1.getReg());
+      Src1.setReg(RegOp1);
+    }
+    Register RegOp2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+    if (TRI->isVectorRegister(MRI, Src2.getReg())) {
+      BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp2)
+          .addReg(Src2.getReg());
+      Src2.setReg(RegOp2);
+    }
+
+    if (TRI->getRegSizeInBits(*MRI.getRegClass(Src2.getReg())) == 64) {
+      BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U64))
+          .addReg(Src2.getReg())
+          .addImm(0);
+    } else {
+      BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMPK_LG_U32))
+          .addReg(Src2.getReg())
+          .addImm(0);
+    }
+
+    BuildMI(*BB, MII, DL, TII->get(Opc), Dest.getReg())
+        .add(Src0)
+        .add(Src1);
+    MI.eraseFromParent();
+    return BB;
+  }
   case AMDGPU::SI_INIT_M0: {
     BuildMI(*BB, MI.getIterator(), MI.getDebugLoc(),
             TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
Index: llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -5170,6 +5170,64 @@
       splitScalarBinOpN2(Worklist, Inst, AMDGPU::S_OR_B32);
       Inst.eraseFromParent();
       continue;
+
+    // TODO: Remove this as soon as everything is ready to replace VGPR-to-SGPR
+    // copies with V_READFIRSTLANEs.
+    // S_ADD/SUB_CO_PSEUDO as well as S_UADDO/USUBO_PSEUDO can only be
+    // selected from a uniform SDNode.
+    case AMDGPU::S_ADD_CO_PSEUDO:
+    case AMDGPU::S_SUB_CO_PSEUDO: {
+      unsigned Opc = (Inst.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO)
+                         ? AMDGPU::V_ADDC_U32_e64
+                         : AMDGPU::V_SUBB_U32_e64;
+      const auto *CarryRC = RI.getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
+      Register DummyCReg = MRI.createVirtualRegister(CarryRC);
+      Register CarryReg = MRI.createVirtualRegister(CarryRC);
+      Register DestReg = MRI.createVirtualRegister(RI.getEquivalentVGPRClass(
+          MRI.getRegClass(Inst.getOperand(0).getReg())));
+      BuildMI(*MBB, &Inst, Inst.getDebugLoc(), get(AMDGPU::COPY), CarryReg)
+          .addReg(Inst.getOperand(4).getReg());
+      MachineInstr *CarryOp =
+          BuildMI(*MBB, &Inst, Inst.getDebugLoc(), get(Opc), DestReg)
+              .addReg(DummyCReg, RegState::Define | RegState::Dead)
+              .add(Inst.getOperand(2))
+              .add(Inst.getOperand(3))
+              .addReg(CarryReg, RegState::Kill)
+              .addImm(0);
+      legalizeOperands(*CarryOp);
+      MRI.replaceRegWith(Inst.getOperand(0).getReg(), DestReg);
+      addUsersToMoveToVALUWorklist(DestReg, MRI, Worklist);
+      Inst.eraseFromParent();
+    }
+      continue;
+    case AMDGPU::S_UADDO_PSEUDO:
+    case AMDGPU::S_USUBO_PSEUDO: {
+      const DebugLoc &DL = Inst.getDebugLoc();
+      MachineOperand &Dest0 = Inst.getOperand(0);
+      MachineOperand &Dest1 = Inst.getOperand(1);
+      MachineOperand &Src0 = Inst.getOperand(2);
+      MachineOperand &Src1 = Inst.getOperand(3);
+
+      unsigned Opc = (Inst.getOpcode() == AMDGPU::S_UADDO_PSEUDO)
+                         ? AMDGPU::V_ADD_I32_e64
+                         : AMDGPU::V_SUB_I32_e64;
+      const TargetRegisterClass *NewRC =
+          RI.getEquivalentVGPRClass(MRI.getRegClass(Dest0.getReg()));
+      Register DestReg = MRI.createVirtualRegister(NewRC);
+      MachineInstr *NewInstr = BuildMI(*MBB, &Inst, DL, get(Opc), DestReg)
+                                   .addReg(Dest1.getReg(), RegState::Define)
+                                   .add(Src0)
+                                   .add(Src1)
+                                   .addImm(0); // clamp bit
+
+      legalizeOperands(*NewInstr, MDT);
+
+      MRI.replaceRegWith(Dest0.getReg(), DestReg);
+      addUsersToMoveToVALUWorklist(NewInstr->getOperand(0).getReg(), MRI,
+                                   Worklist);
+      Inst.eraseFromParent();
+    }
+      continue;
     }
 
     if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END) {
@@ -5894,18 +5952,37 @@
   // Ensure that def inst defines SCC, which is still live.
   assert(Op.isReg() && Op.getReg() == AMDGPU::SCC && Op.isDef() &&
          !Op.isDead() && Op.getParent() == &SCCDefInst);
+  SmallVector<MachineInstr *, 4> CopyToDelete;
   // This assumes that all the users of SCC are in the same block
   // as the SCC def.
   for (MachineInstr &MI : // Skip the def inst itself.
       make_range(std::next(MachineBasicBlock::iterator(SCCDefInst)),
                  SCCDefInst.getParent()->end())) {
     // Check if SCC is used first.
-    if (MI.findRegisterUseOperandIdx(AMDGPU::SCC, false, &RI) != -1)
-      Worklist.insert(&MI);
+    if (MI.findRegisterUseOperandIdx(AMDGPU::SCC, false, &RI) != -1) {
+      if (MI.isCopy()) {
+        MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
+        unsigned DestReg = MI.getOperand(0).getReg();
+        SmallVector<MachineInstr *, 4> Users;
+        for (auto &User : MRI.use_nodbg_instructions(DestReg)) {
+          if ((User.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO) ||
+              (User.getOpcode() == AMDGPU::S_SUB_CO_PSEUDO)) {
+            Users.push_back(&User);
+            Worklist.insert(&User);
+          }
+        }
+        for (auto &U : Users)
+          U->getOperand(4).setReg(RI.getVCC());
+        CopyToDelete.push_back(&MI);
+      } else
+        Worklist.insert(&MI);
+    }
     // Exit if we find another SCC def.
     if (MI.findRegisterDefOperandIdx(AMDGPU::SCC, false, false, &RI) != -1)
-      return;
+      break;
   }
+  for (auto &Copy : CopyToDelete)
+    Copy->eraseFromParent();
 }
 
 const TargetRegisterClass *SIInstrInfo::getDestEquivalentVGPRClass(
Index: llvm/lib/Target/AMDGPU/SIInstructions.td
===================================================================
--- llvm/lib/Target/AMDGPU/SIInstructions.td
+++ llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -193,16 +193,27 @@
   let Constraints = "$src = $vdst";
 }
 
+let usesCustomInserter = 1, Defs = [VCC, EXEC] in {
+def V_ADD_U64_PSEUDO : VPseudoInstSI <
+  (outs VReg_64:$vdst), (ins VSrc_b64:$src0, VSrc_b64:$src1),
+  [(set VReg_64:$vdst, (getDivergentFrag<add>.ret i64:$src0, i64:$src1))]
+>;
+
+def V_SUB_U64_PSEUDO : VPseudoInstSI <
+  (outs VReg_64:$vdst), (ins VSrc_b64:$src0, VSrc_b64:$src1),
+  [(set VReg_64:$vdst, (getDivergentFrag<sub>.ret i64:$src0, i64:$src1))]
+>;
+} // End usesCustomInserter = 1, Defs = [VCC, EXEC]
 
 let usesCustomInserter = 1, Defs = [SCC] in {
 def S_ADD_U64_PSEUDO : SPseudoInstSI <
-  (outs SReg_64:$vdst), (ins SSrc_b64:$src0, SSrc_b64:$src1),
-  [(set SReg_64:$vdst, (add i64:$src0, i64:$src1))]
+  (outs SReg_64:$sdst), (ins SSrc_b64:$src0, SSrc_b64:$src1),
+  [(set SReg_64:$sdst, (UniformBinFrag<add> i64:$src0, i64:$src1))]
 >;
 
 def S_SUB_U64_PSEUDO : SPseudoInstSI <
-  (outs SReg_64:$vdst), (ins SSrc_b64:$src0, SSrc_b64:$src1),
-  [(set SReg_64:$vdst, (sub i64:$src0, i64:$src1))]
+  (outs SReg_64:$sdst), (ins SSrc_b64:$src0, SSrc_b64:$src1),
+  [(set SReg_64:$sdst, (UniformBinFrag<sub> i64:$src0, i64:$src1))]
 >;
 
 def S_ADD_U64_CO_PSEUDO : SPseudoInstSI <
@@ -212,6 +223,23 @@
 def S_SUB_U64_CO_PSEUDO : SPseudoInstSI <
   (outs SReg_64:$vdst, VOPDstS64orS32:$sdst), (ins SSrc_b64:$src0, SSrc_b64:$src1)
 >;
+
+def S_ADD_CO_PSEUDO : SPseudoInstSI <
+  (outs SReg_32:$sdst, SSrc_i1:$scc_out), (ins SSrc_b32:$src0, SSrc_b32:$src1, SSrc_i1:$scc_in)
+>;
+
+def S_SUB_CO_PSEUDO : SPseudoInstSI <
+  (outs SReg_32:$sdst, SSrc_i1:$scc_out), (ins SSrc_b32:$src0, SSrc_b32:$src1, SSrc_i1:$scc_in)
+>;
+
+def S_UADDO_PSEUDO : SPseudoInstSI <
+  (outs SReg_32:$sdst, SSrc_i1:$scc_out), (ins SSrc_b32:$src0, SSrc_b32:$src1)
+>;
+
+def S_USUBO_PSEUDO : SPseudoInstSI <
+  (outs SReg_32:$sdst, SSrc_i1:$scc_out), (ins SSrc_b32:$src0, SSrc_b32:$src1)
+>;
+
 } // End usesCustomInserter = 1, Defs = [SCC]
 
 let usesCustomInserter = 1 in {
Index: llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
+++ llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
@@ -5353,7 +5353,7 @@
 ; GCN-NEXT:    v_mul_hi_u32 v6, v0, v3
 ; GCN-NEXT:    v_mul_hi_u32 v9, v1, v2
 ; GCN-NEXT:    v_mul_lo_u32 v2, v1, v2
-; GCN-NEXT:    s_mov_b32 s4, 0x976a7376
+; GCN-NEXT:    s_movk_i32 s4, 0x11e
 ; GCN-NEXT:    v_add_i32_e32 v5, vcc, v6, v5
 ; GCN-NEXT:    v_mul_lo_u32 v6, v1, v3
 ; GCN-NEXT:    v_mul_hi_u32 v3, v1, v3
@@ -5369,7 +5369,7 @@
 ; GCN-NEXT:    v_mul_hi_u32 v5, v0, s3
 ; GCN-NEXT:    v_addc_u32_e64 v2, vcc, v1, v3, s[0:1]
 ; GCN-NEXT:    v_mul_lo_u32 v6, v2, s3
-; GCN-NEXT:    s_movk_i32 s2, 0x11f
+; GCN-NEXT:    s_mov_b32 s2, 0x976a7377
 ; GCN-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
 ; GCN-NEXT:    v_mul_lo_u32 v5, v0, s3
 ; GCN-NEXT:    v_add_i32_e32 v4, vcc, v4, v6
@@ -5377,14 +5377,14 @@
 ; GCN-NEXT:    v_mul_hi_u32 v10, v0, v4
 ; GCN-NEXT:    v_mul_hi_u32 v9, v0, v5
 ; GCN-NEXT:    v_mul_hi_u32 v11, v2, v4
-; GCN-NEXT:    s_mov_b32 s3, 0x976a7377
+; GCN-NEXT:    s_movk_i32 s3, 0x11f
 ; GCN-NEXT:    s_mov_b32 s9, s5
 ; GCN-NEXT:    v_add_i32_e32 v6, vcc, v9, v6
 ; GCN-NEXT:
v_addc_u32_e32 v9, vcc, v8, v10, vcc ; GCN-NEXT: v_mul_lo_u32 v10, v2, v5 ; GCN-NEXT: v_mul_hi_u32 v5, v2, v5 ; GCN-NEXT: v_mul_lo_u32 v2, v2, v4 -; GCN-NEXT: v_add_i32_e32 v6, vcc, v10, v6 +; GCN-NEXT: v_add_i32_e32 v6, vcc, v6, v10 ; GCN-NEXT: v_addc_u32_e32 v5, vcc, v9, v5, vcc ; GCN-NEXT: v_addc_u32_e32 v4, vcc, v11, v7, vcc ; GCN-NEXT: v_add_i32_e32 v2, vcc, v5, v2 @@ -5407,24 +5407,24 @@ ; GCN-NEXT: v_addc_u32_e32 v2, vcc, v5, v7, vcc ; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GCN-NEXT: v_addc_u32_e32 v1, vcc, v8, v2, vcc -; GCN-NEXT: v_mul_lo_u32 v2, v0, s2 -; GCN-NEXT: v_mul_hi_u32 v3, v0, s3 -; GCN-NEXT: v_mul_lo_u32 v4, v1, s3 -; GCN-NEXT: v_mov_b32_e32 v5, s2 +; GCN-NEXT: v_mul_lo_u32 v2, v0, s3 +; GCN-NEXT: v_mul_hi_u32 v3, v0, s2 +; GCN-NEXT: v_mul_lo_u32 v4, v1, s2 +; GCN-NEXT: v_mov_b32_e32 v5, s3 ; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GCN-NEXT: v_mul_lo_u32 v3, v0, s3 +; GCN-NEXT: v_mul_lo_u32 v3, v0, s2 ; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; GCN-NEXT: v_sub_i32_e32 v4, vcc, s7, v2 ; GCN-NEXT: v_sub_i32_e32 v3, vcc, s6, v3 ; GCN-NEXT: v_subb_u32_e64 v4, s[0:1], v4, v5, vcc -; GCN-NEXT: v_subrev_i32_e64 v5, s[0:1], s3, v3 +; GCN-NEXT: v_subrev_i32_e64 v5, s[0:1], s2, v3 ; GCN-NEXT: v_subbrev_u32_e64 v4, s[0:1], 0, v4, s[0:1] -; GCN-NEXT: s_movk_i32 s3, 0x11e -; GCN-NEXT: v_cmp_lt_u32_e64 s[0:1], s3, v4 +; GCN-NEXT: v_cmp_lt_u32_e64 s[0:1], s4, v4 +; GCN-NEXT: s_mov_b32 s2, 0x976a7376 ; GCN-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[0:1] -; GCN-NEXT: v_cmp_lt_u32_e64 s[0:1], s4, v5 +; GCN-NEXT: v_cmp_lt_u32_e64 s[0:1], s2, v5 ; GCN-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[0:1] -; GCN-NEXT: v_cmp_eq_u32_e64 s[0:1], s2, v4 +; GCN-NEXT: v_cmp_eq_u32_e64 s[0:1], s3, v4 ; GCN-NEXT: v_cndmask_b32_e64 v4, v6, v5, s[0:1] ; GCN-NEXT: v_add_i32_e64 v5, s[0:1], 2, v0 ; GCN-NEXT: v_addc_u32_e64 v6, s[0:1], 0, v1, s[0:1] @@ -5434,11 +5434,11 @@ ; GCN-NEXT: v_cndmask_b32_e64 v4, v8, v6, s[0:1] ; GCN-NEXT: v_mov_b32_e32 v6, s7 ; GCN-NEXT: v_subb_u32_e32 v2, vcc, v6, v2, vcc -; GCN-NEXT: v_cmp_lt_u32_e32 vcc, s3, v2 +; GCN-NEXT: v_cmp_lt_u32_e32 vcc, s4, v2 ; GCN-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc -; GCN-NEXT: v_cmp_lt_u32_e32 vcc, s4, v3 +; GCN-NEXT: v_cmp_lt_u32_e32 vcc, s2, v3 ; GCN-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, s2, v2 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, s3, v2 ; GCN-NEXT: v_cndmask_b32_e32 v2, v6, v3, vcc ; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 ; GCN-NEXT: v_cndmask_b32_e64 v2, v7, v5, s[0:1] @@ -5599,7 +5599,7 @@ ; GCN-NEXT: v_mul_lo_u32 v10, v3, v8 ; GCN-NEXT: v_mul_hi_u32 v8, v3, v8 ; GCN-NEXT: v_mul_lo_u32 v3, v3, v5 -; GCN-NEXT: v_add_i32_e32 v6, vcc, v10, v6 +; GCN-NEXT: v_add_i32_e32 v6, vcc, v6, v10 ; GCN-NEXT: v_addc_u32_e32 v6, vcc, v9, v8, vcc ; GCN-NEXT: v_addc_u32_e32 v5, vcc, v11, v2, vcc ; GCN-NEXT: v_add_i32_e32 v3, vcc, v6, v3 @@ -5725,9 +5725,8 @@ ; GCN-NEXT: v_mul_lo_u32 v2, v0, s2 ; GCN-NEXT: v_mul_hi_u32 v3, v0, s3 ; GCN-NEXT: v_mul_lo_u32 v4, v1, s3 -; GCN-NEXT: s_mov_b32 s12, 0x9761f7c9 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s8, s4 +; GCN-NEXT: s_movk_i32 s12, 0x11f +; GCN-NEXT: s_mov_b32 s13, 0x9761f7c9 ; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GCN-NEXT: v_mul_lo_u32 v3, v0, s3 ; GCN-NEXT: v_add_i32_e32 v2, vcc, v4, v2 @@ -5736,12 +5735,13 @@ ; GCN-NEXT: v_mul_hi_u32 v6, v0, v3 ; GCN-NEXT: v_mul_hi_u32 v9, v1, v2 ; GCN-NEXT: v_mul_lo_u32 v2, v1, v2 -; GCN-NEXT: s_movk_i32 s4, 0x11f +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_mov_b32 s9, s5 ; GCN-NEXT: v_add_i32_e32 v5, vcc, v6, v5 ; 
GCN-NEXT: v_mul_lo_u32 v6, v1, v3 ; GCN-NEXT: v_mul_hi_u32 v3, v1, v3 ; GCN-NEXT: v_addc_u32_e32 v4, vcc, v8, v4, vcc -; GCN-NEXT: s_mov_b32 s9, s5 +; GCN-NEXT: s_movk_i32 s5, 0x11e ; GCN-NEXT: v_add_i32_e32 v5, vcc, v6, v5 ; GCN-NEXT: v_addc_u32_e32 v3, vcc, v4, v3, vcc ; GCN-NEXT: v_addc_u32_e32 v4, vcc, v9, v7, vcc @@ -5752,7 +5752,7 @@ ; GCN-NEXT: v_mul_hi_u32 v5, v0, s3 ; GCN-NEXT: v_addc_u32_e64 v2, vcc, v1, v3, s[0:1] ; GCN-NEXT: v_mul_lo_u32 v6, v2, s3 -; GCN-NEXT: s_movk_i32 s5, 0x11e +; GCN-NEXT: s_mov_b32 s8, s4 ; GCN-NEXT: v_add_i32_e32 v4, vcc, v5, v4 ; GCN-NEXT: v_mul_lo_u32 v5, v0, s3 ; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v6 @@ -5760,14 +5760,15 @@ ; GCN-NEXT: v_mul_hi_u32 v10, v0, v4 ; GCN-NEXT: v_mul_hi_u32 v9, v0, v5 ; GCN-NEXT: v_mul_hi_u32 v11, v2, v4 +; GCN-NEXT: s_mov_b32 s4, 0x9761f7c8 ; GCN-NEXT: s_mov_b32 s11, 0xf000 -; GCN-NEXT: s_mov_b32 s10, -1 ; GCN-NEXT: v_add_i32_e32 v6, vcc, v9, v6 ; GCN-NEXT: v_addc_u32_e32 v9, vcc, v8, v10, vcc ; GCN-NEXT: v_mul_lo_u32 v10, v2, v5 ; GCN-NEXT: v_mul_hi_u32 v5, v2, v5 ; GCN-NEXT: v_mul_lo_u32 v2, v2, v4 -; GCN-NEXT: v_add_i32_e32 v6, vcc, v10, v6 +; GCN-NEXT: s_mov_b32 s10, -1 +; GCN-NEXT: v_add_i32_e32 v6, vcc, v6, v10 ; GCN-NEXT: v_addc_u32_e32 v5, vcc, v9, v5, vcc ; GCN-NEXT: v_addc_u32_e32 v4, vcc, v11, v7, vcc ; GCN-NEXT: v_add_i32_e32 v2, vcc, v5, v2 @@ -5790,26 +5791,25 @@ ; GCN-NEXT: v_addc_u32_e32 v2, vcc, v5, v7, vcc ; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GCN-NEXT: v_addc_u32_e32 v1, vcc, v8, v2, vcc -; GCN-NEXT: v_mul_lo_u32 v2, v0, s4 -; GCN-NEXT: v_mul_hi_u32 v3, v0, s12 -; GCN-NEXT: v_mul_lo_u32 v1, v1, s12 -; GCN-NEXT: v_mul_lo_u32 v0, v0, s12 +; GCN-NEXT: v_mul_lo_u32 v2, v0, s12 +; GCN-NEXT: v_mul_hi_u32 v3, v0, s13 +; GCN-NEXT: v_mul_lo_u32 v1, v1, s13 +; GCN-NEXT: v_mul_lo_u32 v0, v0, s13 ; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GCN-NEXT: v_add_i32_e32 v1, vcc, v2, v1 ; GCN-NEXT: v_sub_i32_e32 v2, vcc, s7, v1 +; GCN-NEXT: v_mov_b32_e32 v3, s12 ; GCN-NEXT: v_sub_i32_e32 v0, vcc, s6, v0 -; GCN-NEXT: v_mov_b32_e32 v3, s4 ; GCN-NEXT: v_subb_u32_e64 v2, s[0:1], v2, v3, vcc -; GCN-NEXT: v_subrev_i32_e64 v4, s[0:1], s12, v0 +; GCN-NEXT: v_subrev_i32_e64 v4, s[0:1], s13, v0 ; GCN-NEXT: v_subbrev_u32_e64 v5, s[2:3], 0, v2, s[0:1] ; GCN-NEXT: v_cmp_lt_u32_e64 s[2:3], s5, v5 -; GCN-NEXT: s_mov_b32 s6, 0x9761f7c8 ; GCN-NEXT: v_subb_u32_e64 v2, s[0:1], v2, v3, s[0:1] ; GCN-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[2:3] -; GCN-NEXT: v_cmp_lt_u32_e64 s[2:3], s6, v4 -; GCN-NEXT: v_subrev_i32_e64 v3, s[0:1], s12, v4 +; GCN-NEXT: v_cmp_lt_u32_e64 s[2:3], s4, v4 +; GCN-NEXT: v_subrev_i32_e64 v3, s[0:1], s13, v4 ; GCN-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[2:3] -; GCN-NEXT: v_cmp_eq_u32_e64 s[2:3], s4, v5 +; GCN-NEXT: v_cmp_eq_u32_e64 s[2:3], s12, v5 ; GCN-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[2:3] ; GCN-NEXT: v_subbrev_u32_e64 v2, s[0:1], 0, v2, s[0:1] ; GCN-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v6 @@ -5818,9 +5818,9 @@ ; GCN-NEXT: v_subb_u32_e32 v1, vcc, v5, v1, vcc ; GCN-NEXT: v_cmp_lt_u32_e32 vcc, s5, v1 ; GCN-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc -; GCN-NEXT: v_cmp_lt_u32_e32 vcc, s6, v0 +; GCN-NEXT: v_cmp_lt_u32_e32 vcc, s4, v0 ; GCN-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, s4, v1 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, s12, v1 ; GCN-NEXT: v_cndmask_b32_e32 v5, v5, v6, vcc ; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 ; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc @@ -6026,7 +6026,7 @@ ; GCN-NEXT: v_add_i32_e32 v10, vcc, v11, v10 ; GCN-NEXT: v_addc_u32_e32 v11, vcc, v8, v12, vcc ; GCN-NEXT: 
v_mul_lo_u32 v2, v2, v4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, v5, v10 +; GCN-NEXT: v_add_i32_e32 v5, vcc, v10, v5 ; GCN-NEXT: v_addc_u32_e32 v5, vcc, v11, v9, vcc ; GCN-NEXT: v_addc_u32_e32 v4, vcc, v6, v7, vcc ; GCN-NEXT: v_add_i32_e32 v2, vcc, v5, v2 @@ -6199,7 +6199,7 @@ ; GCN-NEXT: v_add_i32_e32 v10, vcc, v11, v10 ; GCN-NEXT: v_addc_u32_e32 v11, vcc, 0, v12, vcc ; GCN-NEXT: v_mul_lo_u32 v2, v2, v5 -; GCN-NEXT: v_add_i32_e32 v7, vcc, v7, v10 +; GCN-NEXT: v_add_i32_e32 v7, vcc, v10, v7 ; GCN-NEXT: v_addc_u32_e32 v7, vcc, v11, v9, vcc ; GCN-NEXT: v_addc_u32_e32 v5, vcc, v8, v4, vcc ; GCN-NEXT: v_add_i32_e32 v2, vcc, v7, v2 @@ -6387,7 +6387,7 @@ ; GCN-NEXT: v_add_i32_e32 v10, vcc, v11, v10 ; GCN-NEXT: v_addc_u32_e32 v11, vcc, 0, v12, vcc ; GCN-NEXT: v_mul_lo_u32 v2, v2, v5 -; GCN-NEXT: v_add_i32_e32 v7, vcc, v7, v10 +; GCN-NEXT: v_add_i32_e32 v7, vcc, v10, v7 ; GCN-NEXT: v_addc_u32_e32 v7, vcc, v11, v9, vcc ; GCN-NEXT: v_addc_u32_e32 v5, vcc, v8, v4, vcc ; GCN-NEXT: v_add_i32_e32 v2, vcc, v7, v2 @@ -6542,7 +6542,7 @@ ; GCN-NEXT: v_add_i32_e32 v10, vcc, v11, v10 ; GCN-NEXT: v_addc_u32_e32 v11, vcc, 0, v12, vcc ; GCN-NEXT: v_mul_lo_u32 v2, v2, v5 -; GCN-NEXT: v_add_i32_e32 v7, vcc, v7, v10 +; GCN-NEXT: v_add_i32_e32 v7, vcc, v10, v7 ; GCN-NEXT: v_addc_u32_e32 v7, vcc, v11, v9, vcc ; GCN-NEXT: v_addc_u32_e32 v5, vcc, v8, v4, vcc ; GCN-NEXT: v_add_i32_e32 v2, vcc, v7, v2 @@ -6665,7 +6665,7 @@ ; GCN-NEXT: v_add_i32_e32 v12, vcc, v13, v12 ; GCN-NEXT: v_addc_u32_e32 v13, vcc, 0, v14, vcc ; GCN-NEXT: v_mul_lo_u32 v3, v3, v8 -; GCN-NEXT: v_add_i32_e32 v9, vcc, v9, v12 +; GCN-NEXT: v_add_i32_e32 v9, vcc, v12, v9 ; GCN-NEXT: v_addc_u32_e32 v9, vcc, v13, v11, vcc ; GCN-NEXT: v_addc_u32_e32 v8, vcc, v10, v4, vcc ; GCN-NEXT: v_add_i32_e32 v3, vcc, v9, v3 @@ -6807,7 +6807,7 @@ ; GCN-NEXT: v_add_i32_e32 v10, vcc, v11, v10 ; GCN-NEXT: v_addc_u32_e32 v11, vcc, v8, v12, vcc ; GCN-NEXT: v_mul_lo_u32 v2, v2, v4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, v5, v10 +; GCN-NEXT: v_add_i32_e32 v5, vcc, v10, v5 ; GCN-NEXT: v_addc_u32_e32 v5, vcc, v11, v9, vcc ; GCN-NEXT: v_addc_u32_e32 v4, vcc, v6, v7, vcc ; GCN-NEXT: v_add_i32_e32 v2, vcc, v5, v2 @@ -6981,7 +6981,7 @@ ; GCN-NEXT: v_add_i32_e32 v10, vcc, v11, v10 ; GCN-NEXT: v_addc_u32_e32 v11, vcc, 0, v12, vcc ; GCN-NEXT: v_mul_lo_u32 v2, v2, v5 -; GCN-NEXT: v_add_i32_e32 v7, vcc, v7, v10 +; GCN-NEXT: v_add_i32_e32 v7, vcc, v10, v7 ; GCN-NEXT: v_addc_u32_e32 v7, vcc, v11, v9, vcc ; GCN-NEXT: v_addc_u32_e32 v5, vcc, v8, v4, vcc ; GCN-NEXT: v_add_i32_e32 v2, vcc, v7, v2 @@ -7190,7 +7190,7 @@ ; GCN-NEXT: v_add_i32_e32 v10, vcc, v11, v10 ; GCN-NEXT: v_addc_u32_e32 v11, vcc, 0, v12, vcc ; GCN-NEXT: v_mul_lo_u32 v2, v2, v5 -; GCN-NEXT: v_add_i32_e32 v7, vcc, v7, v10 +; GCN-NEXT: v_add_i32_e32 v7, vcc, v10, v7 ; GCN-NEXT: v_addc_u32_e32 v7, vcc, v11, v9, vcc ; GCN-NEXT: v_addc_u32_e32 v5, vcc, v8, v4, vcc ; GCN-NEXT: v_add_i32_e32 v2, vcc, v7, v2 @@ -7307,7 +7307,7 @@ ; GCN-NEXT: v_add_i32_e32 v12, vcc, v13, v12 ; GCN-NEXT: v_addc_u32_e32 v13, vcc, 0, v14, vcc ; GCN-NEXT: v_mul_lo_u32 v3, v3, v8 -; GCN-NEXT: v_add_i32_e32 v9, vcc, v9, v12 +; GCN-NEXT: v_add_i32_e32 v9, vcc, v12, v9 ; GCN-NEXT: v_addc_u32_e32 v9, vcc, v13, v11, vcc ; GCN-NEXT: v_addc_u32_e32 v8, vcc, v10, v4, vcc ; GCN-NEXT: v_add_i32_e32 v3, vcc, v9, v3 Index: llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll +++ llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll @@ -609,16 +609,16 @@ ; VI-NEXT: v_addc_u32_e32 v5, vcc, 
0, v1, vcc ; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v0 ; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc -; VI-NEXT: flat_load_ubyte v8, v[2:3] -; VI-NEXT: flat_load_ubyte v2, v[4:5] -; VI-NEXT: flat_load_ubyte v3, v[6:7] +; VI-NEXT: flat_load_ubyte v4, v[4:5] +; VI-NEXT: flat_load_ubyte v5, v[6:7] +; VI-NEXT: flat_load_ubyte v6, v[2:3] ; VI-NEXT: flat_load_ubyte v0, v[0:1] ; VI-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3) -; VI-NEXT: v_cvt_f32_ubyte2_e32 v1, v8 +; VI-NEXT: v_cvt_f32_ubyte0_e32 v2, v4 ; VI-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) -; VI-NEXT: v_cvt_f32_ubyte0_e32 v2, v2 +; VI-NEXT: v_cvt_f32_ubyte0_e32 v3, v5 ; VI-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) -; VI-NEXT: v_cvt_f32_ubyte0_e32 v3, v3 +; VI-NEXT: v_cvt_f32_ubyte2_e32 v1, v6 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 @@ -775,35 +775,35 @@ ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc ; VI-NEXT: v_add_u32_e32 v4, vcc, 2, v0 ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc -; VI-NEXT: v_add_u32_e32 v6, vcc, 6, v0 +; VI-NEXT: flat_load_ubyte v12, v[4:5] +; VI-NEXT: v_add_u32_e32 v4, vcc, 6, v0 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; VI-NEXT: v_add_u32_e32 v6, vcc, 4, v0 ; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc -; VI-NEXT: v_add_u32_e32 v8, vcc, 4, v0 +; VI-NEXT: v_add_u32_e32 v8, vcc, 5, v0 ; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc -; VI-NEXT: v_add_u32_e32 v10, vcc, 5, v0 +; VI-NEXT: v_add_u32_e32 v10, vcc, 1, v0 ; VI-NEXT: v_addc_u32_e32 v11, vcc, 0, v1, vcc -; VI-NEXT: v_add_u32_e32 v12, vcc, 1, v0 -; VI-NEXT: v_addc_u32_e32 v13, vcc, 0, v1, vcc +; VI-NEXT: flat_load_ubyte v8, v[8:9] +; VI-NEXT: flat_load_ubyte v9, v[10:11] ; VI-NEXT: flat_load_ubyte v6, v[6:7] -; VI-NEXT: flat_load_ubyte v7, v[8:9] -; VI-NEXT: flat_load_ubyte v8, v[10:11] -; VI-NEXT: flat_load_ubyte v9, v[12:13] +; VI-NEXT: flat_load_ubyte v7, v[4:5] +; VI-NEXT: flat_load_ubyte v2, v[2:3] ; VI-NEXT: flat_load_ubyte v0, v[0:1] -; VI-NEXT: flat_load_ubyte v1, v[2:3] -; VI-NEXT: flat_load_ubyte v2, v[4:5] -; VI-NEXT: s_waitcnt vmcnt(6) lgkmcnt(6) -; VI-NEXT: v_cvt_f32_ubyte0_e32 v6, v6 ; VI-NEXT: s_waitcnt vmcnt(5) lgkmcnt(5) -; VI-NEXT: v_cvt_f32_ubyte0_e32 v4, v7 -; VI-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4) ; VI-NEXT: v_cvt_f32_ubyte2_e32 v5, v8 +; VI-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4) +; VI-NEXT: v_cvt_f32_ubyte2_e32 v1, v9 +; VI-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3) +; VI-NEXT: v_cvt_f32_ubyte0_e32 v4, v6 ; VI-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) -; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 +; VI-NEXT: v_cvt_f32_ubyte0_e32 v6, v7 ; VI-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v1 -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; VI-NEXT: v_or_b32_sdwa v2, v2, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_cvt_f32_ubyte3_e32 v3, v2 -; VI-NEXT: v_cvt_f32_ubyte2_e32 v1, v9 +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 ; VI-NEXT: v_cvt_f32_ubyte2_e32 v2, v2 ; VI-NEXT: buffer_store_dwordx3 v[4:6], off, s[4:7], 0 offset:16 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 @@ -1052,19 +1052,18 @@ ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc ; VI-NEXT: v_add_u32_e32 v6, vcc, 1, v0 ; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc +; VI-NEXT: flat_load_ubyte v4, v[4:5] +; VI-NEXT: flat_load_ubyte v5, v[6:7] ; VI-NEXT: flat_load_ubyte v2, v[2:3] -; 
VI-NEXT: flat_load_ubyte v3, v[4:5] -; VI-NEXT: flat_load_ubyte v4, v[6:7] ; VI-NEXT: flat_load_ubyte v0, v[0:1] -; VI-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3) +; VI-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v2 -; VI-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) -; VI-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_cvt_f32_ubyte3_e32 v3, v1 -; VI-NEXT: v_cvt_f32_ubyte2_e32 v2, v1 +; VI-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 -; VI-NEXT: v_cvt_f32_ubyte2_e32 v1, v4 +; VI-NEXT: v_cvt_f32_ubyte3_e32 v3, v1 +; VI-NEXT: v_cvt_f32_ubyte2_e32 v2, v1 +; VI-NEXT: v_cvt_f32_ubyte2_e32 v1, v5 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; VI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() Index: llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll +++ llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll @@ -107,20 +107,21 @@ ; GFX7-ALIGNED-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc ; GFX7-ALIGNED-NEXT: v_add_i32_e32 v4, vcc, 1, v0 ; GFX7-ALIGNED-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc -; GFX7-ALIGNED-NEXT: flat_load_ubyte v6, v[0:1] -; GFX7-ALIGNED-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GFX7-ALIGNED-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX7-ALIGNED-NEXT: flat_load_ubyte v2, v[2:3] -; GFX7-ALIGNED-NEXT: flat_load_ubyte v3, v[4:5] +; GFX7-ALIGNED-NEXT: v_add_i32_e32 v6, vcc, 3, v0 +; GFX7-ALIGNED-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc ; GFX7-ALIGNED-NEXT: flat_load_ubyte v0, v[0:1] +; GFX7-ALIGNED-NEXT: flat_load_ubyte v1, v[6:7] +; GFX7-ALIGNED-NEXT: flat_load_ubyte v4, v[4:5] +; GFX7-ALIGNED-NEXT: flat_load_ubyte v2, v[2:3] +; GFX7-ALIGNED-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) +; GFX7-ALIGNED-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; GFX7-ALIGNED-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) -; GFX7-ALIGNED-NEXT: v_lshlrev_b32_e32 v1, 8, v3 +; GFX7-ALIGNED-NEXT: v_lshlrev_b32_e32 v3, 8, v4 ; GFX7-ALIGNED-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-ALIGNED-NEXT: v_lshlrev_b32_e32 v0, 8, v0 -; GFX7-ALIGNED-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX7-ALIGNED-NEXT: v_or_b32_e32 v1, v1, v6 -; GFX7-ALIGNED-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-ALIGNED-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX7-ALIGNED-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX7-ALIGNED-NEXT: v_or_b32_e32 v0, v3, v0 +; GFX7-ALIGNED-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-ALIGNED-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-ALIGNED-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-UNALIGNED-LABEL: global_load_2xi16_align1: Index: llvm/test/CodeGen/AMDGPU/max.i16.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/max.i16.ll +++ llvm/test/CodeGen/AMDGPU/max.i16.ll @@ -137,21 +137,21 @@ ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc ; VI-NEXT: v_add_u32_e32 v6, vcc, 4, v0 ; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc -; VI-NEXT: flat_load_ushort v8, v[6:7] -; VI-NEXT: flat_load_dword v9, v[0:1] +; VI-NEXT: flat_load_ushort v6, v[6:7] +; VI-NEXT: flat_load_dword v7, v[0:1] ; VI-NEXT: v_add_u32_e32 v0, vcc, 4, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc ; VI-NEXT: flat_load_ushort v0, v[0:1] -; VI-NEXT: flat_load_dword v1, v[2:3] -; VI-NEXT: v_add_u32_e32 v6, vcc, 4, v4 -; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v5, vcc +; VI-NEXT: flat_load_dword v8, v[2:3] 
+; VI-NEXT: v_add_u32_e32 v2, vcc, 4, v4 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc ; VI-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) -; VI-NEXT: v_max_i16_e32 v0, v8, v0 +; VI-NEXT: v_max_i16_e32 v0, v6, v0 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_max_i16_e32 v2, v9, v1 -; VI-NEXT: v_max_i16_sdwa v1, v9, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NEXT: v_or_b32_e32 v1, v2, v1 -; VI-NEXT: flat_store_short v[6:7], v0 +; VI-NEXT: v_max_i16_e32 v1, v7, v8 +; VI-NEXT: v_max_i16_sdwa v7, v7, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_or_b32_e32 v1, v1, v7 +; VI-NEXT: flat_store_short v[2:3], v0 ; VI-NEXT: flat_store_dword v[4:5], v1 ; VI-NEXT: s_endpgm ; Index: llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll +++ llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll @@ -17,11 +17,11 @@ ; ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048 -; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-4096 -; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048 +; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048 ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} +; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-4096 ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048 ; ; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} @@ -220,16 +220,16 @@ ; GFX8: flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}] ; GFX8: flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}] ; +; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off{{$}} ; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:1024 ; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:2048 ; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:3072 ; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off{{$}} ; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:1024 -; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off{{$}} -; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:1024 ; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:2048 ; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:3072 ; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off{{$}} +; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:1024 ; ; GFX10: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off{{$}} ; GFX10: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:1024 @@ -299,8 +299,8 @@ ; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] ; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] ; -; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-4096 ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} +; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], 
off offset:-4096 ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048 ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} ; @@ -345,10 +345,10 @@ ; GFX8: flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}] ; GFX8: flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}] ; +; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off{{$}} ; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:2048 ; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:3072 ; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off{{$}} -; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off{{$}} ; ; GFX10: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off{{$}} ; GFX10: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:-2048 @@ -456,8 +456,8 @@ ; ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048 -; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048 +; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048 ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} Index: llvm/test/CodeGen/AMDGPU/sdiv64.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/sdiv64.ll +++ llvm/test/CodeGen/AMDGPU/sdiv64.ll @@ -69,7 +69,7 @@ ; GCN-NEXT: v_add_i32_e32 v10, vcc, v11, v10 ; GCN-NEXT: v_addc_u32_e32 v11, vcc, v7, v12, vcc ; GCN-NEXT: v_mul_lo_u32 v3, v3, v5 -; GCN-NEXT: v_add_i32_e32 v6, vcc, v6, v10 +; GCN-NEXT: v_add_i32_e32 v6, vcc, v10, v6 ; GCN-NEXT: v_addc_u32_e32 v6, vcc, v11, v9, vcc ; GCN-NEXT: v_addc_u32_e32 v5, vcc, v8, v1, vcc ; GCN-NEXT: v_add_i32_e32 v3, vcc, v6, v3 @@ -422,15 +422,15 @@ ; GCN-IR-NEXT: v_mov_b32_e32 v1, v5 ; GCN-IR-NEXT: v_cndmask_b32_e64 v7, v10, 0, s[6:7] ; GCN-IR-NEXT: s_and_b64 s[4:5], s[8:9], s[4:5] -; GCN-IR-NEXT: v_mov_b32_e32 v17, v18 +; GCN-IR-NEXT: v_mov_b32_e32 v15, v18 ; GCN-IR-NEXT: v_cndmask_b32_e64 v0, v9, 0, s[6:7] ; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] ; GCN-IR-NEXT: s_cbranch_execz BB1_6 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 -; GCN-IR-NEXT: v_add_i32_e32 v15, vcc, 1, v11 -; GCN-IR-NEXT: v_addc_u32_e32 v16, vcc, 0, v12, vcc +; GCN-IR-NEXT: v_add_i32_e32 v16, vcc, 1, v11 +; GCN-IR-NEXT: v_addc_u32_e32 v17, vcc, 0, v12, vcc ; GCN-IR-NEXT: v_sub_i32_e64 v0, s[4:5], 63, v11 -; GCN-IR-NEXT: v_cmp_ge_u64_e32 vcc, v[15:16], v[11:12] +; GCN-IR-NEXT: v_cmp_ge_u64_e32 vcc, v[16:17], v[11:12] ; GCN-IR-NEXT: v_mov_b32_e32 v11, 0 ; GCN-IR-NEXT: v_lshl_b64 v[7:8], v[9:10], v0 ; GCN-IR-NEXT: s_mov_b64 s[8:9], 0 @@ -440,39 +440,39 @@ ; GCN-IR-NEXT: s_cbranch_execz BB1_5 ; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader ; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, -1, v2 -; GCN-IR-NEXT: v_lshr_b64 v[15:16], v[9:10], v15 +; GCN-IR-NEXT: v_lshr_b64 v[16:17], v[9:10], v16 ; GCN-IR-NEXT: v_addc_u32_e32 v9, vcc, -1, v3, vcc ; GCN-IR-NEXT: v_not_b32_e32 v10, v13 ; GCN-IR-NEXT: v_not_b32_e32 v11, v18 ; GCN-IR-NEXT: v_add_i32_e32 v13, vcc, v10, v14 -; GCN-IR-NEXT: v_addc_u32_e32 v14, vcc, v11, v17, vcc -; GCN-IR-NEXT: v_mov_b32_e32 v17, 0 ; GCN-IR-NEXT: v_mov_b32_e32 v18, 0 +; 
GCN-IR-NEXT: v_addc_u32_e32 v14, vcc, v11, v15, vcc +; GCN-IR-NEXT: v_mov_b32_e32 v19, 0 ; GCN-IR-NEXT: BB1_3: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN-IR-NEXT: v_lshl_b64 v[15:16], v[15:16], 1 +; GCN-IR-NEXT: v_lshl_b64 v[16:17], v[16:17], 1 ; GCN-IR-NEXT: v_lshrrev_b32_e32 v10, 31, v8 -; GCN-IR-NEXT: v_or_b32_e32 v10, v15, v10 +; GCN-IR-NEXT: v_or_b32_e32 v10, v16, v10 ; GCN-IR-NEXT: v_lshl_b64 v[7:8], v[7:8], 1 ; GCN-IR-NEXT: v_sub_i32_e32 v11, vcc, v0, v10 -; GCN-IR-NEXT: v_subb_u32_e32 v11, vcc, v9, v16, vcc -; GCN-IR-NEXT: v_or_b32_e32 v7, v17, v7 -; GCN-IR-NEXT: v_add_i32_e32 v17, vcc, 1, v13 +; GCN-IR-NEXT: v_subb_u32_e32 v11, vcc, v9, v17, vcc +; GCN-IR-NEXT: v_or_b32_e32 v7, v18, v7 +; GCN-IR-NEXT: v_add_i32_e32 v18, vcc, 1, v13 ; GCN-IR-NEXT: v_ashrrev_i32_e32 v15, 31, v11 -; GCN-IR-NEXT: v_or_b32_e32 v8, v18, v8 -; GCN-IR-NEXT: v_addc_u32_e32 v18, vcc, 0, v14, vcc -; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, v[17:18], v[13:14] -; GCN-IR-NEXT: v_mov_b32_e32 v13, v17 +; GCN-IR-NEXT: v_or_b32_e32 v8, v19, v8 +; GCN-IR-NEXT: v_addc_u32_e32 v19, vcc, 0, v14, vcc +; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, v[18:19], v[13:14] +; GCN-IR-NEXT: v_mov_b32_e32 v13, v18 ; GCN-IR-NEXT: v_mov_b32_e32 v12, 0 ; GCN-IR-NEXT: v_and_b32_e32 v11, 1, v15 -; GCN-IR-NEXT: v_and_b32_e32 v19, v15, v3 +; GCN-IR-NEXT: v_and_b32_e32 v20, v15, v3 ; GCN-IR-NEXT: v_and_b32_e32 v15, v15, v2 -; GCN-IR-NEXT: v_sub_i32_e64 v15, s[4:5], v10, v15 -; GCN-IR-NEXT: v_mov_b32_e32 v14, v18 -; GCN-IR-NEXT: v_mov_b32_e32 v18, v12 -; GCN-IR-NEXT: v_subb_u32_e64 v16, s[4:5], v16, v19, s[4:5] +; GCN-IR-NEXT: v_sub_i32_e64 v16, s[4:5], v10, v15 +; GCN-IR-NEXT: v_mov_b32_e32 v14, v19 +; GCN-IR-NEXT: v_mov_b32_e32 v19, v12 +; GCN-IR-NEXT: v_subb_u32_e64 v17, s[4:5], v17, v20, s[4:5] ; GCN-IR-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GCN-IR-NEXT: v_mov_b32_e32 v17, v11 +; GCN-IR-NEXT: v_mov_b32_e32 v18, v11 ; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GCN-IR-NEXT: s_cbranch_execnz BB1_3 ; GCN-IR-NEXT: ; %bb.4: ; %Flow @@ -1193,7 +1193,7 @@ ; GCN-NEXT: v_add_i32_e32 v10, vcc, v11, v10 ; GCN-NEXT: v_addc_u32_e32 v11, vcc, v2, v12, vcc ; GCN-NEXT: v_mul_lo_u32 v4, v4, v6 -; GCN-NEXT: v_add_i32_e32 v7, vcc, v7, v10 +; GCN-NEXT: v_add_i32_e32 v7, vcc, v10, v7 ; GCN-NEXT: v_addc_u32_e32 v7, vcc, v11, v9, vcc ; GCN-NEXT: v_addc_u32_e32 v6, vcc, v8, v1, vcc ; GCN-NEXT: v_add_i32_e32 v4, vcc, v7, v4 @@ -1499,21 +1499,21 @@ ; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] ; GCN-IR-NEXT: s_cbranch_execz BB11_6 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 -; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, 1, v4 -; GCN-IR-NEXT: v_addc_u32_e32 v7, vcc, 0, v5, vcc -; GCN-IR-NEXT: v_cmp_ge_u64_e32 vcc, v[6:7], v[4:5] +; GCN-IR-NEXT: v_add_i32_e32 v8, vcc, 1, v4 +; GCN-IR-NEXT: v_addc_u32_e32 v9, vcc, 0, v5, vcc +; GCN-IR-NEXT: v_cmp_ge_u64_e32 vcc, v[8:9], v[4:5] ; GCN-IR-NEXT: v_sub_i32_e64 v4, s[4:5], 63, v4 -; GCN-IR-NEXT: v_mov_b32_e32 v8, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v6, 0 ; GCN-IR-NEXT: v_lshl_b64 v[4:5], 24, v4 ; GCN-IR-NEXT: s_mov_b64 s[8:9], 0 -; GCN-IR-NEXT: v_mov_b32_e32 v9, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v7, 0 ; GCN-IR-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN-IR-NEXT: s_xor_b64 s[10:11], exec, s[4:5] ; GCN-IR-NEXT: s_cbranch_execz BB11_5 ; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader -; GCN-IR-NEXT: v_lshr_b64 v[12:13], 24, v6 -; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, -1, v0 -; GCN-IR-NEXT: v_addc_u32_e32 v7, vcc, -1, v1, vcc +; GCN-IR-NEXT: v_lshr_b64 v[12:13], 24, v8 +; GCN-IR-NEXT: v_add_i32_e32 v8, vcc, -1, v0 +; GCN-IR-NEXT: 
v_addc_u32_e32 v9, vcc, -1, v1, vcc ; GCN-IR-NEXT: v_sub_i32_e32 v10, vcc, 58, v10 ; GCN-IR-NEXT: v_mov_b32_e32 v14, 0 ; GCN-IR-NEXT: v_subb_u32_e32 v11, vcc, 0, v11, vcc @@ -1521,28 +1521,28 @@ ; GCN-IR-NEXT: BB11_3: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN-IR-NEXT: v_lshl_b64 v[12:13], v[12:13], 1 -; GCN-IR-NEXT: v_lshrrev_b32_e32 v8, 31, v5 -; GCN-IR-NEXT: v_or_b32_e32 v12, v12, v8 +; GCN-IR-NEXT: v_lshrrev_b32_e32 v6, 31, v5 +; GCN-IR-NEXT: v_or_b32_e32 v12, v12, v6 ; GCN-IR-NEXT: v_lshl_b64 v[4:5], v[4:5], 1 -; GCN-IR-NEXT: v_sub_i32_e32 v8, vcc, v6, v12 -; GCN-IR-NEXT: v_subb_u32_e32 v8, vcc, v7, v13, vcc +; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, v8, v12 +; GCN-IR-NEXT: v_subb_u32_e32 v6, vcc, v9, v13, vcc ; GCN-IR-NEXT: v_or_b32_e32 v4, v14, v4 -; GCN-IR-NEXT: v_ashrrev_i32_e32 v14, 31, v8 +; GCN-IR-NEXT: v_ashrrev_i32_e32 v14, 31, v6 ; GCN-IR-NEXT: v_and_b32_e32 v17, v14, v0 -; GCN-IR-NEXT: v_and_b32_e32 v8, 1, v14 +; GCN-IR-NEXT: v_and_b32_e32 v6, 1, v14 ; GCN-IR-NEXT: v_and_b32_e32 v16, v14, v1 ; GCN-IR-NEXT: v_add_i32_e32 v14, vcc, 1, v10 ; GCN-IR-NEXT: v_or_b32_e32 v5, v15, v5 ; GCN-IR-NEXT: v_addc_u32_e32 v15, vcc, 0, v11, vcc ; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, v[14:15], v[10:11] ; GCN-IR-NEXT: v_mov_b32_e32 v10, v14 -; GCN-IR-NEXT: v_mov_b32_e32 v9, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v7, 0 ; GCN-IR-NEXT: v_sub_i32_e64 v12, s[4:5], v12, v17 ; GCN-IR-NEXT: v_mov_b32_e32 v11, v15 -; GCN-IR-NEXT: v_mov_b32_e32 v15, v9 +; GCN-IR-NEXT: v_mov_b32_e32 v15, v7 ; GCN-IR-NEXT: v_subb_u32_e64 v13, s[4:5], v13, v16, s[4:5] ; GCN-IR-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GCN-IR-NEXT: v_mov_b32_e32 v14, v8 +; GCN-IR-NEXT: v_mov_b32_e32 v14, v6 ; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GCN-IR-NEXT: s_cbranch_execnz BB11_3 ; GCN-IR-NEXT: ; %bb.4: ; %Flow @@ -1550,8 +1550,8 @@ ; GCN-IR-NEXT: BB11_5: ; %Flow3 ; GCN-IR-NEXT: s_or_b64 exec, exec, s[10:11] ; GCN-IR-NEXT: v_lshl_b64 v[0:1], v[4:5], 1 -; GCN-IR-NEXT: v_or_b32_e32 v7, v9, v1 -; GCN-IR-NEXT: v_or_b32_e32 v6, v8, v0 +; GCN-IR-NEXT: v_or_b32_e32 v7, v7, v1 +; GCN-IR-NEXT: v_or_b32_e32 v6, v6, v0 ; GCN-IR-NEXT: BB11_6: ; %Flow4 ; GCN-IR-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN-IR-NEXT: v_xor_b32_e32 v0, v6, v2 @@ -1715,23 +1715,23 @@ ; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] ; GCN-IR-NEXT: s_cbranch_execz BB12_6 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 -; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, 1, v4 -; GCN-IR-NEXT: v_addc_u32_e32 v7, vcc, 0, v5, vcc -; GCN-IR-NEXT: v_cmp_ge_u64_e32 vcc, v[6:7], v[4:5] +; GCN-IR-NEXT: v_add_i32_e32 v8, vcc, 1, v4 +; GCN-IR-NEXT: v_addc_u32_e32 v9, vcc, 0, v5, vcc +; GCN-IR-NEXT: v_cmp_ge_u64_e32 vcc, v[8:9], v[4:5] ; GCN-IR-NEXT: v_sub_i32_e64 v4, s[4:5], 63, v4 ; GCN-IR-NEXT: v_lshl_b64 v[4:5], s[8:9], v4 -; GCN-IR-NEXT: v_mov_b32_e32 v8, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v6, 0 ; GCN-IR-NEXT: s_mov_b64 s[8:9], 0 -; GCN-IR-NEXT: v_mov_b32_e32 v9, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v7, 0 ; GCN-IR-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN-IR-NEXT: s_xor_b64 s[10:11], exec, s[4:5] ; GCN-IR-NEXT: s_cbranch_execz BB12_5 ; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader ; GCN-IR-NEXT: s_mov_b32 s5, 0 ; GCN-IR-NEXT: s_mov_b32 s4, 0x8000 -; GCN-IR-NEXT: v_lshr_b64 v[12:13], s[4:5], v6 -; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, -1, v0 -; GCN-IR-NEXT: v_addc_u32_e32 v7, vcc, -1, v1, vcc +; GCN-IR-NEXT: v_lshr_b64 v[12:13], s[4:5], v8 +; GCN-IR-NEXT: v_add_i32_e32 v8, vcc, -1, v0 +; GCN-IR-NEXT: v_addc_u32_e32 v9, vcc, -1, v1, vcc ; GCN-IR-NEXT: v_sub_i32_e32 v10, vcc, 47, v10 ; GCN-IR-NEXT: 
v_mov_b32_e32 v14, 0 ; GCN-IR-NEXT: v_subb_u32_e32 v11, vcc, 0, v11, vcc @@ -1739,28 +1739,28 @@ ; GCN-IR-NEXT: BB12_3: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN-IR-NEXT: v_lshl_b64 v[12:13], v[12:13], 1 -; GCN-IR-NEXT: v_lshrrev_b32_e32 v8, 31, v5 -; GCN-IR-NEXT: v_or_b32_e32 v12, v12, v8 +; GCN-IR-NEXT: v_lshrrev_b32_e32 v6, 31, v5 +; GCN-IR-NEXT: v_or_b32_e32 v12, v12, v6 ; GCN-IR-NEXT: v_lshl_b64 v[4:5], v[4:5], 1 -; GCN-IR-NEXT: v_sub_i32_e32 v8, vcc, v6, v12 -; GCN-IR-NEXT: v_subb_u32_e32 v8, vcc, v7, v13, vcc +; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, v8, v12 +; GCN-IR-NEXT: v_subb_u32_e32 v6, vcc, v9, v13, vcc ; GCN-IR-NEXT: v_or_b32_e32 v4, v14, v4 -; GCN-IR-NEXT: v_ashrrev_i32_e32 v14, 31, v8 +; GCN-IR-NEXT: v_ashrrev_i32_e32 v14, 31, v6 ; GCN-IR-NEXT: v_and_b32_e32 v17, v14, v0 -; GCN-IR-NEXT: v_and_b32_e32 v8, 1, v14 +; GCN-IR-NEXT: v_and_b32_e32 v6, 1, v14 ; GCN-IR-NEXT: v_and_b32_e32 v16, v14, v1 ; GCN-IR-NEXT: v_add_i32_e32 v14, vcc, 1, v10 ; GCN-IR-NEXT: v_or_b32_e32 v5, v15, v5 ; GCN-IR-NEXT: v_addc_u32_e32 v15, vcc, 0, v11, vcc ; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, v[14:15], v[10:11] ; GCN-IR-NEXT: v_mov_b32_e32 v10, v14 -; GCN-IR-NEXT: v_mov_b32_e32 v9, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v7, 0 ; GCN-IR-NEXT: v_sub_i32_e64 v12, s[4:5], v12, v17 ; GCN-IR-NEXT: v_mov_b32_e32 v11, v15 -; GCN-IR-NEXT: v_mov_b32_e32 v15, v9 +; GCN-IR-NEXT: v_mov_b32_e32 v15, v7 ; GCN-IR-NEXT: v_subb_u32_e64 v13, s[4:5], v13, v16, s[4:5] ; GCN-IR-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GCN-IR-NEXT: v_mov_b32_e32 v14, v8 +; GCN-IR-NEXT: v_mov_b32_e32 v14, v6 ; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GCN-IR-NEXT: s_cbranch_execnz BB12_3 ; GCN-IR-NEXT: ; %bb.4: ; %Flow @@ -1768,8 +1768,8 @@ ; GCN-IR-NEXT: BB12_5: ; %Flow3 ; GCN-IR-NEXT: s_or_b64 exec, exec, s[10:11] ; GCN-IR-NEXT: v_lshl_b64 v[0:1], v[4:5], 1 -; GCN-IR-NEXT: v_or_b32_e32 v7, v9, v1 -; GCN-IR-NEXT: v_or_b32_e32 v6, v8, v0 +; GCN-IR-NEXT: v_or_b32_e32 v7, v7, v1 +; GCN-IR-NEXT: v_or_b32_e32 v6, v6, v0 ; GCN-IR-NEXT: BB12_6: ; %Flow4 ; GCN-IR-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN-IR-NEXT: v_xor_b32_e32 v0, v6, v2 Index: llvm/test/CodeGen/AMDGPU/srem64.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/srem64.ll +++ llvm/test/CodeGen/AMDGPU/srem64.ll @@ -63,7 +63,7 @@ ; GCN-NEXT: v_add_i32_e32 v10, vcc, v11, v10 ; GCN-NEXT: v_addc_u32_e32 v11, vcc, v2, v12, vcc ; GCN-NEXT: v_mul_lo_u32 v4, v4, v6 -; GCN-NEXT: v_add_i32_e32 v7, vcc, v7, v10 +; GCN-NEXT: v_add_i32_e32 v7, vcc, v10, v7 ; GCN-NEXT: v_addc_u32_e32 v7, vcc, v11, v9, vcc ; GCN-NEXT: v_addc_u32_e32 v6, vcc, v8, v1, vcc ; GCN-NEXT: v_add_i32_e32 v4, vcc, v7, v4 @@ -948,7 +948,7 @@ ; GCN-NEXT: v_add_i32_e32 v10, vcc, v11, v10 ; GCN-NEXT: v_addc_u32_e32 v11, vcc, v7, v12, vcc ; GCN-NEXT: v_mul_lo_u32 v3, v3, v5 -; GCN-NEXT: v_add_i32_e32 v6, vcc, v6, v10 +; GCN-NEXT: v_add_i32_e32 v6, vcc, v10, v6 ; GCN-NEXT: v_addc_u32_e32 v6, vcc, v11, v9, vcc ; GCN-NEXT: v_addc_u32_e32 v5, vcc, v8, v1, vcc ; GCN-NEXT: v_add_i32_e32 v3, vcc, v6, v3 @@ -1379,7 +1379,7 @@ ; GCN-NEXT: v_add_i32_e32 v10, vcc, v11, v10 ; GCN-NEXT: v_addc_u32_e32 v11, vcc, v2, v12, vcc ; GCN-NEXT: v_mul_lo_u32 v4, v4, v6 -; GCN-NEXT: v_add_i32_e32 v7, vcc, v7, v10 +; GCN-NEXT: v_add_i32_e32 v7, vcc, v10, v7 ; GCN-NEXT: v_addc_u32_e32 v7, vcc, v11, v9, vcc ; GCN-NEXT: v_addc_u32_e32 v6, vcc, v8, v1, vcc ; GCN-NEXT: v_add_i32_e32 v4, vcc, v7, v4 @@ -1676,21 +1676,21 @@ ; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] ; GCN-IR-NEXT: 
s_cbranch_execz BB11_6
 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1
-; GCN-IR-NEXT: v_add_i32_e32 v5, vcc, 1, v3
-; GCN-IR-NEXT: v_addc_u32_e32 v6, vcc, 0, v4, vcc
+; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, 1, v3
+; GCN-IR-NEXT: v_addc_u32_e32 v7, vcc, 0, v4, vcc
 ; GCN-IR-NEXT: v_sub_i32_e64 v2, s[4:5], 63, v3
-; GCN-IR-NEXT: v_cmp_ge_u64_e32 vcc, v[5:6], v[3:4]
-; GCN-IR-NEXT: v_mov_b32_e32 v6, 0
+; GCN-IR-NEXT: v_cmp_ge_u64_e32 vcc, v[6:7], v[3:4]
+; GCN-IR-NEXT: v_mov_b32_e32 v4, 0
 ; GCN-IR-NEXT: v_lshl_b64 v[2:3], 24, v2
 ; GCN-IR-NEXT: s_mov_b64 s[8:9], 0
-; GCN-IR-NEXT: v_mov_b32_e32 v7, 0
+; GCN-IR-NEXT: v_mov_b32_e32 v5, 0
 ; GCN-IR-NEXT: s_and_saveexec_b64 s[4:5], vcc
 ; GCN-IR-NEXT: s_xor_b64 s[10:11], exec, s[4:5]
 ; GCN-IR-NEXT: s_cbranch_execz BB11_5
 ; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader
-; GCN-IR-NEXT: v_add_i32_e32 v4, vcc, -1, v0
-; GCN-IR-NEXT: v_lshr_b64 v[10:11], 24, v5
-; GCN-IR-NEXT: v_addc_u32_e32 v5, vcc, -1, v1, vcc
+; GCN-IR-NEXT: v_lshr_b64 v[10:11], 24, v6
+; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, -1, v0
+; GCN-IR-NEXT: v_addc_u32_e32 v7, vcc, -1, v1, vcc
 ; GCN-IR-NEXT: v_sub_i32_e32 v8, vcc, 58, v8
 ; GCN-IR-NEXT: v_mov_b32_e32 v12, 0
 ; GCN-IR-NEXT: v_subb_u32_e32 v9, vcc, 0, v9, vcc
@@ -1698,28 +1698,28 @@
 ; GCN-IR-NEXT: BB11_3: ; %udiv-do-while
 ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GCN-IR-NEXT: v_lshl_b64 v[10:11], v[10:11], 1
-; GCN-IR-NEXT: v_lshrrev_b32_e32 v6, 31, v3
-; GCN-IR-NEXT: v_or_b32_e32 v10, v10, v6
+; GCN-IR-NEXT: v_lshrrev_b32_e32 v4, 31, v3
+; GCN-IR-NEXT: v_or_b32_e32 v10, v10, v4
 ; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[2:3], 1
-; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, v4, v10
-; GCN-IR-NEXT: v_subb_u32_e32 v6, vcc, v5, v11, vcc
+; GCN-IR-NEXT: v_sub_i32_e32 v4, vcc, v6, v10
+; GCN-IR-NEXT: v_subb_u32_e32 v4, vcc, v7, v11, vcc
 ; GCN-IR-NEXT: v_or_b32_e32 v2, v12, v2
-; GCN-IR-NEXT: v_ashrrev_i32_e32 v12, 31, v6
+; GCN-IR-NEXT: v_ashrrev_i32_e32 v12, 31, v4
 ; GCN-IR-NEXT: v_and_b32_e32 v15, v12, v0
-; GCN-IR-NEXT: v_and_b32_e32 v6, 1, v12
+; GCN-IR-NEXT: v_and_b32_e32 v4, 1, v12
 ; GCN-IR-NEXT: v_and_b32_e32 v14, v12, v1
 ; GCN-IR-NEXT: v_add_i32_e32 v12, vcc, 1, v8
 ; GCN-IR-NEXT: v_or_b32_e32 v3, v13, v3
 ; GCN-IR-NEXT: v_addc_u32_e32 v13, vcc, 0, v9, vcc
 ; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, v[12:13], v[8:9]
 ; GCN-IR-NEXT: v_mov_b32_e32 v8, v12
-; GCN-IR-NEXT: v_mov_b32_e32 v7, 0
+; GCN-IR-NEXT: v_mov_b32_e32 v5, 0
 ; GCN-IR-NEXT: v_sub_i32_e64 v10, s[4:5], v10, v15
 ; GCN-IR-NEXT: v_mov_b32_e32 v9, v13
-; GCN-IR-NEXT: v_mov_b32_e32 v13, v7
+; GCN-IR-NEXT: v_mov_b32_e32 v13, v5
 ; GCN-IR-NEXT: v_subb_u32_e64 v11, s[4:5], v11, v14, s[4:5]
 ; GCN-IR-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GCN-IR-NEXT: v_mov_b32_e32 v12, v6
+; GCN-IR-NEXT: v_mov_b32_e32 v12, v4
 ; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[8:9]
 ; GCN-IR-NEXT: s_cbranch_execnz BB11_3
 ; GCN-IR-NEXT: ; %bb.4: ; %Flow
@@ -1727,8 +1727,8 @@
 ; GCN-IR-NEXT: BB11_5: ; %Flow3
 ; GCN-IR-NEXT: s_or_b64 exec, exec, s[10:11]
 ; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[2:3], 1
-; GCN-IR-NEXT: v_or_b32_e32 v5, v7, v3
-; GCN-IR-NEXT: v_or_b32_e32 v2, v6, v2
+; GCN-IR-NEXT: v_or_b32_e32 v5, v5, v3
+; GCN-IR-NEXT: v_or_b32_e32 v2, v4, v2
 ; GCN-IR-NEXT: BB11_6: ; %Flow4
 ; GCN-IR-NEXT: s_or_b64 exec, exec, s[6:7]
 ; GCN-IR-NEXT: v_mul_lo_u32 v3, v0, v5
@@ -1890,23 +1890,23 @@
 ; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]
 ; GCN-IR-NEXT: s_cbranch_execz BB12_6
 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1
-; GCN-IR-NEXT: v_add_i32_e32 v4, vcc, 1, v2
-; GCN-IR-NEXT: v_addc_u32_e32 v5, vcc, 0, v3, vcc
-; GCN-IR-NEXT: v_cmp_ge_u64_e32 vcc, v[4:5], v[2:3]
+; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, 1, v2
+; GCN-IR-NEXT: v_addc_u32_e32 v7, vcc, 0, v3, vcc
+; GCN-IR-NEXT: v_cmp_ge_u64_e32 vcc, v[6:7], v[2:3]
 ; GCN-IR-NEXT: v_sub_i32_e64 v2, s[4:5], 63, v2
 ; GCN-IR-NEXT: v_lshl_b64 v[2:3], s[8:9], v2
-; GCN-IR-NEXT: v_mov_b32_e32 v6, 0
+; GCN-IR-NEXT: v_mov_b32_e32 v4, 0
 ; GCN-IR-NEXT: s_mov_b64 s[8:9], 0
-; GCN-IR-NEXT: v_mov_b32_e32 v7, 0
+; GCN-IR-NEXT: v_mov_b32_e32 v5, 0
 ; GCN-IR-NEXT: s_and_saveexec_b64 s[4:5], vcc
 ; GCN-IR-NEXT: s_xor_b64 s[10:11], exec, s[4:5]
 ; GCN-IR-NEXT: s_cbranch_execz BB12_5
 ; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader
 ; GCN-IR-NEXT: s_mov_b32 s5, 0
 ; GCN-IR-NEXT: s_mov_b32 s4, 0x8000
-; GCN-IR-NEXT: v_lshr_b64 v[10:11], s[4:5], v4
-; GCN-IR-NEXT: v_add_i32_e32 v4, vcc, -1, v0
-; GCN-IR-NEXT: v_addc_u32_e32 v5, vcc, -1, v1, vcc
+; GCN-IR-NEXT: v_lshr_b64 v[10:11], s[4:5], v6
+; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, -1, v0
+; GCN-IR-NEXT: v_addc_u32_e32 v7, vcc, -1, v1, vcc
 ; GCN-IR-NEXT: v_sub_i32_e32 v8, vcc, 47, v8
 ; GCN-IR-NEXT: v_mov_b32_e32 v12, 0
 ; GCN-IR-NEXT: v_subb_u32_e32 v9, vcc, 0, v9, vcc
@@ -1914,28 +1914,28 @@
 ; GCN-IR-NEXT: BB12_3: ; %udiv-do-while
 ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GCN-IR-NEXT: v_lshl_b64 v[10:11], v[10:11], 1
-; GCN-IR-NEXT: v_lshrrev_b32_e32 v6, 31, v3
-; GCN-IR-NEXT: v_or_b32_e32 v10, v10, v6
+; GCN-IR-NEXT: v_lshrrev_b32_e32 v4, 31, v3
+; GCN-IR-NEXT: v_or_b32_e32 v10, v10, v4
 ; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[2:3], 1
-; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, v4, v10
-; GCN-IR-NEXT: v_subb_u32_e32 v6, vcc, v5, v11, vcc
+; GCN-IR-NEXT: v_sub_i32_e32 v4, vcc, v6, v10
+; GCN-IR-NEXT: v_subb_u32_e32 v4, vcc, v7, v11, vcc
 ; GCN-IR-NEXT: v_or_b32_e32 v2, v12, v2
-; GCN-IR-NEXT: v_ashrrev_i32_e32 v12, 31, v6
+; GCN-IR-NEXT: v_ashrrev_i32_e32 v12, 31, v4
 ; GCN-IR-NEXT: v_and_b32_e32 v15, v12, v0
-; GCN-IR-NEXT: v_and_b32_e32 v6, 1, v12
+; GCN-IR-NEXT: v_and_b32_e32 v4, 1, v12
 ; GCN-IR-NEXT: v_and_b32_e32 v14, v12, v1
 ; GCN-IR-NEXT: v_add_i32_e32 v12, vcc, 1, v8
 ; GCN-IR-NEXT: v_or_b32_e32 v3, v13, v3
 ; GCN-IR-NEXT: v_addc_u32_e32 v13, vcc, 0, v9, vcc
 ; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, v[12:13], v[8:9]
 ; GCN-IR-NEXT: v_mov_b32_e32 v8, v12
-; GCN-IR-NEXT: v_mov_b32_e32 v7, 0
+; GCN-IR-NEXT: v_mov_b32_e32 v5, 0
 ; GCN-IR-NEXT: v_sub_i32_e64 v10, s[4:5], v10, v15
 ; GCN-IR-NEXT: v_mov_b32_e32 v9, v13
-; GCN-IR-NEXT: v_mov_b32_e32 v13, v7
+; GCN-IR-NEXT: v_mov_b32_e32 v13, v5
 ; GCN-IR-NEXT: v_subb_u32_e64 v11, s[4:5], v11, v14, s[4:5]
 ; GCN-IR-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GCN-IR-NEXT: v_mov_b32_e32 v12, v6
+; GCN-IR-NEXT: v_mov_b32_e32 v12, v4
 ; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[8:9]
 ; GCN-IR-NEXT: s_cbranch_execnz BB12_3
 ; GCN-IR-NEXT: ; %bb.4: ; %Flow
@@ -1943,8 +1943,8 @@
 ; GCN-IR-NEXT: BB12_5: ; %Flow3
 ; GCN-IR-NEXT: s_or_b64 exec, exec, s[10:11]
 ; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[2:3], 1
-; GCN-IR-NEXT: v_or_b32_e32 v5, v7, v3
-; GCN-IR-NEXT: v_or_b32_e32 v4, v6, v2
+; GCN-IR-NEXT: v_or_b32_e32 v5, v5, v3
+; GCN-IR-NEXT: v_or_b32_e32 v4, v4, v2
 ; GCN-IR-NEXT: BB12_6: ; %Flow4
 ; GCN-IR-NEXT: s_or_b64 exec, exec, s[6:7]
 ; GCN-IR-NEXT: v_mul_lo_u32 v2, v0, v5
Index: llvm/test/CodeGen/AMDGPU/udiv64.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/udiv64.ll
+++ llvm/test/CodeGen/AMDGPU/udiv64.ll
@@ -62,7 +62,7 @@
 ; GCN-NEXT: v_add_i32_e32 v10, vcc, v11, v10
 ; GCN-NEXT: v_addc_u32_e32 v11, vcc, v2, v12, vcc
 ; GCN-NEXT: v_mul_lo_u32 v4, v4, v6
-; GCN-NEXT: v_add_i32_e32 v7, vcc, v7, v10
+; GCN-NEXT: v_add_i32_e32 v7, vcc, v10, v7
 ; GCN-NEXT: v_addc_u32_e32 v7, vcc, v11, v9, vcc
 ; GCN-NEXT: v_addc_u32_e32 v6, vcc, v8, v1, vcc
 ; GCN-NEXT: v_add_i32_e32 v4, vcc, v7, v4
@@ -768,7 +768,7 @@
 ; GCN-NEXT: v_add_i32_e32 v11, vcc, v12, v11
 ; GCN-NEXT: v_addc_u32_e32 v12, vcc, v9, v13, vcc
 ; GCN-NEXT: v_mul_lo_u32 v3, v3, v5
-; GCN-NEXT: v_add_i32_e32 v6, vcc, v6, v11
+; GCN-NEXT: v_add_i32_e32 v6, vcc, v11, v6
 ; GCN-NEXT: v_addc_u32_e32 v6, vcc, v12, v10, vcc
 ; GCN-NEXT: v_addc_u32_e32 v5, vcc, v7, v8, vcc
 ; GCN-NEXT: v_add_i32_e32 v3, vcc, v6, v3
@@ -997,7 +997,7 @@
 ; GCN-NEXT: v_add_i32_e32 v10, vcc, v11, v10
 ; GCN-NEXT: v_addc_u32_e32 v11, vcc, v2, v12, vcc
 ; GCN-NEXT: v_mul_lo_u32 v4, v4, v6
-; GCN-NEXT: v_add_i32_e32 v7, vcc, v7, v10
+; GCN-NEXT: v_add_i32_e32 v7, vcc, v10, v7
 ; GCN-NEXT: v_addc_u32_e32 v7, vcc, v11, v9, vcc
 ; GCN-NEXT: v_addc_u32_e32 v6, vcc, v8, v1, vcc
 ; GCN-NEXT: v_add_i32_e32 v4, vcc, v7, v4
@@ -1267,52 +1267,52 @@
 ; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]
 ; GCN-IR-NEXT: s_cbranch_execz BB9_6
 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1
-; GCN-IR-NEXT: v_add_i32_e32 v10, vcc, 1, v4
-; GCN-IR-NEXT: v_addc_u32_e32 v11, vcc, 0, v5, vcc
+; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, 1, v4
+; GCN-IR-NEXT: v_addc_u32_e32 v7, vcc, 0, v5, vcc
 ; GCN-IR-NEXT: v_sub_i32_e64 v2, s[4:5], 63, v4
+; GCN-IR-NEXT: v_cmp_ge_u64_e32 vcc, v[6:7], v[4:5]
 ; GCN-IR-NEXT: v_lshl_b64 v[2:3], s[8:9], v2
-; GCN-IR-NEXT: v_mov_b32_e32 v6, 0
-; GCN-IR-NEXT: v_cmp_ge_u64_e32 vcc, v[10:11], v[4:5]
+; GCN-IR-NEXT: v_mov_b32_e32 v4, 0
 ; GCN-IR-NEXT: s_mov_b64 s[8:9], 0
-; GCN-IR-NEXT: v_mov_b32_e32 v7, 0
+; GCN-IR-NEXT: v_mov_b32_e32 v5, 0
 ; GCN-IR-NEXT: s_and_saveexec_b64 s[4:5], vcc
 ; GCN-IR-NEXT: s_xor_b64 s[10:11], exec, s[4:5]
 ; GCN-IR-NEXT: s_cbranch_execz BB9_5
 ; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader
-; GCN-IR-NEXT: v_add_i32_e32 v4, vcc, -1, v0
-; GCN-IR-NEXT: v_addc_u32_e32 v5, vcc, -1, v1, vcc
 ; GCN-IR-NEXT: s_mov_b32 s5, 0
 ; GCN-IR-NEXT: s_mov_b32 s4, 0x8000
+; GCN-IR-NEXT: v_lshr_b64 v[10:11], s[4:5], v6
+; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, -1, v0
+; GCN-IR-NEXT: v_addc_u32_e32 v7, vcc, -1, v1, vcc
 ; GCN-IR-NEXT: v_sub_i32_e32 v8, vcc, 47, v8
 ; GCN-IR-NEXT: v_mov_b32_e32 v12, 0
-; GCN-IR-NEXT: v_lshr_b64 v[10:11], s[4:5], v10
 ; GCN-IR-NEXT: v_subb_u32_e32 v9, vcc, 0, v9, vcc
 ; GCN-IR-NEXT: v_mov_b32_e32 v13, 0
 ; GCN-IR-NEXT: BB9_3: ; %udiv-do-while
 ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GCN-IR-NEXT: v_lshl_b64 v[10:11], v[10:11], 1
-; GCN-IR-NEXT: v_lshrrev_b32_e32 v6, 31, v3
-; GCN-IR-NEXT: v_or_b32_e32 v10, v10, v6
+; GCN-IR-NEXT: v_lshrrev_b32_e32 v4, 31, v3
+; GCN-IR-NEXT: v_or_b32_e32 v10, v10, v4
 ; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[2:3], 1
-; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, v4, v10
-; GCN-IR-NEXT: v_subb_u32_e32 v6, vcc, v5, v11, vcc
+; GCN-IR-NEXT: v_sub_i32_e32 v4, vcc, v6, v10
+; GCN-IR-NEXT: v_subb_u32_e32 v4, vcc, v7, v11, vcc
 ; GCN-IR-NEXT: v_or_b32_e32 v2, v12, v2
-; GCN-IR-NEXT: v_ashrrev_i32_e32 v12, 31, v6
+; GCN-IR-NEXT: v_ashrrev_i32_e32 v12, 31, v4
 ; GCN-IR-NEXT: v_and_b32_e32 v15, v12, v0
-; GCN-IR-NEXT: v_and_b32_e32 v6, 1, v12
+; GCN-IR-NEXT: v_and_b32_e32 v4, 1, v12
 ; GCN-IR-NEXT: v_and_b32_e32 v14, v12, v1
 ; GCN-IR-NEXT: v_add_i32_e32 v12, vcc, 1, v8
 ; GCN-IR-NEXT: v_or_b32_e32 v3, v13, v3
 ; GCN-IR-NEXT: v_addc_u32_e32 v13, vcc, 0, v9, vcc
 ; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, v[12:13], v[8:9]
 ; GCN-IR-NEXT: v_mov_b32_e32 v8, v12
-; GCN-IR-NEXT: v_mov_b32_e32 v7, 0
+; GCN-IR-NEXT: v_mov_b32_e32 v5, 0
 ; GCN-IR-NEXT: v_sub_i32_e64 v10, s[4:5], v10, v15
 ; GCN-IR-NEXT: v_mov_b32_e32 v9, v13
-; GCN-IR-NEXT: v_mov_b32_e32 v13, v7
+; GCN-IR-NEXT: v_mov_b32_e32 v13, v5
 ; GCN-IR-NEXT: v_subb_u32_e64 v11, s[4:5], v11, v14, s[4:5]
 ; GCN-IR-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GCN-IR-NEXT: v_mov_b32_e32 v12, v6
+; GCN-IR-NEXT: v_mov_b32_e32 v12, v4
 ; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[8:9]
 ; GCN-IR-NEXT: s_cbranch_execnz BB9_3
 ; GCN-IR-NEXT: ; %bb.4: ; %Flow
@@ -1320,8 +1320,8 @@
 ; GCN-IR-NEXT: BB9_5: ; %Flow3
 ; GCN-IR-NEXT: s_or_b64 exec, exec, s[10:11]
 ; GCN-IR-NEXT: v_lshl_b64 v[0:1], v[2:3], 1
-; GCN-IR-NEXT: v_or_b32_e32 v3, v7, v1
-; GCN-IR-NEXT: v_or_b32_e32 v2, v6, v0
+; GCN-IR-NEXT: v_or_b32_e32 v3, v5, v1
+; GCN-IR-NEXT: v_or_b32_e32 v2, v4, v0
 ; GCN-IR-NEXT: BB9_6: ; %Flow4
 ; GCN-IR-NEXT: s_or_b64 exec, exec, s[6:7]
 ; GCN-IR-NEXT: v_mov_b32_e32 v0, v2
@@ -1477,7 +1477,7 @@
 ; GCN-NEXT: v_mul_lo_u32 v10, v2, v6
 ; GCN-NEXT: v_mul_hi_u32 v6, v2, v6
 ; GCN-NEXT: v_mul_lo_u32 v2, v2, v4
-; GCN-NEXT: v_add_i32_e32 v5, vcc, v10, v5
+; GCN-NEXT: v_add_i32_e32 v5, vcc, v5, v10
 ; GCN-NEXT: v_addc_u32_e32 v5, vcc, v9, v6, vcc
 ; GCN-NEXT: v_addc_u32_e32 v4, vcc, v11, v7, vcc
 ; GCN-NEXT: v_add_i32_e32 v2, vcc, v5, v2
@@ -1665,7 +1665,7 @@
 ; GCN-NEXT: v_mul_lo_u32 v12, v4, v8
 ; GCN-NEXT: v_mul_hi_u32 v8, v4, v8
 ; GCN-NEXT: v_mul_lo_u32 v4, v4, v6
-; GCN-NEXT: v_add_i32_e32 v7, vcc, v12, v7
+; GCN-NEXT: v_add_i32_e32 v7, vcc, v7, v12
 ; GCN-NEXT: v_addc_u32_e32 v7, vcc, v11, v8, vcc
 ; GCN-NEXT: v_addc_u32_e32 v6, vcc, v13, v9, vcc
 ; GCN-NEXT: v_add_i32_e32 v4, vcc, v7, v4
Index: llvm/test/CodeGen/AMDGPU/urem64.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/urem64.ll
+++ llvm/test/CodeGen/AMDGPU/urem64.ll
@@ -63,7 +63,7 @@
 ; GCN-NEXT: v_add_i32_e32 v10, vcc, v11, v10
 ; GCN-NEXT: v_addc_u32_e32 v11, vcc, v2, v12, vcc
 ; GCN-NEXT: v_mul_lo_u32 v4, v4, v6
-; GCN-NEXT: v_add_i32_e32 v7, vcc, v7, v10
+; GCN-NEXT: v_add_i32_e32 v7, vcc, v10, v7
 ; GCN-NEXT: v_addc_u32_e32 v7, vcc, v11, v9, vcc
 ; GCN-NEXT: v_addc_u32_e32 v6, vcc, v8, v1, vcc
 ; GCN-NEXT: v_add_i32_e32 v4, vcc, v7, v4
@@ -805,7 +805,7 @@
 ; GCN-NEXT: v_add_i32_e32 v10, vcc, v11, v10
 ; GCN-NEXT: v_addc_u32_e32 v11, vcc, v2, v12, vcc
 ; GCN-NEXT: v_mul_lo_u32 v4, v4, v6
-; GCN-NEXT: v_add_i32_e32 v7, vcc, v7, v10
+; GCN-NEXT: v_add_i32_e32 v7, vcc, v10, v7
 ; GCN-NEXT: v_addc_u32_e32 v7, vcc, v11, v9, vcc
 ; GCN-NEXT: v_addc_u32_e32 v6, vcc, v8, v1, vcc
 ; GCN-NEXT: v_add_i32_e32 v4, vcc, v7, v4
@@ -1007,7 +1007,7 @@
 ; GCN-NEXT: v_mul_lo_u32 v10, v2, v6
 ; GCN-NEXT: v_mul_hi_u32 v6, v2, v6
 ; GCN-NEXT: v_mul_lo_u32 v2, v2, v4
-; GCN-NEXT: v_add_i32_e32 v5, vcc, v10, v5
+; GCN-NEXT: v_add_i32_e32 v5, vcc, v5, v10
 ; GCN-NEXT: v_addc_u32_e32 v5, vcc, v9, v6, vcc
 ; GCN-NEXT: v_addc_u32_e32 v4, vcc, v11, v7, vcc
 ; GCN-NEXT: v_add_i32_e32 v2, vcc, v5, v2
@@ -1280,23 +1280,23 @@
 ; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]
 ; GCN-IR-NEXT: s_cbranch_execz BB8_6
 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1
-; GCN-IR-NEXT: v_add_i32_e32 v4, vcc, 1, v2
-; GCN-IR-NEXT: v_addc_u32_e32 v5, vcc, 0, v3, vcc
-; GCN-IR-NEXT: v_cmp_ge_u64_e32 vcc, v[4:5], v[2:3]
+; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, 1, v2
+; GCN-IR-NEXT: v_addc_u32_e32 v7, vcc, 0, v3, vcc
+; GCN-IR-NEXT: v_cmp_ge_u64_e32 vcc, v[6:7], v[2:3]
 ; GCN-IR-NEXT: v_sub_i32_e64 v2, s[4:5], 63, v2
 ; GCN-IR-NEXT: v_lshl_b64 v[2:3], s[8:9], v2
-; GCN-IR-NEXT: v_mov_b32_e32 v6, 0
+; GCN-IR-NEXT: v_mov_b32_e32 v4, 0
 ; GCN-IR-NEXT: s_mov_b64 s[8:9], 0
-; GCN-IR-NEXT: v_mov_b32_e32 v7, 0
+; GCN-IR-NEXT: v_mov_b32_e32 v5, 0
 ; GCN-IR-NEXT: s_and_saveexec_b64 s[4:5], vcc
 ; GCN-IR-NEXT: s_xor_b64 s[10:11], exec, s[4:5]
 ; GCN-IR-NEXT: s_cbranch_execz BB8_5
 ; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader
 ; GCN-IR-NEXT: s_mov_b32 s5, 0
 ; GCN-IR-NEXT: s_mov_b32 s4, 0x8000
-; GCN-IR-NEXT: v_lshr_b64 v[10:11], s[4:5], v4
-; GCN-IR-NEXT: v_add_i32_e32 v4, vcc, -1, v0
-; GCN-IR-NEXT: v_addc_u32_e32 v5, vcc, -1, v1, vcc
+; GCN-IR-NEXT: v_lshr_b64 v[10:11], s[4:5], v6
+; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, -1, v0
+; GCN-IR-NEXT: v_addc_u32_e32 v7, vcc, -1, v1, vcc
 ; GCN-IR-NEXT: v_sub_i32_e32 v8, vcc, 47, v8
 ; GCN-IR-NEXT: v_mov_b32_e32 v12, 0
 ; GCN-IR-NEXT: v_subb_u32_e32 v9, vcc, 0, v9, vcc
@@ -1304,28 +1304,28 @@
 ; GCN-IR-NEXT: BB8_3: ; %udiv-do-while
 ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GCN-IR-NEXT: v_lshl_b64 v[10:11], v[10:11], 1
-; GCN-IR-NEXT: v_lshrrev_b32_e32 v6, 31, v3
-; GCN-IR-NEXT: v_or_b32_e32 v10, v10, v6
+; GCN-IR-NEXT: v_lshrrev_b32_e32 v4, 31, v3
+; GCN-IR-NEXT: v_or_b32_e32 v10, v10, v4
 ; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[2:3], 1
-; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, v4, v10
-; GCN-IR-NEXT: v_subb_u32_e32 v6, vcc, v5, v11, vcc
+; GCN-IR-NEXT: v_sub_i32_e32 v4, vcc, v6, v10
+; GCN-IR-NEXT: v_subb_u32_e32 v4, vcc, v7, v11, vcc
 ; GCN-IR-NEXT: v_or_b32_e32 v2, v12, v2
-; GCN-IR-NEXT: v_ashrrev_i32_e32 v12, 31, v6
+; GCN-IR-NEXT: v_ashrrev_i32_e32 v12, 31, v4
 ; GCN-IR-NEXT: v_and_b32_e32 v15, v12, v0
-; GCN-IR-NEXT: v_and_b32_e32 v6, 1, v12
+; GCN-IR-NEXT: v_and_b32_e32 v4, 1, v12
 ; GCN-IR-NEXT: v_and_b32_e32 v14, v12, v1
 ; GCN-IR-NEXT: v_add_i32_e32 v12, vcc, 1, v8
 ; GCN-IR-NEXT: v_or_b32_e32 v3, v13, v3
 ; GCN-IR-NEXT: v_addc_u32_e32 v13, vcc, 0, v9, vcc
 ; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, v[12:13], v[8:9]
 ; GCN-IR-NEXT: v_mov_b32_e32 v8, v12
-; GCN-IR-NEXT: v_mov_b32_e32 v7, 0
+; GCN-IR-NEXT: v_mov_b32_e32 v5, 0
 ; GCN-IR-NEXT: v_sub_i32_e64 v10, s[4:5], v10, v15
 ; GCN-IR-NEXT: v_mov_b32_e32 v9, v13
-; GCN-IR-NEXT: v_mov_b32_e32 v13, v7
+; GCN-IR-NEXT: v_mov_b32_e32 v13, v5
 ; GCN-IR-NEXT: v_subb_u32_e64 v11, s[4:5], v11, v14, s[4:5]
 ; GCN-IR-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GCN-IR-NEXT: v_mov_b32_e32 v12, v6
+; GCN-IR-NEXT: v_mov_b32_e32 v12, v4
 ; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[8:9]
 ; GCN-IR-NEXT: s_cbranch_execnz BB8_3
 ; GCN-IR-NEXT: ; %bb.4: ; %Flow
@@ -1333,8 +1333,8 @@
 ; GCN-IR-NEXT: BB8_5: ; %Flow3
 ; GCN-IR-NEXT: s_or_b64 exec, exec, s[10:11]
 ; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[2:3], 1
-; GCN-IR-NEXT: v_or_b32_e32 v5, v7, v3
-; GCN-IR-NEXT: v_or_b32_e32 v4, v6, v2
+; GCN-IR-NEXT: v_or_b32_e32 v5, v5, v3
+; GCN-IR-NEXT: v_or_b32_e32 v4, v4, v2
 ; GCN-IR-NEXT: BB8_6: ; %Flow4
 ; GCN-IR-NEXT: s_or_b64 exec, exec, s[6:7]
 ; GCN-IR-NEXT: v_mul_lo_u32 v2, v0, v5