Index: llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -696,6 +696,23 @@
 MachineSDNode *AMDGPUDAGToDAGISel::buildSMovImm64(SDLoc &DL, uint64_t Imm,
                                                   EVT VT) const {
+  const SIInstrInfo *TII =
+      static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo());
+
+  if (VT == MVT::f64) {
+    // This is a heuristic. There are pros and cons to using a wide 64-bit
+    // move vs. splitting it into two move instructions. If an immediate
+    // stays whole, it can be rematerialized and folded. If it is split,
+    // it can be better coalesced and its parts can be folded.
+    // Do this only for f64, on the assumption that an f64 value is rarely
+    // used as halves but rather as a whole register in fp64 instructions.
+    unsigned Opcode = (isUInt<32>(Imm) || TII->isInlineConstant(APInt(64, Imm)))
+                          ? AMDGPU::S_MOV_B64
+                          : AMDGPU::S_MOV_B64_IMM_PSEUDO;
+    return CurDAG->getMachineNode(Opcode, DL, VT,
+                                  CurDAG->getTargetConstant(Imm, DL, MVT::i64));
+  }
+
   SDNode *Lo = CurDAG->getMachineNode(
       AMDGPU::S_MOV_B32, DL, MVT::i32,
       CurDAG->getTargetConstant(Imm & 0xFFFFFFFF, DL, MVT::i32));
Index: llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -2119,7 +2119,8 @@
   unsigned Size = MRI->getType(DstReg).getSizeInBits();
 
   // The AMDGPU backend only supports Imm operands and not CImm or FPImm.
-  if (ImmOp.isFPImm()) {
+  bool IsFP = ImmOp.isFPImm();
+  if (IsFP) {
     const APInt &Imm = ImmOp.getFPImm()->getValueAPF().bitcastToAPInt();
     ImmOp.ChangeToImmediate(Imm.getZExtValue());
   } else if (ImmOp.isCImm()) {
@@ -2155,8 +2156,17 @@
   APInt Imm(Size, I.getOperand(1).getImm());
 
   MachineInstr *ResInst;
-  if (IsSgpr && TII.isInlineConstant(Imm)) {
-    ResInst = BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_MOV_B64), DstReg)
+  if (IsSgpr && (IsFP || TII.isInlineConstant(Imm))) {
+    // This is a heuristic. There are pros and cons to using a wide 64-bit
+    // move vs. splitting it into two move instructions. If an immediate
+    // stays whole, it can be rematerialized and folded. If it is split,
+    // it can be better coalesced and its parts can be folded.
+    // Do this only for f64, on the assumption that an f64 value is rarely
+    // used as halves but rather as a whole register in fp64 instructions.
+    Opcode = (Imm.isIntN(32) || TII.isInlineConstant(Imm))
+                 ? AMDGPU::S_MOV_B64
+                 : AMDGPU::S_MOV_B64_IMM_PSEUDO;
+    ResInst = BuildMI(*BB, &I, DL, TII.get(Opcode), DstReg)
       .addImm(I.getOperand(1).getImm());
   } else {
     const TargetRegisterClass *RC = IsSgpr ?
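For illustration only, not code from this patch: both hunks above make the same choice. A 64-bit immediate stays a single S_MOV_B64 when the hardware can encode it in one instruction, that is, when it fits in 32 bits or is a 64-bit inline constant; every other value becomes the new rematerializable S_MOV_B64_IMM_PSEUDO, which is split only after register allocation (see the SIInstrInfo.cpp expandPostRAPseudo hunk below). A minimal standalone sketch of that predicate, assuming the usual LLVM headers and the TableGen-generated AMDGPU opcode enums; pickSMov64Opcode is a hypothetical helper name:

  // Hypothetical sketch mirroring the opcode choice made above.
  static unsigned pickSMov64Opcode(const SIInstrInfo &TII, uint64_t Imm) {
    // Encodable in one instruction: a value that fits in 32 bits, or a
    // 64-bit inline constant such as 1.0 (0x3FF0000000000000).
    if (isUInt<32>(Imm) || TII.isInlineConstant(APInt(64, Imm)))
      return AMDGPU::S_MOV_B64;
    // Everything else, e.g. 0.1 (0x3FB999999999999A), is kept whole in the
    // pseudo so the register allocator can rematerialize it; post-RA it is
    // expanded into s_mov_b32 <lo>, 0x9999999A and s_mov_b32 <hi>, 0x3FB99999.
    return AMDGPU::S_MOV_B64_IMM_PSEUDO;
  }

The new remat-fp64-constants.ll test below exercises exactly this trade-off: under stress-regalloc pressure the non-inline fp64 constants are re-emitted inside the loop (GCN-COUNT-6 s_mov_b32 ... 0x) instead of being spilled (sgpr_spill_count: 0).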
Index: llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
+++ llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
@@ -648,6 +648,23 @@
       break;
     }
+    case AMDGPU::S_MOV_B64:
+    case AMDGPU::S_MOV_B64_IMM_PSEUDO: {
+      Register DstReg = MI.getOperand(0).getReg();
+      if (DstReg.isPhysical() || !MRI->hasOneUse(DstReg))
+        break;
+      MachineInstr *User = &*MRI->use_instr_begin(DstReg);
+      if (!User->isCopy() || User->getOperand(1).getReg() != DstReg)
+        break;
+      const TargetRegisterClass *SrcRC, *DstRC;
+      std::tie(SrcRC, DstRC) = getCopyRegClasses(*User, *TRI, *MRI);
+      if (!isSGPRToVGPRCopy(SrcRC, DstRC, *TRI))
+        break;
+      MI.setDesc(TII->get(AMDGPU::V_MOV_B64_PSEUDO));
+      MRI->setRegClass(DstReg, &AMDGPU::VReg_64RegClass);
+      MI.addImplicitDefUseOperands(MF);
+      break;
+    }
     case AMDGPU::PHI: {
       MachineBasicBlock *NewBB = processPHINode(MI);
       if (NewBB && NewBB != MBB) {
Index: llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -1761,6 +1761,25 @@
     expandMovDPP64(MI);
     break;
   }
+  case AMDGPU::S_MOV_B64_IMM_PSEUDO: {
+    Register Dst = MI.getOperand(0).getReg();
+    Register DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
+    Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
+
+    const MachineOperand &SrcOp = MI.getOperand(1);
+    assert(!SrcOp.isFPImm());
+    APInt Imm(64, SrcOp.getImm());
+    APInt Lo(32, Imm.getLoBits(32).getZExtValue());
+    APInt Hi(32, Imm.getHiBits(32).getZExtValue());
+    BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DstLo)
+        .addImm(Lo.getZExtValue())
+        .addReg(Dst, RegState::Implicit | RegState::Define);
+    BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DstHi)
+        .addImm(Hi.getZExtValue())
+        .addReg(Dst, RegState::Implicit | RegState::Define);
+    MI.eraseFromParent();
+    break;
+  }
   case AMDGPU::V_SET_INACTIVE_B32: {
     unsigned NotOpc = ST.isWave32() ? AMDGPU::S_NOT_B32 : AMDGPU::S_NOT_B64;
     unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
@@ -2665,6 +2684,10 @@
   case AMDGPU::V_ACCVGPR_READ_B32_e64:
   case AMDGPU::V_ACCVGPR_MOV_B32:
     return true;
+  case AMDGPU::S_MOV_B64_IMM_PSEUDO: {
+    uint64_t Imm = MI.getOperand(1).getImm();
+    return isUInt<32>(Imm) || isInlineConstant(APInt(64, Imm));
+  }
   default:
     return false;
   }
Index: llvm/lib/Target/AMDGPU/SIInstructions.td
===================================================================
--- llvm/lib/Target/AMDGPU/SIInstructions.td
+++ llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -111,6 +111,18 @@
   let Size = 16; // Requires two 8-byte v_mov_b32_dpp to complete.
 }
 
+// 64-bit scalar move immediate instruction. This is used to avoid subreg
+// initialization and to allow rematerialization.
+def S_MOV_B64_IMM_PSEUDO : SPseudoInstSI <(outs SReg_64:$sdst),
+                                          (ins i64imm:$src0)> {
+  let isReMaterializable = 1;
+  let isAsCheapAsAMove = 1;
+  let isMoveImm = 1;
+  let SchedRW = [WriteSALU, Write64Bit];
+  let Size = 16; // Needs at most two s_mov_b32 instructions, 8 bytes each.
+  let Uses = [];
+}
+
 // Pseudoinstruction for @llvm.amdgcn.wqm. It is turned into a copy after the
 // WQM pass processes it.
def WQM : PseudoInstSI <(outs unknown:$vdst), (ins unknown:$src0)>; Index: llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll @@ -1857,69 +1857,69 @@ ; GCN-LABEL: dyn_extract_v16f64_s_s: ; GCN: ; %bb.0: ; %entry ; GCN-NEXT: s_mov_b32 s66, 0 +; GCN-NEXT: s_mov_b32 s64, 0 +; GCN-NEXT: s_mov_b32 s62, 0 +; GCN-NEXT: s_mov_b32 s60, 0 +; GCN-NEXT: s_mov_b32 s58, 0 +; GCN-NEXT: s_mov_b32 s56, 0 +; GCN-NEXT: s_mov_b32 s54, 0 +; GCN-NEXT: s_mov_b32 s52, 0 +; GCN-NEXT: s_mov_b32 s50, 0 +; GCN-NEXT: s_mov_b32 s48, 0 +; GCN-NEXT: s_mov_b32 s46, 0 +; GCN-NEXT: s_mov_b32 s44, 0 +; GCN-NEXT: s_mov_b32 s40, 0 ; GCN-NEXT: s_mov_b64 s[36:37], 1.0 ; GCN-NEXT: s_mov_b32 m0, s2 ; GCN-NEXT: s_mov_b32 s67, 0x40300000 ; GCN-NEXT: s_mov_b32 s65, 0x402e0000 -; GCN-NEXT: s_mov_b32 s64, s66 ; GCN-NEXT: s_mov_b32 s63, 0x402c0000 -; GCN-NEXT: s_mov_b32 s62, s66 ; GCN-NEXT: s_mov_b32 s61, 0x402a0000 -; GCN-NEXT: s_mov_b32 s60, s66 ; GCN-NEXT: s_mov_b32 s59, 0x40280000 -; GCN-NEXT: s_mov_b32 s58, s66 ; GCN-NEXT: s_mov_b32 s57, 0x40260000 -; GCN-NEXT: s_mov_b32 s56, s66 ; GCN-NEXT: s_mov_b32 s55, 0x40240000 -; GCN-NEXT: s_mov_b32 s54, s66 ; GCN-NEXT: s_mov_b32 s53, 0x40220000 -; GCN-NEXT: s_mov_b32 s52, s66 ; GCN-NEXT: s_mov_b32 s51, 0x40200000 -; GCN-NEXT: s_mov_b32 s50, s66 ; GCN-NEXT: s_mov_b32 s49, 0x401c0000 -; GCN-NEXT: s_mov_b32 s48, s66 ; GCN-NEXT: s_mov_b32 s47, 0x40180000 -; GCN-NEXT: s_mov_b32 s46, s66 ; GCN-NEXT: s_mov_b32 s45, 0x40140000 -; GCN-NEXT: s_mov_b32 s44, s66 ; GCN-NEXT: s_mov_b64 s[42:43], 4.0 ; GCN-NEXT: s_mov_b32 s41, 0x40080000 -; GCN-NEXT: s_mov_b32 s40, s66 ; GCN-NEXT: s_mov_b64 s[38:39], 2.0 ; GCN-NEXT: s_movrels_b64 s[0:1], s[36:37] ; GCN-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: dyn_extract_v16f64_s_s: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_mov_b32 s66, 0 ; GFX10-NEXT: s_mov_b64 s[36:37], 1.0 ; GFX10-NEXT: s_mov_b32 m0, s2 +; GFX10-NEXT: s_mov_b32 s66, 0 +; GFX10-NEXT: s_mov_b32 s64, 0 +; GFX10-NEXT: s_mov_b32 s62, 0 +; GFX10-NEXT: s_mov_b32 s60, 0 +; GFX10-NEXT: s_mov_b32 s58, 0 +; GFX10-NEXT: s_mov_b32 s56, 0 +; GFX10-NEXT: s_mov_b32 s54, 0 +; GFX10-NEXT: s_mov_b32 s52, 0 +; GFX10-NEXT: s_mov_b32 s50, 0 +; GFX10-NEXT: s_mov_b32 s48, 0 +; GFX10-NEXT: s_mov_b32 s46, 0 +; GFX10-NEXT: s_mov_b32 s44, 0 +; GFX10-NEXT: s_mov_b32 s40, 0 ; GFX10-NEXT: s_mov_b32 s67, 0x40300000 ; GFX10-NEXT: s_mov_b32 s65, 0x402e0000 -; GFX10-NEXT: s_mov_b32 s64, s66 ; GFX10-NEXT: s_mov_b32 s63, 0x402c0000 -; GFX10-NEXT: s_mov_b32 s62, s66 ; GFX10-NEXT: s_mov_b32 s61, 0x402a0000 -; GFX10-NEXT: s_mov_b32 s60, s66 ; GFX10-NEXT: s_mov_b32 s59, 0x40280000 -; GFX10-NEXT: s_mov_b32 s58, s66 ; GFX10-NEXT: s_mov_b32 s57, 0x40260000 -; GFX10-NEXT: s_mov_b32 s56, s66 ; GFX10-NEXT: s_mov_b32 s55, 0x40240000 -; GFX10-NEXT: s_mov_b32 s54, s66 ; GFX10-NEXT: s_mov_b32 s53, 0x40220000 -; GFX10-NEXT: s_mov_b32 s52, s66 ; GFX10-NEXT: s_mov_b32 s51, 0x40200000 -; GFX10-NEXT: s_mov_b32 s50, s66 ; GFX10-NEXT: s_mov_b32 s49, 0x401c0000 -; GFX10-NEXT: s_mov_b32 s48, s66 ; GFX10-NEXT: s_mov_b32 s47, 0x40180000 -; GFX10-NEXT: s_mov_b32 s46, s66 ; GFX10-NEXT: s_mov_b32 s45, 0x40140000 -; GFX10-NEXT: s_mov_b32 s44, s66 ; GFX10-NEXT: s_mov_b64 s[42:43], 4.0 ; GFX10-NEXT: s_mov_b32 s41, 0x40080000 -; GFX10-NEXT: s_mov_b32 s40, s66 ; GFX10-NEXT: s_mov_b64 s[38:39], 2.0 ; GFX10-NEXT: s_movrels_b64 s[0:1], s[36:37] ; GFX10-NEXT: ; return to 
shader part epilog @@ -2750,9 +2750,9 @@ ; GPRIDX-NEXT: ; %bb.0: ; %entry ; GPRIDX-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GPRIDX-NEXT: s_load_dword s8, s[4:5], 0x8 -; GPRIDX-NEXT: s_mov_b32 s0, 0 +; GPRIDX-NEXT: s_mov_b32 s2, 0 ; GPRIDX-NEXT: s_mov_b32 s3, 0x40080000 -; GPRIDX-NEXT: s_mov_b32 s2, s0 +; GPRIDX-NEXT: s_mov_b32 s0, 0 ; GPRIDX-NEXT: s_mov_b32 s1, 0x40140000 ; GPRIDX-NEXT: s_waitcnt lgkmcnt(0) ; GPRIDX-NEXT: s_cmp_eq_u32 s8, 1 @@ -2841,9 +2841,9 @@ ; MOVREL-NEXT: ; %bb.0: ; %entry ; MOVREL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; MOVREL-NEXT: s_load_dword s8, s[4:5], 0x8 -; MOVREL-NEXT: s_mov_b32 s0, 0 +; MOVREL-NEXT: s_mov_b32 s2, 0 ; MOVREL-NEXT: s_mov_b32 s3, 0x40080000 -; MOVREL-NEXT: s_mov_b32 s2, s0 +; MOVREL-NEXT: s_mov_b32 s0, 0 ; MOVREL-NEXT: s_mov_b32 s1, 0x40140000 ; MOVREL-NEXT: s_waitcnt lgkmcnt(0) ; MOVREL-NEXT: s_cmp_eq_u32 s8, 1 @@ -2872,7 +2872,7 @@ ; GFX10-NEXT: kernel_code_entry_byte_offset = 256 ; GFX10-NEXT: kernel_code_prefetch_byte_size = 0 ; GFX10-NEXT: granulated_workitem_vgpr_count = 0 -; GFX10-NEXT: granulated_wavefront_sgpr_count = 1 +; GFX10-NEXT: granulated_wavefront_sgpr_count = 0 ; GFX10-NEXT: priority = 0 ; GFX10-NEXT: float_mode = 240 ; GFX10-NEXT: priv = 0 @@ -2915,7 +2915,7 @@ ; GFX10-NEXT: gds_segment_byte_size = 0 ; GFX10-NEXT: kernarg_segment_byte_size = 28 ; GFX10-NEXT: workgroup_fbarrier_count = 0 -; GFX10-NEXT: wavefront_sgpr_count = 9 +; GFX10-NEXT: wavefront_sgpr_count = 7 ; GFX10-NEXT: workitem_vgpr_count = 3 ; GFX10-NEXT: reserved_vgpr_first = 0 ; GFX10-NEXT: reserved_vgpr_count = 0 @@ -2932,22 +2932,22 @@ ; GFX10-NEXT: .end_amd_kernel_code_t ; GFX10-NEXT: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s8, s[4:5], 0x8 +; GFX10-NEXT: s_load_dword s6, s[4:5], 0x8 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-NEXT: s_mov_b32 s2, 0 -; GFX10-NEXT: s_mov_b32 s5, 0x40080000 -; GFX10-NEXT: s_mov_b32 s4, s2 -; GFX10-NEXT: s_mov_b32 s3, 0x40140000 +; GFX10-NEXT: s_mov_b32 s3, 0x40080000 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_cmp_eq_u32 s8, 1 -; GFX10-NEXT: s_cselect_b64 s[6:7], 2.0, 1.0 -; GFX10-NEXT: s_cmp_eq_u32 s8, 2 -; GFX10-NEXT: s_cselect_b64 s[4:5], s[4:5], s[6:7] -; GFX10-NEXT: s_cmp_eq_u32 s8, 3 -; GFX10-NEXT: s_cselect_b64 s[4:5], 4.0, s[4:5] -; GFX10-NEXT: s_cmp_eq_u32 s8, 4 +; GFX10-NEXT: s_cmp_eq_u32 s6, 1 +; GFX10-NEXT: s_cselect_b64 s[4:5], 2.0, 1.0 +; GFX10-NEXT: s_cmp_eq_u32 s6, 2 ; GFX10-NEXT: s_cselect_b64 s[2:3], s[2:3], s[4:5] +; GFX10-NEXT: s_cmp_eq_u32 s6, 3 +; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: s_cselect_b64 s[2:3], 4.0, s[2:3] +; GFX10-NEXT: s_mov_b32 s5, 0x40140000 +; GFX10-NEXT: s_cmp_eq_u32 s6, 4 +; GFX10-NEXT: s_cselect_b64 s[2:3], s[4:5], s[2:3] ; GFX10-NEXT: v_mov_b32_e32 v0, s2 ; GFX10-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] @@ -3837,21 +3837,21 @@ ; GPRIDX-NEXT: runtime_loader_kernel_symbol = 0 ; GPRIDX-NEXT: .end_amd_kernel_code_t ; GPRIDX-NEXT: ; %bb.0: ; %entry -; GPRIDX-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GPRIDX-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GPRIDX-NEXT: s_load_dword s6, s[4:5], 0x8 -; GPRIDX-NEXT: s_mov_b32 s0, 0 -; GPRIDX-NEXT: s_mov_b32 s1, 0x40080000 +; GPRIDX-NEXT: s_mov_b32 s2, 0 +; GPRIDX-NEXT: s_mov_b32 s3, 0x40080000 ; GPRIDX-NEXT: v_mov_b32_e32 v2, 0 ; GPRIDX-NEXT: s_waitcnt lgkmcnt(0) ; GPRIDX-NEXT: s_cmp_eq_u32 s6, 1 ; GPRIDX-NEXT: s_cselect_b64 s[4:5], 2.0, 1.0 ; GPRIDX-NEXT: s_cmp_eq_u32 s6, 2 -; GPRIDX-NEXT: s_cselect_b64 s[0:1], 
s[0:1], s[4:5] +; GPRIDX-NEXT: s_cselect_b64 s[2:3], s[2:3], s[4:5] ; GPRIDX-NEXT: s_cmp_eq_u32 s6, 3 -; GPRIDX-NEXT: s_cselect_b64 s[0:1], 4.0, s[0:1] -; GPRIDX-NEXT: v_mov_b32_e32 v0, s0 -; GPRIDX-NEXT: v_mov_b32_e32 v1, s1 -; GPRIDX-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GPRIDX-NEXT: s_cselect_b64 s[2:3], 4.0, s[2:3] +; GPRIDX-NEXT: v_mov_b32_e32 v0, s2 +; GPRIDX-NEXT: v_mov_b32_e32 v1, s3 +; GPRIDX-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GPRIDX-NEXT: s_endpgm ; ; MOVREL-LABEL: dyn_extract_v4f64_s_s_s: @@ -3924,21 +3924,21 @@ ; MOVREL-NEXT: runtime_loader_kernel_symbol = 0 ; MOVREL-NEXT: .end_amd_kernel_code_t ; MOVREL-NEXT: ; %bb.0: ; %entry -; MOVREL-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; MOVREL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; MOVREL-NEXT: s_load_dword s6, s[4:5], 0x8 -; MOVREL-NEXT: s_mov_b32 s0, 0 -; MOVREL-NEXT: s_mov_b32 s1, 0x40080000 +; MOVREL-NEXT: s_mov_b32 s2, 0 +; MOVREL-NEXT: s_mov_b32 s3, 0x40080000 ; MOVREL-NEXT: s_waitcnt lgkmcnt(0) -; MOVREL-NEXT: v_mov_b32_e32 v2, s2 +; MOVREL-NEXT: v_mov_b32_e32 v3, s1 ; MOVREL-NEXT: s_cmp_eq_u32 s6, 1 ; MOVREL-NEXT: s_cselect_b64 s[4:5], 2.0, 1.0 ; MOVREL-NEXT: s_cmp_eq_u32 s6, 2 -; MOVREL-NEXT: s_cselect_b64 s[0:1], s[0:1], s[4:5] +; MOVREL-NEXT: s_cselect_b64 s[2:3], s[2:3], s[4:5] ; MOVREL-NEXT: s_cmp_eq_u32 s6, 3 -; MOVREL-NEXT: s_cselect_b64 s[0:1], 4.0, s[0:1] -; MOVREL-NEXT: v_mov_b32_e32 v0, s0 -; MOVREL-NEXT: v_mov_b32_e32 v1, s1 -; MOVREL-NEXT: v_mov_b32_e32 v3, s3 +; MOVREL-NEXT: s_cselect_b64 s[2:3], 4.0, s[2:3] +; MOVREL-NEXT: v_mov_b32_e32 v0, s2 +; MOVREL-NEXT: v_mov_b32_e32 v1, s3 +; MOVREL-NEXT: v_mov_b32_e32 v2, s0 ; MOVREL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; MOVREL-NEXT: s_endpgm ; Index: llvm/test/CodeGen/AMDGPU/GlobalISel/floor.f64.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/floor.f64.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/floor.f64.ll @@ -7,8 +7,8 @@ ; GFX6-LABEL: v_floor_f64_ieee: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_fract_f64_e32 v[2:3], v[0:1] ; GFX6-NEXT: s_mov_b32 s4, -1 +; GFX6-NEXT: v_fract_f64_e32 v[2:3], v[0:1] ; GFX6-NEXT: s_mov_b32 s5, 0x3fefffff ; GFX6-NEXT: v_min_f64 v[2:3], v[2:3], s[4:5] ; GFX6-NEXT: v_cmp_o_f64_e32 vcc, v[0:1], v[0:1] @@ -30,8 +30,8 @@ ; GFX6-LABEL: v_floor_f64_ieee_nnan: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_fract_f64_e32 v[2:3], v[0:1] ; GFX6-NEXT: s_mov_b32 s4, -1 +; GFX6-NEXT: v_fract_f64_e32 v[2:3], v[0:1] ; GFX6-NEXT: s_mov_b32 s5, 0x3fefffff ; GFX6-NEXT: v_min_f64 v[2:3], v[2:3], s[4:5] ; GFX6-NEXT: v_add_f64 v[0:1], v[0:1], -v[2:3] @@ -50,8 +50,8 @@ ; GFX6-LABEL: v_floor_f64_ieee_fneg: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_fract_f64_e64 v[2:3], -v[0:1] ; GFX6-NEXT: s_mov_b32 s4, -1 +; GFX6-NEXT: v_fract_f64_e64 v[2:3], -v[0:1] ; GFX6-NEXT: s_mov_b32 s5, 0x3fefffff ; GFX6-NEXT: v_min_f64 v[2:3], v[2:3], s[4:5] ; GFX6-NEXT: v_cmp_o_f64_e32 vcc, v[0:1], v[0:1] @@ -74,8 +74,8 @@ ; GFX6-LABEL: v_floor_f64_nonieee: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_fract_f64_e32 v[2:3], v[0:1] ; GFX6-NEXT: s_mov_b32 s4, -1 +; GFX6-NEXT: v_fract_f64_e32 v[2:3], v[0:1] ; GFX6-NEXT: s_mov_b32 s5, 0x3fefffff ; GFX6-NEXT: v_min_f64 v[2:3], v[2:3], s[4:5] ; GFX6-NEXT: v_cmp_o_f64_e32 vcc, v[0:1], v[0:1] @@ -97,8 +97,8 @@ ; GFX6-LABEL: v_floor_f64_nonieee_nnan: ; GFX6: ; %bb.0: ; GFX6-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_fract_f64_e32 v[2:3], v[0:1] ; GFX6-NEXT: s_mov_b32 s4, -1 +; GFX6-NEXT: v_fract_f64_e32 v[2:3], v[0:1] ; GFX6-NEXT: s_mov_b32 s5, 0x3fefffff ; GFX6-NEXT: v_min_f64 v[2:3], v[2:3], s[4:5] ; GFX6-NEXT: v_add_f64 v[0:1], v[0:1], -v[2:3] @@ -117,8 +117,8 @@ ; GFX6-LABEL: v_floor_f64_non_ieee_fneg: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_fract_f64_e64 v[2:3], -v[0:1] ; GFX6-NEXT: s_mov_b32 s4, -1 +; GFX6-NEXT: v_fract_f64_e64 v[2:3], -v[0:1] ; GFX6-NEXT: s_mov_b32 s5, 0x3fefffff ; GFX6-NEXT: v_min_f64 v[2:3], v[2:3], s[4:5] ; GFX6-NEXT: v_cmp_o_f64_e32 vcc, v[0:1], v[0:1] @@ -141,8 +141,8 @@ ; GFX6-LABEL: v_floor_f64_fabs: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_fract_f64_e64 v[2:3], |v[0:1]| ; GFX6-NEXT: s_mov_b32 s4, -1 +; GFX6-NEXT: v_fract_f64_e64 v[2:3], |v[0:1]| ; GFX6-NEXT: s_mov_b32 s5, 0x3fefffff ; GFX6-NEXT: v_min_f64 v[2:3], v[2:3], s[4:5] ; GFX6-NEXT: v_cmp_o_f64_e32 vcc, v[0:1], v[0:1] @@ -170,8 +170,8 @@ ; GFX6-LABEL: v_floor_f64_fneg_fabs: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_fract_f64_e64 v[2:3], -|v[0:1]| ; GFX6-NEXT: s_mov_b32 s4, -1 +; GFX6-NEXT: v_fract_f64_e64 v[2:3], -|v[0:1]| ; GFX6-NEXT: s_mov_b32 s5, 0x3fefffff ; GFX6-NEXT: v_min_f64 v[2:3], v[2:3], s[4:5] ; GFX6-NEXT: v_cmp_o_f64_e32 vcc, v[0:1], v[0:1] @@ -194,8 +194,8 @@ define amdgpu_ps <2 x float> @s_floor_f64(double inreg %x) { ; GFX6-LABEL: s_floor_f64: ; GFX6: ; %bb.0: -; GFX6-NEXT: v_fract_f64_e32 v[0:1], s[2:3] ; GFX6-NEXT: s_mov_b32 s0, -1 +; GFX6-NEXT: v_fract_f64_e32 v[0:1], s[2:3] ; GFX6-NEXT: s_mov_b32 s1, 0x3fefffff ; GFX6-NEXT: v_min_f64 v[0:1], v[0:1], s[0:1] ; GFX6-NEXT: v_cmp_o_f64_e64 vcc, s[2:3], s[2:3] @@ -218,8 +218,8 @@ define amdgpu_ps <2 x float> @s_floor_f64_fneg(double inreg %x) { ; GFX6-LABEL: s_floor_f64_fneg: ; GFX6: ; %bb.0: -; GFX6-NEXT: v_fract_f64_e64 v[0:1], -s[2:3] ; GFX6-NEXT: s_mov_b32 s0, -1 +; GFX6-NEXT: v_fract_f64_e64 v[0:1], -s[2:3] ; GFX6-NEXT: s_mov_b32 s1, 0x3fefffff ; GFX6-NEXT: v_min_f64 v[0:1], v[0:1], s[0:1] ; GFX6-NEXT: v_cmp_o_f64_e64 vcc, s[2:3], s[2:3] @@ -243,8 +243,8 @@ define amdgpu_ps <2 x float> @s_floor_f64_fabs(double inreg %x) { ; GFX6-LABEL: s_floor_f64_fabs: ; GFX6: ; %bb.0: -; GFX6-NEXT: v_fract_f64_e64 v[0:1], |s[2:3]| ; GFX6-NEXT: s_mov_b32 s0, -1 +; GFX6-NEXT: v_fract_f64_e64 v[0:1], |s[2:3]| ; GFX6-NEXT: s_mov_b32 s1, 0x3fefffff ; GFX6-NEXT: v_min_f64 v[0:1], v[0:1], s[0:1] ; GFX6-NEXT: v_cmp_o_f64_e64 vcc, s[2:3], s[2:3] @@ -268,8 +268,8 @@ define amdgpu_ps <2 x float> @s_floor_f64_fneg_fabs(double inreg %x) { ; GFX6-LABEL: s_floor_f64_fneg_fabs: ; GFX6: ; %bb.0: -; GFX6-NEXT: v_fract_f64_e64 v[0:1], -|s[2:3]| ; GFX6-NEXT: s_mov_b32 s0, -1 +; GFX6-NEXT: v_fract_f64_e64 v[0:1], -|s[2:3]| ; GFX6-NEXT: s_mov_b32 s1, 0x3fefffff ; GFX6-NEXT: v_min_f64 v[0:1], v[0:1], s[0:1] ; GFX6-NEXT: v_cmp_o_f64_e64 vcc, s[2:3], s[2:3] Index: llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll @@ -754,17 +754,17 @@ ; GPRIDX: ; %bb.0: ; %entry ; GPRIDX-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GPRIDX-NEXT: s_mov_b32 s18, 0 +; GPRIDX-NEXT: s_mov_b32 s16, 0 +; GPRIDX-NEXT: s_mov_b32 s14, 0 +; GPRIDX-NEXT: s_mov_b32 s12, 0 +; GPRIDX-NEXT: s_mov_b32 s8, 0 ; GPRIDX-NEXT: s_mov_b64 s[4:5], 1.0 ; 
GPRIDX-NEXT: s_mov_b32 s19, 0x40200000 ; GPRIDX-NEXT: s_mov_b32 s17, 0x401c0000 -; GPRIDX-NEXT: s_mov_b32 s16, s18 ; GPRIDX-NEXT: s_mov_b32 s15, 0x40180000 -; GPRIDX-NEXT: s_mov_b32 s14, s18 ; GPRIDX-NEXT: s_mov_b32 s13, 0x40140000 -; GPRIDX-NEXT: s_mov_b32 s12, s18 ; GPRIDX-NEXT: s_mov_b64 s[10:11], 4.0 ; GPRIDX-NEXT: s_mov_b32 s9, 0x40080000 -; GPRIDX-NEXT: s_mov_b32 s8, s18 ; GPRIDX-NEXT: s_mov_b64 s[6:7], 2.0 ; GPRIDX-NEXT: v_mov_b32_e32 v3, s4 ; GPRIDX-NEXT: v_mov_b32_e32 v4, s5 @@ -821,17 +821,17 @@ ; MOVREL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; MOVREL-NEXT: s_waitcnt_vscnt null, 0x0 ; MOVREL-NEXT: s_mov_b32 s18, 0 +; MOVREL-NEXT: s_mov_b32 s16, 0 +; MOVREL-NEXT: s_mov_b32 s14, 0 +; MOVREL-NEXT: s_mov_b32 s12, 0 +; MOVREL-NEXT: s_mov_b32 s8, 0 ; MOVREL-NEXT: s_mov_b64 s[4:5], 1.0 ; MOVREL-NEXT: s_mov_b32 s19, 0x40200000 ; MOVREL-NEXT: s_mov_b32 s17, 0x401c0000 -; MOVREL-NEXT: s_mov_b32 s16, s18 ; MOVREL-NEXT: s_mov_b32 s15, 0x40180000 -; MOVREL-NEXT: s_mov_b32 s14, s18 ; MOVREL-NEXT: s_mov_b32 s13, 0x40140000 -; MOVREL-NEXT: s_mov_b32 s12, s18 ; MOVREL-NEXT: s_mov_b64 s[10:11], 4.0 ; MOVREL-NEXT: s_mov_b32 s9, 0x40080000 -; MOVREL-NEXT: s_mov_b32 s8, s18 ; MOVREL-NEXT: s_mov_b64 s[6:7], 2.0 ; MOVREL-NEXT: v_mov_b32_e32 v3, s4 ; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 Index: llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fconstant.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fconstant.mir +++ llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fconstant.mir @@ -96,16 +96,12 @@ bb.0: ; GCN-LABEL: name: fconstant_s_s64 ; GCN: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 4607182418800017408 - ; GCN: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GCN: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 1075838976 - ; GCN: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 + ; GCN: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 4620693217682128896 ; GCN: [[S_MOV_B64_1:%[0-9]+]]:sreg_64 = S_MOV_B64 -4611686018427387904 - ; GCN: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GCN: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 -1071382528 - ; GCN: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_2]], %subreg.sub0, [[S_MOV_B32_3]], %subreg.sub1 + ; GCN: [[S_MOV_B1:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -4601552919265804288 ; GCN: $sgpr0_sgpr1 = COPY [[S_MOV_B64_]] - ; GCN: $sgpr2_sgpr3 = COPY [[REG_SEQUENCE]] - ; GCN: S_ENDPGM 0, implicit [[S_MOV_B64_]], implicit [[REG_SEQUENCE]], implicit [[S_MOV_B64_1]], implicit [[REG_SEQUENCE1]] + ; GCN: $sgpr2_sgpr3 = COPY [[S_MOV_B]] + ; GCN: S_ENDPGM 0, implicit [[S_MOV_B64_]], implicit [[S_MOV_B]], implicit [[S_MOV_B64_1]], implicit [[S_MOV_B1]] %0:sgpr(s64) = G_FCONSTANT double 1.0 %1:sgpr(s64) = G_FCONSTANT double 8.0 %2:sgpr(s64) = G_FCONSTANT double -2.0 Index: llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fract.f64.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fract.f64.mir +++ llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fract.f64.mir @@ -18,10 +18,8 @@ ; CHECK: [[COPY1:%[0-9]+]]:sreg_64 = COPY [[S_LOAD_DWORDX4_IMM]].sub0_sub1 ; CHECK: [[COPY2:%[0-9]+]]:sreg_64 = COPY [[S_LOAD_DWORDX4_IMM]].sub2_sub3 ; CHECK: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY2]], 0, 0 :: (load 8, addrspace 1) - ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; CHECK: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 
= S_MOV_B32 -2147483648 - ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 - ; CHECK: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]] + ; CHECK: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -9223372036854775808 + ; CHECK: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[S_MOV_B]] ; CHECK: [[COPY4:%[0-9]+]]:vreg_64 = COPY [[S_LOAD_DWORDX2_IMM]] ; CHECK: %12:vreg_64 = nofpexcept V_ADD_F64_e64 0, [[COPY3]], 1, [[COPY4]], 0, 0, implicit $mode, implicit $exec ; CHECK: %15:vreg_64 = nofpexcept V_FRACT_F64_e64 0, %12, 0, 0, implicit $mode, implicit $exec @@ -67,10 +65,8 @@ ; CHECK: [[COPY1:%[0-9]+]]:sreg_64 = COPY [[S_LOAD_DWORDX4_IMM]].sub0_sub1 ; CHECK: [[COPY2:%[0-9]+]]:sreg_64 = COPY [[S_LOAD_DWORDX4_IMM]].sub2_sub3 ; CHECK: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY2]], 0, 0 :: (load 8, addrspace 1) - ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; CHECK: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -2147483648 - ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 - ; CHECK: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]] + ; CHECK: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -9223372036854775808 + ; CHECK: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[S_MOV_B]] ; CHECK: [[COPY4:%[0-9]+]]:vreg_64 = COPY [[S_LOAD_DWORDX2_IMM]] ; CHECK: %13:vreg_64 = nofpexcept V_ADD_F64_e64 0, [[COPY3]], 3, [[COPY4]], 0, 0, implicit $mode, implicit $exec ; CHECK: %16:vreg_64 = nofpexcept V_FRACT_F64_e64 0, %13, 0, 0, implicit $mode, implicit $exec Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.rsq.clamp.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.rsq.clamp.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.rsq.clamp.ll @@ -53,6 +53,7 @@ ; VI-NEXT: s_mov_b32 s4, -1 ; VI-NEXT: s_mov_b32 s5, 0x7fefffff ; VI-NEXT: v_min_f64 v[0:1], v[0:1], s[4:5] +; VI-NEXT: s_mov_b32 s4, -1 ; VI-NEXT: s_mov_b32 s5, 0xffefffff ; VI-NEXT: v_max_f64 v[0:1], v[0:1], s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] @@ -74,6 +75,7 @@ ; VI-NEXT: s_mov_b32 s4, -1 ; VI-NEXT: s_mov_b32 s5, 0x7fefffff ; VI-NEXT: v_min_f64 v[0:1], v[0:1], s[4:5] +; VI-NEXT: s_mov_b32 s4, -1 ; VI-NEXT: s_mov_b32 s5, 0xffefffff ; VI-NEXT: v_max_f64 v[0:1], v[0:1], s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] @@ -114,6 +116,7 @@ ; VI-NEXT: s_mov_b32 s4, -1 ; VI-NEXT: s_mov_b32 s5, 0x7fefffff ; VI-NEXT: v_min_f64 v[0:1], v[0:1], s[4:5] +; VI-NEXT: s_mov_b32 s4, -1 ; VI-NEXT: s_mov_b32 s5, 0xffefffff ; VI-NEXT: v_max_f64 v[0:1], v[0:1], s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] @@ -153,6 +156,7 @@ ; VI-NEXT: s_mov_b32 s4, -1 ; VI-NEXT: s_mov_b32 s5, 0x7fefffff ; VI-NEXT: v_min_f64 v[0:1], v[0:1], s[4:5] +; VI-NEXT: s_mov_b32 s4, -1 ; VI-NEXT: s_mov_b32 s5, 0xffefffff ; VI-NEXT: v_max_f64 v[0:1], v[0:1], s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] Index: llvm/test/CodeGen/AMDGPU/GlobalISel/roundeven.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/roundeven.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/roundeven.ll @@ -568,8 +568,8 @@ ; GFX6-NEXT: v_mov_b32_e32 v3, 0 ; GFX6-NEXT: v_or_b32_e32 v4, 0x43300000, v4 ; GFX6-NEXT: v_add_f64 v[5:6], -v[0:1], v[3:4] -; GFX6-NEXT: v_mov_b32_e32 v1, v0 ; GFX6-NEXT: s_mov_b32 s4, -1 +; GFX6-NEXT: v_mov_b32_e32 v1, v0 ; GFX6-NEXT: s_mov_b32 s5, 0x432fffff ; GFX6-NEXT: v_add_f64 v[3:4], v[5:6], -v[3:4] ; GFX6-NEXT: v_cmp_gt_f64_e64 vcc, 
|v[1:2]|, s[4:5] Index: llvm/test/CodeGen/AMDGPU/fp_to_sint.f64.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/fp_to_sint.f64.ll +++ llvm/test/CodeGen/AMDGPU/fp_to_sint.f64.ll @@ -33,7 +33,7 @@ } ; FUNC-LABEL: @fp_to_sint_i64_f64 -; CI-DAG: buffer_load_dwordx2 [[VAL:v\[[0-9]+:[0-9]+\]]] +; CI: buffer_load_dwordx2 [[VAL:v\[[0-9]+:[0-9]+\]]] ; CI-DAG: v_trunc_f64_e32 [[TRUNC:v\[[0-9]+:[0-9]+\]]], [[VAL]] ; CI-DAG: s_mov_b32 s[[K0_LO:[0-9]+]], 0{{$}} ; CI-DAG: s_mov_b32 s[[K0_HI:[0-9]+]], 0x3df00000 Index: llvm/test/CodeGen/AMDGPU/fp_to_uint.f64.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/fp_to_uint.f64.ll +++ llvm/test/CodeGen/AMDGPU/fp_to_uint.f64.ll @@ -33,7 +33,7 @@ } ; FUNC-LABEL: @fp_to_uint_i64_f64 -; CI-DAG: buffer_load_dwordx2 [[VAL:v\[[0-9]+:[0-9]+\]]] +; CI: buffer_load_dwordx2 [[VAL:v\[[0-9]+:[0-9]+\]]] ; CI-DAG: v_trunc_f64_e32 [[TRUNC:v\[[0-9]+:[0-9]+\]]], [[VAL]] ; CI-DAG: s_mov_b32 s[[K0_LO:[0-9]+]], 0{{$}} ; CI-DAG: s_mov_b32 s[[K0_HI:[0-9]+]], 0x3df00000 Index: llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.clamp.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.clamp.ll +++ llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.clamp.ll @@ -22,9 +22,7 @@ ; FUNC-LABEL: {{^}}rsq_clamp_f64: ; SI: v_rsq_clamp_f64_e32 -; TODO: this constant should be folded: -; VI-DAG: s_mov_b32 [[NEG1:s[0-9]+]], -1 -; VI-DAG: s_mov_b32 s[[LOW1:[0-9]+]], [[NEG1]] +; VI-DAG: s_mov_b32 s[[LOW1:[0-9]+]], -1 ; VI-DAG: s_mov_b32 s[[HIGH1:[0-9]+]], 0x7fefffff ; VI-DAG: s_mov_b32 s[[HIGH2:[0-9]+]], 0xffefffff ; VI-DAG: v_rsq_f64_e32 [[RSQ:v\[[0-9]+:[0-9]+\]]], s[{{[0-9]+:[0-9]+}} Index: llvm/test/CodeGen/AMDGPU/remat-fp64-constants.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/AMDGPU/remat-fp64-constants.ll @@ -0,0 +1,45 @@ +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -O3 -verify-machineinstrs --stress-regalloc=10 < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -O3 -verify-machineinstrs --stress-regalloc=10 < %s | FileCheck -check-prefix=GCN %s + +; GCN-LABEL: {{^}}test_remat_sgpr: +; GCN-NOT: v_writelane_b32 +; GCN: {{^}}[[LOOP:BB[0-9_]+]]: +; GCN-COUNT-6: s_mov_b32 s{{[0-9]+}}, 0x +; GCN-NOT: v_writelane_b32 +; GCN: s_cbranch_{{[^ ]+}} [[LOOP]] +; GCN: .sgpr_spill_count: 0 +define amdgpu_kernel void @test_remat_sgpr(double addrspace(1)* %arg, double addrspace(1)* %arg1) { +bb: + %i = tail call i32 @llvm.amdgcn.workitem.id.x() + br label %bb3 + +bb2: ; preds = %bb3 + ret void + +bb3: ; preds = %bb3, %bb + %i4 = phi i32 [ 0, %bb ], [ %i22, %bb3 ] + %i5 = add nuw nsw i32 %i4, %i + %i6 = zext i32 %i5 to i64 + %i7 = getelementptr inbounds double, double addrspace(1)* %arg, i64 %i6 + %i8 = load double, double addrspace(1)* %i7, align 8 + %i9 = fadd double %i8, 0x3EFC01997CC9E6B0 + %i10 = tail call double @llvm.fma.f64(double %i8, double %i9, double 0x3FBE25E43ABE935A) + %i11 = tail call double @llvm.fma.f64(double %i10, double %i9, double 0x3FC110EF47E6C9C2) + %i12 = tail call double @llvm.fma.f64(double %i11, double %i9, double 0x3FC3B13BCFA74449) + %i13 = tail call double @llvm.fma.f64(double %i12, double %i9, double 0x3FC745D171BF3C30) + %i14 = tail call double @llvm.fma.f64(double %i13, double %i9, double 0x3FCC71C71C7792CE) + %i15 = tail call double @llvm.fma.f64(double %i14, double %i9, double 0x3FD24924924920DA) + %i16 = tail 
call double @llvm.fma.f64(double %i15, double %i9, double 0x3FD999999999999C) + %i17 = tail call double @llvm.fma.f64(double %i16, double %i9, double 0x3FD899999999899C) + %i18 = tail call double @llvm.fma.f64(double %i17, double %i9, double 0x3FD799999999799C) + %i19 = tail call double @llvm.fma.f64(double %i18, double %i9, double 0x3FD699999999699C) + %i20 = tail call double @llvm.fma.f64(double %i19, double %i9, double 0x3FD599999999599C) + %i21 = getelementptr inbounds double, double addrspace(1)* %arg1, i64 %i6 + store double %i19, double addrspace(1)* %i21, align 8 + %i22 = add nuw nsw i32 %i4, 1 + %i23 = icmp eq i32 %i22, 1024 + br i1 %i23, label %bb2, label %bb3 +} + +declare double @llvm.fma.f64(double, double, double) +declare i32 @llvm.amdgcn.workitem.id.x() Index: llvm/test/CodeGen/AMDGPU/sdiv64.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/sdiv64.ll +++ llvm/test/CodeGen/AMDGPU/sdiv64.ll @@ -422,63 +422,63 @@ ; GCN-IR-NEXT: v_mov_b32_e32 v1, v5 ; GCN-IR-NEXT: v_cndmask_b32_e64 v12, v10, 0, s[6:7] ; GCN-IR-NEXT: s_and_b64 s[4:5], s[8:9], s[4:5] -; GCN-IR-NEXT: v_mov_b32_e32 v15, v18 +; GCN-IR-NEXT: v_mov_b32_e32 v17, v18 ; GCN-IR-NEXT: v_cndmask_b32_e64 v0, v9, 0, s[6:7] ; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] ; GCN-IR-NEXT: s_cbranch_execz BB1_6 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 -; GCN-IR-NEXT: v_add_i32_e32 v16, vcc, 1, v7 -; GCN-IR-NEXT: v_addc_u32_e32 v17, vcc, 0, v8, vcc +; GCN-IR-NEXT: v_add_i32_e32 v15, vcc, 1, v7 +; GCN-IR-NEXT: v_addc_u32_e32 v16, vcc, 0, v8, vcc ; GCN-IR-NEXT: v_sub_i32_e64 v0, s[4:5], 63, v7 -; GCN-IR-NEXT: v_cmp_ge_u64_e32 vcc, v[16:17], v[7:8] +; GCN-IR-NEXT: v_cmp_ge_u64_e32 vcc, v[15:16], v[7:8] ; GCN-IR-NEXT: v_mov_b32_e32 v11, 0 ; GCN-IR-NEXT: v_lshl_b64 v[7:8], v[9:10], v0 -; GCN-IR-NEXT: s_mov_b64 s[8:9], 0 ; GCN-IR-NEXT: v_mov_b32_e32 v12, 0 ; GCN-IR-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-IR-NEXT: s_xor_b64 s[10:11], exec, s[4:5] +; GCN-IR-NEXT: s_xor_b64 s[8:9], exec, s[4:5] ; GCN-IR-NEXT: s_cbranch_execz BB1_5 ; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader ; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, -1, v2 -; GCN-IR-NEXT: v_lshr_b64 v[16:17], v[9:10], v16 +; GCN-IR-NEXT: v_lshr_b64 v[15:16], v[9:10], v15 ; GCN-IR-NEXT: v_addc_u32_e32 v9, vcc, -1, v3, vcc ; GCN-IR-NEXT: v_not_b32_e32 v10, v13 ; GCN-IR-NEXT: v_not_b32_e32 v11, v18 ; GCN-IR-NEXT: v_add_i32_e32 v13, vcc, v10, v14 +; GCN-IR-NEXT: v_addc_u32_e32 v14, vcc, v11, v17, vcc +; GCN-IR-NEXT: v_mov_b32_e32 v17, 0 +; GCN-IR-NEXT: s_mov_b64 s[10:11], 0 ; GCN-IR-NEXT: v_mov_b32_e32 v18, 0 -; GCN-IR-NEXT: v_addc_u32_e32 v14, vcc, v11, v15, vcc -; GCN-IR-NEXT: v_mov_b32_e32 v19, 0 ; GCN-IR-NEXT: BB1_3: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN-IR-NEXT: v_lshl_b64 v[16:17], v[16:17], 1 +; GCN-IR-NEXT: v_lshl_b64 v[15:16], v[15:16], 1 ; GCN-IR-NEXT: v_lshrrev_b32_e32 v10, 31, v8 -; GCN-IR-NEXT: v_or_b32_e32 v10, v16, v10 +; GCN-IR-NEXT: v_or_b32_e32 v10, v15, v10 ; GCN-IR-NEXT: v_lshl_b64 v[7:8], v[7:8], 1 ; GCN-IR-NEXT: v_sub_i32_e32 v11, vcc, v0, v10 -; GCN-IR-NEXT: v_subb_u32_e32 v11, vcc, v9, v17, vcc -; GCN-IR-NEXT: v_or_b32_e32 v7, v18, v7 -; GCN-IR-NEXT: v_add_i32_e32 v18, vcc, 1, v13 +; GCN-IR-NEXT: v_subb_u32_e32 v11, vcc, v9, v16, vcc +; GCN-IR-NEXT: v_or_b32_e32 v7, v17, v7 +; GCN-IR-NEXT: v_add_i32_e32 v17, vcc, 1, v13 ; GCN-IR-NEXT: v_ashrrev_i32_e32 v15, 31, v11 -; GCN-IR-NEXT: v_or_b32_e32 v8, v19, v8 -; GCN-IR-NEXT: v_addc_u32_e32 v19, vcc, 0, v14, vcc -; GCN-IR-NEXT: v_cmp_lt_u64_e32 
vcc, v[18:19], v[13:14] -; GCN-IR-NEXT: v_mov_b32_e32 v13, v18 +; GCN-IR-NEXT: v_or_b32_e32 v8, v18, v8 +; GCN-IR-NEXT: v_addc_u32_e32 v18, vcc, 0, v14, vcc +; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, v[17:18], v[13:14] +; GCN-IR-NEXT: v_mov_b32_e32 v13, v17 ; GCN-IR-NEXT: v_mov_b32_e32 v12, 0 ; GCN-IR-NEXT: v_and_b32_e32 v11, 1, v15 -; GCN-IR-NEXT: v_and_b32_e32 v20, v15, v3 +; GCN-IR-NEXT: v_and_b32_e32 v19, v15, v3 ; GCN-IR-NEXT: v_and_b32_e32 v15, v15, v2 -; GCN-IR-NEXT: v_sub_i32_e64 v16, s[4:5], v10, v15 -; GCN-IR-NEXT: v_mov_b32_e32 v14, v19 -; GCN-IR-NEXT: v_mov_b32_e32 v19, v12 -; GCN-IR-NEXT: v_subb_u32_e64 v17, s[4:5], v17, v20, s[4:5] -; GCN-IR-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GCN-IR-NEXT: v_mov_b32_e32 v18, v11 -; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GCN-IR-NEXT: v_sub_i32_e64 v15, s[4:5], v10, v15 +; GCN-IR-NEXT: v_mov_b32_e32 v14, v18 +; GCN-IR-NEXT: v_mov_b32_e32 v18, v12 +; GCN-IR-NEXT: v_subb_u32_e64 v16, s[4:5], v16, v19, s[4:5] +; GCN-IR-NEXT: s_or_b64 s[10:11], vcc, s[10:11] +; GCN-IR-NEXT: v_mov_b32_e32 v17, v11 +; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[10:11] ; GCN-IR-NEXT: s_cbranch_execnz BB1_3 ; GCN-IR-NEXT: ; %bb.4: ; %Flow -; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9] -; GCN-IR-NEXT: BB1_5: ; %Flow3 ; GCN-IR-NEXT: s_or_b64 exec, exec, s[10:11] +; GCN-IR-NEXT: BB1_5: ; %Flow3 +; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9] ; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[7:8], 1 ; GCN-IR-NEXT: v_or_b32_e32 v12, v12, v3 ; GCN-IR-NEXT: v_or_b32_e32 v0, v11, v2 @@ -1492,68 +1492,68 @@ ; GCN-IR-NEXT: v_cndmask_b32_e64 v6, 24, 0, s[4:5] ; GCN-IR-NEXT: s_xor_b64 s[4:5], s[4:5], -1 ; GCN-IR-NEXT: v_mov_b32_e32 v3, v2 -; GCN-IR-NEXT: v_mov_b32_e32 v7, v11 +; GCN-IR-NEXT: v_mov_b32_e32 v8, v11 ; GCN-IR-NEXT: s_and_b64 s[4:5], s[4:5], vcc ; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] ; GCN-IR-NEXT: s_cbranch_execz BB11_6 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 -; GCN-IR-NEXT: v_add_i32_e32 v8, vcc, 1, v4 -; GCN-IR-NEXT: v_addc_u32_e32 v9, vcc, 0, v5, vcc -; GCN-IR-NEXT: v_cmp_ge_u64_e32 vcc, v[8:9], v[4:5] +; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, 1, v4 +; GCN-IR-NEXT: v_addc_u32_e32 v7, vcc, 0, v5, vcc +; GCN-IR-NEXT: v_cmp_ge_u64_e32 vcc, v[6:7], v[4:5] ; GCN-IR-NEXT: v_sub_i32_e64 v4, s[4:5], 63, v4 -; GCN-IR-NEXT: v_mov_b32_e32 v6, 0 -; GCN-IR-NEXT: v_lshl_b64 v[4:5], 24, v4 -; GCN-IR-NEXT: s_mov_b64 s[8:9], 0 ; GCN-IR-NEXT: v_mov_b32_e32 v7, 0 +; GCN-IR-NEXT: v_lshl_b64 v[4:5], 24, v4 +; GCN-IR-NEXT: v_mov_b32_e32 v8, 0 ; GCN-IR-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-IR-NEXT: s_xor_b64 s[10:11], exec, s[4:5] +; GCN-IR-NEXT: s_xor_b64 s[8:9], exec, s[4:5] ; GCN-IR-NEXT: s_cbranch_execz BB11_5 ; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader -; GCN-IR-NEXT: v_lshr_b64 v[12:13], 24, v8 -; GCN-IR-NEXT: v_add_i32_e32 v8, vcc, -1, v0 +; GCN-IR-NEXT: v_lshr_b64 v[12:13], 24, v6 +; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, -1, v0 ; GCN-IR-NEXT: v_addc_u32_e32 v9, vcc, -1, v1, vcc ; GCN-IR-NEXT: v_sub_i32_e32 v10, vcc, 58, v10 ; GCN-IR-NEXT: v_mov_b32_e32 v14, 0 ; GCN-IR-NEXT: v_subb_u32_e32 v11, vcc, 0, v11, vcc +; GCN-IR-NEXT: s_mov_b64 s[10:11], 0 ; GCN-IR-NEXT: v_mov_b32_e32 v15, 0 ; GCN-IR-NEXT: BB11_3: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN-IR-NEXT: v_lshl_b64 v[12:13], v[12:13], 1 -; GCN-IR-NEXT: v_lshrrev_b32_e32 v6, 31, v5 -; GCN-IR-NEXT: v_or_b32_e32 v12, v12, v6 +; GCN-IR-NEXT: v_lshrrev_b32_e32 v7, 31, v5 +; GCN-IR-NEXT: v_or_b32_e32 v12, v12, v7 ; GCN-IR-NEXT: v_lshl_b64 v[4:5], v[4:5], 1 -; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, v8, v12 -; GCN-IR-NEXT: 
v_subb_u32_e32 v6, vcc, v9, v13, vcc +; GCN-IR-NEXT: v_sub_i32_e32 v7, vcc, v6, v12 +; GCN-IR-NEXT: v_subb_u32_e32 v7, vcc, v9, v13, vcc ; GCN-IR-NEXT: v_or_b32_e32 v4, v14, v4 -; GCN-IR-NEXT: v_ashrrev_i32_e32 v14, 31, v6 +; GCN-IR-NEXT: v_ashrrev_i32_e32 v14, 31, v7 ; GCN-IR-NEXT: v_and_b32_e32 v17, v14, v0 -; GCN-IR-NEXT: v_and_b32_e32 v6, 1, v14 +; GCN-IR-NEXT: v_and_b32_e32 v7, 1, v14 ; GCN-IR-NEXT: v_and_b32_e32 v16, v14, v1 ; GCN-IR-NEXT: v_add_i32_e32 v14, vcc, 1, v10 ; GCN-IR-NEXT: v_or_b32_e32 v5, v15, v5 ; GCN-IR-NEXT: v_addc_u32_e32 v15, vcc, 0, v11, vcc ; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, v[14:15], v[10:11] ; GCN-IR-NEXT: v_mov_b32_e32 v10, v14 -; GCN-IR-NEXT: v_mov_b32_e32 v7, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v8, 0 ; GCN-IR-NEXT: v_sub_i32_e64 v12, s[4:5], v12, v17 ; GCN-IR-NEXT: v_mov_b32_e32 v11, v15 -; GCN-IR-NEXT: v_mov_b32_e32 v15, v7 +; GCN-IR-NEXT: v_mov_b32_e32 v15, v8 ; GCN-IR-NEXT: v_subb_u32_e64 v13, s[4:5], v13, v16, s[4:5] -; GCN-IR-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GCN-IR-NEXT: v_mov_b32_e32 v14, v6 -; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GCN-IR-NEXT: s_or_b64 s[10:11], vcc, s[10:11] +; GCN-IR-NEXT: v_mov_b32_e32 v14, v7 +; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[10:11] ; GCN-IR-NEXT: s_cbranch_execnz BB11_3 ; GCN-IR-NEXT: ; %bb.4: ; %Flow -; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9] -; GCN-IR-NEXT: BB11_5: ; %Flow3 ; GCN-IR-NEXT: s_or_b64 exec, exec, s[10:11] +; GCN-IR-NEXT: BB11_5: ; %Flow3 +; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9] ; GCN-IR-NEXT: v_lshl_b64 v[0:1], v[4:5], 1 -; GCN-IR-NEXT: v_or_b32_e32 v7, v7, v1 -; GCN-IR-NEXT: v_or_b32_e32 v6, v6, v0 +; GCN-IR-NEXT: v_or_b32_e32 v8, v8, v1 +; GCN-IR-NEXT: v_or_b32_e32 v6, v7, v0 ; GCN-IR-NEXT: BB11_6: ; %Flow4 ; GCN-IR-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN-IR-NEXT: v_xor_b32_e32 v0, v6, v2 -; GCN-IR-NEXT: v_xor_b32_e32 v1, v7, v3 +; GCN-IR-NEXT: v_xor_b32_e32 v1, v8, v3 ; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 ; GCN-IR-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc ; GCN-IR-NEXT: s_setpc_b64 s[30:31] @@ -1707,70 +1707,70 @@ ; GCN-IR-NEXT: s_xor_b64 s[4:5], s[4:5], -1 ; GCN-IR-NEXT: v_mov_b32_e32 v3, v2 ; GCN-IR-NEXT: s_mov_b32 s9, 0 -; GCN-IR-NEXT: v_mov_b32_e32 v7, v11 +; GCN-IR-NEXT: v_mov_b32_e32 v8, v11 ; GCN-IR-NEXT: s_and_b64 s[4:5], s[4:5], vcc ; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] ; GCN-IR-NEXT: s_cbranch_execz BB12_6 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 -; GCN-IR-NEXT: v_add_i32_e32 v8, vcc, 1, v4 -; GCN-IR-NEXT: v_addc_u32_e32 v9, vcc, 0, v5, vcc -; GCN-IR-NEXT: v_cmp_ge_u64_e32 vcc, v[8:9], v[4:5] +; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, 1, v4 +; GCN-IR-NEXT: v_addc_u32_e32 v7, vcc, 0, v5, vcc +; GCN-IR-NEXT: v_cmp_ge_u64_e32 vcc, v[6:7], v[4:5] ; GCN-IR-NEXT: v_sub_i32_e64 v4, s[4:5], 63, v4 -; GCN-IR-NEXT: v_lshl_b64 v[4:5], s[8:9], v4 -; GCN-IR-NEXT: v_mov_b32_e32 v6, 0 -; GCN-IR-NEXT: s_mov_b64 s[8:9], 0 ; GCN-IR-NEXT: v_mov_b32_e32 v7, 0 +; GCN-IR-NEXT: v_lshl_b64 v[4:5], s[8:9], v4 +; GCN-IR-NEXT: v_mov_b32_e32 v8, 0 ; GCN-IR-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-IR-NEXT: s_xor_b64 s[10:11], exec, s[4:5] +; GCN-IR-NEXT: s_xor_b64 s[8:9], exec, s[4:5] ; GCN-IR-NEXT: s_cbranch_execz BB12_5 ; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader ; GCN-IR-NEXT: s_mov_b32 s5, 0 ; GCN-IR-NEXT: s_mov_b32 s4, 0x8000 -; GCN-IR-NEXT: v_lshr_b64 v[12:13], s[4:5], v8 -; GCN-IR-NEXT: v_add_i32_e32 v8, vcc, -1, v0 +; GCN-IR-NEXT: v_lshr_b64 v[12:13], s[4:5], v6 +; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, -1, v0 ; GCN-IR-NEXT: v_addc_u32_e32 v9, vcc, -1, v1, vcc ; GCN-IR-NEXT: v_sub_i32_e32 v10, 
vcc, 47, v10 ; GCN-IR-NEXT: v_mov_b32_e32 v14, 0 ; GCN-IR-NEXT: v_subb_u32_e32 v11, vcc, 0, v11, vcc +; GCN-IR-NEXT: s_mov_b64 s[10:11], 0 ; GCN-IR-NEXT: v_mov_b32_e32 v15, 0 ; GCN-IR-NEXT: BB12_3: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN-IR-NEXT: v_lshl_b64 v[12:13], v[12:13], 1 -; GCN-IR-NEXT: v_lshrrev_b32_e32 v6, 31, v5 -; GCN-IR-NEXT: v_or_b32_e32 v12, v12, v6 +; GCN-IR-NEXT: v_lshrrev_b32_e32 v7, 31, v5 +; GCN-IR-NEXT: v_or_b32_e32 v12, v12, v7 ; GCN-IR-NEXT: v_lshl_b64 v[4:5], v[4:5], 1 -; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, v8, v12 -; GCN-IR-NEXT: v_subb_u32_e32 v6, vcc, v9, v13, vcc +; GCN-IR-NEXT: v_sub_i32_e32 v7, vcc, v6, v12 +; GCN-IR-NEXT: v_subb_u32_e32 v7, vcc, v9, v13, vcc ; GCN-IR-NEXT: v_or_b32_e32 v4, v14, v4 -; GCN-IR-NEXT: v_ashrrev_i32_e32 v14, 31, v6 +; GCN-IR-NEXT: v_ashrrev_i32_e32 v14, 31, v7 ; GCN-IR-NEXT: v_and_b32_e32 v17, v14, v0 -; GCN-IR-NEXT: v_and_b32_e32 v6, 1, v14 +; GCN-IR-NEXT: v_and_b32_e32 v7, 1, v14 ; GCN-IR-NEXT: v_and_b32_e32 v16, v14, v1 ; GCN-IR-NEXT: v_add_i32_e32 v14, vcc, 1, v10 ; GCN-IR-NEXT: v_or_b32_e32 v5, v15, v5 ; GCN-IR-NEXT: v_addc_u32_e32 v15, vcc, 0, v11, vcc ; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, v[14:15], v[10:11] ; GCN-IR-NEXT: v_mov_b32_e32 v10, v14 -; GCN-IR-NEXT: v_mov_b32_e32 v7, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v8, 0 ; GCN-IR-NEXT: v_sub_i32_e64 v12, s[4:5], v12, v17 ; GCN-IR-NEXT: v_mov_b32_e32 v11, v15 -; GCN-IR-NEXT: v_mov_b32_e32 v15, v7 +; GCN-IR-NEXT: v_mov_b32_e32 v15, v8 ; GCN-IR-NEXT: v_subb_u32_e64 v13, s[4:5], v13, v16, s[4:5] -; GCN-IR-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GCN-IR-NEXT: v_mov_b32_e32 v14, v6 -; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GCN-IR-NEXT: s_or_b64 s[10:11], vcc, s[10:11] +; GCN-IR-NEXT: v_mov_b32_e32 v14, v7 +; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[10:11] ; GCN-IR-NEXT: s_cbranch_execnz BB12_3 ; GCN-IR-NEXT: ; %bb.4: ; %Flow -; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9] -; GCN-IR-NEXT: BB12_5: ; %Flow3 ; GCN-IR-NEXT: s_or_b64 exec, exec, s[10:11] +; GCN-IR-NEXT: BB12_5: ; %Flow3 +; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9] ; GCN-IR-NEXT: v_lshl_b64 v[0:1], v[4:5], 1 -; GCN-IR-NEXT: v_or_b32_e32 v7, v7, v1 -; GCN-IR-NEXT: v_or_b32_e32 v6, v6, v0 +; GCN-IR-NEXT: v_or_b32_e32 v8, v8, v1 +; GCN-IR-NEXT: v_or_b32_e32 v6, v7, v0 ; GCN-IR-NEXT: BB12_6: ; %Flow4 ; GCN-IR-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN-IR-NEXT: v_xor_b32_e32 v0, v6, v2 -; GCN-IR-NEXT: v_xor_b32_e32 v1, v7, v3 +; GCN-IR-NEXT: v_xor_b32_e32 v1, v8, v3 ; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 ; GCN-IR-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc ; GCN-IR-NEXT: s_setpc_b64 s[30:31] @@ -1822,16 +1822,16 @@ ; GCN-IR-NEXT: v_sub_i32_e64 v3, s[4:5], 63, v3 ; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 ; GCN-IR-NEXT: v_lshl_b64 v[3:4], v[7:8], v3 -; GCN-IR-NEXT: s_mov_b64 s[8:9], 0 ; GCN-IR-NEXT: v_mov_b32_e32 v6, 0 ; GCN-IR-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-IR-NEXT: s_xor_b64 s[10:11], exec, s[4:5] +; GCN-IR-NEXT: s_xor_b64 s[8:9], exec, s[4:5] ; GCN-IR-NEXT: s_cbranch_execz BB13_5 ; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader ; GCN-IR-NEXT: v_lshr_b64 v[9:10], v[7:8], v9 ; GCN-IR-NEXT: v_add_i32_e32 v7, vcc, 0xffffffcf, v0 ; GCN-IR-NEXT: v_mov_b32_e32 v11, 0 ; GCN-IR-NEXT: v_addc_u32_e64 v8, s[4:5], 0, -1, vcc +; GCN-IR-NEXT: s_mov_b64 s[10:11], 0 ; GCN-IR-NEXT: v_mov_b32_e32 v12, 0 ; GCN-IR-NEXT: s_movk_i32 s12, 0x7fff ; GCN-IR-NEXT: BB13_3: ; %udiv-do-while @@ -1857,14 +1857,14 @@ ; GCN-IR-NEXT: v_mov_b32_e32 v13, 0 ; GCN-IR-NEXT: v_sub_i32_e64 v9, s[4:5], v0, v9 ; GCN-IR-NEXT: v_subb_u32_e64 
v10, s[4:5], v10, v13, s[4:5] -; GCN-IR-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GCN-IR-NEXT: s_or_b64 s[10:11], vcc, s[10:11] ; GCN-IR-NEXT: v_mov_b32_e32 v11, v5 -; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[10:11] ; GCN-IR-NEXT: s_cbranch_execnz BB13_3 ; GCN-IR-NEXT: ; %bb.4: ; %Flow -; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9] -; GCN-IR-NEXT: BB13_5: ; %Flow3 ; GCN-IR-NEXT: s_or_b64 exec, exec, s[10:11] +; GCN-IR-NEXT: BB13_5: ; %Flow3 +; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9] ; GCN-IR-NEXT: v_lshl_b64 v[3:4], v[3:4], 1 ; GCN-IR-NEXT: v_or_b32_e32 v6, v6, v4 ; GCN-IR-NEXT: v_or_b32_e32 v5, v5, v3 Index: llvm/test/CodeGen/AMDGPU/srem64.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/srem64.ll +++ llvm/test/CodeGen/AMDGPU/srem64.ll @@ -408,10 +408,9 @@ ; GCN-IR-NEXT: v_cmp_ge_u64_e32 vcc, v[9:10], v[7:8] ; GCN-IR-NEXT: v_mov_b32_e32 v10, 0 ; GCN-IR-NEXT: v_lshl_b64 v[7:8], v[0:1], v3 -; GCN-IR-NEXT: s_mov_b64 s[8:9], 0 ; GCN-IR-NEXT: v_mov_b32_e32 v11, 0 ; GCN-IR-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-IR-NEXT: s_xor_b64 s[10:11], exec, s[4:5] +; GCN-IR-NEXT: s_xor_b64 s[8:9], exec, s[4:5] ; GCN-IR-NEXT: s_cbranch_execz BB1_5 ; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader ; GCN-IR-NEXT: v_add_i32_e32 v3, vcc, -1, v5 @@ -422,6 +421,7 @@ ; GCN-IR-NEXT: v_add_i32_e32 v12, vcc, v10, v14 ; GCN-IR-NEXT: v_addc_u32_e32 v13, vcc, v11, v15, vcc ; GCN-IR-NEXT: v_mov_b32_e32 v14, 0 +; GCN-IR-NEXT: s_mov_b64 s[10:11], 0 ; GCN-IR-NEXT: v_mov_b32_e32 v15, 0 ; GCN-IR-NEXT: BB1_3: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -446,14 +446,14 @@ ; GCN-IR-NEXT: v_mov_b32_e32 v13, v15 ; GCN-IR-NEXT: v_mov_b32_e32 v15, v11 ; GCN-IR-NEXT: v_subb_u32_e64 v17, s[4:5], v17, v18, s[4:5] -; GCN-IR-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GCN-IR-NEXT: s_or_b64 s[10:11], vcc, s[10:11] ; GCN-IR-NEXT: v_mov_b32_e32 v14, v10 -; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[10:11] ; GCN-IR-NEXT: s_cbranch_execnz BB1_3 ; GCN-IR-NEXT: ; %bb.4: ; %Flow -; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9] -; GCN-IR-NEXT: BB1_5: ; %Flow3 ; GCN-IR-NEXT: s_or_b64 exec, exec, s[10:11] +; GCN-IR-NEXT: BB1_5: ; %Flow3 +; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9] ; GCN-IR-NEXT: v_lshl_b64 v[7:8], v[7:8], 1 ; GCN-IR-NEXT: v_or_b32_e32 v3, v11, v8 ; GCN-IR-NEXT: v_or_b32_e32 v9, v10, v7 @@ -1671,67 +1671,67 @@ ; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 63, v[3:4] ; GCN-IR-NEXT: v_cndmask_b32_e64 v2, 24, 0, s[4:5] ; GCN-IR-NEXT: s_xor_b64 s[4:5], s[4:5], -1 -; GCN-IR-NEXT: v_mov_b32_e32 v5, v9 +; GCN-IR-NEXT: v_mov_b32_e32 v6, v9 ; GCN-IR-NEXT: s_and_b64 s[4:5], s[4:5], vcc ; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] ; GCN-IR-NEXT: s_cbranch_execz BB11_6 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 -; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, 1, v3 -; GCN-IR-NEXT: v_addc_u32_e32 v7, vcc, 0, v4, vcc +; GCN-IR-NEXT: v_add_i32_e32 v10, vcc, 1, v3 +; GCN-IR-NEXT: v_addc_u32_e32 v11, vcc, 0, v4, vcc ; GCN-IR-NEXT: v_sub_i32_e64 v2, s[4:5], 63, v3 -; GCN-IR-NEXT: v_cmp_ge_u64_e32 vcc, v[6:7], v[3:4] -; GCN-IR-NEXT: v_mov_b32_e32 v4, 0 -; GCN-IR-NEXT: v_lshl_b64 v[2:3], 24, v2 -; GCN-IR-NEXT: s_mov_b64 s[8:9], 0 +; GCN-IR-NEXT: v_cmp_ge_u64_e32 vcc, v[10:11], v[3:4] ; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 +; GCN-IR-NEXT: v_lshl_b64 v[2:3], 24, v2 +; GCN-IR-NEXT: v_mov_b32_e32 v6, 0 ; GCN-IR-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-IR-NEXT: s_xor_b64 s[10:11], exec, s[4:5] +; GCN-IR-NEXT: s_xor_b64 s[8:9], exec, 
s[4:5] ; GCN-IR-NEXT: s_cbranch_execz BB11_5 ; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader -; GCN-IR-NEXT: v_lshr_b64 v[10:11], 24, v6 -; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, -1, v0 +; GCN-IR-NEXT: v_add_i32_e32 v4, vcc, -1, v0 ; GCN-IR-NEXT: v_addc_u32_e32 v7, vcc, -1, v1, vcc ; GCN-IR-NEXT: v_sub_i32_e32 v8, vcc, 58, v8 ; GCN-IR-NEXT: v_mov_b32_e32 v12, 0 +; GCN-IR-NEXT: v_lshr_b64 v[10:11], 24, v10 ; GCN-IR-NEXT: v_subb_u32_e32 v9, vcc, 0, v9, vcc +; GCN-IR-NEXT: s_mov_b64 s[10:11], 0 ; GCN-IR-NEXT: v_mov_b32_e32 v13, 0 ; GCN-IR-NEXT: BB11_3: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN-IR-NEXT: v_lshl_b64 v[10:11], v[10:11], 1 -; GCN-IR-NEXT: v_lshrrev_b32_e32 v4, 31, v3 -; GCN-IR-NEXT: v_or_b32_e32 v10, v10, v4 +; GCN-IR-NEXT: v_lshrrev_b32_e32 v5, 31, v3 +; GCN-IR-NEXT: v_or_b32_e32 v10, v10, v5 ; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[2:3], 1 -; GCN-IR-NEXT: v_sub_i32_e32 v4, vcc, v6, v10 -; GCN-IR-NEXT: v_subb_u32_e32 v4, vcc, v7, v11, vcc +; GCN-IR-NEXT: v_sub_i32_e32 v5, vcc, v4, v10 +; GCN-IR-NEXT: v_subb_u32_e32 v5, vcc, v7, v11, vcc ; GCN-IR-NEXT: v_or_b32_e32 v2, v12, v2 -; GCN-IR-NEXT: v_ashrrev_i32_e32 v12, 31, v4 +; GCN-IR-NEXT: v_ashrrev_i32_e32 v12, 31, v5 ; GCN-IR-NEXT: v_and_b32_e32 v15, v12, v0 -; GCN-IR-NEXT: v_and_b32_e32 v4, 1, v12 +; GCN-IR-NEXT: v_and_b32_e32 v5, 1, v12 ; GCN-IR-NEXT: v_and_b32_e32 v14, v12, v1 ; GCN-IR-NEXT: v_add_i32_e32 v12, vcc, 1, v8 ; GCN-IR-NEXT: v_or_b32_e32 v3, v13, v3 ; GCN-IR-NEXT: v_addc_u32_e32 v13, vcc, 0, v9, vcc ; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, v[12:13], v[8:9] ; GCN-IR-NEXT: v_mov_b32_e32 v8, v12 -; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v6, 0 ; GCN-IR-NEXT: v_sub_i32_e64 v10, s[4:5], v10, v15 ; GCN-IR-NEXT: v_mov_b32_e32 v9, v13 -; GCN-IR-NEXT: v_mov_b32_e32 v13, v5 +; GCN-IR-NEXT: v_mov_b32_e32 v13, v6 ; GCN-IR-NEXT: v_subb_u32_e64 v11, s[4:5], v11, v14, s[4:5] -; GCN-IR-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GCN-IR-NEXT: v_mov_b32_e32 v12, v4 -; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GCN-IR-NEXT: s_or_b64 s[10:11], vcc, s[10:11] +; GCN-IR-NEXT: v_mov_b32_e32 v12, v5 +; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[10:11] ; GCN-IR-NEXT: s_cbranch_execnz BB11_3 ; GCN-IR-NEXT: ; %bb.4: ; %Flow -; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9] -; GCN-IR-NEXT: BB11_5: ; %Flow3 ; GCN-IR-NEXT: s_or_b64 exec, exec, s[10:11] +; GCN-IR-NEXT: BB11_5: ; %Flow3 +; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9] ; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[2:3], 1 -; GCN-IR-NEXT: v_or_b32_e32 v5, v5, v3 -; GCN-IR-NEXT: v_or_b32_e32 v2, v4, v2 +; GCN-IR-NEXT: v_or_b32_e32 v6, v6, v3 +; GCN-IR-NEXT: v_or_b32_e32 v2, v5, v2 ; GCN-IR-NEXT: BB11_6: ; %Flow4 ; GCN-IR-NEXT: s_or_b64 exec, exec, s[6:7] -; GCN-IR-NEXT: v_mul_lo_u32 v3, v0, v5 +; GCN-IR-NEXT: v_mul_lo_u32 v3, v0, v6 ; GCN-IR-NEXT: v_mul_hi_u32 v4, v0, v2 ; GCN-IR-NEXT: v_mul_lo_u32 v1, v1, v2 ; GCN-IR-NEXT: v_mul_lo_u32 v0, v0, v2 @@ -1884,69 +1884,69 @@ ; GCN-IR-NEXT: v_cndmask_b32_e64 v4, v4, 0, s[4:5] ; GCN-IR-NEXT: s_xor_b64 s[4:5], s[4:5], -1 ; GCN-IR-NEXT: s_mov_b32 s9, 0 -; GCN-IR-NEXT: v_mov_b32_e32 v5, v9 +; GCN-IR-NEXT: v_mov_b32_e32 v6, v9 ; GCN-IR-NEXT: s_and_b64 s[4:5], s[4:5], vcc ; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] ; GCN-IR-NEXT: s_cbranch_execz BB12_6 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 -; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, 1, v2 -; GCN-IR-NEXT: v_addc_u32_e32 v7, vcc, 0, v3, vcc -; GCN-IR-NEXT: v_cmp_ge_u64_e32 vcc, v[6:7], v[2:3] +; GCN-IR-NEXT: v_add_i32_e32 v4, vcc, 1, v2 +; GCN-IR-NEXT: v_addc_u32_e32 v5, vcc, 0, v3, vcc +; 
GCN-IR-NEXT: v_cmp_ge_u64_e32 vcc, v[4:5], v[2:3] ; GCN-IR-NEXT: v_sub_i32_e64 v2, s[4:5], 63, v2 -; GCN-IR-NEXT: v_lshl_b64 v[2:3], s[8:9], v2 -; GCN-IR-NEXT: v_mov_b32_e32 v4, 0 -; GCN-IR-NEXT: s_mov_b64 s[8:9], 0 ; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 +; GCN-IR-NEXT: v_lshl_b64 v[2:3], s[8:9], v2 +; GCN-IR-NEXT: v_mov_b32_e32 v6, 0 ; GCN-IR-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-IR-NEXT: s_xor_b64 s[10:11], exec, s[4:5] +; GCN-IR-NEXT: s_xor_b64 s[8:9], exec, s[4:5] ; GCN-IR-NEXT: s_cbranch_execz BB12_5 ; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader ; GCN-IR-NEXT: s_mov_b32 s5, 0 ; GCN-IR-NEXT: s_mov_b32 s4, 0x8000 -; GCN-IR-NEXT: v_lshr_b64 v[10:11], s[4:5], v6 -; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, -1, v0 +; GCN-IR-NEXT: v_lshr_b64 v[10:11], s[4:5], v4 +; GCN-IR-NEXT: v_add_i32_e32 v4, vcc, -1, v0 ; GCN-IR-NEXT: v_addc_u32_e32 v7, vcc, -1, v1, vcc ; GCN-IR-NEXT: v_sub_i32_e32 v8, vcc, 47, v8 ; GCN-IR-NEXT: v_mov_b32_e32 v12, 0 ; GCN-IR-NEXT: v_subb_u32_e32 v9, vcc, 0, v9, vcc +; GCN-IR-NEXT: s_mov_b64 s[10:11], 0 ; GCN-IR-NEXT: v_mov_b32_e32 v13, 0 ; GCN-IR-NEXT: BB12_3: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN-IR-NEXT: v_lshl_b64 v[10:11], v[10:11], 1 -; GCN-IR-NEXT: v_lshrrev_b32_e32 v4, 31, v3 -; GCN-IR-NEXT: v_or_b32_e32 v10, v10, v4 +; GCN-IR-NEXT: v_lshrrev_b32_e32 v5, 31, v3 +; GCN-IR-NEXT: v_or_b32_e32 v10, v10, v5 ; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[2:3], 1 -; GCN-IR-NEXT: v_sub_i32_e32 v4, vcc, v6, v10 -; GCN-IR-NEXT: v_subb_u32_e32 v4, vcc, v7, v11, vcc +; GCN-IR-NEXT: v_sub_i32_e32 v5, vcc, v4, v10 +; GCN-IR-NEXT: v_subb_u32_e32 v5, vcc, v7, v11, vcc ; GCN-IR-NEXT: v_or_b32_e32 v2, v12, v2 -; GCN-IR-NEXT: v_ashrrev_i32_e32 v12, 31, v4 +; GCN-IR-NEXT: v_ashrrev_i32_e32 v12, 31, v5 ; GCN-IR-NEXT: v_and_b32_e32 v15, v12, v0 -; GCN-IR-NEXT: v_and_b32_e32 v4, 1, v12 +; GCN-IR-NEXT: v_and_b32_e32 v5, 1, v12 ; GCN-IR-NEXT: v_and_b32_e32 v14, v12, v1 ; GCN-IR-NEXT: v_add_i32_e32 v12, vcc, 1, v8 ; GCN-IR-NEXT: v_or_b32_e32 v3, v13, v3 ; GCN-IR-NEXT: v_addc_u32_e32 v13, vcc, 0, v9, vcc ; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, v[12:13], v[8:9] ; GCN-IR-NEXT: v_mov_b32_e32 v8, v12 -; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v6, 0 ; GCN-IR-NEXT: v_sub_i32_e64 v10, s[4:5], v10, v15 ; GCN-IR-NEXT: v_mov_b32_e32 v9, v13 -; GCN-IR-NEXT: v_mov_b32_e32 v13, v5 +; GCN-IR-NEXT: v_mov_b32_e32 v13, v6 ; GCN-IR-NEXT: v_subb_u32_e64 v11, s[4:5], v11, v14, s[4:5] -; GCN-IR-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GCN-IR-NEXT: v_mov_b32_e32 v12, v4 -; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GCN-IR-NEXT: s_or_b64 s[10:11], vcc, s[10:11] +; GCN-IR-NEXT: v_mov_b32_e32 v12, v5 +; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[10:11] ; GCN-IR-NEXT: s_cbranch_execnz BB12_3 ; GCN-IR-NEXT: ; %bb.4: ; %Flow -; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9] -; GCN-IR-NEXT: BB12_5: ; %Flow3 ; GCN-IR-NEXT: s_or_b64 exec, exec, s[10:11] +; GCN-IR-NEXT: BB12_5: ; %Flow3 +; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9] ; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[2:3], 1 -; GCN-IR-NEXT: v_or_b32_e32 v5, v5, v3 -; GCN-IR-NEXT: v_or_b32_e32 v4, v4, v2 +; GCN-IR-NEXT: v_or_b32_e32 v6, v6, v3 +; GCN-IR-NEXT: v_or_b32_e32 v4, v5, v2 ; GCN-IR-NEXT: BB12_6: ; %Flow4 ; GCN-IR-NEXT: s_or_b64 exec, exec, s[6:7] -; GCN-IR-NEXT: v_mul_lo_u32 v2, v0, v5 +; GCN-IR-NEXT: v_mul_lo_u32 v2, v0, v6 ; GCN-IR-NEXT: v_mul_hi_u32 v3, v0, v4 ; GCN-IR-NEXT: v_mul_lo_u32 v1, v1, v4 ; GCN-IR-NEXT: v_mul_lo_u32 v0, v0, v4 @@ -2005,16 +2005,16 @@ ; GCN-IR-NEXT: v_sub_i32_e64 v4, s[4:5], 63, v4 ; GCN-IR-NEXT: 
v_mov_b32_e32 v6, 0 ; GCN-IR-NEXT: v_lshl_b64 v[4:5], v[0:1], v4 -; GCN-IR-NEXT: s_mov_b64 s[8:9], 0 ; GCN-IR-NEXT: v_mov_b32_e32 v7, 0 ; GCN-IR-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-IR-NEXT: s_xor_b64 s[10:11], exec, s[4:5] +; GCN-IR-NEXT: s_xor_b64 s[8:9], exec, s[4:5] ; GCN-IR-NEXT: s_cbranch_execz BB13_5 ; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader ; GCN-IR-NEXT: v_mov_b32_e32 v12, 0 ; GCN-IR-NEXT: v_lshr_b64 v[10:11], v[0:1], v9 ; GCN-IR-NEXT: v_add_i32_e32 v8, vcc, 0xffffffcf, v8 ; GCN-IR-NEXT: v_addc_u32_e64 v9, s[4:5], 0, -1, vcc +; GCN-IR-NEXT: s_mov_b64 s[10:11], 0 ; GCN-IR-NEXT: v_mov_b32_e32 v13, 0 ; GCN-IR-NEXT: s_movk_i32 s12, 0x7fff ; GCN-IR-NEXT: BB13_3: ; %udiv-do-while @@ -2040,14 +2040,14 @@ ; GCN-IR-NEXT: v_mov_b32_e32 v14, 0 ; GCN-IR-NEXT: v_sub_i32_e64 v10, s[4:5], v10, v15 ; GCN-IR-NEXT: v_subb_u32_e64 v11, s[4:5], v11, v14, s[4:5] -; GCN-IR-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GCN-IR-NEXT: s_or_b64 s[10:11], vcc, s[10:11] ; GCN-IR-NEXT: v_mov_b32_e32 v12, v6 -; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[10:11] ; GCN-IR-NEXT: s_cbranch_execnz BB13_3 ; GCN-IR-NEXT: ; %bb.4: ; %Flow -; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9] -; GCN-IR-NEXT: BB13_5: ; %Flow3 ; GCN-IR-NEXT: s_or_b64 exec, exec, s[10:11] +; GCN-IR-NEXT: BB13_5: ; %Flow3 +; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9] ; GCN-IR-NEXT: v_lshl_b64 v[4:5], v[4:5], 1 ; GCN-IR-NEXT: v_or_b32_e32 v7, v7, v5 ; GCN-IR-NEXT: v_or_b32_e32 v6, v6, v4 Index: llvm/test/CodeGen/AMDGPU/udiv64.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/udiv64.ll +++ llvm/test/CodeGen/AMDGPU/udiv64.ll @@ -374,10 +374,9 @@ ; GCN-IR-NEXT: v_cmp_ge_u64_e32 vcc, v[12:13], v[6:7] ; GCN-IR-NEXT: v_mov_b32_e32 v6, 0 ; GCN-IR-NEXT: v_lshl_b64 v[4:5], v[0:1], v4 -; GCN-IR-NEXT: s_mov_b64 s[8:9], 0 ; GCN-IR-NEXT: v_mov_b32_e32 v7, 0 ; GCN-IR-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-IR-NEXT: s_xor_b64 s[10:11], exec, s[4:5] +; GCN-IR-NEXT: s_xor_b64 s[8:9], exec, s[4:5] ; GCN-IR-NEXT: s_cbranch_execz BB1_5 ; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader ; GCN-IR-NEXT: v_lshr_b64 v[12:13], v[0:1], v12 @@ -388,6 +387,7 @@ ; GCN-IR-NEXT: v_add_i32_e32 v8, vcc, v6, v10 ; GCN-IR-NEXT: v_addc_u32_e32 v9, vcc, v7, v11, vcc ; GCN-IR-NEXT: v_mov_b32_e32 v10, 0 +; GCN-IR-NEXT: s_mov_b64 s[10:11], 0 ; GCN-IR-NEXT: v_mov_b32_e32 v11, 0 ; GCN-IR-NEXT: BB1_3: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -412,14 +412,14 @@ ; GCN-IR-NEXT: v_mov_b32_e32 v9, v11 ; GCN-IR-NEXT: v_mov_b32_e32 v11, v7 ; GCN-IR-NEXT: v_subb_u32_e64 v13, s[4:5], v13, v14, s[4:5] -; GCN-IR-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GCN-IR-NEXT: s_or_b64 s[10:11], vcc, s[10:11] ; GCN-IR-NEXT: v_mov_b32_e32 v10, v6 -; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[10:11] ; GCN-IR-NEXT: s_cbranch_execnz BB1_3 ; GCN-IR-NEXT: ; %bb.4: ; %Flow -; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9] -; GCN-IR-NEXT: BB1_5: ; %Flow3 ; GCN-IR-NEXT: s_or_b64 exec, exec, s[10:11] +; GCN-IR-NEXT: BB1_5: ; %Flow3 +; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9] ; GCN-IR-NEXT: v_lshl_b64 v[0:1], v[4:5], 1 ; GCN-IR-NEXT: v_or_b32_e32 v4, v7, v1 ; GCN-IR-NEXT: v_or_b32_e32 v5, v6, v0 @@ -1262,61 +1262,61 @@ ; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] ; GCN-IR-NEXT: s_cbranch_execz BB9_6 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 -; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, 1, v4 -; GCN-IR-NEXT: v_addc_u32_e32 v7, vcc, 0, v5, vcc +; GCN-IR-NEXT: v_add_i32_e32 v10, vcc, 1, v4 +; 
GCN-IR-NEXT: v_addc_u32_e32 v11, vcc, 0, v5, vcc ; GCN-IR-NEXT: v_sub_i32_e64 v2, s[4:5], 63, v4 -; GCN-IR-NEXT: v_cmp_ge_u64_e32 vcc, v[6:7], v[4:5] -; GCN-IR-NEXT: v_lshl_b64 v[2:3], s[8:9], v2 -; GCN-IR-NEXT: v_mov_b32_e32 v4, 0 -; GCN-IR-NEXT: s_mov_b64 s[8:9], 0 +; GCN-IR-NEXT: v_cmp_ge_u64_e32 vcc, v[10:11], v[4:5] ; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 +; GCN-IR-NEXT: v_lshl_b64 v[2:3], s[8:9], v2 +; GCN-IR-NEXT: v_mov_b32_e32 v6, 0 ; GCN-IR-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-IR-NEXT: s_xor_b64 s[10:11], exec, s[4:5] +; GCN-IR-NEXT: s_xor_b64 s[8:9], exec, s[4:5] ; GCN-IR-NEXT: s_cbranch_execz BB9_5 ; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader +; GCN-IR-NEXT: v_add_i32_e32 v4, vcc, -1, v0 +; GCN-IR-NEXT: v_addc_u32_e32 v7, vcc, -1, v1, vcc ; GCN-IR-NEXT: s_mov_b32 s5, 0 ; GCN-IR-NEXT: s_mov_b32 s4, 0x8000 -; GCN-IR-NEXT: v_lshr_b64 v[10:11], s[4:5], v6 -; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, -1, v0 -; GCN-IR-NEXT: v_addc_u32_e32 v7, vcc, -1, v1, vcc ; GCN-IR-NEXT: v_sub_i32_e32 v8, vcc, 47, v8 ; GCN-IR-NEXT: v_mov_b32_e32 v12, 0 +; GCN-IR-NEXT: v_lshr_b64 v[10:11], s[4:5], v10 ; GCN-IR-NEXT: v_subb_u32_e32 v9, vcc, 0, v9, vcc +; GCN-IR-NEXT: s_mov_b64 s[10:11], 0 ; GCN-IR-NEXT: v_mov_b32_e32 v13, 0 ; GCN-IR-NEXT: BB9_3: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN-IR-NEXT: v_lshl_b64 v[10:11], v[10:11], 1 -; GCN-IR-NEXT: v_lshrrev_b32_e32 v4, 31, v3 -; GCN-IR-NEXT: v_or_b32_e32 v10, v10, v4 +; GCN-IR-NEXT: v_lshrrev_b32_e32 v5, 31, v3 +; GCN-IR-NEXT: v_or_b32_e32 v10, v10, v5 ; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[2:3], 1 -; GCN-IR-NEXT: v_sub_i32_e32 v4, vcc, v6, v10 -; GCN-IR-NEXT: v_subb_u32_e32 v4, vcc, v7, v11, vcc +; GCN-IR-NEXT: v_sub_i32_e32 v5, vcc, v4, v10 +; GCN-IR-NEXT: v_subb_u32_e32 v5, vcc, v7, v11, vcc ; GCN-IR-NEXT: v_or_b32_e32 v2, v12, v2 -; GCN-IR-NEXT: v_ashrrev_i32_e32 v12, 31, v4 +; GCN-IR-NEXT: v_ashrrev_i32_e32 v12, 31, v5 ; GCN-IR-NEXT: v_and_b32_e32 v15, v12, v0 -; GCN-IR-NEXT: v_and_b32_e32 v4, 1, v12 +; GCN-IR-NEXT: v_and_b32_e32 v5, 1, v12 ; GCN-IR-NEXT: v_and_b32_e32 v14, v12, v1 ; GCN-IR-NEXT: v_add_i32_e32 v12, vcc, 1, v8 ; GCN-IR-NEXT: v_or_b32_e32 v3, v13, v3 ; GCN-IR-NEXT: v_addc_u32_e32 v13, vcc, 0, v9, vcc ; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, v[12:13], v[8:9] ; GCN-IR-NEXT: v_mov_b32_e32 v8, v12 -; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v6, 0 ; GCN-IR-NEXT: v_sub_i32_e64 v10, s[4:5], v10, v15 ; GCN-IR-NEXT: v_mov_b32_e32 v9, v13 -; GCN-IR-NEXT: v_mov_b32_e32 v13, v5 +; GCN-IR-NEXT: v_mov_b32_e32 v13, v6 ; GCN-IR-NEXT: v_subb_u32_e64 v11, s[4:5], v11, v14, s[4:5] -; GCN-IR-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GCN-IR-NEXT: v_mov_b32_e32 v12, v4 -; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GCN-IR-NEXT: s_or_b64 s[10:11], vcc, s[10:11] +; GCN-IR-NEXT: v_mov_b32_e32 v12, v5 +; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[10:11] ; GCN-IR-NEXT: s_cbranch_execnz BB9_3 ; GCN-IR-NEXT: ; %bb.4: ; %Flow -; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9] -; GCN-IR-NEXT: BB9_5: ; %Flow3 ; GCN-IR-NEXT: s_or_b64 exec, exec, s[10:11] +; GCN-IR-NEXT: BB9_5: ; %Flow3 +; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9] ; GCN-IR-NEXT: v_lshl_b64 v[0:1], v[2:3], 1 -; GCN-IR-NEXT: v_or_b32_e32 v3, v5, v1 -; GCN-IR-NEXT: v_or_b32_e32 v2, v4, v0 +; GCN-IR-NEXT: v_or_b32_e32 v3, v6, v1 +; GCN-IR-NEXT: v_or_b32_e32 v2, v5, v0 ; GCN-IR-NEXT: BB9_6: ; %Flow4 ; GCN-IR-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN-IR-NEXT: v_mov_b32_e32 v0, v2 @@ -1361,16 +1361,16 @@ ; GCN-IR-NEXT: v_cmp_ge_u64_e32 vcc, v[7:8], v[4:5] ; GCN-IR-NEXT: v_mov_b32_e32 
v4, 0 ; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[0:1], v2 -; GCN-IR-NEXT: s_mov_b64 s[8:9], 0 ; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 ; GCN-IR-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-IR-NEXT: s_xor_b64 s[10:11], exec, s[4:5] +; GCN-IR-NEXT: s_xor_b64 s[8:9], exec, s[4:5] ; GCN-IR-NEXT: s_cbranch_execz BB10_5 ; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader ; GCN-IR-NEXT: v_lshr_b64 v[7:8], v[0:1], v7 ; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, 0xffffffcf, v6 ; GCN-IR-NEXT: v_mov_b32_e32 v9, 0 ; GCN-IR-NEXT: v_addc_u32_e64 v1, s[4:5], 0, -1, vcc +; GCN-IR-NEXT: s_mov_b64 s[10:11], 0 ; GCN-IR-NEXT: v_mov_b32_e32 v10, 0 ; GCN-IR-NEXT: s_movk_i32 s12, 0x7fff ; GCN-IR-NEXT: BB10_3: ; %udiv-do-while @@ -1396,14 +1396,14 @@ ; GCN-IR-NEXT: v_mov_b32_e32 v6, 0 ; GCN-IR-NEXT: v_sub_i32_e64 v7, s[4:5], v7, v11 ; GCN-IR-NEXT: v_subb_u32_e64 v8, s[4:5], v8, v6, s[4:5] -; GCN-IR-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GCN-IR-NEXT: s_or_b64 s[10:11], vcc, s[10:11] ; GCN-IR-NEXT: v_mov_b32_e32 v9, v4 -; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[10:11] ; GCN-IR-NEXT: s_cbranch_execnz BB10_3 ; GCN-IR-NEXT: ; %bb.4: ; %Flow -; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9] -; GCN-IR-NEXT: BB10_5: ; %Flow3 ; GCN-IR-NEXT: s_or_b64 exec, exec, s[10:11] +; GCN-IR-NEXT: BB10_5: ; %Flow3 +; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9] ; GCN-IR-NEXT: v_lshl_b64 v[0:1], v[2:3], 1 ; GCN-IR-NEXT: v_or_b32_e32 v2, v5, v1 ; GCN-IR-NEXT: v_or_b32_e32 v3, v4, v0 @@ -1738,16 +1738,16 @@ ; GCN-IR-NEXT: v_cmp_ge_u64_e32 vcc, v[7:8], v[4:5] ; GCN-IR-NEXT: v_mov_b32_e32 v4, 0 ; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[0:1], v2 -; GCN-IR-NEXT: s_mov_b64 s[8:9], 0 ; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 ; GCN-IR-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-IR-NEXT: s_xor_b64 s[10:11], exec, s[4:5] +; GCN-IR-NEXT: s_xor_b64 s[8:9], exec, s[4:5] ; GCN-IR-NEXT: s_cbranch_execz BB12_5 ; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader ; GCN-IR-NEXT: v_lshr_b64 v[7:8], v[0:1], v7 ; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, 0xffffffc4, v6 ; GCN-IR-NEXT: v_mov_b32_e32 v9, 0 ; GCN-IR-NEXT: v_addc_u32_e64 v1, s[4:5], 0, -1, vcc +; GCN-IR-NEXT: s_mov_b64 s[10:11], 0 ; GCN-IR-NEXT: v_mov_b32_e32 v10, 0 ; GCN-IR-NEXT: BB12_3: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -1771,14 +1771,14 @@ ; GCN-IR-NEXT: v_mov_b32_e32 v1, v10 ; GCN-IR-NEXT: v_mov_b32_e32 v10, v5 ; GCN-IR-NEXT: v_subbrev_u32_e64 v8, s[4:5], 0, v8, s[4:5] -; GCN-IR-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GCN-IR-NEXT: s_or_b64 s[10:11], vcc, s[10:11] ; GCN-IR-NEXT: v_mov_b32_e32 v9, v4 -; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[10:11] ; GCN-IR-NEXT: s_cbranch_execnz BB12_3 ; GCN-IR-NEXT: ; %bb.4: ; %Flow -; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9] -; GCN-IR-NEXT: BB12_5: ; %Flow3 ; GCN-IR-NEXT: s_or_b64 exec, exec, s[10:11] +; GCN-IR-NEXT: BB12_5: ; %Flow3 +; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9] ; GCN-IR-NEXT: v_lshl_b64 v[0:1], v[2:3], 1 ; GCN-IR-NEXT: v_or_b32_e32 v2, v5, v1 ; GCN-IR-NEXT: v_or_b32_e32 v3, v4, v0 Index: llvm/test/CodeGen/AMDGPU/urem64.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/urem64.ll +++ llvm/test/CodeGen/AMDGPU/urem64.ll @@ -383,10 +383,9 @@ ; GCN-IR-NEXT: v_cmp_ge_u64_e32 vcc, v[7:8], v[5:6] ; GCN-IR-NEXT: v_mov_b32_e32 v8, 0 ; GCN-IR-NEXT: v_lshl_b64 v[4:5], v[0:1], v4 -; GCN-IR-NEXT: s_mov_b64 s[8:9], 0 ; GCN-IR-NEXT: v_mov_b32_e32 v9, 0 ; GCN-IR-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-IR-NEXT: s_xor_b64 s[10:11], exec, s[4:5] +; 
GCN-IR-NEXT: s_xor_b64 s[8:9], exec, s[4:5] ; GCN-IR-NEXT: s_cbranch_execz BB1_5 ; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader ; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, -1, v2 @@ -397,6 +396,7 @@ ; GCN-IR-NEXT: v_add_i32_e32 v10, vcc, v8, v12 ; GCN-IR-NEXT: v_addc_u32_e32 v11, vcc, v9, v13, vcc ; GCN-IR-NEXT: v_mov_b32_e32 v12, 0 +; GCN-IR-NEXT: s_mov_b64 s[10:11], 0 ; GCN-IR-NEXT: v_mov_b32_e32 v13, 0 ; GCN-IR-NEXT: BB1_3: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -421,14 +421,14 @@ ; GCN-IR-NEXT: v_mov_b32_e32 v11, v13 ; GCN-IR-NEXT: v_mov_b32_e32 v13, v9 ; GCN-IR-NEXT: v_subb_u32_e64 v15, s[4:5], v15, v16, s[4:5] -; GCN-IR-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GCN-IR-NEXT: s_or_b64 s[10:11], vcc, s[10:11] ; GCN-IR-NEXT: v_mov_b32_e32 v12, v8 -; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[10:11] ; GCN-IR-NEXT: s_cbranch_execnz BB1_3 ; GCN-IR-NEXT: ; %bb.4: ; %Flow -; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9] -; GCN-IR-NEXT: BB1_5: ; %Flow3 ; GCN-IR-NEXT: s_or_b64 exec, exec, s[10:11] +; GCN-IR-NEXT: BB1_5: ; %Flow3 +; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9] ; GCN-IR-NEXT: v_lshl_b64 v[4:5], v[4:5], 1 ; GCN-IR-NEXT: v_or_b32_e32 v7, v9, v5 ; GCN-IR-NEXT: v_or_b32_e32 v4, v8, v4 @@ -1277,69 +1277,69 @@ ; GCN-IR-NEXT: v_cndmask_b32_e64 v4, v4, 0, s[4:5] ; GCN-IR-NEXT: s_xor_b64 s[4:5], s[4:5], -1 ; GCN-IR-NEXT: s_mov_b32 s9, 0 -; GCN-IR-NEXT: v_mov_b32_e32 v5, v9 +; GCN-IR-NEXT: v_mov_b32_e32 v6, v9 ; GCN-IR-NEXT: s_and_b64 s[4:5], s[4:5], vcc ; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] ; GCN-IR-NEXT: s_cbranch_execz BB8_6 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 -; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, 1, v2 -; GCN-IR-NEXT: v_addc_u32_e32 v7, vcc, 0, v3, vcc -; GCN-IR-NEXT: v_cmp_ge_u64_e32 vcc, v[6:7], v[2:3] +; GCN-IR-NEXT: v_add_i32_e32 v4, vcc, 1, v2 +; GCN-IR-NEXT: v_addc_u32_e32 v5, vcc, 0, v3, vcc +; GCN-IR-NEXT: v_cmp_ge_u64_e32 vcc, v[4:5], v[2:3] ; GCN-IR-NEXT: v_sub_i32_e64 v2, s[4:5], 63, v2 -; GCN-IR-NEXT: v_lshl_b64 v[2:3], s[8:9], v2 -; GCN-IR-NEXT: v_mov_b32_e32 v4, 0 -; GCN-IR-NEXT: s_mov_b64 s[8:9], 0 ; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 +; GCN-IR-NEXT: v_lshl_b64 v[2:3], s[8:9], v2 +; GCN-IR-NEXT: v_mov_b32_e32 v6, 0 ; GCN-IR-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-IR-NEXT: s_xor_b64 s[10:11], exec, s[4:5] +; GCN-IR-NEXT: s_xor_b64 s[8:9], exec, s[4:5] ; GCN-IR-NEXT: s_cbranch_execz BB8_5 ; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader ; GCN-IR-NEXT: s_mov_b32 s5, 0 ; GCN-IR-NEXT: s_mov_b32 s4, 0x8000 -; GCN-IR-NEXT: v_lshr_b64 v[10:11], s[4:5], v6 -; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, -1, v0 +; GCN-IR-NEXT: v_lshr_b64 v[10:11], s[4:5], v4 +; GCN-IR-NEXT: v_add_i32_e32 v4, vcc, -1, v0 ; GCN-IR-NEXT: v_addc_u32_e32 v7, vcc, -1, v1, vcc ; GCN-IR-NEXT: v_sub_i32_e32 v8, vcc, 47, v8 ; GCN-IR-NEXT: v_mov_b32_e32 v12, 0 ; GCN-IR-NEXT: v_subb_u32_e32 v9, vcc, 0, v9, vcc +; GCN-IR-NEXT: s_mov_b64 s[10:11], 0 ; GCN-IR-NEXT: v_mov_b32_e32 v13, 0 ; GCN-IR-NEXT: BB8_3: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN-IR-NEXT: v_lshl_b64 v[10:11], v[10:11], 1 -; GCN-IR-NEXT: v_lshrrev_b32_e32 v4, 31, v3 -; GCN-IR-NEXT: v_or_b32_e32 v10, v10, v4 +; GCN-IR-NEXT: v_lshrrev_b32_e32 v5, 31, v3 +; GCN-IR-NEXT: v_or_b32_e32 v10, v10, v5 ; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[2:3], 1 -; GCN-IR-NEXT: v_sub_i32_e32 v4, vcc, v6, v10 -; GCN-IR-NEXT: v_subb_u32_e32 v4, vcc, v7, v11, vcc +; GCN-IR-NEXT: v_sub_i32_e32 v5, vcc, v4, v10 +; GCN-IR-NEXT: v_subb_u32_e32 v5, vcc, v7, v11, vcc ; GCN-IR-NEXT: v_or_b32_e32 v2, 
v12, v2 -; GCN-IR-NEXT: v_ashrrev_i32_e32 v12, 31, v4 +; GCN-IR-NEXT: v_ashrrev_i32_e32 v12, 31, v5 ; GCN-IR-NEXT: v_and_b32_e32 v15, v12, v0 -; GCN-IR-NEXT: v_and_b32_e32 v4, 1, v12 +; GCN-IR-NEXT: v_and_b32_e32 v5, 1, v12 ; GCN-IR-NEXT: v_and_b32_e32 v14, v12, v1 ; GCN-IR-NEXT: v_add_i32_e32 v12, vcc, 1, v8 ; GCN-IR-NEXT: v_or_b32_e32 v3, v13, v3 ; GCN-IR-NEXT: v_addc_u32_e32 v13, vcc, 0, v9, vcc ; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, v[12:13], v[8:9] ; GCN-IR-NEXT: v_mov_b32_e32 v8, v12 -; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v6, 0 ; GCN-IR-NEXT: v_sub_i32_e64 v10, s[4:5], v10, v15 ; GCN-IR-NEXT: v_mov_b32_e32 v9, v13 -; GCN-IR-NEXT: v_mov_b32_e32 v13, v5 +; GCN-IR-NEXT: v_mov_b32_e32 v13, v6 ; GCN-IR-NEXT: v_subb_u32_e64 v11, s[4:5], v11, v14, s[4:5] -; GCN-IR-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GCN-IR-NEXT: v_mov_b32_e32 v12, v4 -; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GCN-IR-NEXT: s_or_b64 s[10:11], vcc, s[10:11] +; GCN-IR-NEXT: v_mov_b32_e32 v12, v5 +; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[10:11] ; GCN-IR-NEXT: s_cbranch_execnz BB8_3 ; GCN-IR-NEXT: ; %bb.4: ; %Flow -; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9] -; GCN-IR-NEXT: BB8_5: ; %Flow3 ; GCN-IR-NEXT: s_or_b64 exec, exec, s[10:11] +; GCN-IR-NEXT: BB8_5: ; %Flow3 +; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9] ; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[2:3], 1 -; GCN-IR-NEXT: v_or_b32_e32 v5, v5, v3 -; GCN-IR-NEXT: v_or_b32_e32 v4, v4, v2 +; GCN-IR-NEXT: v_or_b32_e32 v6, v6, v3 +; GCN-IR-NEXT: v_or_b32_e32 v4, v5, v2 ; GCN-IR-NEXT: BB8_6: ; %Flow4 ; GCN-IR-NEXT: s_or_b64 exec, exec, s[6:7] -; GCN-IR-NEXT: v_mul_lo_u32 v2, v0, v5 +; GCN-IR-NEXT: v_mul_lo_u32 v2, v0, v6 ; GCN-IR-NEXT: v_mul_hi_u32 v3, v0, v4 ; GCN-IR-NEXT: v_mul_lo_u32 v1, v1, v4 ; GCN-IR-NEXT: v_mul_lo_u32 v0, v0, v4 @@ -1387,16 +1387,16 @@ ; GCN-IR-NEXT: v_sub_i32_e64 v2, s[4:5], 63, v2 ; GCN-IR-NEXT: v_mov_b32_e32 v4, 0 ; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[0:1], v2 -; GCN-IR-NEXT: s_mov_b64 s[8:9], 0 ; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 ; GCN-IR-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-IR-NEXT: s_xor_b64 s[10:11], exec, s[4:5] +; GCN-IR-NEXT: s_xor_b64 s[8:9], exec, s[4:5] ; GCN-IR-NEXT: s_cbranch_execz BB9_5 ; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader ; GCN-IR-NEXT: v_mov_b32_e32 v10, 0 ; GCN-IR-NEXT: v_lshr_b64 v[8:9], v[0:1], v7 ; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, 0xffffffcf, v6 ; GCN-IR-NEXT: v_addc_u32_e64 v7, s[4:5], 0, -1, vcc +; GCN-IR-NEXT: s_mov_b64 s[10:11], 0 ; GCN-IR-NEXT: v_mov_b32_e32 v11, 0 ; GCN-IR-NEXT: s_movk_i32 s12, 0x7fff ; GCN-IR-NEXT: BB9_3: ; %udiv-do-while @@ -1422,14 +1422,14 @@ ; GCN-IR-NEXT: v_mov_b32_e32 v12, 0 ; GCN-IR-NEXT: v_sub_i32_e64 v8, s[4:5], v8, v13 ; GCN-IR-NEXT: v_subb_u32_e64 v9, s[4:5], v9, v12, s[4:5] -; GCN-IR-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GCN-IR-NEXT: s_or_b64 s[10:11], vcc, s[10:11] ; GCN-IR-NEXT: v_mov_b32_e32 v10, v4 -; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[10:11] ; GCN-IR-NEXT: s_cbranch_execnz BB9_3 ; GCN-IR-NEXT: ; %bb.4: ; %Flow -; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9] -; GCN-IR-NEXT: BB9_5: ; %Flow3 ; GCN-IR-NEXT: s_or_b64 exec, exec, s[10:11] +; GCN-IR-NEXT: BB9_5: ; %Flow3 +; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9] ; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[2:3], 1 ; GCN-IR-NEXT: v_or_b32_e32 v5, v5, v3 ; GCN-IR-NEXT: v_or_b32_e32 v4, v4, v2 Index: llvm/test/CodeGen/AMDGPU/use-sgpr-multiple-times.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/use-sgpr-multiple-times.ll +++ 
llvm/test/CodeGen/AMDGPU/use-sgpr-multiple-times.ll @@ -247,7 +247,6 @@ ret void } -; FIXME: Immediate in SGPRs just copied to VGPRs ; GCN-LABEL: {{^}}test_s0_s1_k_f64: ; GCN-DAG: s_load_dwordx2 [[SGPR0:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, {{0x13|0x4c}} ; GCN-DAG: s_load_dwordx2 s{{\[}}[[SGPR1_SUB0:[0-9]+]]:[[SGPR1_SUB1:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0x1d|0x74}} @@ -256,11 +255,13 @@ ; GCN-DAG: v_mov_b32_e32 v[[VS1_SUB0:[0-9]+]], s[[SGPR1_SUB0]] ; GCN-DAG: v_mov_b32_e32 v[[VS1_SUB1:[0-9]+]], s[[SGPR1_SUB1]] -; GCN: v_fma_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[SGPR0]], v{{\[}}[[VS1_SUB0]]:[[VS1_SUB1]]{{\]}}, v{{\[}}[[VZERO]]:[[VK0_SUB1]]{{\]}} +; GCN-DAG: v_fma_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[SGPR0]], v{{\[}}[[VS1_SUB0]]:[[VS1_SUB1]]{{\]}}, v{{\[}}[[VZERO]]:[[VK0_SUB1]]{{\]}} -; Same zero component is re-used for half of each immediate. -; GCN: v_mov_b32_e32 v[[VK1_SUB1:[0-9]+]], 0x40b00000 -; GCN: v_fma_f64 [[RESULT1:v\[[0-9]+:[0-9]+\]]], [[SGPR0]], v{{\[}}[[VS1_SUB0]]:[[VS1_SUB1]]{{\]}}, v{{\[}}[[VZERO]]:[[VK1_SUB1]]{{\]}} +; FIXME: The same zero component could be re-used for half of each immediate, +; but we would need to split the 64-bit move before the coalescer. + +; GCN-DAG: v_mov_b32_e32 v[[VK1_SUB1:[0-9]+]], 0x40b00000 +; GCN: v_fma_f64 [[RESULT1:v\[[0-9]+:[0-9]+\]]], [[SGPR0]], v{{\[}}[[VS1_SUB0]]:[[VS1_SUB1]]{{\]}}, v{{\[[0-9]+}}:[[VK1_SUB1]]{{\]}} ; GCN: buffer_store_dwordx2 [[RESULT0]] ; GCN: buffer_store_dwordx2 [[RESULT1]]
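For reference, a minimal IR sketch of the situation the FIXME above describes (a hypothetical kernel, not one of the tests touched by this patch). Both f64 immediates below have an all-zero low half, so a single v_mov_b32 of 0 could in principle feed the low component of both materialized constants; that sharing is only possible if the 64-bit move is split into halves before register coalescing rather than after:

; Hypothetical example: 10.0 (0x4024000000000000) and 4096.0
; (0x40B0000000000000) are not inline constants, and both have zero in the
; low 32 bits, so their VGPR low halves could share one v_mov_b32 of 0 if
; the wide moves were split early.
define amdgpu_kernel void @two_wide_f64_imms(double addrspace(1)* %out,
                                             double %a, double %b) {
  %fma0 = call double @llvm.fma.f64(double %a, double %b, double 10.0)
  %fma1 = call double @llvm.fma.f64(double %a, double %b, double 4096.0)
  store volatile double %fma0, double addrspace(1)* %out
  store volatile double %fma1, double addrspace(1)* %out
  ret void
}
declare double @llvm.fma.f64(double, double, double)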