Index: llvm/include/llvm/CodeGen/GlobalISel/RegBankSelect.h
===================================================================
--- llvm/include/llvm/CodeGen/GlobalISel/RegBankSelect.h
+++ llvm/include/llvm/CodeGen/GlobalISel/RegBankSelect.h
@@ -321,6 +321,8 @@
     Insert,
     /// (Re)assign the register bank of the operand.
     Reassign,
+    /// Rematerialize the instruction with the appropriate register bank.
+    Rematerialize,
     /// Mark this repairing placement as impossible.
     Impossible
   };
@@ -566,6 +568,17 @@
       const iterator_range<SmallVectorImpl<Register>::const_iterator> &NewVRegs);
 
+  /// Insert repairing code for \p MO as specified by \p ValMapping.
+  /// The repairing placement is specified by \p RepairPt.
+  /// Can be used to rematerialize an instruction with a different regbank
+  /// instead of inserting a copy. \p OpdMapper provides a new VReg for the def
+  /// of the rematerialized instruction if the original instruction has other
+  /// uses; otherwise the regbank of the original instruction is reassigned.
+  bool rematerializeReg(MachineOperand &MO,
+                        const RegisterBankInfo::ValueMapping &ValMapping,
+                        RegBankSelect::RepairingPlacement &RepairPt,
+                        RegisterBankInfo::OperandsMapper &OpdMapper);
+
   /// Return the cost of the instruction needed to map \p MO to \p ValMapping.
   /// The cost is free of basic block frequencies.
   /// \pre MO.isReg()
Index: llvm/lib/CodeGen/GlobalISel/RegBankSelect.cpp
===================================================================
--- llvm/lib/CodeGen/GlobalISel/RegBankSelect.cpp
+++ llvm/lib/CodeGen/GlobalISel/RegBankSelect.cpp
@@ -234,6 +234,33 @@
   return true;
 }
 
+bool RegBankSelect::rematerializeReg(
+    MachineOperand &MO, const RegisterBankInfo::ValueMapping &ValMapping,
+    RegBankSelect::RepairingPlacement &RepairPt,
+    RegisterBankInfo::OperandsMapper &OpdMapper) {
+  Register Reg = MO.getReg();
+  MachineInstr *OldMI = MRI->getVRegDef(Reg);
+
+  if (MRI->hasOneNonDBGUse(Reg)) {
+    // This is the only use; reassign the def's register bank in place.
+    MRI->setRegBank(Reg, *ValMapping.BreakDown[0].RegBank);
+    return true;
+  }
+
+  const unsigned OpIdx = RepairPt.getOpIdx();
+  OpdMapper.createVRegs(OpIdx);
+
+  // Clone the instruction, but define the new VReg with the mapped regbank.
+  MachineInstr *NewMI = OldMI->getMF()->CloneMachineInstr(OldMI);
+  NewMI->getOperand(0).setReg(*OpdMapper.getVRegs(OpIdx).begin());
+  LLVM_DEBUG(dbgs() << "Clone with new regbank: " << *NewMI << '\n');
+
+  assert(RepairPt.getNumInsertPoints() == 1 && "expected one insert point");
+  const auto InsertPt = RepairPt.begin();
+  (*InsertPt)->insert(*NewMI);
+  return true;
+}
+
 uint64_t RegBankSelect::getRepairCost(
     const MachineOperand &MO,
     const RegisterBankInfo::ValueMapping &ValMapping) const {
@@ -493,9 +520,21 @@
       continue;
     }
 
-    // Find the insertion point for the repairing code.
-    RepairPts.emplace_back(
-        RepairingPlacement(MI, OpIdx, *TRI, *this, RepairingPlacement::Insert));
+    Register OpReg = MO.getReg();
+    MachineInstr *DefMI = MRI.getVRegDef(OpReg);
+    if ((DefMI->getOpcode() == TargetOpcode::G_CONSTANT ||
+         DefMI->getOpcode() == TargetOpcode::G_FCONSTANT) &&
+        !MO.isDef() && MRI.getType(OpReg).getSizeInBits() <= 32) {
+      // Constants can be rematerialized instead of copied across banks.
+      RepairPts.emplace_back(RepairingPlacement(
+          MI, OpIdx, *TRI, *this, RepairingPlacement::Rematerialize));
+    } else {
+      // Find the insertion point for the repairing code.
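+      // Repairing for this placement will later insert cross-bank copies
+      // (see repairReg).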
+      RepairPts.emplace_back(RepairingPlacement(MI, OpIdx, *TRI, *this,
+                                                RepairingPlacement::Insert));
+    }
     RepairingPlacement &RepairPt = RepairPts.back();
 
     // If we need to split a basic block to materialize this insertion point,
@@ -616,6 +653,14 @@
       if (!repairReg(MO, ValMapping, RepairPt, OpdMapper.getVRegs(OpIdx)))
         return false;
       break;
+    case RepairingPlacement::Rematerialize: {
+      // Don't insert an additional instruction for debug instructions.
+      if (MI.isDebugInstr())
+        break;
+      if (!rematerializeReg(MO, ValMapping, RepairPt, OpdMapper))
+        return false;
+      break;
+    }
     default:
       llvm_unreachable("Other kind should not happen");
     }
@@ -765,7 +810,7 @@
   const MachineOperand &MO = MI.getOperand(OpIdx);
   assert(MO.isReg() && "Trying to repair a non-reg operand");
 
-  if (Kind != RepairingKind::Insert)
+  if (Kind != RepairingKind::Insert && Kind != RepairingKind::Rematerialize)
     return;
 
   // Repairings for definitions happen after MI, uses happen before.
Index: llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -4311,9 +4311,8 @@
 
   int64_t Offset = 0;
 
-  // FIXME: Copy check is a hack
   Register BasePtr;
-  if (mi_match(Reg, *MRI, m_GPtrAdd(m_Reg(BasePtr), m_Copy(m_ICst(Offset))))) {
+  if (mi_match(Reg, *MRI, m_GPtrAdd(m_Reg(BasePtr), m_ICst(Offset)))) {
     if (!SIInstrInfo::isLegalMUBUFImmOffset(Offset))
       return {};
     const MachineInstr *BasePtrDef = MRI->getVRegDef(BasePtr);
Index: llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -1262,6 +1262,15 @@
   std::tie(Base, Offset) =
       AMDGPU::getBaseWithConstantOffset(*MRI, CombinedOffset);
 
+  // If Base is a pointer, convert it to an integer.
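+  // The offset-splitting arithmetic below operates on integers, so cast the
+  // pointer with G_PTRTOINT while keeping its original register bank.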
+ if (MRI->getType(Base).isPointer()) { + const RegisterBank *BaseBank = getRegBank(Base, *MRI, *TRI); + Base = B.buildPtrToInt(MRI->getType(CombinedOffset), Base).getReg(0); + MRI->setRegBank(Base, *BaseBank); + } + uint32_t SOffset, ImmOffset; if ((int)Offset > 0 && TII->splitMUBUFOffset(Offset, SOffset, ImmOffset, Alignment)) { Index: llvm/test/CodeGen/AMDGPU/GlobalISel/add.v2i16.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/add.v2i16.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/add.v2i16.ll @@ -169,9 +169,8 @@ ; GFX7-LABEL: v_add_v2i16_neg_inline_imm_splat: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_movk_i32 s4, 0xffc0 -; GFX7-NEXT: v_add_i32_e32 v0, vcc, s4, v0 -; GFX7-NEXT: v_add_i32_e32 v1, vcc, s4, v1 +; GFX7-NEXT: v_subrev_i32_e32 v0, vcc, 64, v0 +; GFX7-NEXT: v_subrev_i32_e32 v1, vcc, 64, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_add_v2i16_neg_inline_imm_splat: @@ -184,9 +183,9 @@ ; GFX8-LABEL: v_add_v2i16_neg_inline_imm_splat: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v2, 0xffffffc0 -; GFX8-NEXT: v_add_u16_e32 v1, 0xffc0, v0 -; GFX8-NEXT: v_add_u16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_mov_b32_e32 v2, 64 +; GFX8-NEXT: v_subrev_u16_e32 v1, 64, v0 +; GFX8-NEXT: v_sub_u16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -204,7 +203,7 @@ ; GFX7-LABEL: v_add_v2i16_neg_inline_imm_lo: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xffffffc0, v0 +; GFX7-NEXT: v_subrev_i32_e32 v0, vcc, 64, v0 ; GFX7-NEXT: v_add_i32_e32 v1, vcc, 4, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -219,7 +218,7 @@ ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v2, 4 -; GFX8-NEXT: v_add_u16_e32 v1, 0xffc0, v0 +; GFX8-NEXT: v_subrev_u16_e32 v1, 64, v0 ; GFX8-NEXT: v_add_u16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -239,7 +238,7 @@ ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v0, vcc, 4, v0 -; GFX7-NEXT: v_add_i32_e32 v1, vcc, 0xffffffc0, v1 +; GFX7-NEXT: v_subrev_i32_e32 v1, vcc, 64, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_add_v2i16_neg_inline_imm_hi: @@ -252,10 +251,10 @@ ; GFX8-LABEL: v_add_v2i16_neg_inline_imm_hi: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, 0xffffffc0 -; GFX8-NEXT: v_add_u16_e32 v2, 4, v0 -; GFX8-NEXT: v_add_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v0, v2, v0 +; GFX8-NEXT: v_mov_b32_e32 v2, 64 +; GFX8-NEXT: v_add_u16_e32 v1, 4, v0 +; GFX8-NEXT: v_sub_u16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_add_v2i16_neg_inline_imm_hi: Index: llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_optimizations_mul_one.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_optimizations_mul_one.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_optimizations_mul_one.ll 
@@ -36,14 +36,14 @@ ; GCN-LABEL: atomic_add: ; GCN: ; %bb.0: ; %.entry ; GCN-NEXT: s_mov_b64 s[4:5], exec -; GCN-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 -; GCN-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: v_mbcnt_lo_u32_b32_e64 v1, s4, 0 +; GCN-NEXT: v_mbcnt_hi_u32_b32_e32 v1, s5, v1 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 ; GCN-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GCN-NEXT: s_cbranch_execz .LBB0_2 ; GCN-NEXT: ; %bb.1: ; GCN-NEXT: s_bcnt1_i32_b64 s4, s[4:5] -; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: v_mov_b32_e32 v1, s4 ; GCN-NEXT: buffer_atomic_add v1, v0, s[0:3], 0 idxen ; GCN-NEXT: .LBB0_2: @@ -78,22 +78,22 @@ ; ; GCN-LABEL: atomic_add_and_format: ; GCN: ; %bb.0: ; %.entry -; GCN-NEXT: s_mov_b64 s[6:7], exec -; GCN-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 -; GCN-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s7, v0 +; GCN-NEXT: s_mov_b64 s[4:5], exec +; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 +; GCN-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GCN-NEXT: s_cbranch_execz .LBB1_2 ; GCN-NEXT: ; %bb.1: -; GCN-NEXT: s_bcnt1_i32_b64 s6, s[6:7] -; GCN-NEXT: v_mov_b32_e32 v1, s6 -; GCN-NEXT: v_mov_b32_e32 v2, 0 -; GCN-NEXT: buffer_atomic_add v1, v2, s[0:3], 0 idxen glc +; GCN-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; GCN-NEXT: v_mov_b32_e32 v2, s4 +; GCN-NEXT: buffer_atomic_add v2, v1, s[0:3], 0 idxen glc ; GCN-NEXT: .LBB1_2: -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_readfirstlane_b32 s4, v1 +; GCN-NEXT: v_readfirstlane_b32 s4, v2 ; GCN-NEXT: v_add_i32_e32 v4, vcc, s4, v0 ; GCN-NEXT: s_waitcnt expcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s0 @@ -130,14 +130,14 @@ ; GCN-LABEL: atomic_sub: ; GCN: ; %bb.0: ; %.entry ; GCN-NEXT: s_mov_b64 s[4:5], exec -; GCN-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 -; GCN-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: v_mbcnt_lo_u32_b32_e64 v1, s4, 0 +; GCN-NEXT: v_mbcnt_hi_u32_b32_e32 v1, s5, v1 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 ; GCN-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GCN-NEXT: s_cbranch_execz .LBB2_2 ; GCN-NEXT: ; %bb.1: ; GCN-NEXT: s_bcnt1_i32_b64 s4, s[4:5] -; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: v_mov_b32_e32 v1, s4 ; GCN-NEXT: buffer_atomic_sub v1, v0, s[0:3], 0 idxen ; GCN-NEXT: .LBB2_2: @@ -172,22 +172,22 @@ ; ; GCN-LABEL: atomic_sub_and_format: ; GCN: ; %bb.0: ; %.entry -; GCN-NEXT: s_mov_b64 s[6:7], exec -; GCN-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 -; GCN-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s7, v0 +; GCN-NEXT: s_mov_b64 s[4:5], exec +; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 +; GCN-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GCN-NEXT: s_cbranch_execz .LBB3_2 ; GCN-NEXT: ; %bb.1: -; GCN-NEXT: s_bcnt1_i32_b64 s6, s[6:7] -; GCN-NEXT: v_mov_b32_e32 v1, s6 -; GCN-NEXT: v_mov_b32_e32 v2, 0 -; GCN-NEXT: buffer_atomic_sub v1, v2, s[0:3], 0 idxen glc +; GCN-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; GCN-NEXT: v_mov_b32_e32 v2, s4 +; GCN-NEXT: buffer_atomic_sub v2, v1, s[0:3], 0 
idxen glc ; GCN-NEXT: .LBB3_2: -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_readfirstlane_b32 s4, v1 +; GCN-NEXT: v_readfirstlane_b32 s4, v2 ; GCN-NEXT: v_sub_i32_e32 v4, vcc, s4, v0 ; GCN-NEXT: s_waitcnt expcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s0 @@ -225,15 +225,15 @@ ; GCN-LABEL: atomic_xor: ; GCN: ; %bb.0: ; %.entry ; GCN-NEXT: s_mov_b64 s[4:5], exec -; GCN-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 -; GCN-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: v_mbcnt_lo_u32_b32_e64 v1, s4, 0 +; GCN-NEXT: v_mbcnt_hi_u32_b32_e32 v1, s5, v1 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 ; GCN-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GCN-NEXT: s_cbranch_execz .LBB4_2 ; GCN-NEXT: ; %bb.1: ; GCN-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GCN-NEXT: s_and_b32 s4, s4, 1 -; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: v_mov_b32_e32 v1, s4 ; GCN-NEXT: buffer_atomic_xor v1, v0, s[0:3], 0 idxen ; GCN-NEXT: .LBB4_2: @@ -271,22 +271,22 @@ ; GCN-LABEL: atomic_xor_and_format: ; GCN: ; %bb.0: ; %.entry ; GCN-NEXT: s_mov_b64 s[6:7], exec +; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 ; GCN-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s7, v0 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr2 ; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN-NEXT: s_cbranch_execz .LBB5_2 ; GCN-NEXT: ; %bb.1: ; GCN-NEXT: s_bcnt1_i32_b64 s6, s[6:7] ; GCN-NEXT: s_and_b32 s6, s6, 1 -; GCN-NEXT: v_mov_b32_e32 v1, s6 -; GCN-NEXT: v_mov_b32_e32 v2, 0 -; GCN-NEXT: buffer_atomic_xor v1, v2, s[0:3], 0 idxen glc +; GCN-NEXT: v_mov_b32_e32 v2, s6 +; GCN-NEXT: buffer_atomic_xor v2, v1, s[0:3], 0 idxen glc ; GCN-NEXT: .LBB5_2: ; GCN-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_readfirstlane_b32 s4, v1 +; GCN-NEXT: v_readfirstlane_b32 s4, v2 ; GCN-NEXT: v_and_b32_e32 v0, 1, v0 ; GCN-NEXT: v_xor_b32_e32 v4, s4, v0 ; GCN-NEXT: s_waitcnt expcnt(0) @@ -324,14 +324,14 @@ ; GCN-LABEL: atomic_ptr_add: ; GCN: ; %bb.0: ; %.entry ; GCN-NEXT: s_mov_b64 s[4:5], exec -; GCN-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 -; GCN-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: v_mbcnt_lo_u32_b32_e64 v1, s4, 0 +; GCN-NEXT: v_mbcnt_hi_u32_b32_e32 v1, s5, v1 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 ; GCN-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GCN-NEXT: s_cbranch_execz .LBB6_2 ; GCN-NEXT: ; %bb.1: ; GCN-NEXT: s_bcnt1_i32_b64 s4, s[4:5] -; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: v_mov_b32_e32 v1, s4 ; GCN-NEXT: buffer_atomic_add v1, v0, s[0:3], 0 idxen ; GCN-NEXT: .LBB6_2: @@ -368,22 +368,22 @@ ; ; GCN-LABEL: atomic_ptr_add_and_format: ; GCN: ; %bb.0: ; %.entry -; GCN-NEXT: s_mov_b64 s[6:7], exec -; GCN-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 -; GCN-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s7, v0 +; GCN-NEXT: s_mov_b64 s[4:5], exec +; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 +; GCN-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GCN-NEXT: s_cbranch_execz .LBB7_2 ; GCN-NEXT: ; %bb.1: -; GCN-NEXT: s_bcnt1_i32_b64 s6, s[6:7] -; GCN-NEXT: v_mov_b32_e32 v1, s6 -; GCN-NEXT: v_mov_b32_e32 v2, 0 -; GCN-NEXT: buffer_atomic_add v1, v2, 
s[0:3], 0 idxen glc +; GCN-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; GCN-NEXT: v_mov_b32_e32 v2, s4 +; GCN-NEXT: buffer_atomic_add v2, v1, s[0:3], 0 idxen glc ; GCN-NEXT: .LBB7_2: -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_readfirstlane_b32 s4, v1 +; GCN-NEXT: v_readfirstlane_b32 s4, v2 ; GCN-NEXT: v_add_i32_e32 v4, vcc, s4, v0 ; GCN-NEXT: s_waitcnt expcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s0 @@ -422,14 +422,14 @@ ; GCN-LABEL: atomic_ptr_sub: ; GCN: ; %bb.0: ; %.entry ; GCN-NEXT: s_mov_b64 s[4:5], exec -; GCN-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 -; GCN-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: v_mbcnt_lo_u32_b32_e64 v1, s4, 0 +; GCN-NEXT: v_mbcnt_hi_u32_b32_e32 v1, s5, v1 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 ; GCN-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GCN-NEXT: s_cbranch_execz .LBB8_2 ; GCN-NEXT: ; %bb.1: ; GCN-NEXT: s_bcnt1_i32_b64 s4, s[4:5] -; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: v_mov_b32_e32 v1, s4 ; GCN-NEXT: buffer_atomic_sub v1, v0, s[0:3], 0 idxen ; GCN-NEXT: .LBB8_2: @@ -466,22 +466,22 @@ ; ; GCN-LABEL: atomic_ptr_sub_and_format: ; GCN: ; %bb.0: ; %.entry -; GCN-NEXT: s_mov_b64 s[6:7], exec -; GCN-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 -; GCN-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s7, v0 +; GCN-NEXT: s_mov_b64 s[4:5], exec +; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 +; GCN-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GCN-NEXT: s_cbranch_execz .LBB9_2 ; GCN-NEXT: ; %bb.1: -; GCN-NEXT: s_bcnt1_i32_b64 s6, s[6:7] -; GCN-NEXT: v_mov_b32_e32 v1, s6 -; GCN-NEXT: v_mov_b32_e32 v2, 0 -; GCN-NEXT: buffer_atomic_sub v1, v2, s[0:3], 0 idxen glc +; GCN-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; GCN-NEXT: v_mov_b32_e32 v2, s4 +; GCN-NEXT: buffer_atomic_sub v2, v1, s[0:3], 0 idxen glc ; GCN-NEXT: .LBB9_2: -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_readfirstlane_b32 s4, v1 +; GCN-NEXT: v_readfirstlane_b32 s4, v2 ; GCN-NEXT: v_sub_i32_e32 v4, vcc, s4, v0 ; GCN-NEXT: s_waitcnt expcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s0 @@ -521,15 +521,15 @@ ; GCN-LABEL: atomic_ptr_xor: ; GCN: ; %bb.0: ; %.entry ; GCN-NEXT: s_mov_b64 s[4:5], exec -; GCN-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 -; GCN-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: v_mbcnt_lo_u32_b32_e64 v1, s4, 0 +; GCN-NEXT: v_mbcnt_hi_u32_b32_e32 v1, s5, v1 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 ; GCN-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GCN-NEXT: s_cbranch_execz .LBB10_2 ; GCN-NEXT: ; %bb.1: ; GCN-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GCN-NEXT: s_and_b32 s4, s4, 1 -; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: v_mov_b32_e32 v1, s4 ; GCN-NEXT: buffer_atomic_xor v1, v0, s[0:3], 0 idxen ; GCN-NEXT: .LBB10_2: @@ -569,22 +569,22 @@ ; GCN-LABEL: atomic_ptr_xor_and_format: ; GCN: ; %bb.0: ; %.entry ; GCN-NEXT: s_mov_b64 s[6:7], exec +; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 ; GCN-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s7, v0 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr2 ; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc 
; GCN-NEXT: s_cbranch_execz .LBB11_2 ; GCN-NEXT: ; %bb.1: ; GCN-NEXT: s_bcnt1_i32_b64 s6, s[6:7] ; GCN-NEXT: s_and_b32 s6, s6, 1 -; GCN-NEXT: v_mov_b32_e32 v1, s6 -; GCN-NEXT: v_mov_b32_e32 v2, 0 -; GCN-NEXT: buffer_atomic_xor v1, v2, s[0:3], 0 idxen glc +; GCN-NEXT: v_mov_b32_e32 v2, s6 +; GCN-NEXT: buffer_atomic_xor v2, v1, s[0:3], 0 idxen glc ; GCN-NEXT: .LBB11_2: ; GCN-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_readfirstlane_b32 s4, v1 +; GCN-NEXT: v_readfirstlane_b32 s4, v2 ; GCN-NEXT: v_and_b32_e32 v0, 1, v0 ; GCN-NEXT: v_xor_b32_e32 v4, s4, v0 ; GCN-NEXT: s_waitcnt expcnt(0) Index: llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_udec_wrap.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_udec_wrap.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_udec_wrap.ll @@ -50,11 +50,11 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX9-NEXT: v_mov_b32_e32 v1, 42 +; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: ds_dec_rtn_u32 v0, v0, v1 +; GFX9-NEXT: ds_dec_rtn_u32 v0, v1, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: global_store_dword v1, v0, s[0:1] @@ -63,13 +63,13 @@ ; GFX10-LABEL: lds_atomic_dec_ret_i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dword s0, s[4:5], 0x8 -; GFX10-NEXT: v_mov_b32_e32 v1, 42 +; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: v_mov_b32_e32 v1, s0 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_dec_rtn_u32 v0, v0, v1 +; GFX10-NEXT: ds_dec_rtn_u32 v0, v1, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_mov_b32_e32 v1, 0 @@ -78,14 +78,14 @@ ; ; GFX11-LABEL: lds_atomic_dec_ret_i32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 +; GFX11-NEXT: v_mov_b32_e32 v0, 42 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, 42 :: v_dual_mov_b32 v0, s2 +; GFX11-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: ds_dec_rtn_u32 v0, v0, v1 +; GFX11-NEXT: ds_dec_rtn_u32 v0, v1, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_mov_b32_e32 v1, 0 @@ -210,23 +210,23 @@ ; GFX9-LABEL: lds_atomic_dec_noret_i32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX9-NEXT: v_mov_b32_e32 v1, 42 +; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: ds_dec_u32 v0, v1 +; GFX9-NEXT: ds_dec_u32 v1, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: lds_atomic_dec_noret_i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX10-NEXT: v_mov_b32_e32 v1, 42 +; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: v_mov_b32_e32 v1, s0 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_dec_u32 v0, v1 +; GFX10-NEXT: ds_dec_u32 v1, v0 ; GFX10-NEXT: 
s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_endpgm @@ -235,10 +235,10 @@ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, 42 :: v_dual_mov_b32 v0, s0 +; GFX11-NEXT: v_dual_mov_b32 v0, 42 :: v_dual_mov_b32 v1, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: ds_dec_u32 v0, v1 +; GFX11-NEXT: ds_dec_u32 v1, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm Index: llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll @@ -50,11 +50,11 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX9-NEXT: v_mov_b32_e32 v1, 42 +; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: ds_inc_rtn_u32 v0, v0, v1 +; GFX9-NEXT: ds_inc_rtn_u32 v0, v1, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: global_store_dword v1, v0, s[0:1] @@ -63,13 +63,13 @@ ; GFX10-LABEL: lds_atomic_inc_ret_i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dword s0, s[4:5], 0x8 -; GFX10-NEXT: v_mov_b32_e32 v1, 42 +; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: v_mov_b32_e32 v1, s0 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_inc_rtn_u32 v0, v0, v1 +; GFX10-NEXT: ds_inc_rtn_u32 v0, v1, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_mov_b32_e32 v1, 0 @@ -78,14 +78,14 @@ ; ; GFX11-LABEL: lds_atomic_inc_ret_i32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 +; GFX11-NEXT: v_mov_b32_e32 v0, 42 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, 42 :: v_dual_mov_b32 v0, s2 +; GFX11-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: ds_inc_rtn_u32 v0, v0, v1 +; GFX11-NEXT: ds_inc_rtn_u32 v0, v1, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_mov_b32_e32 v1, 0 @@ -210,23 +210,23 @@ ; GFX9-LABEL: lds_atomic_inc_noret_i32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX9-NEXT: v_mov_b32_e32 v1, 42 +; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: ds_inc_u32 v0, v1 +; GFX9-NEXT: ds_inc_u32 v1, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: lds_atomic_inc_noret_i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX10-NEXT: v_mov_b32_e32 v1, 42 +; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: v_mov_b32_e32 v1, s0 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_inc_u32 v0, v1 +; GFX10-NEXT: ds_inc_u32 v1, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_endpgm @@ 
-235,10 +235,10 @@ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, 42 :: v_dual_mov_b32 v0, s0 +; GFX11-NEXT: v_dual_mov_b32 v0, 42 :: v_dual_mov_b32 v1, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: ds_inc_u32 v0, v1 +; GFX11-NEXT: ds_inc_u32 v1, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm @@ -3008,13 +3008,13 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dword s6, s[4:5], 0x10 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: v_mov_b32_e32 v1, 42 +; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s6 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: ds_inc_rtn_u32 v2, v0, v1 +; GFX9-NEXT: ds_inc_rtn_u32 v2, v1, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: ds_inc_rtn_u32 v0, v0, v1 +; GFX9-NEXT: ds_inc_rtn_u32 v0, v1, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: global_store_dword v1, v2, s[0:1] @@ -3024,18 +3024,18 @@ ; GFX10-LABEL: nocse_lds_atomic_inc_ret_i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dword s0, s[4:5], 0x10 -; GFX10-NEXT: v_mov_b32_e32 v1, 42 +; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: v_mov_b32_e32 v1, s0 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_inc_rtn_u32 v2, v0, v1 +; GFX10-NEXT: ds_inc_rtn_u32 v2, v1, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_inc_rtn_u32 v0, v0, v1 +; GFX10-NEXT: ds_inc_rtn_u32 v0, v1, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_mov_b32_e32 v1, 0 @@ -3047,16 +3047,16 @@ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x10 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, 42 :: v_dual_mov_b32 v0, s2 +; GFX11-NEXT: v_dual_mov_b32 v0, 42 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: ds_inc_rtn_u32 v2, v0, v1 +; GFX11-NEXT: ds_inc_rtn_u32 v2, v1, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: ds_inc_rtn_u32 v0, v0, v1 +; GFX11-NEXT: ds_inc_rtn_u32 v0, v1, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_mov_b32_e32 v1, 0 Index: llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll @@ -63,18 +63,18 @@ ; MUBUF-NEXT: s_add_u32 flat_scratch_lo, s4, s7 ; MUBUF-NEXT: s_addc_u32 flat_scratch_hi, s5, 0 ; MUBUF-NEXT: s_add_u32 s0, s0, s7 -; MUBUF-NEXT: s_mov_b32 s5, 0 +; MUBUF-NEXT: s_mov_b32 s4, 0 ; MUBUF-NEXT: s_addc_u32 s1, s1, 0 -; MUBUF-NEXT: s_movk_i32 s4, 0x80 ; MUBUF-NEXT: v_mov_b32_e32 v0, 0 -; MUBUF-NEXT: v_mov_b32_e32 v1, s5 +; MUBUF-NEXT: v_mov_b32_e32 v1, 0x80 +; MUBUF-NEXT: v_mov_b32_e32 v2, s4 ; MUBUF-NEXT: s_movk_i32 s32, 0x1400 ; MUBUF-NEXT: .LBB1_1: ; %loadstoreloop ; MUBUF-NEXT: ; 
=>This Inner Loop Header: Depth=1 -; MUBUF-NEXT: v_add_u32_e32 v2, 4, v1 -; MUBUF-NEXT: v_add_u32_e32 v1, 1, v1 -; MUBUF-NEXT: v_cmp_gt_u32_e32 vcc, s4, v1 -; MUBUF-NEXT: buffer_store_byte v0, v2, s[0:3], 0 offen +; MUBUF-NEXT: v_add_u32_e32 v3, 4, v2 +; MUBUF-NEXT: v_add_u32_e32 v2, 1, v2 +; MUBUF-NEXT: v_cmp_lt_u32_e32 vcc, v2, v1 +; MUBUF-NEXT: buffer_store_byte v0, v3, s[0:3], 0 offen ; MUBUF-NEXT: s_cbranch_vccnz .LBB1_1 ; MUBUF-NEXT: ; %bb.2: ; %split ; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:4 @@ -134,18 +134,18 @@ ; FLATSCR-LABEL: kernel_caller_byval: ; FLATSCR: ; %bb.0: ; %loadstoreloop.preheader ; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s0, s3 -; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 -; FLATSCR-NEXT: s_mov_b32 s1, 0 -; FLATSCR-NEXT: s_movk_i32 s0, 0x80 +; FLATSCR-NEXT: s_mov_b32 s0, 0 ; FLATSCR-NEXT: v_mov_b32_e32 v0, 0 -; FLATSCR-NEXT: v_mov_b32_e32 v1, s1 +; FLATSCR-NEXT: v_mov_b32_e32 v1, 0x80 +; FLATSCR-NEXT: v_mov_b32_e32 v2, s0 ; FLATSCR-NEXT: s_movk_i32 s32, 0x50 +; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 ; FLATSCR-NEXT: .LBB1_1: ; %loadstoreloop ; FLATSCR-NEXT: ; =>This Inner Loop Header: Depth=1 -; FLATSCR-NEXT: v_add_u32_e32 v2, 4, v1 -; FLATSCR-NEXT: v_add_u32_e32 v1, 1, v1 -; FLATSCR-NEXT: v_cmp_gt_u32_e32 vcc, s0, v1 -; FLATSCR-NEXT: scratch_store_byte v2, v0, off +; FLATSCR-NEXT: v_add_u32_e32 v3, 4, v2 +; FLATSCR-NEXT: v_add_u32_e32 v2, 1, v2 +; FLATSCR-NEXT: v_cmp_lt_u32_e32 vcc, v2, v1 +; FLATSCR-NEXT: scratch_store_byte v3, v0, off ; FLATSCR-NEXT: s_cbranch_vccnz .LBB1_1 ; FLATSCR-NEXT: ; %bb.2: ; %split ; FLATSCR-NEXT: s_mov_b32 s0, 0 Index: llvm/test/CodeGen/AMDGPU/GlobalISel/clamp-minmax-const-combine.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/clamp-minmax-const-combine.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/clamp-minmax-const-combine.ll @@ -188,9 +188,7 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_mul_f32_e32 v0, 2.0, v0 -; GFX10-NEXT: v_max_f32_e32 v0, 0, v0 -; GFX10-NEXT: v_min_f32_e32 v0, 1.0, v0 +; GFX10-NEXT: v_mul_f32_e64 v0, v0, 2.0 clamp ; GFX10-NEXT: s_setpc_b64 s[30:31] %fmul = fmul float %a, 2.0 %maxnum = call float @llvm.maxnum.f32(float %fmul, float 0.0) @@ -204,8 +202,7 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_mul_f32_e32 v0, 2.0, v0 -; GFX10-NEXT: v_med3_f32 v0, v0, 0, 1.0 +; GFX10-NEXT: v_mul_f32_e64 v0, v0, 2.0 clamp ; GFX10-NEXT: s_setpc_b64 s[30:31] %fmul = fmul float %a, 2.0 %maxnum = call float @llvm.maxnum.f32(float %fmul, float 0.0) @@ -220,9 +217,7 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_mul_f32_e32 v0, 2.0, v0 -; GFX10-NEXT: v_min_f32_e32 v0, 1.0, v0 -; GFX10-NEXT: v_max_f32_e32 v0, 0, v0 +; GFX10-NEXT: v_mul_f32_e64 v0, v0, 2.0 clamp ; GFX10-NEXT: s_setpc_b64 s[30:31] %fmul = fmul float %a, 2.0 %minnum = call float @llvm.minnum.f32(float %fmul, float 1.0) @@ -235,9 +230,7 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_mul_f32_e32 v0, 2.0, v0 -; GFX10-NEXT: v_min_f32_e32 v0, 1.0, v0 -; GFX10-NEXT: v_max_f32_e32 v0, 0, v0 +; GFX10-NEXT: v_mul_f32_e64 v0, v0, 2.0 clamp ; GFX10-NEXT: s_setpc_b64 s[30:31] %fmul = fmul float %a, 2.0 %minnum = call float @llvm.minnum.f32(float %fmul, 
float 1.0) Index: llvm/test/CodeGen/AMDGPU/GlobalISel/combine-short-clamp.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/combine-short-clamp.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/combine-short-clamp.ll @@ -9,12 +9,12 @@ ; GCN-LABEL: {{^}}v_clamp_i64_i16 ; GFX678: v_cvt_pk_i16_i32_e32 [[A:v[0-9]+]], [[A]], [[B:v[0-9]+]] ; GFX9: v_cvt_pk_i16_i32 [[A:v[0-9]+]], [[A]], [[B:v[0-9]+]] -; GFX6789: v_mov_b32_e32 [[B]], 0xffff8000 -; GFX6789: v_mov_b32_e32 [[C:v[0-9]+]], 0x7fff -; GFX6789: v_med3_i32 [[A]], [[B]], [[A]], [[C]] +; GFX6789: v_mov_b32_e32 [[B]], 0x7fff +; GFX6789: v_mov_b32_e32 [[C:v[0-9]+]], 0xffff8000 +; GFX6789: v_med3_i32 [[A]], [[C]], [[A]], [[B]] ; GFX10: v_cvt_pk_i16_i32{{(_e64)?}} [[A:v[0-9]+]], {{v[0-9]+}}, [[B:v[0-9]+]] -; GFX10: v_mov_b32_e32 [[B]], 0x7fff -; GFX10: v_med3_i32 [[A]], 0xffff8000, [[A]], [[B]] +; GFX10: v_mov_b32_e32 [[B]], 0xffff8000 +; GFX10: v_med3_i32 [[A]], [[B]], [[A]], 0x7fff define i16 @v_clamp_i64_i16(i64 %in) #0 { entry: %max = call i64 @llvm.smax.i64(i64 %in, i64 -32768) @@ -26,12 +26,12 @@ ; GCN-LABEL: {{^}}v_clamp_i64_i16_reverse ; GFX678: v_cvt_pk_i16_i32_e32 [[A:v[0-9]+]], [[A]], [[B:v[0-9]+]] ; GFX9: v_cvt_pk_i16_i32 [[A:v[0-9]+]], [[A]], [[B:v[0-9]+]] -; GFX6789: v_mov_b32_e32 [[B]], 0xffff8000 -; GFX6789: v_mov_b32_e32 [[C:v[0-9]+]], 0x7fff -; GFX6789: v_med3_i32 [[A]], [[B]], [[A]], [[C]] +; GFX6789: v_mov_b32_e32 [[B]], 0x7fff +; GFX6789: v_mov_b32_e32 [[C:v[0-9]+]], 0xffff8000 +; GFX6789: v_med3_i32 [[A]], [[C]], [[A]], [[B]] ; GFX10: v_cvt_pk_i16_i32{{(_e64)?}} [[A:v[0-9]+]], {{v[0-9]+}}, [[B:v[0-9]+]] -; GFX10: v_mov_b32_e32 [[B]], 0x7fff -; GFX10: v_med3_i32 [[A]], 0xffff8000, [[A]], [[B]] +; GFX10: v_mov_b32_e32 [[B]], 0xffff8000 +; GFX10: v_med3_i32 [[A]], [[B]], [[A]], 0x7fff define i16 @v_clamp_i64_i16_reverse(i64 %in) #0 { entry: %min = call i64 @llvm.smin.i64(i64 %in, i64 32767) @@ -70,12 +70,12 @@ ; GCN-LABEL: {{^}}v_clamp_i64_i16_lower_than_short ; GFX678: v_cvt_pk_i16_i32_e32 [[A:v[0-9]+]], [[A]], [[B:v[0-9]+]] ; GFX9: v_cvt_pk_i16_i32 [[A:v[0-9]+]], [[A]], [[B:v[0-9]+]] -; GFX6789: v_mov_b32_e32 [[B]], 0xffffff01 -; GFX6789: v_mov_b32_e32 [[C:v[0-9]+]], 0x100 -; GFX6789: v_med3_i32 [[A]], [[B]], [[A]], [[C]] +; GFX6789: v_mov_b32_e32 [[B]], 0x100 +; GFX6789: v_mov_b32_e32 [[C:v[0-9]+]], 0xffffff01 +; GFX6789: v_med3_i32 [[A]], [[C]], [[A]], [[B]] ; GFX10: v_cvt_pk_i16_i32{{(_e64)?}} [[A:v[0-9]+]], {{v[0-9]+}}, [[B:v[0-9]+]] -; GFX10: v_mov_b32_e32 [[B]], 0x100 -; GFX10: v_med3_i32 [[A]], 0xffffff01, [[A]], [[B]] +; GFX10: v_mov_b32_e32 [[B]], 0xffffff01 +; GFX10: v_med3_i32 [[A]], [[B]], [[A]], 0x100 define i16 @v_clamp_i64_i16_lower_than_short(i64 %in) #0 { entry: %min = call i64 @llvm.smin.i64(i64 %in, i64 256) @@ -87,12 +87,12 @@ ; GCN-LABEL: {{^}}v_clamp_i64_i16_lower_than_short_reverse ; GFX678: v_cvt_pk_i16_i32_e32 [[A:v[0-9]+]], [[A]], [[B:v[0-9]+]] ; GFX9: v_cvt_pk_i16_i32 [[A:v[0-9]+]], [[A]], [[B:v[0-9]+]] -; GFX6789: v_mov_b32_e32 [[B]], 0xffffff01 -; GFX6789: v_mov_b32_e32 [[C:v[0-9]+]], 0x100 -; GFX6789: v_med3_i32 [[A]], [[B]], [[A]], [[C]] +; GFX6789: v_mov_b32_e32 [[B]], 0x100 +; GFX6789: v_mov_b32_e32 [[C:v[0-9]+]], 0xffffff01 +; GFX6789: v_med3_i32 [[A]], [[C]], [[A]], [[B]] ; GFX10: v_cvt_pk_i16_i32{{(_e64)?}} [[A:v[0-9]+]], {{v[0-9]+}}, [[B:v[0-9]+]] -; GFX10: v_mov_b32_e32 [[B]], 0x100 -; GFX10: v_med3_i32 [[A]], 0xffffff01, [[A]], [[B]] +; GFX10: v_mov_b32_e32 [[B]], 0xffffff01 +; GFX10: v_med3_i32 [[A]], [[B]], [[A]], 0x100 define i16 
@v_clamp_i64_i16_lower_than_short_reverse(i64 %in) #0 { entry: %max = call i64 @llvm.smax.i64(i64 %in, i64 -255) Index: llvm/test/CodeGen/AMDGPU/GlobalISel/crash-stack-address-O0.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/crash-stack-address-O0.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/crash-stack-address-O0.ll @@ -9,12 +9,10 @@ ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_add_u32 s0, s0, s17 ; CHECK-NEXT: s_addc_u32 s1, s1, 0 -; CHECK-NEXT: s_mov_b32 s5, 0 -; CHECK-NEXT: s_mov_b32 s4, 0 -; CHECK-NEXT: v_mov_b32_e32 v0, s5 -; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4 +; CHECK-NEXT: v_mov_b32_e32 v1, 0 +; CHECK-NEXT: v_mov_b32_e32 v0, 0 +; CHECK-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:4 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v0, s4 ; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:8 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_endpgm Index: llvm/test/CodeGen/AMDGPU/GlobalISel/dropped_debug_info_assert.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/dropped_debug_info_assert.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/dropped_debug_info_assert.ll @@ -27,15 +27,13 @@ ; CHECK-NEXT: [[COPY14:%[0-9]+]]:sreg_32 = COPY [[COPY4]], debug-location !6 ; CHECK-NEXT: [[COPY15:%[0-9]+]]:sreg_32 = COPY [[COPY3]], debug-location !6 ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF debug-location !6 - ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 10 - ; CHECK-NEXT: [[COPY16:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; CHECK-NEXT: [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[COPY16]], [[COPY1]], implicit $exec, debug-location !6 - ; CHECK-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 20 - ; CHECK-NEXT: [[COPY17:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] - ; CHECK-NEXT: [[V_LSHLREV_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[COPY17]], [[COPY]], implicit $exec, debug-location !6 + ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 10, implicit $exec, debug-location !6 + ; CHECK-NEXT: [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[V_MOV_B32_e32_]], [[COPY1]], implicit $exec, debug-location !6 + ; CHECK-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 20, implicit $exec, debug-location !6 + ; CHECK-NEXT: [[V_LSHLREV_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[V_MOV_B32_e32_1]], [[COPY]], implicit $exec, debug-location !6 ; CHECK-NEXT: [[V_OR3_B32_e64_:%[0-9]+]]:vgpr_32 = V_OR3_B32_e64 [[COPY2]], [[V_LSHLREV_B32_e64_]], [[V_LSHLREV_B32_e64_1]], implicit $exec, debug-location !6 - ; CHECK-NEXT: [[COPY18:%[0-9]+]]:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3, debug-location !6 - ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]], debug-location !6 + ; CHECK-NEXT: [[COPY16:%[0-9]+]]:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3, debug-location !6 + ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY16]], debug-location !6 ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]], debug-location !6 ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]], debug-location !6 ; CHECK-NEXT: $sgpr8_sgpr9 = COPY [[COPY9]], debug-location !6 Index: llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f32.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f32.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f32.ll @@ -1175,19 +1175,19 @@ ; GCN-FLUSH-LABEL: v_fdiv_v2f32_ulp25: ; GCN-FLUSH: ; %bb.0: ; GCN-FLUSH-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-FLUSH-NEXT: s_mov_b32 s4, 0x6f800000 -; GCN-FLUSH-NEXT: v_mov_b32_e32 v4, 0x2f800000 -; GCN-FLUSH-NEXT: v_cmp_gt_f32_e64 vcc, |v2|, s4 -; GCN-FLUSH-NEXT: v_cndmask_b32_e32 v5, 1.0, v4, vcc -; GCN-FLUSH-NEXT: v_cmp_gt_f32_e64 vcc, |v3|, s4 -; GCN-FLUSH-NEXT: v_cndmask_b32_e32 v4, 1.0, v4, vcc -; GCN-FLUSH-NEXT: v_mul_f32_e32 v2, v2, v5 +; GCN-FLUSH-NEXT: v_mov_b32_e32 v4, 0x6f800000 +; GCN-FLUSH-NEXT: v_mov_b32_e32 v5, 0x2f800000 +; GCN-FLUSH-NEXT: v_cmp_gt_f32_e64 vcc, |v2|, v4 +; GCN-FLUSH-NEXT: v_cndmask_b32_e32 v6, 1.0, v5, vcc +; GCN-FLUSH-NEXT: v_cmp_gt_f32_e64 vcc, |v3|, v4 +; GCN-FLUSH-NEXT: v_cndmask_b32_e32 v4, 1.0, v5, vcc +; GCN-FLUSH-NEXT: v_mul_f32_e32 v2, v2, v6 ; GCN-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v4 ; GCN-FLUSH-NEXT: v_rcp_f32_e32 v2, v2 ; GCN-FLUSH-NEXT: v_rcp_f32_e32 v3, v3 ; GCN-FLUSH-NEXT: v_mul_f32_e32 v0, v0, v2 ; GCN-FLUSH-NEXT: v_mul_f32_e32 v1, v1, v3 -; GCN-FLUSH-NEXT: v_mul_f32_e32 v0, v5, v0 +; GCN-FLUSH-NEXT: v_mul_f32_e32 v0, v6, v0 ; GCN-FLUSH-NEXT: v_mul_f32_e32 v1, v4, v1 ; GCN-FLUSH-NEXT: s_setpc_b64 s[30:31] ; @@ -1878,19 +1878,19 @@ ; GCN-IEEE-LABEL: v_rcp_v2f32_ulp25: ; GCN-IEEE: ; %bb.0: ; GCN-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-IEEE-NEXT: s_mov_b32 s4, 0x6f800000 -; GCN-IEEE-NEXT: v_mov_b32_e32 v2, 0x2f800000 -; GCN-IEEE-NEXT: v_cmp_gt_f32_e64 vcc, |v0|, s4 -; GCN-IEEE-NEXT: v_cndmask_b32_e32 v3, 1.0, v2, vcc -; GCN-IEEE-NEXT: v_cmp_gt_f32_e64 vcc, |v1|, s4 -; GCN-IEEE-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc -; GCN-IEEE-NEXT: v_mul_f32_e32 v0, v0, v3 +; GCN-IEEE-NEXT: v_mov_b32_e32 v2, 0x6f800000 +; GCN-IEEE-NEXT: v_mov_b32_e32 v3, 0x2f800000 +; GCN-IEEE-NEXT: v_cmp_gt_f32_e64 vcc, |v0|, v2 +; GCN-IEEE-NEXT: v_cndmask_b32_e32 v4, 1.0, v3, vcc +; GCN-IEEE-NEXT: v_cmp_gt_f32_e64 vcc, |v1|, v2 +; GCN-IEEE-NEXT: v_cndmask_b32_e32 v2, 1.0, v3, vcc +; GCN-IEEE-NEXT: v_mul_f32_e32 v0, v0, v4 ; GCN-IEEE-NEXT: v_mul_f32_e32 v1, v1, v2 ; GCN-IEEE-NEXT: v_rcp_f32_e32 v0, v0 ; GCN-IEEE-NEXT: v_rcp_f32_e32 v1, v1 ; GCN-IEEE-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GCN-IEEE-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-IEEE-NEXT: v_mul_f32_e32 v0, v3, v0 +; GCN-IEEE-NEXT: v_mul_f32_e32 v0, v4, v0 ; GCN-IEEE-NEXT: v_mul_f32_e32 v1, v2, v1 ; GCN-IEEE-NEXT: s_setpc_b64 s[30:31] ; @@ -2021,19 +2021,19 @@ ; GCN-FLUSH-LABEL: v_fdiv_v2f32_arcp_ulp25: ; GCN-FLUSH: ; %bb.0: ; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-FLUSH-NEXT: s_mov_b32 s4, 0x6f800000 -; GCN-FLUSH-NEXT: v_mov_b32_e32 v4, 0x2f800000 -; GCN-FLUSH-NEXT: v_cmp_gt_f32_e64 vcc, |v2|, s4 -; GCN-FLUSH-NEXT: v_cndmask_b32_e32 v5, 1.0, v4, vcc -; GCN-FLUSH-NEXT: v_cmp_gt_f32_e64 vcc, |v3|, s4 -; GCN-FLUSH-NEXT: v_cndmask_b32_e32 v4, 1.0, v4, vcc -; GCN-FLUSH-NEXT: v_mul_f32_e32 v2, v2, v5 +; GCN-FLUSH-NEXT: v_mov_b32_e32 v4, 0x6f800000 +; GCN-FLUSH-NEXT: v_mov_b32_e32 v5, 0x2f800000 +; GCN-FLUSH-NEXT: v_cmp_gt_f32_e64 vcc, |v2|, v4 +; GCN-FLUSH-NEXT: v_cndmask_b32_e32 v6, 1.0, v5, vcc +; GCN-FLUSH-NEXT: v_cmp_gt_f32_e64 vcc, |v3|, v4 +; GCN-FLUSH-NEXT: v_cndmask_b32_e32 v4, 1.0, v5, vcc +; GCN-FLUSH-NEXT: v_mul_f32_e32 v2, v2, v6 ; GCN-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v4 ; GCN-FLUSH-NEXT: v_rcp_f32_e32 v2, v2 ; GCN-FLUSH-NEXT: v_rcp_f32_e32 v3, v3 ; GCN-FLUSH-NEXT: v_mul_f32_e32 v0, v0, v2 ; GCN-FLUSH-NEXT: v_mul_f32_e32 v1, v1, v3 -; GCN-FLUSH-NEXT: v_mul_f32_e32 v0, v5, v0 +; GCN-FLUSH-NEXT: v_mul_f32_e32 v0, v6, v0 ; GCN-FLUSH-NEXT: v_mul_f32_e32 v1, v4, v1 ; GCN-FLUSH-NEXT: s_setpc_b64 s[30:31] ; Index: 
llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll @@ -818,12 +818,12 @@ ; GFX9-NEXT: s_mov_b32 s0, 0 ; GFX9-NEXT: scratch_store_dword off, v0, s0 offset:4 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_movk_i32 s0, 0x3e80 -; GFX9-NEXT: v_mov_b32_e32 v0, 15 -; GFX9-NEXT: s_add_i32 s0, s0, 4 -; GFX9-NEXT: scratch_store_dword off, v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v0, 0x3e80 +; GFX9-NEXT: v_add_u32_e32 v0, 4, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, 15 +; GFX9-NEXT: scratch_store_dword v0, v1, off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: scratch_load_dword v0, off, s0 glc +; GFX9-NEXT: scratch_load_dword v0, v0, off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_endpgm ; @@ -833,15 +833,15 @@ ; GFX10-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 -; GFX10-NEXT: v_mov_b32_e32 v0, 13 -; GFX10-NEXT: v_mov_b32_e32 v1, 15 -; GFX10-NEXT: s_movk_i32 s0, 0x3e80 -; GFX10-NEXT: s_add_i32 s0, s0, 4 -; GFX10-NEXT: scratch_store_dword off, v0, off offset:4 +; GFX10-NEXT: v_mov_b32_e32 v0, 0x3e80 +; GFX10-NEXT: v_mov_b32_e32 v1, 13 +; GFX10-NEXT: v_mov_b32_e32 v2, 15 +; GFX10-NEXT: v_add_nc_u32_e32 v0, 4, v0 +; GFX10-NEXT: scratch_store_dword off, v1, off offset:4 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: scratch_store_dword off, v1, s0 +; GFX10-NEXT: scratch_store_dword v0, v2, off ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: scratch_load_dword v0, off, s0 glc dlc +; GFX10-NEXT: scratch_load_dword v0, v0, off glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_endpgm ; @@ -885,15 +885,15 @@ ; GFX9: ; %bb.0: ; %bb ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, 13 -; GFX9-NEXT: s_movk_i32 s0, 0x3e80 -; GFX9-NEXT: s_add_i32 s1, s32, 4 ; GFX9-NEXT: scratch_store_dword off, v0, s32 offset:4 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, 15 -; GFX9-NEXT: s_add_i32 s0, s0, s1 -; GFX9-NEXT: scratch_store_dword off, v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v0, 0x3e80 +; GFX9-NEXT: s_add_i32 s0, s32, 4 +; GFX9-NEXT: v_add_u32_e32 v0, s0, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, 15 +; GFX9-NEXT: scratch_store_dword v0, v1, off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: scratch_load_dword v0, off, s0 glc +; GFX9-NEXT: scratch_load_dword v0, v0, off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -901,16 +901,16 @@ ; GFX10: ; %bb.0: ; %bb ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_mov_b32_e32 v0, 13 -; GFX10-NEXT: v_mov_b32_e32 v1, 15 -; GFX10-NEXT: s_movk_i32 s0, 0x3e80 -; GFX10-NEXT: s_add_i32 s1, s32, 4 -; GFX10-NEXT: s_add_i32 s0, s0, s1 -; GFX10-NEXT: scratch_store_dword off, v0, s32 offset:4 +; GFX10-NEXT: v_mov_b32_e32 v0, 0x3e80 +; GFX10-NEXT: s_add_i32 s0, s32, 4 +; GFX10-NEXT: v_mov_b32_e32 v1, 13 +; GFX10-NEXT: v_mov_b32_e32 v2, 15 +; GFX10-NEXT: v_add_nc_u32_e32 v0, s0, v0 +; GFX10-NEXT: scratch_store_dword off, v1, s32 offset:4 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: scratch_store_dword off, v1, s0 +; GFX10-NEXT: scratch_store_dword v0, v2, off ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: scratch_load_dword v0, off, s0 glc dlc +; GFX10-NEXT: scratch_load_dword v0, v0, off glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 
s[30:31] ; Index: llvm/test/CodeGen/AMDGPU/GlobalISel/fmed3-min-max-const-combine.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/fmed3-min-max-const-combine.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/fmed3-min-max-const-combine.ll @@ -315,15 +315,13 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_max_f32_e32 v0, 2.0, v0 -; GFX10-NEXT: v_min_f32_e32 v0, 4.0, v0 +; GFX10-NEXT: v_med3_f32 v0, v0, 2.0, 4.0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: test_min_max_maybe_NaN_input_ieee_false: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_max_f32_e32 v0, 2.0, v0 -; GFX8-NEXT: v_min_f32_e32 v0, 4.0, v0 +; GFX8-NEXT: v_med3_f32 v0, v0, 2.0, 4.0 ; GFX8-NEXT: s_setpc_b64 s[30:31] %maxnum = call float @llvm.maxnum.f32(float %a, float 2.0) %fmed = call float @llvm.minnum.f32(float %maxnum, float 4.0) @@ -337,15 +335,13 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_min_f32_e32 v0, 4.0, v0 -; GFX10-NEXT: v_max_f32_e32 v0, 2.0, v0 +; GFX10-NEXT: v_med3_f32 v0, v0, 2.0, 4.0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: test_max_min_maybe_NaN_input_ieee_false: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_min_f32_e32 v0, 4.0, v0 -; GFX8-NEXT: v_max_f32_e32 v0, 2.0, v0 +; GFX8-NEXT: v_med3_f32 v0, v0, 2.0, 4.0 ; GFX8-NEXT: s_setpc_b64 s[30:31] %minnum = call float @llvm.minnum.f32(float %a, float 4.0) %fmed = call float @llvm.maxnum.f32(float %minnum, float 2.0) @@ -359,16 +355,14 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_max_f32_e32 v0, v0, v0 -; GFX10-NEXT: v_min_f32_e32 v0, 4.0, v0 -; GFX10-NEXT: v_max_f32_e32 v0, 2.0, v0 +; GFX10-NEXT: v_med3_f32 v0, v0, 2.0, 4.0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: test_max_min_maybe_NaN_input_ieee_true: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX8-NEXT: v_min_f32_e32 v0, 4.0, v0 -; GFX8-NEXT: v_max_f32_e32 v0, 2.0, v0 +; GFX8-NEXT: v_med3_f32 v0, v0, 2.0, 4.0 ; GFX8-NEXT: s_setpc_b64 s[30:31] %minnum = call float @llvm.minnum.f32(float %a, float 4.0) %fmed = call float @llvm.maxnum.f32(float %minnum, float 2.0) Index: llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll @@ -910,12 +910,12 @@ ; GFX10-NEXT: v_lshrrev_b16 v1, 1, v1 ; GFX10-NEXT: v_lshlrev_b16 v3, v3, v5 ; GFX10-NEXT: v_lshlrev_b16 v0, v2, v0 -; GFX10-NEXT: s_movk_i32 s4, 0xff ; GFX10-NEXT: v_lshrrev_b16 v4, v6, v4 ; GFX10-NEXT: v_lshrrev_b16 v1, v7, v1 ; GFX10-NEXT: v_or_b32_e32 v2, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v3, 0xff ; GFX10-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX10-NEXT: v_and_b32_sdwa v1, v2, s4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_and_b32_sdwa v1, v2, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -1337,9 +1337,9 @@ ; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v2 ; GFX9-NEXT: v_and_b32_e32 v8, 7, v2 ; GFX9-NEXT: v_not_b32_e32 v2, v2 -; GFX9-NEXT: s_mov_b32 s5, 1 +; 
GFX9-NEXT: s_mov_b32 s4, 1 ; GFX9-NEXT: v_and_b32_e32 v2, 7, v2 -; GFX9-NEXT: v_lshrrev_b16_sdwa v10, s5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_lshrrev_b16_sdwa v10, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_lshlrev_b16_e32 v8, v8, v0 ; GFX9-NEXT: v_lshrrev_b16_e32 v2, v2, v10 ; GFX9-NEXT: v_lshrrev_b32_e32 v4, 8, v1 @@ -1348,7 +1348,7 @@ ; GFX9-NEXT: v_not_b32_e32 v5, v5 ; GFX9-NEXT: v_lshrrev_b32_e32 v3, 8, v0 ; GFX9-NEXT: v_and_b32_e32 v5, 7, v5 -; GFX9-NEXT: v_lshrrev_b16_sdwa v4, s5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_lshrrev_b16_sdwa v4, s4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_mov_b32_e32 v9, 0xff ; GFX9-NEXT: v_lshlrev_b16_e32 v3, v8, v3 ; GFX9-NEXT: v_lshrrev_b16_e32 v4, v5, v4 @@ -1370,9 +1370,8 @@ ; GFX9-NEXT: v_lshrrev_b16_e32 v1, v6, v1 ; GFX9-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX9-NEXT: v_mov_b32_e32 v1, 8 -; GFX9-NEXT: s_movk_i32 s4, 0xff ; GFX9-NEXT: v_lshlrev_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_and_or_b32 v1, v2, s4, v1 +; GFX9-NEXT: v_and_or_b32 v1, v2, v9, v1 ; GFX9-NEXT: v_and_b32_e32 v2, 0xff, v4 ; GFX9-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 @@ -1428,7 +1427,7 @@ ; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX10-NEXT: v_and_b32_e32 v1, 0xff, v1 ; GFX10-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GFX10-NEXT: v_and_or_b32 v0, v0, 0xff, v3 +; GFX10-NEXT: v_and_or_b32 v0, 0xff, v0, v3 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 24, v2 ; GFX10-NEXT: v_or3_b32 v0, v0, v1, v2 @@ -1485,7 +1484,7 @@ ; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v3 ; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_and_or_b32 v0, v0, 0xff, v1 +; GFX11-NEXT: v_and_or_b32 v0, 0xff, v0, v1 ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 24, v3 @@ -1583,7 +1582,6 @@ ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; GFX9-NEXT: v_sub_u32_e32 v1, 23, v0 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffffff, v1 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffffff, v0 ; GFX9-NEXT: v_lshrrev_b32_e64 v1, v1, s1 ; GFX9-NEXT: v_lshl_or_b32 v0, s0, v0, v1 ; GFX9-NEXT: v_readfirstlane_b32 s0, v0 @@ -1610,7 +1608,6 @@ ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v0 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo ; GFX10-NEXT: v_sub_nc_u32_e32 v1, 23, v0 -; GFX10-NEXT: v_and_b32_e32 v0, 0xffffff, v0 ; GFX10-NEXT: v_and_b32_e32 v1, 0xffffff, v1 ; GFX10-NEXT: v_lshrrev_b32_e64 v1, v1, s1 ; GFX10-NEXT: v_lshl_or_b32 v0, s0, v0, v1 @@ -1644,9 +1641,8 @@ ; GFX11-NEXT: v_subrev_nc_u32_e32 v1, 24, v0 ; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v0 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_sub_nc_u32_e32 v1, 23, v0 -; GFX11-NEXT: v_and_b32_e32 v0, 0xffffff, v0 ; GFX11-NEXT: v_and_b32_e32 v1, 0xffffff, v1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshrrev_b32_e64 v1, v1, s1 @@ -1743,7 +1739,6 @@ ; GFX9-NEXT: 
v_cndmask_b32_e32 v2, v2, v3, vcc ; GFX9-NEXT: v_sub_u32_e32 v3, 23, v2 ; GFX9-NEXT: v_and_b32_e32 v3, 0xffffff, v3 -; GFX9-NEXT: v_and_b32_e32 v2, 0xffffff, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v1, v3, v1 ; GFX9-NEXT: v_lshl_or_b32 v0, v0, v2, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -1771,7 +1766,6 @@ ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v2 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo ; GFX10-NEXT: v_sub_nc_u32_e32 v3, 23, v2 -; GFX10-NEXT: v_and_b32_e32 v2, 0xffffff, v2 ; GFX10-NEXT: v_and_b32_e32 v3, 0xffffff, v3 ; GFX10-NEXT: v_lshrrev_b32_e32 v1, v3, v1 ; GFX10-NEXT: v_lshl_or_b32 v0, v0, v2, v1 @@ -1806,9 +1800,8 @@ ; GFX11-NEXT: v_subrev_nc_u32_e32 v3, 24, v2 ; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v2 ; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_sub_nc_u32_e32 v3, 23, v2 -; GFX11-NEXT: v_and_b32_e32 v2, 0xffffff, v2 ; GFX11-NEXT: v_and_b32_e32 v3, 0xffffff, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshrrev_b32_e32 v1, v3, v1 @@ -1821,48 +1814,47 @@ define amdgpu_ps i48 @s_fshl_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 inreg %amt.arg) { ; GFX6-LABEL: s_fshl_v2i24: ; GFX6: ; %bb.0: -; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v0, 24 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX6-NEXT: s_lshr_b32 s6, s0, 16 ; GFX6-NEXT: s_lshr_b32 s7, s0, 24 ; GFX6-NEXT: s_and_b32 s9, s0, 0xff ; GFX6-NEXT: s_bfe_u32 s0, s0, 0x80008 +; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v0, 24 ; GFX6-NEXT: s_lshl_b32 s0, s0, 8 ; GFX6-NEXT: s_and_b32 s6, s6, 0xff +; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX6-NEXT: s_or_b32 s0, s9, s0 ; GFX6-NEXT: s_and_b32 s6, 0xffff, s6 -; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: s_lshr_b32 s8, s1, 8 ; GFX6-NEXT: s_and_b32 s0, 0xffff, s0 ; GFX6-NEXT: s_lshl_b32 s6, s6, 16 ; GFX6-NEXT: s_and_b32 s1, s1, 0xff -; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX6-NEXT: s_or_b32 s0, s0, s6 ; GFX6-NEXT: s_lshl_b32 s1, s1, 8 ; GFX6-NEXT: s_and_b32 s6, s8, 0xff ; GFX6-NEXT: s_or_b32 s1, s7, s1 ; GFX6-NEXT: s_and_b32 s6, 0xffff, s6 +; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX6-NEXT: s_lshl_b32 s6, s6, 16 -; GFX6-NEXT: v_mov_b32_e32 v1, 0xffffffe8 +; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX6-NEXT: s_or_b32 s1, s1, s6 ; GFX6-NEXT: s_lshr_b32 s6, s2, 16 ; GFX6-NEXT: s_lshr_b32 s7, s2, 24 ; GFX6-NEXT: s_and_b32 s9, s2, 0xff ; GFX6-NEXT: s_bfe_u32 s2, s2, 0x80008 -; GFX6-NEXT: v_mul_lo_u32 v2, v0, v1 ; GFX6-NEXT: s_lshl_b32 s2, s2, 8 ; GFX6-NEXT: s_and_b32 s6, s6, 0xff ; GFX6-NEXT: s_or_b32 s2, s9, s2 ; GFX6-NEXT: s_and_b32 s6, 0xffff, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, 0xffffffe8 ; GFX6-NEXT: s_lshr_b32 s8, s3, 8 ; GFX6-NEXT: s_and_b32 s2, 0xffff, s2 ; GFX6-NEXT: s_lshl_b32 s6, s6, 16 ; GFX6-NEXT: s_and_b32 s3, s3, 0xff +; GFX6-NEXT: v_mul_lo_u32 v1, v0, v1 ; GFX6-NEXT: s_or_b32 s2, s2, s6 ; GFX6-NEXT: s_lshl_b32 s3, s3, 8 ; GFX6-NEXT: s_and_b32 s6, s8, 0xff -; GFX6-NEXT: v_mul_hi_u32 v2, v0, v2 ; GFX6-NEXT: s_or_b32 s3, s7, s3 ; GFX6-NEXT: s_and_b32 s6, 0xffff, s6 ; GFX6-NEXT: s_and_b32 s3, 0xffff, s3 @@ -1872,78 +1864,72 @@ ; GFX6-NEXT: s_lshr_b32 s7, s4, 24 ; GFX6-NEXT: s_and_b32 s9, s4, 0xff ; GFX6-NEXT: s_bfe_u32 s4, s4, 0x80008 +; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX6-NEXT: s_lshl_b32 s4, s4, 8 ; GFX6-NEXT: s_and_b32 s6, s6, 0xff -; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, 
v2 -; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v2, 24 ; GFX6-NEXT: s_or_b32 s4, s9, s4 ; GFX6-NEXT: s_and_b32 s6, 0xffff, s6 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; GFX6-NEXT: s_and_b32 s4, 0xffff, s4 ; GFX6-NEXT: s_lshl_b32 s6, s6, 16 ; GFX6-NEXT: s_or_b32 s4, s4, s6 -; GFX6-NEXT: v_mul_hi_u32 v0, s4, v0 -; GFX6-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 -; GFX6-NEXT: v_cvt_u32_f32_e32 v2, v2 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; GFX6-NEXT: v_mul_hi_u32 v1, s4, v0 ; GFX6-NEXT: s_lshr_b32 s8, s5, 8 -; GFX6-NEXT: v_mul_lo_u32 v0, v0, 24 ; GFX6-NEXT: s_and_b32 s5, s5, 0xff -; GFX6-NEXT: v_mul_lo_u32 v1, v2, v1 ; GFX6-NEXT: s_lshl_b32 s5, s5, 8 -; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 -; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, 24, v0 -; GFX6-NEXT: v_cmp_le_u32_e32 vcc, 24, v0 -; GFX6-NEXT: v_mul_hi_u32 v1, v2, v1 +; GFX6-NEXT: v_mul_lo_u32 v1, v1, 24 ; GFX6-NEXT: s_and_b32 s6, s8, 0xff -; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; GFX6-NEXT: s_or_b32 s5, s7, s5 ; GFX6-NEXT: s_and_b32 s6, 0xffff, s6 -; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, 24, v0 ; GFX6-NEXT: s_and_b32 s5, 0xffff, s5 ; GFX6-NEXT: s_lshl_b32 s6, s6, 16 -; GFX6-NEXT: v_cmp_le_u32_e32 vcc, 24, v0 ; GFX6-NEXT: s_or_b32 s5, s5, s6 -; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX6-NEXT: v_add_i32_e32 v1, vcc, v2, v1 -; GFX6-NEXT: v_mul_hi_u32 v1, s5, v1 -; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 23, v0 -; GFX6-NEXT: v_and_b32_e32 v0, 0xffffff, v0 -; GFX6-NEXT: v_mul_lo_u32 v1, v1, 24 -; GFX6-NEXT: v_lshl_b32_e32 v0, s0, v0 -; GFX6-NEXT: s_lshr_b32 s0, s2, 1 -; GFX6-NEXT: v_and_b32_e32 v2, 0xffffff, v3 -; GFX6-NEXT: v_lshr_b32_e32 v2, s0, v2 -; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s5, v1 -; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s4, v1 ; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, 24, v1 +; GFX6-NEXT: v_mul_hi_u32 v0, s5, v0 ; GFX6-NEXT: v_cmp_le_u32_e32 vcc, 24, v1 ; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc ; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, 24, v1 ; GFX6-NEXT: v_cmp_le_u32_e32 vcc, 24, v1 +; GFX6-NEXT: v_mul_lo_u32 v0, v0, 24 ; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, 23, v1 ; GFX6-NEXT: v_and_b32_e32 v1, 0xffffff, v1 -; GFX6-NEXT: s_lshr_b32 s0, s3, 1 +; GFX6-NEXT: v_lshl_b32_e32 v1, s0, v1 +; GFX6-NEXT: s_lshr_b32 s0, s2, 1 ; GFX6-NEXT: v_and_b32_e32 v2, 0xffffff, v2 -; GFX6-NEXT: v_lshl_b32_e32 v1, s1, v1 ; GFX6-NEXT: v_lshr_b32_e32 v2, s0, v2 -; GFX6-NEXT: v_bfe_u32 v3, v0, 8, 8 +; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s5, v0 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX6-NEXT: v_and_b32_e32 v2, 0xff, v0 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GFX6-NEXT: v_bfe_u32 v0, v0, 16, 8 -; GFX6-NEXT: v_or_b32_e32 v2, v2, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: v_or_b32_e32 v0, v2, v0 -; GFX6-NEXT: v_and_b32_e32 v2, 0xff, v1 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 24, v2 +; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, 24, v0 +; GFX6-NEXT: v_cmp_le_u32_e32 vcc, 24, v0 +; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, 24, v0 +; GFX6-NEXT: v_cmp_le_u32_e32 vcc, 24, v0 +; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX6-NEXT: v_sub_i32_e32 v2, vcc, 23, v0 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffffff, v0 +; GFX6-NEXT: s_lshr_b32 s0, s3, 1 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffffff, v2 +; GFX6-NEXT: v_lshl_b32_e32 v0, s1, v0 +; GFX6-NEXT: v_lshr_b32_e32 v2, s0, v2 +; GFX6-NEXT: v_bfe_u32 v3, v1, 8, 8 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX6-NEXT: v_bfe_u32 v2, v1, 8, 8 +; GFX6-NEXT: v_and_b32_e32 v2, 0xff, v1 +; GFX6-NEXT: 
v_lshlrev_b32_e32 v3, 8, v3 ; GFX6-NEXT: v_bfe_u32 v1, v1, 16, 8 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; GFX6-NEXT: v_or_b32_e32 v2, v2, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_or_b32_e32 v1, v2, v1 -; GFX6-NEXT: v_readfirstlane_b32 s0, v0 -; GFX6-NEXT: v_readfirstlane_b32 s1, v1 +; GFX6-NEXT: v_and_b32_e32 v2, 0xff, v0 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 24, v2 +; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX6-NEXT: v_bfe_u32 v2, v0, 8, 8 +; GFX6-NEXT: v_bfe_u32 v0, v0, 16, 8 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; GFX6-NEXT: v_or_b32_e32 v0, v2, v0 +; GFX6-NEXT: v_readfirstlane_b32 s0, v1 +; GFX6-NEXT: v_readfirstlane_b32 s1, v0 ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_fshl_v2i24: @@ -1956,9 +1942,7 @@ ; GFX8-NEXT: s_lshl_b32 s6, s6, 8 ; GFX8-NEXT: s_or_b32 s0, s0, s6 ; GFX8-NEXT: s_and_b32 s6, s7, 0xff -; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v0, 24 ; GFX8-NEXT: s_and_b32 s6, 0xffff, s6 -; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX8-NEXT: s_lshr_b32 s9, s1, 8 ; GFX8-NEXT: s_and_b32 s0, 0xffff, s0 ; GFX8-NEXT: s_lshl_b32 s6, s6, 16 @@ -1966,24 +1950,24 @@ ; GFX8-NEXT: s_or_b32 s0, s0, s6 ; GFX8-NEXT: s_lshl_b32 s1, s1, 8 ; GFX8-NEXT: s_and_b32 s6, s9, 0xff +; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v0, 24 ; GFX8-NEXT: s_or_b32 s1, s8, s1 ; GFX8-NEXT: s_and_b32 s6, 0xffff, s6 +; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX8-NEXT: s_lshl_b32 s6, s6, 16 -; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX8-NEXT: s_or_b32 s1, s1, s6 ; GFX8-NEXT: s_lshr_b32 s6, s2, 8 -; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX8-NEXT: s_and_b32 s6, s6, 0xff ; GFX8-NEXT: s_lshr_b32 s7, s2, 16 ; GFX8-NEXT: s_lshr_b32 s8, s2, 24 ; GFX8-NEXT: s_and_b32 s2, s2, 0xff ; GFX8-NEXT: s_lshl_b32 s6, s6, 8 +; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX8-NEXT: s_or_b32 s2, s2, s6 ; GFX8-NEXT: s_and_b32 s6, s7, 0xff -; GFX8-NEXT: v_mov_b32_e32 v1, 0xffffffe8 +; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX8-NEXT: s_and_b32 s6, 0xffff, s6 -; GFX8-NEXT: v_mul_lo_u32 v2, v0, v1 ; GFX8-NEXT: s_lshr_b32 s9, s3, 8 ; GFX8-NEXT: s_and_b32 s2, 0xffff, s2 ; GFX8-NEXT: s_lshl_b32 s6, s6, 16 @@ -1991,11 +1975,12 @@ ; GFX8-NEXT: s_or_b32 s2, s2, s6 ; GFX8-NEXT: s_lshl_b32 s3, s3, 8 ; GFX8-NEXT: s_and_b32 s6, s9, 0xff +; GFX8-NEXT: v_mov_b32_e32 v1, 0xffffffe8 ; GFX8-NEXT: s_or_b32 s3, s8, s3 ; GFX8-NEXT: s_and_b32 s6, 0xffff, s6 +; GFX8-NEXT: v_mul_lo_u32 v1, v0, v1 ; GFX8-NEXT: s_and_b32 s3, 0xffff, s3 ; GFX8-NEXT: s_lshl_b32 s6, s6, 16 -; GFX8-NEXT: v_mul_hi_u32 v2, v0, v2 ; GFX8-NEXT: s_or_b32 s3, s3, s6 ; GFX8-NEXT: s_lshr_b32 s6, s4, 8 ; GFX8-NEXT: s_and_b32 s6, s6, 0xff @@ -2003,457 +1988,425 @@ ; GFX8-NEXT: s_lshr_b32 s8, s4, 24 ; GFX8-NEXT: s_and_b32 s4, s4, 0xff ; GFX8-NEXT: s_lshl_b32 s6, s6, 8 +; GFX8-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX8-NEXT: s_or_b32 s4, s4, s6 ; GFX8-NEXT: s_and_b32 s6, s7, 0xff -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 -; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v2, 24 ; GFX8-NEXT: s_and_b32 s6, 0xffff, s6 -; GFX8-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; GFX8-NEXT: s_and_b32 s4, 0xffff, s4 ; GFX8-NEXT: s_lshl_b32 s6, s6, 16 ; GFX8-NEXT: s_or_b32 s4, s4, s6 -; GFX8-NEXT: v_mul_hi_u32 v0, s4, v0 -; GFX8-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 -; GFX8-NEXT: v_cvt_u32_f32_e32 v2, v2 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 +; GFX8-NEXT: v_mul_hi_u32 v1, s4, v0 ; GFX8-NEXT: s_lshr_b32 s9, s5, 8 -; GFX8-NEXT: v_mul_lo_u32 v0, v0, 24 ; GFX8-NEXT: s_and_b32 s5, s5, 0xff -; GFX8-NEXT: v_mul_lo_u32 v1, v2, v1 ; GFX8-NEXT: s_lshl_b32 s5, s5, 8 -; 
GFX8-NEXT: v_sub_u32_e32 v0, vcc, s4, v0 -; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, 24, v0 -; GFX8-NEXT: v_cmp_le_u32_e32 vcc, 24, v0 -; GFX8-NEXT: v_mul_hi_u32 v1, v2, v1 +; GFX8-NEXT: v_mul_lo_u32 v1, v1, 24 ; GFX8-NEXT: s_and_b32 s6, s9, 0xff -; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; GFX8-NEXT: s_or_b32 s5, s8, s5 ; GFX8-NEXT: s_and_b32 s6, 0xffff, s6 -; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, 24, v0 ; GFX8-NEXT: s_and_b32 s5, 0xffff, s5 ; GFX8-NEXT: s_lshl_b32 s6, s6, 16 -; GFX8-NEXT: v_cmp_le_u32_e32 vcc, 24, v0 ; GFX8-NEXT: s_or_b32 s5, s5, s6 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX8-NEXT: v_add_u32_e32 v1, vcc, v2, v1 -; GFX8-NEXT: v_mul_hi_u32 v1, s5, v1 -; GFX8-NEXT: v_sub_u32_e32 v3, vcc, 23, v0 -; GFX8-NEXT: v_and_b32_e32 v0, 0xffffff, v0 -; GFX8-NEXT: v_mul_lo_u32 v1, v1, 24 -; GFX8-NEXT: v_lshlrev_b32_e64 v0, v0, s0 -; GFX8-NEXT: s_lshr_b32 s0, s2, 1 -; GFX8-NEXT: v_and_b32_e32 v2, 0xffffff, v3 -; GFX8-NEXT: v_lshrrev_b32_e64 v2, v2, s0 -; GFX8-NEXT: v_sub_u32_e32 v1, vcc, s5, v1 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX8-NEXT: v_sub_u32_e32 v1, vcc, s4, v1 ; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, 24, v1 +; GFX8-NEXT: v_mul_hi_u32 v0, s5, v0 ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, 24, v1 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc ; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, 24, v1 ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, 24, v1 +; GFX8-NEXT: v_mul_lo_u32 v0, v0, 24 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc ; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 23, v1 ; GFX8-NEXT: v_and_b32_e32 v1, 0xffffff, v1 -; GFX8-NEXT: s_lshr_b32 s0, s3, 1 +; GFX8-NEXT: v_lshlrev_b32_e64 v1, v1, s0 +; GFX8-NEXT: s_lshr_b32 s0, s2, 1 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffffff, v2 -; GFX8-NEXT: v_lshlrev_b32_e64 v1, v1, s1 ; GFX8-NEXT: v_lshrrev_b32_e64 v2, v2, s0 +; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s5, v0 ; GFX8-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, 24, v0 +; GFX8-NEXT: v_cmp_le_u32_e32 vcc, 24, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, 24, v0 +; GFX8-NEXT: v_cmp_le_u32_e32 vcc, 24, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 23, v0 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffffff, v0 +; GFX8-NEXT: s_lshr_b32 s0, s3, 1 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffffff, v2 +; GFX8-NEXT: v_lshlrev_b32_e64 v0, v0, s1 +; GFX8-NEXT: v_lshrrev_b32_e64 v2, v2, s0 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX8-NEXT: v_mov_b32_e32 v2, 8 -; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX8-NEXT: v_mov_b32_e32 v4, 16 -; GFX8-NEXT: v_or_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX8-NEXT: v_or_b32_e32 v0, v3, v0 -; GFX8-NEXT: v_and_b32_e32 v3, 0xff, v1 +; GFX8-NEXT: v_or_b32_sdwa v3, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX8-NEXT: v_or_b32_e32 v1, v3, v1 +; GFX8-NEXT: v_and_b32_e32 v3, 0xff, v0 ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v3 -; GFX8-NEXT: v_or_b32_sdwa v1, v1, v2 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD -; GFX8-NEXT: v_readfirstlane_b32 s0, v0 -; GFX8-NEXT: v_readfirstlane_b32 s1, v1 +; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v3 +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX8-NEXT: v_readfirstlane_b32 s0, v1 +; GFX8-NEXT: v_readfirstlane_b32 s1, v0 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_fshl_v2i24: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, 24 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX9-NEXT: s_lshr_b32 s7, s0, 8 -; GFX9-NEXT: s_and_b32 s7, s7, 0xff -; GFX9-NEXT: s_lshr_b32 s9, s0, 16 -; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 -; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX9-NEXT: s_lshr_b32 s10, s0, 24 +; GFX9-NEXT: s_lshr_b32 s6, s0, 8 +; GFX9-NEXT: s_and_b32 s6, s6, 0xff +; GFX9-NEXT: s_lshr_b32 s7, s0, 16 +; GFX9-NEXT: s_lshr_b32 s8, s0, 24 ; GFX9-NEXT: s_and_b32 s0, s0, 0xff -; GFX9-NEXT: s_lshl_b32 s7, s7, 8 -; GFX9-NEXT: s_or_b32 s0, s0, s7 -; GFX9-NEXT: s_and_b32 s7, s9, 0xff -; GFX9-NEXT: s_and_b32 s7, 0xffff, s7 -; GFX9-NEXT: v_mov_b32_e32 v1, 0xffffffe8 -; GFX9-NEXT: s_lshr_b32 s11, s1, 8 +; GFX9-NEXT: s_lshl_b32 s6, s6, 8 +; GFX9-NEXT: s_or_b32 s0, s0, s6 +; GFX9-NEXT: s_and_b32 s6, s7, 0xff +; GFX9-NEXT: s_and_b32 s6, 0xffff, s6 +; GFX9-NEXT: s_lshr_b32 s9, s1, 8 ; GFX9-NEXT: s_and_b32 s0, 0xffff, s0 -; GFX9-NEXT: s_lshl_b32 s7, s7, 16 +; GFX9-NEXT: s_lshl_b32 s6, s6, 16 ; GFX9-NEXT: s_and_b32 s1, s1, 0xff -; GFX9-NEXT: v_mul_lo_u32 v2, v0, v1 -; GFX9-NEXT: s_or_b32 s0, s0, s7 +; GFX9-NEXT: s_or_b32 s0, s0, s6 ; GFX9-NEXT: s_lshl_b32 s1, s1, 8 -; GFX9-NEXT: s_and_b32 s7, s11, 0xff -; GFX9-NEXT: s_or_b32 s1, s10, s1 -; GFX9-NEXT: s_and_b32 s7, 0xffff, s7 +; GFX9-NEXT: s_and_b32 s6, s9, 0xff +; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, 24 +; GFX9-NEXT: s_or_b32 s1, s8, s1 +; GFX9-NEXT: s_and_b32 s6, 0xffff, s6 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX9-NEXT: s_and_b32 s1, 0xffff, s1 -; GFX9-NEXT: s_lshl_b32 s7, s7, 16 -; GFX9-NEXT: s_or_b32 s1, s1, s7 -; GFX9-NEXT: s_lshr_b32 s7, s2, 8 -; GFX9-NEXT: v_mul_hi_u32 v2, v0, v2 -; GFX9-NEXT: s_and_b32 s7, s7, 0xff -; GFX9-NEXT: s_lshr_b32 s9, s2, 16 -; GFX9-NEXT: s_lshr_b32 s10, s2, 24 +; GFX9-NEXT: s_lshl_b32 s6, s6, 16 +; GFX9-NEXT: s_or_b32 s1, s1, s6 +; GFX9-NEXT: s_lshr_b32 s6, s2, 8 +; GFX9-NEXT: s_and_b32 s6, s6, 0xff +; GFX9-NEXT: s_lshr_b32 s7, s2, 16 +; GFX9-NEXT: s_lshr_b32 s8, s2, 24 ; GFX9-NEXT: s_and_b32 s2, s2, 0xff -; GFX9-NEXT: s_lshl_b32 s7, s7, 8 -; GFX9-NEXT: s_or_b32 s2, s2, s7 -; GFX9-NEXT: s_and_b32 s7, s9, 0xff -; GFX9-NEXT: s_and_b32 s7, 0xffff, s7 -; GFX9-NEXT: v_add_u32_e32 v0, v0, v2 -; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v2, 24 -; GFX9-NEXT: s_lshr_b32 s11, s3, 8 +; GFX9-NEXT: s_lshl_b32 s6, s6, 8 +; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 +; GFX9-NEXT: s_or_b32 s2, s2, s6 +; GFX9-NEXT: s_and_b32 s6, s7, 0xff +; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GFX9-NEXT: s_and_b32 s6, 0xffff, s6 +; GFX9-NEXT: s_lshr_b32 s9, s3, 8 ; GFX9-NEXT: s_and_b32 s2, 0xffff, s2 -; GFX9-NEXT: s_lshl_b32 s7, s7, 16 +; GFX9-NEXT: s_lshl_b32 s6, s6, 16 ; GFX9-NEXT: s_and_b32 s3, s3, 0xff -; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v2 -; GFX9-NEXT: s_or_b32 s2, s2, s7 +; GFX9-NEXT: s_or_b32 s2, s2, s6 ; GFX9-NEXT: s_lshl_b32 s3, s3, 8 -; GFX9-NEXT: s_and_b32 s7, s11, 0xff -; GFX9-NEXT: s_or_b32 s3, s10, s3 -; GFX9-NEXT: s_and_b32 s7, 0xffff, s7 +; GFX9-NEXT: 
s_and_b32 s6, s9, 0xff +; GFX9-NEXT: v_mov_b32_e32 v1, 0xffffffe8 +; GFX9-NEXT: s_or_b32 s3, s8, s3 +; GFX9-NEXT: s_and_b32 s6, 0xffff, s6 +; GFX9-NEXT: v_mul_lo_u32 v1, v0, v1 ; GFX9-NEXT: s_and_b32 s3, 0xffff, s3 -; GFX9-NEXT: s_lshl_b32 s7, s7, 16 -; GFX9-NEXT: s_or_b32 s3, s3, s7 -; GFX9-NEXT: s_lshr_b32 s7, s4, 8 -; GFX9-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 -; GFX9-NEXT: s_and_b32 s7, s7, 0xff -; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2 -; GFX9-NEXT: s_lshr_b32 s9, s4, 16 -; GFX9-NEXT: s_lshr_b32 s10, s4, 24 +; GFX9-NEXT: s_lshl_b32 s6, s6, 16 +; GFX9-NEXT: s_or_b32 s3, s3, s6 +; GFX9-NEXT: s_lshr_b32 s6, s4, 8 +; GFX9-NEXT: s_and_b32 s6, s6, 0xff +; GFX9-NEXT: s_lshr_b32 s7, s4, 16 +; GFX9-NEXT: s_lshr_b32 s8, s4, 24 ; GFX9-NEXT: s_and_b32 s4, s4, 0xff -; GFX9-NEXT: s_lshl_b32 s7, s7, 8 -; GFX9-NEXT: s_or_b32 s4, s4, s7 -; GFX9-NEXT: s_and_b32 s7, s9, 0xff -; GFX9-NEXT: s_and_b32 s7, 0xffff, s7 +; GFX9-NEXT: s_lshl_b32 s6, s6, 8 +; GFX9-NEXT: v_mul_hi_u32 v1, v0, v1 +; GFX9-NEXT: s_or_b32 s4, s4, s6 +; GFX9-NEXT: s_and_b32 s6, s7, 0xff +; GFX9-NEXT: s_and_b32 s6, 0xffff, s6 ; GFX9-NEXT: s_and_b32 s4, 0xffff, s4 -; GFX9-NEXT: s_lshl_b32 s7, s7, 16 -; GFX9-NEXT: v_mul_lo_u32 v1, v2, v1 -; GFX9-NEXT: s_or_b32 s4, s4, s7 -; GFX9-NEXT: v_mul_hi_u32 v0, s4, v0 -; GFX9-NEXT: s_lshr_b32 s11, s5, 8 +; GFX9-NEXT: s_lshl_b32 s6, s6, 16 +; GFX9-NEXT: s_or_b32 s4, s4, s6 +; GFX9-NEXT: v_add_u32_e32 v0, v0, v1 +; GFX9-NEXT: v_mul_hi_u32 v1, s4, v0 +; GFX9-NEXT: s_lshr_b32 s9, s5, 8 ; GFX9-NEXT: s_and_b32 s5, s5, 0xff -; GFX9-NEXT: v_mul_hi_u32 v1, v2, v1 ; GFX9-NEXT: s_lshl_b32 s5, s5, 8 -; GFX9-NEXT: s_and_b32 s7, s11, 0xff -; GFX9-NEXT: s_or_b32 s5, s10, s5 -; GFX9-NEXT: s_and_b32 s7, 0xffff, s7 -; GFX9-NEXT: v_mul_lo_u32 v0, v0, 24 -; GFX9-NEXT: s_and_b32 s5, 0xffff, s5 -; GFX9-NEXT: s_lshl_b32 s7, s7, 16 -; GFX9-NEXT: s_or_b32 s5, s5, s7 -; GFX9-NEXT: v_add_u32_e32 v1, v2, v1 -; GFX9-NEXT: v_mul_hi_u32 v1, s5, v1 -; GFX9-NEXT: v_sub_u32_e32 v0, s4, v0 -; GFX9-NEXT: v_subrev_u32_e32 v3, 24, v0 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, 24, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX9-NEXT: v_subrev_u32_e32 v3, 24, v0 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, 24, v0 +; GFX9-NEXT: s_and_b32 s6, s9, 0xff +; GFX9-NEXT: s_or_b32 s5, s8, s5 ; GFX9-NEXT: v_mul_lo_u32 v1, v1, 24 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX9-NEXT: v_sub_u32_e32 v2, 23, v0 -; GFX9-NEXT: s_lshr_b32 s2, s2, 1 -; GFX9-NEXT: v_and_b32_e32 v2, 0xffffff, v2 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffffff, v0 -; GFX9-NEXT: v_lshrrev_b32_e64 v2, v2, s2 -; GFX9-NEXT: v_sub_u32_e32 v1, s5, v1 -; GFX9-NEXT: v_lshl_or_b32 v0, s0, v0, v2 +; GFX9-NEXT: s_and_b32 s6, 0xffff, s6 +; GFX9-NEXT: s_and_b32 s5, 0xffff, s5 +; GFX9-NEXT: s_lshl_b32 s6, s6, 16 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: v_mul_hi_u32 v0, s5, v0 +; GFX9-NEXT: v_sub_u32_e32 v1, s4, v1 ; GFX9-NEXT: v_subrev_u32_e32 v2, 24, v1 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, 24, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc ; GFX9-NEXT: v_subrev_u32_e32 v2, 24, v1 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, 24, v1 +; GFX9-NEXT: v_mul_lo_u32 v0, v0, 24 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc ; GFX9-NEXT: v_sub_u32_e32 v2, 23, v1 +; GFX9-NEXT: s_lshr_b32 s2, s2, 1 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffffff, v2 +; GFX9-NEXT: v_lshrrev_b32_e64 v2, v2, s2 +; GFX9-NEXT: v_sub_u32_e32 v0, s5, v0 +; GFX9-NEXT: v_lshl_or_b32 v1, s0, v1, v2 +; GFX9-NEXT: v_subrev_u32_e32 v2, 24, v0 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, 24, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX9-NEXT: 
v_subrev_u32_e32 v2, 24, v0 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, 24, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX9-NEXT: v_sub_u32_e32 v2, 23, v0 ; GFX9-NEXT: s_lshr_b32 s0, s3, 1 ; GFX9-NEXT: v_and_b32_e32 v2, 0xffffff, v2 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffffff, v1 ; GFX9-NEXT: v_lshrrev_b32_e64 v2, v2, s0 -; GFX9-NEXT: s_mov_b32 s6, 8 -; GFX9-NEXT: v_lshl_or_b32 v1, s1, v1, v2 -; GFX9-NEXT: s_mov_b32 s8, 16 -; GFX9-NEXT: s_movk_i32 s0, 0xff -; GFX9-NEXT: v_lshlrev_b32_sdwa v2, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_and_b32_e32 v3, 0xff, v1 -; GFX9-NEXT: v_and_or_b32 v2, v0, s0, v2 -; GFX9-NEXT: v_lshlrev_b32_sdwa v0, s8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_mov_b32_e32 v3, 8 +; GFX9-NEXT: v_lshl_or_b32 v0, s1, v0, v2 +; GFX9-NEXT: v_mov_b32_e32 v2, 0xff +; GFX9-NEXT: v_lshlrev_b32_sdwa v3, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_and_or_b32 v2, v1, v2, v3 +; GFX9-NEXT: v_mov_b32_e32 v3, 16 +; GFX9-NEXT: v_lshlrev_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_and_b32_e32 v3, 0xff, v0 ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GFX9-NEXT: v_or3_b32 v0, v2, v0, v3 -; GFX9-NEXT: v_bfe_u32 v2, v1, 8, 8 -; GFX9-NEXT: v_bfe_u32 v1, v1, 16, 8 -; GFX9-NEXT: v_lshl_or_b32 v1, v1, 8, v2 -; GFX9-NEXT: v_readfirstlane_b32 s0, v0 -; GFX9-NEXT: v_readfirstlane_b32 s1, v1 +; GFX9-NEXT: v_or3_b32 v1, v2, v1, v3 +; GFX9-NEXT: v_bfe_u32 v2, v0, 8, 8 +; GFX9-NEXT: v_bfe_u32 v0, v0, 16, 8 +; GFX9-NEXT: v_lshl_or_b32 v0, v0, 8, v2 +; GFX9-NEXT: v_readfirstlane_b32 s0, v1 +; GFX9-NEXT: v_readfirstlane_b32 s1, v0 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_fshl_v2i24: ; GFX10: ; %bb.0: ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, 24 -; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v1, 24 -; GFX10-NEXT: s_lshr_b32 s6, s0, 8 -; GFX10-NEXT: s_lshr_b32 s7, s0, 16 -; GFX10-NEXT: s_and_b32 s6, s6, 0xff +; GFX10-NEXT: s_lshr_b32 s14, s4, 8 +; GFX10-NEXT: s_lshr_b32 s15, s4, 16 +; GFX10-NEXT: s_and_b32 s14, s14, 0xff +; GFX10-NEXT: s_lshr_b32 s16, s4, 24 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX10-NEXT: v_rcp_iflag_f32_e32 v1, v1 -; GFX10-NEXT: s_lshr_b32 s8, s0, 24 -; GFX10-NEXT: s_and_b32 s0, s0, 0xff -; GFX10-NEXT: s_lshl_b32 s6, s6, 8 -; GFX10-NEXT: s_and_b32 s7, s7, 0xff -; GFX10-NEXT: s_or_b32 s0, s0, s6 -; GFX10-NEXT: s_and_b32 s6, 0xffff, s7 -; GFX10-NEXT: s_lshr_b32 s7, s4, 8 -; GFX10-NEXT: s_lshr_b32 s10, s4, 16 -; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 -; GFX10-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 -; GFX10-NEXT: s_and_b32 s7, s7, 0xff -; GFX10-NEXT: s_lshr_b32 s11, s4, 24 ; GFX10-NEXT: s_and_b32 s4, s4, 0xff -; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX10-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX10-NEXT: s_lshl_b32 s7, s7, 8 -; GFX10-NEXT: s_lshr_b32 s12, s5, 8 -; GFX10-NEXT: s_or_b32 s4, s4, s7 -; GFX10-NEXT: v_mul_lo_u32 v2, 0xffffffe8, v0 -; GFX10-NEXT: v_mul_lo_u32 v3, 0xffffffe8, v1 -; GFX10-NEXT: s_and_b32 s7, s10, 0xff -; GFX10-NEXT: s_and_b32 s4, 0xffff, s4 -; GFX10-NEXT: s_and_b32 s7, 0xffff, s7 +; GFX10-NEXT: s_and_b32 s15, s15, 0xff +; GFX10-NEXT: s_lshl_b32 s14, s14, 8 +; GFX10-NEXT: s_and_b32 s15, 0xffff, s15 +; GFX10-NEXT: s_or_b32 s4, s4, s14 +; GFX10-NEXT: s_lshr_b32 s17, s5, 8 ; GFX10-NEXT: s_and_b32 s5, s5, 0xff -; GFX10-NEXT: s_lshl_b32 s7, s7, 16 +; GFX10-NEXT: s_lshl_b32 s14, s15, 16 +; GFX10-NEXT: s_and_b32 s4, 0xffff, s4 +; GFX10-NEXT: v_mul_f32_e32 v0, 
0x4f7ffffe, v0 ; GFX10-NEXT: s_lshl_b32 s5, s5, 8 -; GFX10-NEXT: v_mul_hi_u32 v2, v0, v2 -; GFX10-NEXT: v_mul_hi_u32 v3, v1, v3 -; GFX10-NEXT: s_or_b32 s4, s4, s7 -; GFX10-NEXT: s_and_b32 s7, s12, 0xff -; GFX10-NEXT: s_or_b32 s5, s11, s5 -; GFX10-NEXT: s_and_b32 s7, 0xffff, s7 +; GFX10-NEXT: s_and_b32 s17, s17, 0xff +; GFX10-NEXT: s_or_b32 s4, s4, s14 +; GFX10-NEXT: s_or_b32 s5, s16, s5 +; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GFX10-NEXT: s_and_b32 s16, 0xffff, s17 ; GFX10-NEXT: s_and_b32 s5, 0xffff, s5 -; GFX10-NEXT: s_lshl_b32 s7, s7, 16 -; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v2 -; GFX10-NEXT: v_add_nc_u32_e32 v1, v1, v3 -; GFX10-NEXT: s_or_b32 s5, s5, s7 +; GFX10-NEXT: s_lshl_b32 s15, s16, 16 +; GFX10-NEXT: s_lshr_b32 s10, s2, 8 +; GFX10-NEXT: v_mul_lo_u32 v1, 0xffffffe8, v0 +; GFX10-NEXT: s_or_b32 s5, s5, s15 ; GFX10-NEXT: s_lshr_b32 s9, s1, 8 ; GFX10-NEXT: s_and_b32 s1, s1, 0xff -; GFX10-NEXT: v_mul_hi_u32 v0, s4, v0 -; GFX10-NEXT: v_mul_hi_u32 v1, s5, v1 +; GFX10-NEXT: s_lshr_b32 s11, s2, 16 +; GFX10-NEXT: s_lshr_b32 s13, s3, 8 +; GFX10-NEXT: s_and_b32 s3, s3, 0xff +; GFX10-NEXT: s_and_b32 s10, s10, 0xff +; GFX10-NEXT: v_mul_hi_u32 v1, v0, v1 +; GFX10-NEXT: s_lshr_b32 s8, s0, 24 +; GFX10-NEXT: s_lshr_b32 s12, s2, 24 +; GFX10-NEXT: s_and_b32 s2, s2, 0xff ; GFX10-NEXT: s_lshl_b32 s1, s1, 8 -; GFX10-NEXT: s_and_b32 s7, s9, 0xff +; GFX10-NEXT: s_and_b32 s9, s9, 0xff +; GFX10-NEXT: s_and_b32 s11, s11, 0xff +; GFX10-NEXT: s_lshl_b32 s3, s3, 8 +; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v1 +; GFX10-NEXT: s_and_b32 s13, s13, 0xff ; GFX10-NEXT: s_or_b32 s1, s8, s1 -; GFX10-NEXT: s_lshr_b32 s8, s2, 8 -; GFX10-NEXT: s_lshr_b32 s9, s2, 16 -; GFX10-NEXT: s_and_b32 s8, s8, 0xff -; GFX10-NEXT: v_mul_lo_u32 v0, v0, 24 +; GFX10-NEXT: s_and_b32 s8, 0xffff, s9 +; GFX10-NEXT: s_or_b32 s3, s12, s3 +; GFX10-NEXT: v_mul_hi_u32 v1, s4, v0 +; GFX10-NEXT: v_mul_hi_u32 v0, s5, v0 +; GFX10-NEXT: s_and_b32 s9, 0xffff, s13 +; GFX10-NEXT: s_lshr_b32 s6, s0, 8 +; GFX10-NEXT: s_and_b32 s3, 0xffff, s3 +; GFX10-NEXT: s_lshr_b32 s7, s0, 16 +; GFX10-NEXT: s_and_b32 s6, s6, 0xff +; GFX10-NEXT: s_and_b32 s0, s0, 0xff ; GFX10-NEXT: v_mul_lo_u32 v1, v1, 24 -; GFX10-NEXT: s_lshr_b32 s10, s2, 24 -; GFX10-NEXT: s_and_b32 s2, s2, 0xff -; GFX10-NEXT: s_lshl_b32 s8, s8, 8 +; GFX10-NEXT: v_mul_lo_u32 v0, v0, 24 +; GFX10-NEXT: s_and_b32 s7, s7, 0xff +; GFX10-NEXT: s_lshl_b32 s6, s6, 8 ; GFX10-NEXT: s_and_b32 s7, 0xffff, s7 -; GFX10-NEXT: s_or_b32 s2, s2, s8 -; GFX10-NEXT: s_and_b32 s0, 0xffff, s0 -; GFX10-NEXT: v_sub_nc_u32_e32 v0, s4, v0 -; GFX10-NEXT: v_sub_nc_u32_e32 v1, s5, v1 -; GFX10-NEXT: s_lshr_b32 s4, s3, 8 -; GFX10-NEXT: s_and_b32 s5, s9, 0xff -; GFX10-NEXT: s_and_b32 s3, s3, 0xff -; GFX10-NEXT: v_subrev_nc_u32_e32 v2, 24, v0 -; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v0 -; GFX10-NEXT: v_subrev_nc_u32_e32 v3, 24, v1 -; GFX10-NEXT: s_and_b32 s5, 0xffff, s5 -; GFX10-NEXT: s_lshl_b32 s3, s3, 8 -; GFX10-NEXT: s_and_b32 s4, s4, 0xff -; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo +; GFX10-NEXT: s_or_b32 s0, s0, s6 +; GFX10-NEXT: s_lshl_b32 s6, s7, 16 +; GFX10-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX10-NEXT: v_sub_nc_u32_e32 v1, s4, v1 +; GFX10-NEXT: v_sub_nc_u32_e32 v0, s5, v0 +; GFX10-NEXT: s_lshl_b32 s4, s10, 8 +; GFX10-NEXT: s_and_b32 s5, 0xffff, s11 +; GFX10-NEXT: s_or_b32 s2, s2, s4 +; GFX10-NEXT: v_subrev_nc_u32_e32 v2, 24, v1 ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v1 +; GFX10-NEXT: v_subrev_nc_u32_e32 v3, 24, v0 +; GFX10-NEXT: s_lshl_b32 s4, s5, 16 +; GFX10-NEXT: s_lshl_b32 s5, s9, 16 ; GFX10-NEXT: s_and_b32 s2, 
0xffff, s2 -; GFX10-NEXT: s_lshl_b32 s5, s5, 16 -; GFX10-NEXT: s_or_b32 s3, s10, s3 -; GFX10-NEXT: v_subrev_nc_u32_e32 v2, 24, v0 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v0 -; GFX10-NEXT: s_and_b32 s4, 0xffff, s4 -; GFX10-NEXT: s_or_b32 s2, s2, s5 -; GFX10-NEXT: s_and_b32 s3, 0xffff, s3 -; GFX10-NEXT: v_subrev_nc_u32_e32 v3, 24, v1 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo +; GFX10-NEXT: s_or_b32 s3, s3, s5 +; GFX10-NEXT: s_or_b32 s2, s2, s4 +; GFX10-NEXT: s_lshr_b32 s3, s3, 1 +; GFX10-NEXT: v_subrev_nc_u32_e32 v2, 24, v1 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v1 -; GFX10-NEXT: s_lshl_b32 s4, s4, 16 ; GFX10-NEXT: s_lshr_b32 s2, s2, 1 -; GFX10-NEXT: s_or_b32 s3, s3, s4 -; GFX10-NEXT: v_sub_nc_u32_e32 v2, 23, v0 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo -; GFX10-NEXT: v_and_b32_e32 v0, 0xffffff, v0 -; GFX10-NEXT: s_lshl_b32 s6, s6, 16 -; GFX10-NEXT: s_and_b32 s1, 0xffff, s1 -; GFX10-NEXT: v_and_b32_e32 v2, 0xffffff, v2 -; GFX10-NEXT: v_sub_nc_u32_e32 v3, 23, v1 -; GFX10-NEXT: v_and_b32_e32 v1, 0xffffff, v1 -; GFX10-NEXT: s_lshl_b32 s7, s7, 16 +; GFX10-NEXT: s_lshl_b32 s7, s8, 16 +; GFX10-NEXT: s_and_b32 s0, 0xffff, s0 +; GFX10-NEXT: v_subrev_nc_u32_e32 v3, 24, v0 +; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v0 ; GFX10-NEXT: s_or_b32 s0, s0, s6 +; GFX10-NEXT: s_or_b32 s1, s1, s7 +; GFX10-NEXT: v_sub_nc_u32_e32 v2, 23, v1 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo +; GFX10-NEXT: v_and_b32_e32 v2, 0xffffff, v2 +; GFX10-NEXT: v_sub_nc_u32_e32 v3, 23, v0 ; GFX10-NEXT: v_lshrrev_b32_e64 v2, v2, s2 ; GFX10-NEXT: v_and_b32_e32 v3, 0xffffff, v3 -; GFX10-NEXT: s_lshr_b32 s2, s3, 1 -; GFX10-NEXT: s_or_b32 s1, s1, s7 -; GFX10-NEXT: v_lshl_or_b32 v0, s0, v0, v2 -; GFX10-NEXT: v_lshrrev_b32_e64 v3, v3, s2 -; GFX10-NEXT: s_mov_b32 s0, 8 -; GFX10-NEXT: v_lshlrev_b32_sdwa v2, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX10-NEXT: v_lshl_or_b32 v1, s1, v1, v3 -; GFX10-NEXT: s_mov_b32 s0, 16 -; GFX10-NEXT: v_and_or_b32 v2, v0, 0xff, v2 -; GFX10-NEXT: v_and_b32_e32 v3, 0xff, v1 -; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_bfe_u32 v4, v1, 8, 8 -; GFX10-NEXT: v_bfe_u32 v1, v1, 16, 8 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GFX10-NEXT: v_lshl_or_b32 v1, v1, 8, v4 -; GFX10-NEXT: v_or3_b32 v0, v2, v0, v3 -; GFX10-NEXT: v_readfirstlane_b32 s1, v1 -; GFX10-NEXT: v_readfirstlane_b32 s0, v0 +; GFX10-NEXT: v_lshl_or_b32 v1, s0, v1, v2 +; GFX10-NEXT: v_lshrrev_b32_e64 v3, v3, s3 +; GFX10-NEXT: v_mov_b32_e32 v2, 8 +; GFX10-NEXT: v_lshl_or_b32 v0, s1, v0, v3 +; GFX10-NEXT: v_mov_b32_e32 v3, 16 +; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX10-NEXT: v_and_b32_e32 v4, 0xff, v0 +; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX10-NEXT: v_bfe_u32 v5, v0, 8, 8 +; GFX10-NEXT: v_and_or_b32 v1, 0xff, v1, v2 +; GFX10-NEXT: v_bfe_u32 v0, v0, 16, 8 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 24, v4 +; GFX10-NEXT: v_lshl_or_b32 v0, v0, 8, v5 +; GFX10-NEXT: v_or3_b32 v1, v1, v3, v2 +; GFX10-NEXT: v_readfirstlane_b32 s1, v0 +; GFX10-NEXT: v_readfirstlane_b32 s0, v1 ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: s_fshl_v2i24: ; GFX11: 
; %bb.0: ; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v0, 24 -; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v1, 24 -; GFX11-NEXT: s_lshr_b32 s6, s0, 8 -; GFX11-NEXT: s_lshr_b32 s7, s0, 16 -; GFX11-NEXT: s_and_b32 s6, s6, 0xff +; GFX11-NEXT: s_lshr_b32 s14, s4, 8 +; GFX11-NEXT: s_lshr_b32 s15, s4, 16 +; GFX11-NEXT: s_and_b32 s14, s14, 0xff +; GFX11-NEXT: s_lshr_b32 s16, s4, 24 ; GFX11-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX11-NEXT: v_rcp_iflag_f32_e32 v1, v1 -; GFX11-NEXT: s_lshr_b32 s8, s0, 24 -; GFX11-NEXT: s_and_b32 s0, s0, 0xff -; GFX11-NEXT: s_lshl_b32 s6, s6, 8 -; GFX11-NEXT: s_lshr_b32 s10, s4, 24 -; GFX11-NEXT: s_or_b32 s0, s0, s6 -; GFX11-NEXT: s_and_b32 s6, s7, 0xff -; GFX11-NEXT: s_and_b32 s0, 0xffff, s0 -; GFX11-NEXT: s_and_b32 s6, 0xffff, s6 -; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_dual_mul_f32 v0, 0x4f7ffffe, v0 :: v_dual_mul_f32 v1, 0x4f7ffffe, v1 -; GFX11-NEXT: s_lshl_b32 s6, s6, 16 -; GFX11-NEXT: s_lshr_b32 s7, s4, 16 -; GFX11-NEXT: s_or_b32 s0, s0, s6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX11-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX11-NEXT: s_lshr_b32 s6, s4, 8 ; GFX11-NEXT: s_and_b32 s4, s4, 0xff -; GFX11-NEXT: s_and_b32 s6, s6, 0xff -; GFX11-NEXT: v_mul_lo_u32 v2, 0xffffffe8, v0 -; GFX11-NEXT: v_mul_lo_u32 v3, 0xffffffe8, v1 -; GFX11-NEXT: s_lshl_b32 s6, s6, 8 -; GFX11-NEXT: s_and_b32 s7, s7, 0xff -; GFX11-NEXT: s_or_b32 s4, s4, s6 -; GFX11-NEXT: s_and_b32 s6, 0xffff, s7 -; GFX11-NEXT: s_lshr_b32 s11, s5, 8 -; GFX11-NEXT: s_and_b32 s4, 0xffff, s4 -; GFX11-NEXT: v_mul_hi_u32 v2, v0, v2 -; GFX11-NEXT: s_lshl_b32 s6, s6, 16 +; GFX11-NEXT: s_and_b32 s15, s15, 0xff +; GFX11-NEXT: s_lshl_b32 s14, s14, 8 +; GFX11-NEXT: s_and_b32 s15, 0xffff, s15 +; GFX11-NEXT: s_or_b32 s4, s4, s14 +; GFX11-NEXT: s_lshr_b32 s17, s5, 8 ; GFX11-NEXT: s_and_b32 s5, s5, 0xff -; GFX11-NEXT: s_or_b32 s4, s4, s6 +; GFX11-NEXT: s_lshl_b32 s14, s15, 16 +; GFX11-NEXT: s_and_b32 s4, 0xffff, s4 +; GFX11-NEXT: s_waitcnt_depctr 0xfff +; GFX11-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX11-NEXT: s_lshl_b32 s5, s5, 8 -; GFX11-NEXT: s_and_b32 s6, s11, 0xff -; GFX11-NEXT: s_or_b32 s5, s10, s5 -; GFX11-NEXT: s_and_b32 s6, 0xffff, s6 -; GFX11-NEXT: v_add_nc_u32_e32 v0, v0, v2 -; GFX11-NEXT: v_mul_hi_u32 v2, v1, v3 +; GFX11-NEXT: s_and_b32 s17, s17, 0xff +; GFX11-NEXT: s_or_b32 s4, s4, s14 +; GFX11-NEXT: s_or_b32 s5, s16, s5 +; GFX11-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GFX11-NEXT: s_and_b32 s16, 0xffff, s17 ; GFX11-NEXT: s_and_b32 s5, 0xffff, s5 -; GFX11-NEXT: s_lshl_b32 s6, s6, 16 +; GFX11-NEXT: s_lshl_b32 s14, s16, 16 ; GFX11-NEXT: s_lshr_b32 s9, s1, 8 -; GFX11-NEXT: v_mul_hi_u32 v0, s4, v0 -; GFX11-NEXT: s_or_b32 s5, s5, s6 +; GFX11-NEXT: v_mul_lo_u32 v1, 0xffffffe8, v0 +; GFX11-NEXT: s_or_b32 s5, s5, s14 ; GFX11-NEXT: s_and_b32 s1, s1, 0xff -; GFX11-NEXT: s_and_b32 s7, s9, 0xff -; GFX11-NEXT: v_add_nc_u32_e32 v1, v1, v2 +; GFX11-NEXT: s_lshr_b32 s10, s2, 8 +; GFX11-NEXT: s_lshr_b32 s8, s0, 24 +; GFX11-NEXT: s_lshr_b32 s11, s2, 16 ; GFX11-NEXT: s_lshl_b32 s1, s1, 8 -; GFX11-NEXT: s_and_b32 s6, 0xffff, s7 -; GFX11-NEXT: s_lshr_b32 s7, s2, 8 -; GFX11-NEXT: v_mul_lo_u32 v0, v0, 24 -; GFX11-NEXT: v_mul_hi_u32 v1, s5, v1 +; GFX11-NEXT: s_and_b32 s10, s10, 0xff +; GFX11-NEXT: v_mul_hi_u32 v1, v0, v1 +; GFX11-NEXT: s_lshr_b32 s12, s2, 24 +; GFX11-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-NEXT: s_and_b32 s11, s11, 0xff ; GFX11-NEXT: s_or_b32 s1, s8, s1 -; GFX11-NEXT: s_lshr_b32 s8, s2, 16 -; GFX11-NEXT: s_and_b32 s7, s7, 0xff -; GFX11-NEXT: s_lshr_b32 s9, s3, 8 -; GFX11-NEXT: 
s_lshl_b32 s7, s7, 8 +; GFX11-NEXT: s_lshl_b32 s8, s10, 8 +; GFX11-NEXT: s_lshr_b32 s6, s0, 8 +; GFX11-NEXT: s_or_b32 s2, s2, s8 +; GFX11-NEXT: v_add_nc_u32_e32 v0, v0, v1 +; GFX11-NEXT: s_lshr_b32 s7, s0, 16 +; GFX11-NEXT: s_lshr_b32 s13, s3, 8 ; GFX11-NEXT: s_and_b32 s3, s3, 0xff -; GFX11-NEXT: v_sub_nc_u32_e32 v0, s4, v0 -; GFX11-NEXT: v_mul_lo_u32 v1, v1, 24 -; GFX11-NEXT: s_lshr_b32 s4, s2, 24 -; GFX11-NEXT: s_and_b32 s2, s2, 0xff -; GFX11-NEXT: s_lshl_b32 s3, s3, 8 -; GFX11-NEXT: v_subrev_nc_u32_e32 v2, 24, v0 -; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v0 -; GFX11-NEXT: s_or_b32 s2, s2, s7 -; GFX11-NEXT: s_or_b32 s3, s4, s3 -; GFX11-NEXT: v_sub_nc_u32_e32 v1, s5, v1 -; GFX11-NEXT: s_and_b32 s5, s8, 0xff -; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo -; GFX11-NEXT: s_and_b32 s5, 0xffff, s5 ; GFX11-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX11-NEXT: v_mul_hi_u32 v1, s4, v0 +; GFX11-NEXT: v_mul_hi_u32 v0, s5, v0 +; GFX11-NEXT: s_and_b32 s6, s6, 0xff +; GFX11-NEXT: s_and_b32 s7, s7, 0xff +; GFX11-NEXT: s_and_b32 s9, s9, 0xff +; GFX11-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-NEXT: s_and_b32 s13, s13, 0xff +; GFX11-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-NEXT: v_mul_lo_u32 v1, v1, 24 +; GFX11-NEXT: v_mul_lo_u32 v0, v0, 24 +; GFX11-NEXT: s_lshl_b32 s6, s6, 8 +; GFX11-NEXT: s_and_b32 s7, 0xffff, s7 +; GFX11-NEXT: s_or_b32 s3, s12, s3 +; GFX11-NEXT: s_or_b32 s0, s0, s6 +; GFX11-NEXT: s_lshl_b32 s6, s7, 16 +; GFX11-NEXT: s_and_b32 s3, 0xffff, s3 +; GFX11-NEXT: v_sub_nc_u32_e32 v1, s4, v1 +; GFX11-NEXT: v_sub_nc_u32_e32 v0, s5, v0 +; GFX11-NEXT: s_and_b32 s5, 0xffff, s11 +; GFX11-NEXT: s_and_b32 s4, 0xffff, s9 +; GFX11-NEXT: s_lshl_b32 s5, s5, 16 ; GFX11-NEXT: v_subrev_nc_u32_e32 v2, 24, v1 ; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v1 ; GFX11-NEXT: v_subrev_nc_u32_e32 v3, 24, v0 -; GFX11-NEXT: s_lshl_b32 s5, s5, 16 -; GFX11-NEXT: s_and_b32 s4, s9, 0xff ; GFX11-NEXT: s_or_b32 s2, s2, s5 +; GFX11-NEXT: s_and_b32 s9, 0xffff, s13 +; GFX11-NEXT: s_lshr_b32 s2, s2, 1 ; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo ; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v0 -; GFX11-NEXT: s_lshr_b32 s2, s2, 1 -; GFX11-NEXT: s_and_b32 s4, 0xffff, s4 -; GFX11-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX11-NEXT: s_lshl_b32 s7, s9, 16 +; GFX11-NEXT: s_and_b32 s0, 0xffff, s0 +; GFX11-NEXT: s_or_b32 s3, s3, s7 ; GFX11-NEXT: v_subrev_nc_u32_e32 v2, 24, v1 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo ; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v1 -; GFX11-NEXT: s_lshl_b32 s6, s6, 16 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: s_or_b32 s0, s0, s6 +; GFX11-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX11-NEXT: s_lshl_b32 s4, s4, 16 +; GFX11-NEXT: v_subrev_nc_u32_e32 v3, 24, v0 +; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo +; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_sub_nc_u32_e32 v2, 23, v1 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_and_b32_e32 v2, 0xffffff, v2 ; GFX11-NEXT: v_sub_nc_u32_e32 v3, 23, v0 -; GFX11-NEXT: v_dual_cndmask_b32 v1, v1, v2 :: v_dual_and_b32 v0, 0xffffff, v0 -; GFX11-NEXT: v_and_b32_e32 v2, 0xffffff, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_sub_nc_u32_e32 v3, 23, v1 -; GFX11-NEXT: v_and_b32_e32 v1, 0xffffff, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | 
instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_lshrrev_b32_e64 v2, v2, s2 -; GFX11-NEXT: s_and_b32 s2, 0xffff, s3 -; GFX11-NEXT: s_lshl_b32 s3, s4, 16 ; GFX11-NEXT: v_and_b32_e32 v3, 0xffffff, v3 -; GFX11-NEXT: s_or_b32 s2, s2, s3 -; GFX11-NEXT: v_lshl_or_b32 v0, s0, v0, v2 -; GFX11-NEXT: s_lshr_b32 s0, s2, 1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_lshrrev_b32_e64 v2, v3, s0 -; GFX11-NEXT: s_or_b32 s0, s1, s6 +; GFX11-NEXT: s_lshr_b32 s2, s3, 1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v3, v0, 8, 8 ; GFX11-NEXT: v_lshl_or_b32 v1, s0, v1, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 8, v3 -; GFX11-NEXT: v_bfe_u32 v3, v0, 16, 8 -; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v1 +; GFX11-NEXT: v_lshrrev_b32_e64 v3, v3, s2 +; GFX11-NEXT: s_or_b32 s0, s1, s4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v2, v1, 8, 8 +; GFX11-NEXT: v_lshl_or_b32 v0, s0, v0, v3 +; GFX11-NEXT: v_bfe_u32 v3, v1, 16, 8 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_and_or_b32 v0, v0, 0xff, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 24, v4 -; GFX11-NEXT: v_bfe_u32 v4, v1, 8, 8 -; GFX11-NEXT: v_bfe_u32 v1, v1, 16, 8 -; GFX11-NEXT: v_or3_b32 v0, v0, v2, v3 +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v0 +; GFX11-NEXT: v_bfe_u32 v5, v0, 8, 8 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_bfe_u32 v0, v0, 16, 8 +; GFX11-NEXT: v_and_or_b32 v1, 0xff, v1, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; GFX11-NEXT: v_lshl_or_b32 v0, v0, 8, v5 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_lshl_or_b32 v1, v1, 8, v4 -; GFX11-NEXT: v_readfirstlane_b32 s0, v0 +; GFX11-NEXT: v_or3_b32 v1, v1, v3, v4 +; GFX11-NEXT: v_readfirstlane_b32 s1, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_readfirstlane_b32 s1, v1 +; GFX11-NEXT: v_readfirstlane_b32 s0, v1 ; GFX11-NEXT: ; return to shader part epilog %lhs = bitcast i48 %lhs.arg to <2 x i24> %rhs = bitcast i48 %rhs.arg to <2 x i24> @@ -2470,36 +2423,29 @@ ; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v6, 24 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v6, v6 ; GFX6-NEXT: v_mov_b32_e32 v7, 0xffffffe8 -; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v9, 24 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v9, v9 -; GFX6-NEXT: v_mul_f32_e32 v6, 0x4f7ffffe, v6 -; GFX6-NEXT: v_cvt_u32_f32_e32 v6, v6 ; GFX6-NEXT: v_and_b32_e32 v4, 0xffffff, v4 ; GFX6-NEXT: v_and_b32_e32 v5, 0xffffff, v5 +; GFX6-NEXT: v_mul_f32_e32 v6, 0x4f7ffffe, v6 +; GFX6-NEXT: v_cvt_u32_f32_e32 v6, v6 ; GFX6-NEXT: v_bfe_u32 v2, v2, 1, 23 -; GFX6-NEXT: v_mul_lo_u32 v8, v6, v7 -; GFX6-NEXT: v_mul_hi_u32 v8, v6, v8 -; GFX6-NEXT: v_add_i32_e32 v6, vcc, v6, v8 -; GFX6-NEXT: v_mul_hi_u32 v6, v4, v6 -; GFX6-NEXT: v_mul_f32_e32 v8, 0x4f7ffffe, v9 -; GFX6-NEXT: v_cvt_u32_f32_e32 v8, v8 +; GFX6-NEXT: v_mul_lo_u32 v7, v6, v7 +; GFX6-NEXT: v_mul_hi_u32 v7, v6, v7 +; GFX6-NEXT: v_add_i32_e32 v6, vcc, v6, v7 +; GFX6-NEXT: v_mul_hi_u32 v7, v4, v6 +; GFX6-NEXT: v_mul_hi_u32 v6, v5, v6 +; GFX6-NEXT: v_mul_lo_u32 v7, v7, 24 ; GFX6-NEXT: 
v_mul_lo_u32 v6, v6, 24 -; GFX6-NEXT: v_mul_lo_u32 v7, v8, v7 -; GFX6-NEXT: v_sub_i32_e32 v4, vcc, v4, v6 -; GFX6-NEXT: v_subrev_i32_e32 v6, vcc, 24, v4 +; GFX6-NEXT: v_sub_i32_e32 v4, vcc, v4, v7 +; GFX6-NEXT: v_subrev_i32_e32 v7, vcc, 24, v4 ; GFX6-NEXT: v_cmp_le_u32_e32 vcc, 24, v4 -; GFX6-NEXT: v_mul_hi_u32 v7, v8, v7 -; GFX6-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc -; GFX6-NEXT: v_subrev_i32_e32 v6, vcc, 24, v4 +; GFX6-NEXT: v_cndmask_b32_e32 v4, v4, v7, vcc +; GFX6-NEXT: v_subrev_i32_e32 v7, vcc, 24, v4 ; GFX6-NEXT: v_cmp_le_u32_e32 vcc, 24, v4 -; GFX6-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc -; GFX6-NEXT: v_add_i32_e32 v7, vcc, v8, v7 -; GFX6-NEXT: v_mul_hi_u32 v7, v5, v7 -; GFX6-NEXT: v_sub_i32_e32 v6, vcc, 23, v4 +; GFX6-NEXT: v_cndmask_b32_e32 v4, v4, v7, vcc +; GFX6-NEXT: v_sub_i32_e32 v7, vcc, 23, v4 ; GFX6-NEXT: v_and_b32_e32 v4, 0xffffff, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, v4, v0 -; GFX6-NEXT: v_and_b32_e32 v4, 0xffffff, v6 -; GFX6-NEXT: v_mul_lo_u32 v6, v7, 24 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffffff, v7 ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v4, v2 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v5, v6 @@ -2524,36 +2470,29 @@ ; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v6, 24 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v6, v6 ; GFX8-NEXT: v_mov_b32_e32 v7, 0xffffffe8 -; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v9, 24 -; GFX8-NEXT: v_rcp_iflag_f32_e32 v9, v9 -; GFX8-NEXT: v_mul_f32_e32 v6, 0x4f7ffffe, v6 -; GFX8-NEXT: v_cvt_u32_f32_e32 v6, v6 ; GFX8-NEXT: v_and_b32_e32 v4, 0xffffff, v4 ; GFX8-NEXT: v_and_b32_e32 v5, 0xffffff, v5 +; GFX8-NEXT: v_mul_f32_e32 v6, 0x4f7ffffe, v6 +; GFX8-NEXT: v_cvt_u32_f32_e32 v6, v6 ; GFX8-NEXT: v_bfe_u32 v2, v2, 1, 23 -; GFX8-NEXT: v_mul_lo_u32 v8, v6, v7 -; GFX8-NEXT: v_mul_hi_u32 v8, v6, v8 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v8 -; GFX8-NEXT: v_mul_hi_u32 v6, v4, v6 -; GFX8-NEXT: v_mul_f32_e32 v8, 0x4f7ffffe, v9 -; GFX8-NEXT: v_cvt_u32_f32_e32 v8, v8 +; GFX8-NEXT: v_mul_lo_u32 v7, v6, v7 +; GFX8-NEXT: v_mul_hi_u32 v7, v6, v7 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v7 +; GFX8-NEXT: v_mul_hi_u32 v7, v4, v6 +; GFX8-NEXT: v_mul_hi_u32 v6, v5, v6 +; GFX8-NEXT: v_mul_lo_u32 v7, v7, 24 ; GFX8-NEXT: v_mul_lo_u32 v6, v6, 24 -; GFX8-NEXT: v_mul_lo_u32 v7, v8, v7 -; GFX8-NEXT: v_sub_u32_e32 v4, vcc, v4, v6 -; GFX8-NEXT: v_subrev_u32_e32 v6, vcc, 24, v4 +; GFX8-NEXT: v_sub_u32_e32 v4, vcc, v4, v7 +; GFX8-NEXT: v_subrev_u32_e32 v7, vcc, 24, v4 ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, 24, v4 -; GFX8-NEXT: v_mul_hi_u32 v7, v8, v7 -; GFX8-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc -; GFX8-NEXT: v_subrev_u32_e32 v6, vcc, 24, v4 +; GFX8-NEXT: v_cndmask_b32_e32 v4, v4, v7, vcc +; GFX8-NEXT: v_subrev_u32_e32 v7, vcc, 24, v4 ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, 24, v4 -; GFX8-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v8, v7 -; GFX8-NEXT: v_mul_hi_u32 v7, v5, v7 -; GFX8-NEXT: v_sub_u32_e32 v6, vcc, 23, v4 +; GFX8-NEXT: v_cndmask_b32_e32 v4, v4, v7, vcc +; GFX8-NEXT: v_sub_u32_e32 v7, vcc, 23, v4 ; GFX8-NEXT: v_and_b32_e32 v4, 0xffffff, v4 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, v4, v0 -; GFX8-NEXT: v_and_b32_e32 v4, 0xffffff, v6 -; GFX8-NEXT: v_mul_lo_u32 v6, v7, 24 +; GFX8-NEXT: v_and_b32_e32 v4, 0xffffff, v7 ; GFX8-NEXT: v_lshrrev_b32_e32 v2, v4, v2 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX8-NEXT: v_sub_u32_e32 v2, vcc, v5, v6 @@ -2578,48 +2517,39 @@ ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v6, 24 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v6, v6 ; GFX9-NEXT: v_mov_b32_e32 v7, 0xffffffe8 -; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v9, 24 -; GFX9-NEXT: 
v_rcp_iflag_f32_e32 v9, v9 -; GFX9-NEXT: v_mul_f32_e32 v6, 0x4f7ffffe, v6 -; GFX9-NEXT: v_cvt_u32_f32_e32 v6, v6 ; GFX9-NEXT: v_and_b32_e32 v4, 0xffffff, v4 -; GFX9-NEXT: v_mul_f32_e32 v9, 0x4f7ffffe, v9 -; GFX9-NEXT: v_cvt_u32_f32_e32 v9, v9 -; GFX9-NEXT: v_mul_lo_u32 v8, v6, v7 ; GFX9-NEXT: v_and_b32_e32 v5, 0xffffff, v5 +; GFX9-NEXT: v_mul_f32_e32 v6, 0x4f7ffffe, v6 +; GFX9-NEXT: v_cvt_u32_f32_e32 v6, v6 ; GFX9-NEXT: v_bfe_u32 v2, v2, 1, 23 -; GFX9-NEXT: v_mul_lo_u32 v7, v9, v7 -; GFX9-NEXT: v_mul_hi_u32 v8, v6, v8 ; GFX9-NEXT: v_bfe_u32 v3, v3, 1, 23 -; GFX9-NEXT: v_mul_hi_u32 v7, v9, v7 -; GFX9-NEXT: v_add_u32_e32 v6, v6, v8 -; GFX9-NEXT: v_mul_hi_u32 v6, v4, v6 -; GFX9-NEXT: v_add_u32_e32 v7, v9, v7 +; GFX9-NEXT: v_mul_lo_u32 v7, v6, v7 +; GFX9-NEXT: v_mul_hi_u32 v7, v6, v7 +; GFX9-NEXT: v_add_u32_e32 v6, v6, v7 +; GFX9-NEXT: v_mul_hi_u32 v7, v4, v6 +; GFX9-NEXT: v_mul_hi_u32 v6, v5, v6 +; GFX9-NEXT: v_mul_lo_u32 v7, v7, 24 ; GFX9-NEXT: v_mul_lo_u32 v6, v6, 24 -; GFX9-NEXT: v_sub_u32_e32 v4, v4, v6 +; GFX9-NEXT: v_sub_u32_e32 v4, v4, v7 +; GFX9-NEXT: v_sub_u32_e32 v5, v5, v6 ; GFX9-NEXT: v_subrev_u32_e32 v6, 24, v4 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, 24, v4 +; GFX9-NEXT: v_subrev_u32_e32 v7, 24, v5 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, 24, v5 +; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc ; GFX9-NEXT: v_subrev_u32_e32 v6, 24, v4 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, 24, v4 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc -; GFX9-NEXT: v_mul_hi_u32 v6, v5, v7 -; GFX9-NEXT: v_sub_u32_e32 v7, 23, v4 -; GFX9-NEXT: v_and_b32_e32 v7, 0xffffff, v7 -; GFX9-NEXT: v_and_b32_e32 v4, 0xffffff, v4 -; GFX9-NEXT: v_mul_lo_u32 v6, v6, 24 -; GFX9-NEXT: v_lshrrev_b32_e32 v2, v7, v2 +; GFX9-NEXT: v_sub_u32_e32 v6, 23, v4 +; GFX9-NEXT: v_and_b32_e32 v6, 0xffffff, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, v6, v2 ; GFX9-NEXT: v_lshl_or_b32 v0, v0, v4, v2 -; GFX9-NEXT: v_sub_u32_e32 v2, v5, v6 -; GFX9-NEXT: v_subrev_u32_e32 v4, 24, v2 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, 24, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; GFX9-NEXT: v_subrev_u32_e32 v4, 24, v2 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, 24, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; GFX9-NEXT: v_subrev_u32_e32 v2, 24, v5 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, 24, v5 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc ; GFX9-NEXT: v_sub_u32_e32 v4, 23, v2 ; GFX9-NEXT: v_and_b32_e32 v4, 0xffffff, v4 -; GFX9-NEXT: v_and_b32_e32 v2, 0xffffff, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v3, v4, v3 ; GFX9-NEXT: v_lshl_or_b32 v1, v1, v2, v3 ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -2629,29 +2559,22 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v6, 24 -; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v7, 24 ; GFX10-NEXT: v_and_b32_e32 v4, 0xffffff, v4 ; GFX10-NEXT: v_and_b32_e32 v5, 0xffffff, v5 ; GFX10-NEXT: v_bfe_u32 v2, v2, 1, 23 -; GFX10-NEXT: v_rcp_iflag_f32_e32 v6, v6 -; GFX10-NEXT: v_rcp_iflag_f32_e32 v7, v7 ; GFX10-NEXT: v_bfe_u32 v3, v3, 1, 23 +; GFX10-NEXT: v_rcp_iflag_f32_e32 v6, v6 ; GFX10-NEXT: v_mul_f32_e32 v6, 0x4f7ffffe, v6 -; GFX10-NEXT: v_mul_f32_e32 v7, 0x4f7ffffe, v7 ; GFX10-NEXT: v_cvt_u32_f32_e32 v6, v6 -; GFX10-NEXT: v_cvt_u32_f32_e32 v7, v7 -; GFX10-NEXT: v_mul_lo_u32 v8, 0xffffffe8, v6 -; GFX10-NEXT: v_mul_lo_u32 v9, 0xffffffe8, v7 -; GFX10-NEXT: v_mul_hi_u32 v8, v6, v8 -; GFX10-NEXT: v_mul_hi_u32 v9, v7, v9 -; GFX10-NEXT: v_add_nc_u32_e32 v6, v6, v8 -; GFX10-NEXT: v_add_nc_u32_e32 v7, v7, v9 -; GFX10-NEXT: v_mul_hi_u32 v6, v4, 
v6 -; GFX10-NEXT: v_mul_hi_u32 v7, v5, v7 -; GFX10-NEXT: v_mul_lo_u32 v6, v6, 24 +; GFX10-NEXT: v_mul_lo_u32 v7, 0xffffffe8, v6 +; GFX10-NEXT: v_mul_hi_u32 v7, v6, v7 +; GFX10-NEXT: v_add_nc_u32_e32 v6, v6, v7 +; GFX10-NEXT: v_mul_hi_u32 v7, v4, v6 +; GFX10-NEXT: v_mul_hi_u32 v6, v5, v6 ; GFX10-NEXT: v_mul_lo_u32 v7, v7, 24 -; GFX10-NEXT: v_sub_nc_u32_e32 v4, v4, v6 -; GFX10-NEXT: v_sub_nc_u32_e32 v5, v5, v7 +; GFX10-NEXT: v_mul_lo_u32 v6, v6, 24 +; GFX10-NEXT: v_sub_nc_u32_e32 v4, v4, v7 +; GFX10-NEXT: v_sub_nc_u32_e32 v5, v5, v6 ; GFX10-NEXT: v_subrev_nc_u32_e32 v6, 24, v4 ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v4 ; GFX10-NEXT: v_subrev_nc_u32_e32 v7, 24, v5 @@ -2665,10 +2588,8 @@ ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v5 ; GFX10-NEXT: v_sub_nc_u32_e32 v6, 23, v4 ; GFX10-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc_lo -; GFX10-NEXT: v_and_b32_e32 v4, 0xffffff, v4 ; GFX10-NEXT: v_and_b32_e32 v6, 0xffffff, v6 ; GFX10-NEXT: v_sub_nc_u32_e32 v7, 23, v5 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffffff, v5 ; GFX10-NEXT: v_lshrrev_b32_e32 v2, v6, v2 ; GFX10-NEXT: v_and_b32_e32 v7, 0xffffff, v7 ; GFX10-NEXT: v_lshl_or_b32 v0, v0, v4, v2 @@ -2681,43 +2602,36 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v6, 24 -; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v7, 24 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffffff, v5 +; GFX11-NEXT: v_and_b32_e32 v4, 0xffffff, v4 ; GFX11-NEXT: v_bfe_u32 v2, v2, 1, 23 ; GFX11-NEXT: v_bfe_u32 v3, v3, 1, 23 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_rcp_iflag_f32_e32 v6, v6 -; GFX11-NEXT: v_rcp_iflag_f32_e32 v7, v7 ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_dual_mul_f32 v6, 0x4f7ffffe, v6 :: v_dual_mul_f32 v7, 0x4f7ffffe, v7 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_mul_f32_e32 v6, 0x4f7ffffe, v6 ; GFX11-NEXT: v_cvt_u32_f32_e32 v6, v6 -; GFX11-NEXT: v_cvt_u32_f32_e32 v7, v7 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_mul_lo_u32 v8, 0xffffffe8, v6 -; GFX11-NEXT: v_mul_lo_u32 v9, 0xffffffe8, v7 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_mul_hi_u32 v8, v6, v8 -; GFX11-NEXT: v_mul_hi_u32 v9, v7, v9 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_add_nc_u32_e32 v6, v6, v8 -; GFX11-NEXT: v_add_nc_u32_e32 v7, v7, v9 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_mul_hi_u32 v7, v5, v7 -; GFX11-NEXT: v_mul_lo_u32 v7, v7, 24 +; GFX11-NEXT: v_mul_lo_u32 v7, 0xffffffe8, v6 +; GFX11-NEXT: v_mul_hi_u32 v7, v6, v7 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_sub_nc_u32_e32 v5, v5, v7 -; GFX11-NEXT: v_subrev_nc_u32_e32 v7, 24, v5 -; GFX11-NEXT: v_and_b32_e32 v4, 0xffffff, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v6, v6, v7 +; GFX11-NEXT: v_mul_hi_u32 v7, v4, v6 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_mul_hi_u32 v6, v4, v6 -; GFX11-NEXT: v_mul_lo_u32 v6, v6, 24 +; GFX11-NEXT: v_mul_lo_u32 v7, v7, 24 +; GFX11-NEXT: v_sub_nc_u32_e32 v4, v4, v7 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v4 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffffff, v5 +; GFX11-NEXT: v_mul_hi_u32 v6, v5, v6 
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_sub_nc_u32_e32 v4, v4, v6 +; GFX11-NEXT: v_mul_lo_u32 v6, v6, 24 +; GFX11-NEXT: v_sub_nc_u32_e32 v5, v5, v6 ; GFX11-NEXT: v_subrev_nc_u32_e32 v6, 24, v4 -; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_subrev_nc_u32_e32 v7, 24, v5 ; GFX11-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc_lo ; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_subrev_nc_u32_e32 v6, 24, v4 ; GFX11-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc_lo ; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v4 @@ -2725,21 +2639,19 @@ ; GFX11-NEXT: v_subrev_nc_u32_e32 v7, 24, v5 ; GFX11-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc_lo ; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc_lo +; GFX11-NEXT: v_sub_nc_u32_e32 v7, 23, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_and_b32_e32 v7, 0xffffff, v7 ; GFX11-NEXT: v_sub_nc_u32_e32 v6, 23, v4 -; GFX11-NEXT: v_dual_cndmask_b32 v5, v5, v7 :: v_dual_and_b32 v4, 0xffffff, v4 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, v7, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_and_b32_e32 v6, 0xffffff, v6 -; GFX11-NEXT: v_sub_nc_u32_e32 v7, 23, v5 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffffff, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_lshl_or_b32 v1, v1, v5, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshrrev_b32_e32 v2, v6, v2 -; GFX11-NEXT: v_and_b32_e32 v7, 0xffffff, v7 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_lshl_or_b32 v0, v0, v4, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v3, v7, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_lshl_or_b32 v1, v1, v5, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] %result = call <2 x i24> @llvm.fshl.v2i24(<2 x i24> %lhs, <2 x i24> %rhs, <2 x i24> %amt) ret <2 x i24> %result @@ -3603,24 +3515,20 @@ ; GFX8-LABEL: v_fshl_i16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v3, 15, v2 -; GFX8-NEXT: v_xor_b32_e32 v2, -1, v2 -; GFX8-NEXT: v_and_b32_e32 v2, 15, v2 +; GFX8-NEXT: v_xor_b32_e32 v3, -1, v2 ; GFX8-NEXT: v_lshrrev_b16_e32 v1, 1, v1 -; GFX8-NEXT: v_lshlrev_b16_e32 v0, v3, v0 -; GFX8-NEXT: v_lshrrev_b16_e32 v1, v2, v1 +; GFX8-NEXT: v_lshlrev_b16_e32 v0, v2, v0 +; GFX8-NEXT: v_lshrrev_b16_e32 v1, v3, v1 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_fshl_i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v3, 15, v2 -; GFX9-NEXT: v_xor_b32_e32 v2, -1, v2 -; GFX9-NEXT: v_and_b32_e32 v2, 15, v2 +; GFX9-NEXT: v_xor_b32_e32 v3, -1, v2 ; GFX9-NEXT: v_lshrrev_b16_e32 v1, 1, v1 -; GFX9-NEXT: v_lshlrev_b16_e32 v0, v3, v0 -; GFX9-NEXT: v_lshrrev_b16_e32 v1, v2, v1 +; GFX9-NEXT: v_lshlrev_b16_e32 v0, v2, v0 +; GFX9-NEXT: v_lshrrev_b16_e32 v1, v3, v1 ; GFX9-NEXT: 
v_or_b32_e32 v0, v0, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -3629,9 +3537,7 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_xor_b32_e32 v3, -1, v2 -; GFX10-NEXT: v_and_b32_e32 v2, 15, v2 ; GFX10-NEXT: v_lshrrev_b16 v1, 1, v1 -; GFX10-NEXT: v_and_b32_e32 v3, 15, v3 ; GFX10-NEXT: v_lshlrev_b16 v0, v2, v0 ; GFX10-NEXT: v_lshrrev_b16 v1, v3, v1 ; GFX10-NEXT: v_or_b32_e32 v0, v0, v1 @@ -3642,10 +3548,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: v_xor_b32_e32 v3, -1, v2 -; GFX11-NEXT: v_and_b32_e32 v2, 15, v2 ; GFX11-NEXT: v_lshrrev_b16 v1, 1, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_and_b32_e32 v3, 15, v3 ; GFX11-NEXT: v_lshlrev_b16 v0, v2, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshrrev_b16 v1, v3, v1 @@ -3765,36 +3668,30 @@ ; ; GFX8-LABEL: v_fshl_i16_ssv: ; GFX8: ; %bb.0: -; GFX8-NEXT: v_and_b32_e32 v1, 15, v0 -; GFX8-NEXT: v_xor_b32_e32 v0, -1, v0 -; GFX8-NEXT: v_lshlrev_b16_e64 v1, v1, s0 +; GFX8-NEXT: v_xor_b32_e32 v1, -1, v0 +; GFX8-NEXT: v_lshlrev_b16_e64 v0, v0, s0 ; GFX8-NEXT: s_and_b32 s0, 0xffff, s1 -; GFX8-NEXT: v_and_b32_e32 v0, 15, v0 ; GFX8-NEXT: s_lshr_b32 s0, s0, 1 -; GFX8-NEXT: v_lshrrev_b16_e64 v0, v0, s0 -; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX8-NEXT: v_lshrrev_b16_e64 v1, v1, s0 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: v_fshl_i16_ssv: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_and_b32_e32 v1, 15, v0 -; GFX9-NEXT: v_xor_b32_e32 v0, -1, v0 -; GFX9-NEXT: v_lshlrev_b16_e64 v1, v1, s0 +; GFX9-NEXT: v_xor_b32_e32 v1, -1, v0 +; GFX9-NEXT: v_lshlrev_b16_e64 v0, v0, s0 ; GFX9-NEXT: s_and_b32 s0, 0xffff, s1 -; GFX9-NEXT: v_and_b32_e32 v0, 15, v0 ; GFX9-NEXT: s_lshr_b32 s0, s0, 1 -; GFX9-NEXT: v_lshrrev_b16_e64 v0, v0, s0 -; GFX9-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX9-NEXT: v_lshrrev_b16_e64 v1, v1, s0 +; GFX9-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: v_fshl_i16_ssv: ; GFX10: ; %bb.0: ; GFX10-NEXT: v_xor_b32_e32 v1, -1, v0 -; GFX10-NEXT: v_and_b32_e32 v0, 15, v0 ; GFX10-NEXT: s_and_b32 s1, 0xffff, s1 -; GFX10-NEXT: s_lshr_b32 s1, s1, 1 -; GFX10-NEXT: v_and_b32_e32 v1, 15, v1 ; GFX10-NEXT: v_lshlrev_b16 v0, v0, s0 +; GFX10-NEXT: s_lshr_b32 s1, s1, 1 ; GFX10-NEXT: v_lshrrev_b16 v1, v1, s1 ; GFX10-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX10-NEXT: ; return to shader part epilog @@ -3802,13 +3699,10 @@ ; GFX11-LABEL: v_fshl_i16_ssv: ; GFX11: ; %bb.0: ; GFX11-NEXT: v_xor_b32_e32 v1, -1, v0 -; GFX11-NEXT: v_and_b32_e32 v0, 15, v0 ; GFX11-NEXT: s_and_b32 s1, 0xffff, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: s_lshr_b32 s1, s1, 1 -; GFX11-NEXT: v_and_b32_e32 v1, 15, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_lshlrev_b16 v0, v0, s0 +; GFX11-NEXT: s_lshr_b32 s1, s1, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: v_lshrrev_b16 v1, v1, s1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 @@ -4102,21 +3996,17 @@ ; GFX8-LABEL: v_fshl_v2i16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v4, 15, v2 -; GFX8-NEXT: v_xor_b32_e32 v2, -1, v2 -; GFX8-NEXT: 
v_and_b32_e32 v2, 15, v2 +; GFX8-NEXT: v_xor_b32_e32 v4, -1, v2 ; GFX8-NEXT: v_lshrrev_b16_e32 v5, 1, v1 -; GFX8-NEXT: v_lshlrev_b16_e32 v4, v4, v0 -; GFX8-NEXT: v_lshrrev_b16_e32 v2, v2, v5 -; GFX8-NEXT: v_or_b32_e32 v2, v4, v2 -; GFX8-NEXT: v_and_b32_e32 v4, 15, v3 -; GFX8-NEXT: v_xor_b32_e32 v3, -1, v3 -; GFX8-NEXT: v_lshlrev_b16_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_mov_b32_e32 v4, 1 -; GFX8-NEXT: v_and_b32_e32 v3, 15, v3 -; GFX8-NEXT: v_lshrrev_b16_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_lshrrev_b16_e32 v1, v3, v1 +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; GFX8-NEXT: v_lshlrev_b16_e32 v2, v2, v0 +; GFX8-NEXT: v_lshrrev_b16_e32 v4, v4, v5 +; GFX8-NEXT: v_or_b32_e32 v2, v2, v4 +; GFX8-NEXT: v_xor_b32_e32 v4, -1, v3 +; GFX8-NEXT: v_lshlrev_b16_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_mov_b32_e32 v3, 1 +; GFX8-NEXT: v_lshrrev_b16_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_lshrrev_b16_e32 v1, v4, v1 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 @@ -4257,24 +4147,20 @@ ; ; GFX8-LABEL: v_fshl_v2i16_ssv: ; GFX8: ; %bb.0: -; GFX8-NEXT: v_and_b32_e32 v2, 15, v0 ; GFX8-NEXT: s_lshr_b32 s2, s0, 16 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GFX8-NEXT: v_xor_b32_e32 v0, -1, v0 -; GFX8-NEXT: v_lshlrev_b16_e64 v2, v2, s0 +; GFX8-NEXT: v_xor_b32_e32 v2, -1, v0 +; GFX8-NEXT: v_lshlrev_b16_e64 v0, v0, s0 ; GFX8-NEXT: s_and_b32 s0, 0xffff, s1 -; GFX8-NEXT: v_and_b32_e32 v0, 15, v0 ; GFX8-NEXT: s_lshr_b32 s0, s0, 1 -; GFX8-NEXT: v_lshrrev_b16_e64 v0, v0, s0 ; GFX8-NEXT: s_lshr_b32 s3, s1, 16 -; GFX8-NEXT: v_or_b32_e32 v0, v2, v0 -; GFX8-NEXT: v_and_b32_e32 v2, 15, v1 -; GFX8-NEXT: v_xor_b32_e32 v1, -1, v1 -; GFX8-NEXT: v_and_b32_e32 v1, 15, v1 +; GFX8-NEXT: v_lshrrev_b16_e64 v2, v2, s0 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX8-NEXT: v_xor_b32_e32 v2, -1, v1 ; GFX8-NEXT: s_lshr_b32 s0, s3, 1 -; GFX8-NEXT: v_lshlrev_b16_e64 v2, v2, s2 -; GFX8-NEXT: v_lshrrev_b16_e64 v1, v1, s0 -; GFX8-NEXT: v_or_b32_e32 v1, v2, v1 +; GFX8-NEXT: v_lshlrev_b16_e64 v1, v1, s2 +; GFX8-NEXT: v_lshrrev_b16_e64 v2, v2, s0 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -4794,28 +4680,22 @@ ; GFX8-LABEL: v_fshl_v3i16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v4 -; GFX8-NEXT: v_and_b32_e32 v7, 15, v4 -; GFX8-NEXT: v_xor_b32_e32 v4, -1, v4 -; GFX8-NEXT: v_and_b32_e32 v4, 15, v4 +; GFX8-NEXT: v_xor_b32_e32 v7, -1, v4 ; GFX8-NEXT: v_lshrrev_b16_e32 v8, 1, v2 -; GFX8-NEXT: v_lshlrev_b16_e32 v7, v7, v0 -; GFX8-NEXT: v_lshrrev_b16_e32 v4, v4, v8 -; GFX8-NEXT: v_or_b32_e32 v4, v7, v4 -; GFX8-NEXT: v_and_b32_e32 v7, 15, v6 -; GFX8-NEXT: v_xor_b32_e32 v6, -1, v6 -; GFX8-NEXT: v_lshlrev_b16_sdwa v0, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_mov_b32_e32 v7, 1 -; GFX8-NEXT: v_and_b32_e32 v6, 15, v6 -; GFX8-NEXT: v_lshrrev_b16_sdwa v2, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_lshrrev_b16_e32 v2, v6, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v4 +; GFX8-NEXT: v_lshlrev_b16_e32 v4, v4, v0 +; 
GFX8-NEXT: v_lshrrev_b16_e32 v7, v7, v8 +; GFX8-NEXT: v_or_b32_e32 v4, v4, v7 +; GFX8-NEXT: v_xor_b32_e32 v7, -1, v6 +; GFX8-NEXT: v_lshlrev_b16_sdwa v0, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_mov_b32_e32 v6, 1 +; GFX8-NEXT: v_lshrrev_b16_sdwa v2, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_lshrrev_b16_e32 v2, v7, v2 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX8-NEXT: v_and_b32_e32 v2, 15, v5 -; GFX8-NEXT: v_xor_b32_e32 v5, -1, v5 -; GFX8-NEXT: v_and_b32_e32 v5, 15, v5 -; GFX8-NEXT: v_lshlrev_b16_e32 v1, v2, v1 -; GFX8-NEXT: v_lshrrev_b16_e32 v2, 1, v3 -; GFX8-NEXT: v_lshrrev_b16_e32 v2, v5, v2 +; GFX8-NEXT: v_xor_b32_e32 v2, -1, v5 +; GFX8-NEXT: v_lshrrev_b16_e32 v3, 1, v3 +; GFX8-NEXT: v_lshlrev_b16_e32 v1, v5, v1 +; GFX8-NEXT: v_lshrrev_b16_e32 v2, v2, v3 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX8-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 @@ -5160,37 +5040,28 @@ ; GFX8-LABEL: v_fshl_v4i16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v4 -; GFX8-NEXT: v_and_b32_e32 v8, 15, v4 -; GFX8-NEXT: v_xor_b32_e32 v4, -1, v4 -; GFX8-NEXT: v_and_b32_e32 v4, 15, v4 +; GFX8-NEXT: v_xor_b32_e32 v8, -1, v4 ; GFX8-NEXT: v_lshrrev_b16_e32 v9, 1, v2 -; GFX8-NEXT: v_lshlrev_b16_e32 v8, v8, v0 -; GFX8-NEXT: v_lshrrev_b16_e32 v4, v4, v9 -; GFX8-NEXT: v_or_b32_e32 v4, v8, v4 -; GFX8-NEXT: v_and_b32_e32 v8, 15, v6 -; GFX8-NEXT: v_xor_b32_e32 v6, -1, v6 -; GFX8-NEXT: v_lshlrev_b16_sdwa v0, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_mov_b32_e32 v8, 1 -; GFX8-NEXT: v_and_b32_e32 v6, 15, v6 -; GFX8-NEXT: v_lshrrev_b16_sdwa v2, v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_lshrrev_b16_e32 v2, v6, v2 -; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v5 +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v4 +; GFX8-NEXT: v_lshlrev_b16_e32 v4, v4, v0 +; GFX8-NEXT: v_lshrrev_b16_e32 v8, v8, v9 +; GFX8-NEXT: v_or_b32_e32 v4, v4, v8 +; GFX8-NEXT: v_xor_b32_e32 v8, -1, v6 +; GFX8-NEXT: v_lshlrev_b16_sdwa v0, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_mov_b32_e32 v6, 1 +; GFX8-NEXT: v_lshrrev_b16_sdwa v2, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_lshrrev_b16_e32 v2, v8, v2 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX8-NEXT: v_and_b32_e32 v2, 15, v5 -; GFX8-NEXT: v_xor_b32_e32 v5, -1, v5 -; GFX8-NEXT: v_and_b32_e32 v5, 15, v5 -; GFX8-NEXT: v_lshrrev_b16_e32 v6, 1, v3 -; GFX8-NEXT: v_lshlrev_b16_e32 v2, v2, v1 -; GFX8-NEXT: v_lshrrev_b16_e32 v5, v5, v6 -; GFX8-NEXT: v_or_b32_e32 v2, v2, v5 -; GFX8-NEXT: v_and_b32_e32 v5, 15, v7 -; GFX8-NEXT: v_xor_b32_e32 v6, -1, v7 -; GFX8-NEXT: v_lshlrev_b16_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_mov_b32_e32 v5, 1 -; GFX8-NEXT: v_and_b32_e32 v6, 15, v6 -; GFX8-NEXT: v_lshrrev_b16_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_lshrrev_b16_e32 v3, v6, v3 +; GFX8-NEXT: v_xor_b32_e32 v2, -1, v5 +; GFX8-NEXT: v_lshrrev_b16_e32 v8, 1, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v5 +; GFX8-NEXT: v_lshlrev_b16_e32 v5, v5, v1 +; GFX8-NEXT: v_lshrrev_b16_e32 v2, v2, v8 +; GFX8-NEXT: v_or_b32_e32 v2, v5, v2 +; GFX8-NEXT: v_xor_b32_e32 v5, -1, v7 +; GFX8-NEXT: v_lshrrev_b16_sdwa v3, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD 
src1_sel:WORD_1 +; GFX8-NEXT: v_lshlrev_b16_sdwa v1, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_lshrrev_b16_e32 v3, v5, v3 ; GFX8-NEXT: v_or_b32_e32 v1, v1, v3 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v1 Index: llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll @@ -894,24 +894,24 @@ ; GFX10-NEXT: v_lshrrev_b32_e32 v3, 8, v2 ; GFX10-NEXT: v_lshrrev_b32_e32 v4, 8, v0 ; GFX10-NEXT: v_lshrrev_b32_e32 v5, 8, v1 -; GFX10-NEXT: v_and_b32_e32 v7, 7, v2 +; GFX10-NEXT: v_and_b32_e32 v6, 7, v2 ; GFX10-NEXT: v_not_b32_e32 v2, v2 -; GFX10-NEXT: v_not_b32_e32 v6, v3 +; GFX10-NEXT: v_not_b32_e32 v7, v3 ; GFX10-NEXT: v_and_b32_e32 v3, 7, v3 ; GFX10-NEXT: v_lshlrev_b16 v4, 1, v4 ; GFX10-NEXT: v_and_b32_e32 v5, 0xff, v5 ; GFX10-NEXT: v_lshlrev_b16 v0, 1, v0 -; GFX10-NEXT: v_and_b32_e32 v6, 7, v6 +; GFX10-NEXT: v_and_b32_e32 v7, 7, v7 ; GFX10-NEXT: v_and_b32_e32 v1, 0xff, v1 ; GFX10-NEXT: v_and_b32_e32 v2, 7, v2 ; GFX10-NEXT: v_lshrrev_b16 v3, v3, v5 -; GFX10-NEXT: s_movk_i32 s4, 0xff -; GFX10-NEXT: v_lshlrev_b16 v4, v6, v4 -; GFX10-NEXT: v_lshrrev_b16 v1, v7, v1 +; GFX10-NEXT: v_lshlrev_b16 v4, v7, v4 +; GFX10-NEXT: v_lshrrev_b16 v1, v6, v1 ; GFX10-NEXT: v_lshlrev_b16 v0, v2, v0 ; GFX10-NEXT: v_or_b32_e32 v2, v4, v3 +; GFX10-NEXT: v_mov_b32_e32 v3, 0xff ; GFX10-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX10-NEXT: v_and_b32_sdwa v1, v2, s4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_and_b32_sdwa v1, v2, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -1288,8 +1288,8 @@ ; GFX8-NEXT: v_and_b32_e32 v8, 7, v2 ; GFX8-NEXT: v_not_b32_e32 v2, v2 ; GFX8-NEXT: v_and_b32_e32 v2, 7, v2 -; GFX8-NEXT: v_lshlrev_b16_e32 v9, 1, v0 -; GFX8-NEXT: v_lshlrev_b16_e32 v2, v2, v9 +; GFX8-NEXT: v_lshlrev_b16_e32 v10, 1, v0 +; GFX8-NEXT: v_lshlrev_b16_e32 v2, v2, v10 ; GFX8-NEXT: v_lshrrev_b16_sdwa v8, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 8, v0 ; GFX8-NEXT: v_or_b32_e32 v2, v2, v8 @@ -1304,18 +1304,19 @@ ; GFX8-NEXT: v_and_b32_e32 v4, 7, v6 ; GFX8-NEXT: v_not_b32_e32 v5, v6 ; GFX8-NEXT: v_mov_b32_e32 v6, 1 +; GFX8-NEXT: v_mov_b32_e32 v9, 0xff ; GFX8-NEXT: v_and_b32_e32 v5, 7, v5 -; GFX8-NEXT: v_lshlrev_b16_sdwa v8, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_lshlrev_b16_e32 v5, v5, v8 -; GFX8-NEXT: v_mov_b32_e32 v8, 0xff -; GFX8-NEXT: v_and_b32_sdwa v8, v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_lshrrev_b16_e32 v4, v4, v8 +; GFX8-NEXT: v_lshlrev_b16_sdwa v6, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_lshlrev_b16_e32 v5, v5, v6 +; GFX8-NEXT: v_and_b32_sdwa v6, v1, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_lshrrev_b16_e32 v4, v4, v6 ; GFX8-NEXT: v_or_b32_e32 v4, v5, v4 ; GFX8-NEXT: v_and_b32_e32 v5, 7, v7 -; GFX8-NEXT: v_not_b32_e32 v7, v7 -; GFX8-NEXT: v_and_b32_e32 v7, 7, v7 -; GFX8-NEXT: v_lshlrev_b16_sdwa v0, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 -; GFX8-NEXT: v_lshlrev_b16_e32 v0, v7, v0 +; GFX8-NEXT: 
v_not_b32_e32 v6, v7 +; GFX8-NEXT: v_mov_b32_e32 v7, 1 +; GFX8-NEXT: v_and_b32_e32 v6, 7, v6 +; GFX8-NEXT: v_lshlrev_b16_sdwa v0, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 +; GFX8-NEXT: v_lshlrev_b16_e32 v0, v6, v0 ; GFX8-NEXT: v_lshrrev_b16_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX8-NEXT: v_mov_b32_e32 v1, 8 @@ -1338,8 +1339,8 @@ ; GFX9-NEXT: v_and_b32_e32 v8, 7, v2 ; GFX9-NEXT: v_not_b32_e32 v2, v2 ; GFX9-NEXT: v_and_b32_e32 v2, 7, v2 -; GFX9-NEXT: v_lshlrev_b16_e32 v9, 1, v0 -; GFX9-NEXT: v_lshlrev_b16_e32 v2, v2, v9 +; GFX9-NEXT: v_lshlrev_b16_e32 v10, 1, v0 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, v2, v10 ; GFX9-NEXT: v_lshrrev_b16_sdwa v8, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_lshrrev_b32_e32 v3, 8, v0 ; GFX9-NEXT: v_or_b32_e32 v2, v2, v8 @@ -1348,29 +1349,30 @@ ; GFX9-NEXT: v_lshrrev_b32_e32 v4, 8, v1 ; GFX9-NEXT: v_and_b32_e32 v5, 7, v5 ; GFX9-NEXT: v_lshlrev_b16_e32 v3, 1, v3 +; GFX9-NEXT: s_mov_b32 s4, 1 ; GFX9-NEXT: v_lshlrev_b16_e32 v3, v5, v3 ; GFX9-NEXT: v_lshrrev_b16_sdwa v4, v8, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_not_b32_e32 v5, v6 +; GFX9-NEXT: v_mov_b32_e32 v9, 0xff ; GFX9-NEXT: v_or_b32_e32 v3, v3, v4 ; GFX9-NEXT: v_and_b32_e32 v4, 7, v6 -; GFX9-NEXT: v_not_b32_e32 v5, v6 -; GFX9-NEXT: v_mov_b32_e32 v6, 1 -; GFX9-NEXT: s_movk_i32 s4, 0xff ; GFX9-NEXT: v_and_b32_e32 v5, 7, v5 -; GFX9-NEXT: v_lshlrev_b16_sdwa v8, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshlrev_b16_e32 v5, v5, v8 -; GFX9-NEXT: v_and_b32_sdwa v8, v1, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_lshrrev_b16_e32 v4, v4, v8 +; GFX9-NEXT: v_lshlrev_b16_sdwa v6, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshlrev_b16_e32 v5, v5, v6 +; GFX9-NEXT: v_and_b32_sdwa v6, v1, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_lshrrev_b16_e32 v4, v4, v6 ; GFX9-NEXT: v_or_b32_e32 v4, v5, v4 ; GFX9-NEXT: v_and_b32_e32 v5, 7, v7 -; GFX9-NEXT: v_not_b32_e32 v7, v7 -; GFX9-NEXT: v_and_b32_e32 v7, 7, v7 -; GFX9-NEXT: v_lshlrev_b16_sdwa v0, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 -; GFX9-NEXT: v_lshlrev_b16_e32 v0, v7, v0 +; GFX9-NEXT: v_not_b32_e32 v6, v7 +; GFX9-NEXT: v_mov_b32_e32 v7, 1 +; GFX9-NEXT: v_and_b32_e32 v6, 7, v6 +; GFX9-NEXT: v_lshlrev_b16_sdwa v0, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 +; GFX9-NEXT: v_lshlrev_b16_e32 v0, v6, v0 ; GFX9-NEXT: v_lshrrev_b16_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 ; GFX9-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX9-NEXT: v_mov_b32_e32 v1, 8 ; GFX9-NEXT: v_lshlrev_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_and_or_b32 v1, v2, s4, v1 +; GFX9-NEXT: v_and_or_b32 v1, v2, v9, v1 ; GFX9-NEXT: v_and_b32_e32 v2, 0xff, v4 ; GFX9-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 @@ -1382,51 +1384,51 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_lshrrev_b32_e32 v7, 8, v2 +; GFX10-NEXT: v_lshrrev_b32_e32 v5, 8, v2 ; GFX10-NEXT: v_lshrrev_b32_e32 v3, 8, v0 ; GFX10-NEXT: v_not_b32_e32 v8, v2 -; GFX10-NEXT: v_lshrrev_b32_e32 v10, 16, v2 -; GFX10-NEXT: 
v_lshrrev_b32_e32 v11, 24, v2 -; GFX10-NEXT: v_not_b32_e32 v12, v7 +; GFX10-NEXT: v_lshrrev_b32_e32 v11, 16, v2 +; GFX10-NEXT: v_lshrrev_b32_e32 v12, 24, v2 +; GFX10-NEXT: v_not_b32_e32 v10, v5 ; GFX10-NEXT: v_lshlrev_b16 v3, 1, v3 ; GFX10-NEXT: v_lshrrev_b32_e32 v4, 16, v0 -; GFX10-NEXT: v_lshrrev_b32_e32 v5, 24, v0 -; GFX10-NEXT: v_lshrrev_b32_e32 v6, 8, v1 -; GFX10-NEXT: v_and_b32_e32 v12, 7, v12 -; GFX10-NEXT: v_and_b32_e32 v8, 7, v8 +; GFX10-NEXT: v_lshrrev_b32_e32 v6, 24, v0 +; GFX10-NEXT: v_lshrrev_b32_e32 v7, 8, v1 +; GFX10-NEXT: v_and_b32_e32 v10, 7, v10 ; GFX10-NEXT: v_lshlrev_b16 v0, 1, v0 -; GFX10-NEXT: v_not_b32_e32 v13, v10 -; GFX10-NEXT: s_movk_i32 s4, 0xff -; GFX10-NEXT: v_lshlrev_b16 v3, v12, v3 -; GFX10-NEXT: v_not_b32_e32 v12, v11 +; GFX10-NEXT: v_and_b32_e32 v8, 7, v8 +; GFX10-NEXT: v_mov_b32_e32 v13, 0xff +; GFX10-NEXT: v_not_b32_e32 v14, v12 +; GFX10-NEXT: v_lshlrev_b16 v3, v10, v3 +; GFX10-NEXT: v_not_b32_e32 v10, v11 ; GFX10-NEXT: v_lshrrev_b32_e32 v9, 24, v1 ; GFX10-NEXT: v_lshlrev_b16 v0, v8, v0 ; GFX10-NEXT: v_and_b32_e32 v8, 0xff, v1 -; GFX10-NEXT: v_and_b32_e32 v7, 7, v7 -; GFX10-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GFX10-NEXT: v_and_b32_e32 v5, 7, v5 +; GFX10-NEXT: v_and_b32_e32 v7, 0xff, v7 +; GFX10-NEXT: v_and_b32_e32 v11, 7, v11 ; GFX10-NEXT: v_and_b32_e32 v10, 7, v10 -; GFX10-NEXT: v_and_b32_e32 v13, 7, v13 ; GFX10-NEXT: v_lshlrev_b16 v4, 1, v4 -; GFX10-NEXT: v_and_b32_sdwa v1, v1, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_and_b32_sdwa v1, v1, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_and_b32_e32 v13, 7, v14 +; GFX10-NEXT: v_lshlrev_b16 v6, 1, v6 ; GFX10-NEXT: v_and_b32_e32 v12, 7, v12 -; GFX10-NEXT: v_lshlrev_b16 v5, 1, v5 -; GFX10-NEXT: v_and_b32_e32 v11, 7, v11 ; GFX10-NEXT: v_and_b32_e32 v2, 7, v2 -; GFX10-NEXT: v_lshrrev_b16 v6, v7, v6 -; GFX10-NEXT: v_lshlrev_b16 v4, v13, v4 -; GFX10-NEXT: v_lshrrev_b16 v1, v10, v1 -; GFX10-NEXT: v_lshlrev_b16 v5, v12, v5 -; GFX10-NEXT: v_lshrrev_b16 v7, v11, v9 +; GFX10-NEXT: v_lshrrev_b16 v5, v5, v7 +; GFX10-NEXT: v_lshlrev_b16 v4, v10, v4 +; GFX10-NEXT: v_lshrrev_b16 v1, v11, v1 +; GFX10-NEXT: v_lshlrev_b16 v6, v13, v6 +; GFX10-NEXT: v_lshrrev_b16 v7, v12, v9 ; GFX10-NEXT: v_lshrrev_b16 v2, v2, v8 -; GFX10-NEXT: v_or_b32_e32 v3, v3, v6 -; GFX10-NEXT: v_mov_b32_e32 v6, 8 +; GFX10-NEXT: v_or_b32_e32 v3, v3, v5 +; GFX10-NEXT: v_mov_b32_e32 v5, 8 ; GFX10-NEXT: v_or_b32_e32 v1, v4, v1 -; GFX10-NEXT: v_or_b32_e32 v4, v5, v7 +; GFX10-NEXT: v_or_b32_e32 v4, v6, v7 ; GFX10-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX10-NEXT: v_and_b32_e32 v1, 0xff, v1 ; GFX10-NEXT: v_and_b32_e32 v3, 0xff, v4 -; GFX10-NEXT: v_and_or_b32 v0, v0, 0xff, v2 +; GFX10-NEXT: v_and_or_b32 v0, 0xff, v0, v2 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 24, v3 ; GFX10-NEXT: v_or3_b32 v0, v0, v1, v2 @@ -1483,7 +1485,7 @@ ; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v3 ; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_and_or_b32 v0, v0, 0xff, v1 +; GFX11-NEXT: v_and_or_b32 v0, 0xff, v0, v1 ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: 
v_lshlrev_b32_e32 v2, 24, v3 @@ -1584,7 +1586,6 @@ ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; GFX9-NEXT: v_sub_u32_e32 v1, 23, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffffff, v0 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffffff, v1 ; GFX9-NEXT: v_lshrrev_b32_e64 v0, v0, s1 ; GFX9-NEXT: v_lshl_or_b32 v0, s0, v1, v0 ; GFX9-NEXT: v_readfirstlane_b32 s0, v0 @@ -1611,11 +1612,10 @@ ; GFX10-NEXT: v_subrev_nc_u32_e32 v1, 24, v0 ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v0 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo -; GFX10-NEXT: v_sub_nc_u32_e32 v1, 23, v0 -; GFX10-NEXT: v_and_b32_e32 v0, 0xffffff, v0 -; GFX10-NEXT: v_and_b32_e32 v1, 0xffffff, v1 -; GFX10-NEXT: v_lshrrev_b32_e64 v0, v0, s1 -; GFX10-NEXT: v_lshl_or_b32 v0, s0, v1, v0 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffffff, v0 +; GFX10-NEXT: v_sub_nc_u32_e32 v0, 23, v0 +; GFX10-NEXT: v_lshrrev_b32_e64 v1, v1, s1 +; GFX10-NEXT: v_lshl_or_b32 v0, s0, v0, v1 ; GFX10-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-NEXT: ; return to shader part epilog ; @@ -1648,13 +1648,11 @@ ; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v0 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_sub_nc_u32_e32 v1, 23, v0 -; GFX11-NEXT: v_and_b32_e32 v0, 0xffffff, v0 -; GFX11-NEXT: v_and_b32_e32 v1, 0xffffff, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b32_e64 v0, v0, s1 -; GFX11-NEXT: v_lshl_or_b32 v0, s0, v1, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v1, 0xffffff, v0 +; GFX11-NEXT: v_sub_nc_u32_e32 v0, 23, v0 +; GFX11-NEXT: v_lshrrev_b32_e64 v1, v1, s1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshl_or_b32 v0, s0, v0, v1 ; GFX11-NEXT: v_readfirstlane_b32 s0, v0 ; GFX11-NEXT: ; return to shader part epilog %result = call i24 @llvm.fshr.i24(i24 %lhs, i24 %rhs, i24 %amt) @@ -1749,7 +1747,6 @@ ; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc ; GFX9-NEXT: v_sub_u32_e32 v3, 23, v2 ; GFX9-NEXT: v_and_b32_e32 v2, 0xffffff, v2 -; GFX9-NEXT: v_and_b32_e32 v3, 0xffffff, v3 ; GFX9-NEXT: v_lshrrev_b32_e32 v1, v2, v1 ; GFX9-NEXT: v_lshl_or_b32 v0, v0, v3, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -1777,11 +1774,10 @@ ; GFX10-NEXT: v_subrev_nc_u32_e32 v3, 24, v2 ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v2 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo -; GFX10-NEXT: v_sub_nc_u32_e32 v3, 23, v2 -; GFX10-NEXT: v_and_b32_e32 v2, 0xffffff, v2 -; GFX10-NEXT: v_and_b32_e32 v3, 0xffffff, v3 -; GFX10-NEXT: v_lshrrev_b32_e32 v1, v2, v1 -; GFX10-NEXT: v_lshl_or_b32 v0, v0, v3, v1 +; GFX10-NEXT: v_and_b32_e32 v3, 0xffffff, v2 +; GFX10-NEXT: v_sub_nc_u32_e32 v2, 23, v2 +; GFX10-NEXT: v_lshrrev_b32_e32 v1, v3, v1 +; GFX10-NEXT: v_lshl_or_b32 v0, v0, v2, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_fshr_i24: @@ -1815,12 +1811,11 @@ ; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v2 ; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_sub_nc_u32_e32 v3, 23, v2 -; GFX11-NEXT: v_and_b32_e32 v2, 0xffffff, v2 -; GFX11-NEXT: v_and_b32_e32 v3, 0xffffff, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b32_e32 v1, v2, v1 -; GFX11-NEXT: v_lshl_or_b32 v0, v0, v3, v1 +; GFX11-NEXT: v_and_b32_e32 v3, 0xffffff, v2 +; GFX11-NEXT: v_sub_nc_u32_e32 v2, 23, v2 +; GFX11-NEXT: 
v_lshrrev_b32_e32 v1, v3, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_lshl_or_b32 v0, v0, v2, v1 ; GFX11-NEXT: s_setpc_b64 s[30:31] %result = call i24 @llvm.fshr.i24(i24 %lhs, i24 %rhs, i24 %amt) ret i24 %result @@ -1834,14 +1829,13 @@ ; GFX6-NEXT: s_lshr_b32 s6, s0, 16 ; GFX6-NEXT: s_lshr_b32 s7, s0, 24 ; GFX6-NEXT: s_lshr_b32 s8, s1, 8 -; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 -; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX6-NEXT: s_and_b32 s9, s0, 0xff ; GFX6-NEXT: s_bfe_u32 s0, s0, 0x80008 ; GFX6-NEXT: s_and_b32 s1, s1, 0xff +; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: s_lshl_b32 s0, s0, 8 ; GFX6-NEXT: s_lshl_b32 s1, s1, 8 -; GFX6-NEXT: v_mov_b32_e32 v1, 0xffffffe8 +; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX6-NEXT: s_or_b32 s0, s9, s0 ; GFX6-NEXT: s_or_b32 s1, s7, s1 ; GFX6-NEXT: s_and_b32 s7, s8, 0xff @@ -1849,19 +1843,19 @@ ; GFX6-NEXT: s_lshr_b32 s9, s2, 24 ; GFX6-NEXT: s_and_b32 s11, s2, 0xff ; GFX6-NEXT: s_bfe_u32 s2, s2, 0x80008 -; GFX6-NEXT: v_mul_lo_u32 v2, v0, v1 ; GFX6-NEXT: s_lshl_b32 s2, s2, 8 ; GFX6-NEXT: s_and_b32 s8, s8, 0xff ; GFX6-NEXT: s_or_b32 s2, s11, s2 ; GFX6-NEXT: s_and_b32 s8, 0xffff, s8 +; GFX6-NEXT: v_mov_b32_e32 v1, 0xffffffe8 ; GFX6-NEXT: s_lshr_b32 s10, s3, 8 ; GFX6-NEXT: s_and_b32 s2, 0xffff, s2 ; GFX6-NEXT: s_lshl_b32 s8, s8, 16 ; GFX6-NEXT: s_and_b32 s3, s3, 0xff +; GFX6-NEXT: v_mul_lo_u32 v1, v0, v1 ; GFX6-NEXT: s_or_b32 s2, s2, s8 ; GFX6-NEXT: s_lshl_b32 s3, s3, 8 ; GFX6-NEXT: s_and_b32 s8, s10, 0xff -; GFX6-NEXT: v_mul_hi_u32 v2, v0, v2 ; GFX6-NEXT: s_or_b32 s3, s9, s3 ; GFX6-NEXT: s_and_b32 s8, 0xffff, s8 ; GFX6-NEXT: s_and_b32 s3, 0xffff, s3 @@ -1871,103 +1865,95 @@ ; GFX6-NEXT: s_lshr_b32 s9, s4, 24 ; GFX6-NEXT: s_and_b32 s11, s4, 0xff ; GFX6-NEXT: s_bfe_u32 s4, s4, 0x80008 +; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX6-NEXT: s_lshl_b32 s4, s4, 8 ; GFX6-NEXT: s_and_b32 s8, s8, 0xff -; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v2, 24 ; GFX6-NEXT: s_or_b32 s4, s11, s4 ; GFX6-NEXT: s_and_b32 s8, 0xffff, s8 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; GFX6-NEXT: s_and_b32 s4, 0xffff, s4 ; GFX6-NEXT: s_lshl_b32 s8, s8, 16 ; GFX6-NEXT: s_or_b32 s4, s4, s8 -; GFX6-NEXT: v_mul_hi_u32 v0, s4, v0 -; GFX6-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 -; GFX6-NEXT: v_cvt_u32_f32_e32 v2, v2 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; GFX6-NEXT: v_mul_hi_u32 v1, s4, v0 ; GFX6-NEXT: s_lshr_b32 s10, s5, 8 -; GFX6-NEXT: v_mul_lo_u32 v0, v0, 24 ; GFX6-NEXT: s_and_b32 s5, s5, 0xff -; GFX6-NEXT: v_mul_lo_u32 v1, v2, v1 ; GFX6-NEXT: s_lshl_b32 s5, s5, 8 -; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 -; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, 24, v0 -; GFX6-NEXT: v_cmp_le_u32_e32 vcc, 24, v0 -; GFX6-NEXT: v_mul_hi_u32 v1, v2, v1 +; GFX6-NEXT: v_mul_lo_u32 v1, v1, 24 ; GFX6-NEXT: s_and_b32 s8, s10, 0xff -; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; GFX6-NEXT: s_or_b32 s5, s9, s5 ; GFX6-NEXT: s_and_b32 s8, 0xffff, s8 -; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, 24, v0 ; GFX6-NEXT: s_and_b32 s5, 0xffff, s5 ; GFX6-NEXT: s_lshl_b32 s8, s8, 16 -; GFX6-NEXT: v_cmp_le_u32_e32 vcc, 24, v0 ; GFX6-NEXT: s_or_b32 s5, s5, s8 -; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX6-NEXT: v_add_i32_e32 v1, vcc, v2, v1 -; GFX6-NEXT: v_mul_hi_u32 v1, s5, v1 +; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s4, v1 +; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, 24, v1 +; GFX6-NEXT: v_mul_hi_u32 v0, s5, v0 +; GFX6-NEXT: v_cmp_le_u32_e32 vcc, 24, v1 +; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, 24, v1 ; 
GFX6-NEXT: s_and_b32 s6, s6, 0xff +; GFX6-NEXT: v_cmp_le_u32_e32 vcc, 24, v1 +; GFX6-NEXT: v_mul_lo_u32 v0, v0, 24 ; GFX6-NEXT: s_and_b32 s0, 0xffff, s0 ; GFX6-NEXT: s_and_b32 s6, 0xffff, s6 -; GFX6-NEXT: v_mul_lo_u32 v1, v1, 24 -; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 23, v0 +; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GFX6-NEXT: v_sub_i32_e32 v2, vcc, 23, v1 ; GFX6-NEXT: s_lshl_b32 s4, s6, 17 ; GFX6-NEXT: s_lshl_b32 s0, s0, 1 ; GFX6-NEXT: s_or_b32 s0, s4, s0 -; GFX6-NEXT: v_and_b32_e32 v2, 0xffffff, v3 -; GFX6-NEXT: v_and_b32_e32 v0, 0xffffff, v0 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffffff, v2 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffffff, v1 ; GFX6-NEXT: v_lshl_b32_e32 v2, s0, v2 -; GFX6-NEXT: v_lshr_b32_e32 v0, s2, v0 -; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s5, v1 -; GFX6-NEXT: v_or_b32_e32 v0, v2, v0 -; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, 24, v1 -; GFX6-NEXT: v_cmp_le_u32_e32 vcc, 24, v1 -; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, 24, v1 -; GFX6-NEXT: v_cmp_le_u32_e32 vcc, 24, v1 +; GFX6-NEXT: v_lshr_b32_e32 v1, s2, v1 +; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s5, v0 +; GFX6-NEXT: v_or_b32_e32 v1, v2, v1 +; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, 24, v0 +; GFX6-NEXT: v_cmp_le_u32_e32 vcc, 24, v0 +; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, 24, v0 +; GFX6-NEXT: v_cmp_le_u32_e32 vcc, 24, v0 ; GFX6-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX6-NEXT: s_and_b32 s7, 0xffff, s7 -; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GFX6-NEXT: v_sub_i32_e32 v2, vcc, 23, v1 +; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX6-NEXT: v_sub_i32_e32 v2, vcc, 23, v0 ; GFX6-NEXT: s_lshl_b32 s0, s7, 17 ; GFX6-NEXT: s_lshl_b32 s1, s1, 1 ; GFX6-NEXT: s_or_b32 s0, s0, s1 ; GFX6-NEXT: v_and_b32_e32 v2, 0xffffff, v2 -; GFX6-NEXT: v_and_b32_e32 v1, 0xffffff, v1 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffffff, v0 ; GFX6-NEXT: v_lshl_b32_e32 v2, s0, v2 -; GFX6-NEXT: v_lshr_b32_e32 v1, s3, v1 -; GFX6-NEXT: v_bfe_u32 v3, v0, 8, 8 -; GFX6-NEXT: v_or_b32_e32 v1, v2, v1 -; GFX6-NEXT: v_and_b32_e32 v2, 0xff, v0 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GFX6-NEXT: v_bfe_u32 v0, v0, 16, 8 -; GFX6-NEXT: v_or_b32_e32 v2, v2, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX6-NEXT: v_lshr_b32_e32 v0, s3, v0 +; GFX6-NEXT: v_bfe_u32 v3, v1, 8, 8 ; GFX6-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX6-NEXT: v_and_b32_e32 v2, 0xff, v1 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 24, v2 -; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX6-NEXT: v_bfe_u32 v2, v1, 8, 8 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 8, v3 ; GFX6-NEXT: v_bfe_u32 v1, v1, 16, 8 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; GFX6-NEXT: v_or_b32_e32 v2, v2, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_or_b32_e32 v1, v2, v1 -; GFX6-NEXT: v_readfirstlane_b32 s0, v0 -; GFX6-NEXT: v_readfirstlane_b32 s1, v1 +; GFX6-NEXT: v_and_b32_e32 v2, 0xff, v0 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 24, v2 +; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX6-NEXT: v_bfe_u32 v2, v0, 8, 8 +; GFX6-NEXT: v_bfe_u32 v0, v0, 16, 8 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; GFX6-NEXT: v_or_b32_e32 v0, v2, v0 +; GFX6-NEXT: v_readfirstlane_b32 s0, v1 +; GFX6-NEXT: v_readfirstlane_b32 s1, v0 ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_fshr_v2i24: ; GFX8: ; %bb.0: ; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v0, 24 -; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX8-NEXT: s_lshr_b32 s9, s1, 8 ; GFX8-NEXT: s_and_b32 s1, s1, 0xff +; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX8-NEXT: s_lshr_b32 s6, s0, 8 ; GFX8-NEXT: s_lshr_b32 s8, s0, 24 ; 
GFX8-NEXT: s_lshl_b32 s1, s1, 8 -; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX8-NEXT: s_and_b32 s6, s6, 0xff ; GFX8-NEXT: s_or_b32 s1, s8, s1 ; GFX8-NEXT: s_lshr_b32 s8, s2, 8 -; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX8-NEXT: s_lshr_b32 s7, s0, 16 ; GFX8-NEXT: s_and_b32 s0, s0, 0xff ; GFX8-NEXT: s_lshl_b32 s6, s6, 8 @@ -1979,11 +1965,11 @@ ; GFX8-NEXT: s_lshr_b32 s10, s2, 24 ; GFX8-NEXT: s_and_b32 s2, s2, 0xff ; GFX8-NEXT: s_lshl_b32 s8, s8, 8 +; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX8-NEXT: s_or_b32 s2, s2, s8 ; GFX8-NEXT: s_and_b32 s8, s9, 0xff -; GFX8-NEXT: v_mov_b32_e32 v1, 0xffffffe8 +; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX8-NEXT: s_and_b32 s8, 0xffff, s8 -; GFX8-NEXT: v_mul_lo_u32 v2, v0, v1 ; GFX8-NEXT: s_lshr_b32 s11, s3, 8 ; GFX8-NEXT: s_and_b32 s2, 0xffff, s2 ; GFX8-NEXT: s_lshl_b32 s8, s8, 16 @@ -1991,11 +1977,12 @@ ; GFX8-NEXT: s_or_b32 s2, s2, s8 ; GFX8-NEXT: s_lshl_b32 s3, s3, 8 ; GFX8-NEXT: s_and_b32 s8, s11, 0xff +; GFX8-NEXT: v_mov_b32_e32 v1, 0xffffffe8 ; GFX8-NEXT: s_or_b32 s3, s10, s3 ; GFX8-NEXT: s_and_b32 s8, 0xffff, s8 +; GFX8-NEXT: v_mul_lo_u32 v1, v0, v1 ; GFX8-NEXT: s_and_b32 s3, 0xffff, s3 ; GFX8-NEXT: s_lshl_b32 s8, s8, 16 -; GFX8-NEXT: v_mul_hi_u32 v2, v0, v2 ; GFX8-NEXT: s_or_b32 s3, s3, s8 ; GFX8-NEXT: s_lshr_b32 s8, s4, 8 ; GFX8-NEXT: s_and_b32 s8, s8, 0xff @@ -2003,220 +1990,203 @@ ; GFX8-NEXT: s_lshr_b32 s10, s4, 24 ; GFX8-NEXT: s_and_b32 s4, s4, 0xff ; GFX8-NEXT: s_lshl_b32 s8, s8, 8 +; GFX8-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX8-NEXT: s_or_b32 s4, s4, s8 ; GFX8-NEXT: s_and_b32 s8, s9, 0xff -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 -; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v2, 24 ; GFX8-NEXT: s_and_b32 s8, 0xffff, s8 -; GFX8-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; GFX8-NEXT: s_and_b32 s4, 0xffff, s4 ; GFX8-NEXT: s_lshl_b32 s8, s8, 16 ; GFX8-NEXT: s_or_b32 s4, s4, s8 -; GFX8-NEXT: v_mul_hi_u32 v0, s4, v0 -; GFX8-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 -; GFX8-NEXT: v_cvt_u32_f32_e32 v2, v2 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 +; GFX8-NEXT: v_mul_hi_u32 v1, s4, v0 ; GFX8-NEXT: s_lshr_b32 s11, s5, 8 -; GFX8-NEXT: v_mul_lo_u32 v0, v0, 24 ; GFX8-NEXT: s_and_b32 s5, s5, 0xff -; GFX8-NEXT: v_mul_lo_u32 v1, v2, v1 ; GFX8-NEXT: s_lshl_b32 s5, s5, 8 -; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s4, v0 -; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, 24, v0 -; GFX8-NEXT: v_cmp_le_u32_e32 vcc, 24, v0 -; GFX8-NEXT: v_mul_hi_u32 v1, v2, v1 +; GFX8-NEXT: v_mul_lo_u32 v1, v1, 24 ; GFX8-NEXT: s_and_b32 s8, s11, 0xff -; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; GFX8-NEXT: s_or_b32 s5, s10, s5 ; GFX8-NEXT: s_and_b32 s8, 0xffff, s8 -; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, 24, v0 ; GFX8-NEXT: s_and_b32 s5, 0xffff, s5 ; GFX8-NEXT: s_lshl_b32 s8, s8, 16 -; GFX8-NEXT: v_cmp_le_u32_e32 vcc, 24, v0 ; GFX8-NEXT: s_or_b32 s5, s5, s8 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX8-NEXT: v_add_u32_e32 v1, vcc, v2, v1 -; GFX8-NEXT: v_mul_hi_u32 v1, s5, v1 +; GFX8-NEXT: v_sub_u32_e32 v1, vcc, s4, v1 +; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, 24, v1 +; GFX8-NEXT: v_mul_hi_u32 v0, s5, v0 +; GFX8-NEXT: v_cmp_le_u32_e32 vcc, 24, v1 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, 24, v1 +; GFX8-NEXT: v_cmp_le_u32_e32 vcc, 24, v1 +; GFX8-NEXT: v_mul_lo_u32 v0, v0, 24 ; GFX8-NEXT: s_and_b32 s0, 0xffff, s0 ; GFX8-NEXT: s_and_b32 s6, 0xffff, s6 -; GFX8-NEXT: v_sub_u32_e32 v3, vcc, 23, v0 -; GFX8-NEXT: v_mul_lo_u32 v1, v1, 24 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 23, v1 ; GFX8-NEXT: s_lshl_b32 s4, s6, 
17 ; GFX8-NEXT: s_lshl_b32 s0, s0, 1 ; GFX8-NEXT: s_or_b32 s0, s4, s0 -; GFX8-NEXT: v_and_b32_e32 v2, 0xffffff, v3 -; GFX8-NEXT: v_and_b32_e32 v0, 0xffffff, v0 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffffff, v2 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffffff, v1 ; GFX8-NEXT: v_lshlrev_b32_e64 v2, v2, s0 -; GFX8-NEXT: v_lshrrev_b32_e64 v0, v0, s2 -; GFX8-NEXT: v_sub_u32_e32 v1, vcc, s5, v1 -; GFX8-NEXT: v_or_b32_e32 v0, v2, v0 -; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, 24, v1 -; GFX8-NEXT: v_cmp_le_u32_e32 vcc, 24, v1 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, 24, v1 -; GFX8-NEXT: v_cmp_le_u32_e32 vcc, 24, v1 +; GFX8-NEXT: v_lshrrev_b32_e64 v1, v1, s2 +; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s5, v0 +; GFX8-NEXT: v_or_b32_e32 v1, v2, v1 +; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, 24, v0 +; GFX8-NEXT: v_cmp_le_u32_e32 vcc, 24, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, 24, v0 +; GFX8-NEXT: v_cmp_le_u32_e32 vcc, 24, v0 ; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX8-NEXT: s_and_b32 s7, 0xffff, s7 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 23, v1 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 23, v0 ; GFX8-NEXT: s_lshl_b32 s0, s7, 17 ; GFX8-NEXT: s_lshl_b32 s1, s1, 1 ; GFX8-NEXT: s_or_b32 s0, s0, s1 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffffff, v2 -; GFX8-NEXT: v_and_b32_e32 v1, 0xffffff, v1 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffffff, v0 ; GFX8-NEXT: v_lshlrev_b32_e64 v2, v2, s0 -; GFX8-NEXT: v_lshrrev_b32_e64 v1, v1, s3 -; GFX8-NEXT: v_or_b32_e32 v1, v2, v1 +; GFX8-NEXT: v_lshrrev_b32_e64 v0, v0, s3 +; GFX8-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX8-NEXT: v_mov_b32_e32 v2, 8 -; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX8-NEXT: v_mov_b32_e32 v4, 16 -; GFX8-NEXT: v_or_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX8-NEXT: v_or_b32_e32 v0, v3, v0 -; GFX8-NEXT: v_and_b32_e32 v3, 0xff, v1 +; GFX8-NEXT: v_or_b32_sdwa v3, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX8-NEXT: v_or_b32_e32 v1, v3, v1 +; GFX8-NEXT: v_and_b32_e32 v3, 0xff, v0 ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v3 -; GFX8-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD -; GFX8-NEXT: v_readfirstlane_b32 s0, v0 -; GFX8-NEXT: v_readfirstlane_b32 s1, v1 +; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v3 +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX8-NEXT: v_readfirstlane_b32 s0, v1 +; GFX8-NEXT: v_readfirstlane_b32 s1, v0 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_fshr_v2i24: ; GFX9: ; %bb.0: ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, 24 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX9-NEXT: v_mov_b32_e32 v1, 0xffffffe8 -; GFX9-NEXT: s_lshr_b32 s11, s1, 8 +; 
GFX9-NEXT: s_lshr_b32 s9, s1, 8 ; GFX9-NEXT: s_and_b32 s1, s1, 0xff -; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 -; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX9-NEXT: s_lshr_b32 s7, s0, 8 -; GFX9-NEXT: s_lshr_b32 s10, s0, 24 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX9-NEXT: s_lshr_b32 s6, s0, 8 +; GFX9-NEXT: s_lshr_b32 s8, s0, 24 ; GFX9-NEXT: s_lshl_b32 s1, s1, 8 -; GFX9-NEXT: v_mul_lo_u32 v2, v0, v1 -; GFX9-NEXT: s_and_b32 s7, s7, 0xff -; GFX9-NEXT: s_or_b32 s1, s10, s1 -; GFX9-NEXT: s_lshr_b32 s10, s2, 8 -; GFX9-NEXT: v_mul_hi_u32 v2, v0, v2 -; GFX9-NEXT: s_lshr_b32 s9, s0, 16 +; GFX9-NEXT: s_and_b32 s6, s6, 0xff +; GFX9-NEXT: s_or_b32 s1, s8, s1 +; GFX9-NEXT: s_lshr_b32 s8, s2, 8 +; GFX9-NEXT: s_lshr_b32 s7, s0, 16 ; GFX9-NEXT: s_and_b32 s0, s0, 0xff -; GFX9-NEXT: s_lshl_b32 s7, s7, 8 -; GFX9-NEXT: s_and_b32 s10, s10, 0xff -; GFX9-NEXT: s_or_b32 s0, s0, s7 +; GFX9-NEXT: s_lshl_b32 s6, s6, 8 +; GFX9-NEXT: s_and_b32 s8, s8, 0xff +; GFX9-NEXT: s_or_b32 s0, s0, s6 +; GFX9-NEXT: s_and_b32 s6, s7, 0xff ; GFX9-NEXT: s_and_b32 s7, s9, 0xff -; GFX9-NEXT: s_and_b32 s9, s11, 0xff -; GFX9-NEXT: s_lshr_b32 s11, s2, 16 -; GFX9-NEXT: s_lshr_b32 s12, s2, 24 +; GFX9-NEXT: s_lshr_b32 s9, s2, 16 +; GFX9-NEXT: s_lshr_b32 s10, s2, 24 ; GFX9-NEXT: s_and_b32 s2, s2, 0xff -; GFX9-NEXT: s_lshl_b32 s10, s10, 8 -; GFX9-NEXT: s_or_b32 s2, s2, s10 -; GFX9-NEXT: s_and_b32 s10, s11, 0xff -; GFX9-NEXT: v_add_u32_e32 v0, v0, v2 -; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v2, 24 -; GFX9-NEXT: s_and_b32 s10, 0xffff, s10 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v2 -; GFX9-NEXT: s_lshr_b32 s13, s3, 8 +; GFX9-NEXT: s_lshl_b32 s8, s8, 8 +; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 +; GFX9-NEXT: s_or_b32 s2, s2, s8 +; GFX9-NEXT: s_and_b32 s8, s9, 0xff +; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GFX9-NEXT: s_and_b32 s8, 0xffff, s8 +; GFX9-NEXT: s_lshr_b32 s11, s3, 8 ; GFX9-NEXT: s_and_b32 s2, 0xffff, s2 -; GFX9-NEXT: s_lshl_b32 s10, s10, 16 +; GFX9-NEXT: s_lshl_b32 s8, s8, 16 ; GFX9-NEXT: s_and_b32 s3, s3, 0xff -; GFX9-NEXT: s_or_b32 s2, s2, s10 +; GFX9-NEXT: s_or_b32 s2, s2, s8 ; GFX9-NEXT: s_lshl_b32 s3, s3, 8 -; GFX9-NEXT: s_and_b32 s10, s13, 0xff -; GFX9-NEXT: s_or_b32 s3, s12, s3 -; GFX9-NEXT: s_and_b32 s10, 0xffff, s10 +; GFX9-NEXT: s_and_b32 s8, s11, 0xff +; GFX9-NEXT: v_mov_b32_e32 v1, 0xffffffe8 +; GFX9-NEXT: s_or_b32 s3, s10, s3 +; GFX9-NEXT: s_and_b32 s8, 0xffff, s8 +; GFX9-NEXT: v_mul_lo_u32 v1, v0, v1 ; GFX9-NEXT: s_and_b32 s3, 0xffff, s3 -; GFX9-NEXT: s_lshl_b32 s10, s10, 16 -; GFX9-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 -; GFX9-NEXT: s_or_b32 s3, s3, s10 -; GFX9-NEXT: s_lshr_b32 s10, s4, 8 -; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2 -; GFX9-NEXT: s_and_b32 s10, s10, 0xff -; GFX9-NEXT: s_lshr_b32 s11, s4, 16 -; GFX9-NEXT: s_lshr_b32 s12, s4, 24 +; GFX9-NEXT: s_lshl_b32 s8, s8, 16 +; GFX9-NEXT: s_or_b32 s3, s3, s8 +; GFX9-NEXT: s_lshr_b32 s8, s4, 8 +; GFX9-NEXT: s_and_b32 s8, s8, 0xff +; GFX9-NEXT: s_lshr_b32 s9, s4, 16 +; GFX9-NEXT: s_lshr_b32 s10, s4, 24 ; GFX9-NEXT: s_and_b32 s4, s4, 0xff -; GFX9-NEXT: s_lshl_b32 s10, s10, 8 -; GFX9-NEXT: s_or_b32 s4, s4, s10 -; GFX9-NEXT: s_and_b32 s10, s11, 0xff -; GFX9-NEXT: s_and_b32 s10, 0xffff, s10 -; GFX9-NEXT: v_mul_lo_u32 v1, v2, v1 +; GFX9-NEXT: s_lshl_b32 s8, s8, 8 +; GFX9-NEXT: v_mul_hi_u32 v1, v0, v1 +; GFX9-NEXT: s_or_b32 s4, s4, s8 +; GFX9-NEXT: s_and_b32 s8, s9, 0xff +; GFX9-NEXT: s_and_b32 s8, 0xffff, s8 ; GFX9-NEXT: s_and_b32 s4, 0xffff, s4 -; GFX9-NEXT: s_lshl_b32 s10, s10, 16 -; GFX9-NEXT: s_or_b32 s4, s4, s10 -; GFX9-NEXT: v_mul_hi_u32 v0, s4, v0 -; GFX9-NEXT: 
s_lshr_b32 s13, s5, 8 +; GFX9-NEXT: s_lshl_b32 s8, s8, 16 +; GFX9-NEXT: s_or_b32 s4, s4, s8 +; GFX9-NEXT: v_add_u32_e32 v0, v0, v1 +; GFX9-NEXT: v_mul_hi_u32 v1, s4, v0 +; GFX9-NEXT: s_lshr_b32 s11, s5, 8 ; GFX9-NEXT: s_and_b32 s5, s5, 0xff -; GFX9-NEXT: v_mul_hi_u32 v1, v2, v1 ; GFX9-NEXT: s_lshl_b32 s5, s5, 8 -; GFX9-NEXT: s_and_b32 s10, s13, 0xff -; GFX9-NEXT: s_or_b32 s5, s12, s5 -; GFX9-NEXT: s_and_b32 s10, 0xffff, s10 +; GFX9-NEXT: s_and_b32 s8, s11, 0xff +; GFX9-NEXT: s_or_b32 s5, s10, s5 +; GFX9-NEXT: s_and_b32 s8, 0xffff, s8 ; GFX9-NEXT: s_and_b32 s5, 0xffff, s5 -; GFX9-NEXT: s_lshl_b32 s10, s10, 16 -; GFX9-NEXT: v_mul_lo_u32 v0, v0, 24 -; GFX9-NEXT: s_or_b32 s5, s5, s10 -; GFX9-NEXT: v_add_u32_e32 v1, v2, v1 -; GFX9-NEXT: v_mul_hi_u32 v1, s5, v1 -; GFX9-NEXT: v_sub_u32_e32 v0, s4, v0 -; GFX9-NEXT: v_subrev_u32_e32 v3, 24, v0 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, 24, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; GFX9-NEXT: v_mul_lo_u32 v1, v1, 24 -; GFX9-NEXT: v_subrev_u32_e32 v3, 24, v0 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, 24, v0 -; GFX9-NEXT: s_and_b32 s0, 0xffff, s0 -; GFX9-NEXT: s_and_b32 s7, 0xffff, s7 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX9-NEXT: v_sub_u32_e32 v3, 23, v0 -; GFX9-NEXT: s_lshl_b32 s4, s7, 17 -; GFX9-NEXT: s_lshl_b32 s0, s0, 1 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffffff, v0 -; GFX9-NEXT: s_or_b32 s0, s4, s0 -; GFX9-NEXT: v_and_b32_e32 v2, 0xffffff, v3 -; GFX9-NEXT: v_lshrrev_b32_e64 v0, v0, s2 -; GFX9-NEXT: v_sub_u32_e32 v1, s5, v1 -; GFX9-NEXT: v_lshl_or_b32 v0, s0, v2, v0 +; GFX9-NEXT: s_lshl_b32 s8, s8, 16 +; GFX9-NEXT: s_or_b32 s5, s5, s8 +; GFX9-NEXT: v_mul_hi_u32 v0, s5, v0 +; GFX9-NEXT: v_sub_u32_e32 v1, s4, v1 ; GFX9-NEXT: v_subrev_u32_e32 v2, 24, v1 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, 24, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GFX9-NEXT: v_mul_lo_u32 v0, v0, 24 ; GFX9-NEXT: v_subrev_u32_e32 v2, 24, v1 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, 24, v1 -; GFX9-NEXT: s_and_b32 s1, 0xffff, s1 -; GFX9-NEXT: s_and_b32 s9, 0xffff, s9 +; GFX9-NEXT: s_and_b32 s0, 0xffff, s0 +; GFX9-NEXT: s_and_b32 s6, 0xffff, s6 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc ; GFX9-NEXT: v_sub_u32_e32 v2, 23, v1 -; GFX9-NEXT: s_lshl_b32 s0, s9, 17 -; GFX9-NEXT: s_lshl_b32 s1, s1, 1 +; GFX9-NEXT: s_lshl_b32 s4, s6, 17 +; GFX9-NEXT: s_lshl_b32 s0, s0, 1 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffffff, v1 -; GFX9-NEXT: s_or_b32 s0, s0, s1 -; GFX9-NEXT: v_and_b32_e32 v2, 0xffffff, v2 -; GFX9-NEXT: v_lshrrev_b32_e64 v1, v1, s3 -; GFX9-NEXT: s_mov_b32 s6, 8 +; GFX9-NEXT: s_or_b32 s0, s4, s0 +; GFX9-NEXT: v_lshrrev_b32_e64 v1, v1, s2 +; GFX9-NEXT: v_sub_u32_e32 v0, s5, v0 ; GFX9-NEXT: v_lshl_or_b32 v1, s0, v2, v1 -; GFX9-NEXT: s_mov_b32 s8, 16 -; GFX9-NEXT: s_movk_i32 s0, 0xff -; GFX9-NEXT: v_lshlrev_b32_sdwa v2, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_and_b32_e32 v3, 0xff, v1 -; GFX9-NEXT: v_and_or_b32 v2, v0, s0, v2 -; GFX9-NEXT: v_lshlrev_b32_sdwa v0, s8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_subrev_u32_e32 v2, 24, v0 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, 24, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX9-NEXT: v_subrev_u32_e32 v2, 24, v0 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, 24, v0 +; GFX9-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX9-NEXT: s_and_b32 s7, 0xffff, s7 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX9-NEXT: v_sub_u32_e32 v2, 23, v0 +; GFX9-NEXT: s_lshl_b32 s0, s7, 17 +; GFX9-NEXT: s_lshl_b32 s1, s1, 1 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffffff, v0 +; 
GFX9-NEXT: s_or_b32 s0, s0, s1 +; GFX9-NEXT: v_lshrrev_b32_e64 v0, v0, s3 +; GFX9-NEXT: v_mov_b32_e32 v3, 8 +; GFX9-NEXT: v_lshl_or_b32 v0, s0, v2, v0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0xff +; GFX9-NEXT: v_lshlrev_b32_sdwa v3, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_and_or_b32 v2, v1, v2, v3 +; GFX9-NEXT: v_mov_b32_e32 v3, 16 +; GFX9-NEXT: v_lshlrev_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_and_b32_e32 v3, 0xff, v0 ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GFX9-NEXT: v_or3_b32 v0, v2, v0, v3 -; GFX9-NEXT: v_bfe_u32 v2, v1, 8, 8 -; GFX9-NEXT: v_bfe_u32 v1, v1, 16, 8 -; GFX9-NEXT: v_lshl_or_b32 v1, v1, 8, v2 -; GFX9-NEXT: v_readfirstlane_b32 s0, v0 -; GFX9-NEXT: v_readfirstlane_b32 s1, v1 +; GFX9-NEXT: v_or3_b32 v1, v2, v1, v3 +; GFX9-NEXT: v_bfe_u32 v2, v0, 8, 8 +; GFX9-NEXT: v_bfe_u32 v0, v0, 16, 8 +; GFX9-NEXT: v_lshl_or_b32 v0, v0, 8, v2 +; GFX9-NEXT: v_readfirstlane_b32 s0, v1 +; GFX9-NEXT: v_readfirstlane_b32 s1, v0 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_fshr_v2i24: ; GFX10: ; %bb.0: ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, 24 -; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v1, 24 ; GFX10-NEXT: s_lshr_b32 s9, s1, 8 ; GFX10-NEXT: s_and_b32 s1, s1, 0xff ; GFX10-NEXT: s_lshr_b32 s6, s0, 8 -; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX10-NEXT: v_rcp_iflag_f32_e32 v1, v1 ; GFX10-NEXT: s_lshr_b32 s8, s0, 24 +; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX10-NEXT: s_lshl_b32 s1, s1, 8 ; GFX10-NEXT: s_and_b32 s6, s6, 0xff ; GFX10-NEXT: s_or_b32 s1, s8, s1 @@ -2224,246 +2194,230 @@ ; GFX10-NEXT: s_lshr_b32 s7, s0, 16 ; GFX10-NEXT: s_and_b32 s0, s0, 0xff ; GFX10-NEXT: s_lshl_b32 s6, s6, 8 -; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 -; GFX10-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 ; GFX10-NEXT: s_and_b32 s8, s8, 0xff ; GFX10-NEXT: s_or_b32 s0, s0, s6 +; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX10-NEXT: s_and_b32 s6, s7, 0xff -; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX10-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX10-NEXT: s_and_b32 s7, s9, 0xff ; GFX10-NEXT: s_lshr_b32 s9, s4, 16 ; GFX10-NEXT: s_lshr_b32 s10, s4, 24 -; GFX10-NEXT: v_mul_lo_u32 v2, 0xffffffe8, v0 -; GFX10-NEXT: v_mul_lo_u32 v3, 0xffffffe8, v1 +; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX10-NEXT: s_and_b32 s4, s4, 0xff ; GFX10-NEXT: s_lshl_b32 s8, s8, 8 ; GFX10-NEXT: s_lshr_b32 s11, s5, 8 ; GFX10-NEXT: s_or_b32 s4, s4, s8 +; GFX10-NEXT: v_mul_lo_u32 v1, 0xffffffe8, v0 ; GFX10-NEXT: s_and_b32 s8, s9, 0xff ; GFX10-NEXT: s_and_b32 s4, 0xffff, s4 -; GFX10-NEXT: v_mul_hi_u32 v2, v0, v2 -; GFX10-NEXT: v_mul_hi_u32 v3, v1, v3 ; GFX10-NEXT: s_and_b32 s8, 0xffff, s8 ; GFX10-NEXT: s_and_b32 s5, s5, 0xff ; GFX10-NEXT: s_lshl_b32 s8, s8, 16 ; GFX10-NEXT: s_lshl_b32 s5, s5, 8 ; GFX10-NEXT: s_or_b32 s4, s4, s8 +; GFX10-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX10-NEXT: s_and_b32 s8, s11, 0xff -; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v2 ; GFX10-NEXT: s_or_b32 s5, s10, s5 ; GFX10-NEXT: s_and_b32 s8, 0xffff, s8 -; GFX10-NEXT: v_add_nc_u32_e32 v1, v1, v3 ; GFX10-NEXT: s_and_b32 s5, 0xffff, s5 -; GFX10-NEXT: v_mul_hi_u32 v0, s4, v0 ; GFX10-NEXT: s_lshl_b32 s8, s8, 16 -; GFX10-NEXT: s_lshr_b32 s9, s2, 8 +; GFX10-NEXT: s_lshr_b32 s9, s2, 16 ; GFX10-NEXT: s_or_b32 s5, s5, s8 -; GFX10-NEXT: s_lshr_b32 s8, s2, 16 -; GFX10-NEXT: v_mul_hi_u32 v1, s5, v1 -; GFX10-NEXT: s_and_b32 s9, s9, 0xff +; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v1 +; GFX10-NEXT: s_lshr_b32 s8, s2, 8 ; GFX10-NEXT: s_lshr_b32 s10, s2, 24 -; GFX10-NEXT: v_mul_lo_u32 v0, 
v0, 24 -; GFX10-NEXT: s_lshr_b32 s11, s3, 8 -; GFX10-NEXT: s_and_b32 s2, s2, 0xff -; GFX10-NEXT: s_lshl_b32 s9, s9, 8 ; GFX10-NEXT: s_and_b32 s8, s8, 0xff +; GFX10-NEXT: s_and_b32 s2, s2, 0xff +; GFX10-NEXT: v_mul_hi_u32 v1, s4, v0 +; GFX10-NEXT: v_mul_hi_u32 v0, s5, v0 +; GFX10-NEXT: s_lshl_b32 s8, s8, 8 +; GFX10-NEXT: s_lshr_b32 s11, s3, 8 +; GFX10-NEXT: s_or_b32 s2, s2, s8 +; GFX10-NEXT: s_and_b32 s8, s9, 0xff +; GFX10-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX10-NEXT: s_and_b32 s8, 0xffff, s8 ; GFX10-NEXT: v_mul_lo_u32 v1, v1, 24 +; GFX10-NEXT: v_mul_lo_u32 v0, v0, 24 ; GFX10-NEXT: s_and_b32 s3, s3, 0xff -; GFX10-NEXT: s_or_b32 s2, s2, s9 -; GFX10-NEXT: v_sub_nc_u32_e32 v0, s4, v0 -; GFX10-NEXT: s_and_b32 s4, 0xffff, s8 +; GFX10-NEXT: s_and_b32 s0, 0xffff, s0 ; GFX10-NEXT: s_lshl_b32 s3, s3, 8 -; GFX10-NEXT: s_and_b32 s2, 0xffff, s2 -; GFX10-NEXT: s_lshl_b32 s4, s4, 16 -; GFX10-NEXT: v_subrev_nc_u32_e32 v2, 24, v0 -; GFX10-NEXT: v_sub_nc_u32_e32 v1, s5, v1 -; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v0 -; GFX10-NEXT: s_and_b32 s5, s11, 0xff +; GFX10-NEXT: s_and_b32 s6, 0xffff, s6 ; GFX10-NEXT: s_or_b32 s3, s10, s3 -; GFX10-NEXT: s_and_b32 s5, 0xffff, s5 +; GFX10-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX10-NEXT: v_sub_nc_u32_e32 v1, s4, v1 +; GFX10-NEXT: v_sub_nc_u32_e32 v0, s5, v0 +; GFX10-NEXT: s_lshl_b32 s4, s8, 16 ; GFX10-NEXT: s_and_b32 s3, 0xffff, s3 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo +; GFX10-NEXT: s_or_b32 s2, s2, s4 ; GFX10-NEXT: v_subrev_nc_u32_e32 v2, 24, v1 ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v1 -; GFX10-NEXT: s_lshl_b32 s5, s5, 16 -; GFX10-NEXT: s_or_b32 s2, s2, s4 ; GFX10-NEXT: v_subrev_nc_u32_e32 v3, 24, v0 -; GFX10-NEXT: s_and_b32 s0, 0xffff, s0 +; GFX10-NEXT: s_and_b32 s4, s11, 0xff +; GFX10-NEXT: s_and_b32 s7, 0xffff, s7 +; GFX10-NEXT: s_and_b32 s4, 0xffff, s4 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v0 -; GFX10-NEXT: s_and_b32 s6, 0xffff, s6 -; GFX10-NEXT: s_or_b32 s3, s3, s5 -; GFX10-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX10-NEXT: s_lshl_b32 s4, s4, 16 +; GFX10-NEXT: s_lshl_b32 s0, s0, 1 +; GFX10-NEXT: s_or_b32 s3, s3, s4 ; GFX10-NEXT: v_subrev_nc_u32_e32 v2, 24, v1 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v1 -; GFX10-NEXT: s_and_b32 s7, 0xffff, s7 ; GFX10-NEXT: s_lshl_b32 s4, s6, 17 -; GFX10-NEXT: s_lshl_b32 s0, s0, 1 -; GFX10-NEXT: v_sub_nc_u32_e32 v3, 23, v0 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo -; GFX10-NEXT: v_and_b32_e32 v0, 0xffffff, v0 -; GFX10-NEXT: s_or_b32 s0, s4, s0 ; GFX10-NEXT: s_lshl_b32 s1, s1, 1 -; GFX10-NEXT: v_and_b32_e32 v3, 0xffffff, v3 -; GFX10-NEXT: v_sub_nc_u32_e32 v2, 23, v1 -; GFX10-NEXT: v_and_b32_e32 v1, 0xffffff, v1 -; GFX10-NEXT: v_lshrrev_b32_e64 v0, v0, s2 +; GFX10-NEXT: s_or_b32 s0, s4, s0 +; GFX10-NEXT: v_subrev_nc_u32_e32 v3, 24, v0 +; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v0 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffffff, v1 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo +; GFX10-NEXT: v_sub_nc_u32_e32 v1, 23, v1 +; GFX10-NEXT: v_lshrrev_b32_e64 v2, v2, s2 +; GFX10-NEXT: v_and_b32_e32 v3, 0xffffff, v0 +; GFX10-NEXT: v_sub_nc_u32_e32 v0, 23, v0 ; GFX10-NEXT: s_lshl_b32 s2, s7, 17 -; GFX10-NEXT: v_and_b32_e32 v2, 0xffffff, v2 -; GFX10-NEXT: v_lshrrev_b32_e64 v1, v1, s3 -; GFX10-NEXT: v_lshl_or_b32 v0, s0, v3, v0 +; GFX10-NEXT: v_lshl_or_b32 v1, s0, v1, v2 +; GFX10-NEXT: v_lshrrev_b32_e64 v3, v3, s3 ; GFX10-NEXT: s_or_b32 s0, s2, s1 -; GFX10-NEXT: 
v_lshl_or_b32 v1, s0, v2, v1 -; GFX10-NEXT: s_mov_b32 s0, 8 -; GFX10-NEXT: v_lshlrev_b32_sdwa v2, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX10-NEXT: s_mov_b32 s0, 16 -; GFX10-NEXT: v_and_b32_e32 v3, 0xff, v1 -; GFX10-NEXT: v_bfe_u32 v4, v1, 8, 8 -; GFX10-NEXT: v_bfe_u32 v1, v1, 16, 8 -; GFX10-NEXT: v_and_or_b32 v2, v0, 0xff, v2 -; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GFX10-NEXT: v_lshl_or_b32 v1, v1, 8, v4 -; GFX10-NEXT: v_or3_b32 v0, v2, v0, v3 -; GFX10-NEXT: v_readfirstlane_b32 s1, v1 -; GFX10-NEXT: v_readfirstlane_b32 s0, v0 +; GFX10-NEXT: v_mov_b32_e32 v2, 8 +; GFX10-NEXT: v_lshl_or_b32 v0, s0, v0, v3 +; GFX10-NEXT: v_mov_b32_e32 v3, 16 +; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX10-NEXT: v_and_b32_e32 v4, 0xff, v0 +; GFX10-NEXT: v_and_or_b32 v2, 0xff, v1, v2 +; GFX10-NEXT: v_lshlrev_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 24, v4 +; GFX10-NEXT: v_bfe_u32 v4, v0, 8, 8 +; GFX10-NEXT: v_bfe_u32 v0, v0, 16, 8 +; GFX10-NEXT: v_or3_b32 v1, v2, v1, v3 +; GFX10-NEXT: v_lshl_or_b32 v0, v0, 8, v4 +; GFX10-NEXT: v_readfirstlane_b32 s0, v1 +; GFX10-NEXT: v_readfirstlane_b32 s1, v0 ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: s_fshr_v2i24: ; GFX11: ; %bb.0: ; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v0, 24 -; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v1, 24 -; GFX11-NEXT: s_lshr_b32 s6, s0, 8 -; GFX11-NEXT: s_lshr_b32 s7, s0, 16 -; GFX11-NEXT: s_and_b32 s6, s6, 0xff +; GFX11-NEXT: s_lshr_b32 s14, s4, 8 +; GFX11-NEXT: s_lshr_b32 s15, s4, 16 +; GFX11-NEXT: s_and_b32 s14, s14, 0xff +; GFX11-NEXT: s_lshr_b32 s16, s4, 24 ; GFX11-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX11-NEXT: v_rcp_iflag_f32_e32 v1, v1 -; GFX11-NEXT: s_lshr_b32 s8, s0, 24 -; GFX11-NEXT: s_and_b32 s0, s0, 0xff -; GFX11-NEXT: s_lshl_b32 s6, s6, 8 -; GFX11-NEXT: s_lshr_b32 s9, s1, 8 -; GFX11-NEXT: s_or_b32 s0, s0, s6 -; GFX11-NEXT: s_and_b32 s6, s7, 0xff -; GFX11-NEXT: s_and_b32 s7, s9, 0xff -; GFX11-NEXT: s_lshr_b32 s9, s4, 8 -; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_dual_mul_f32 v0, 0x4f7ffffe, v0 :: v_dual_mul_f32 v1, 0x4f7ffffe, v1 -; GFX11-NEXT: s_lshr_b32 s10, s4, 16 -; GFX11-NEXT: s_and_b32 s9, s9, 0xff -; GFX11-NEXT: s_and_b32 s11, s4, 0xff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX11-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX11-NEXT: s_lshl_b32 s9, s9, 8 -; GFX11-NEXT: s_and_b32 s10, s10, 0xff -; GFX11-NEXT: s_or_b32 s9, s11, s9 -; GFX11-NEXT: v_mul_lo_u32 v2, 0xffffffe8, v0 -; GFX11-NEXT: v_mul_lo_u32 v3, 0xffffffe8, v1 -; GFX11-NEXT: s_and_b32 s10, 0xffff, s10 -; GFX11-NEXT: s_and_b32 s9, 0xffff, s9 -; GFX11-NEXT: s_lshl_b32 s10, s10, 16 -; GFX11-NEXT: s_lshr_b32 s11, s5, 8 -; GFX11-NEXT: s_or_b32 s9, s9, s10 +; GFX11-NEXT: s_and_b32 s4, s4, 0xff +; GFX11-NEXT: s_and_b32 s15, s15, 0xff +; GFX11-NEXT: s_lshl_b32 s14, s14, 8 +; GFX11-NEXT: s_and_b32 s15, 0xffff, s15 +; GFX11-NEXT: s_or_b32 s4, s4, s14 +; GFX11-NEXT: s_lshr_b32 s17, s5, 8 ; GFX11-NEXT: s_and_b32 s5, s5, 0xff -; GFX11-NEXT: v_mul_hi_u32 v2, v0, v2 -; GFX11-NEXT: s_lshr_b32 s4, s4, 24 -; GFX11-NEXT: s_lshl_b32 s5, s5, 8 -; GFX11-NEXT: s_and_b32 s10, s11, 0xff -; GFX11-NEXT: s_or_b32 s4, s4, s5 -; GFX11-NEXT: s_and_b32 s5, 0xffff, s10 +; GFX11-NEXT: s_lshl_b32 s14, s15, 16 ; GFX11-NEXT: s_and_b32 s4, 
0xffff, s4 -; GFX11-NEXT: s_lshl_b32 s5, s5, 16 -; GFX11-NEXT: v_add_nc_u32_e32 v0, v0, v2 -; GFX11-NEXT: v_mul_hi_u32 v2, v1, v3 -; GFX11-NEXT: s_or_b32 s4, s4, s5 +; GFX11-NEXT: s_waitcnt_depctr 0xfff +; GFX11-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 +; GFX11-NEXT: s_lshl_b32 s5, s5, 8 +; GFX11-NEXT: s_and_b32 s15, s17, 0xff +; GFX11-NEXT: s_or_b32 s4, s4, s14 +; GFX11-NEXT: s_or_b32 s5, s16, s5 +; GFX11-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GFX11-NEXT: s_and_b32 s14, 0xffff, s15 +; GFX11-NEXT: s_and_b32 s5, 0xffff, s5 +; GFX11-NEXT: s_lshl_b32 s14, s14, 16 +; GFX11-NEXT: s_lshr_b32 s10, s2, 8 +; GFX11-NEXT: v_mul_lo_u32 v1, 0xffffffe8, v0 +; GFX11-NEXT: s_or_b32 s5, s5, s14 +; GFX11-NEXT: s_lshr_b32 s9, s1, 8 ; GFX11-NEXT: s_and_b32 s1, s1, 0xff -; GFX11-NEXT: s_lshr_b32 s10, s2, 16 -; GFX11-NEXT: v_mul_hi_u32 v0, s9, v0 +; GFX11-NEXT: s_lshr_b32 s11, s2, 16 +; GFX11-NEXT: s_and_b32 s10, s10, 0xff +; GFX11-NEXT: s_lshr_b32 s6, s0, 8 +; GFX11-NEXT: s_lshr_b32 s8, s0, 24 +; GFX11-NEXT: v_mul_hi_u32 v1, v0, v1 +; GFX11-NEXT: s_lshr_b32 s12, s2, 24 +; GFX11-NEXT: s_and_b32 s2, s2, 0xff ; GFX11-NEXT: s_lshl_b32 s1, s1, 8 -; GFX11-NEXT: s_lshr_b32 s5, s2, 24 +; GFX11-NEXT: s_and_b32 s9, s9, 0xff +; GFX11-NEXT: s_and_b32 s11, s11, 0xff +; GFX11-NEXT: s_and_b32 s6, s6, 0xff ; GFX11-NEXT: s_or_b32 s1, s8, s1 -; GFX11-NEXT: v_add_nc_u32_e32 v1, v1, v2 -; GFX11-NEXT: s_lshr_b32 s8, s2, 8 -; GFX11-NEXT: s_and_b32 s2, s2, 0xff -; GFX11-NEXT: s_and_b32 s8, s8, 0xff -; GFX11-NEXT: v_mul_lo_u32 v0, v0, 24 -; GFX11-NEXT: v_mul_hi_u32 v1, s4, v1 -; GFX11-NEXT: s_lshl_b32 s8, s8, 8 +; GFX11-NEXT: v_add_nc_u32_e32 v0, v0, v1 +; GFX11-NEXT: s_and_b32 s8, 0xffff, s9 +; GFX11-NEXT: s_and_b32 s9, 0xffff, s11 +; GFX11-NEXT: s_lshr_b32 s7, s0, 16 +; GFX11-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-NEXT: v_mul_hi_u32 v1, s4, v0 +; GFX11-NEXT: v_mul_hi_u32 v0, s5, v0 +; GFX11-NEXT: s_lshl_b32 s6, s6, 8 +; GFX11-NEXT: s_and_b32 s7, s7, 0xff +; GFX11-NEXT: s_or_b32 s0, s0, s6 +; GFX11-NEXT: s_and_b32 s7, 0xffff, s7 ; GFX11-NEXT: s_and_b32 s0, 0xffff, s0 -; GFX11-NEXT: s_or_b32 s2, s2, s8 -; GFX11-NEXT: s_and_b32 s8, s10, 0xff -; GFX11-NEXT: s_and_b32 s2, 0xffff, s2 -; GFX11-NEXT: s_and_b32 s8, 0xffff, s8 -; GFX11-NEXT: v_sub_nc_u32_e32 v0, s9, v0 +; GFX11-NEXT: s_lshr_b32 s13, s3, 8 ; GFX11-NEXT: v_mul_lo_u32 v1, v1, 24 -; GFX11-NEXT: s_lshr_b32 s9, s3, 8 +; GFX11-NEXT: v_mul_lo_u32 v0, v0, 24 ; GFX11-NEXT: s_and_b32 s3, s3, 0xff -; GFX11-NEXT: s_lshl_b32 s8, s8, 16 -; GFX11-NEXT: v_subrev_nc_u32_e32 v2, 24, v0 -; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v0 +; GFX11-NEXT: s_lshl_b32 s0, s0, 1 ; GFX11-NEXT: s_lshl_b32 s3, s3, 8 -; GFX11-NEXT: s_or_b32 s2, s2, s8 +; GFX11-NEXT: s_and_b32 s13, s13, 0xff +; GFX11-NEXT: s_or_b32 s3, s12, s3 +; GFX11-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX11-NEXT: v_sub_nc_u32_e32 v1, s4, v1 -; GFX11-NEXT: s_and_b32 s4, s9, 0xff -; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo -; GFX11-NEXT: s_and_b32 s6, 0xffff, s6 -; GFX11-NEXT: s_or_b32 s3, s5, s3 -; GFX11-NEXT: v_subrev_nc_u32_e32 v3, 24, v1 -; GFX11-NEXT: s_and_b32 s4, 0xffff, s4 +; GFX11-NEXT: v_sub_nc_u32_e32 v0, s5, v0 +; GFX11-NEXT: s_lshl_b32 s4, s10, 8 +; GFX11-NEXT: s_lshl_b32 s5, s7, 17 +; GFX11-NEXT: s_or_b32 s2, s2, s4 +; GFX11-NEXT: v_subrev_nc_u32_e32 v2, 24, v1 +; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v1 +; GFX11-NEXT: s_lshl_b32 s4, s9, 16 +; GFX11-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX11-NEXT: s_or_b32 s0, s5, s0 +; GFX11-NEXT: s_or_b32 s2, s2, s4 +; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo ; GFX11-NEXT: 
v_subrev_nc_u32_e32 v2, 24, v0
; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v0
+; GFX11-NEXT: s_and_b32 s10, 0xffff, s13
; GFX11-NEXT: s_and_b32 s3, 0xffff, s3
-; GFX11-NEXT: s_lshl_b32 s4, s4, 16
-; GFX11-NEXT: s_lshl_b32 s5, s6, 17
-; GFX11-NEXT: s_lshl_b32 s0, s0, 1
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo
-; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v1
-; GFX11-NEXT: s_or_b32 s0, s5, s0
-; GFX11-NEXT: s_and_b32 s1, 0xffff, s1
-; GFX11-NEXT: s_and_b32 s7, 0xffff, s7
-; GFX11-NEXT: s_lshl_b32 s1, s1, 1
-; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_subrev_nc_u32_e32 v3, 24, v1
+; GFX11-NEXT: s_lshl_b32 s1, s1, 1
+; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo
; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_sub_nc_u32_e32 v3, 23, v1
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffffff, v1
-; GFX11-NEXT: v_sub_nc_u32_e32 v2, 23, v0
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffffff, v0
-; GFX11-NEXT: v_and_b32_e32 v2, 0xffffff, v2
+; GFX11-NEXT: v_and_b32_e32 v3, 0xffffff, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_subrev_nc_u32_e32 v2, 24, v0
+; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v0
+; GFX11-NEXT: v_sub_nc_u32_e32 v1, 23, v1
+; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo
+; GFX11-NEXT: v_lshrrev_b32_e64 v2, v3, s2
+; GFX11-NEXT: s_lshl_b32 s2, s10, 16
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: s_or_b32 s2, s3, s2
+; GFX11-NEXT: v_and_b32_e32 v3, 0xffffff, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: v_lshrrev_b32_e64 v0, v0, s2
-; GFX11-NEXT: s_or_b32 s2, s3, s4
-; GFX11-NEXT: v_lshrrev_b32_e64 v1, v1, s2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: v_lshl_or_b32 v0, s0, v2, v0
-; GFX11-NEXT: v_and_b32_e32 v2, 0xffffff, v3
-; GFX11-NEXT: s_lshl_b32 s0, s7, 17
+; GFX11-NEXT: v_lshl_or_b32 v1, s0, v1, v2
+; GFX11-NEXT: s_lshl_b32 s0, s8, 17
; GFX11-NEXT: s_or_b32 s0, s0, s1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_bfe_u32 v3, v0, 8, 8
-; GFX11-NEXT: v_lshl_or_b32 v1, s0, v2, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, 8, v3
-; GFX11-NEXT: v_bfe_u32 v3, v0, 16, 8
-; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_and_or_b32 v0, v0, 0xff, v2
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 24, v4
-; GFX11-NEXT: v_bfe_u32 v4, v1, 8, 8
-; GFX11-NEXT: v_bfe_u32 v1, v1, 16, 8
-; GFX11-NEXT: v_or3_b32 v0, v0, v2, v3
+; GFX11-NEXT: v_lshrrev_b32_e64 v2, v3, s2
+; GFX11-NEXT: v_bfe_u32 v3, v1, 16, 8
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-NEXT: v_sub_nc_u32_e32 v0, 23, v0
+; GFX11-NEXT: v_lshl_or_b32 v0, s0, v0, v2
+; GFX11-NEXT: v_bfe_u32 v2, v1, 8, 8
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_lshl_or_b32 v1, v1, 8, v4
-; GFX11-NEXT: v_readfirstlane_b32 s0, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT: v_readfirstlane_b32 s1, v1
+; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; GFX11-NEXT: v_bfe_u32 v5, v0, 8, 8
+; GFX11-NEXT: v_bfe_u32 v0, v0, 16, 8
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_lshlrev_b32_e32 v4, 24, v4
+; GFX11-NEXT: v_and_or_b32 v1, 0xff, v1, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_lshl_or_b32 v0, v0, 8, v5
+; GFX11-NEXT: v_or3_b32 v1, v1, v3, v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_readfirstlane_b32 s1, v0
+; GFX11-NEXT: v_readfirstlane_b32 s0, v1
; GFX11-NEXT: ; return to shader part epilog
 %lhs = bitcast i48 %lhs.arg to <2 x i24>
 %rhs = bitcast i48 %rhs.arg to <2 x i24>
@@ -2481,40 +2435,33 @@
; GFX6-NEXT: v_rcp_iflag_f32_e32 v6, v6
; GFX6-NEXT: v_mov_b32_e32 v7, 0xffffffe8
; GFX6-NEXT: v_and_b32_e32 v4, 0xffffff, v4
-; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v9, 24
+; GFX6-NEXT: v_and_b32_e32 v5, 0xffffff, v5
; GFX6-NEXT: v_mul_f32_e32 v6, 0x4f7ffffe, v6
; GFX6-NEXT: v_cvt_u32_f32_e32 v6, v6
-; GFX6-NEXT: v_and_b32_e32 v5, 0xffffff, v5
; GFX6-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX6-NEXT: v_and_b32_e32 v2, 0xffffff, v2
-; GFX6-NEXT: v_mul_lo_u32 v8, v6, v7
; GFX6-NEXT: v_lshlrev_b32_e32 v1, 1, v1
+; GFX6-NEXT: v_mul_lo_u32 v7, v6, v7
; GFX6-NEXT: v_and_b32_e32 v3, 0xffffff, v3
-; GFX6-NEXT: v_mul_hi_u32 v8, v6, v8
-; GFX6-NEXT: v_add_i32_e32 v6, vcc, v6, v8
-; GFX6-NEXT: v_mul_hi_u32 v6, v4, v6
-; GFX6-NEXT: v_rcp_iflag_f32_e32 v8, v9
+; GFX6-NEXT: v_mul_hi_u32 v7, v6, v7
+; GFX6-NEXT: v_add_i32_e32 v6, vcc, v6, v7
+; GFX6-NEXT: v_mul_hi_u32 v7, v4, v6
+; GFX6-NEXT: v_mul_hi_u32 v6, v5, v6
+; GFX6-NEXT: v_mul_lo_u32 v7, v7, 24
; GFX6-NEXT: v_mul_lo_u32 v6, v6, 24
-; GFX6-NEXT: v_mul_f32_e32 v8, 0x4f7ffffe, v8
-; GFX6-NEXT: v_cvt_u32_f32_e32 v8, v8
-; GFX6-NEXT: v_sub_i32_e32 v4, vcc, v4, v6
-; GFX6-NEXT: v_subrev_i32_e32 v6, vcc, 24, v4
+; GFX6-NEXT: v_sub_i32_e32 v4, vcc, v4, v7
+; GFX6-NEXT: v_subrev_i32_e32 v7, vcc, 24, v4
; GFX6-NEXT: v_cmp_le_u32_e32 vcc, 24, v4
-; GFX6-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc
-; GFX6-NEXT: v_subrev_i32_e32 v6, vcc, 24, v4
+; GFX6-NEXT: v_cndmask_b32_e32 v4, v4, v7, vcc
+; GFX6-NEXT: v_subrev_i32_e32 v7, vcc, 24, v4
; GFX6-NEXT: v_cmp_le_u32_e32 vcc, 24, v4
-; GFX6-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc
-; GFX6-NEXT: v_mul_lo_u32 v6, v8, v7
+; GFX6-NEXT: v_cndmask_b32_e32 v4, v4, v7, vcc
; GFX6-NEXT: v_sub_i32_e32 v7, vcc, 23, v4
; GFX6-NEXT: v_and_b32_e32 v7, 0xffffff, v7
-; GFX6-NEXT: v_mul_hi_u32 v6, v8, v6
; GFX6-NEXT: v_and_b32_e32 v4, 0xffffff, v4
; GFX6-NEXT: v_lshlrev_b32_e32 v0, v7, v0
; GFX6-NEXT: v_lshrrev_b32_e32 v2, v4, v2
-; GFX6-NEXT: v_add_i32_e32 v6, vcc, v8, v6
-; GFX6-NEXT: v_mul_hi_u32 v6, v5, v6
; GFX6-NEXT: v_or_b32_e32 v0, v0, v2
-; GFX6-NEXT: v_mul_lo_u32 v6, v6, 24
; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v5, v6
; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, 24, v2
; GFX6-NEXT: v_cmp_le_u32_e32 vcc, 24, v2
@@ -2537,40 +2484,33 @@
; GFX8-NEXT: v_rcp_iflag_f32_e32 v6, v6
; GFX8-NEXT: v_mov_b32_e32 v7, 0xffffffe8
; GFX8-NEXT: v_and_b32_e32 v4, 0xffffff, v4
-; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v9, 24
+; GFX8-NEXT: v_and_b32_e32 v5, 0xffffff, v5
; GFX8-NEXT: v_mul_f32_e32 v6, 0x4f7ffffe, v6
; GFX8-NEXT: v_cvt_u32_f32_e32 v6, v6
-; GFX8-NEXT: v_and_b32_e32 v5, 0xffffff, v5
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX8-NEXT: v_and_b32_e32 v2, 0xffffff, v2
-; GFX8-NEXT: v_mul_lo_u32 v8, v6, v7
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 1, v1
+; GFX8-NEXT: v_mul_lo_u32 v7, v6, v7
; GFX8-NEXT: v_and_b32_e32 v3, 0xffffff, v3
-; GFX8-NEXT: v_mul_hi_u32 v8, v6, v8
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v8
-; GFX8-NEXT: v_mul_hi_u32 v6, v4, v6
-; GFX8-NEXT: v_rcp_iflag_f32_e32 v8, v9
+; GFX8-NEXT: v_mul_hi_u32 v7, v6, v7
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v7
+; GFX8-NEXT: v_mul_hi_u32 v7, v4, v6
+; GFX8-NEXT: v_mul_hi_u32 v6, v5, v6
+; GFX8-NEXT: v_mul_lo_u32 v7, v7, 24
; GFX8-NEXT: v_mul_lo_u32 v6, v6, 24
-; GFX8-NEXT: v_mul_f32_e32 v8, 0x4f7ffffe, v8
-; GFX8-NEXT: v_cvt_u32_f32_e32 v8, v8
-; GFX8-NEXT: v_sub_u32_e32 v4, vcc, v4, v6
-; GFX8-NEXT: v_subrev_u32_e32 v6, vcc, 24, v4
+; GFX8-NEXT: v_sub_u32_e32 v4, vcc, v4, v7
+; GFX8-NEXT: v_subrev_u32_e32 v7, vcc, 24, v4
; GFX8-NEXT: v_cmp_le_u32_e32 vcc, 24, v4
-; GFX8-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc
-; GFX8-NEXT: v_subrev_u32_e32 v6, vcc, 24, v4
+; GFX8-NEXT: v_cndmask_b32_e32 v4, v4, v7, vcc
+; GFX8-NEXT: v_subrev_u32_e32 v7, vcc, 24, v4
; GFX8-NEXT: v_cmp_le_u32_e32 vcc, 24, v4
-; GFX8-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc
-; GFX8-NEXT: v_mul_lo_u32 v6, v8, v7
+; GFX8-NEXT: v_cndmask_b32_e32 v4, v4, v7, vcc
; GFX8-NEXT: v_sub_u32_e32 v7, vcc, 23, v4
; GFX8-NEXT: v_and_b32_e32 v7, 0xffffff, v7
-; GFX8-NEXT: v_mul_hi_u32 v6, v8, v6
; GFX8-NEXT: v_and_b32_e32 v4, 0xffffff, v4
; GFX8-NEXT: v_lshlrev_b32_e32 v0, v7, v0
; GFX8-NEXT: v_lshrrev_b32_e32 v2, v4, v2
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, v8, v6
-; GFX8-NEXT: v_mul_hi_u32 v6, v5, v6
; GFX8-NEXT: v_or_b32_e32 v0, v0, v2
-; GFX8-NEXT: v_mul_lo_u32 v6, v6, 24
; GFX8-NEXT: v_sub_u32_e32 v2, vcc, v5, v6
; GFX8-NEXT: v_subrev_u32_e32 v4, vcc, 24, v2
; GFX8-NEXT: v_cmp_le_u32_e32 vcc, 24, v2
@@ -2591,30 +2531,24 @@
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v6, 24
; GFX9-NEXT: v_rcp_iflag_f32_e32 v6, v6
-; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v9, 24
-; GFX9-NEXT: v_rcp_iflag_f32_e32 v9, v9
; GFX9-NEXT: v_mov_b32_e32 v7, 0xffffffe8
-; GFX9-NEXT: v_mul_f32_e32 v6, 0x4f7ffffe, v6
-; GFX9-NEXT: v_cvt_u32_f32_e32 v6, v6
-; GFX9-NEXT: v_mul_f32_e32 v9, 0x4f7ffffe, v9
-; GFX9-NEXT: v_cvt_u32_f32_e32 v9, v9
; GFX9-NEXT: v_and_b32_e32 v4, 0xffffff, v4
-; GFX9-NEXT: v_mul_lo_u32 v8, v6, v7
; GFX9-NEXT: v_and_b32_e32 v5, 0xffffff, v5
-; GFX9-NEXT: v_mul_lo_u32 v7, v9, v7
+; GFX9-NEXT: v_mul_f32_e32 v6, 0x4f7ffffe, v6
+; GFX9-NEXT: v_cvt_u32_f32_e32 v6, v6
; GFX9-NEXT: v_and_b32_e32 v2, 0xffffff, v2
-; GFX9-NEXT: v_mul_hi_u32 v8, v6, v8
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0
-; GFX9-NEXT: v_mul_hi_u32 v7, v9, v7
; GFX9-NEXT: v_and_b32_e32 v3, 0xffffff, v3
-; GFX9-NEXT: v_add_u32_e32 v6, v6, v8
-; GFX9-NEXT: v_mul_hi_u32 v6, v4, v6
-; GFX9-NEXT: v_add_u32_e32 v7, v9, v7
-; GFX9-NEXT: v_mul_hi_u32 v7, v5, v7
+; GFX9-NEXT: v_mul_lo_u32 v7, v6, v7
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 1, v1
-; GFX9-NEXT: v_mul_lo_u32 v6, v6, 24
+; GFX9-NEXT: v_mul_hi_u32 v7, v6, v7
+; GFX9-NEXT: v_add_u32_e32 v6, v6, v7
+; GFX9-NEXT: v_mul_hi_u32 v7, v4, v6
+; GFX9-NEXT: v_mul_hi_u32 v6, v5, v6
; GFX9-NEXT: v_mul_lo_u32 v7, v7, 24
-; GFX9-NEXT: v_sub_u32_e32 v4, v4, v6
+; GFX9-NEXT: v_mul_lo_u32 v6, v6, 24
+; GFX9-NEXT: v_sub_u32_e32 v4, v4, v7
+; GFX9-NEXT: v_sub_u32_e32 v5, v5, v6
; GFX9-NEXT: v_subrev_u32_e32 v6, 24, v4
; GFX9-NEXT: v_cmp_le_u32_e32 vcc, 24, v4
; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc
@@ -2623,19 +2557,16 @@
; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc
; GFX9-NEXT: v_sub_u32_e32 v6, 23, v4
; GFX9-NEXT: v_and_b32_e32 v4, 0xffffff, v4
-; GFX9-NEXT: v_and_b32_e32 v6, 0xffffff, v6
+; GFX9-NEXT: v_subrev_u32_e32 v7, 24, v5
; GFX9-NEXT: v_lshrrev_b32_e32 v2, v4, v2
+; GFX9-NEXT: v_cmp_le_u32_e32 vcc, 24, v5
; GFX9-NEXT: v_lshl_or_b32 v0, v0, v6, v2
-; GFX9-NEXT: v_sub_u32_e32 v2, v5, v7
-; GFX9-NEXT: v_subrev_u32_e32 v4, 24, v2
-; GFX9-NEXT: v_cmp_le_u32_e32 vcc, 24, v2
-; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v5, v7, vcc
; GFX9-NEXT: v_subrev_u32_e32 v4, 24, v2
; GFX9-NEXT: v_cmp_le_u32_e32 vcc, 24, v2
; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
; GFX9-NEXT: v_sub_u32_e32 v4, 23, v2
; GFX9-NEXT: v_and_b32_e32 v2, 0xffffff, v2
-; GFX9-NEXT: v_and_b32_e32 v4, 0xffffff, v4
; GFX9-NEXT: v_lshrrev_b32_e32 v2, v2, v3
; GFX9-NEXT: v_lshl_or_b32 v1, v1, v4, v2
; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -2645,31 +2576,24 @@
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v6, 24
-; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v7, 24
; GFX10-NEXT: v_and_b32_e32 v4, 0xffffff, v4
; GFX10-NEXT: v_and_b32_e32 v5, 0xffffff, v5
; GFX10-NEXT: v_and_b32_e32 v2, 0xffffff, v2
-; GFX10-NEXT: v_rcp_iflag_f32_e32 v6, v6
-; GFX10-NEXT: v_rcp_iflag_f32_e32 v7, v7
; GFX10-NEXT: v_and_b32_e32 v3, 0xffffff, v3
+; GFX10-NEXT: v_rcp_iflag_f32_e32 v6, v6
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 1, v1
; GFX10-NEXT: v_mul_f32_e32 v6, 0x4f7ffffe, v6
-; GFX10-NEXT: v_mul_f32_e32 v7, 0x4f7ffffe, v7
; GFX10-NEXT: v_cvt_u32_f32_e32 v6, v6
-; GFX10-NEXT: v_cvt_u32_f32_e32 v7, v7
-; GFX10-NEXT: v_mul_lo_u32 v8, 0xffffffe8, v6
-; GFX10-NEXT: v_mul_lo_u32 v9, 0xffffffe8, v7
-; GFX10-NEXT: v_mul_hi_u32 v8, v6, v8
-; GFX10-NEXT: v_mul_hi_u32 v9, v7, v9
-; GFX10-NEXT: v_add_nc_u32_e32 v6, v6, v8
-; GFX10-NEXT: v_add_nc_u32_e32 v7, v7, v9
-; GFX10-NEXT: v_mul_hi_u32 v6, v4, v6
-; GFX10-NEXT: v_mul_hi_u32 v7, v5, v7
-; GFX10-NEXT: v_mul_lo_u32 v6, v6, 24
+; GFX10-NEXT: v_mul_lo_u32 v7, 0xffffffe8, v6
+; GFX10-NEXT: v_mul_hi_u32 v7, v6, v7
+; GFX10-NEXT: v_add_nc_u32_e32 v6, v6, v7
+; GFX10-NEXT: v_mul_hi_u32 v7, v4, v6
+; GFX10-NEXT: v_mul_hi_u32 v6, v5, v6
; GFX10-NEXT: v_mul_lo_u32 v7, v7, 24
-; GFX10-NEXT: v_sub_nc_u32_e32 v4, v4, v6
-; GFX10-NEXT: v_sub_nc_u32_e32 v5, v5, v7
+; GFX10-NEXT: v_mul_lo_u32 v6, v6, 24
+; GFX10-NEXT: v_sub_nc_u32_e32 v4, v4, v7
+; GFX10-NEXT: v_sub_nc_u32_e32 v5, v5, v6
; GFX10-NEXT: v_subrev_nc_u32_e32 v6, 24, v4
; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v4
; GFX10-NEXT: v_subrev_nc_u32_e32 v7, 24, v5
@@ -2681,17 +2605,15 @@
; GFX10-NEXT: v_subrev_nc_u32_e32 v7, 24, v5
; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc_lo
; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v5
-; GFX10-NEXT: v_sub_nc_u32_e32 v6, 23, v4
+; GFX10-NEXT: v_and_b32_e32 v6, 0xffffff, v4
; GFX10-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc_lo
-; GFX10-NEXT: v_and_b32_e32 v4, 0xffffff, v4
-; GFX10-NEXT: v_and_b32_e32 v6, 0xffffff, v6
-; GFX10-NEXT: v_sub_nc_u32_e32 v7, 23, v5
-; GFX10-NEXT: v_and_b32_e32 v5, 0xffffff, v5
-; GFX10-NEXT: v_lshrrev_b32_e32 v2, v4, v2
-; GFX10-NEXT: v_and_b32_e32 v4, 0xffffff, v7
-; GFX10-NEXT: v_lshrrev_b32_e32 v3, v5, v3
-; GFX10-NEXT: v_lshl_or_b32 v0, v0, v6, v2
-; GFX10-NEXT: v_lshl_or_b32 v1, v1, v4, v3
+; GFX10-NEXT: v_sub_nc_u32_e32 v4, 23, v4
+; GFX10-NEXT: v_lshrrev_b32_e32 v2, v6, v2
+; GFX10-NEXT: v_and_b32_e32 v7, 0xffffff, v5
+; GFX10-NEXT: v_sub_nc_u32_e32 v5, 23, v5
+; GFX10-NEXT: v_lshl_or_b32 v0, v0, v4, v2
+; GFX10-NEXT: v_lshrrev_b32_e32 v3, v7, v3
+; GFX10-NEXT: v_lshl_or_b32 v1, v1, v5, v3
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_fshr_v2i24:
@@ -2699,67 +2621,56 @@
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v6, 24
-; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v7, 24
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffffff, v5
+; GFX11-NEXT: v_and_b32_e32 v4, 0xffffff, v4
; GFX11-NEXT: v_and_b32_e32 v2, 0xffffff, v2
-; GFX11-NEXT: v_and_b32_e32 v3, 0xffffff, v3
-; GFX11-NEXT: v_rcp_iflag_f32_e32 v6, v6
-; GFX11-NEXT: v_rcp_iflag_f32_e32 v7, v7
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 1, v1
+; GFX11-NEXT: v_rcp_iflag_f32_e32 v6, v6
+; GFX11-NEXT: v_and_b32_e32 v3, 0xffffff, v3
; GFX11-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NEXT: v_dual_mul_f32 v6, 0x4f7ffffe, v6 :: v_dual_lshlrev_b32 v1, 1, v1
-; GFX11-NEXT: v_mul_f32_e32 v7, 0x4f7ffffe, v7
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_mul_f32_e32 v6, 0x4f7ffffe, v6
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_cvt_u32_f32_e32 v6, v6
-; GFX11-NEXT: v_cvt_u32_f32_e32 v7, v7
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mul_lo_u32 v8, 0xffffffe8, v6
-; GFX11-NEXT: v_mul_lo_u32 v9, 0xffffffe8, v7
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mul_hi_u32 v8, v6, v8
-; GFX11-NEXT: v_mul_hi_u32 v9, v7, v9
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_add_nc_u32_e32 v6, v6, v8
-; GFX11-NEXT: v_add_nc_u32_e32 v7, v7, v9
+; GFX11-NEXT: v_mul_lo_u32 v7, 0xffffffe8, v6
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_mul_hi_u32 v7, v5, v7
+; GFX11-NEXT: v_mul_hi_u32 v7, v6, v7
+; GFX11-NEXT: v_add_nc_u32_e32 v6, v6, v7
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_mul_hi_u32 v7, v4, v6
; GFX11-NEXT: v_mul_lo_u32 v7, v7, 24
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_sub_nc_u32_e32 v5, v5, v7
-; GFX11-NEXT: v_subrev_nc_u32_e32 v7, 24, v5
-; GFX11-NEXT: v_and_b32_e32 v4, 0xffffff, v4
+; GFX11-NEXT: v_sub_nc_u32_e32 v4, v4, v7
+; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v4
+; GFX11-NEXT: v_and_b32_e32 v5, 0xffffff, v5
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_mul_hi_u32 v6, v4, v6
+; GFX11-NEXT: v_mul_hi_u32 v6, v5, v6
; GFX11-NEXT: v_mul_lo_u32 v6, v6, 24
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_sub_nc_u32_e32 v4, v4, v6
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_sub_nc_u32_e32 v5, v5, v6
; GFX11-NEXT: v_subrev_nc_u32_e32 v6, 24, v4
-; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v4
+; GFX11-NEXT: v_subrev_nc_u32_e32 v7, 24, v5
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc_lo
; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v5
; GFX11-NEXT: v_subrev_nc_u32_e32 v6, 24, v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc_lo
; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_subrev_nc_u32_e32 v7, 24, v5
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc_lo
; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_sub_nc_u32_e32 v6, 23, v4
-; GFX11-NEXT: v_dual_cndmask_b32 v5, v5, v7 :: v_dual_and_b32 v4, 0xffffff, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_and_b32_e32 v6, 0xffffff, v6
-; GFX11-NEXT: v_sub_nc_u32_e32 v7, 23, v5
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffffff, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, v4, v2
-; GFX11-NEXT: v_and_b32_e32 v4, 0xffffff, v7
+; GFX11-NEXT: v_dual_cndmask_b32 v5, v5, v7 :: v_dual_and_b32 v6, 0xffffff, v4
+; GFX11-NEXT: v_sub_nc_u32_e32 v4, 23, v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_lshrrev_b32_e32 v2, v6, v2
+; GFX11-NEXT: v_and_b32_e32 v7, 0xffffff, v5
+; GFX11-NEXT: v_sub_nc_u32_e32 v5, 23, v5
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, v5, v3
-; GFX11-NEXT: v_lshl_or_b32 v0, v0, v6, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT: v_lshl_or_b32 v1, v1, v4, v3
+; GFX11-NEXT: v_lshl_or_b32 v0, v0, v4, v2
+; GFX11-NEXT: v_lshrrev_b32_e32 v3, v7, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_lshl_or_b32 v1, v1, v5, v3
; GFX11-NEXT: s_setpc_b64 s[30:31]
 %result = call <2 x i24> @llvm.fshr.v2i24(<2 x i24> %lhs, <2 x i24> %rhs, <2 x i24> %amt)
 ret <2 x i24> %result
@@ -3412,24 +3323,20 @@
; GFX8-LABEL: v_fshr_i16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_and_b32_e32 v3, 15, v2
-; GFX8-NEXT: v_xor_b32_e32 v2, -1, v2
-; GFX8-NEXT: v_and_b32_e32 v2, 15, v2
+; GFX8-NEXT: v_xor_b32_e32 v3, -1, v2
; GFX8-NEXT: v_lshlrev_b16_e32 v0, 1, v0
-; GFX8-NEXT: v_lshlrev_b16_e32 v0, v2, v0
-; GFX8-NEXT: v_lshrrev_b16_e32 v1, v3, v1
+; GFX8-NEXT: v_lshlrev_b16_e32 v0, v3, v0
+; GFX8-NEXT: v_lshrrev_b16_e32 v1, v2, v1
; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fshr_i16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_and_b32_e32 v3, 15, v2
-; GFX9-NEXT: v_xor_b32_e32 v2, -1, v2
-; GFX9-NEXT: v_and_b32_e32 v2, 15, v2
+; GFX9-NEXT: v_xor_b32_e32 v3, -1, v2
; GFX9-NEXT: v_lshlrev_b16_e32 v0, 1, v0
-; GFX9-NEXT: v_lshlrev_b16_e32 v0, v2, v0
-; GFX9-NEXT: v_lshrrev_b16_e32 v1, v3, v1
+; GFX9-NEXT: v_lshlrev_b16_e32 v0, v3, v0
+; GFX9-NEXT: v_lshrrev_b16_e32 v1, v2, v1
; GFX9-NEXT: v_or_b32_e32 v0, v0, v1
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
@@ -3439,8 +3346,6 @@
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_xor_b32_e32 v3, -1, v2
; GFX10-NEXT: v_lshlrev_b16 v0, 1, v0
-; GFX10-NEXT: v_and_b32_e32 v2, 15, v2
-; GFX10-NEXT: v_and_b32_e32 v3, 15, v3
; GFX10-NEXT: v_lshrrev_b16 v1, v2, v1
; GFX10-NEXT: v_lshlrev_b16 v0, v3, v0
; GFX10-NEXT: v_or_b32_e32 v0, v0, v1
@@ -3452,9 +3357,6 @@
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: v_xor_b32_e32 v3, -1, v2
; GFX11-NEXT: v_lshlrev_b16 v0, 1, v0
-; GFX11-NEXT: v_and_b32_e32 v2, 15, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_and_b32_e32 v3, 15, v3
; GFX11-NEXT: v_lshrrev_b16 v1, v2, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b16 v0, v3, v0
@@ -3575,32 +3477,26 @@
;
; GFX8-LABEL: v_fshr_i16_ssv:
; GFX8: ; %bb.0:
-; GFX8-NEXT: v_and_b32_e32 v1, 15, v0
-; GFX8-NEXT: v_xor_b32_e32 v0, -1, v0
-; GFX8-NEXT: v_and_b32_e32 v0, 15, v0
+; GFX8-NEXT: v_xor_b32_e32 v1, -1, v0
; GFX8-NEXT: s_lshl_b32 s0, s0, 1
-; GFX8-NEXT: v_lshlrev_b16_e64 v0, v0, s0
-; GFX8-NEXT: v_lshrrev_b16_e64 v1, v1, s1
-; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX8-NEXT: v_lshlrev_b16_e64 v1, v1, s0
+; GFX8-NEXT: v_lshrrev_b16_e64 v0, v0, s1
+; GFX8-NEXT: v_or_b32_e32 v0, v1, v0
; GFX8-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: v_fshr_i16_ssv:
; GFX9: ; %bb.0:
-; GFX9-NEXT: v_and_b32_e32 v1, 15, v0
-; GFX9-NEXT: v_xor_b32_e32 v0, -1, v0
-; GFX9-NEXT: v_and_b32_e32 v0, 15, v0
+; GFX9-NEXT: v_xor_b32_e32 v1, -1, v0
; GFX9-NEXT: s_lshl_b32 s0, s0, 1
-; GFX9-NEXT: v_lshlrev_b16_e64 v0, v0, s0
-; GFX9-NEXT: v_lshrrev_b16_e64 v1, v1, s1
-; GFX9-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX9-NEXT: v_lshlrev_b16_e64 v1, v1, s0
+; GFX9-NEXT: v_lshrrev_b16_e64 v0, v0, s1
+; GFX9-NEXT: v_or_b32_e32 v0, v1, v0
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: v_fshr_i16_ssv:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_xor_b32_e32 v1, -1, v0
-; GFX10-NEXT: v_and_b32_e32 v0, 15, v0
; GFX10-NEXT: s_lshl_b32 s0, s0, 1
-; GFX10-NEXT: v_and_b32_e32 v1, 15, v1
; GFX10-NEXT: v_lshrrev_b16 v0, v0, s1
; GFX10-NEXT: v_lshlrev_b16 v1, v1, s0
; GFX10-NEXT: v_or_b32_e32 v0, v1, v0
@@ -3609,10 +3505,7 @@
; GFX11-LABEL: v_fshr_i16_ssv:
; GFX11: ; %bb.0:
; GFX11-NEXT: v_xor_b32_e32 v1, -1, v0
-; GFX11-NEXT: v_and_b32_e32 v0, 15, v0
; GFX11-NEXT: s_lshl_b32 s0, s0, 1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_and_b32_e32 v1, 15, v1
; GFX11-NEXT: v_lshrrev_b16 v0, v0, s1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b16 v1, v1, s0
@@ -3946,35 +3839,31 @@
; GFX8-LABEL: v_fshr_v2i16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_lshlrev_b16_e32 v3, 1, v0
-; GFX8-NEXT: v_lshrrev_b16_e32 v4, 15, v1
-; GFX8-NEXT: v_or_b32_e32 v3, v3, v4
-; GFX8-NEXT: v_mov_b32_e32 v4, 1
-; GFX8-NEXT: v_mov_b32_e32 v5, 15
-; GFX8-NEXT: v_lshlrev_b16_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-NEXT: v_lshrrev_b16_sdwa v5, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-NEXT: v_xor_b32_e32 v2, -1, v2
-; GFX8-NEXT: v_or_b32_e32 v0, v0, v5
-; GFX8-NEXT: v_lshlrev_b16_e32 v5, 1, v1
-; GFX8-NEXT: v_lshlrev_b16_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v2
-; GFX8-NEXT: v_and_b32_e32 v6, 15, v2
+; GFX8-NEXT: v_lshlrev_b16_e32 v4, 1, v0
+; GFX8-NEXT: v_lshrrev_b16_e32 v5, 15, v1
+; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v1
+; GFX8-NEXT: v_or_b32_e32 v4, v4, v5
+; GFX8-NEXT: v_mov_b32_e32 v5, 1
+; GFX8-NEXT: v_lshlrev_b16_e32 v1, 1, v1
; GFX8-NEXT: v_xor_b32_e32 v2, -1, v2
-; GFX8-NEXT: v_and_b32_e32 v2, 15, v2
-; GFX8-NEXT: v_lshrrev_b16_e32 v5, 1, v5
-; GFX8-NEXT: v_lshlrev_b16_e32 v3, v6, v3
-; GFX8-NEXT: v_lshrrev_b16_e32 v2, v2, v5
-; GFX8-NEXT: v_or_b32_e32 v2, v3, v2
-; GFX8-NEXT: v_and_b32_e32 v3, 15, v4
-; GFX8-NEXT: v_xor_b32_e32 v4, -1, v4
-; GFX8-NEXT: v_and_b32_e32 v4, 15, v4
+; GFX8-NEXT: v_lshlrev_b16_sdwa v0, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT: v_lshrrev_b16_e32 v5, 15, v3
+; GFX8-NEXT: v_xor_b32_e32 v6, -1, v2
; GFX8-NEXT: v_lshrrev_b16_e32 v1, 1, v1
-; GFX8-NEXT: v_lshlrev_b16_e32 v0, v3, v0
-; GFX8-NEXT: v_lshrrev_b16_e32 v1, v4, v1
-; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX8-NEXT: v_or_b32_e32 v0, v0, v5
+; GFX8-NEXT: v_lshlrev_b16_e32 v3, 1, v3
+; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v2
+; GFX8-NEXT: v_lshlrev_b16_e32 v2, v2, v4
+; GFX8-NEXT: v_lshrrev_b16_e32 v1, v6, v1
+; GFX8-NEXT: v_or_b32_e32 v1, v2, v1
+; GFX8-NEXT: v_xor_b32_e32 v2, -1, v5
+; GFX8-NEXT: v_lshrrev_b16_e32 v3, 1, v3
+; GFX8-NEXT: v_lshlrev_b16_e32 v0, v5, v0
+; GFX8-NEXT: v_lshrrev_b16_e32 v2, v2, v3
+; GFX8-NEXT: v_or_b32_e32 v0, v0, v2
; GFX8-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX8-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fshr_v2i16:
@@ -4130,31 +4019,27 @@
; GFX8-NEXT: s_lshr_b32 s2, s0, 16
; GFX8-NEXT: s_lshl_b32 s0, s0, 1
; GFX8-NEXT: s_lshr_b32 s4, s4, 15
-; GFX8-NEXT: v_xor_b32_e32 v0, -1, v0
; GFX8-NEXT: s_lshr_b32 s3, s1, 16
; GFX8-NEXT: s_or_b32 s0, s0, s4
; GFX8-NEXT: s_lshl_b32 s1, s1, 1
-; GFX8-NEXT: v_and_b32_e32 v2, 15, v0
-; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX8-NEXT: v_xor_b32_e32 v0, -1, v0
-; GFX8-NEXT: v_lshlrev_b16_e64 v2, v2, s0
+; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX8-NEXT: v_xor_b32_e32 v2, -1, v0
+; GFX8-NEXT: v_lshlrev_b16_e64 v0, v0, s0
; GFX8-NEXT: s_and_b32 s0, 0xffff, s1
-; GFX8-NEXT: v_and_b32_e32 v0, 15, v0
-; GFX8-NEXT: s_lshr_b32 s0, s0, 1
; GFX8-NEXT: s_lshr_b32 s4, s3, 15
; GFX8-NEXT: s_lshl_b32 s3, s3, 1
-; GFX8-NEXT: v_lshrrev_b16_e64 v0, v0, s0
+; GFX8-NEXT: s_lshr_b32 s0, s0, 1
; GFX8-NEXT: s_lshl_b32 s2, s2, 1
-; GFX8-NEXT: v_or_b32_e32 v0, v2, v0
-; GFX8-NEXT: v_and_b32_e32 v2, 15, v1
-; GFX8-NEXT: v_xor_b32_e32 v1, -1, v1
+; GFX8-NEXT: v_lshrrev_b16_e64 v2, v2, s0
; GFX8-NEXT: s_and_b32 s0, 0xffff, s3
; GFX8-NEXT: s_or_b32 s2, s2, s4
-; GFX8-NEXT: v_and_b32_e32 v1, 15, v1
+; GFX8-NEXT: v_or_b32_e32 v0, v0, v2
+; GFX8-NEXT: v_xor_b32_e32 v2, -1, v1
; GFX8-NEXT: s_lshr_b32 s0, s0, 1
-; GFX8-NEXT: v_lshlrev_b16_e64 v2, v2, s2
-; GFX8-NEXT: v_lshrrev_b16_e64 v1, v1, s0
-; GFX8-NEXT: v_or_b32_e32 v1, v2, v1
+; GFX8-NEXT: v_lshlrev_b16_e64 v1, v1, s2
+; GFX8-NEXT: v_lshrrev_b16_e64 v2, v2, s0
+; GFX8-NEXT: v_or_b32_e32 v1, v1, v2
; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
@@ -4250,33 +4135,32 @@
; GFX8-LABEL: v_fshr_v2i16_svs:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_lshr_b32 s2, s0, 16
+; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX8-NEXT: s_lshl_b32 s0, s0, 1
-; GFX8-NEXT: v_lshrrev_b16_e32 v1, 15, v0
-; GFX8-NEXT: v_mov_b32_e32 v2, 15
-; GFX8-NEXT: v_or_b32_e32 v1, s0, v1
-; GFX8-NEXT: s_lshl_b32 s0, s2, 1
-; GFX8-NEXT: v_lshrrev_b16_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT: v_lshrrev_b16_e32 v2, 15, v0
; GFX8-NEXT: v_or_b32_e32 v2, s0, v2
-; GFX8-NEXT: v_lshlrev_b16_e32 v3, 1, v0
-; GFX8-NEXT: v_mov_b32_e32 v4, 1
+; GFX8-NEXT: s_lshl_b32 s0, s2, 1
+; GFX8-NEXT: v_lshrrev_b16_e32 v3, 15, v1
+; GFX8-NEXT: v_or_b32_e32 v3, s0, v3
+; GFX8-NEXT: v_lshlrev_b16_e32 v0, 1, v0
; GFX8-NEXT: s_xor_b32 s0, s1, -1
-; GFX8-NEXT: v_lshlrev_b16_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT: v_lshlrev_b16_e32 v1, 1, v1
; GFX8-NEXT: s_lshr_b32 s1, s0, 16
; GFX8-NEXT: s_and_b32 s2, s0, 15
; GFX8-NEXT: s_andn2_b32 s0, 15, s0
-; GFX8-NEXT: v_lshrrev_b16_e32 v3, 1, v3
-; GFX8-NEXT: v_lshrrev_b16_e32 v3, s0, v3
+; GFX8-NEXT: v_lshrrev_b16_e32 v0, 1, v0
+; GFX8-NEXT: v_lshlrev_b16_e32 v2, s2, v2
+; GFX8-NEXT: v_lshrrev_b16_e32 v0, s0, v0
; GFX8-NEXT: s_and_b32 s0, s1, 15
; GFX8-NEXT: s_andn2_b32 s1, 15, s1
-; GFX8-NEXT: v_lshrrev_b16_e32 v0, 1, v0
-; GFX8-NEXT: v_lshlrev_b16_e32 v2, s0, v2
-; GFX8-NEXT: v_lshrrev_b16_e32 v0, s1, v0
+; GFX8-NEXT: v_lshrrev_b16_e32 v1, 1, v1
; GFX8-NEXT: v_or_b32_e32 v0, v2, v0
-; GFX8-NEXT: v_lshlrev_b16_e32 v1, s2, v1
-; GFX8-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX8-NEXT: v_or_b32_e32 v1, v1, v3
-; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX8-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: v_lshlrev_b16_e32 v2, s0, v3
+; GFX8-NEXT: v_lshrrev_b16_e32 v1, s1, v1
+; GFX8-NEXT: v_or_b32_e32 v1, v2, v1
+; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX8-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: v_fshr_v2i16_svs:
@@ -4780,39 +4664,33 @@
; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v2
; GFX8-NEXT: v_or_b32_e32 v7, v7, v8
; GFX8-NEXT: v_mov_b32_e32 v8, 1
-; GFX8-NEXT: v_lshlrev_b16_sdwa v0, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-NEXT: v_lshrrev_b16_e32 v8, 15, v6
-; GFX8-NEXT: v_xor_b32_e32 v4, -1, v4
-; GFX8-NEXT: v_or_b32_e32 v0, v0, v8
; GFX8-NEXT: v_lshlrev_b16_e32 v2, 1, v2
-; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v4
-; GFX8-NEXT: v_and_b32_e32 v9, 15, v4
; GFX8-NEXT: v_xor_b32_e32 v4, -1, v4
-; GFX8-NEXT: v_and_b32_e32 v4, 15, v4
+; GFX8-NEXT: v_lshlrev_b16_sdwa v0, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT: v_lshrrev_b16_e32 v8, 15, v6
+; GFX8-NEXT: v_xor_b32_e32 v9, -1, v4
; GFX8-NEXT: v_lshrrev_b16_e32 v2, 1, v2
-; GFX8-NEXT: v_lshlrev_b16_e32 v7, v9, v7
-; GFX8-NEXT: v_lshrrev_b16_e32 v2, v4, v2
+; GFX8-NEXT: v_or_b32_e32 v0, v0, v8
; GFX8-NEXT: v_lshlrev_b16_e32 v6, 1, v6
-; GFX8-NEXT: v_or_b32_e32 v2, v7, v2
-; GFX8-NEXT: v_and_b32_e32 v4, 15, v8
-; GFX8-NEXT: v_xor_b32_e32 v7, -1, v8
-; GFX8-NEXT: v_and_b32_e32 v7, 15, v7
-; GFX8-NEXT: v_lshlrev_b16_e32 v0, v4, v0
-; GFX8-NEXT: v_lshrrev_b16_e32 v4, 1, v6
-; GFX8-NEXT: v_lshrrev_b16_e32 v4, v7, v4
+; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v4
+; GFX8-NEXT: v_lshlrev_b16_e32 v4, v4, v7
+; GFX8-NEXT: v_lshrrev_b16_e32 v2, v9, v2
+; GFX8-NEXT: v_or_b32_e32 v2, v4, v2
+; GFX8-NEXT: v_xor_b32_e32 v4, -1, v8
+; GFX8-NEXT: v_lshrrev_b16_e32 v6, 1, v6
+; GFX8-NEXT: v_lshlrev_b16_e32 v0, v8, v0
+; GFX8-NEXT: v_lshrrev_b16_e32 v4, v4, v6
; GFX8-NEXT: v_or_b32_e32 v0, v0, v4
; GFX8-NEXT: v_lshlrev_b16_e32 v1, 1, v1
; GFX8-NEXT: v_lshrrev_b16_e32 v4, 15, v3
; GFX8-NEXT: v_or_b32_e32 v1, v1, v4
; GFX8-NEXT: v_mov_b32_e32 v4, -1
-; GFX8-NEXT: v_xor_b32_sdwa v4, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX8-NEXT: v_lshlrev_b16_e32 v3, 1, v3
-; GFX8-NEXT: v_and_b32_e32 v5, 15, v4
-; GFX8-NEXT: v_xor_b32_e32 v4, -1, v4
-; GFX8-NEXT: v_and_b32_e32 v4, 15, v4
+; GFX8-NEXT: v_xor_b32_sdwa v4, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: v_xor_b32_e32 v5, -1, v4
; GFX8-NEXT: v_lshrrev_b16_e32 v3, 1, v3
-; GFX8-NEXT: v_lshlrev_b16_e32 v1, v5, v1
-; GFX8-NEXT: v_lshrrev_b16_e32 v3, v4, v3
+; GFX8-NEXT: v_lshlrev_b16_e32 v1, v4, v1
+; GFX8-NEXT: v_lshrrev_b16_e32 v3, v5, v3
; GFX8-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX8-NEXT: v_or_b32_e32 v1, v1, v3
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
@@ -5233,63 +5111,55 @@
; GFX8-LABEL: v_fshr_v4i16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_lshlrev_b16_e32 v6, 1, v0
-; GFX8-NEXT: v_lshrrev_b16_e32 v7, 15, v2
-; GFX8-NEXT: v_or_b32_e32 v6, v6, v7
-; GFX8-NEXT: v_mov_b32_e32 v7, 1
-; GFX8-NEXT: v_mov_b32_e32 v8, 15
-; GFX8-NEXT: v_lshlrev_b16_sdwa v0, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-NEXT: v_lshrrev_b16_sdwa v9, v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-NEXT: v_xor_b32_e32 v4, -1, v4
-; GFX8-NEXT: v_or_b32_e32 v0, v0, v9
-; GFX8-NEXT: v_lshlrev_b16_e32 v9, 1, v2
-; GFX8-NEXT: v_lshlrev_b16_sdwa v2, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v4
-; GFX8-NEXT: v_and_b32_e32 v10, 15, v4
+; GFX8-NEXT: v_lshlrev_b16_e32 v7, 1, v0
+; GFX8-NEXT: v_lshrrev_b16_e32 v8, 15, v2
+; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v2
+; GFX8-NEXT: v_or_b32_e32 v7, v7, v8
+; GFX8-NEXT: v_mov_b32_e32 v8, 1
+; GFX8-NEXT: v_lshlrev_b16_e32 v2, 1, v2
; GFX8-NEXT: v_xor_b32_e32 v4, -1, v4
-; GFX8-NEXT: v_and_b32_e32 v4, 15, v4
-; GFX8-NEXT: v_lshrrev_b16_e32 v9, 1, v9
-; GFX8-NEXT: v_lshlrev_b16_e32 v6, v10, v6
-; GFX8-NEXT: v_lshrrev_b16_e32 v4, v4, v9
-; GFX8-NEXT: v_or_b32_e32 v4, v6, v4
-; GFX8-NEXT: v_and_b32_e32 v6, 15, v7
-; GFX8-NEXT: v_xor_b32_e32 v7, -1, v7
-; GFX8-NEXT: v_and_b32_e32 v7, 15, v7
+; GFX8-NEXT: v_lshlrev_b16_sdwa v0, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT: v_lshrrev_b16_e32 v9, 15, v6
+; GFX8-NEXT: v_xor_b32_e32 v10, -1, v4
; GFX8-NEXT: v_lshrrev_b16_e32 v2, 1, v2
-; GFX8-NEXT: v_lshlrev_b16_e32 v0, v6, v0
-; GFX8-NEXT: v_lshrrev_b16_e32 v2, v7, v2
-; GFX8-NEXT: v_or_b32_e32 v0, v0, v2
-; GFX8-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX8-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX8-NEXT: v_lshlrev_b16_e32 v2, 1, v1
-; GFX8-NEXT: v_lshrrev_b16_e32 v4, 15, v3
-; GFX8-NEXT: v_or_b32_e32 v2, v2, v4
-; GFX8-NEXT: v_mov_b32_e32 v4, 1
-; GFX8-NEXT: v_lshlrev_b16_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-NEXT: v_lshrrev_b16_sdwa v6, v8, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-NEXT: v_or_b32_e32 v1, v1, v6
-; GFX8-NEXT: v_lshlrev_b16_e32 v6, 1, v3
-; GFX8-NEXT: v_lshlrev_b16_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-NEXT: v_xor_b32_e32 v4, -1, v5
-; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v4
-; GFX8-NEXT: v_and_b32_e32 v7, 15, v4
-; GFX8-NEXT: v_xor_b32_e32 v4, -1, v4
-; GFX8-NEXT: v_and_b32_e32 v4, 15, v4
+; GFX8-NEXT: v_or_b32_e32 v0, v0, v9
+; GFX8-NEXT: v_lshlrev_b16_e32 v6, 1, v6
+; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v4
+; GFX8-NEXT: v_lshlrev_b16_e32 v4, v4, v7
+; GFX8-NEXT: v_lshrrev_b16_e32 v2, v10, v2
+; GFX8-NEXT: v_or_b32_e32 v2, v4, v2
+; GFX8-NEXT: v_xor_b32_e32 v4, -1, v9
; GFX8-NEXT: v_lshrrev_b16_e32 v6, 1, v6
-; GFX8-NEXT: v_lshlrev_b16_e32 v2, v7, v2
+; GFX8-NEXT: v_lshlrev_b16_e32 v0, v9, v0
; GFX8-NEXT: v_lshrrev_b16_e32 v4, v4, v6
-; GFX8-NEXT: v_or_b32_e32 v2, v2, v4
-; GFX8-NEXT: v_and_b32_e32 v4, 15, v5
+; GFX8-NEXT: v_or_b32_e32 v0, v0, v4
+; GFX8-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX8-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v3
+; GFX8-NEXT: v_lshlrev_b16_e32 v4, 1, v1
+; GFX8-NEXT: v_lshrrev_b16_e32 v6, 15, v3
+; GFX8-NEXT: v_lshlrev_b16_e32 v3, 1, v3
; GFX8-NEXT: v_xor_b32_e32 v5, -1, v5
-; GFX8-NEXT: v_and_b32_e32 v5, 15, v5
+; GFX8-NEXT: v_or_b32_e32 v4, v4, v6
+; GFX8-NEXT: v_lshlrev_b16_sdwa v1, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT: v_lshrrev_b16_e32 v6, 15, v2
+; GFX8-NEXT: v_xor_b32_e32 v7, -1, v5
; GFX8-NEXT: v_lshrrev_b16_e32 v3, 1, v3
-; GFX8-NEXT: v_lshlrev_b16_e32 v1, v4, v1
-; GFX8-NEXT: v_lshrrev_b16_e32 v3, v5, v3
-; GFX8-NEXT: v_or_b32_e32 v1, v1, v3
+; GFX8-NEXT: v_or_b32_e32 v1, v1, v6
+; GFX8-NEXT: v_lshlrev_b16_e32 v2, 1, v2
+; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v5
+; GFX8-NEXT: v_lshlrev_b16_e32 v4, v5, v4
+; GFX8-NEXT: v_lshrrev_b16_e32 v3, v7, v3
+; GFX8-NEXT: v_or_b32_e32 v3, v4, v3
+; GFX8-NEXT: v_xor_b32_e32 v4, -1, v6
+; GFX8-NEXT: v_lshrrev_b16_e32 v2, 1, v2
+; GFX8-NEXT: v_lshlrev_b16_e32 v1, v6, v1
+; GFX8-NEXT: v_lshrrev_b16_e32 v2, v4, v2
+; GFX8-NEXT: v_or_b32_e32 v1, v1, v2
; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX8-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fshr_v4i16:
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll
@@ -279,10 +279,10 @@
; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 4, v0
-; GFX9-NEXT: s_mov_b32 s1, 0xffff
-; GFX9-NEXT: s_and_b32 s2, s4, 0xffff
-; GFX9-NEXT: v_lshlrev_b32_e64 v2, v0, s2
-; GFX9-NEXT: v_lshlrev_b32_e64 v0, v0, s1
+; GFX9-NEXT: s_and_b32 s1, s4, 0xffff
+; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff
+; GFX9-NEXT: v_lshlrev_b32_e64 v2, v0, s1
+; GFX9-NEXT: v_lshlrev_b32_e32 v0, v0, v1
; GFX9-NEXT: v_not_b32_e32 v3, v0
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0
@@ -296,10 +296,10 @@
; GFX8-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX8-NEXT: v_and_b32_e32 v0, 1, v0
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 4, v0
-; GFX8-NEXT: s_mov_b32 s1, 0xffff
-; GFX8-NEXT: s_and_b32 s2, s4, 0xffff
-; GFX8-NEXT: v_lshlrev_b32_e64 v2, v0, s2
-; GFX8-NEXT: v_lshlrev_b32_e64 v0, v0, s1
+; GFX8-NEXT: s_and_b32 s1, s4, 0xffff
+; GFX8-NEXT: v_mov_b32_e32 v1, 0xffff
+; GFX8-NEXT: v_lshlrev_b32_e64 v2, v0, s1
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, v0, v1
; GFX8-NEXT: v_not_b32_e32 v0, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_and_b32_e32 v3, s0, v0
@@ -372,14 +372,14 @@
; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX9-NEXT: v_and_b32_e32 v1, 1, v1
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 4, v1
-; GFX9-NEXT: s_mov_b32 s1, 0xffff
-; GFX9-NEXT: v_lshlrev_b32_sdwa v2, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; GFX9-NEXT: v_lshlrev_b32_e64 v0, v1, s1
-; GFX9-NEXT: v_not_b32_e32 v3, v0
+; GFX9-NEXT: v_mov_b32_e32 v2, 0xffff
+; GFX9-NEXT: v_lshlrev_b32_sdwa v3, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX9-NEXT: v_lshlrev_b32_e32 v0, v1, v2
+; GFX9-NEXT: v_not_b32_e32 v2, v0
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_and_or_b32 v2, s0, v3, v2
+; GFX9-NEXT: v_and_or_b32 v2, s0, v2, v3
; GFX9-NEXT: global_store_dword v[0:1], v2, off
; GFX9-NEXT: s_endpgm
;
@@ -388,15 +388,15 @@
; GFX8-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX8-NEXT: v_and_b32_e32 v1, 1, v1
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 4, v1
-; GFX8-NEXT: s_mov_b32 s1, 0xffff
-; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; GFX8-NEXT: v_lshlrev_b32_e64 v0, v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v2, 0xffff
+; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, v1, v2
; GFX8-NEXT: v_not_b32_e32 v0, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_and_b32_e32 v3, s0, v0
+; GFX8-NEXT: v_and_b32_e32 v2, s0, v0
; GFX8-NEXT: v_mov_b32_e32 v0, 0
; GFX8-NEXT: v_mov_b32_e32 v1, 0
-; GFX8-NEXT: v_or_b32_e32 v2, v3, v2
+; GFX8-NEXT: v_or_b32_e32 v2, v2, v3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -462,10 +462,10 @@
; GFX9-NEXT: global_load_dword v3, v[0:1], off
; GFX9-NEXT: v_and_b32_e32 v0, 1, v2
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 4, v0
-; GFX9-NEXT: s_mov_b32 s0, 0xffff
-; GFX9-NEXT: s_and_b32 s1, s2, 0xffff
-; GFX9-NEXT: v_lshlrev_b32_e64 v2, v0, s1
-; GFX9-NEXT: v_lshlrev_b32_e64 v0, v0, s0
+; GFX9-NEXT: s_and_b32 s0, s2, 0xffff
+; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff
+; GFX9-NEXT: v_lshlrev_b32_e64 v2, v0, s0
+; GFX9-NEXT: v_lshlrev_b32_e32 v0, v0, v1
; GFX9-NEXT: v_not_b32_e32 v4, v0
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0
@@ -477,18 +477,18 @@
; GFX8-LABEL: insertelement_v_v2i16_s_v:
; GFX8: ; %bb.0:
; GFX8-NEXT: flat_load_dword v0, v[0:1]
-; GFX8-NEXT: v_and_b32_e32 v1, 1, v2
-; GFX8-NEXT: s_mov_b32 s0, 0xffff
-; GFX8-NEXT: s_and_b32 s1, s2, 0xffff
-; GFX8-NEXT: v_lshlrev_b32_e32 v1, 4, v1
-; GFX8-NEXT: v_lshlrev_b32_e64 v2, v1, s1
-; GFX8-NEXT: v_lshlrev_b32_e64 v1, v1, s0
+; GFX8-NEXT: v_and_b32_e32 v2, 1, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, 0xffff
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 4, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, v2, v1
+; GFX8-NEXT: s_and_b32 s0, s2, 0xffff
; GFX8-NEXT: v_not_b32_e32 v1, v1
+; GFX8-NEXT: v_lshlrev_b32_e64 v3, v2, s0
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_and_b32_e32 v3, v0, v1
+; GFX8-NEXT: v_and_b32_e32 v2, v0, v1
; GFX8-NEXT: v_mov_b32_e32 v0, 0
; GFX8-NEXT: v_mov_b32_e32 v1, 0
-; GFX8-NEXT: v_or_b32_e32 v2, v3, v2
+; GFX8-NEXT: v_or_b32_e32 v2, v2, v3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -644,9 +644,9 @@
; GFX9-NEXT: global_load_dword v4, v[0:1], off
; GFX9-NEXT: v_and_b32_e32 v0, 1, v3
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 4, v0
-; GFX9-NEXT: s_mov_b32 s0, 0xffff
+; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff
; GFX9-NEXT: v_lshlrev_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; GFX9-NEXT: v_lshlrev_b32_e64 v0, v0, s0
+; GFX9-NEXT: v_lshlrev_b32_e32 v0, v0, v1
; GFX9-NEXT: v_not_b32_e32 v3, v0
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0
@@ -658,12 +658,12 @@
; GFX8-LABEL: insertelement_v_v2i16_v_v:
; GFX8: ; %bb.0:
; GFX8-NEXT: flat_load_dword v0, v[0:1]
-; GFX8-NEXT: v_and_b32_e32 v1, 1, v3
-; GFX8-NEXT: s_mov_b32 s0, 0xffff
-; GFX8-NEXT: v_lshlrev_b32_e32 v1, 4, v1
-; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; GFX8-NEXT: v_lshlrev_b32_e64 v1, v1, s0
+; GFX8-NEXT: v_and_b32_e32 v3, 1, v3
+; GFX8-NEXT: v_mov_b32_e32 v1, 0xffff
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 4, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, v3, v1
; GFX8-NEXT: v_not_b32_e32 v1, v1
+; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_and_b32_e32 v3, v0, v1
; GFX8-NEXT: v_mov_b32_e32 v0, 0
@@ -1048,11 +1048,11 @@
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NEXT: v_mov_b32_e32 v3, s1
-; GFX9-NEXT: s_mov_b32 s2, 0xffff
-; GFX9-NEXT: s_and_b32 s3, s4, 0xffff
+; GFX9-NEXT: s_and_b32 s2, s4, 0xffff
+; GFX9-NEXT: v_mov_b32_e32 v4, 0xffff
; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
-; GFX9-NEXT: v_lshlrev_b32_e64 v3, v0, s3
-; GFX9-NEXT: v_lshlrev_b32_e64 v0, v0, s2
+; GFX9-NEXT: v_lshlrev_b32_e64 v3, v0, s2
+; GFX9-NEXT: v_lshlrev_b32_e32 v0, v0, v4
; GFX9-NEXT: v_not_b32_e32 v0, v0
; GFX9-NEXT: v_and_or_b32 v4, v1, v0, v3
; GFX9-NEXT: v_mov_b32_e32 v0, s0
@@ -1075,11 +1075,11 @@
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s0
; GFX8-NEXT: v_mov_b32_e32 v3, s1
-; GFX8-NEXT: s_mov_b32 s2, 0xffff
-; GFX8-NEXT: s_and_b32 s3, s4, 0xffff
+; GFX8-NEXT: s_and_b32 s2, s4, 0xffff
+; GFX8-NEXT: v_mov_b32_e32 v4, 0xffff
; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
-; GFX8-NEXT: v_lshlrev_b32_e64 v3, v0, s3
-; GFX8-NEXT: v_lshlrev_b32_e64 v0, v0, s2
+; GFX8-NEXT: v_lshlrev_b32_e64 v3, v0, s2
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, v0, v4
; GFX8-NEXT: v_not_b32_e32 v0, v0
; GFX8-NEXT: v_and_b32_e32 v0, v1, v0
; GFX8-NEXT: v_or_b32_e32 v4, v0, v3
@@ -1182,16 +1182,16 @@
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; GFX9-NEXT: v_lshrrev_b32_e32 v2, 1, v1
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2
; GFX9-NEXT: v_and_b32_e32 v1, 1, v1
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 4, v1
-; GFX9-NEXT: s_mov_b32 s2, 0xffff
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v3, s0
; GFX9-NEXT: v_mov_b32_e32 v4, s1
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2
-; GFX9-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; GFX9-NEXT: v_lshlrev_b32_e64 v1, v1, s2
; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc
+; GFX9-NEXT: v_mov_b32_e32 v4, 0xffff
+; GFX9-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX9-NEXT: v_lshlrev_b32_e32 v1, v1, v4
; GFX9-NEXT: v_not_b32_e32 v1, v1
; GFX9-NEXT: v_and_or_b32 v4, v3, v1, v0
; GFX9-NEXT: v_mov_b32_e32 v0, s0
@@ -1208,16 +1208,16 @@
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; GFX8-NEXT: v_lshrrev_b32_e32 v2, 1, v1
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2
; GFX8-NEXT: v_and_b32_e32 v1, 1, v1
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 4, v1
-; GFX8-NEXT: s_mov_b32 s2, 0xffff
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v3, s0
; GFX8-NEXT: v_mov_b32_e32 v4, s1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2
-; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; GFX8-NEXT: v_lshlrev_b32_e64 v1, v1, s2
; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc
+; GFX8-NEXT: v_mov_b32_e32 v4, 0xffff
+; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, v1, v4
; GFX8-NEXT: v_not_b32_e32 v1, v1
; GFX8-NEXT: v_and_b32_e32 v1, v3, v1
; GFX8-NEXT: v_or_b32_e32 v4, v1, v0
@@ -1318,21 +1318,21 @@
; GFX9-LABEL: insertelement_v_v4i16_s_v:
; GFX9: ; %bb.0:
; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
-; GFX9-NEXT: v_lshrrev_b32_e32 v5, 1, v2
+; GFX9-NEXT: v_lshrrev_b32_e32 v6, 1, v2
; GFX9-NEXT: v_and_b32_e32 v2, 1, v2
-; GFX9-NEXT: s_mov_b32 s0, 0xffff
-; GFX9-NEXT: s_and_b32 s1, s2, 0xffff
+; GFX9-NEXT: v_mov_b32_e32 v5, 0xffff
+; GFX9-NEXT: s_and_b32 s0, s2, 0xffff
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 4, v2
-; GFX9-NEXT: v_lshlrev_b32_e64 v6, v2, s1
-; GFX9-NEXT: v_lshlrev_b32_e64 v2, v2, s0
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v5
+; GFX9-NEXT: v_lshlrev_b32_e64 v7, v2, s0
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, v2, v5
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6
; GFX9-NEXT: v_not_b32_e32 v2, v2
; GFX9-NEXT: v_mov_b32_e32 v3, 0
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v5
+; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v6
; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_cndmask_b32_e32 v7, v0, v1, vcc
-; GFX9-NEXT: v_and_or_b32 v2, v7, v2, v6
+; GFX9-NEXT: v_cndmask_b32_e32 v5, v0, v1, vcc
+; GFX9-NEXT: v_and_or_b32 v2, v5, v2, v7
; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1]
; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
; GFX9-NEXT: global_store_dwordx2 v[3:4], v[0:1], off
@@ -1341,22 +1341,22 @@
; GFX8-LABEL: insertelement_v_v4i16_s_v:
; GFX8: ; %bb.0:
; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
-; GFX8-NEXT: v_lshrrev_b32_e32 v5, 1, v2
+; GFX8-NEXT: v_lshrrev_b32_e32 v6, 1, v2
; GFX8-NEXT: v_and_b32_e32 v2, 1, v2
-; GFX8-NEXT: s_mov_b32 s0, 0xffff
-; GFX8-NEXT: s_and_b32 s1, s2, 0xffff
+; GFX8-NEXT: v_mov_b32_e32 v5, 0xffff
+; GFX8-NEXT: s_and_b32 s0, s2, 0xffff
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 4, v2
-; GFX8-NEXT: v_lshlrev_b32_e64 v6, v2, s1
-; GFX8-NEXT: v_lshlrev_b32_e64 v2, v2, s0
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v5
+; GFX8-NEXT: v_lshlrev_b32_e64 v7, v2, s0
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, v2, v5
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6
; GFX8-NEXT: v_not_b32_e32 v2, v2
; GFX8-NEXT: v_mov_b32_e32 v3, 0
-; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v5
+; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v6
; GFX8-NEXT: v_mov_b32_e32 v4, 0
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_cndmask_b32_e32 v7, v0, v1, vcc
-; GFX8-NEXT: v_and_b32_e32 v2, v7, v2
-; GFX8-NEXT: v_or_b32_e32 v2, v2, v6
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v0, v1, vcc
+; GFX8-NEXT: v_and_b32_e32 v2, v5, v2
+; GFX8-NEXT: v_or_b32_e32 v2, v2, v7
; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1]
; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
; GFX8-NEXT: flat_store_dwordx2 v[3:4], v[0:1]
@@ -1561,20 +1561,20 @@
; GFX9-LABEL: insertelement_v_v4i16_v_v:
; GFX9: ; %bb.0:
; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
-; GFX9-NEXT: v_lshrrev_b32_e32 v6, 1, v3
+; GFX9-NEXT: v_lshrrev_b32_e32 v7, 1, v3
; GFX9-NEXT: v_and_b32_e32 v3, 1, v3
-; GFX9-NEXT: s_mov_b32 s0, 0xffff
+; GFX9-NEXT: v_mov_b32_e32 v6, 0xffff
; GFX9-NEXT: v_lshlrev_b32_e32 v3, 4, v3
; GFX9-NEXT: v_lshlrev_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; GFX9-NEXT: v_lshlrev_b32_e64 v3, v3, s0
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6
+; GFX9-NEXT: v_lshlrev_b32_e32 v3, v3, v6
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7
; GFX9-NEXT: v_not_b32_e32 v3, v3
; GFX9-NEXT: v_mov_b32_e32 v4, 0
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v6
+; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v7
; GFX9-NEXT: v_mov_b32_e32 v5, 0
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_cndmask_b32_e32 v7, v0, v1, vcc
-; GFX9-NEXT: v_and_or_b32 v2, v7, v3, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v6, v0, v1, vcc
+; GFX9-NEXT: v_and_or_b32 v2, v6, v3, v2
; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1]
; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
; GFX9-NEXT: global_store_dwordx2 v[4:5], v[0:1], off
@@ -1583,20 +1583,20 @@
; GFX8-LABEL: insertelement_v_v4i16_v_v:
; GFX8: ; %bb.0:
; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
-; GFX8-NEXT: v_lshrrev_b32_e32 v6, 1, v3
+; GFX8-NEXT: v_lshrrev_b32_e32 v7, 1, v3
; GFX8-NEXT: v_and_b32_e32 v3, 1, v3
-; GFX8-NEXT: s_mov_b32 s0, 0xffff
+; GFX8-NEXT: v_mov_b32_e32 v6, 0xffff
; GFX8-NEXT: v_lshlrev_b32_e32 v3, 4, v3
; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; GFX8-NEXT: v_lshlrev_b32_e64 v3, v3, s0
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, v3, v6
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7
; GFX8-NEXT: v_not_b32_e32 v3, v3
; GFX8-NEXT: v_mov_b32_e32 v4, 0
-; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v6
+; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v7
; GFX8-NEXT: v_mov_b32_e32 v5, 0
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_cndmask_b32_e32 v7, v0, v1, vcc
-; GFX8-NEXT: v_and_b32_e32 v3, v7, v3
+; GFX8-NEXT: v_cndmask_b32_e32 v6, v0, v1, vcc
+; GFX8-NEXT: v_and_b32_e32 v3, v6, v3
; GFX8-NEXT: v_or_b32_e32 v2, v3, v2
; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1]
; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
@@ -2205,21 +2205,21 @@
; GFX9-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x0
; GFX9-NEXT: v_lshrrev_b32_e32 v4, 1, v0
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4
-; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v4
+; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, s8
; GFX9-NEXT: v_mov_b32_e32 v2, s9
; GFX9-NEXT: v_mov_b32_e32 v3, s10
; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1]
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 4, v0
-; GFX9-NEXT: s_mov_b32 s5, 0xffff
; GFX9-NEXT: s_and_b32 s4, s4, 0xffff
+; GFX9-NEXT: v_mov_b32_e32 v3, 0xffff
; GFX9-NEXT: v_mov_b32_e32 v5, s11
-; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1]
; GFX9-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v4
; GFX9-NEXT: v_lshlrev_b32_e64 v2, v0, s4
-; GFX9-NEXT: v_lshlrev_b32_e64 v0, v0, s5
+; GFX9-NEXT: v_lshlrev_b32_e32 v0, v0, v3
; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[2:3]
; GFX9-NEXT: v_not_b32_e32 v0, v0
; GFX9-NEXT: v_and_or_b32 v6, v1, v0, v2
@@ -2242,21 +2242,21 @@
; GFX8-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x0
; GFX8-NEXT: v_lshrrev_b32_e32 v4, 1, v0
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4
-; GFX8-NEXT: v_and_b32_e32 v0, 1, v0
; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v4
+; GFX8-NEXT: v_and_b32_e32 v0, 1, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s8
; GFX8-NEXT: v_mov_b32_e32 v2, s9
; GFX8-NEXT: v_mov_b32_e32 v3, s10
; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1]
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 4, v0
-; GFX8-NEXT: s_mov_b32 s5, 0xffff
; GFX8-NEXT: s_and_b32 s4, s4, 0xffff
+; GFX8-NEXT: v_mov_b32_e32 v3, 0xffff
; GFX8-NEXT: v_mov_b32_e32 v5, s11
-; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1]
; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v4
; GFX8-NEXT: v_lshlrev_b32_e64 v2, v0, s4
-; GFX8-NEXT: v_lshlrev_b32_e64 v0, v0, s5
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, v0, v3
; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[2:3]
; GFX8-NEXT: v_not_b32_e32 v0, v0
; GFX8-NEXT: v_and_b32_e32 v0, v1, v0
@@ -2400,12 +2400,12 @@
; GFX9-NEXT: v_mov_b32_e32 v5, s6
; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 4, v1
-; GFX9-NEXT: s_mov_b32 s8, 0xffff
+; GFX9-NEXT: v_mov_b32_e32 v3, 0xffff
; GFX9-NEXT: v_mov_b32_e32 v6, s7
; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[0:1]
; GFX9-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v4
; GFX9-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; GFX9-NEXT: v_lshlrev_b32_e64 v1, v1, s8
+; GFX9-NEXT: v_lshlrev_b32_e32 v1, v1, v3
; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[2:3]
; GFX9-NEXT: v_not_b32_e32 v1, v1
; GFX9-NEXT: v_and_or_b32 v6, v2, v1, v0
@@ -2436,12 +2436,12 @@
; GFX8-NEXT: v_mov_b32_e32 v5, s6
; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 4, v1
-; GFX8-NEXT: s_mov_b32 s8, 0xffff
+; GFX8-NEXT: v_mov_b32_e32 v3, 0xffff
; GFX8-NEXT: v_mov_b32_e32 v6, s7
; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[0:1]
; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v4
; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; GFX8-NEXT: v_lshlrev_b32_e64 v1, v1, s8
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, v1, v3
; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[2:3]
; GFX8-NEXT: v_not_b32_e32 v1, v1
; GFX8-NEXT: v_and_b32_e32 v1, v2, v1
@@ -2577,25 +2577,25 @@
; GFX9-LABEL: insertelement_v_v8i16_s_v:
; GFX9: ; %bb.0:
; GFX9-NEXT: global_load_dwordx4 v[3:6], v[0:1], off
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 1, v2
-; GFX9-NEXT: v_and_b32_e32 v1, 1, v2
-; GFX9-NEXT: s_mov_b32 s0, 0xffff
-; GFX9-NEXT: s_and_b32 s1, s2, 0xffff
-; GFX9-NEXT: v_lshlrev_b32_e32 v1, 4, v1
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX9-NEXT: v_lshlrev_b32_e64 v2, v1, s1
-; GFX9-NEXT: v_lshlrev_b32_e64 v1, v1, s0
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v0
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v0
-; GFX9-NEXT: v_not_b32_e32 v1, v1
+; GFX9-NEXT: v_lshrrev_b32_e32 v1, 1, v2
+; GFX9-NEXT: v_and_b32_e32 v2, 1, v2
+; GFX9-NEXT: v_mov_b32_e32 v0, 0xffff
+; GFX9-NEXT: s_and_b32 s0, s2, 0xffff
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 4, v2
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
+; GFX9-NEXT: v_lshlrev_b32_e64 v9, v2, s0
+; GFX9-NEXT: v_lshlrev_b32_e32 v0, v2, v0
+; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v1
+; GFX9-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v1
+; GFX9-NEXT: v_not_b32_e32 v0, v0
; GFX9-NEXT: v_mov_b32_e32 v7, 0
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0
+; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v1
; GFX9-NEXT: v_mov_b32_e32 v8, 0
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_cndmask_b32_e32 v9, v3, v4, vcc
-; GFX9-NEXT: v_cndmask_b32_e64 v9, v9, v5, s[0:1]
-; GFX9-NEXT: v_cndmask_b32_e64 v9, v9, v6, s[2:3]
-; GFX9-NEXT: v_and_or_b32 v9, v9, v1, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[0:1]
+; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[2:3]
+; GFX9-NEXT: v_and_or_b32 v9, v2, v0, v9
; GFX9-NEXT: v_cndmask_b32_e64 v0, v3, v9, s[4:5]
; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v9, vcc
; GFX9-NEXT: v_cndmask_b32_e64 v2, v5, v9, s[0:1]
@@ -2606,26 +2606,26 @@
; GFX8-LABEL: insertelement_v_v8i16_s_v:
; GFX8: ; %bb.0:
; GFX8-NEXT: flat_load_dwordx4 v[3:6], v[0:1]
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 1, v2
-; GFX8-NEXT: v_and_b32_e32 v1, 1, v2
-; GFX8-NEXT: s_mov_b32 s0, 0xffff
-; GFX8-NEXT: s_and_b32 s1, s2, 0xffff
-; GFX8-NEXT: v_lshlrev_b32_e32 v1, 4, v1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX8-NEXT: v_lshlrev_b32_e64 v2, v1, s1
-; GFX8-NEXT: v_lshlrev_b32_e64 v1, v1, s0
-; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v0
-; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v0
-; GFX8-NEXT: v_not_b32_e32 v1, v1
+; GFX8-NEXT: v_lshrrev_b32_e32 v1, 1, v2
+; GFX8-NEXT: v_and_b32_e32 v2, 1, v2
+; GFX8-NEXT: v_mov_b32_e32 v0, 0xffff
+; GFX8-NEXT: s_and_b32 s0, s2, 0xffff
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 4, v2
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
+; GFX8-NEXT: v_lshlrev_b32_e64 v9, v2, s0
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, v2, v0
+; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v1
+; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v1
+; GFX8-NEXT: v_not_b32_e32 v0, v0
; GFX8-NEXT: v_mov_b32_e32 v7, 0
-; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0
+; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v1
; GFX8-NEXT: v_mov_b32_e32 v8, 0
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_cndmask_b32_e32 v9, v3, v4, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v9, v9, v5, s[0:1]
-; GFX8-NEXT: v_cndmask_b32_e64 v9, v9, v6, s[2:3]
-; GFX8-NEXT: v_and_b32_e32 v1, v9, v1
-; GFX8-NEXT: v_or_b32_e32 v9, v1, v2
+; GFX8-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[0:1]
+; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[2:3]
+; GFX8-NEXT: v_and_b32_e32 v0, v2, v0
+; GFX8-NEXT: v_or_b32_e32 v9, v0, v9
; GFX8-NEXT: v_cndmask_b32_e64 v0, v3, v9, s[4:5]
; GFX8-NEXT: v_cndmask_b32_e32 v1, v4, v9, vcc
; GFX8-NEXT: v_cndmask_b32_e64 v2, v5, v9, s[0:1]
@@ -2885,24 +2885,24 @@
; GFX9-LABEL: insertelement_v_v8i16_v_v:
; GFX9: ; %bb.0:
; GFX9-NEXT: global_load_dwordx4 v[4:7], v[0:1], off
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 1, v3
-; GFX9-NEXT: v_and_b32_e32 v1, 1, v3
-; GFX9-NEXT: s_mov_b32 s0, 0xffff
-; GFX9-NEXT: v_lshlrev_b32_e32 v1, 4, v1
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX9-NEXT: v_lshlrev_b32_sdwa v2, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; GFX9-NEXT: v_lshlrev_b32_e64 v1, v1, s0
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v0
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v0
-; GFX9-NEXT: v_not_b32_e32 v1, v1
+; GFX9-NEXT: v_lshrrev_b32_e32 v1, 1, v3
+; GFX9-NEXT: v_and_b32_e32 v3, 1, v3
+; GFX9-NEXT: v_mov_b32_e32 v0, 0xffff
+; GFX9-NEXT: v_lshlrev_b32_e32 v3, 4, v3
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
+; GFX9-NEXT: v_lshlrev_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX9-NEXT: v_lshlrev_b32_e32 v0, v3, v0
+; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v1
+; GFX9-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v1
+; GFX9-NEXT: v_not_b32_e32 v0, v0
; GFX9-NEXT: v_mov_b32_e32 v8, 0
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0
+; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v1
; GFX9-NEXT: v_mov_b32_e32 v9, 0
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v6, s[0:1]
; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v7, s[2:3]
-; GFX9-NEXT: v_and_or_b32 v3, v3, v1, v2
+; GFX9-NEXT: v_and_or_b32 v3, v3, v0, v2
; GFX9-NEXT: v_cndmask_b32_e64 v0, v4, v3, s[4:5]
; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc
; GFX9-NEXT: v_cndmask_b32_e64 v2, v6, v3, s[0:1]
@@ -2913,25 +2913,25 @@
; GFX8-LABEL: insertelement_v_v8i16_v_v:
; GFX8: ; %bb.0:
; GFX8-NEXT: flat_load_dwordx4 v[4:7], v[0:1]
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 1, v3
-; GFX8-NEXT: v_and_b32_e32 v1, 1, v3
-; GFX8-NEXT: s_mov_b32 s0, 0xffff
-; GFX8-NEXT: v_lshlrev_b32_e32 v1, 4, v1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; GFX8-NEXT: v_lshlrev_b32_e64 v1, v1, s0
-; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v0
-; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v0
-; GFX8-NEXT: v_not_b32_e32 v1, v1
+; GFX8-NEXT: v_lshrrev_b32_e32 v1, 1, v3
+; GFX8-NEXT: v_and_b32_e32 v3, 1, v3
+; GFX8-NEXT: v_mov_b32_e32 v0, 0xffff
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 4, v3
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
+; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, v3, v0
+; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v1
+; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v1
+; GFX8-NEXT: v_not_b32_e32 v0, v0
; GFX8-NEXT: v_mov_b32_e32 v8, 0
-; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0
+; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v1
; GFX8-NEXT: v_mov_b32_e32 v9, 0
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v6, s[0:1]
; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v7, s[2:3]
-; GFX8-NEXT: v_and_b32_e32 v1, v3, v1
-; GFX8-NEXT: v_or_b32_e32 v3, v1, v2
+; GFX8-NEXT: v_and_b32_e32 v0, v3, v0
+; GFX8-NEXT: v_or_b32_e32 v3, v0, v2
; GFX8-NEXT: v_cndmask_b32_e64 v0, v4, v3, s[4:5]
; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc
; GFX8-NEXT: v_cndmask_b32_e64 v2, v6, v3, s[0:1]
@@ -3578,13 +3578,13 @@
; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v6, s[6:7]
; GFX9-NEXT: v_cmp_eq_u32_e64 s[8:9], 6, v8
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 4, v0
-; GFX9-NEXT: s_mov_b32 s5, 0xffff
; GFX9-NEXT: s_and_b32 s4, s4, 0xffff
+; GFX9-NEXT: v_mov_b32_e32 v3, 0xffff
; GFX9-NEXT: v_mov_b32_e32 v9, s23
; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v7, s[8:9]
; GFX9-NEXT: v_cmp_eq_u32_e64 s[10:11], 7, v8
; GFX9-NEXT: v_lshlrev_b32_e64 v2, v0, s4
-; GFX9-NEXT: v_lshlrev_b32_e64 v0, v0, s5
+; GFX9-NEXT: v_lshlrev_b32_e32 v0, v0, v3
; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v9, s[10:11]
; GFX9-NEXT: v_not_b32_e32 v0, v0
; GFX9-NEXT: v_and_or_b32 v9, v1, v0, v2
@@ -3638,13 +3638,13 @@
; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v6, s[6:7]
; GFX8-NEXT: v_cmp_eq_u32_e64 s[8:9], 6, v8
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 4, v0
-; GFX8-NEXT: s_mov_b32 s5, 0xffff
; GFX8-NEXT: s_and_b32 s4, s4, 0xffff
+; GFX8-NEXT: v_mov_b32_e32 v3, 0xffff
; GFX8-NEXT: v_mov_b32_e32 v9, s23
; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v7, s[8:9]
; GFX8-NEXT: v_cmp_eq_u32_e64 s[10:11], 7, v8
; GFX8-NEXT: v_lshlrev_b32_e64 v2, v0, s4
-; GFX8-NEXT: v_lshlrev_b32_e64 v0, v0, s5
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, v0, v3
; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v9, s[10:11]
; GFX8-NEXT: v_not_b32_e32 v0, v0
; GFX8-NEXT: v_and_b32_e32 v0, v1, v0
@@ -3872,12 +3872,12 @@
; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v7, s[6:7]
; GFX9-NEXT: v_cmp_eq_u32_e64 s[8:9], 6, v8
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 4, v1
-; GFX9-NEXT: s_mov_b32 s20, 0xffff
+; GFX9-NEXT: v_mov_b32_e32 v3, 0xffff
; GFX9-NEXT: v_mov_b32_e32 v10, s19
; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v9, s[8:9]
; GFX9-NEXT: v_cmp_eq_u32_e64 s[10:11], 7, v8
; GFX9-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; GFX9-NEXT: v_lshlrev_b32_e64 v1, v1, s20
+; GFX9-NEXT: v_lshlrev_b32_e32 v1, v1, v3
; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v10, s[10:11]
; GFX9-NEXT: v_not_b32_e32 v1, v1
; GFX9-NEXT: v_and_or_b32 v9, v2, v1, v0
@@ -3931,12 +3931,12 @@
; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v7, s[6:7]
; GFX8-NEXT: v_cmp_eq_u32_e64 s[8:9], 6, v8
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 4, v1
-; GFX8-NEXT: s_mov_b32 s20, 0xffff
+; GFX8-NEXT: v_mov_b32_e32 v3, 0xffff
; GFX8-NEXT: v_mov_b32_e32 v10, s19
; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v9, s[8:9]
; GFX8-NEXT: v_cmp_eq_u32_e64 s[10:11], 7, v8
; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; GFX8-NEXT: v_lshlrev_b32_e64 v1, v1, s20
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, v1, v3
; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v10, s[10:11]
; GFX8-NEXT: v_not_b32_e32 v1, v1
; GFX8-NEXT: v_and_b32_e32 v1, v2, v1
@@ -4145,32 +4145,32 @@
; GFX9: ; %bb.0:
; GFX9-NEXT: global_load_dwordx4 v[3:6], v[0:1], off
; GFX9-NEXT: global_load_dwordx4 v[7:10], v[0:1], off offset:16
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 1, v2
-; GFX9-NEXT: v_and_b32_e32 v1, 1, v2
-; GFX9-NEXT: s_mov_b32 s0, 0xffff
-; GFX9-NEXT: s_and_b32 s1, s2, 0xffff
-; GFX9-NEXT: v_lshlrev_b32_e32 v1, 4, v1
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX9-NEXT: v_lshlrev_b32_e64 v2, v1, s1
-; GFX9-NEXT: v_lshlrev_b32_e64 v1, v1, s0
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v0
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v0
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 4, v0
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[6:7], 5, v0
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[8:9], 6, v0
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[10:11], 7, v0
-; GFX9-NEXT: v_not_b32_e32 v1, v1
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[12:13], 0, v0
+; GFX9-NEXT: v_lshrrev_b32_e32 v1, 1, v2
+; GFX9-NEXT: v_and_b32_e32 v2, 1, v2
+; GFX9-NEXT: v_mov_b32_e32 v0, 0xffff
+; GFX9-NEXT: s_and_b32 s0, s2, 0xffff
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 4, v2
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
+; GFX9-NEXT: v_lshlrev_b32_e64 v11, v2, s0
+; GFX9-NEXT: v_lshlrev_b32_e32 v0, v2, v0
+; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v1
+; GFX9-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v1
+; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 4, v1
+; GFX9-NEXT: v_cmp_eq_u32_e64 s[6:7], 5, v1
+; GFX9-NEXT: v_cmp_eq_u32_e64 s[8:9], 6, v1
+; GFX9-NEXT: v_cmp_eq_u32_e64 s[10:11], 7, v1
+; GFX9-NEXT: v_not_b32_e32 v0, v0
+; GFX9-NEXT: v_cmp_eq_u32_e64 s[12:13], 0, v1
; GFX9-NEXT: s_waitcnt vmcnt(1)
-; GFX9-NEXT: v_cndmask_b32_e32 v11, v3, v4, vcc
-; GFX9-NEXT: v_cndmask_b32_e64 v11, v11, v5, s[0:1]
-; GFX9-NEXT: v_cndmask_b32_e64 v11, v11, v6, s[2:3]
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[0:1]
+; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[2:3]
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_cndmask_b32_e64 v11, v11, v7, s[4:5]
-; GFX9-NEXT: v_cndmask_b32_e64 v11, v11, v8, s[6:7]
-; GFX9-NEXT: v_cndmask_b32_e64 v11, v11, v9, s[8:9]
-; GFX9-NEXT: v_cndmask_b32_e64 v11, v11, v10, s[10:11]
-; GFX9-NEXT: v_and_or_b32 v11, v11, v1, v2
+; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v7, s[4:5]
+; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v8, s[6:7]
+; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v9, s[8:9]
+; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v10, s[10:11]
+; GFX9-NEXT: v_and_or_b32 v11, v2, v0, v11
; GFX9-NEXT: v_cndmask_b32_e64 v0, v3, v11, s[12:13]
; GFX9-NEXT: v_cndmask_b32_e64 v2, v5, v11, s[0:1]
; GFX9-NEXT: v_cndmask_b32_e64 v3, v6, v11, s[2:3]
@@ -4193,33 +4193,33 @@
; GFX8-NEXT: v_add_u32_e32 v0, vcc, 16, v0
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dwordx4 v[7:10], v[0:1]
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 1, v2
-; GFX8-NEXT: v_and_b32_e32 v1, 1, v2
-; GFX8-NEXT: s_mov_b32 s0, 0xffff
-; GFX8-NEXT: s_and_b32 s1, s2, 0xffff
-; GFX8-NEXT: v_lshlrev_b32_e32 v1, 4, v1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX8-NEXT: v_lshlrev_b32_e64 v2, v1, s1
-; GFX8-NEXT: v_lshlrev_b32_e64 v1, v1, s0
-; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v0
-; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v0
-; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 4, v0
-; GFX8-NEXT: v_cmp_eq_u32_e64 s[6:7], 5, v0
-; GFX8-NEXT: v_cmp_eq_u32_e64 s[8:9], 6, v0
-; GFX8-NEXT: v_cmp_eq_u32_e64 s[10:11], 7, v0
-; GFX8-NEXT: v_not_b32_e32 v1, v1
-; GFX8-NEXT: v_cmp_eq_u32_e64 s[12:13], 0, v0
+; GFX8-NEXT: v_lshrrev_b32_e32 v1, 1, v2
+; GFX8-NEXT: v_and_b32_e32 v2, 1, v2
+; GFX8-NEXT: v_mov_b32_e32 v0, 0xffff
+; GFX8-NEXT: s_and_b32 s0, s2, 0xffff
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 4, v2
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
+; GFX8-NEXT: v_lshlrev_b32_e64 v11, v2, s0
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, v2, v0
+; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v1
+; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v1
+; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 4, v1
+; GFX8-NEXT: v_cmp_eq_u32_e64 s[6:7], 5, v1
+; GFX8-NEXT: v_cmp_eq_u32_e64 s[8:9], 6, v1
+; GFX8-NEXT: v_cmp_eq_u32_e64 s[10:11], 7, v1
+; GFX8-NEXT: v_not_b32_e32 v0, v0
+; GFX8-NEXT: v_cmp_eq_u32_e64 s[12:13], 0, v1
; GFX8-NEXT: s_waitcnt vmcnt(1)
-; GFX8-NEXT: v_cndmask_b32_e32 v11, v3, v4, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v11, v11, v5, s[0:1]
-; GFX8-NEXT: v_cndmask_b32_e64 v11, v11, v6, s[2:3]
+; GFX8-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[0:1]
+; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[2:3]
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_cndmask_b32_e64 v11, v11, v7, s[4:5]
-; GFX8-NEXT: v_cndmask_b32_e64 v11, v11, v8, s[6:7]
-; GFX8-NEXT: v_cndmask_b32_e64 v11, v11, v9, s[8:9]
-; GFX8-NEXT: v_cndmask_b32_e64 v11, v11, v10, s[10:11]
-; GFX8-NEXT: v_and_b32_e32 v1, v11, v1
-; GFX8-NEXT: v_or_b32_e32 v11, v1, v2
+; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v7, s[4:5]
+; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v8, s[6:7]
+; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v9, s[8:9]
+; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v10, s[10:11]
+; GFX8-NEXT: v_and_b32_e32 v0, v2, v0
+; GFX8-NEXT: v_or_b32_e32 v11, v0, v11
; GFX8-NEXT: v_cndmask_b32_e64 v0, v3, v11, s[12:13]
; GFX8-NEXT: v_cndmask_b32_e64 v2, v5, v11, s[0:1]
; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v11, s[2:3]
@@ -4542,21 +4542,21 @@
; GFX9: ; %bb.0:
; GFX9-NEXT: global_load_dwordx4 v[4:7], v[0:1], off
; GFX9-NEXT: global_load_dwordx4 v[8:11], v[0:1], off offset:16
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 1, v3
-; GFX9-NEXT: v_and_b32_e32 v1, 1, v3
-; GFX9-NEXT: s_mov_b32 s0, 0xffff
-; GFX9-NEXT: v_lshlrev_b32_e32 v1, 4, v1
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX9-NEXT: v_lshlrev_b32_sdwa v2, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; GFX9-NEXT: v_lshlrev_b32_e64 v1, v1, s0
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v0
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v0
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 4, v0
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[6:7], 5, v0
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[8:9], 6, v0
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[10:11], 7, v0
-; GFX9-NEXT: v_not_b32_e32 v1, v1
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[12:13], 0, v0
+; GFX9-NEXT: v_lshrrev_b32_e32 v1, 1, v3
+; GFX9-NEXT: v_and_b32_e32 v3, 1, v3
+; GFX9-NEXT: v_mov_b32_e32 v0, 0xffff
+; GFX9-NEXT: v_lshlrev_b32_e32 v3, 4, v3
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
+; GFX9-NEXT: v_lshlrev_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX9-NEXT: v_lshlrev_b32_e32 v0, v3, v0
+; GFX9-NEXT:
v_cmp_eq_u32_e64 s[0:1], 2, v1 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v1 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 4, v1 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[6:7], 5, v1 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[8:9], 6, v1 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[10:11], 7, v1 +; GFX9-NEXT: v_not_b32_e32 v0, v0 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[12:13], 0, v1 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v6, s[0:1] @@ -4566,7 +4566,7 @@ ; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v9, s[6:7] ; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v10, s[8:9] ; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v11, s[10:11] -; GFX9-NEXT: v_and_or_b32 v12, v3, v1, v2 +; GFX9-NEXT: v_and_or_b32 v12, v3, v0, v2 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v4, v12, s[12:13] ; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v12, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v4, v8, v12, s[4:5] @@ -4589,21 +4589,21 @@ ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 16, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dwordx4 v[8:11], v[0:1] -; GFX8-NEXT: v_lshrrev_b32_e32 v0, 1, v3 -; GFX8-NEXT: v_and_b32_e32 v1, 1, v3 -; GFX8-NEXT: s_mov_b32 s0, 0xffff -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 4, v1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX8-NEXT: v_lshlrev_b32_e64 v1, v1, s0 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v0 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v0 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 4, v0 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[6:7], 5, v0 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[8:9], 6, v0 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[10:11], 7, v0 -; GFX8-NEXT: v_not_b32_e32 v1, v1 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[12:13], 0, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v1, 1, v3 +; GFX8-NEXT: v_and_b32_e32 v3, 1, v3 +; GFX8-NEXT: v_mov_b32_e32 v0, 0xffff +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 4, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 +; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, v3, v0 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v1 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v1 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 4, v1 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[6:7], 5, v1 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[8:9], 6, v1 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[10:11], 7, v1 +; GFX8-NEXT: v_not_b32_e32 v0, v0 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[12:13], 0, v1 ; GFX8-NEXT: s_waitcnt vmcnt(1) ; GFX8-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v6, s[0:1] @@ -4613,8 +4613,8 @@ ; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v9, s[6:7] ; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v10, s[8:9] ; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v11, s[10:11] -; GFX8-NEXT: v_and_b32_e32 v1, v3, v1 -; GFX8-NEXT: v_or_b32_e32 v12, v1, v2 +; GFX8-NEXT: v_and_b32_e32 v0, v3, v0 +; GFX8-NEXT: v_or_b32_e32 v12, v0, v2 ; GFX8-NEXT: v_cndmask_b32_e64 v0, v4, v12, s[12:13] ; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v12, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v4, v8, v12, s[4:5] Index: llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll @@ -71,17 +71,17 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s5, 1 -; GFX10-NEXT: v_cmp_eq_u32_e64 s1, s5, 0 +; GFX10-NEXT: v_mov_b32_e32 v2, 0xff ; GFX10-NEXT: global_load_ushort 
v0, v0, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v1, 8, v0 -; GFX10-NEXT: v_cndmask_b32_e64 v2, v0, s4, s1 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s4, s0 -; GFX10-NEXT: s_movk_i32 s0, 0xff -; GFX10-NEXT: v_and_b32_sdwa v3, v1, s0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s5, 0 +; GFX10-NEXT: v_and_b32_sdwa v2, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_cndmask_b32_e64 v3, v0, s4, s0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-NEXT: global_store_short v[0:1], v2, off ; GFX10-NEXT: s_endpgm ; @@ -175,16 +175,16 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: global_load_ushort v0, v[0:1], off ; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s3, 1 -; GFX10-NEXT: v_cmp_eq_u32_e64 s1, s3, 0 +; GFX10-NEXT: v_mov_b32_e32 v2, 0xff ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v1, 8, v0 -; GFX10-NEXT: v_cndmask_b32_e64 v2, v0, s2, s1 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s2, s0 -; GFX10-NEXT: s_movk_i32 s0, 0xff -; GFX10-NEXT: v_and_b32_sdwa v3, v1, s0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s3, 0 +; GFX10-NEXT: v_and_b32_sdwa v2, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_cndmask_b32_e64 v3, v0, s2, s0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-NEXT: global_store_short v[0:1], v2, off ; GFX10-NEXT: s_endpgm ; @@ -278,17 +278,17 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s4, 1 -; GFX10-NEXT: s_movk_i32 s0, 0xff +; GFX10-NEXT: v_mov_b32_e32 v3, 0xff ; GFX10-NEXT: global_load_ushort v1, v1, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v2, 8, v1 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s4, 0 -; GFX10-NEXT: v_and_b32_sdwa v2, v2, s0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: v_cndmask_b32_e32 v3, v1, v0, vcc_lo +; GFX10-NEXT: v_and_b32_sdwa v2, v2, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_cndmask_b32_e32 v4, v1, v0, vcc_lo ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-NEXT: global_store_short v[0:1], v2, off ; GFX10-NEXT: s_endpgm ; @@ -385,17 +385,17 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX10-NEXT: s_movk_i32 s0, 0xff +; GFX10-NEXT: v_mov_b32_e32 v3, 0xff ; GFX10-NEXT: global_load_ushort v1, v1, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v2, 8, v1 ; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s4, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10-NEXT: v_and_b32_sdwa v2, v2, s0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD 
src1_sel:DWORD -; GFX10-NEXT: v_cndmask_b32_e64 v3, v1, s4, vcc_lo +; GFX10-NEXT: v_and_b32_sdwa v2, v2, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_cndmask_b32_e64 v4, v1, s4, vcc_lo ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-NEXT: global_store_short v[0:1], v2, off ; GFX10-NEXT: s_endpgm ; @@ -490,13 +490,13 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1 -; GFX10-NEXT: s_movk_i32 s0, 0xff +; GFX10-NEXT: v_mov_b32_e32 v4, 0xff ; GFX10-NEXT: global_load_ushort v2, v2, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v3, 8, v2 ; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v0, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX10-NEXT: v_and_b32_sdwa v3, v3, s0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_and_b32_sdwa v3, v3, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc_lo ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 @@ -593,12 +593,12 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: global_load_ushort v0, v[0:1], off ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2 -; GFX10-NEXT: s_movk_i32 s0, 0xff +; GFX10-NEXT: v_mov_b32_e32 v3, 0xff ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v1, 8, v0 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s2, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 -; GFX10-NEXT: v_and_b32_sdwa v3, v1, s0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_and_b32_sdwa v3, v1, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX10-NEXT: v_cndmask_b32_e64 v2, v0, s2, vcc_lo ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 @@ -692,12 +692,12 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: global_load_ushort v0, v[0:1], off ; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 1 -; GFX10-NEXT: s_movk_i32 s0, 0xff +; GFX10-NEXT: v_mov_b32_e32 v3, 0xff ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v1, 8, v0 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 0 -; GFX10-NEXT: v_and_b32_sdwa v3, v1, s0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_and_b32_sdwa v3, v1, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX10-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc_lo ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 @@ -791,12 +791,12 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: global_load_ushort v0, v[0:1], off ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3 -; GFX10-NEXT: s_movk_i32 s0, 0xff +; GFX10-NEXT: v_mov_b32_e32 v4, 0xff ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v1, 8, v0 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 -; GFX10-NEXT: v_and_b32_sdwa v3, v1, s0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_and_b32_sdwa v3, v1, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX10-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc_lo ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 @@ -1073,10 +1073,10 @@ ; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-NEXT: 
v_and_b32_e32 v0, 3, v0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; GFX9-NEXT: s_movk_i32 s1, 0xff -; GFX9-NEXT: s_and_b32 s2, s4, 0xff -; GFX9-NEXT: v_lshlrev_b32_e64 v2, v0, s2 -; GFX9-NEXT: v_lshlrev_b32_e64 v0, v0, s1 +; GFX9-NEXT: s_and_b32 s1, s4, 0xff +; GFX9-NEXT: v_mov_b32_e32 v1, 0xff +; GFX9-NEXT: v_lshlrev_b32_e64 v2, v0, s1 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, v0, v1 ; GFX9-NEXT: v_not_b32_e32 v3, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 @@ -1090,10 +1090,10 @@ ; GFX8-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX8-NEXT: v_and_b32_e32 v0, 3, v0 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; GFX8-NEXT: s_movk_i32 s1, 0xff -; GFX8-NEXT: s_and_b32 s2, s4, 0xff -; GFX8-NEXT: v_lshlrev_b32_e64 v2, v0, s2 -; GFX8-NEXT: v_lshlrev_b32_e64 v0, v0, s1 +; GFX8-NEXT: s_and_b32 s1, s4, 0xff +; GFX8-NEXT: v_mov_b32_e32 v1, 0xff +; GFX8-NEXT: v_lshlrev_b32_e64 v2, v0, s1 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, v0, v1 ; GFX8-NEXT: v_not_b32_e32 v0, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_and_b32_e32 v3, s0, v0 @@ -1167,14 +1167,14 @@ ; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-NEXT: v_and_b32_e32 v1, 3, v1 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 3, v1 -; GFX9-NEXT: s_movk_i32 s1, 0xff -; GFX9-NEXT: v_lshlrev_b32_sdwa v2, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_lshlrev_b32_e64 v0, v1, s1 -; GFX9-NEXT: v_not_b32_e32 v3, v0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0xff +; GFX9-NEXT: v_lshlrev_b32_sdwa v3, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, v1, v2 +; GFX9-NEXT: v_not_b32_e32 v2, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_and_or_b32 v2, s0, v3, v2 +; GFX9-NEXT: v_and_or_b32 v2, s0, v2, v3 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_endpgm ; @@ -1183,15 +1183,15 @@ ; GFX8-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX8-NEXT: v_and_b32_e32 v1, 3, v1 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 3, v1 -; GFX8-NEXT: s_movk_i32 s1, 0xff -; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_lshlrev_b32_e64 v0, v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v2, 0xff +; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, v1, v2 ; GFX8-NEXT: v_not_b32_e32 v0, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v3, s0, v0 +; GFX8-NEXT: v_and_b32_e32 v2, s0, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, 0 ; GFX8-NEXT: v_mov_b32_e32 v1, 0 -; GFX8-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX8-NEXT: v_or_b32_e32 v2, v2, v3 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; @@ -1258,10 +1258,10 @@ ; GFX9-NEXT: global_load_dword v3, v[0:1], off ; GFX9-NEXT: v_and_b32_e32 v0, 3, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; GFX9-NEXT: s_movk_i32 s0, 0xff -; GFX9-NEXT: s_and_b32 s1, s2, 0xff -; GFX9-NEXT: v_lshlrev_b32_e64 v2, v0, s1 -; GFX9-NEXT: v_lshlrev_b32_e64 v0, v0, s0 +; GFX9-NEXT: s_and_b32 s0, s2, 0xff +; GFX9-NEXT: v_mov_b32_e32 v1, 0xff +; GFX9-NEXT: v_lshlrev_b32_e64 v2, v0, s0 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, v0, v1 ; GFX9-NEXT: v_not_b32_e32 v4, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 @@ -1273,18 +1273,18 @@ ; GFX8-LABEL: insertelement_v_v4i8_s_v: ; GFX8: ; %bb.0: ; GFX8-NEXT: flat_load_dword v0, v[0:1] -; GFX8-NEXT: v_and_b32_e32 v1, 3, v2 -; GFX8-NEXT: s_movk_i32 s0, 0xff -; GFX8-NEXT: 
s_and_b32 s1, s2, 0xff -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 3, v1 -; GFX8-NEXT: v_lshlrev_b32_e64 v2, v1, s1 -; GFX8-NEXT: v_lshlrev_b32_e64 v1, v1, s0 +; GFX8-NEXT: v_and_b32_e32 v2, 3, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, 0xff +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, v2, v1 +; GFX8-NEXT: s_and_b32 s0, s2, 0xff ; GFX8-NEXT: v_not_b32_e32 v1, v1 +; GFX8-NEXT: v_lshlrev_b32_e64 v3, v2, s0 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v3, v0, v1 +; GFX8-NEXT: v_and_b32_e32 v2, v0, v1 ; GFX8-NEXT: v_mov_b32_e32 v0, 0 ; GFX8-NEXT: v_mov_b32_e32 v1, 0 -; GFX8-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX8-NEXT: v_or_b32_e32 v2, v2, v3 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; @@ -1444,9 +1444,9 @@ ; GFX9-NEXT: global_load_dword v4, v[0:1], off ; GFX9-NEXT: v_and_b32_e32 v0, 3, v3 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; GFX9-NEXT: s_movk_i32 s0, 0xff +; GFX9-NEXT: v_mov_b32_e32 v1, 0xff ; GFX9-NEXT: v_lshlrev_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_lshlrev_b32_e64 v0, v0, s0 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, v0, v1 ; GFX9-NEXT: v_not_b32_e32 v3, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 @@ -1458,12 +1458,12 @@ ; GFX8-LABEL: insertelement_v_v4i8_v_v: ; GFX8: ; %bb.0: ; GFX8-NEXT: flat_load_dword v0, v[0:1] -; GFX8-NEXT: v_and_b32_e32 v1, 3, v3 -; GFX8-NEXT: s_movk_i32 s0, 0xff -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 3, v1 -; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_lshlrev_b32_e64 v1, v1, s0 +; GFX8-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX8-NEXT: v_mov_b32_e32 v1, 0xff +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, v3, v1 ; GFX8-NEXT: v_not_b32_e32 v1, v1 +; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_and_b32_e32 v3, v0, v1 ; GFX8-NEXT: v_mov_b32_e32 v0, 0 @@ -1930,11 +1930,11 @@ ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: s_movk_i32 s2, 0xff -; GFX9-NEXT: s_and_b32 s3, s4, 0xff +; GFX9-NEXT: s_and_b32 s2, s4, 0xff +; GFX9-NEXT: v_mov_b32_e32 v4, 0xff ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GFX9-NEXT: v_lshlrev_b32_e64 v3, v0, s3 -; GFX9-NEXT: v_lshlrev_b32_e64 v0, v0, s2 +; GFX9-NEXT: v_lshlrev_b32_e64 v3, v0, s2 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, v0, v4 ; GFX9-NEXT: v_not_b32_e32 v0, v0 ; GFX9-NEXT: v_and_or_b32 v4, v1, v0, v3 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 @@ -1957,11 +1957,11 @@ ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s0 ; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: s_movk_i32 s2, 0xff -; GFX8-NEXT: s_and_b32 s3, s4, 0xff +; GFX8-NEXT: s_and_b32 s2, s4, 0xff +; GFX8-NEXT: v_mov_b32_e32 v4, 0xff ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GFX8-NEXT: v_lshlrev_b32_e64 v3, v0, s3 -; GFX8-NEXT: v_lshlrev_b32_e64 v0, v0, s2 +; GFX8-NEXT: v_lshlrev_b32_e64 v3, v0, s2 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, v0, v4 ; GFX8-NEXT: v_not_b32_e32 v0, v0 ; GFX8-NEXT: v_and_b32_e32 v0, v1, v0 ; GFX8-NEXT: v_or_b32_e32 v4, v0, v3 @@ -2065,16 +2065,16 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 2, v1 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 ; GFX9-NEXT: v_and_b32_e32 v1, 3, v1 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 3, v1 -; GFX9-NEXT: s_movk_i32 s2, 0xff ; GFX9-NEXT: 
s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v3, s0 ; GFX9-NEXT: v_mov_b32_e32 v4, s1 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 -; GFX9-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_lshlrev_b32_e64 v1, v1, s2 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc +; GFX9-NEXT: v_mov_b32_e32 v4, 0xff +; GFX9-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, v1, v4 ; GFX9-NEXT: v_not_b32_e32 v1, v1 ; GFX9-NEXT: v_and_or_b32 v4, v3, v1, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 @@ -2091,16 +2091,16 @@ ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 2, v1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 ; GFX8-NEXT: v_and_b32_e32 v1, 3, v1 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 3, v1 -; GFX8-NEXT: s_movk_i32 s2, 0xff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v3, s0 ; GFX8-NEXT: v_mov_b32_e32 v4, s1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 -; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_lshlrev_b32_e64 v1, v1, s2 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc +; GFX8-NEXT: v_mov_b32_e32 v4, 0xff +; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, v1, v4 ; GFX8-NEXT: v_not_b32_e32 v1, v1 ; GFX8-NEXT: v_and_b32_e32 v1, v3, v1 ; GFX8-NEXT: v_or_b32_e32 v4, v1, v0 @@ -2202,21 +2202,21 @@ ; GFX9-LABEL: insertelement_v_v8i8_s_v: ; GFX9: ; %bb.0: ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off -; GFX9-NEXT: v_lshrrev_b32_e32 v5, 2, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 2, v2 ; GFX9-NEXT: v_and_b32_e32 v2, 3, v2 -; GFX9-NEXT: s_movk_i32 s0, 0xff -; GFX9-NEXT: s_and_b32 s1, s2, 0xff +; GFX9-NEXT: v_mov_b32_e32 v5, 0xff +; GFX9-NEXT: s_and_b32 s0, s2, 0xff ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v2 -; GFX9-NEXT: v_lshlrev_b32_e64 v6, v2, s1 -; GFX9-NEXT: v_lshlrev_b32_e64 v2, v2, s0 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v5 +; GFX9-NEXT: v_lshlrev_b32_e64 v7, v2, s0 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, v2, v5 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6 ; GFX9-NEXT: v_not_b32_e32 v2, v2 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v5 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v6 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cndmask_b32_e32 v7, v0, v1, vcc -; GFX9-NEXT: v_and_or_b32 v2, v7, v2, v6 +; GFX9-NEXT: v_cndmask_b32_e32 v5, v0, v1, vcc +; GFX9-NEXT: v_and_or_b32 v2, v5, v2, v7 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc ; GFX9-NEXT: global_store_dwordx2 v[3:4], v[0:1], off @@ -2225,22 +2225,22 @@ ; GFX8-LABEL: insertelement_v_v8i8_s_v: ; GFX8: ; %bb.0: ; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 2, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 2, v2 ; GFX8-NEXT: v_and_b32_e32 v2, 3, v2 -; GFX8-NEXT: s_movk_i32 s0, 0xff -; GFX8-NEXT: s_and_b32 s1, s2, 0xff +; GFX8-NEXT: v_mov_b32_e32 v5, 0xff +; GFX8-NEXT: s_and_b32 s0, s2, 0xff ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v2 -; GFX8-NEXT: v_lshlrev_b32_e64 v6, v2, s1 -; GFX8-NEXT: v_lshlrev_b32_e64 v2, v2, s0 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v5 +; GFX8-NEXT: v_lshlrev_b32_e64 v7, v2, s0 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, v2, v5 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6 ; GFX8-NEXT: v_not_b32_e32 v2, v2 ; GFX8-NEXT: 
v_mov_b32_e32 v3, 0 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v5 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v6 ; GFX8-NEXT: v_mov_b32_e32 v4, 0 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_cndmask_b32_e32 v7, v0, v1, vcc -; GFX8-NEXT: v_and_b32_e32 v2, v7, v2 -; GFX8-NEXT: v_or_b32_e32 v2, v2, v6 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v0, v1, vcc +; GFX8-NEXT: v_and_b32_e32 v2, v5, v2 +; GFX8-NEXT: v_or_b32_e32 v2, v2, v7 ; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc ; GFX8-NEXT: flat_store_dwordx2 v[3:4], v[0:1] @@ -2449,20 +2449,20 @@ ; GFX9-LABEL: insertelement_v_v8i8_v_v: ; GFX9: ; %bb.0: ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off -; GFX9-NEXT: v_lshrrev_b32_e32 v6, 2, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 2, v3 ; GFX9-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX9-NEXT: s_movk_i32 s0, 0xff +; GFX9-NEXT: v_mov_b32_e32 v6, 0xff ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX9-NEXT: v_lshlrev_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_lshlrev_b32_e64 v3, v3, s0 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6 +; GFX9-NEXT: v_lshlrev_b32_e32 v3, v3, v6 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7 ; GFX9-NEXT: v_not_b32_e32 v3, v3 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v6 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v7 ; GFX9-NEXT: v_mov_b32_e32 v5, 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cndmask_b32_e32 v7, v0, v1, vcc -; GFX9-NEXT: v_and_or_b32 v2, v7, v3, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v6, v0, v1, vcc +; GFX9-NEXT: v_and_or_b32 v2, v6, v3, v2 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc ; GFX9-NEXT: global_store_dwordx2 v[4:5], v[0:1], off @@ -2471,20 +2471,20 @@ ; GFX8-LABEL: insertelement_v_v8i8_v_v: ; GFX8: ; %bb.0: ; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 2, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v7, 2, v3 ; GFX8-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX8-NEXT: s_movk_i32 s0, 0xff +; GFX8-NEXT: v_mov_b32_e32 v6, 0xff ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_lshlrev_b32_e64 v3, v3, s0 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, v3, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7 ; GFX8-NEXT: v_not_b32_e32 v3, v3 ; GFX8-NEXT: v_mov_b32_e32 v4, 0 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v6 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v7 ; GFX8-NEXT: v_mov_b32_e32 v5, 0 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_cndmask_b32_e32 v7, v0, v1, vcc -; GFX8-NEXT: v_and_b32_e32 v3, v7, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v6, v0, v1, vcc +; GFX8-NEXT: v_and_b32_e32 v3, v6, v3 ; GFX8-NEXT: v_or_b32_e32 v2, v3, v2 ; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc @@ -3095,21 +3095,21 @@ ; GFX9-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x0 ; GFX9-NEXT: v_lshrrev_b32_e32 v4, 2, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 -; GFX9-NEXT: v_and_b32_e32 v0, 3, v0 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v4 +; GFX9-NEXT: v_and_b32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s8 ; GFX9-NEXT: v_mov_b32_e32 v2, s9 ; GFX9-NEXT: v_mov_b32_e32 v3, s10 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; GFX9-NEXT: s_movk_i32 s5, 0xff ; GFX9-NEXT: s_and_b32 s4, s4, 
0xff +; GFX9-NEXT: v_mov_b32_e32 v3, 0xff ; GFX9-NEXT: v_mov_b32_e32 v5, s11 -; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] ; GFX9-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v4 ; GFX9-NEXT: v_lshlrev_b32_e64 v2, v0, s4 -; GFX9-NEXT: v_lshlrev_b32_e64 v0, v0, s5 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, v0, v3 ; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[2:3] ; GFX9-NEXT: v_not_b32_e32 v0, v0 ; GFX9-NEXT: v_and_or_b32 v6, v1, v0, v2 @@ -3132,21 +3132,21 @@ ; GFX8-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x0 ; GFX8-NEXT: v_lshrrev_b32_e32 v4, 2, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 -; GFX8-NEXT: v_and_b32_e32 v0, 3, v0 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v4 +; GFX8-NEXT: v_and_b32_e32 v0, 3, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s8 ; GFX8-NEXT: v_mov_b32_e32 v2, s9 ; GFX8-NEXT: v_mov_b32_e32 v3, s10 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; GFX8-NEXT: s_movk_i32 s5, 0xff ; GFX8-NEXT: s_and_b32 s4, s4, 0xff +; GFX8-NEXT: v_mov_b32_e32 v3, 0xff ; GFX8-NEXT: v_mov_b32_e32 v5, s11 -; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] ; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v4 ; GFX8-NEXT: v_lshlrev_b32_e64 v2, v0, s4 -; GFX8-NEXT: v_lshlrev_b32_e64 v0, v0, s5 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, v0, v3 ; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[2:3] ; GFX8-NEXT: v_not_b32_e32 v0, v0 ; GFX8-NEXT: v_and_b32_e32 v0, v1, v0 @@ -3290,12 +3290,12 @@ ; GFX9-NEXT: v_mov_b32_e32 v5, s6 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 3, v1 -; GFX9-NEXT: s_movk_i32 s8, 0xff +; GFX9-NEXT: v_mov_b32_e32 v3, 0xff ; GFX9-NEXT: v_mov_b32_e32 v6, s7 ; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[0:1] ; GFX9-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v4 ; GFX9-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_lshlrev_b32_e64 v1, v1, s8 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, v1, v3 ; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[2:3] ; GFX9-NEXT: v_not_b32_e32 v1, v1 ; GFX9-NEXT: v_and_or_b32 v6, v2, v1, v0 @@ -3326,12 +3326,12 @@ ; GFX8-NEXT: v_mov_b32_e32 v5, s6 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 3, v1 -; GFX8-NEXT: s_movk_i32 s8, 0xff +; GFX8-NEXT: v_mov_b32_e32 v3, 0xff ; GFX8-NEXT: v_mov_b32_e32 v6, s7 ; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[0:1] ; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v4 ; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_lshlrev_b32_e64 v1, v1, s8 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, v1, v3 ; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[2:3] ; GFX8-NEXT: v_not_b32_e32 v1, v1 ; GFX8-NEXT: v_and_b32_e32 v1, v2, v1 @@ -3467,25 +3467,25 @@ ; GFX9-LABEL: insertelement_v_v16i8_s_v: ; GFX9: ; %bb.0: ; GFX9-NEXT: global_load_dwordx4 v[3:6], v[0:1], off -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 2, v2 -; GFX9-NEXT: v_and_b32_e32 v1, 3, v2 -; GFX9-NEXT: s_movk_i32 s0, 0xff -; GFX9-NEXT: s_and_b32 s1, s2, 0xff -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 3, v1 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX9-NEXT: v_lshlrev_b32_e64 v2, v1, s1 -; GFX9-NEXT: v_lshlrev_b32_e64 v1, v1, s0 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v0 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v0 -; GFX9-NEXT: v_not_b32_e32 v1, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 2, v2 +; GFX9-NEXT: v_and_b32_e32 v2, 3, v2 +; GFX9-NEXT: v_mov_b32_e32 v0, 0xff +; GFX9-NEXT: s_and_b32 s0, s2, 0xff +; GFX9-NEXT: 
v_lshlrev_b32_e32 v2, 3, v2 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 +; GFX9-NEXT: v_lshlrev_b32_e64 v9, v2, s0 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, v2, v0 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v1 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v1 +; GFX9-NEXT: v_not_b32_e32 v0, v0 ; GFX9-NEXT: v_mov_b32_e32 v7, 0 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v1 ; GFX9-NEXT: v_mov_b32_e32 v8, 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cndmask_b32_e32 v9, v3, v4, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v9, v9, v5, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v9, v9, v6, s[2:3] -; GFX9-NEXT: v_and_or_b32 v9, v9, v1, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[2:3] +; GFX9-NEXT: v_and_or_b32 v9, v2, v0, v9 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v3, v9, s[4:5] ; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v9, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v2, v5, v9, s[0:1] @@ -3496,26 +3496,26 @@ ; GFX8-LABEL: insertelement_v_v16i8_s_v: ; GFX8: ; %bb.0: ; GFX8-NEXT: flat_load_dwordx4 v[3:6], v[0:1] -; GFX8-NEXT: v_lshrrev_b32_e32 v0, 2, v2 -; GFX8-NEXT: v_and_b32_e32 v1, 3, v2 -; GFX8-NEXT: s_movk_i32 s0, 0xff -; GFX8-NEXT: s_and_b32 s1, s2, 0xff -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 3, v1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX8-NEXT: v_lshlrev_b32_e64 v2, v1, s1 -; GFX8-NEXT: v_lshlrev_b32_e64 v1, v1, s0 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v0 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v0 -; GFX8-NEXT: v_not_b32_e32 v1, v1 +; GFX8-NEXT: v_lshrrev_b32_e32 v1, 2, v2 +; GFX8-NEXT: v_and_b32_e32 v2, 3, v2 +; GFX8-NEXT: v_mov_b32_e32 v0, 0xff +; GFX8-NEXT: s_and_b32 s0, s2, 0xff +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v2 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 +; GFX8-NEXT: v_lshlrev_b32_e64 v9, v2, s0 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, v2, v0 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v1 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v1 +; GFX8-NEXT: v_not_b32_e32 v0, v0 ; GFX8-NEXT: v_mov_b32_e32 v7, 0 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v1 ; GFX8-NEXT: v_mov_b32_e32 v8, 0 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_cndmask_b32_e32 v9, v3, v4, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v9, v9, v5, s[0:1] -; GFX8-NEXT: v_cndmask_b32_e64 v9, v9, v6, s[2:3] -; GFX8-NEXT: v_and_b32_e32 v1, v9, v1 -; GFX8-NEXT: v_or_b32_e32 v9, v1, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[2:3] +; GFX8-NEXT: v_and_b32_e32 v0, v2, v0 +; GFX8-NEXT: v_or_b32_e32 v9, v0, v9 ; GFX8-NEXT: v_cndmask_b32_e64 v0, v3, v9, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v1, v4, v9, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v2, v5, v9, s[0:1] @@ -3775,24 +3775,24 @@ ; GFX9-LABEL: insertelement_v_v16i8_v_v: ; GFX9: ; %bb.0: ; GFX9-NEXT: global_load_dwordx4 v[4:7], v[0:1], off -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 2, v3 -; GFX9-NEXT: v_and_b32_e32 v1, 3, v3 -; GFX9-NEXT: s_movk_i32 s0, 0xff -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 3, v1 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX9-NEXT: v_lshlrev_b32_sdwa v2, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_lshlrev_b32_e64 v1, v1, s0 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v0 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v0 -; GFX9-NEXT: v_not_b32_e32 v1, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 2, v3 +; GFX9-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX9-NEXT: v_mov_b32_e32 v0, 0xff +; GFX9-NEXT: 
v_lshlrev_b32_e32 v3, 3, v3 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, v3, v0 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v1 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v1 +; GFX9-NEXT: v_not_b32_e32 v0, v0 ; GFX9-NEXT: v_mov_b32_e32 v8, 0 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v1 ; GFX9-NEXT: v_mov_b32_e32 v9, 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v6, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v7, s[2:3] -; GFX9-NEXT: v_and_or_b32 v3, v3, v1, v2 +; GFX9-NEXT: v_and_or_b32 v3, v3, v0, v2 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v4, v3, s[4:5] ; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v2, v6, v3, s[0:1] @@ -3803,25 +3803,25 @@ ; GFX8-LABEL: insertelement_v_v16i8_v_v: ; GFX8: ; %bb.0: ; GFX8-NEXT: flat_load_dwordx4 v[4:7], v[0:1] -; GFX8-NEXT: v_lshrrev_b32_e32 v0, 2, v3 -; GFX8-NEXT: v_and_b32_e32 v1, 3, v3 -; GFX8-NEXT: s_movk_i32 s0, 0xff -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 3, v1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_lshlrev_b32_e64 v1, v1, s0 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v0 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v0 -; GFX8-NEXT: v_not_b32_e32 v1, v1 +; GFX8-NEXT: v_lshrrev_b32_e32 v1, 2, v3 +; GFX8-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX8-NEXT: v_mov_b32_e32 v0, 0xff +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 +; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, v3, v0 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v1 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v1 +; GFX8-NEXT: v_not_b32_e32 v0, v0 ; GFX8-NEXT: v_mov_b32_e32 v8, 0 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v1 ; GFX8-NEXT: v_mov_b32_e32 v9, 0 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v6, s[0:1] ; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v7, s[2:3] -; GFX8-NEXT: v_and_b32_e32 v1, v3, v1 -; GFX8-NEXT: v_or_b32_e32 v3, v1, v2 +; GFX8-NEXT: v_and_b32_e32 v0, v3, v0 +; GFX8-NEXT: v_or_b32_e32 v3, v0, v2 ; GFX8-NEXT: v_cndmask_b32_e64 v0, v4, v3, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v2, v6, v3, s[0:1] Index: llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-private.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-private.mir +++ llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-private.mir @@ -739,12 +739,17 @@ ; GFX6: liveins: $vgpr0, $vgpr1 ; GFX6-NEXT: {{ $}} ; GFX6-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX6-NEXT: BUFFER_STORE_DWORD_OFFSET [[COPY]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4095, 0, 0, implicit $exec :: (store (s32), addrspace 5) + ; GFX6-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 6, $sgpr32, implicit $exec + ; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4095 + ; GFX6-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] + ; GFX6-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 
[[V_LSHRREV_B32_e64_]], [[COPY1]], 0, implicit $exec + ; GFX6-NEXT: BUFFER_STORE_DWORD_OFFEN [[COPY]], [[V_ADD_CO_U32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32), addrspace 5) ; GFX9-LABEL: name: function_store_private_s32_to_4_wave_address_offset_copy_constant_4095 ; GFX9: liveins: $vgpr0, $vgpr1 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX9-NEXT: BUFFER_STORE_DWORD_OFFSET [[COPY]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4095, 0, 0, implicit $exec :: (store (s32), addrspace 5) + ; GFX9-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 6, $sgpr32, implicit $exec + ; GFX9-NEXT: BUFFER_STORE_DWORD_OFFEN [[COPY]], [[V_LSHRREV_B32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4095, 0, 0, implicit $exec :: (store (s32), addrspace 5) ; GFX11-LABEL: name: function_store_private_s32_to_4_wave_address_offset_copy_constant_4095 ; GFX11: liveins: $vgpr0, $vgpr1 ; GFX11-NEXT: {{ $}} @@ -781,16 +786,12 @@ ; GFX6: liveins: $vgpr0, $vgpr1 ; GFX6-NEXT: {{ $}} ; GFX6-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX6-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 6, $sgpr32, implicit $exec - ; GFX6-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4095, implicit $exec - ; GFX6-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[V_LSHRREV_B32_e64_]], [[V_MOV_B32_e32_]], 0, implicit $exec - ; GFX6-NEXT: BUFFER_STORE_DWORD_OFFEN [[COPY]], [[V_ADD_CO_U32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32), addrspace 5) + ; GFX6-NEXT: BUFFER_STORE_DWORD_OFFSET [[COPY]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4095, 0, 0, implicit $exec :: (store (s32), addrspace 5) ; GFX9-LABEL: name: function_store_private_s32_to_4_wave_address_offset_4095 ; GFX9: liveins: $vgpr0, $vgpr1 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX9-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 6, $sgpr32, implicit $exec - ; GFX9-NEXT: BUFFER_STORE_DWORD_OFFEN [[COPY]], [[V_LSHRREV_B32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4095, 0, 0, implicit $exec :: (store (s32), addrspace 5) + ; GFX9-NEXT: BUFFER_STORE_DWORD_OFFSET [[COPY]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4095, 0, 0, implicit $exec :: (store (s32), addrspace 5) ; GFX11-LABEL: name: function_store_private_s32_to_4_wave_address_offset_4095 ; GFX11: liveins: $vgpr0, $vgpr1 ; GFX11-NEXT: {{ $}} Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll @@ -1238,6 +1238,7 @@ ; GFX10_W32-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; GFX10_W32-NEXT: s_load_dword s0, s[0:1], 0x30 ; GFX10_W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX10_W32-NEXT: v_mov_b32_e32 v0, 0 ; GFX10_W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX10_W32-NEXT: global_load_dword v2, v1, s[6:7] glc dlc ; GFX10_W32-NEXT: s_waitcnt vmcnt(0) @@ -1245,14 +1246,13 @@ ; GFX10_W32-NEXT: s_waitcnt vmcnt(0) ; GFX10_W32-NEXT: global_load_dword v4, v1, s[6:7] offset:8 glc dlc ; GFX10_W32-NEXT: s_waitcnt vmcnt(0) -; GFX10_W32-NEXT: v_mov_b32_e32 v1, 0 ; GFX10_W32-NEXT: s_cmp_lg_u32 s0, 0 ; GFX10_W32-NEXT: s_cselect_b32 s0, 1, 0 ; GFX10_W32-NEXT: s_and_b32 s0, 1, s0 ; GFX10_W32-NEXT: v_cmp_ne_u32_e64 s0, 0, s0 ; GFX10_W32-NEXT: s_and_b32 vcc_lo, vcc_lo, s0 -; GFX10_W32-NEXT: v_div_fmas_f32 v0, v2, v3, v4 -; GFX10_W32-NEXT: 
global_store_dword v1, v0, s[4:5] offset:8 +; GFX10_W32-NEXT: v_div_fmas_f32 v1, v2, v3, v4 +; GFX10_W32-NEXT: global_store_dword v0, v1, s[4:5] offset:8 ; GFX10_W32-NEXT: s_endpgm ; ; GFX10_W64-LABEL: test_div_fmas_f32_logical_cond_to_vcc: @@ -1261,6 +1261,7 @@ ; GFX10_W64-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; GFX10_W64-NEXT: s_load_dword s0, s[0:1], 0x30 ; GFX10_W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX10_W64-NEXT: v_mov_b32_e32 v0, 0 ; GFX10_W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX10_W64-NEXT: global_load_dword v2, v1, s[6:7] glc dlc ; GFX10_W64-NEXT: s_waitcnt vmcnt(0) @@ -1268,22 +1269,22 @@ ; GFX10_W64-NEXT: s_waitcnt vmcnt(0) ; GFX10_W64-NEXT: global_load_dword v4, v1, s[6:7] offset:8 glc dlc ; GFX10_W64-NEXT: s_waitcnt vmcnt(0) -; GFX10_W64-NEXT: v_mov_b32_e32 v1, 0 ; GFX10_W64-NEXT: s_cmp_lg_u32 s0, 0 ; GFX10_W64-NEXT: s_cselect_b32 s0, 1, 0 ; GFX10_W64-NEXT: s_and_b32 s0, 1, s0 ; GFX10_W64-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, s0 ; GFX10_W64-NEXT: s_and_b64 vcc, vcc, s[0:1] -; GFX10_W64-NEXT: v_div_fmas_f32 v0, v2, v3, v4 -; GFX10_W64-NEXT: global_store_dword v1, v0, s[4:5] offset:8 +; GFX10_W64-NEXT: v_div_fmas_f32 v1, v2, v3, v4 +; GFX10_W64-NEXT: global_store_dword v0, v1, s[4:5] offset:8 ; GFX10_W64-NEXT: s_endpgm ; ; GFX11_W32-LABEL: test_div_fmas_f32_logical_cond_to_vcc: ; GFX11_W32: ; %bb.0: ; GFX11_W32-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 ; GFX11_W32-NEXT: v_lshlrev_b32_e32 v1, 2, v0 -; GFX11_W32-NEXT: s_load_b32 s0, s[0:1], 0x30 ; GFX11_W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11_W32-NEXT: v_mov_b32_e32 v0, 0 +; GFX11_W32-NEXT: s_load_b32 s0, s[0:1], 0x30 ; GFX11_W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX11_W32-NEXT: global_load_b32 v2, v1, s[6:7] glc dlc ; GFX11_W32-NEXT: s_waitcnt vmcnt(0) @@ -1296,9 +1297,8 @@ ; GFX11_W32-NEXT: s_and_b32 s0, 1, s0 ; GFX11_W32-NEXT: v_cmp_ne_u32_e64 s0, 0, s0 ; GFX11_W32-NEXT: s_and_b32 vcc_lo, vcc_lo, s0 -; GFX11_W32-NEXT: v_div_fmas_f32 v0, v2, v3, v1 -; GFX11_W32-NEXT: v_mov_b32_e32 v1, 0 -; GFX11_W32-NEXT: global_store_b32 v1, v0, s[4:5] offset:8 +; GFX11_W32-NEXT: v_div_fmas_f32 v1, v2, v3, v1 +; GFX11_W32-NEXT: global_store_b32 v0, v1, s[4:5] offset:8 ; GFX11_W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11_W32-NEXT: s_endpgm ; @@ -1308,6 +1308,7 @@ ; GFX11_W64-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; GFX11_W64-NEXT: s_load_b32 s0, s[0:1], 0x30 ; GFX11_W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX11_W64-NEXT: v_mov_b32_e32 v0, 0 ; GFX11_W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX11_W64-NEXT: global_load_b32 v2, v1, s[6:7] glc dlc ; GFX11_W64-NEXT: s_waitcnt vmcnt(0) @@ -1320,9 +1321,8 @@ ; GFX11_W64-NEXT: s_and_b32 s0, 1, s0 ; GFX11_W64-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, s0 ; GFX11_W64-NEXT: s_and_b64 vcc, vcc, s[0:1] -; GFX11_W64-NEXT: v_div_fmas_f32 v0, v2, v3, v1 -; GFX11_W64-NEXT: v_mov_b32_e32 v1, 0 -; GFX11_W64-NEXT: global_store_b32 v1, v0, s[4:5] offset:8 +; GFX11_W64-NEXT: v_div_fmas_f32 v1, v2, v3, v1 +; GFX11_W64-NEXT: global_store_b32 v0, v1, s[4:5] offset:8 ; GFX11_W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11_W64-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -1418,6 +1418,7 @@ ; GFX10_W32: ; %bb.0: ; %entry ; GFX10_W32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x28 ; GFX10_W32-NEXT: v_lshlrev_b32_e32 v1, 2, v0 +; GFX10_W32-NEXT: v_mov_b32_e32 v4, 0 ; GFX10_W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX10_W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX10_W32-NEXT: global_load_dwordx3 v[1:3], v1, s[2:3] @@ -1439,15 +1440,15 @@ ; GFX10_W32-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s2 ; GFX10_W32-NEXT: s_waitcnt vmcnt(0) ; 
GFX10_W32-NEXT: v_div_fmas_f32 v0, v1, v2, v3 -; GFX10_W32-NEXT: v_mov_b32_e32 v1, 0 ; GFX10_W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10_W32-NEXT: global_store_dword v1, v0, s[0:1] offset:8 +; GFX10_W32-NEXT: global_store_dword v4, v0, s[0:1] offset:8 ; GFX10_W32-NEXT: s_endpgm ; ; GFX10_W64-LABEL: test_div_fmas_f32_i1_phi_vcc: ; GFX10_W64: ; %bb.0: ; %entry ; GFX10_W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x28 ; GFX10_W64-NEXT: v_lshlrev_b32_e32 v1, 2, v0 +; GFX10_W64-NEXT: v_mov_b32_e32 v4, 0 ; GFX10_W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX10_W64-NEXT: s_mov_b32 s4, 0 ; GFX10_W64-NEXT: s_waitcnt lgkmcnt(0) @@ -1469,15 +1470,14 @@ ; GFX10_W64-NEXT: v_cmp_ne_u32_e64 vcc, 0, s2 ; GFX10_W64-NEXT: s_waitcnt vmcnt(0) ; GFX10_W64-NEXT: v_div_fmas_f32 v0, v1, v2, v3 -; GFX10_W64-NEXT: v_mov_b32_e32 v1, 0 ; GFX10_W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX10_W64-NEXT: global_store_dword v1, v0, s[0:1] offset:8 +; GFX10_W64-NEXT: global_store_dword v4, v0, s[0:1] offset:8 ; GFX10_W64-NEXT: s_endpgm ; ; GFX11_W32-LABEL: test_div_fmas_f32_i1_phi_vcc: ; GFX11_W32: ; %bb.0: ; %entry ; GFX11_W32-NEXT: s_load_b64 s[2:3], s[0:1], 0x28 -; GFX11_W32-NEXT: v_lshlrev_b32_e32 v1, 2, v0 +; GFX11_W32-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_lshlrev_b32 v1, 2, v0 ; GFX11_W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX11_W32-NEXT: global_load_b96 v[1:3], v1, s[2:3] ; GFX11_W32-NEXT: s_mov_b32 s2, 0 @@ -1498,9 +1498,8 @@ ; GFX11_W32-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s2 ; GFX11_W32-NEXT: s_waitcnt vmcnt(0) ; GFX11_W32-NEXT: v_div_fmas_f32 v0, v1, v2, v3 -; GFX11_W32-NEXT: v_mov_b32_e32 v1, 0 ; GFX11_W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11_W32-NEXT: global_store_b32 v1, v0, s[0:1] offset:8 +; GFX11_W32-NEXT: global_store_b32 v4, v0, s[0:1] offset:8 ; GFX11_W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11_W32-NEXT: s_endpgm ; @@ -1508,6 +1507,7 @@ ; GFX11_W64: ; %bb.0: ; %entry ; GFX11_W64-NEXT: s_load_b64 s[2:3], s[0:1], 0x28 ; GFX11_W64-NEXT: v_lshlrev_b32_e32 v1, 2, v0 +; GFX11_W64-NEXT: v_mov_b32_e32 v4, 0 ; GFX11_W64-NEXT: s_mov_b32 s4, 0 ; GFX11_W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX11_W64-NEXT: global_load_b96 v[1:3], v1, s[2:3] @@ -1528,9 +1528,8 @@ ; GFX11_W64-NEXT: v_cmp_ne_u32_e64 vcc, 0, s2 ; GFX11_W64-NEXT: s_waitcnt vmcnt(0) ; GFX11_W64-NEXT: v_div_fmas_f32 v0, v1, v2, v3 -; GFX11_W64-NEXT: v_mov_b32_e32 v1, 0 ; GFX11_W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX11_W64-NEXT: global_store_b32 v1, v0, s[0:1] offset:8 +; GFX11_W64-NEXT: global_store_b32 v4, v0, s[0:1] offset:8 ; GFX11_W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11_W64-NEXT: s_endpgm entry: Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.scale.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.scale.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.scale.ll @@ -1552,18 +1552,18 @@ ; GFX10-LABEL: test_div_scale_f32_undef_val_val: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-NEXT: v_div_scale_f32 v0, s2, 0x41000000, 0x41000000, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_div_scale_f32 v0, s2, 0x41000000, 0x41000000, s0 ; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: test_div_scale_f32_undef_val_val: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_div_scale_f32 v0, null, 0x41000000, 0x41000000, v0 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_div_scale_f32 v0, null, 0x41000000, 
0x41000000, s0 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll @@ -56,8 +56,8 @@ ; GFX10-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; GFX10-NEXT: v_alignbit_b32 v7, v8, v7, 16 -; GFX10-NEXT: v_and_or_b32 v5, v5, 0xffff, v9 -; GFX10-NEXT: v_and_or_b32 v6, v6, 0xffff, v10 +; GFX10-NEXT: v_and_or_b32 v5, 0xffff, v5, v9 +; GFX10-NEXT: v_and_or_b32 v6, 0xffff, v6, v10 ; GFX10-NEXT: image_bvh_intersect_ray v[0:3], v[0:7], s[0:3] a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -119,8 +119,8 @@ ; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; GFX10-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; GFX10-NEXT: v_alignbit_b32 v8, v9, v8, 16 -; GFX10-NEXT: v_and_or_b32 v6, v6, 0xffff, v10 -; GFX10-NEXT: v_and_or_b32 v7, v7, 0xffff, v11 +; GFX10-NEXT: v_and_or_b32 v6, 0xffff, v6, v10 +; GFX10-NEXT: v_and_or_b32 v7, 0xffff, v7, v11 ; GFX10-NEXT: image_bvh64_intersect_ray v[0:3], v[0:8], s[0:3] a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -265,8 +265,8 @@ ; GFX1030-NEXT: v_mov_b32_e32 v17, v4 ; GFX1030-NEXT: v_alignbit_b32 v20, v2, v7, 16 ; GFX1030-NEXT: s_mov_b32 s1, exec_lo -; GFX1030-NEXT: v_and_or_b32 v18, v5, 0xffff, v0 -; GFX1030-NEXT: v_and_or_b32 v19, v6, 0xffff, v1 +; GFX1030-NEXT: v_and_or_b32 v18, 0xffff, v5, v0 +; GFX1030-NEXT: v_and_or_b32 v19, 0xffff, v6, v1 ; GFX1030-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 ; GFX1030-NEXT: v_readfirstlane_b32 s4, v9 ; GFX1030-NEXT: v_readfirstlane_b32 s5, v10 @@ -303,8 +303,8 @@ ; GFX1013-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; GFX1013-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; GFX1013-NEXT: v_alignbit_b32 v7, v8, v7, 16 -; GFX1013-NEXT: v_and_or_b32 v5, v5, 0xffff, v13 -; GFX1013-NEXT: v_and_or_b32 v6, v6, 0xffff, v14 +; GFX1013-NEXT: v_and_or_b32 v5, 0xffff, v5, v13 +; GFX1013-NEXT: v_and_or_b32 v6, 0xffff, v6, v14 ; GFX1013-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 ; GFX1013-NEXT: v_readfirstlane_b32 s4, v9 ; GFX1013-NEXT: v_readfirstlane_b32 s5, v10 @@ -496,8 +496,8 @@ ; GFX1030-NEXT: v_mov_b32_e32 v18, v4 ; GFX1030-NEXT: v_mov_b32_e32 v19, v5 ; GFX1030-NEXT: v_alignbit_b32 v22, v2, v8, 16 -; GFX1030-NEXT: v_and_or_b32 v20, v6, 0xffff, v0 -; GFX1030-NEXT: v_and_or_b32 v21, v7, 0xffff, v1 +; GFX1030-NEXT: v_and_or_b32 v20, 0xffff, v6, v0 +; GFX1030-NEXT: v_and_or_b32 v21, 0xffff, v7, v1 ; GFX1030-NEXT: s_mov_b32 s1, exec_lo ; GFX1030-NEXT: .LBB9_1: ; =>This Inner Loop Header: Depth=1 ; GFX1030-NEXT: v_readfirstlane_b32 s4, v10 @@ -536,8 +536,8 @@ ; GFX1013-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; GFX1013-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; GFX1013-NEXT: v_alignbit_b32 v8, v9, v8, 16 -; GFX1013-NEXT: v_and_or_b32 v6, v6, 0xffff, v14 -; GFX1013-NEXT: v_and_or_b32 v7, v7, 0xffff, v15 +; GFX1013-NEXT: v_and_or_b32 v6, 0xffff, v6, v14 +; GFX1013-NEXT: v_and_or_b32 v7, 0xffff, v7, v15 ; GFX1013-NEXT: .LBB9_1: ; =>This Inner Loop Header: Depth=1 ; GFX1013-NEXT: v_readfirstlane_b32 s4, v10 ; GFX1013-NEXT: v_readfirstlane_b32 s5, v11 @@ -609,12 +609,12 @@ ; GFX1030: ; %bb.0: ; GFX1030-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX1030-NEXT: v_lshlrev_b32_e32 v4, 2, v0 -; GFX1030-NEXT: v_mov_b32_e32 v5, 
0x40400000
-; GFX1030-NEXT: v_mov_b32_e32 v6, 4.0
 ; GFX1030-NEXT: v_mov_b32_e32 v7, 0x40a00000
-; GFX1030-NEXT: v_mov_b32_e32 v8, 0x40c00000
-; GFX1030-NEXT: v_mov_b32_e32 v9, 0x40e00000
+; GFX1030-NEXT: v_mov_b32_e32 v6, 4.0
+; GFX1030-NEXT: v_mov_b32_e32 v5, 0x40400000
 ; GFX1030-NEXT: v_mov_b32_e32 v10, 0x41000000
+; GFX1030-NEXT: v_mov_b32_e32 v9, 0x40e00000
+; GFX1030-NEXT: v_mov_b32_e32 v8, 0x40c00000
 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX1030-NEXT: v_mov_b32_e32 v0, s0
 ; GFX1030-NEXT: v_mov_b32_e32 v1, s1
@@ -627,8 +627,8 @@
 ; GFX1030-NEXT: v_mov_b32_e32 v4, 2.0
 ; GFX1030-NEXT: flat_load_dword v0, v[0:1]
 ; GFX1030-NEXT: flat_load_dword v1, v[2:3]
-; GFX1030-NEXT: v_mov_b32_e32 v2, 0
 ; GFX1030-NEXT: v_mov_b32_e32 v3, 1.0
+; GFX1030-NEXT: v_mov_b32_e32 v2, 0
 ; GFX1030-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX1030-NEXT: image_bvh_intersect_ray v[0:3], v[0:10], s[4:7]
 ; GFX1030-NEXT: s_waitcnt vmcnt(0)
@@ -640,9 +640,9 @@
 ; GFX1013-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
 ; GFX1013-NEXT: v_lshlrev_b32_e32 v6, 2, v0
 ; GFX1013-NEXT: v_mov_b32_e32 v7, 0x40a00000
-; GFX1013-NEXT: v_mov_b32_e32 v8, 0x40c00000
-; GFX1013-NEXT: v_mov_b32_e32 v9, 0x40e00000
 ; GFX1013-NEXT: v_mov_b32_e32 v10, 0x41000000
+; GFX1013-NEXT: v_mov_b32_e32 v9, 0x40e00000
+; GFX1013-NEXT: v_mov_b32_e32 v8, 0x40c00000
 ; GFX1013-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX1013-NEXT: v_mov_b32_e32 v0, s0
 ; GFX1013-NEXT: v_mov_b32_e32 v1, s1
@@ -655,9 +655,9 @@
 ; GFX1013-NEXT: v_mov_b32_e32 v6, 4.0
 ; GFX1013-NEXT: flat_load_dword v0, v[4:5]
 ; GFX1013-NEXT: flat_load_dword v1, v[2:3]
-; GFX1013-NEXT: v_mov_b32_e32 v2, 0
-; GFX1013-NEXT: v_mov_b32_e32 v3, 1.0
 ; GFX1013-NEXT: v_mov_b32_e32 v4, 2.0
+; GFX1013-NEXT: v_mov_b32_e32 v3, 1.0
+; GFX1013-NEXT: v_mov_b32_e32 v2, 0
 ; GFX1013-NEXT: v_mov_b32_e32 v5, 0x40400000
 ; GFX1013-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX1013-NEXT: image_bvh_intersect_ray v[0:3], v[0:10], s[4:7]
@@ -738,8 +738,8 @@
 ; GFX1030-NEXT: v_mov_b32_e32 v4, 2.0
 ; GFX1030-NEXT: flat_load_dword v0, v[0:1]
 ; GFX1030-NEXT: flat_load_dword v1, v[2:3]
-; GFX1030-NEXT: v_mov_b32_e32 v2, 0
 ; GFX1030-NEXT: v_mov_b32_e32 v3, 1.0
+; GFX1030-NEXT: v_mov_b32_e32 v2, 0
 ; GFX1030-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX1030-NEXT: image_bvh_intersect_ray v[0:3], v[0:7], s[4:7] a16
 ; GFX1030-NEXT: s_waitcnt vmcnt(0)
@@ -763,9 +763,9 @@
 ; GFX1013-NEXT: v_mov_b32_e32 v6, 0x46004500
 ; GFX1013-NEXT: flat_load_dword v0, v[4:5]
 ; GFX1013-NEXT: flat_load_dword v1, v[2:3]
-; GFX1013-NEXT: v_mov_b32_e32 v2, 0
-; GFX1013-NEXT: v_mov_b32_e32 v3, 1.0
 ; GFX1013-NEXT: v_mov_b32_e32 v4, 2.0
+; GFX1013-NEXT: v_mov_b32_e32 v3, 1.0
+; GFX1013-NEXT: v_mov_b32_e32 v2, 0
 ; GFX1013-NEXT: v_mov_b32_e32 v5, 0x44004200
 ; GFX1013-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX1013-NEXT: image_bvh_intersect_ray v[0:3], v[0:7], s[4:7] a16
@@ -828,15 +828,15 @@
 ; GFX1030-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
 ; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34
 ; GFX1030-NEXT: v_lshlrev_b32_e32 v2, 2, v0
-; GFX1030-NEXT: v_mov_b32_e32 v3, 0
-; GFX1030-NEXT: v_mov_b32_e32 v4, 1.0
 ; GFX1030-NEXT: v_mov_b32_e32 v5, 2.0
-; GFX1030-NEXT: v_mov_b32_e32 v6, 0x40400000
-; GFX1030-NEXT: v_mov_b32_e32 v7, 4.0
+; GFX1030-NEXT: v_mov_b32_e32 v4, 1.0
+; GFX1030-NEXT: v_mov_b32_e32 v3, 0
 ; GFX1030-NEXT: v_mov_b32_e32 v8, 0x40a00000
-; GFX1030-NEXT: v_mov_b32_e32 v9, 0x40c00000
-; GFX1030-NEXT: v_mov_b32_e32 v10, 0x40e00000
+; GFX1030-NEXT: v_mov_b32_e32 v7, 4.0
+; GFX1030-NEXT: v_mov_b32_e32 v6, 0x40400000
 ; GFX1030-NEXT: v_mov_b32_e32 v11, 0x41000000
+; GFX1030-NEXT: v_mov_b32_e32 v10, 0x40e00000
+; GFX1030-NEXT: v_mov_b32_e32 v9, 0x40c00000
 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX1030-NEXT: v_mov_b32_e32 v0, s4
 ; GFX1030-NEXT: v_mov_b32_e32 v1, s5
@@ -857,15 +857,15 @@
 ; GFX1013-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
 ; GFX1013-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
 ; GFX1013-NEXT: v_lshlrev_b32_e32 v2, 2, v0
-; GFX1013-NEXT: v_mov_b32_e32 v3, 0
-; GFX1013-NEXT: v_mov_b32_e32 v4, 1.0
 ; GFX1013-NEXT: v_mov_b32_e32 v5, 2.0
-; GFX1013-NEXT: v_mov_b32_e32 v6, 0x40400000
-; GFX1013-NEXT: v_mov_b32_e32 v7, 4.0
+; GFX1013-NEXT: v_mov_b32_e32 v4, 1.0
+; GFX1013-NEXT: v_mov_b32_e32 v3, 0
 ; GFX1013-NEXT: v_mov_b32_e32 v8, 0x40a00000
-; GFX1013-NEXT: v_mov_b32_e32 v9, 0x40c00000
-; GFX1013-NEXT: v_mov_b32_e32 v10, 0x40e00000
+; GFX1013-NEXT: v_mov_b32_e32 v7, 4.0
+; GFX1013-NEXT: v_mov_b32_e32 v6, 0x40400000
 ; GFX1013-NEXT: v_mov_b32_e32 v11, 0x41000000
+; GFX1013-NEXT: v_mov_b32_e32 v10, 0x40e00000
+; GFX1013-NEXT: v_mov_b32_e32 v9, 0x40c00000
 ; GFX1013-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX1013-NEXT: v_mov_b32_e32 v0, s2
 ; GFX1013-NEXT: v_mov_b32_e32 v1, s3
@@ -939,9 +939,9 @@
 ; GFX1030-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
 ; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34
 ; GFX1030-NEXT: v_lshlrev_b32_e32 v2, 2, v0
-; GFX1030-NEXT: v_mov_b32_e32 v3, 0
-; GFX1030-NEXT: v_mov_b32_e32 v4, 1.0
 ; GFX1030-NEXT: v_mov_b32_e32 v5, 2.0
+; GFX1030-NEXT: v_mov_b32_e32 v4, 1.0
+; GFX1030-NEXT: v_mov_b32_e32 v3, 0
 ; GFX1030-NEXT: v_mov_b32_e32 v6, 0x44004200
 ; GFX1030-NEXT: v_mov_b32_e32 v7, 0x46004500
 ; GFX1030-NEXT: v_mov_b32_e32 v8, 0x48004700
@@ -965,9 +965,9 @@
 ; GFX1013-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
 ; GFX1013-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
 ; GFX1013-NEXT: v_lshlrev_b32_e32 v2, 2, v0
-; GFX1013-NEXT: v_mov_b32_e32 v3, 0
-; GFX1013-NEXT: v_mov_b32_e32 v4, 1.0
 ; GFX1013-NEXT: v_mov_b32_e32 v5, 2.0
+; GFX1013-NEXT: v_mov_b32_e32 v4, 1.0
+; GFX1013-NEXT: v_mov_b32_e32 v3, 0
 ; GFX1013-NEXT: v_mov_b32_e32 v6, 0x44004200
 ; GFX1013-NEXT: v_mov_b32_e32 v7, 0x46004500
 ; GFX1013-NEXT: v_mov_b32_e32 v8, 0x48004700
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.make.buffer.rsrc.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.make.buffer.rsrc.ll
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.make.buffer.rsrc.ll
@@ -8,22 +8,22 @@
 ; CHECK-NEXT: {{ $}}
 ; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
- ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 5678
- ; CHECK-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 1234
- ; CHECK-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 65535
- ; CHECK-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[COPY1]], [[S_MOV_B32_2]], implicit-def $scc
+ ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 5678, implicit $exec
+ ; CHECK-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1234, implicit $exec
+ ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 65535
+ ; CHECK-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[COPY1]], [[S_MOV_B32_]], implicit-def $scc
 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]]
 ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec
 ; CHECK-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[S_AND_B32_]]
 ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec
 ; CHECK-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
- ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
- ; CHECK-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
- ; CHECK-NEXT: $sgpr2 = COPY [[V_READFIRSTLANE_B32_2]]
- ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; CHECK-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec
- ; CHECK-NEXT: $sgpr3 = COPY [[V_READFIRSTLANE_B32_3]]
+ ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[V_MOV_B32_e32_1]]
+ ; CHECK-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 [[COPY4]]
+ ; CHECK-NEXT: $sgpr2 = COPY [[S_MOV_B32_1]]
+ ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY [[V_MOV_B32_e32_]]
+ ; CHECK-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 [[COPY5]]
+ ; CHECK-NEXT: $sgpr3 = COPY [[S_MOV_B32_2]]
 ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $sgpr3
 %rsrc = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p0(ptr %p, i16 0, i32 1234, i32 5678)
 ret ptr addrspace(8) %rsrc
@@ -55,24 +55,24 @@
 ; CHECK-NEXT: {{ $}}
 ; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
- ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 5678
- ; CHECK-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 1234
- ; CHECK-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 65535
- ; CHECK-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[COPY1]], [[S_MOV_B32_2]], implicit-def $scc
- ; CHECK-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 262144
- ; CHECK-NEXT: [[S_OR_B32_:%[0-9]+]]:sreg_32 = S_OR_B32 [[S_AND_B32_]], [[S_MOV_B32_3]], implicit-def $scc
+ ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 5678, implicit $exec
+ ; CHECK-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1234, implicit $exec
+ ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 65535
+ ; CHECK-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[COPY1]], [[S_MOV_B32_]], implicit-def $scc
+ ; CHECK-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 262144
+ ; CHECK-NEXT: [[S_OR_B32_:%[0-9]+]]:sreg_32 = S_OR_B32 [[S_AND_B32_]], [[S_MOV_B32_1]], implicit-def $scc
 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]]
 ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec
 ; CHECK-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[S_OR_B32_]]
 ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec
 ; CHECK-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
- ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
- ; CHECK-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
- ; CHECK-NEXT: $sgpr2 = COPY [[V_READFIRSTLANE_B32_2]]
- ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; CHECK-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec
- ; CHECK-NEXT: $sgpr3 = COPY [[V_READFIRSTLANE_B32_3]]
+ ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[V_MOV_B32_e32_1]]
+ ; CHECK-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 [[COPY4]]
+ ; CHECK-NEXT: $sgpr2 = COPY [[S_MOV_B32_2]]
+ ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY [[V_MOV_B32_e32_]]
+ ; CHECK-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 [[COPY5]]
+ ; CHECK-NEXT: $sgpr3 = COPY [[S_MOV_B32_3]]
 ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $sgpr3
 %rsrc = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p0(ptr %p, i16 4, i32 1234, i32 5678)
 ret ptr addrspace(8) %rsrc
@@ -157,8 +157,8 @@
 ; CHECK-NEXT: [[S_OR_B32_:%[0-9]+]]:sreg_32 = S_OR_B32 [[S_AND_B32_]], [[S_LSHL_B32_]], implicit-def $scc
 ; CHECK-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 0
 ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[S_OR_B32_]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
- ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_2]]
- ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_IDXEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_IDXEN [[COPY5]], [[REG_SEQUENCE]], [[S_MOV_B32_2]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32) from %ir.rsrc, align 1, addrspace 8)
+ ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_IDXEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_IDXEN [[V_MOV_B32_e32_]], [[REG_SEQUENCE]], [[S_MOV_B32_2]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32) from %ir.rsrc, align 1, addrspace 8)
 ; CHECK-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_IDXEN]]
 ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
 %rsrc = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p0(ptr %p, i16 %stride, i32 %numVals, i32 %flags)
@@ -178,15 +178,13 @@
 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
 ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4
- ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 65535
- ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; CHECK-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 16
- ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
- ; CHECK-NEXT: [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[COPY6]], [[COPY2]], implicit $exec
- ; CHECK-NEXT: [[V_AND_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_OR_B32_e64 [[COPY1]], [[COPY5]], [[V_LSHLREV_B32_e64_]], implicit $exec
- ; CHECK-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+ ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 65535, implicit $exec
+ ; CHECK-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 16, implicit $exec
+ ; CHECK-NEXT: [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[V_MOV_B32_e32_1]], [[COPY2]], implicit $exec
+ ; CHECK-NEXT: [[V_AND_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_OR_B32_e64 [[COPY1]], [[V_MOV_B32_e32_]], [[V_LSHLREV_B32_e64_]], implicit $exec
+ ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
 ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[V_AND_OR_B32_e64_]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
- ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_2]]
+ ; CHECK-NEXT: [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
 ; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
 ; CHECK-NEXT: {{ $}}
 ; CHECK-NEXT: bb.2:
@@ -197,19 +195,19 @@
 ; CHECK-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec
 ; CHECK-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
 ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
- ; CHECK-NEXT: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
- ; CHECK-NEXT: [[COPY9:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3
- ; CHECK-NEXT: [[COPY10:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub0_sub1
- ; CHECK-NEXT: [[COPY11:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
- ; CHECK-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY10]], [[COPY8]], implicit $exec
- ; CHECK-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY11]], [[COPY9]], implicit $exec
+ ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
+ ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3
+ ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub0_sub1
+ ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
+ ; CHECK-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY7]], [[COPY5]], implicit $exec
+ ; CHECK-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY8]], [[COPY6]], implicit $exec
 ; CHECK-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc
 ; CHECK-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec
 ; CHECK-NEXT: {{ $}}
 ; CHECK-NEXT: bb.3:
 ; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000)
 ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_IDXEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_IDXEN [[COPY7]], [[REG_SEQUENCE1]], [[S_MOV_B32_2]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32) from %ir.rsrc, align 1, addrspace 8)
+ ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_IDXEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_IDXEN [[V_MOV_B32_e32_2]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32) from %ir.rsrc, align 1, addrspace 8)
 ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
 ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
 ; CHECK-NEXT: {{ $}}
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.load.format.f16.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.load.format.f16.ll
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.load.format.f16.ll
@@ -65,14 +65,12 @@
 ; UNPACKED-NEXT: [[BUFFER_LOAD_FORMAT_D16_XY_gfx80_OFFEN:%[0-9]+]]:vreg_64 = BUFFER_LOAD_FORMAT_D16_XY_gfx80_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, implicit $exec :: (dereferenceable load (<2 x s16>), align 1, addrspace 8)
 ; UNPACKED-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_D16_XY_gfx80_OFFEN]].sub0
 ; UNPACKED-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_D16_XY_gfx80_OFFEN]].sub1
- ; UNPACKED-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 65535
- ; UNPACKED-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; UNPACKED-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY6]], [[COPY8]], implicit $exec
- ; UNPACKED-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; UNPACKED-NEXT: [[V_AND_B32_e64_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY7]], [[COPY9]], implicit $exec
- ; UNPACKED-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 16
- ; UNPACKED-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
- ; UNPACKED-NEXT: [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[COPY10]], [[V_AND_B32_e64_1]], implicit $exec
+ ; UNPACKED-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 65535, implicit $exec
+ ; UNPACKED-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 65535, implicit $exec
+ ; UNPACKED-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY6]], [[V_MOV_B32_e32_1]], implicit $exec
+ ; UNPACKED-NEXT: [[V_AND_B32_e64_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY7]], [[V_MOV_B32_e32_]], implicit $exec
+ ; UNPACKED-NEXT: [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 16, implicit $exec
+ ; UNPACKED-NEXT: [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[V_MOV_B32_e32_2]], [[V_AND_B32_e64_1]], implicit $exec
 ; UNPACKED-NEXT: [[V_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_AND_B32_e64_]], [[V_LSHLREV_B32_e64_]], implicit $exec
 ; UNPACKED-NEXT: $vgpr0 = COPY [[V_OR_B32_e64_]]
 ; UNPACKED-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
@@ -120,21 +118,19 @@
 ; UNPACKED-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_D16_XYZW_gfx80_OFFEN]].sub1
 ; UNPACKED-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_D16_XYZW_gfx80_OFFEN]].sub2
 ; UNPACKED-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_D16_XYZW_gfx80_OFFEN]].sub3
- ; UNPACKED-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 65535
- ; UNPACKED-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; UNPACKED-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY6]], [[COPY10]], implicit $exec
- ; UNPACKED-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; UNPACKED-NEXT: [[V_AND_B32_e64_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY7]], [[COPY11]], implicit $exec
- ; UNPACKED-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 16
- ; UNPACKED-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
- ; UNPACKED-NEXT: [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[COPY12]], [[V_AND_B32_e64_1]], implicit $exec
+ ; UNPACKED-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 65535, implicit $exec
+ ; UNPACKED-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 65535, implicit $exec
+ ; UNPACKED-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY6]], [[V_MOV_B32_e32_1]], implicit $exec
+ ; UNPACKED-NEXT: [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 65535, implicit $exec
+ ; UNPACKED-NEXT: [[V_AND_B32_e64_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY7]], [[V_MOV_B32_e32_2]], implicit $exec
+ ; UNPACKED-NEXT: [[V_MOV_B32_e32_3:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 16, implicit $exec
+ ; UNPACKED-NEXT: [[V_MOV_B32_e32_4:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 16, implicit $exec
+ ; UNPACKED-NEXT: [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[V_MOV_B32_e32_4]], [[V_AND_B32_e64_1]], implicit $exec
 ; UNPACKED-NEXT: [[V_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_AND_B32_e64_]], [[V_LSHLREV_B32_e64_]], implicit $exec
- ; UNPACKED-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; UNPACKED-NEXT: [[V_AND_B32_e64_2:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY8]], [[COPY13]], implicit $exec
- ; UNPACKED-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; UNPACKED-NEXT: [[V_AND_B32_e64_3:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY9]], [[COPY14]], implicit $exec
- ; UNPACKED-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
- ; UNPACKED-NEXT: [[V_LSHLREV_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[COPY15]], [[V_AND_B32_e64_3]], implicit $exec
+ ; UNPACKED-NEXT: [[V_MOV_B32_e32_5:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 65535, implicit $exec
+ ; UNPACKED-NEXT: [[V_AND_B32_e64_2:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY8]], [[V_MOV_B32_e32_5]], implicit $exec
+ ; UNPACKED-NEXT: [[V_AND_B32_e64_3:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY9]], [[V_MOV_B32_e32_]], implicit $exec
+ ; UNPACKED-NEXT: [[V_LSHLREV_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[V_MOV_B32_e32_3]], [[V_AND_B32_e64_3]], implicit $exec
 ; UNPACKED-NEXT: [[V_OR_B32_e64_1:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_AND_B32_e64_2]], [[V_LSHLREV_B32_e64_1]], implicit $exec
 ; UNPACKED-NEXT: $vgpr0 = COPY [[V_OR_B32_e64_]]
 ; UNPACKED-NEXT: $vgpr1 = COPY [[V_OR_B32_e64_1]]
@@ -283,21 +279,19 @@
 ; UNPACKED-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_D16_XYZW_gfx80_OFFEN]].sub1
 ; UNPACKED-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_D16_XYZW_gfx80_OFFEN]].sub2
 ; UNPACKED-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_D16_XYZW_gfx80_OFFEN]].sub3
- ; UNPACKED-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 65535
- ; UNPACKED-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; UNPACKED-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY6]], [[COPY10]], implicit $exec
- ; UNPACKED-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; UNPACKED-NEXT: [[V_AND_B32_e64_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY7]], [[COPY11]], implicit $exec
- ; UNPACKED-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 16
- ; UNPACKED-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
- ; UNPACKED-NEXT: [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[COPY12]], [[V_AND_B32_e64_1]], implicit $exec
+ ; UNPACKED-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 65535, implicit $exec
+ ; UNPACKED-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 65535, implicit $exec
+ ; UNPACKED-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY6]], [[V_MOV_B32_e32_1]], implicit $exec
+ ; UNPACKED-NEXT: [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 65535, implicit $exec
+ ; UNPACKED-NEXT: [[V_AND_B32_e64_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY7]], [[V_MOV_B32_e32_2]], implicit $exec
+ ; UNPACKED-NEXT: [[V_MOV_B32_e32_3:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 16, implicit $exec
+ ; UNPACKED-NEXT: [[V_MOV_B32_e32_4:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 16, implicit $exec
+ ; UNPACKED-NEXT: [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[V_MOV_B32_e32_4]], [[V_AND_B32_e64_1]], implicit $exec
 ; UNPACKED-NEXT: [[V_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_AND_B32_e64_]], [[V_LSHLREV_B32_e64_]], implicit $exec
- ; UNPACKED-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; UNPACKED-NEXT: [[V_AND_B32_e64_2:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY8]], [[COPY13]], implicit $exec
- ; UNPACKED-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; UNPACKED-NEXT: [[V_AND_B32_e64_3:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY9]], [[COPY14]], implicit $exec
- ; UNPACKED-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
- ; UNPACKED-NEXT: [[V_LSHLREV_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[COPY15]], [[V_AND_B32_e64_3]], implicit $exec
+ ; UNPACKED-NEXT: [[V_MOV_B32_e32_5:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 65535, implicit $exec
+ ; UNPACKED-NEXT: [[V_AND_B32_e64_2:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY8]], [[V_MOV_B32_e32_5]], implicit $exec
+ ; UNPACKED-NEXT: [[V_AND_B32_e64_3:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY9]], [[V_MOV_B32_e32_]], implicit $exec
+ ; UNPACKED-NEXT: [[V_LSHLREV_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[V_MOV_B32_e32_3]], [[V_AND_B32_e64_3]], implicit $exec
 ; UNPACKED-NEXT: [[V_OR_B32_e64_1:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_AND_B32_e64_2]], [[V_LSHLREV_B32_e64_1]], implicit $exec
 ; UNPACKED-NEXT: $vgpr0 = COPY [[V_OR_B32_e64_]]
 ; UNPACKED-NEXT: $vgpr1 = COPY [[V_OR_B32_e64_1]]
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.load.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.load.ll
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.load.ll
@@ -646,9 +646,8 @@
 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
 ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
 ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr6
- ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096
- ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY5]], [[REG_SEQUENCE]], [[COPY4]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
+ ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec
+ ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[V_MOV_B32_e32_]], [[REG_SEQUENCE]], [[COPY4]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
 ; CHECK-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
 ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
 %val = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 4096, i32 %soffset, i32 0)
@@ -707,9 +706,8 @@
 ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
 ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
- ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096
- ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; CHECK-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 [[COPY4]], [[COPY6]], 0, implicit $exec
+ ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec
+ ; CHECK-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 [[COPY4]], [[V_MOV_B32_e32_]], 0, implicit $exec
 ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[V_ADD_CO_U32_e64_]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
 ; CHECK-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
 ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
@@ -891,9 +889,8 @@
 ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
 ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4
 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr2
- ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096
- ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; CHECK-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 [[COPY4]], [[COPY6]], 0, implicit $exec
+ ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec
+ ; CHECK-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 [[COPY4]], [[V_MOV_B32_e32_]], 0, implicit $exec
 ; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
 ; CHECK-NEXT: {{ $}}
 ; CHECK-NEXT: bb.2:
@@ -904,12 +901,12 @@
 ; CHECK-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec
 ; CHECK-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec
 ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
- ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
- ; CHECK-NEXT: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3
- ; CHECK-NEXT: [[COPY9:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub0_sub1
- ; CHECK-NEXT: [[COPY10:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
- ; CHECK-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY9]], [[COPY7]], implicit $exec
- ; CHECK-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY10]], [[COPY8]], implicit $exec
+ ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
+ ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3
+ ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub0_sub1
+ ; CHECK-NEXT: [[COPY9:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
+ ; CHECK-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY8]], [[COPY6]], implicit $exec
+ ; CHECK-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY9]], [[COPY7]], implicit $exec
 ; CHECK-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc
 ; CHECK-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec
 ; CHECK-NEXT: {{ $}}
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.format.f16.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.format.f16.ll
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.format.f16.ll
@@ -79,9 +79,8 @@
 ; UNPACKED-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
 ; UNPACKED-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
 ; UNPACKED-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
- ; UNPACKED-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 16
- ; UNPACKED-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; UNPACKED-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[COPY7]], [[COPY4]], implicit $exec
+ ; UNPACKED-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 16, implicit $exec
+ ; UNPACKED-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[V_MOV_B32_e32_]], [[COPY4]], implicit $exec
 ; UNPACKED-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[V_LSHRREV_B32_e64_]], %subreg.sub1
 ; UNPACKED-NEXT: BUFFER_STORE_FORMAT_D16_XY_gfx80_OFFEN_exact [[REG_SEQUENCE1]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable store (<2 x s16>), align 1, addrspace 8)
 ; UNPACKED-NEXT: S_ENDPGM 0
@@ -117,11 +116,10 @@
 ; UNPACKED-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
 ; UNPACKED-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
 ; UNPACKED-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6
- ; UNPACKED-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 16
- ; UNPACKED-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; UNPACKED-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[COPY8]], [[COPY4]], implicit $exec
- ; UNPACKED-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; UNPACKED-NEXT: [[V_LSHRREV_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[COPY9]], [[COPY5]], implicit $exec
+ ; UNPACKED-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 16, implicit $exec
+ ; UNPACKED-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 16, implicit $exec
+ ; UNPACKED-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[V_MOV_B32_e32_1]], [[COPY4]], implicit $exec
+ ; UNPACKED-NEXT: [[V_LSHRREV_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[V_MOV_B32_e32_]], [[COPY5]], implicit $exec
 ; UNPACKED-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[V_LSHRREV_B32_e64_]], %subreg.sub1, [[COPY5]], %subreg.sub2, [[V_LSHRREV_B32_e64_1]], %subreg.sub3
 ; UNPACKED-NEXT: BUFFER_STORE_FORMAT_D16_XYZW_gfx80_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, implicit $exec :: (dereferenceable store (<4 x s16>), align 1, addrspace 8)
 ; UNPACKED-NEXT: S_ENDPGM 0
@@ -161,11 +159,10 @@
 ; UNPACKED-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr5
 ; UNPACKED-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr6
 ; UNPACKED-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr2
- ; UNPACKED-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 16
- ; UNPACKED-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; UNPACKED-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[COPY8]], [[COPY4]], implicit $exec
- ; UNPACKED-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; UNPACKED-NEXT: [[V_LSHRREV_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[COPY9]], [[COPY5]], implicit $exec
+ ; UNPACKED-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 16, implicit $exec
+ ; UNPACKED-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 16, implicit $exec
+ ; UNPACKED-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[V_MOV_B32_e32_1]], [[COPY4]], implicit $exec
+ ; UNPACKED-NEXT: [[V_LSHRREV_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[V_MOV_B32_e32_]], [[COPY5]], implicit $exec
 ; UNPACKED-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[V_LSHRREV_B32_e64_]], %subreg.sub1, [[COPY5]], %subreg.sub2, [[V_LSHRREV_B32_e64_1]], %subreg.sub3
 ; UNPACKED-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
 ; UNPACKED-NEXT: {{ $}}
@@ -177,12 +174,12 @@
 ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec
 ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec
 ; UNPACKED-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
- ; UNPACKED-NEXT: [[COPY10:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
- ; UNPACKED-NEXT: [[COPY11:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3
- ; UNPACKED-NEXT: [[COPY12:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE2]].sub0_sub1
- ; UNPACKED-NEXT: [[COPY13:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE2]].sub2_sub3
- ; UNPACKED-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY12]], [[COPY10]], implicit $exec
- ; UNPACKED-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY13]], [[COPY11]], implicit $exec
+ ; UNPACKED-NEXT: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
+ ; UNPACKED-NEXT: [[COPY9:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3
+ ; UNPACKED-NEXT: [[COPY10:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE2]].sub0_sub1
+ ; UNPACKED-NEXT: [[COPY11:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE2]].sub2_sub3
+ ; UNPACKED-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY10]], [[COPY8]], implicit $exec
+ ; UNPACKED-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY11]], [[COPY9]], implicit $exec
 ; UNPACKED-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc
 ; UNPACKED-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec
 ; UNPACKED-NEXT: {{ $}}
@@ -265,9 +262,8 @@
 ; UNPACKED-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
 ; UNPACKED-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
 ; UNPACKED-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4095
- ; UNPACKED-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 16
- ; UNPACKED-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
- ; UNPACKED-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[COPY6]], [[COPY4]], implicit $exec
+ ; UNPACKED-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 16, implicit $exec
+ ; UNPACKED-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[V_MOV_B32_e32_]], [[COPY4]], implicit $exec
 ; UNPACKED-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[V_LSHRREV_B32_e64_]], %subreg.sub1
 ; UNPACKED-NEXT: BUFFER_STORE_FORMAT_D16_XY_gfx80_OFFEN_exact [[REG_SEQUENCE1]], [[COPY5]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable store (<2 x s16>), align 1, addrspace 8)
 ; UNPACKED-NEXT: S_ENDPGM 0
@@ -302,9 +298,8 @@
 ; UNPACKED-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
 ; UNPACKED-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
 ; UNPACKED-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096
- ; UNPACKED-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 16
- ; UNPACKED-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
- ; UNPACKED-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[COPY6]], [[COPY4]], implicit $exec
+ ; UNPACKED-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 16, implicit $exec
+ ; UNPACKED-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[V_MOV_B32_e32_]], [[COPY4]], implicit $exec
 ; UNPACKED-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[V_LSHRREV_B32_e64_]], %subreg.sub1
 ; UNPACKED-NEXT: BUFFER_STORE_FORMAT_D16_XY_gfx80_OFFEN_exact [[REG_SEQUENCE1]], [[COPY5]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable store (<2 x s16>), align 1, addrspace 8)
 ; UNPACKED-NEXT: S_ENDPGM 0
@@ -339,9 +334,8 @@
 ; UNPACKED-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
 ; UNPACKED-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
 ; UNPACKED-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
- ; UNPACKED-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 16
- ; UNPACKED-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; UNPACKED-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[COPY7]], [[COPY4]], implicit $exec
+ ; UNPACKED-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 16, implicit $exec
+ ; UNPACKED-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[V_MOV_B32_e32_]], [[COPY4]], implicit $exec
 ; UNPACKED-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[V_LSHRREV_B32_e64_]], %subreg.sub1
 ; UNPACKED-NEXT: BUFFER_STORE_FORMAT_D16_XY_gfx80_OFFEN_exact [[REG_SEQUENCE1]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 16, 0, 0, implicit $exec :: (dereferenceable store (<2 x s16>), align 1, addrspace 8)
 ; UNPACKED-NEXT: S_ENDPGM 0
@@ -377,9 +371,8 @@
 ; UNPACKED-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
 ; UNPACKED-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
 ; UNPACKED-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
- ; UNPACKED-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 16
- ; UNPACKED-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; UNPACKED-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[COPY7]], [[COPY4]], implicit $exec
+ ; UNPACKED-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 16, implicit $exec
+ ; UNPACKED-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[V_MOV_B32_e32_]], [[COPY4]], implicit $exec
 ; UNPACKED-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[V_LSHRREV_B32_e64_]], %subreg.sub1
 ; UNPACKED-NEXT: BUFFER_STORE_FORMAT_D16_XY_gfx80_OFFEN_exact [[REG_SEQUENCE1]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 4095, 0, 0, implicit $exec :: (dereferenceable store (<2 x s16>), align 1, addrspace 8)
 ; UNPACKED-NEXT: S_ENDPGM 0
@@ -415,12 +408,10 @@
 ; UNPACKED-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
 ; UNPACKED-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
 ; UNPACKED-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
- ; UNPACKED-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096
- ; UNPACKED-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; UNPACKED-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 [[COPY5]], [[COPY7]], 0, implicit $exec
- ; UNPACKED-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 16
- ; UNPACKED-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
- ; UNPACKED-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[COPY8]], [[COPY4]], implicit $exec
+ ; UNPACKED-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec
+ ; UNPACKED-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 [[COPY5]], [[V_MOV_B32_e32_]], 0, implicit $exec
+ ; UNPACKED-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 16, implicit $exec
+ ; UNPACKED-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[V_MOV_B32_e32_1]], [[COPY4]], implicit $exec
 ; UNPACKED-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[V_LSHRREV_B32_e64_]], %subreg.sub1
 ; UNPACKED-NEXT: BUFFER_STORE_FORMAT_D16_XY_gfx80_OFFEN_exact [[REG_SEQUENCE1]], [[V_ADD_CO_U32_e64_]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable store (<2 x s16>), align 1, addrspace 8)
 ; UNPACKED-NEXT: S_ENDPGM 0
@@ -436,9 +427,8 @@
 ; PACKED-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
 ; PACKED-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
 ; PACKED-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
- ; PACKED-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096
- ; PACKED-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; PACKED-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 [[COPY5]], [[COPY7]], 0, implicit $exec
+ ; PACKED-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec
+ ; PACKED-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 [[COPY5]], [[V_MOV_B32_e32_]], 0, implicit $exec
 ; PACKED-NEXT: BUFFER_STORE_FORMAT_D16_XY_OFFEN_exact [[COPY4]], [[V_ADD_CO_U32_e64_]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable store (<2 x s16>), align 1, addrspace 8)
 ; PACKED-NEXT: S_ENDPGM 0
 %voffset.add = add i32 %voffset, 4096
@@ -463,14 +453,12 @@
 ; UNPACKED-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr5
 ; UNPACKED-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr6
 ; UNPACKED-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr2
- ; UNPACKED-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096
- ; UNPACKED-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; UNPACKED-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 [[COPY6]], [[COPY8]], 0, implicit $exec
- ; UNPACKED-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 16
- ; UNPACKED-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
- ; UNPACKED-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[COPY9]], [[COPY4]], implicit $exec
- ; UNPACKED-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
- ; UNPACKED-NEXT: [[V_LSHRREV_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[COPY10]], [[COPY5]], implicit $exec
+ ; UNPACKED-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec
+ ; UNPACKED-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 [[COPY6]], [[V_MOV_B32_e32_]], 0, implicit $exec
+ ; UNPACKED-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 16, implicit $exec
+ ; UNPACKED-NEXT: [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 16, implicit $exec
+ ; UNPACKED-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[V_MOV_B32_e32_2]], [[COPY4]], implicit $exec
+ ; UNPACKED-NEXT: [[V_LSHRREV_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[V_MOV_B32_e32_1]], [[COPY5]], implicit $exec
 ; UNPACKED-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[V_LSHRREV_B32_e64_]], %subreg.sub1, [[COPY5]], %subreg.sub2, [[V_LSHRREV_B32_e64_1]], %subreg.sub3
 ; UNPACKED-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
 ; UNPACKED-NEXT: {{ $}}
@@ -482,12 +470,12 @@
 ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec
 ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec
 ; UNPACKED-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
- ; UNPACKED-NEXT: [[COPY11:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
- ; UNPACKED-NEXT: [[COPY12:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3
- ; UNPACKED-NEXT: [[COPY13:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE2]].sub0_sub1
- ; UNPACKED-NEXT: [[COPY14:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE2]].sub2_sub3
- ; UNPACKED-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY13]], [[COPY11]], implicit $exec
- ; UNPACKED-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY14]], [[COPY12]], implicit $exec
+ ; UNPACKED-NEXT: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
+ ; UNPACKED-NEXT: [[COPY9:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3
+ ; UNPACKED-NEXT: [[COPY10:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE2]].sub0_sub1
+ ; UNPACKED-NEXT: [[COPY11:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE2]].sub2_sub3
+ ; UNPACKED-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY10]], [[COPY8]], implicit $exec
+ ; UNPACKED-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY11]], [[COPY9]], implicit $exec
 ; UNPACKED-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc
 ; UNPACKED-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec
 ; UNPACKED-NEXT: {{ $}}
@@ -520,9 +508,8 @@
 ; PACKED-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
 ; PACKED-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr6
 ; PACKED-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr2
- ; PACKED-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096
- ; PACKED-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; PACKED-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 [[COPY6]], [[COPY8]], 0, implicit $exec
+ ; PACKED-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec
+ ; PACKED-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 [[COPY6]], [[V_MOV_B32_e32_]], 0, implicit $exec
 ; PACKED-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
 ; PACKED-NEXT: {{ $}}
 ; PACKED-NEXT: bb.2:
@@ -533,12 +520,12 @@
 ; PACKED-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec
 ; PACKED-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec
 ; PACKED-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
- ; PACKED-NEXT: [[COPY9:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
- ; PACKED-NEXT: [[COPY10:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3
- ; PACKED-NEXT: [[COPY11:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE2]].sub0_sub1
- ; PACKED-NEXT: [[COPY12:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE2]].sub2_sub3
- ; PACKED-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY11]], [[COPY9]], implicit $exec
- ; PACKED-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY12]], [[COPY10]], implicit $exec
+ ; PACKED-NEXT: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
+ ; PACKED-NEXT: [[COPY9:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3
+ ; PACKED-NEXT: [[COPY10:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE2]].sub0_sub1
+ ; PACKED-NEXT: [[COPY11:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE2]].sub2_sub3
+ ; PACKED-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY10]], [[COPY8]], implicit $exec
+ ; PACKED-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY11]], [[COPY9]], implicit $exec
 ; PACKED-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc
 ; PACKED-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec
 ; PACKED-NEXT: {{ $}}
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.format.f32.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.format.f32.ll
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.format.f32.ll
@@ -261,9 +261,8 @@
 ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
 ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6
- ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096
- ; CHECK-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; CHECK-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 [[COPY6]], [[COPY8]], 0, implicit $exec
+ ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec
+ ; CHECK-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 [[COPY6]], [[V_MOV_B32_e32_]], 0, implicit $exec
 ; CHECK-NEXT: BUFFER_STORE_FORMAT_XY_OFFEN_exact [[REG_SEQUENCE1]], [[V_ADD_CO_U32_e64_]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, implicit $exec :: (dereferenceable store (<2 x s32>), align 1, addrspace 8)
 ; CHECK-NEXT: S_ENDPGM 0
 %voffset.add = add i32 %voffset, 4096
@@ -291,9 +290,8 @@
 ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2, [[COPY7]], %subreg.sub3
 ; CHECK-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr8
 ; CHECK-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY $sgpr2
- ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096
- ; CHECK-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; CHECK-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 [[COPY8]], [[COPY10]], 0, implicit $exec
+ ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec
+ ; CHECK-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 [[COPY8]], [[V_MOV_B32_e32_]], 0, implicit $exec
 ; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
 ; CHECK-NEXT: {{ $}}
 ; CHECK-NEXT: bb.2:
@@ -304,12 +302,12 @@
 ; CHECK-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec
 ; CHECK-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec
 ; CHECK-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
- ; CHECK-NEXT: [[COPY11:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
- ; CHECK-NEXT: [[COPY12:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3
- ; CHECK-NEXT: [[COPY13:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE2]].sub0_sub1
- ; CHECK-NEXT: [[COPY14:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE2]].sub2_sub3
- ; CHECK-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY13]], [[COPY11]], implicit $exec
- ; CHECK-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY14]], [[COPY12]], implicit $exec
+ ; CHECK-NEXT: [[COPY10:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
+ ; CHECK-NEXT: [[COPY11:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3
+ ; CHECK-NEXT: [[COPY12:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE2]].sub0_sub1
+ ; CHECK-NEXT: [[COPY13:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE2]].sub2_sub3
+ ; CHECK-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY12]], [[COPY10]], implicit $exec
+ ; CHECK-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY13]], [[COPY11]], implicit $exec
 ; CHECK-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc
 ; CHECK-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec
 ; CHECK-NEXT: {{ $}}
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.ll
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.ll
@@ -575,9 +575,8 @@
 ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
 ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
- ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096
- ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; CHECK-NEXT: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY6]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8)
+ ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec
+ ; CHECK-NEXT: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[V_MOV_B32_e32_]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8)
 ; CHECK-NEXT: S_ENDPGM 0
 call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 4096, i32 %soffset, i32 0)
 ret void
@@ -636,9 +635,8 @@
 ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
- ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096
- ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; CHECK-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 [[COPY5]], [[COPY7]], 0, implicit $exec
+ ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec
+ ; CHECK-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 [[COPY5]], [[V_MOV_B32_e32_]], 0, implicit $exec
 ; CHECK-NEXT: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[V_ADD_CO_U32_e64_]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8)
 ; CHECK-NEXT: S_ENDPGM 0
 %voffset.add = add i32 %voffset, 4096
@@ -737,9 +735,8 @@
 ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
- ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096
- ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; CHECK-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 [[COPY5]], [[COPY7]], 0, implicit $exec
+ ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec
+ ; CHECK-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 [[COPY5]], [[V_MOV_B32_e32_]], 0, implicit $exec
 ; CHECK-NEXT: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[V_ADD_CO_U32_e64_]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable store (<2 x s16>), align 1, addrspace 8)
 ; CHECK-NEXT: S_ENDPGM 0
 %voffset.add = add i32 %voffset, 4096
@@ -762,9 +759,8 @@
 ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4
 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr5
 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr2
- ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096
- ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; CHECK-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 [[COPY5]], [[COPY7]], 0, implicit $exec
+ ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec
+ ; CHECK-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 [[COPY5]], [[V_MOV_B32_e32_]], 0, implicit $exec
 ; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
 ; CHECK-NEXT: {{ $}}
 ; CHECK-NEXT: bb.2:
@@ -775,12 +771,12 @@
 ; CHECK-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec
 ; CHECK-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec
 ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
- ; CHECK-NEXT: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
- ; CHECK-NEXT: [[COPY9:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3
- ; CHECK-NEXT: [[COPY10:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub0_sub1
- ; CHECK-NEXT: [[COPY11:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
- ; CHECK-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY10]], [[COPY8]], implicit $exec
- ; CHECK-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY11]], [[COPY9]], implicit $exec
+ ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
+ ; CHECK-NEXT: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3
+ ; CHECK-NEXT: [[COPY9:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub0_sub1
+ ; CHECK-NEXT: [[COPY10:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
+ ; CHECK-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY9]], [[COPY7]], implicit $exec
+ ; CHECK-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY10]], [[COPY8]], implicit $exec
 ; CHECK-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc
 ; CHECK-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec
 ; CHECK-NEXT: {{ $}}
@@ -817,8 +813,7 @@
 ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
 ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4
 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr2
- ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096
- ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
+ ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec
 ; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
 ; CHECK-NEXT: {{ $}}
 ; CHECK-NEXT: bb.2:
@@ -829,19 +824,19 @@
 ; CHECK-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec
 ; CHECK-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec
 ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
- ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
- ; CHECK-NEXT: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3
- ; CHECK-NEXT: [[COPY9:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub0_sub1
- ; CHECK-NEXT: [[COPY10:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
- ; CHECK-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY9]], [[COPY7]], implicit $exec
- ; CHECK-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY10]], [[COPY8]], implicit $exec
+ ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
+ ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3
+ ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub0_sub1
+ ; CHECK-NEXT: [[COPY9:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
+ ; CHECK-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY8]], [[COPY6]], implicit $exec
+ ; CHECK-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY9]], [[COPY7]], implicit $exec
 ; CHECK-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc
 ; CHECK-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec
 ; CHECK-NEXT: {{ $}}
 ; CHECK-NEXT: bb.3:
 ; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000)
 ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY6]], [[REG_SEQUENCE1]], [[COPY5]], 904, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8)
+ ; CHECK-NEXT: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[V_MOV_B32_e32_]], [[REG_SEQUENCE1]], [[COPY5]], 904, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8)
 ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
 ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
 ; CHECK-NEXT: {{ $}}
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.load.format.f16.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.load.format.f16.ll
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.load.format.f16.ll
@@ -65,14 +65,12 @@
 ; UNPACKED-NEXT: [[BUFFER_LOAD_FORMAT_D16_XY_gfx80_OFFEN:%[0-9]+]]:vreg_64 = BUFFER_LOAD_FORMAT_D16_XY_gfx80_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, implicit $exec :: (dereferenceable load (<2 x s16>) from %ir.rsrc, align 1, addrspace 8)
 ; UNPACKED-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_D16_XY_gfx80_OFFEN]].sub0
 ; UNPACKED-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_D16_XY_gfx80_OFFEN]].sub1
- ; UNPACKED-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 65535
- ; UNPACKED-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; UNPACKED-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY6]], [[COPY8]], implicit $exec
- ; UNPACKED-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; UNPACKED-NEXT: [[V_AND_B32_e64_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY7]], [[COPY9]], implicit $exec
- ; UNPACKED-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 16
- ; UNPACKED-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
- ; UNPACKED-NEXT: [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[COPY10]], [[V_AND_B32_e64_1]], implicit $exec
+ ; UNPACKED-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 65535, implicit $exec
+ ; UNPACKED-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 65535, implicit $exec
+ ; UNPACKED-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY6]], [[V_MOV_B32_e32_1]], implicit $exec
+ ; UNPACKED-NEXT: [[V_AND_B32_e64_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY7]], [[V_MOV_B32_e32_]], implicit $exec
+ ; UNPACKED-NEXT: [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 16, implicit $exec
+ ; UNPACKED-NEXT: [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[V_MOV_B32_e32_2]], [[V_AND_B32_e64_1]], implicit $exec
 ; UNPACKED-NEXT: [[V_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_AND_B32_e64_]], [[V_LSHLREV_B32_e64_]], implicit $exec
 ; UNPACKED-NEXT: $vgpr0 = COPY [[V_OR_B32_e64_]]
 ; UNPACKED-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
@@ -120,21 +118,19 @@
 ; UNPACKED-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_D16_XYZW_gfx80_OFFEN]].sub1
 ; UNPACKED-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_D16_XYZW_gfx80_OFFEN]].sub2
 ; UNPACKED-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_D16_XYZW_gfx80_OFFEN]].sub3
- ; UNPACKED-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 65535
- ; UNPACKED-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; UNPACKED-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY6]], [[COPY10]], implicit $exec
- ; UNPACKED-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; UNPACKED-NEXT: [[V_AND_B32_e64_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY7]], [[COPY11]], implicit $exec
- ; UNPACKED-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 16
- ; UNPACKED-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
- ; UNPACKED-NEXT: [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[COPY12]], [[V_AND_B32_e64_1]], implicit $exec
+ ; UNPACKED-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 65535, implicit $exec
+ ; UNPACKED-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 65535, implicit $exec
+ ; UNPACKED-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY6]], [[V_MOV_B32_e32_1]], implicit $exec
+ ; UNPACKED-NEXT: [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 65535, implicit $exec
+ ; UNPACKED-NEXT: [[V_AND_B32_e64_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY7]], [[V_MOV_B32_e32_2]], implicit $exec
+ ; UNPACKED-NEXT: [[V_MOV_B32_e32_3:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 16, implicit $exec
+ ; UNPACKED-NEXT: [[V_MOV_B32_e32_4:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 16, implicit $exec
+ ; UNPACKED-NEXT: [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[V_MOV_B32_e32_4]], [[V_AND_B32_e64_1]], implicit $exec
 ; UNPACKED-NEXT: [[V_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_AND_B32_e64_]], [[V_LSHLREV_B32_e64_]], implicit $exec
- ; UNPACKED-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; UNPACKED-NEXT: [[V_AND_B32_e64_2:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY8]], [[COPY13]], implicit $exec
- ; UNPACKED-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; UNPACKED-NEXT: [[V_AND_B32_e64_3:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY9]], [[COPY14]], implicit $exec
- ; UNPACKED-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
- ; UNPACKED-NEXT: [[V_LSHLREV_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[COPY15]], [[V_AND_B32_e64_3]], implicit $exec
+ ; UNPACKED-NEXT: [[V_MOV_B32_e32_5:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 65535, implicit $exec
+ ; UNPACKED-NEXT: [[V_AND_B32_e64_2:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY8]], [[V_MOV_B32_e32_5]], implicit $exec
+ ; UNPACKED-NEXT: [[V_AND_B32_e64_3:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY9]], [[V_MOV_B32_e32_]], implicit $exec
+ ; UNPACKED-NEXT: [[V_LSHLREV_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[V_MOV_B32_e32_3]], [[V_AND_B32_e64_3]], implicit $exec
 ; UNPACKED-NEXT: [[V_OR_B32_e64_1:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_AND_B32_e64_2]], [[V_LSHLREV_B32_e64_1]], implicit $exec
 ; UNPACKED-NEXT: $vgpr0 = COPY [[V_OR_B32_e64_]]
 ; UNPACKED-NEXT: $vgpr1 = COPY [[V_OR_B32_e64_1]]
@@ -283,21 +279,19 @@
 ; UNPACKED-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_D16_XYZW_gfx80_OFFEN]].sub1
 ; UNPACKED-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_D16_XYZW_gfx80_OFFEN]].sub2
 ; UNPACKED-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_D16_XYZW_gfx80_OFFEN]].sub3
- ; UNPACKED-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 65535
- ; UNPACKED-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; UNPACKED-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY6]], [[COPY10]], implicit $exec
- ; UNPACKED-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; UNPACKED-NEXT: [[V_AND_B32_e64_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY7]], [[COPY11]], implicit $exec
- ; UNPACKED-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 16
- ; UNPACKED-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
- ; UNPACKED-NEXT: [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[COPY12]], [[V_AND_B32_e64_1]], implicit $exec
+ ; UNPACKED-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 65535, implicit $exec
+ ; UNPACKED-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 65535, implicit $exec
+ ; UNPACKED-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY6]], [[V_MOV_B32_e32_1]], implicit $exec
+ ; UNPACKED-NEXT: [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 65535, implicit $exec
+ ; UNPACKED-NEXT: [[V_AND_B32_e64_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY7]], [[V_MOV_B32_e32_2]], implicit $exec
+ ; UNPACKED-NEXT: [[V_MOV_B32_e32_3:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 16, implicit $exec
+ ; UNPACKED-NEXT: [[V_MOV_B32_e32_4:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 16, implicit $exec
+ ; UNPACKED-NEXT: [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[V_MOV_B32_e32_4]], [[V_AND_B32_e64_1]], implicit $exec
 ; UNPACKED-NEXT: [[V_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_AND_B32_e64_]], [[V_LSHLREV_B32_e64_]], implicit $exec
- ; UNPACKED-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; UNPACKED-NEXT: [[V_AND_B32_e64_2:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY8]], [[COPY13]], implicit $exec
- ; UNPACKED-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; UNPACKED-NEXT: [[V_AND_B32_e64_3:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY9]], [[COPY14]], implicit $exec
- ; UNPACKED-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
- ; UNPACKED-NEXT: [[V_LSHLREV_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[COPY15]], [[V_AND_B32_e64_3]], implicit $exec
+ ; UNPACKED-NEXT: [[V_MOV_B32_e32_5:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 65535, implicit $exec
+ ; UNPACKED-NEXT: [[V_AND_B32_e64_2:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY8]], [[V_MOV_B32_e32_5]], implicit $exec
+ ; UNPACKED-NEXT: [[V_AND_B32_e64_3:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY9]], [[V_MOV_B32_e32_]], implicit $exec
+ ; UNPACKED-NEXT: [[V_LSHLREV_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[V_MOV_B32_e32_3]], [[V_AND_B32_e64_3]], implicit $exec
 ; UNPACKED-NEXT: [[V_OR_B32_e64_1:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_AND_B32_e64_2]], [[V_LSHLREV_B32_e64_1]], implicit $exec
 ; UNPACKED-NEXT: $vgpr0 = COPY [[V_OR_B32_e64_]]
 ; UNPACKED-NEXT: $vgpr1 = COPY [[V_OR_B32_e64_1]]
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.load.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.load.ll
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.load.ll
@@ -645,10 +645,9 @@
 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
 ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr6
- ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096
+ ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec
 ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY5]], [[REG_SEQUENCE]], [[COPY4]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32) from %ir.rsrc, align 1, addrspace 8)
+ ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[V_MOV_B32_e32_]], [[REG_SEQUENCE]], [[COPY4]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32) from %ir.rsrc, align 1, addrspace 8)
 ; CHECK-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
 ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
 %val = call float @llvm.amdgcn.raw.ptr.buffer.load.f32(ptr addrspace(8) %rsrc, i32 4096, i32 %soffset, i32 0)
@@ -706,9 +705,8 @@
 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
 ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
- ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096
- ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; CHECK-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 [[COPY4]], [[COPY6]], 0, implicit $exec
+ ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec
+ ; CHECK-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 [[COPY4]], [[V_MOV_B32_e32_]], 0, implicit $exec
 ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
 ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[V_ADD_CO_U32_e64_]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32) from %ir.rsrc, align 1, addrspace 8)
 ; CHECK-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
@@ -891,9 +889,8 @@
 ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4
 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr2
 ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 =
REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096 - ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; CHECK-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 [[COPY4]], [[COPY6]], 0, implicit $exec + ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec + ; CHECK-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 [[COPY4]], [[V_MOV_B32_e32_]], 0, implicit $exec ; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: @@ -904,12 +901,12 @@ ; CHECK-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec ; CHECK-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 - ; CHECK-NEXT: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 - ; CHECK-NEXT: [[COPY9:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub0_sub1 - ; CHECK-NEXT: [[COPY10:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3 - ; CHECK-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY9]], [[COPY7]], implicit $exec - ; CHECK-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY10]], [[COPY8]], implicit $exec + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub0_sub1 + ; CHECK-NEXT: [[COPY9:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3 + ; CHECK-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY8]], [[COPY6]], implicit $exec + ; CHECK-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY9]], [[COPY7]], implicit $exec ; CHECK-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; CHECK-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK-NEXT: {{ $}} Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.store.format.f16.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.store.format.f16.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.store.format.f16.ll @@ -78,9 +78,8 @@ ; UNPACKED-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; UNPACKED-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; UNPACKED-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; UNPACKED-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 16 - ; UNPACKED-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; UNPACKED-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[COPY7]], [[COPY4]], implicit $exec + ; UNPACKED-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 16, implicit $exec + ; UNPACKED-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 
[[V_MOV_B32_e32_]], [[COPY4]], implicit $exec ; UNPACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[V_LSHRREV_B32_e64_]], %subreg.sub1 ; UNPACKED-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; UNPACKED-NEXT: BUFFER_STORE_FORMAT_D16_XY_gfx80_OFFEN_exact [[REG_SEQUENCE]], [[COPY5]], [[REG_SEQUENCE1]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable store (<2 x s16>) into %ir.rsrc, align 1, addrspace 8) @@ -116,11 +115,10 @@ ; UNPACKED-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; UNPACKED-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; UNPACKED-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; UNPACKED-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 16 - ; UNPACKED-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; UNPACKED-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[COPY8]], [[COPY4]], implicit $exec - ; UNPACKED-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; UNPACKED-NEXT: [[V_LSHRREV_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[COPY9]], [[COPY5]], implicit $exec + ; UNPACKED-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 16, implicit $exec + ; UNPACKED-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 16, implicit $exec + ; UNPACKED-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[V_MOV_B32_e32_1]], [[COPY4]], implicit $exec + ; UNPACKED-NEXT: [[V_LSHRREV_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[V_MOV_B32_e32_]], [[COPY5]], implicit $exec ; UNPACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[V_LSHRREV_B32_e64_]], %subreg.sub1, [[COPY5]], %subreg.sub2, [[V_LSHRREV_B32_e64_1]], %subreg.sub3 ; UNPACKED-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; UNPACKED-NEXT: BUFFER_STORE_FORMAT_D16_XYZW_gfx80_OFFEN_exact [[REG_SEQUENCE]], [[COPY6]], [[REG_SEQUENCE1]], [[COPY7]], 0, 0, 0, implicit $exec :: (dereferenceable store (<4 x s16>) into %ir.rsrc, align 1, addrspace 8) @@ -160,11 +158,10 @@ ; UNPACKED-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr5 ; UNPACKED-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr6 ; UNPACKED-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr2 - ; UNPACKED-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 16 - ; UNPACKED-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; UNPACKED-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[COPY8]], [[COPY4]], implicit $exec - ; UNPACKED-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; UNPACKED-NEXT: [[V_LSHRREV_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[COPY9]], [[COPY5]], implicit $exec + ; UNPACKED-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 16, implicit $exec + ; UNPACKED-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 16, implicit $exec + ; UNPACKED-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[V_MOV_B32_e32_1]], [[COPY4]], implicit $exec + ; UNPACKED-NEXT: [[V_LSHRREV_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[V_MOV_B32_e32_]], [[COPY5]], implicit $exec ; UNPACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[V_LSHRREV_B32_e64_]], %subreg.sub1, [[COPY5]], %subreg.sub2, [[V_LSHRREV_B32_e64_1]], %subreg.sub3 ; UNPACKED-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, 
[[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; UNPACKED-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec @@ -177,12 +174,12 @@ ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec ; UNPACKED-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; UNPACKED-NEXT: [[COPY10:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]].sub0_sub1 - ; UNPACKED-NEXT: [[COPY11:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3 - ; UNPACKED-NEXT: [[COPY12:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE2]].sub0_sub1 - ; UNPACKED-NEXT: [[COPY13:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE2]].sub2_sub3 - ; UNPACKED-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY12]], [[COPY10]], implicit $exec - ; UNPACKED-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY13]], [[COPY11]], implicit $exec + ; UNPACKED-NEXT: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]].sub0_sub1 + ; UNPACKED-NEXT: [[COPY9:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3 + ; UNPACKED-NEXT: [[COPY10:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE2]].sub0_sub1 + ; UNPACKED-NEXT: [[COPY11:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE2]].sub2_sub3 + ; UNPACKED-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY10]], [[COPY8]], implicit $exec + ; UNPACKED-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY11]], [[COPY9]], implicit $exec ; UNPACKED-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; UNPACKED-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; UNPACKED-NEXT: {{ $}} @@ -264,9 +261,8 @@ ; UNPACKED-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; UNPACKED-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; UNPACKED-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4095 - ; UNPACKED-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 16 - ; UNPACKED-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] - ; UNPACKED-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[COPY6]], [[COPY4]], implicit $exec + ; UNPACKED-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 16, implicit $exec + ; UNPACKED-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[V_MOV_B32_e32_]], [[COPY4]], implicit $exec ; UNPACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[V_LSHRREV_B32_e64_]], %subreg.sub1 ; UNPACKED-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; UNPACKED-NEXT: BUFFER_STORE_FORMAT_D16_XY_gfx80_OFFEN_exact [[REG_SEQUENCE]], [[COPY5]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable store (<2 x s16>) into %ir.rsrc, align 1, addrspace 8) @@ -301,9 +297,8 @@ ; UNPACKED-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; UNPACKED-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; UNPACKED-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096 - ; UNPACKED-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 16 - ; UNPACKED-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] - ; 
UNPACKED-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[COPY6]], [[COPY4]], implicit $exec + ; UNPACKED-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 16, implicit $exec + ; UNPACKED-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[V_MOV_B32_e32_]], [[COPY4]], implicit $exec ; UNPACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[V_LSHRREV_B32_e64_]], %subreg.sub1 ; UNPACKED-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; UNPACKED-NEXT: BUFFER_STORE_FORMAT_D16_XY_gfx80_OFFEN_exact [[REG_SEQUENCE]], [[COPY5]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable store (<2 x s16>) into %ir.rsrc, align 1, addrspace 8) @@ -338,9 +333,8 @@ ; UNPACKED-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; UNPACKED-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; UNPACKED-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; UNPACKED-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 16 - ; UNPACKED-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; UNPACKED-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[COPY7]], [[COPY4]], implicit $exec + ; UNPACKED-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 16, implicit $exec + ; UNPACKED-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[V_MOV_B32_e32_]], [[COPY4]], implicit $exec ; UNPACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[V_LSHRREV_B32_e64_]], %subreg.sub1 ; UNPACKED-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; UNPACKED-NEXT: BUFFER_STORE_FORMAT_D16_XY_gfx80_OFFEN_exact [[REG_SEQUENCE]], [[COPY5]], [[REG_SEQUENCE1]], [[COPY6]], 16, 0, 0, implicit $exec :: (dereferenceable store (<2 x s16>) into %ir.rsrc, align 1, addrspace 8) @@ -376,9 +370,8 @@ ; UNPACKED-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; UNPACKED-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; UNPACKED-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; UNPACKED-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 16 - ; UNPACKED-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; UNPACKED-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[COPY7]], [[COPY4]], implicit $exec + ; UNPACKED-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 16, implicit $exec + ; UNPACKED-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[V_MOV_B32_e32_]], [[COPY4]], implicit $exec ; UNPACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[V_LSHRREV_B32_e64_]], %subreg.sub1 ; UNPACKED-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; UNPACKED-NEXT: BUFFER_STORE_FORMAT_D16_XY_gfx80_OFFEN_exact [[REG_SEQUENCE]], [[COPY5]], [[REG_SEQUENCE1]], [[COPY6]], 4095, 0, 0, implicit $exec :: (dereferenceable store (<2 x s16>) into %ir.rsrc, align 1, addrspace 8) @@ -414,12 +407,10 @@ ; UNPACKED-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; UNPACKED-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; UNPACKED-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; UNPACKED-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096 - ; UNPACKED-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; UNPACKED-NEXT: 
[[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 [[COPY5]], [[COPY7]], 0, implicit $exec - ; UNPACKED-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 16 - ; UNPACKED-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] - ; UNPACKED-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[COPY8]], [[COPY4]], implicit $exec + ; UNPACKED-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec + ; UNPACKED-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 [[COPY5]], [[V_MOV_B32_e32_]], 0, implicit $exec + ; UNPACKED-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 16, implicit $exec + ; UNPACKED-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[V_MOV_B32_e32_1]], [[COPY4]], implicit $exec ; UNPACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[V_LSHRREV_B32_e64_]], %subreg.sub1 ; UNPACKED-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; UNPACKED-NEXT: BUFFER_STORE_FORMAT_D16_XY_gfx80_OFFEN_exact [[REG_SEQUENCE]], [[V_ADD_CO_U32_e64_]], [[REG_SEQUENCE1]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable store (<2 x s16>) into %ir.rsrc, align 1, addrspace 8) @@ -435,9 +426,8 @@ ; PACKED-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; PACKED-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; PACKED-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; PACKED-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096 - ; PACKED-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; PACKED-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 [[COPY5]], [[COPY7]], 0, implicit $exec + ; PACKED-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec + ; PACKED-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 [[COPY5]], [[V_MOV_B32_e32_]], 0, implicit $exec ; PACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; PACKED-NEXT: BUFFER_STORE_FORMAT_D16_XY_OFFEN_exact [[COPY4]], [[V_ADD_CO_U32_e64_]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable store (<2 x s16>) into %ir.rsrc, align 1, addrspace 8) ; PACKED-NEXT: S_ENDPGM 0 @@ -462,14 +452,12 @@ ; UNPACKED-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr5 ; UNPACKED-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr6 ; UNPACKED-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr2 - ; UNPACKED-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096 - ; UNPACKED-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; UNPACKED-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 [[COPY6]], [[COPY8]], 0, implicit $exec - ; UNPACKED-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 16 - ; UNPACKED-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] - ; UNPACKED-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[COPY9]], [[COPY4]], implicit $exec - ; UNPACKED-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] - ; UNPACKED-NEXT: [[V_LSHRREV_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[COPY10]], [[COPY5]], implicit $exec + ; UNPACKED-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec + ; 
UNPACKED-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 [[COPY6]], [[V_MOV_B32_e32_]], 0, implicit $exec + ; UNPACKED-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 16, implicit $exec + ; UNPACKED-NEXT: [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 16, implicit $exec + ; UNPACKED-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[V_MOV_B32_e32_2]], [[COPY4]], implicit $exec + ; UNPACKED-NEXT: [[V_LSHRREV_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[V_MOV_B32_e32_1]], [[COPY5]], implicit $exec ; UNPACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[V_LSHRREV_B32_e64_]], %subreg.sub1, [[COPY5]], %subreg.sub2, [[V_LSHRREV_B32_e64_1]], %subreg.sub3 ; UNPACKED-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; UNPACKED-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec @@ -482,12 +470,12 @@ ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec ; UNPACKED-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; UNPACKED-NEXT: [[COPY11:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]].sub0_sub1 - ; UNPACKED-NEXT: [[COPY12:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3 - ; UNPACKED-NEXT: [[COPY13:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE2]].sub0_sub1 - ; UNPACKED-NEXT: [[COPY14:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE2]].sub2_sub3 - ; UNPACKED-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY13]], [[COPY11]], implicit $exec - ; UNPACKED-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY14]], [[COPY12]], implicit $exec + ; UNPACKED-NEXT: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]].sub0_sub1 + ; UNPACKED-NEXT: [[COPY9:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3 + ; UNPACKED-NEXT: [[COPY10:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE2]].sub0_sub1 + ; UNPACKED-NEXT: [[COPY11:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE2]].sub2_sub3 + ; UNPACKED-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY10]], [[COPY8]], implicit $exec + ; UNPACKED-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY11]], [[COPY9]], implicit $exec ; UNPACKED-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; UNPACKED-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; UNPACKED-NEXT: {{ $}} @@ -519,9 +507,8 @@ ; PACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 ; PACKED-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr6 ; PACKED-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr2 - ; PACKED-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096 - ; PACKED-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; PACKED-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 [[COPY6]], [[COPY8]], 0, implicit $exec + ; PACKED-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit 
$exec + ; PACKED-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 [[COPY6]], [[V_MOV_B32_e32_]], 0, implicit $exec ; PACKED-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; PACKED-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; PACKED-NEXT: {{ $}} @@ -533,12 +520,12 @@ ; PACKED-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec ; PACKED-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec ; PACKED-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; PACKED-NEXT: [[COPY9:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]].sub0_sub1 - ; PACKED-NEXT: [[COPY10:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3 - ; PACKED-NEXT: [[COPY11:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE2]].sub0_sub1 - ; PACKED-NEXT: [[COPY12:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE2]].sub2_sub3 - ; PACKED-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY11]], [[COPY9]], implicit $exec - ; PACKED-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY12]], [[COPY10]], implicit $exec + ; PACKED-NEXT: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]].sub0_sub1 + ; PACKED-NEXT: [[COPY9:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3 + ; PACKED-NEXT: [[COPY10:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE2]].sub0_sub1 + ; PACKED-NEXT: [[COPY11:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE2]].sub2_sub3 + ; PACKED-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY10]], [[COPY8]], implicit $exec + ; PACKED-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY11]], [[COPY9]], implicit $exec ; PACKED-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; PACKED-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; PACKED-NEXT: {{ $}} Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.store.format.f32.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.store.format.f32.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.store.format.f32.ll @@ -260,9 +260,8 @@ ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096 - ; CHECK-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; CHECK-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 [[COPY6]], [[COPY8]], 0, implicit $exec + ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec + ; CHECK-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 [[COPY6]], [[V_MOV_B32_e32_]], 0, implicit $exec ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, 
[[COPY3]], %subreg.sub3 ; CHECK-NEXT: BUFFER_STORE_FORMAT_XY_OFFEN_exact [[REG_SEQUENCE]], [[V_ADD_CO_U32_e64_]], [[REG_SEQUENCE1]], [[COPY7]], 0, 0, 0, implicit $exec :: (dereferenceable store (<2 x s32>) into %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: S_ENDPGM 0 @@ -290,9 +289,8 @@ ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2, [[COPY7]], %subreg.sub3 ; CHECK-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr8 ; CHECK-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY $sgpr2 - ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096 - ; CHECK-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; CHECK-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 [[COPY8]], [[COPY10]], 0, implicit $exec + ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec + ; CHECK-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 [[COPY8]], [[V_MOV_B32_e32_]], 0, implicit $exec ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; CHECK-NEXT: {{ $}} @@ -304,12 +302,12 @@ ; CHECK-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec ; CHECK-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec ; CHECK-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; CHECK-NEXT: [[COPY11:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]].sub0_sub1 - ; CHECK-NEXT: [[COPY12:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3 - ; CHECK-NEXT: [[COPY13:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE2]].sub0_sub1 - ; CHECK-NEXT: [[COPY14:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE2]].sub2_sub3 - ; CHECK-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY13]], [[COPY11]], implicit $exec - ; CHECK-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY14]], [[COPY12]], implicit $exec + ; CHECK-NEXT: [[COPY10:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]].sub0_sub1 + ; CHECK-NEXT: [[COPY11:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3 + ; CHECK-NEXT: [[COPY12:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE2]].sub0_sub1 + ; CHECK-NEXT: [[COPY13:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE2]].sub2_sub3 + ; CHECK-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY12]], [[COPY10]], implicit $exec + ; CHECK-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY13]], [[COPY11]], implicit $exec ; CHECK-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; CHECK-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK-NEXT: {{ $}} Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.store.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.store.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.store.ll @@ -574,10 +574,9 @@ ; CHECK-NEXT: 
[[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096 + ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; CHECK-NEXT: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY6]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, implicit $exec :: (dereferenceable store (s32) into %ir.rsrc, align 1, addrspace 8) + ; CHECK-NEXT: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[V_MOV_B32_e32_]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, implicit $exec :: (dereferenceable store (s32) into %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: S_ENDPGM 0 call void @llvm.amdgcn.raw.ptr.buffer.store.f32(float %val, ptr addrspace(8) %rsrc, i32 4096, i32 %soffset, i32 0) ret void @@ -635,9 +634,8 @@ ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096 - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; CHECK-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 [[COPY5]], [[COPY7]], 0, implicit $exec + ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec + ; CHECK-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 [[COPY5]], [[V_MOV_B32_e32_]], 0, implicit $exec ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK-NEXT: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[V_ADD_CO_U32_e64_]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable store (s32) into %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: S_ENDPGM 0 @@ -736,9 +734,8 @@ ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096 - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; CHECK-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 [[COPY5]], [[COPY7]], 0, implicit $exec + ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec + ; CHECK-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 [[COPY5]], [[V_MOV_B32_e32_]], 0, implicit $exec ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK-NEXT: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[V_ADD_CO_U32_e64_]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable store (<2 x s16>) into %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: S_ENDPGM 0 @@ -762,9 +759,8 @@ ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr5 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr2 ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], 
%subreg.sub2, [[COPY3]], %subreg.sub3 - ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096 - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; CHECK-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 [[COPY5]], [[COPY7]], 0, implicit $exec + ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec + ; CHECK-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 [[COPY5]], [[V_MOV_B32_e32_]], 0, implicit $exec ; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: @@ -775,12 +771,12 @@ ; CHECK-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec ; CHECK-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; CHECK-NEXT: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 - ; CHECK-NEXT: [[COPY9:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 - ; CHECK-NEXT: [[COPY10:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub0_sub1 - ; CHECK-NEXT: [[COPY11:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3 - ; CHECK-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY10]], [[COPY8]], implicit $exec - ; CHECK-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY11]], [[COPY9]], implicit $exec + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 + ; CHECK-NEXT: [[COPY9:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub0_sub1 + ; CHECK-NEXT: [[COPY10:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3 + ; CHECK-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY9]], [[COPY7]], implicit $exec + ; CHECK-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY10]], [[COPY8]], implicit $exec ; CHECK-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; CHECK-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK-NEXT: {{ $}} @@ -817,8 +813,7 @@ ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr2 ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096 - ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] + ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec ; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: @@ -829,19 +824,19 @@ ; CHECK-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec ; CHECK-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], 
%subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 - ; CHECK-NEXT: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 - ; CHECK-NEXT: [[COPY9:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub0_sub1 - ; CHECK-NEXT: [[COPY10:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3 - ; CHECK-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY9]], [[COPY7]], implicit $exec - ; CHECK-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY10]], [[COPY8]], implicit $exec + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub0_sub1 + ; CHECK-NEXT: [[COPY9:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3 + ; CHECK-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY8]], [[COPY6]], implicit $exec + ; CHECK-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY9]], [[COPY7]], implicit $exec ; CHECK-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; CHECK-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.3: ; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY6]], [[REG_SEQUENCE1]], [[COPY5]], 904, 0, 0, implicit $exec :: (dereferenceable store (s32) into %ir.rsrc, align 1, addrspace 8) + ; CHECK-NEXT: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[V_MOV_B32_e32_]], [[REG_SEQUENCE1]], [[COPY5]], 904, 0, 0, implicit $exec :: (dereferenceable store (s32) into %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; CHECK-NEXT: {{ $}} Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.tbuffer.load.f16.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.tbuffer.load.f16.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.tbuffer.load.f16.ll @@ -50,14 +50,12 @@ ; UNPACKED-NEXT: [[TBUFFER_LOAD_FORMAT_D16_XY_gfx80_OFFEN:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_D16_XY_gfx80_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 78, 0, 0, implicit $exec :: (dereferenceable load (<2 x s16>) from %ir.rsrc, align 1, addrspace 8) ; UNPACKED-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_D16_XY_gfx80_OFFEN]].sub0 ; UNPACKED-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_D16_XY_gfx80_OFFEN]].sub1 - ; UNPACKED-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 65535 - ; UNPACKED-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; UNPACKED-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY6]], [[COPY8]], implicit $exec - ; UNPACKED-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; UNPACKED-NEXT: [[V_AND_B32_e64_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY7]], [[COPY9]], implicit $exec - ; UNPACKED-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 16 - ; UNPACKED-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] - ; UNPACKED-NEXT: [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[COPY10]], [[V_AND_B32_e64_1]], 
implicit $exec + ; UNPACKED-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 65535, implicit $exec + ; UNPACKED-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 65535, implicit $exec + ; UNPACKED-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY6]], [[V_MOV_B32_e32_1]], implicit $exec + ; UNPACKED-NEXT: [[V_AND_B32_e64_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY7]], [[V_MOV_B32_e32_]], implicit $exec + ; UNPACKED-NEXT: [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 16, implicit $exec + ; UNPACKED-NEXT: [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[V_MOV_B32_e32_2]], [[V_AND_B32_e64_1]], implicit $exec ; UNPACKED-NEXT: [[V_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_AND_B32_e64_]], [[V_LSHLREV_B32_e64_]], implicit $exec ; UNPACKED-NEXT: $vgpr0 = COPY [[V_OR_B32_e64_]] ; UNPACKED-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 @@ -102,21 +100,19 @@ ; UNPACKED-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_D16_XYZW_gfx80_OFFEN]].sub1 ; UNPACKED-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_D16_XYZW_gfx80_OFFEN]].sub2 ; UNPACKED-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_D16_XYZW_gfx80_OFFEN]].sub3 - ; UNPACKED-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 65535 - ; UNPACKED-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; UNPACKED-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY6]], [[COPY10]], implicit $exec - ; UNPACKED-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; UNPACKED-NEXT: [[V_AND_B32_e64_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY7]], [[COPY11]], implicit $exec - ; UNPACKED-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 16 - ; UNPACKED-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] - ; UNPACKED-NEXT: [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[COPY12]], [[V_AND_B32_e64_1]], implicit $exec + ; UNPACKED-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 65535, implicit $exec + ; UNPACKED-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 65535, implicit $exec + ; UNPACKED-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY6]], [[V_MOV_B32_e32_1]], implicit $exec + ; UNPACKED-NEXT: [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 65535, implicit $exec + ; UNPACKED-NEXT: [[V_AND_B32_e64_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY7]], [[V_MOV_B32_e32_2]], implicit $exec + ; UNPACKED-NEXT: [[V_MOV_B32_e32_3:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 16, implicit $exec + ; UNPACKED-NEXT: [[V_MOV_B32_e32_4:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 16, implicit $exec + ; UNPACKED-NEXT: [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[V_MOV_B32_e32_4]], [[V_AND_B32_e64_1]], implicit $exec ; UNPACKED-NEXT: [[V_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_AND_B32_e64_]], [[V_LSHLREV_B32_e64_]], implicit $exec - ; UNPACKED-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; UNPACKED-NEXT: [[V_AND_B32_e64_2:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY8]], [[COPY13]], implicit $exec - ; UNPACKED-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; UNPACKED-NEXT: [[V_AND_B32_e64_3:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY9]], [[COPY14]], implicit $exec - ; UNPACKED-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] - ; UNPACKED-NEXT: [[V_LSHLREV_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[COPY15]], [[V_AND_B32_e64_3]], implicit $exec + ; UNPACKED-NEXT: [[V_MOV_B32_e32_5:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 65535, implicit $exec + ; UNPACKED-NEXT: [[V_AND_B32_e64_2:%[0-9]+]]:vgpr_32 = 
V_AND_B32_e64 [[COPY8]], [[V_MOV_B32_e32_5]], implicit $exec + ; UNPACKED-NEXT: [[V_AND_B32_e64_3:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY9]], [[V_MOV_B32_e32_]], implicit $exec + ; UNPACKED-NEXT: [[V_LSHLREV_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[V_MOV_B32_e32_3]], [[V_AND_B32_e64_3]], implicit $exec ; UNPACKED-NEXT: [[V_OR_B32_e64_1:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_AND_B32_e64_2]], [[V_LSHLREV_B32_e64_1]], implicit $exec ; UNPACKED-NEXT: $vgpr0 = COPY [[V_OR_B32_e64_]] ; UNPACKED-NEXT: $vgpr1 = COPY [[V_OR_B32_e64_1]] Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.tbuffer.store.f16.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.tbuffer.store.f16.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.tbuffer.store.f16.ll @@ -47,9 +47,8 @@ ; UNPACKED-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5 ; UNPACKED-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; UNPACKED-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; UNPACKED-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 16 - ; UNPACKED-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; UNPACKED-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[COPY7]], [[COPY]], implicit $exec + ; UNPACKED-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 16, implicit $exec + ; UNPACKED-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[V_MOV_B32_e32_]], [[COPY]], implicit $exec ; UNPACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[V_LSHRREV_B32_e64_]], %subreg.sub1 ; UNPACKED-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; UNPACKED-NEXT: TBUFFER_STORE_FORMAT_D16_XY_gfx80_OFFEN_exact [[REG_SEQUENCE]], [[COPY5]], [[REG_SEQUENCE1]], [[COPY6]], 0, 78, 0, 0, implicit $exec :: (dereferenceable store (<2 x s16>) into %ir.rsrc, align 1, addrspace 8) @@ -91,11 +90,10 @@ ; UNPACKED-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr5 ; UNPACKED-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; UNPACKED-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; UNPACKED-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 16 - ; UNPACKED-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; UNPACKED-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[COPY8]], [[COPY]], implicit $exec - ; UNPACKED-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; UNPACKED-NEXT: [[V_LSHRREV_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[COPY9]], [[COPY1]], implicit $exec + ; UNPACKED-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 16, implicit $exec + ; UNPACKED-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 16, implicit $exec + ; UNPACKED-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[V_MOV_B32_e32_1]], [[COPY]], implicit $exec + ; UNPACKED-NEXT: [[V_LSHRREV_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[V_MOV_B32_e32_]], [[COPY1]], implicit $exec ; UNPACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[V_LSHRREV_B32_e64_]], %subreg.sub1, [[COPY1]], %subreg.sub2, [[V_LSHRREV_B32_e64_1]], %subreg.sub3 ; UNPACKED-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3 ; UNPACKED-NEXT: TBUFFER_STORE_FORMAT_D16_XYZW_gfx80_OFFEN_exact [[REG_SEQUENCE]], [[COPY6]], [[REG_SEQUENCE1]], [[COPY7]], 0, 78, 
0, 0, implicit $exec :: (dereferenceable store (<4 x s16>) into %ir.rsrc, align 1, addrspace 8) Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.tbuffer.store.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.tbuffer.store.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.tbuffer.store.ll @@ -404,10 +404,9 @@ ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096 + ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 - ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; CHECK-NEXT: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY6]], [[REG_SEQUENCE]], [[COPY5]], 0, 94, 0, 0, implicit $exec :: (dereferenceable store (s32) into %ir.rsrc, align 1, addrspace 8) + ; CHECK-NEXT: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[V_MOV_B32_e32_]], [[REG_SEQUENCE]], [[COPY5]], 0, 94, 0, 0, implicit $exec :: (dereferenceable store (s32) into %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: S_ENDPGM 0 call void @llvm.amdgcn.raw.ptr.tbuffer.store.f32(float %val, ptr addrspace(8) %rsrc, i32 4096, i32 %soffset, i32 94, i32 0) ret void @@ -465,9 +464,8 @@ ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096 - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; CHECK-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY5]], [[COPY7]], 0, implicit $exec + ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec + ; CHECK-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY5]], [[V_MOV_B32_e32_]], 0, implicit $exec ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; CHECK-NEXT: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[V_ADD_U32_e64_]], [[REG_SEQUENCE]], [[COPY6]], 0, 94, 0, 0, implicit $exec :: (dereferenceable store (s32) into %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: S_ENDPGM 0 @@ -650,10 +648,9 @@ ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr5 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr2 ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 - ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096 - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; CHECK-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY5]], [[COPY7]], 0, implicit $exec - ; CHECK-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo + ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec + ; CHECK-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY5]], [[V_MOV_B32_e32_]], 0, implicit $exec + ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: ; CHECK-NEXT: successors: %bb.3(0x80000000) @@ -663,12 +660,12 @@ ; CHECK-NEXT: 
[[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec ; CHECK-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; CHECK-NEXT: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 - ; CHECK-NEXT: [[COPY9:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 - ; CHECK-NEXT: [[COPY10:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub0_sub1 - ; CHECK-NEXT: [[COPY11:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3 - ; CHECK-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY10]], [[COPY8]], implicit $exec - ; CHECK-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY11]], [[COPY9]], implicit $exec + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 + ; CHECK-NEXT: [[COPY9:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub0_sub1 + ; CHECK-NEXT: [[COPY10:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3 + ; CHECK-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY9]], [[COPY7]], implicit $exec + ; CHECK-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY10]], [[COPY8]], implicit $exec ; CHECK-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; CHECK-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_]], implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK-NEXT: {{ $}} @@ -682,7 +679,7 @@ ; CHECK-NEXT: bb.4: ; CHECK-NEXT: successors: %bb.5(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_1]] + ; CHECK-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]] ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.5: ; CHECK-NEXT: S_ENDPGM 0 Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.load.f16.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.load.f16.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.load.f16.ll @@ -50,14 +50,12 @@ ; UNPACKED-NEXT: [[TBUFFER_LOAD_FORMAT_D16_XY_gfx80_OFFEN:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_D16_XY_gfx80_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 78, 0, 0, implicit $exec :: (dereferenceable load (<2 x s16>), align 1, addrspace 8) ; UNPACKED-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_D16_XY_gfx80_OFFEN]].sub0 ; UNPACKED-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_D16_XY_gfx80_OFFEN]].sub1 - ; UNPACKED-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 65535 - ; UNPACKED-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; UNPACKED-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY6]], [[COPY8]], implicit $exec - ; UNPACKED-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; UNPACKED-NEXT: [[V_AND_B32_e64_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY7]], [[COPY9]], implicit $exec - ; UNPACKED-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 16 - ; UNPACKED-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] - ; UNPACKED-NEXT: [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[COPY10]], [[V_AND_B32_e64_1]], implicit 
$exec + ; UNPACKED-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 65535, implicit $exec + ; UNPACKED-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 65535, implicit $exec + ; UNPACKED-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY6]], [[V_MOV_B32_e32_1]], implicit $exec + ; UNPACKED-NEXT: [[V_AND_B32_e64_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY7]], [[V_MOV_B32_e32_]], implicit $exec + ; UNPACKED-NEXT: [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 16, implicit $exec + ; UNPACKED-NEXT: [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[V_MOV_B32_e32_2]], [[V_AND_B32_e64_1]], implicit $exec ; UNPACKED-NEXT: [[V_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_AND_B32_e64_]], [[V_LSHLREV_B32_e64_]], implicit $exec ; UNPACKED-NEXT: $vgpr0 = COPY [[V_OR_B32_e64_]] ; UNPACKED-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 @@ -102,21 +100,19 @@ ; UNPACKED-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_D16_XYZW_gfx80_OFFEN]].sub1 ; UNPACKED-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_D16_XYZW_gfx80_OFFEN]].sub2 ; UNPACKED-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_D16_XYZW_gfx80_OFFEN]].sub3 - ; UNPACKED-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 65535 - ; UNPACKED-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; UNPACKED-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY6]], [[COPY10]], implicit $exec - ; UNPACKED-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; UNPACKED-NEXT: [[V_AND_B32_e64_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY7]], [[COPY11]], implicit $exec - ; UNPACKED-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 16 - ; UNPACKED-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] - ; UNPACKED-NEXT: [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[COPY12]], [[V_AND_B32_e64_1]], implicit $exec + ; UNPACKED-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 65535, implicit $exec + ; UNPACKED-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 65535, implicit $exec + ; UNPACKED-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY6]], [[V_MOV_B32_e32_1]], implicit $exec + ; UNPACKED-NEXT: [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 65535, implicit $exec + ; UNPACKED-NEXT: [[V_AND_B32_e64_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY7]], [[V_MOV_B32_e32_2]], implicit $exec + ; UNPACKED-NEXT: [[V_MOV_B32_e32_3:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 16, implicit $exec + ; UNPACKED-NEXT: [[V_MOV_B32_e32_4:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 16, implicit $exec + ; UNPACKED-NEXT: [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[V_MOV_B32_e32_4]], [[V_AND_B32_e64_1]], implicit $exec ; UNPACKED-NEXT: [[V_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_AND_B32_e64_]], [[V_LSHLREV_B32_e64_]], implicit $exec - ; UNPACKED-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; UNPACKED-NEXT: [[V_AND_B32_e64_2:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY8]], [[COPY13]], implicit $exec - ; UNPACKED-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; UNPACKED-NEXT: [[V_AND_B32_e64_3:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY9]], [[COPY14]], implicit $exec - ; UNPACKED-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] - ; UNPACKED-NEXT: [[V_LSHLREV_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[COPY15]], [[V_AND_B32_e64_3]], implicit $exec + ; UNPACKED-NEXT: [[V_MOV_B32_e32_5:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 65535, implicit $exec + ; UNPACKED-NEXT: [[V_AND_B32_e64_2:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 
[[COPY8]], [[V_MOV_B32_e32_5]], implicit $exec + ; UNPACKED-NEXT: [[V_AND_B32_e64_3:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY9]], [[V_MOV_B32_e32_]], implicit $exec + ; UNPACKED-NEXT: [[V_LSHLREV_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[V_MOV_B32_e32_3]], [[V_AND_B32_e64_3]], implicit $exec ; UNPACKED-NEXT: [[V_OR_B32_e64_1:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_AND_B32_e64_2]], [[V_LSHLREV_B32_e64_1]], implicit $exec ; UNPACKED-NEXT: $vgpr0 = COPY [[V_OR_B32_e64_]] ; UNPACKED-NEXT: $vgpr1 = COPY [[V_OR_B32_e64_1]] Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.store.f16.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.store.f16.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.store.f16.ll @@ -48,9 +48,8 @@ ; UNPACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; UNPACKED-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; UNPACKED-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; UNPACKED-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 16 - ; UNPACKED-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; UNPACKED-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[COPY7]], [[COPY]], implicit $exec + ; UNPACKED-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 16, implicit $exec + ; UNPACKED-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[V_MOV_B32_e32_]], [[COPY]], implicit $exec ; UNPACKED-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[V_LSHRREV_B32_e64_]], %subreg.sub1 ; UNPACKED-NEXT: TBUFFER_STORE_FORMAT_D16_XY_gfx80_OFFEN_exact [[REG_SEQUENCE1]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 0, 0, implicit $exec :: (dereferenceable store (<2 x s16>), align 1, addrspace 8) ; UNPACKED-NEXT: S_ENDPGM 0 @@ -92,11 +91,10 @@ ; UNPACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3 ; UNPACKED-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; UNPACKED-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; UNPACKED-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 16 - ; UNPACKED-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; UNPACKED-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[COPY8]], [[COPY]], implicit $exec - ; UNPACKED-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; UNPACKED-NEXT: [[V_LSHRREV_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[COPY9]], [[COPY1]], implicit $exec + ; UNPACKED-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 16, implicit $exec + ; UNPACKED-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 16, implicit $exec + ; UNPACKED-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[V_MOV_B32_e32_1]], [[COPY]], implicit $exec + ; UNPACKED-NEXT: [[V_LSHRREV_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[V_MOV_B32_e32_]], [[COPY1]], implicit $exec ; UNPACKED-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[V_LSHRREV_B32_e64_]], %subreg.sub1, [[COPY1]], %subreg.sub2, [[V_LSHRREV_B32_e64_1]], %subreg.sub3 ; UNPACKED-NEXT: TBUFFER_STORE_FORMAT_D16_XYZW_gfx80_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[COPY7]], 0, 78, 0, 0, implicit $exec :: (dereferenceable store (<4 x s16>), align 1, addrspace 8) ; UNPACKED-NEXT: S_ENDPGM 0 Index: 
llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.store.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.store.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.store.ll @@ -405,9 +405,8 @@ ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5 ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096 - ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; CHECK-NEXT: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY6]], [[REG_SEQUENCE]], [[COPY5]], 0, 94, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8) + ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec + ; CHECK-NEXT: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[V_MOV_B32_e32_]], [[REG_SEQUENCE]], [[COPY5]], 0, 94, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8) ; CHECK-NEXT: S_ENDPGM 0 call void @llvm.amdgcn.raw.tbuffer.store.f32(float %val, <4 x i32> %rsrc, i32 4096, i32 %soffset, i32 94, i32 0) ret void @@ -466,9 +465,8 @@ ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096 - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; CHECK-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY5]], [[COPY7]], 0, implicit $exec + ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec + ; CHECK-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY5]], [[V_MOV_B32_e32_]], 0, implicit $exec ; CHECK-NEXT: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[V_ADD_U32_e64_]], [[REG_SEQUENCE]], [[COPY6]], 0, 94, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8) ; CHECK-NEXT: S_ENDPGM 0 %voffset = add i32 %voffset.base, 4096 @@ -650,10 +648,9 @@ ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr5 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr2 - ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096 - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; CHECK-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY5]], [[COPY7]], 0, implicit $exec - ; CHECK-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo + ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec + ; CHECK-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY5]], [[V_MOV_B32_e32_]], 0, implicit $exec + ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: ; CHECK-NEXT: successors: %bb.3(0x80000000) @@ -663,12 +660,12 @@ ; CHECK-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec ; CHECK-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = 
REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; CHECK-NEXT: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 - ; CHECK-NEXT: [[COPY9:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 - ; CHECK-NEXT: [[COPY10:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub0_sub1 - ; CHECK-NEXT: [[COPY11:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3 - ; CHECK-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY10]], [[COPY8]], implicit $exec - ; CHECK-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY11]], [[COPY9]], implicit $exec + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 + ; CHECK-NEXT: [[COPY9:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub0_sub1 + ; CHECK-NEXT: [[COPY10:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3 + ; CHECK-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY9]], [[COPY7]], implicit $exec + ; CHECK-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY10]], [[COPY8]], implicit $exec ; CHECK-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; CHECK-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_]], implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK-NEXT: {{ $}} @@ -682,7 +679,7 @@ ; CHECK-NEXT: bb.4: ; CHECK-NEXT: successors: %bb.5(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_1]] + ; CHECK-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]] ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.5: ; CHECK-NEXT: S_ENDPGM 0 Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.s.buffer.load.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.s.buffer.load.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.s.buffer.load.ll @@ -2139,8 +2139,10 @@ ; GFX6-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX6-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096 - ; GFX6-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32)) + ; GFX6-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec + ; GFX6-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 [[COPY4]], [[V_MOV_B32_e32_]], 0, implicit $exec + ; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GFX6-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[V_ADD_CO_U32_e64_]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32)) ; GFX6-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; GFX6-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 ; GFX7-LABEL: name: s_buffer_load_f32_vgpr_offset_add_4096 @@ -2153,8 +2155,10 @@ ; GFX7-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, 
[[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX7-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096 - ; GFX7-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32)) + ; GFX7-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec + ; GFX7-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 [[COPY4]], [[V_MOV_B32_e32_]], 0, implicit $exec + ; GFX7-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GFX7-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[V_ADD_CO_U32_e64_]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32)) ; GFX7-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; GFX7-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 ; GFX8-LABEL: name: s_buffer_load_f32_vgpr_offset_add_4096 @@ -2288,9 +2292,11 @@ ; GFX6-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX6-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4068 - ; GFX6-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) - ; GFX6-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 16, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) + ; GFX6-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4068, implicit $exec + ; GFX6-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 [[COPY4]], [[V_MOV_B32_e32_]], 0, implicit $exec + ; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GFX6-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[V_ADD_CO_U32_e64_]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) + ; GFX6-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[V_ADD_CO_U32_e64_]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 16, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_256 = REG_SEQUENCE [[BUFFER_LOAD_DWORDX4_OFFEN]], %subreg.sub0_sub1_sub2_sub3, [[BUFFER_LOAD_DWORDX4_OFFEN1]], %subreg.sub4_sub5_sub6_sub7 ; GFX6-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 ; GFX6-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 @@ -2319,9 +2325,11 @@ ; GFX7-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX7-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4068 - ; GFX7-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) - ; GFX7-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 
= BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 16, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) + ; GFX7-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4068, implicit $exec + ; GFX7-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 [[COPY4]], [[V_MOV_B32_e32_]], 0, implicit $exec + ; GFX7-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GFX7-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[V_ADD_CO_U32_e64_]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) + ; GFX7-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[V_ADD_CO_U32_e64_]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 16, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_256 = REG_SEQUENCE [[BUFFER_LOAD_DWORDX4_OFFEN]], %subreg.sub0_sub1_sub2_sub3, [[BUFFER_LOAD_DWORDX4_OFFEN1]], %subreg.sub4_sub5_sub6_sub7 ; GFX7-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 ; GFX7-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 @@ -2540,11 +2548,13 @@ ; GFX6-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX6-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4036 - ; GFX6-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) - ; GFX6-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 16, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) - ; GFX6-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 32, 0, 0, implicit $exec :: (dereferenceable invariant load (s128) from unknown-address + 16, align 4) - ; GFX6-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN3:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 48, 0, 0, implicit $exec :: (dereferenceable invariant load (s128) from unknown-address + 48, align 4) + ; GFX6-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4036, implicit $exec + ; GFX6-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 [[COPY4]], [[V_MOV_B32_e32_]], 0, implicit $exec + ; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GFX6-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[V_ADD_CO_U32_e64_]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) + ; GFX6-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[V_ADD_CO_U32_e64_]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 16, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) + ; GFX6-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[V_ADD_CO_U32_e64_]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 32, 0, 0, implicit $exec :: (dereferenceable invariant load (s128) from unknown-address + 16, align 4) + ; GFX6-NEXT: 
[[BUFFER_LOAD_DWORDX4_OFFEN3:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[V_ADD_CO_U32_e64_]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 48, 0, 0, implicit $exec :: (dereferenceable invariant load (s128) from unknown-address + 48, align 4) ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_512 = REG_SEQUENCE [[BUFFER_LOAD_DWORDX4_OFFEN]], %subreg.sub0_sub1_sub2_sub3, [[BUFFER_LOAD_DWORDX4_OFFEN1]], %subreg.sub4_sub5_sub6_sub7, [[BUFFER_LOAD_DWORDX4_OFFEN2]], %subreg.sub8_sub9_sub10_sub11, [[BUFFER_LOAD_DWORDX4_OFFEN3]], %subreg.sub12_sub13_sub14_sub15 ; GFX6-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 ; GFX6-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 @@ -2589,11 +2599,13 @@ ; GFX7-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX7-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4036 - ; GFX7-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) - ; GFX7-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 16, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) - ; GFX7-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 32, 0, 0, implicit $exec :: (dereferenceable invariant load (s128) from unknown-address + 16, align 4) - ; GFX7-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN3:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 48, 0, 0, implicit $exec :: (dereferenceable invariant load (s128) from unknown-address + 48, align 4) + ; GFX7-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4036, implicit $exec + ; GFX7-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 [[COPY4]], [[V_MOV_B32_e32_]], 0, implicit $exec + ; GFX7-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GFX7-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[V_ADD_CO_U32_e64_]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) + ; GFX7-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[V_ADD_CO_U32_e64_]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 16, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) + ; GFX7-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[V_ADD_CO_U32_e64_]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 32, 0, 0, implicit $exec :: (dereferenceable invariant load (s128) from unknown-address + 16, align 4) + ; GFX7-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN3:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[V_ADD_CO_U32_e64_]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 48, 0, 0, implicit $exec :: (dereferenceable invariant load (s128) from unknown-address + 48, align 4) ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_512 = REG_SEQUENCE [[BUFFER_LOAD_DWORDX4_OFFEN]], %subreg.sub0_sub1_sub2_sub3, [[BUFFER_LOAD_DWORDX4_OFFEN1]], %subreg.sub4_sub5_sub6_sub7, [[BUFFER_LOAD_DWORDX4_OFFEN2]], %subreg.sub8_sub9_sub10_sub11, [[BUFFER_LOAD_DWORDX4_OFFEN3]], %subreg.sub12_sub13_sub14_sub15 ; GFX7-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = 
COPY [[REG_SEQUENCE1]].sub0 ; GFX7-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 @@ -4027,7 +4039,9 @@ ; GFX6-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX6-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4 - ; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 5000 + ; GFX6-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 5000, implicit $exec + ; GFX6-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 [[COPY4]], [[V_MOV_B32_e32_]], 0, implicit $exec + ; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 ; GFX6-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX6-NEXT: {{ $}} ; GFX6-NEXT: bb.2: @@ -4050,8 +4064,8 @@ ; GFX6-NEXT: bb.3: ; GFX6-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GFX6-NEXT: {{ $}} - ; GFX6-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) - ; GFX6-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 16, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) + ; GFX6-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[V_ADD_CO_U32_e64_]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) + ; GFX6-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[V_ADD_CO_U32_e64_]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 16, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) ; GFX6-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; GFX6-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; GFX6-NEXT: {{ $}} @@ -4090,7 +4104,9 @@ ; GFX7-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4 - ; GFX7-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 5000 + ; GFX7-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 5000, implicit $exec + ; GFX7-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 [[COPY4]], [[V_MOV_B32_e32_]], 0, implicit $exec + ; GFX7-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: bb.2: @@ -4113,8 +4129,8 @@ ; GFX7-NEXT: bb.3: ; GFX7-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GFX7-NEXT: {{ $}} - ; GFX7-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) - ; GFX7-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 16, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) + ; GFX7-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[V_ADD_CO_U32_e64_]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable invariant load 
(s128), align 4) + ; GFX7-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[V_ADD_CO_U32_e64_]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 16, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) ; GFX7-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; GFX7-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; GFX7-NEXT: {{ $}} @@ -4222,7 +4238,9 @@ ; GFX6-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX6-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4 - ; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4076 + ; GFX6-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4076, implicit $exec + ; GFX6-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 [[COPY4]], [[V_MOV_B32_e32_]], 0, implicit $exec + ; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 ; GFX6-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX6-NEXT: {{ $}} ; GFX6-NEXT: bb.2: @@ -4245,8 +4263,8 @@ ; GFX6-NEXT: bb.3: ; GFX6-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GFX6-NEXT: {{ $}} - ; GFX6-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) - ; GFX6-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 16, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) + ; GFX6-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[V_ADD_CO_U32_e64_]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) + ; GFX6-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[V_ADD_CO_U32_e64_]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 16, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) ; GFX6-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; GFX6-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; GFX6-NEXT: {{ $}} @@ -4285,7 +4303,9 @@ ; GFX7-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4 - ; GFX7-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4076 + ; GFX7-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4076, implicit $exec + ; GFX7-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 [[COPY4]], [[V_MOV_B32_e32_]], 0, implicit $exec + ; GFX7-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: bb.2: @@ -4308,8 +4328,8 @@ ; GFX7-NEXT: bb.3: ; GFX7-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GFX7-NEXT: {{ $}} - ; GFX7-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) - ; GFX7-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], 
[[REG_SEQUENCE1]], [[S_MOV_B32_]], 16, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) + ; GFX7-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[V_ADD_CO_U32_e64_]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) + ; GFX7-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[V_ADD_CO_U32_e64_]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 16, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) ; GFX7-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; GFX7-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; GFX7-NEXT: {{ $}} @@ -4417,7 +4437,9 @@ ; GFX6-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX6-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4 - ; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4080 + ; GFX6-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4080, implicit $exec + ; GFX6-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 [[COPY4]], [[V_MOV_B32_e32_]], 0, implicit $exec + ; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 ; GFX6-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX6-NEXT: {{ $}} ; GFX6-NEXT: bb.2: @@ -4440,8 +4462,8 @@ ; GFX6-NEXT: bb.3: ; GFX6-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GFX6-NEXT: {{ $}} - ; GFX6-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) - ; GFX6-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 16, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) + ; GFX6-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[V_ADD_CO_U32_e64_]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) + ; GFX6-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[V_ADD_CO_U32_e64_]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 16, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) ; GFX6-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; GFX6-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; GFX6-NEXT: {{ $}} @@ -4480,7 +4502,9 @@ ; GFX7-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4 - ; GFX7-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4080 + ; GFX7-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4080, implicit $exec + ; GFX7-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 [[COPY4]], [[V_MOV_B32_e32_]], 0, implicit $exec + ; GFX7-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: bb.2: @@ -4503,8 +4527,8 @@ ; GFX7-NEXT: bb.3: ; GFX7-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GFX7-NEXT: {{ $}} - ; 
GFX7-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) - ; GFX7-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 16, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) + ; GFX7-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[V_ADD_CO_U32_e64_]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) + ; GFX7-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[V_ADD_CO_U32_e64_]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 16, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) ; GFX7-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; GFX7-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; GFX7-NEXT: {{ $}} Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot2.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot2.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot2.ll @@ -159,8 +159,8 @@ ; GFX908-LABEL: v_sdot2_inline_literal_a_b_c: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v1, 0x40004 ; GFX908-NEXT: v_mov_b32_e32 v0, 8 +; GFX908-NEXT: v_mov_b32_e32 v1, 0x40004 ; GFX908-NEXT: v_dot2c_i32_i16_e32 v0, 0x80008, v1 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot4.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot4.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot4.ll @@ -43,19 +43,19 @@ ; GFX906-LABEL: v_sdot4_cast_v4i8: ; GFX906: ; %bb.0: ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX906-NEXT: s_mov_b32 s5, 8 -; GFX906-NEXT: s_movk_i32 s4, 0xff -; GFX906-NEXT: v_lshlrev_b32_sdwa v1, s5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX906-NEXT: v_and_or_b32 v0, v0, s4, v1 +; GFX906-NEXT: v_mov_b32_e32 v10, 8 +; GFX906-NEXT: v_mov_b32_e32 v9, 0xff +; GFX906-NEXT: v_lshlrev_b32_sdwa v1, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX906-NEXT: v_and_or_b32 v0, v0, v9, v1 ; GFX906-NEXT: v_and_b32_e32 v1, 0xff, v2 ; GFX906-NEXT: v_and_b32_e32 v2, 0xff, v3 ; GFX906-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX906-NEXT: v_lshlrev_b32_e32 v2, 24, v2 ; GFX906-NEXT: v_or3_b32 v0, v0, v1, v2 -; GFX906-NEXT: v_lshlrev_b32_sdwa v1, s5, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX906-NEXT: v_lshlrev_b32_sdwa v1, v10, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX906-NEXT: v_and_b32_e32 v2, 0xff, v6 ; GFX906-NEXT: v_and_b32_e32 v3, 0xff, v7 -; GFX906-NEXT: v_and_or_b32 v1, v4, s4, v1 +; GFX906-NEXT: v_and_or_b32 v1, v4, v9, v1 ; GFX906-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX906-NEXT: v_lshlrev_b32_e32 v3, 24, v3 ; GFX906-NEXT: v_or3_b32 v1, v1, v2, v3 @@ -66,17 +66,17 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_mov_b32 s4, 8 -; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX10-NEXT: v_and_or_b32 v0, v0, 0xff, v1 +; GFX10-NEXT: 
v_mov_b32_e32 v9, 8 +; GFX10-NEXT: v_lshlrev_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: v_and_or_b32 v0, 0xff, v0, v1 ; GFX10-NEXT: v_and_b32_e32 v1, 0xff, v2 ; GFX10-NEXT: v_and_b32_e32 v2, 0xff, v3 -; GFX10-NEXT: v_lshlrev_b32_sdwa v3, s4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v9, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX10-NEXT: v_and_b32_e32 v5, 0xff, v6 ; GFX10-NEXT: v_and_b32_e32 v6, 0xff, v7 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 24, v2 -; GFX10-NEXT: v_and_or_b32 v3, v4, 0xff, v3 +; GFX10-NEXT: v_and_or_b32 v3, 0xff, v4, v3 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v5 ; GFX10-NEXT: v_lshlrev_b32_e32 v5, 24, v6 ; GFX10-NEXT: v_or3_b32 v0, v0, v1, v2 Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.format.f16.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.format.f16.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.format.f16.ll @@ -56,14 +56,12 @@ ; UNPACKED-NEXT: [[BUFFER_LOAD_FORMAT_D16_XY_gfx80_BOTHEN:%[0-9]+]]:vreg_64 = BUFFER_LOAD_FORMAT_D16_XY_gfx80_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable load (<2 x s16>), align 1, addrspace 8) ; UNPACKED-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_D16_XY_gfx80_BOTHEN]].sub0 ; UNPACKED-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_D16_XY_gfx80_BOTHEN]].sub1 - ; UNPACKED-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 65535 - ; UNPACKED-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; UNPACKED-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY7]], [[COPY9]], implicit $exec - ; UNPACKED-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; UNPACKED-NEXT: [[V_AND_B32_e64_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY8]], [[COPY10]], implicit $exec - ; UNPACKED-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 16 - ; UNPACKED-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] - ; UNPACKED-NEXT: [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[COPY11]], [[V_AND_B32_e64_1]], implicit $exec + ; UNPACKED-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 65535, implicit $exec + ; UNPACKED-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 65535, implicit $exec + ; UNPACKED-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY7]], [[V_MOV_B32_e32_1]], implicit $exec + ; UNPACKED-NEXT: [[V_AND_B32_e64_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY8]], [[V_MOV_B32_e32_]], implicit $exec + ; UNPACKED-NEXT: [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 16, implicit $exec + ; UNPACKED-NEXT: [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[V_MOV_B32_e32_2]], [[V_AND_B32_e64_1]], implicit $exec ; UNPACKED-NEXT: [[V_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_AND_B32_e64_]], [[V_LSHLREV_B32_e64_]], implicit $exec ; UNPACKED-NEXT: $vgpr0 = COPY [[V_OR_B32_e64_]] ; UNPACKED-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 @@ -112,21 +110,19 @@ ; UNPACKED-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_D16_XYZW_gfx80_BOTHEN]].sub1 ; UNPACKED-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_D16_XYZW_gfx80_BOTHEN]].sub2 ; UNPACKED-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_D16_XYZW_gfx80_BOTHEN]].sub3 - ; UNPACKED-NEXT: 
[[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 65535 - ; UNPACKED-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; UNPACKED-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY7]], [[COPY11]], implicit $exec - ; UNPACKED-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; UNPACKED-NEXT: [[V_AND_B32_e64_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY8]], [[COPY12]], implicit $exec - ; UNPACKED-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 16 - ; UNPACKED-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] - ; UNPACKED-NEXT: [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[COPY13]], [[V_AND_B32_e64_1]], implicit $exec + ; UNPACKED-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 65535, implicit $exec + ; UNPACKED-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 65535, implicit $exec + ; UNPACKED-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY7]], [[V_MOV_B32_e32_1]], implicit $exec + ; UNPACKED-NEXT: [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 65535, implicit $exec + ; UNPACKED-NEXT: [[V_AND_B32_e64_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY8]], [[V_MOV_B32_e32_2]], implicit $exec + ; UNPACKED-NEXT: [[V_MOV_B32_e32_3:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 16, implicit $exec + ; UNPACKED-NEXT: [[V_MOV_B32_e32_4:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 16, implicit $exec + ; UNPACKED-NEXT: [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[V_MOV_B32_e32_4]], [[V_AND_B32_e64_1]], implicit $exec ; UNPACKED-NEXT: [[V_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_AND_B32_e64_]], [[V_LSHLREV_B32_e64_]], implicit $exec - ; UNPACKED-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; UNPACKED-NEXT: [[V_AND_B32_e64_2:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY9]], [[COPY14]], implicit $exec - ; UNPACKED-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; UNPACKED-NEXT: [[V_AND_B32_e64_3:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY10]], [[COPY15]], implicit $exec - ; UNPACKED-NEXT: [[COPY16:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] - ; UNPACKED-NEXT: [[V_LSHLREV_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[COPY16]], [[V_AND_B32_e64_3]], implicit $exec + ; UNPACKED-NEXT: [[V_MOV_B32_e32_5:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 65535, implicit $exec + ; UNPACKED-NEXT: [[V_AND_B32_e64_2:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY9]], [[V_MOV_B32_e32_5]], implicit $exec + ; UNPACKED-NEXT: [[V_AND_B32_e64_3:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY10]], [[V_MOV_B32_e32_]], implicit $exec + ; UNPACKED-NEXT: [[V_LSHLREV_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[V_MOV_B32_e32_3]], [[V_AND_B32_e64_3]], implicit $exec ; UNPACKED-NEXT: [[V_OR_B32_e64_1:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_AND_B32_e64_2]], [[V_LSHLREV_B32_e64_1]], implicit $exec ; UNPACKED-NEXT: $vgpr0 = COPY [[V_OR_B32_e64_]] ; UNPACKED-NEXT: $vgpr1 = COPY [[V_OR_B32_e64_1]] @@ -211,21 +207,19 @@ ; UNPACKED-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_D16_XYZW_gfx80_BOTHEN]].sub1 ; UNPACKED-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_D16_XYZW_gfx80_BOTHEN]].sub2 ; UNPACKED-NEXT: [[COPY16:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_D16_XYZW_gfx80_BOTHEN]].sub3 - ; UNPACKED-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 65535 - ; UNPACKED-NEXT: [[COPY17:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; UNPACKED-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY13]], [[COPY17]], implicit $exec - ; UNPACKED-NEXT: [[COPY18:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; UNPACKED-NEXT: [[V_AND_B32_e64_1:%[0-9]+]]:vgpr_32 = 
V_AND_B32_e64 [[COPY14]], [[COPY18]], implicit $exec - ; UNPACKED-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 16 - ; UNPACKED-NEXT: [[COPY19:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] - ; UNPACKED-NEXT: [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[COPY19]], [[V_AND_B32_e64_1]], implicit $exec + ; UNPACKED-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 65535, implicit $exec + ; UNPACKED-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 65535, implicit $exec + ; UNPACKED-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY13]], [[V_MOV_B32_e32_1]], implicit $exec + ; UNPACKED-NEXT: [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 65535, implicit $exec + ; UNPACKED-NEXT: [[V_AND_B32_e64_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY14]], [[V_MOV_B32_e32_2]], implicit $exec + ; UNPACKED-NEXT: [[V_MOV_B32_e32_3:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 16, implicit $exec + ; UNPACKED-NEXT: [[V_MOV_B32_e32_4:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 16, implicit $exec + ; UNPACKED-NEXT: [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[V_MOV_B32_e32_4]], [[V_AND_B32_e64_1]], implicit $exec ; UNPACKED-NEXT: [[V_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_AND_B32_e64_]], [[V_LSHLREV_B32_e64_]], implicit $exec - ; UNPACKED-NEXT: [[COPY20:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; UNPACKED-NEXT: [[V_AND_B32_e64_2:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY15]], [[COPY20]], implicit $exec - ; UNPACKED-NEXT: [[COPY21:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; UNPACKED-NEXT: [[V_AND_B32_e64_3:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY16]], [[COPY21]], implicit $exec - ; UNPACKED-NEXT: [[COPY22:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] - ; UNPACKED-NEXT: [[V_LSHLREV_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[COPY22]], [[V_AND_B32_e64_3]], implicit $exec + ; UNPACKED-NEXT: [[V_MOV_B32_e32_5:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 65535, implicit $exec + ; UNPACKED-NEXT: [[V_AND_B32_e64_2:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY15]], [[V_MOV_B32_e32_5]], implicit $exec + ; UNPACKED-NEXT: [[V_AND_B32_e64_3:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY16]], [[V_MOV_B32_e32_]], implicit $exec + ; UNPACKED-NEXT: [[V_LSHLREV_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[V_MOV_B32_e32_3]], [[V_AND_B32_e64_3]], implicit $exec ; UNPACKED-NEXT: [[V_OR_B32_e64_1:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_AND_B32_e64_2]], [[V_LSHLREV_B32_e64_1]], implicit $exec ; UNPACKED-NEXT: $vgpr0 = COPY [[V_OR_B32_e64_]] ; UNPACKED-NEXT: $vgpr1 = COPY [[V_OR_B32_e64_1]] Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.format.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.format.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.format.ll @@ -227,16 +227,16 @@ ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3 ; CHECK-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY7]], %subreg.sub1 ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; CHECK-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_TFE_IDXEN:%[0-9]+]]:vreg_160 = BUFFER_LOAD_FORMAT_XYZW_TFE_IDXEN [[COPY8]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable load (<4 x s32>), align 1, addrspace 8) - ; CHECK-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZW_TFE_IDXEN]].sub0 - ; CHECK-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY 
[[BUFFER_LOAD_FORMAT_XYZW_TFE_IDXEN]].sub1 - ; CHECK-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZW_TFE_IDXEN]].sub2 - ; CHECK-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZW_TFE_IDXEN]].sub3 - ; CHECK-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZW_TFE_IDXEN]].sub4 - ; CHECK-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY10]], %subreg.sub1, [[COPY11]], %subreg.sub2, [[COPY12]], %subreg.sub3 + ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_TFE_IDXEN:%[0-9]+]]:vreg_160 = BUFFER_LOAD_FORMAT_XYZW_TFE_IDXEN [[V_MOV_B32_e32_]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable load (<4 x s32>), align 1, addrspace 8) + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZW_TFE_IDXEN]].sub0 + ; CHECK-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZW_TFE_IDXEN]].sub1 + ; CHECK-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZW_TFE_IDXEN]].sub2 + ; CHECK-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZW_TFE_IDXEN]].sub3 + ; CHECK-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZW_TFE_IDXEN]].sub4 + ; CHECK-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY9]], %subreg.sub1, [[COPY10]], %subreg.sub2, [[COPY11]], %subreg.sub3 ; CHECK-NEXT: FLAT_STORE_DWORDX4 [[REG_SEQUENCE1]], [[REG_SEQUENCE3]], 0, 0, implicit $exec, implicit $flat_scr :: (store (<4 x s32>) into %ir.value, addrspace 1) - ; CHECK-NEXT: FLAT_STORE_DWORD [[REG_SEQUENCE2]], [[COPY13]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %ir.status, addrspace 1) + ; CHECK-NEXT: FLAT_STORE_DWORD [[REG_SEQUENCE2]], [[COPY12]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %ir.status, addrspace 1) ; CHECK-NEXT: S_ENDPGM 0 %load = call { <4 x i32>, i32 } @llvm.amdgcn.struct.buffer.load.format.sl_v4i32i32s(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0) @@ -266,15 +266,15 @@ ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3 ; CHECK-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY7]], %subreg.sub1 ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; CHECK-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_XYZ_TFE_IDXEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZ_TFE_IDXEN [[COPY8]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable load (<3 x s32>), align 1, addrspace 8) - ; CHECK-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZ_TFE_IDXEN]].sub0 - ; CHECK-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZ_TFE_IDXEN]].sub1 - ; CHECK-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZ_TFE_IDXEN]].sub2 - ; CHECK-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZ_TFE_IDXEN]].sub3 - ; CHECK-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_96 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY10]], %subreg.sub1, [[COPY11]], %subreg.sub2 + ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_XYZ_TFE_IDXEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZ_TFE_IDXEN [[V_MOV_B32_e32_]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable load (<3 x s32>), align 1, addrspace 8) + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZ_TFE_IDXEN]].sub0 + ; 
CHECK-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZ_TFE_IDXEN]].sub1
+ ; CHECK-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZ_TFE_IDXEN]].sub2
+ ; CHECK-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZ_TFE_IDXEN]].sub3
+ ; CHECK-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_96 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY9]], %subreg.sub1, [[COPY10]], %subreg.sub2
; CHECK-NEXT: FLAT_STORE_DWORDX3 [[REG_SEQUENCE1]], [[REG_SEQUENCE3]], 0, 0, implicit $exec, implicit $flat_scr :: (store (<3 x s32>) into %ir.value, align 16, addrspace 1)
- ; CHECK-NEXT: FLAT_STORE_DWORD [[REG_SEQUENCE2]], [[COPY12]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %ir.status, addrspace 1)
+ ; CHECK-NEXT: FLAT_STORE_DWORD [[REG_SEQUENCE2]], [[COPY11]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %ir.status, addrspace 1)
; CHECK-NEXT: S_ENDPGM 0
%load = call { <3 x i32>, i32 } @llvm.amdgcn.struct.buffer.load.format.sl_v3i32i32s(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0)
@@ -304,12 +304,12 @@
; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
; CHECK-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY7]], %subreg.sub1
; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
- ; CHECK-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_TFE_IDXEN:%[0-9]+]]:vreg_64 = BUFFER_LOAD_FORMAT_X_TFE_IDXEN [[COPY8]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
- ; CHECK-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_X_TFE_IDXEN]].sub0
- ; CHECK-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_X_TFE_IDXEN]].sub1
- ; CHECK-NEXT: FLAT_STORE_DWORD [[REG_SEQUENCE1]], [[COPY9]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %ir.value, addrspace 1)
- ; CHECK-NEXT: FLAT_STORE_DWORD [[REG_SEQUENCE2]], [[COPY10]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %ir.status, addrspace 1)
+ ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_TFE_IDXEN:%[0-9]+]]:vreg_64 = BUFFER_LOAD_FORMAT_X_TFE_IDXEN [[V_MOV_B32_e32_]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
+ ; CHECK-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_X_TFE_IDXEN]].sub0
+ ; CHECK-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_X_TFE_IDXEN]].sub1
+ ; CHECK-NEXT: FLAT_STORE_DWORD [[REG_SEQUENCE1]], [[COPY8]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %ir.value, addrspace 1)
+ ; CHECK-NEXT: FLAT_STORE_DWORD [[REG_SEQUENCE2]], [[COPY9]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %ir.status, addrspace 1)
; CHECK-NEXT: S_ENDPGM 0
%load = call { i32, i32 } @llvm.amdgcn.struct.buffer.load.format.sl_i32i32s(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0)
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.ll
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.ll
@@ -117,9 +117,8 @@
; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; CHECK-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
- ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
- ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY4]], %subreg.sub1
+ ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[COPY4]], %subreg.sub1
; CHECK-NEXT: [[BUFFER_LOAD_DWORD_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
; CHECK-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_BOTHEN]]
; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.store.format.f16.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.store.format.f16.ll
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.store.format.f16.ll
@@ -53,9 +53,8 @@
; UNPACKED-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; UNPACKED-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; UNPACKED-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6
- ; UNPACKED-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 16
- ; UNPACKED-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; UNPACKED-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[COPY8]], [[COPY]], implicit $exec
+ ; UNPACKED-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 16, implicit $exec
+ ; UNPACKED-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[V_MOV_B32_e32_]], [[COPY]], implicit $exec
; UNPACKED-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[V_LSHRREV_B32_e64_]], %subreg.sub1
; UNPACKED-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY6]], %subreg.sub1
; UNPACKED-NEXT: BUFFER_STORE_FORMAT_D16_XY_gfx80_BOTHEN_exact [[REG_SEQUENCE1]], [[REG_SEQUENCE2]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, implicit $exec :: (dereferenceable store (<2 x s16>), align 1, addrspace 8)
@@ -101,11 +100,10 @@
; UNPACKED-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; UNPACKED-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
; UNPACKED-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY $sgpr6
- ; UNPACKED-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 16
- ; UNPACKED-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; UNPACKED-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[COPY9]], [[COPY]], implicit $exec
- ; UNPACKED-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; UNPACKED-NEXT: [[V_LSHRREV_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[COPY10]], [[COPY1]], implicit $exec
+ ; UNPACKED-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 16, implicit $exec
+ ; UNPACKED-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 16, implicit $exec
+ ; UNPACKED-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[V_MOV_B32_e32_1]], [[COPY]], implicit $exec
+ ; UNPACKED-NEXT: [[V_LSHRREV_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[V_MOV_B32_e32_]], [[COPY1]], implicit $exec
; UNPACKED-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[V_LSHRREV_B32_e64_]], %subreg.sub1, [[COPY1]], %subreg.sub2, [[V_LSHRREV_B32_e64_1]], %subreg.sub3
; UNPACKED-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY7]], %subreg.sub1
; UNPACKED-NEXT: BUFFER_STORE_FORMAT_D16_XYZW_gfx80_BOTHEN_exact [[REG_SEQUENCE1]], [[REG_SEQUENCE2]], [[REG_SEQUENCE]], [[COPY8]], 0, 0, 0, implicit $exec :: (dereferenceable store (<4 x s16>), align 1, addrspace 8)
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.buffer.load.format.f16.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.buffer.load.format.f16.ll
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.buffer.load.format.f16.ll
@@ -56,14 +56,12 @@
; UNPACKED-NEXT: [[BUFFER_LOAD_FORMAT_D16_XY_gfx80_BOTHEN:%[0-9]+]]:vreg_64 = BUFFER_LOAD_FORMAT_D16_XY_gfx80_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable load (<2 x s16>) from %ir.rsrc, align 1, addrspace 8)
; UNPACKED-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_D16_XY_gfx80_BOTHEN]].sub0
; UNPACKED-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_D16_XY_gfx80_BOTHEN]].sub1
- ; UNPACKED-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 65535
- ; UNPACKED-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; UNPACKED-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY7]], [[COPY9]], implicit $exec
- ; UNPACKED-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; UNPACKED-NEXT: [[V_AND_B32_e64_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY8]], [[COPY10]], implicit $exec
- ; UNPACKED-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 16
- ; UNPACKED-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
- ; UNPACKED-NEXT: [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[COPY11]], [[V_AND_B32_e64_1]], implicit $exec
+ ; UNPACKED-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 65535, implicit $exec
+ ; UNPACKED-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 65535, implicit $exec
+ ; UNPACKED-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY7]], [[V_MOV_B32_e32_1]], implicit $exec
+ ; UNPACKED-NEXT: [[V_AND_B32_e64_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY8]], [[V_MOV_B32_e32_]], implicit $exec
+ ; UNPACKED-NEXT: [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 16, implicit $exec
+ ; UNPACKED-NEXT: [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[V_MOV_B32_e32_2]], [[V_AND_B32_e64_1]], implicit $exec
; UNPACKED-NEXT: [[V_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_AND_B32_e64_]], [[V_LSHLREV_B32_e64_]], implicit $exec
; UNPACKED-NEXT: $vgpr0 = COPY [[V_OR_B32_e64_]]
; UNPACKED-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
@@ -112,21 +110,19 @@
; UNPACKED-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_D16_XYZW_gfx80_BOTHEN]].sub1
; UNPACKED-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_D16_XYZW_gfx80_BOTHEN]].sub2
; UNPACKED-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_D16_XYZW_gfx80_BOTHEN]].sub3
- ; UNPACKED-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 65535
- ; UNPACKED-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; UNPACKED-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY7]], [[COPY11]], implicit $exec
- ; UNPACKED-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; UNPACKED-NEXT: [[V_AND_B32_e64_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY8]], [[COPY12]], implicit $exec
- ; UNPACKED-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 16
- ; UNPACKED-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
- ; UNPACKED-NEXT: [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[COPY13]], [[V_AND_B32_e64_1]], implicit $exec
+ ; UNPACKED-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 65535, implicit $exec
+ ; UNPACKED-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 65535, implicit $exec
+ ; UNPACKED-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY7]], [[V_MOV_B32_e32_1]], implicit $exec
+ ; UNPACKED-NEXT: [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 65535, implicit $exec
+ ; UNPACKED-NEXT: [[V_AND_B32_e64_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY8]], [[V_MOV_B32_e32_2]], implicit $exec
+ ; UNPACKED-NEXT: [[V_MOV_B32_e32_3:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 16, implicit $exec
+ ; UNPACKED-NEXT: [[V_MOV_B32_e32_4:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 16, implicit $exec
+ ; UNPACKED-NEXT: [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[V_MOV_B32_e32_4]], [[V_AND_B32_e64_1]], implicit $exec
; UNPACKED-NEXT: [[V_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_AND_B32_e64_]], [[V_LSHLREV_B32_e64_]], implicit $exec
- ; UNPACKED-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; UNPACKED-NEXT: [[V_AND_B32_e64_2:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY9]], [[COPY14]], implicit $exec
- ; UNPACKED-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; UNPACKED-NEXT: [[V_AND_B32_e64_3:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY10]], [[COPY15]], implicit $exec
- ; UNPACKED-NEXT: [[COPY16:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
- ; UNPACKED-NEXT: [[V_LSHLREV_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[COPY16]], [[V_AND_B32_e64_3]], implicit $exec
+ ; UNPACKED-NEXT: [[V_MOV_B32_e32_5:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 65535, implicit $exec
+ ; UNPACKED-NEXT: [[V_AND_B32_e64_2:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY9]], [[V_MOV_B32_e32_5]], implicit $exec
+ ; UNPACKED-NEXT: [[V_AND_B32_e64_3:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY10]], [[V_MOV_B32_e32_]], implicit $exec
+ ; UNPACKED-NEXT: [[V_LSHLREV_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[V_MOV_B32_e32_3]], [[V_AND_B32_e64_3]], implicit $exec
; UNPACKED-NEXT: [[V_OR_B32_e64_1:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_AND_B32_e64_2]], [[V_LSHLREV_B32_e64_1]], implicit $exec
; UNPACKED-NEXT: $vgpr0 = COPY [[V_OR_B32_e64_]]
; UNPACKED-NEXT: $vgpr1 = COPY [[V_OR_B32_e64_1]]
@@ -211,21 +207,19 @@
; UNPACKED-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_D16_XYZW_gfx80_BOTHEN]].sub1
; UNPACKED-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_D16_XYZW_gfx80_BOTHEN]].sub2
; UNPACKED-NEXT: [[COPY16:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_D16_XYZW_gfx80_BOTHEN]].sub3
- ; UNPACKED-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 65535
- ; UNPACKED-NEXT: [[COPY17:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; UNPACKED-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY13]], [[COPY17]], implicit $exec
- ; UNPACKED-NEXT: [[COPY18:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; UNPACKED-NEXT: [[V_AND_B32_e64_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY14]], [[COPY18]], implicit $exec
- ; UNPACKED-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 16
- ; UNPACKED-NEXT: [[COPY19:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
- ; UNPACKED-NEXT: [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[COPY19]], [[V_AND_B32_e64_1]], implicit $exec
+ ; UNPACKED-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 65535, implicit $exec
+ ; UNPACKED-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 65535, implicit $exec
+ ; UNPACKED-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY13]], [[V_MOV_B32_e32_1]], implicit $exec
+ ; UNPACKED-NEXT: [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 65535, implicit $exec
+ ; UNPACKED-NEXT: [[V_AND_B32_e64_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY14]], [[V_MOV_B32_e32_2]], implicit $exec
+ ; UNPACKED-NEXT: [[V_MOV_B32_e32_3:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 16, implicit $exec
+ ; UNPACKED-NEXT: [[V_MOV_B32_e32_4:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 16, implicit $exec
+ ; UNPACKED-NEXT: [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[V_MOV_B32_e32_4]], [[V_AND_B32_e64_1]], implicit $exec
; UNPACKED-NEXT: [[V_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_AND_B32_e64_]], [[V_LSHLREV_B32_e64_]], implicit $exec
- ; UNPACKED-NEXT: [[COPY20:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; UNPACKED-NEXT: [[V_AND_B32_e64_2:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY15]], [[COPY20]], implicit $exec
- ; UNPACKED-NEXT: [[COPY21:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; UNPACKED-NEXT: [[V_AND_B32_e64_3:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY16]], [[COPY21]], implicit $exec
- ; UNPACKED-NEXT: [[COPY22:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
- ; UNPACKED-NEXT: [[V_LSHLREV_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[COPY22]], [[V_AND_B32_e64_3]], implicit $exec
+ ; UNPACKED-NEXT: [[V_MOV_B32_e32_5:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 65535, implicit $exec
+ ; UNPACKED-NEXT: [[V_AND_B32_e64_2:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY15]], [[V_MOV_B32_e32_5]], implicit $exec
+ ; UNPACKED-NEXT: [[V_AND_B32_e64_3:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY16]], [[V_MOV_B32_e32_]], implicit $exec
+ ; UNPACKED-NEXT: [[V_LSHLREV_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[V_MOV_B32_e32_3]], [[V_AND_B32_e64_3]], implicit $exec
; UNPACKED-NEXT: [[V_OR_B32_e64_1:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_AND_B32_e64_2]], [[V_LSHLREV_B32_e64_1]], implicit $exec
; UNPACKED-NEXT: $vgpr0 = COPY [[V_OR_B32_e64_]]
; UNPACKED-NEXT: $vgpr1 = COPY [[V_OR_B32_e64_1]]
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.buffer.load.format.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.buffer.load.format.ll
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.buffer.load.format.ll
@@ -227,16 +227,16 @@
; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY7]], %subreg.sub1
; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
; CHECK-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; CHECK-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_TFE_IDXEN:%[0-9]+]]:vreg_160 = BUFFER_LOAD_FORMAT_XYZW_TFE_IDXEN [[COPY8]], [[REG_SEQUENCE2]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable load (<4 x s32>) from %ir.rsrc, align 1, addrspace 8)
- ; CHECK-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZW_TFE_IDXEN]].sub0
- ; CHECK-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZW_TFE_IDXEN]].sub1
- ; CHECK-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZW_TFE_IDXEN]].sub2
- ; CHECK-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZW_TFE_IDXEN]].sub3
- ; CHECK-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZW_TFE_IDXEN]].sub4
- ; CHECK-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY10]], %subreg.sub1, [[COPY11]], %subreg.sub2, [[COPY12]], %subreg.sub3
+ ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_TFE_IDXEN:%[0-9]+]]:vreg_160 = BUFFER_LOAD_FORMAT_XYZW_TFE_IDXEN [[V_MOV_B32_e32_]], [[REG_SEQUENCE2]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable load (<4 x s32>) from %ir.rsrc, align 1, addrspace 8)
+ ; CHECK-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZW_TFE_IDXEN]].sub0
+ ; CHECK-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZW_TFE_IDXEN]].sub1
+ ; CHECK-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZW_TFE_IDXEN]].sub2
+ ; CHECK-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZW_TFE_IDXEN]].sub3
+ ; CHECK-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZW_TFE_IDXEN]].sub4
+ ; CHECK-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY9]], %subreg.sub1, [[COPY10]], %subreg.sub2, [[COPY11]], %subreg.sub3
; CHECK-NEXT: FLAT_STORE_DWORDX4 [[REG_SEQUENCE]], [[REG_SEQUENCE3]], 0, 0, implicit $exec, implicit $flat_scr :: (store (<4 x s32>) into %ir.value, addrspace 1)
- ; CHECK-NEXT: FLAT_STORE_DWORD [[REG_SEQUENCE1]], [[COPY13]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %ir.status, addrspace 1)
+ ; CHECK-NEXT: FLAT_STORE_DWORD [[REG_SEQUENCE1]], [[COPY12]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %ir.status, addrspace 1)
; CHECK-NEXT: S_ENDPGM 0
%load = call { <4 x i32>, i32 } @llvm.amdgcn.struct.ptr.buffer.load.format.sl_v4i32i32s(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0, i32 0)
@@ -266,15 +266,15 @@
; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY7]], %subreg.sub1
; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
; CHECK-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; CHECK-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_XYZ_TFE_IDXEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZ_TFE_IDXEN [[COPY8]], [[REG_SEQUENCE2]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable load (<3 x s32>) from %ir.rsrc, align 1, addrspace 8)
- ; CHECK-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZ_TFE_IDXEN]].sub0
- ; CHECK-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZ_TFE_IDXEN]].sub1
- ; CHECK-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZ_TFE_IDXEN]].sub2
- ; CHECK-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZ_TFE_IDXEN]].sub3
- ; CHECK-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_96 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY10]], %subreg.sub1, [[COPY11]], %subreg.sub2
+ ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_XYZ_TFE_IDXEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZ_TFE_IDXEN [[V_MOV_B32_e32_]], [[REG_SEQUENCE2]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable load (<3 x s32>) from %ir.rsrc, align 1, addrspace 8)
+ ; CHECK-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZ_TFE_IDXEN]].sub0
+ ; CHECK-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZ_TFE_IDXEN]].sub1
+ ; CHECK-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZ_TFE_IDXEN]].sub2
+ ; CHECK-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZ_TFE_IDXEN]].sub3
+ ; CHECK-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_96 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY9]], %subreg.sub1, [[COPY10]], %subreg.sub2
; CHECK-NEXT: FLAT_STORE_DWORDX3 [[REG_SEQUENCE]], [[REG_SEQUENCE3]], 0, 0, implicit $exec, implicit $flat_scr :: (store (<3 x s32>) into %ir.value, align 16, addrspace 1)
- ; CHECK-NEXT: FLAT_STORE_DWORD [[REG_SEQUENCE1]], [[COPY12]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %ir.status, addrspace 1)
+ ; CHECK-NEXT: FLAT_STORE_DWORD [[REG_SEQUENCE1]], [[COPY11]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %ir.status, addrspace 1)
; CHECK-NEXT: S_ENDPGM 0
%load = call { <3 x i32>, i32 } @llvm.amdgcn.struct.ptr.buffer.load.format.sl_v3i32i32s(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0, i32 0)
@@ -304,12 +304,12 @@
; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY7]], %subreg.sub1
; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
; CHECK-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; CHECK-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_TFE_IDXEN:%[0-9]+]]:vreg_64 = BUFFER_LOAD_FORMAT_X_TFE_IDXEN [[COPY8]], [[REG_SEQUENCE2]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32) from %ir.rsrc, align 1, addrspace 8)
- ; CHECK-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_X_TFE_IDXEN]].sub0
- ; CHECK-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_X_TFE_IDXEN]].sub1
- ; CHECK-NEXT: FLAT_STORE_DWORD [[REG_SEQUENCE]], [[COPY9]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %ir.value, addrspace 1)
- ; CHECK-NEXT: FLAT_STORE_DWORD [[REG_SEQUENCE1]], [[COPY10]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %ir.status, addrspace 1)
+ ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_TFE_IDXEN:%[0-9]+]]:vreg_64 = BUFFER_LOAD_FORMAT_X_TFE_IDXEN [[V_MOV_B32_e32_]], [[REG_SEQUENCE2]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32) from %ir.rsrc, align 1, addrspace 8)
+ ; CHECK-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_X_TFE_IDXEN]].sub0
+ ; CHECK-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_X_TFE_IDXEN]].sub1
+ ; CHECK-NEXT: FLAT_STORE_DWORD [[REG_SEQUENCE]], [[COPY8]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %ir.value, addrspace 1)
+ ; CHECK-NEXT: FLAT_STORE_DWORD [[REG_SEQUENCE1]], [[COPY9]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %ir.status, addrspace 1)
; CHECK-NEXT: S_ENDPGM 0
%load = call { i32, i32 } @llvm.amdgcn.struct.ptr.buffer.load.format.sl_i32i32s(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0, i32 0)
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.buffer.load.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.buffer.load.ll
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.buffer.load.ll
@@ -116,10 +116,9 @@
; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; CHECK-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
- ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+ ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY4]], %subreg.sub1
+ ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[COPY4]], %subreg.sub1
; CHECK-NEXT: [[BUFFER_LOAD_DWORD_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32) from %ir.rsrc, align 1, addrspace 8)
; CHECK-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_BOTHEN]]
; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.buffer.store.format.f16.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.buffer.store.format.f16.ll
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.buffer.store.format.f16.ll
@@ -52,9 +52,8 @@
; UNPACKED-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; UNPACKED-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; UNPACKED-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6
- ; UNPACKED-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 16
- ; UNPACKED-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; UNPACKED-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[COPY8]], [[COPY]], implicit $exec
+ ; UNPACKED-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 16, implicit $exec
+ ; UNPACKED-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[V_MOV_B32_e32_]], [[COPY]], implicit $exec
; UNPACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[V_LSHRREV_B32_e64_]], %subreg.sub1
; UNPACKED-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
; UNPACKED-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY6]], %subreg.sub1
@@ -100,11 +99,10 @@
; UNPACKED-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; UNPACKED-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
; UNPACKED-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY $sgpr6
- ; UNPACKED-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 16
- ; UNPACKED-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; UNPACKED-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[COPY9]], [[COPY]], implicit $exec
- ; UNPACKED-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; UNPACKED-NEXT: [[V_LSHRREV_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[COPY10]], [[COPY1]], implicit $exec
+ ; UNPACKED-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 16, implicit $exec
+ ; UNPACKED-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 16, implicit $exec
+ ; UNPACKED-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[V_MOV_B32_e32_1]], [[COPY]], implicit $exec
+ ; UNPACKED-NEXT: [[V_LSHRREV_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[V_MOV_B32_e32_]], [[COPY1]], implicit $exec
; UNPACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[V_LSHRREV_B32_e64_]], %subreg.sub1, [[COPY1]], %subreg.sub2, [[V_LSHRREV_B32_e64_1]], %subreg.sub3
; UNPACKED-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3
; UNPACKED-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY7]], %subreg.sub1
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.tbuffer.load.f16.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.tbuffer.load.f16.ll
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.tbuffer.load.f16.ll
@@ -74,14 +74,12 @@
; UNPACKED-NEXT: [[TBUFFER_LOAD_FORMAT_D16_XY_gfx80_BOTHEN:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_D16_XY_gfx80_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 0, 0, implicit $exec :: (dereferenceable load (<2 x s16>) from %ir.rsrc, align 1, addrspace 8)
; UNPACKED-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_D16_XY_gfx80_BOTHEN]].sub0
; UNPACKED-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_D16_XY_gfx80_BOTHEN]].sub1
- ; UNPACKED-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 65535
- ; UNPACKED-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; UNPACKED-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY7]], [[COPY9]], implicit $exec
- ; UNPACKED-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; UNPACKED-NEXT: [[V_AND_B32_e64_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY8]], [[COPY10]], implicit $exec
- ; UNPACKED-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 16
- ; UNPACKED-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
- ; UNPACKED-NEXT: [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[COPY11]], [[V_AND_B32_e64_1]], implicit $exec
+ ; UNPACKED-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 65535, implicit $exec
+ ; UNPACKED-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 65535, implicit $exec
+ ; UNPACKED-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY7]], [[V_MOV_B32_e32_1]], implicit $exec
+ ; UNPACKED-NEXT: [[V_AND_B32_e64_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY8]], [[V_MOV_B32_e32_]], implicit $exec
+ ; UNPACKED-NEXT: [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 16, implicit $exec
+ ; UNPACKED-NEXT: [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[V_MOV_B32_e32_2]], [[V_AND_B32_e64_1]], implicit $exec
; UNPACKED-NEXT: [[V_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_AND_B32_e64_]], [[V_LSHLREV_B32_e64_]], implicit $exec
; UNPACKED-NEXT: $vgpr0 = COPY [[V_OR_B32_e64_]]
; UNPACKED-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
@@ -133,21 +131,19 @@
; UNPACKED-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_D16_XYZW_gfx80_BOTHEN]].sub1
; UNPACKED-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_D16_XYZW_gfx80_BOTHEN]].sub2
; UNPACKED-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_D16_XYZW_gfx80_BOTHEN]].sub3
- ; UNPACKED-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 65535
- ; UNPACKED-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; UNPACKED-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY7]], [[COPY11]], implicit $exec
- ; UNPACKED-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; UNPACKED-NEXT: [[V_AND_B32_e64_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY8]], [[COPY12]], implicit $exec
- ; UNPACKED-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 16
- ; UNPACKED-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
- ; UNPACKED-NEXT: [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[COPY13]], [[V_AND_B32_e64_1]], implicit $exec
+ ; UNPACKED-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 65535, implicit $exec
+ ; UNPACKED-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 65535, implicit $exec
+ ; UNPACKED-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY7]], [[V_MOV_B32_e32_1]], implicit $exec
+ ; UNPACKED-NEXT: [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 65535, implicit $exec
+ ; UNPACKED-NEXT: [[V_AND_B32_e64_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY8]], [[V_MOV_B32_e32_2]], implicit $exec
+ ; UNPACKED-NEXT: [[V_MOV_B32_e32_3:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 16, implicit $exec
+ ; UNPACKED-NEXT: [[V_MOV_B32_e32_4:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 16, implicit $exec
+ ; UNPACKED-NEXT: [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[V_MOV_B32_e32_4]], [[V_AND_B32_e64_1]], implicit $exec
; UNPACKED-NEXT: [[V_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_AND_B32_e64_]], [[V_LSHLREV_B32_e64_]], implicit $exec
- ; UNPACKED-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; UNPACKED-NEXT: [[V_AND_B32_e64_2:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY9]], [[COPY14]], implicit $exec
- ; UNPACKED-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; UNPACKED-NEXT: [[V_AND_B32_e64_3:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY10]], [[COPY15]], implicit $exec
- ; UNPACKED-NEXT: [[COPY16:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
- ; UNPACKED-NEXT: [[V_LSHLREV_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[COPY16]], [[V_AND_B32_e64_3]], implicit $exec
+ ; UNPACKED-NEXT: [[V_MOV_B32_e32_5:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 65535, implicit $exec
+ ; UNPACKED-NEXT: [[V_AND_B32_e64_2:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY9]], [[V_MOV_B32_e32_5]], implicit $exec
+ ; UNPACKED-NEXT: [[V_AND_B32_e64_3:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY10]], [[V_MOV_B32_e32_]], implicit $exec
+ ; UNPACKED-NEXT: [[V_LSHLREV_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[V_MOV_B32_e32_3]], [[V_AND_B32_e64_3]], implicit $exec
; UNPACKED-NEXT: [[V_OR_B32_e64_1:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_AND_B32_e64_2]], [[V_LSHLREV_B32_e64_1]], implicit $exec
; UNPACKED-NEXT: $vgpr0 = COPY [[V_OR_B32_e64_]]
; UNPACKED-NEXT: $vgpr1 = COPY [[V_OR_B32_e64_1]]
@@ -167,10 +163,9 @@
; PACKED-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
; PACKED-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; PACKED-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
- ; PACKED-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+ ; PACKED-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
; PACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; PACKED-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; PACKED-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY4]], %subreg.sub1
+ ; PACKED-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[COPY4]], %subreg.sub1
; PACKED-NEXT: [[TBUFFER_LOAD_FORMAT_D16_X_BOTHEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_D16_X_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY5]], 0, 78, 0, 0, implicit $exec :: (dereferenceable load (s16) from %ir.rsrc, align 1, addrspace 8)
; PACKED-NEXT: $vgpr0 = COPY [[TBUFFER_LOAD_FORMAT_D16_X_BOTHEN]]
; PACKED-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
@@ -184,10 +179,9 @@
; UNPACKED-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
; UNPACKED-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; UNPACKED-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
- ; UNPACKED-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+ ; UNPACKED-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
; UNPACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; UNPACKED-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; UNPACKED-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY4]], %subreg.sub1
+ ; UNPACKED-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[COPY4]], %subreg.sub1
; UNPACKED-NEXT: [[TBUFFER_LOAD_FORMAT_D16_X_gfx80_BOTHEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_D16_X_gfx80_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY5]], 0, 78, 0, 0, implicit $exec :: (dereferenceable load (s16) from %ir.rsrc, align 1, addrspace 8)
; UNPACKED-NEXT: $vgpr0 = COPY [[TBUFFER_LOAD_FORMAT_D16_X_gfx80_BOTHEN]]
; UNPACKED-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
@@ -307,21 +301,19 @@
; UNPACKED-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_D16_XYZW_gfx80_BOTHEN]].sub1
; UNPACKED-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_D16_XYZW_gfx80_BOTHEN]].sub2
; UNPACKED-NEXT: [[COPY16:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_D16_XYZW_gfx80_BOTHEN]].sub3
- ; UNPACKED-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 65535
- ; UNPACKED-NEXT: [[COPY17:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; UNPACKED-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY13]], [[COPY17]], implicit $exec
- ; UNPACKED-NEXT: [[COPY18:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; UNPACKED-NEXT: [[V_AND_B32_e64_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY14]], [[COPY18]], implicit $exec
- ; UNPACKED-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 16
- ; UNPACKED-NEXT: [[COPY19:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
- ; UNPACKED-NEXT: [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[COPY19]], [[V_AND_B32_e64_1]], implicit $exec
+ ; UNPACKED-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 65535, implicit $exec
+ ; UNPACKED-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 65535, implicit $exec
+ ; UNPACKED-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY13]], [[V_MOV_B32_e32_1]], implicit $exec
+ ; UNPACKED-NEXT: [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 65535, implicit $exec
+ ; UNPACKED-NEXT: [[V_AND_B32_e64_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY14]], [[V_MOV_B32_e32_2]], implicit $exec
+ ; UNPACKED-NEXT: [[V_MOV_B32_e32_3:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 16, implicit $exec
+ ; UNPACKED-NEXT: [[V_MOV_B32_e32_4:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 16, implicit $exec
+ ; UNPACKED-NEXT: [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[V_MOV_B32_e32_4]], [[V_AND_B32_e64_1]], implicit $exec
; UNPACKED-NEXT: [[V_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_AND_B32_e64_]], [[V_LSHLREV_B32_e64_]], implicit $exec
- ; UNPACKED-NEXT: [[COPY20:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; UNPACKED-NEXT: [[V_AND_B32_e64_2:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY15]], [[COPY20]], implicit $exec
- ; UNPACKED-NEXT: [[COPY21:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; UNPACKED-NEXT: [[V_AND_B32_e64_3:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY16]], [[COPY21]], implicit $exec
- ; UNPACKED-NEXT: [[COPY22:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
- ; UNPACKED-NEXT: [[V_LSHLREV_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[COPY22]], [[V_AND_B32_e64_3]], implicit $exec
+ ; UNPACKED-NEXT: [[V_MOV_B32_e32_5:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 65535, implicit $exec
+ ; UNPACKED-NEXT: [[V_AND_B32_e64_2:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY15]], [[V_MOV_B32_e32_5]], implicit $exec
+ ; UNPACKED-NEXT: [[V_AND_B32_e64_3:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY16]], [[V_MOV_B32_e32_]], implicit $exec
+ ; UNPACKED-NEXT: [[V_LSHLREV_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[V_MOV_B32_e32_3]], [[V_AND_B32_e64_3]], implicit $exec
; UNPACKED-NEXT: [[V_OR_B32_e64_1:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_AND_B32_e64_2]], [[V_LSHLREV_B32_e64_1]], implicit $exec
; UNPACKED-NEXT: $vgpr0 = COPY [[V_OR_B32_e64_]]
; UNPACKED-NEXT: $vgpr1 = COPY [[V_OR_B32_e64_1]]
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.tbuffer.load.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.tbuffer.load.ll
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.tbuffer.load.ll
@@ -113,10 +113,9 @@
; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; CHECK-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
- ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+ ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY4]], %subreg.sub1
+ ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[COPY4]], %subreg.sub1
; CHECK-NEXT: [[TBUFFER_LOAD_FORMAT_X_BOTHEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY5]], 0, 78, 0, 0, implicit $exec :: (dereferenceable load (s32) from %ir.rsrc, align 1, addrspace 8)
; CHECK-NEXT: $vgpr0 = COPY [[TBUFFER_LOAD_FORMAT_X_BOTHEN]]
; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.tbuffer.load.f16.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.tbuffer.load.f16.ll
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.tbuffer.load.f16.ll
@@ -74,14 +74,12 @@
; UNPACKED-NEXT: [[TBUFFER_LOAD_FORMAT_D16_XY_gfx80_BOTHEN:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_D16_XY_gfx80_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 0, 0, implicit $exec :: (dereferenceable load (<2 x s16>), align 1, addrspace 8)
; UNPACKED-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_D16_XY_gfx80_BOTHEN]].sub0
; UNPACKED-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_D16_XY_gfx80_BOTHEN]].sub1
- ; UNPACKED-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 65535
- ; UNPACKED-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; UNPACKED-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY7]], [[COPY9]], implicit $exec
- ; UNPACKED-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; UNPACKED-NEXT: [[V_AND_B32_e64_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY8]], [[COPY10]], implicit $exec
- ; UNPACKED-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 16
- ; UNPACKED-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
- ; UNPACKED-NEXT: [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[COPY11]], [[V_AND_B32_e64_1]], implicit $exec
+ ; UNPACKED-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 65535, implicit $exec
+ ; UNPACKED-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 65535, implicit $exec
+ ; UNPACKED-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY7]], [[V_MOV_B32_e32_1]], implicit $exec
+ ; UNPACKED-NEXT: [[V_AND_B32_e64_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY8]], [[V_MOV_B32_e32_]], implicit $exec
+ ; UNPACKED-NEXT: [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 16, implicit $exec
+ ; UNPACKED-NEXT: [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[V_MOV_B32_e32_2]], [[V_AND_B32_e64_1]], implicit $exec
; UNPACKED-NEXT: [[V_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_AND_B32_e64_]], [[V_LSHLREV_B32_e64_]], implicit $exec
; UNPACKED-NEXT: $vgpr0 = COPY [[V_OR_B32_e64_]]
; UNPACKED-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
@@ -133,21 +131,19 @@
; UNPACKED-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_D16_XYZW_gfx80_BOTHEN]].sub1
; UNPACKED-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_D16_XYZW_gfx80_BOTHEN]].sub2
; UNPACKED-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_D16_XYZW_gfx80_BOTHEN]].sub3
- ; UNPACKED-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 65535
- ; UNPACKED-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; UNPACKED-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY7]], [[COPY11]], implicit $exec
- ; UNPACKED-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; UNPACKED-NEXT: [[V_AND_B32_e64_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY8]], [[COPY12]], implicit $exec
- ; UNPACKED-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 16
- ; UNPACKED-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
- ; UNPACKED-NEXT: [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[COPY13]], [[V_AND_B32_e64_1]], implicit $exec
+ ; UNPACKED-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 65535, implicit $exec
+ ; UNPACKED-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 65535, implicit $exec
+ ; UNPACKED-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY7]], [[V_MOV_B32_e32_1]], implicit $exec
+ ; UNPACKED-NEXT: [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 65535, implicit $exec
+ ; UNPACKED-NEXT: [[V_AND_B32_e64_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY8]], [[V_MOV_B32_e32_2]], implicit $exec
+ ; UNPACKED-NEXT: [[V_MOV_B32_e32_3:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 16, implicit $exec
+ ; UNPACKED-NEXT: [[V_MOV_B32_e32_4:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 16, implicit $exec
+ ; UNPACKED-NEXT: [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[V_MOV_B32_e32_4]], [[V_AND_B32_e64_1]], implicit $exec
; UNPACKED-NEXT: [[V_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_AND_B32_e64_]], [[V_LSHLREV_B32_e64_]], implicit $exec
- ; UNPACKED-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; UNPACKED-NEXT: [[V_AND_B32_e64_2:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY9]], [[COPY14]], implicit $exec
- ; UNPACKED-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; UNPACKED-NEXT: [[V_AND_B32_e64_3:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY10]], [[COPY15]], implicit $exec
- ; UNPACKED-NEXT: [[COPY16:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
- ; UNPACKED-NEXT: [[V_LSHLREV_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[COPY16]], [[V_AND_B32_e64_3]], implicit $exec
+ ; UNPACKED-NEXT: [[V_MOV_B32_e32_5:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 65535, implicit $exec
+ ; UNPACKED-NEXT: [[V_AND_B32_e64_2:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY9]], [[V_MOV_B32_e32_5]], implicit $exec
+ ; UNPACKED-NEXT: [[V_AND_B32_e64_3:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY10]], [[V_MOV_B32_e32_]], implicit $exec
+ ; UNPACKED-NEXT: [[V_LSHLREV_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[V_MOV_B32_e32_3]], [[V_AND_B32_e64_3]], implicit $exec
; UNPACKED-NEXT: [[V_OR_B32_e64_1:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_AND_B32_e64_2]], [[V_LSHLREV_B32_e64_1]], implicit $exec
; UNPACKED-NEXT: $vgpr0 = COPY [[V_OR_B32_e64_]]
; UNPACKED-NEXT: $vgpr1 = COPY [[V_OR_B32_e64_1]]
@@ -168,9 +164,8 @@
; PACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
; PACKED-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; PACKED-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
- ; PACKED-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
- ; PACKED-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; PACKED-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY4]], %subreg.sub1
+ ; PACKED-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ ; PACKED-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[COPY4]], %subreg.sub1
; PACKED-NEXT: [[TBUFFER_LOAD_FORMAT_D16_X_BOTHEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_D16_X_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY5]], 0, 78, 0, 0, implicit $exec :: (dereferenceable load (s16), align 1, addrspace 8)
; PACKED-NEXT: $vgpr0 = COPY [[TBUFFER_LOAD_FORMAT_D16_X_BOTHEN]]
; PACKED-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
@@ -185,9 +180,8 @@
; UNPACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
; UNPACKED-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; UNPACKED-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
- ; UNPACKED-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
- ; UNPACKED-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; UNPACKED-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY4]], %subreg.sub1
+ ; UNPACKED-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ ; UNPACKED-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[COPY4]], %subreg.sub1
; UNPACKED-NEXT: [[TBUFFER_LOAD_FORMAT_D16_X_gfx80_BOTHEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_D16_X_gfx80_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY5]], 0, 78, 0, 0, implicit $exec :: (dereferenceable load (s16), align 1, addrspace 8)
; UNPACKED-NEXT: $vgpr0 = COPY [[TBUFFER_LOAD_FORMAT_D16_X_gfx80_BOTHEN]]
; UNPACKED-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
@@ -307,21 +301,19 @@
; UNPACKED-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_D16_XYZW_gfx80_BOTHEN]].sub1
; UNPACKED-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_D16_XYZW_gfx80_BOTHEN]].sub2
; UNPACKED-NEXT: [[COPY16:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_D16_XYZW_gfx80_BOTHEN]].sub3
- ; UNPACKED-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 65535
- ; UNPACKED-NEXT: [[COPY17:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; UNPACKED-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY13]], [[COPY17]], implicit $exec
- ; UNPACKED-NEXT: [[COPY18:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; UNPACKED-NEXT: [[V_AND_B32_e64_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY14]], [[COPY18]], implicit $exec
- ; UNPACKED-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 16
- ; UNPACKED-NEXT: [[COPY19:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
- ; UNPACKED-NEXT: [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[COPY19]], [[V_AND_B32_e64_1]], implicit $exec
+ ; UNPACKED-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 65535, implicit $exec
+ ; UNPACKED-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 65535, implicit $exec
+ ; UNPACKED-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY13]], [[V_MOV_B32_e32_1]], implicit $exec
+ ; UNPACKED-NEXT: [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 65535, implicit $exec
+ ; UNPACKED-NEXT: [[V_AND_B32_e64_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY14]], [[V_MOV_B32_e32_2]], implicit $exec
+ ; UNPACKED-NEXT: [[V_MOV_B32_e32_3:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 16, implicit $exec
+ ; UNPACKED-NEXT: [[V_MOV_B32_e32_4:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 16, implicit $exec
+ ; UNPACKED-NEXT: [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[V_MOV_B32_e32_4]], [[V_AND_B32_e64_1]], implicit $exec
; UNPACKED-NEXT: [[V_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_AND_B32_e64_]], [[V_LSHLREV_B32_e64_]], implicit $exec
- ; UNPACKED-NEXT: [[COPY20:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; UNPACKED-NEXT: [[V_AND_B32_e64_2:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY15]], [[COPY20]], implicit $exec
- ; UNPACKED-NEXT: [[COPY21:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; UNPACKED-NEXT: [[V_AND_B32_e64_3:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY16]], [[COPY21]], implicit $exec
- ; UNPACKED-NEXT: [[COPY22:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
- ; UNPACKED-NEXT: [[V_LSHLREV_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[COPY22]], [[V_AND_B32_e64_3]], implicit $exec
+ ; UNPACKED-NEXT: [[V_MOV_B32_e32_5:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 65535, implicit $exec
+ ; UNPACKED-NEXT: [[V_AND_B32_e64_2:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY15]], [[V_MOV_B32_e32_5]], implicit $exec
+ ; UNPACKED-NEXT: [[V_AND_B32_e64_3:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY16]], [[V_MOV_B32_e32_]], implicit $exec
+ ; UNPACKED-NEXT: [[V_LSHLREV_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[V_MOV_B32_e32_3]], [[V_AND_B32_e64_3]], implicit $exec
; UNPACKED-NEXT: [[V_OR_B32_e64_1:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_AND_B32_e64_2]], [[V_LSHLREV_B32_e64_1]], implicit $exec
; UNPACKED-NEXT: $vgpr0 = COPY [[V_OR_B32_e64_]]
; UNPACKED-NEXT: $vgpr1 = COPY [[V_OR_B32_e64_1]]
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.tbuffer.load.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.tbuffer.load.ll
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.tbuffer.load.ll
@@ -114,9 +114,8 @@
; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; CHECK-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
- ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
- ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY4]], %subreg.sub1
+ ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[COPY4]], %subreg.sub1
; CHECK-NEXT: [[TBUFFER_LOAD_FORMAT_X_BOTHEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY5]], 0, 78, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
; CHECK-NEXT: $vgpr0 = COPY [[TBUFFER_LOAD_FORMAT_X_BOTHEN]]
; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot4.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot4.ll
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot4.ll
@@ -43,19 +43,19 @@
; GFX906-LABEL: v_udot4_cast_v4i8:
; GFX906: ; %bb.0:
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX906-NEXT: s_mov_b32 s5, 8
-; GFX906-NEXT: s_movk_i32 s4, 0xff
-; GFX906-NEXT: v_lshlrev_b32_sdwa v1, s5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX906-NEXT: v_and_or_b32 v0, v0, s4, v1
+; GFX906-NEXT: v_mov_b32_e32 v10, 8
+; GFX906-NEXT: v_mov_b32_e32 v9, 0xff
+; GFX906-NEXT: v_lshlrev_b32_sdwa v1, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT: v_and_or_b32 v0, v0, v9, v1
; GFX906-NEXT: v_and_b32_e32 v1, 0xff, v2
; GFX906-NEXT: v_and_b32_e32 v2, 0xff, v3
; GFX906-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX906-NEXT: v_lshlrev_b32_e32 v2, 24, v2
; GFX906-NEXT: v_or3_b32 v0, v0, v1, v2
-; GFX906-NEXT: v_lshlrev_b32_sdwa v1, s5, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT: v_lshlrev_b32_sdwa v1, v10, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX906-NEXT: v_and_b32_e32 v2, 0xff, v6
; GFX906-NEXT: v_and_b32_e32 v3, 0xff, v7
-; GFX906-NEXT: v_and_or_b32 v1, v4, s4, v1
+; GFX906-NEXT: v_and_or_b32 v1, v4, v9, v1
; GFX906-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX906-NEXT: v_lshlrev_b32_e32 v3, 24, v3
; GFX906-NEXT: v_or3_b32 v1, v1, v2, v3
@@ -66,17 +66,17 @@
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: s_mov_b32 s4, 8
-; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX10-NEXT: v_and_or_b32 v0, v0, 0xff, v1
+; GFX10-NEXT: v_mov_b32_e32 v9, 8
+; GFX10-NEXT: v_lshlrev_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX10-NEXT: v_and_or_b32 v0, 0xff, v0, v1
; GFX10-NEXT: v_and_b32_e32 v1, 0xff, v2
; GFX10-NEXT: v_and_b32_e32 v2, 0xff, v3
-; GFX10-NEXT: v_lshlrev_b32_sdwa v3, s4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v9, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX10-NEXT: v_and_b32_e32 v5, 0xff, v6
; GFX10-NEXT: v_and_b32_e32 v6, 0xff, v7
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX10-NEXT: v_lshlrev_b32_e32 v2, 24, v2
-; GFX10-NEXT: v_and_or_b32 v3, v4, 0xff, v3
+; GFX10-NEXT: v_and_or_b32 v3, 0xff, v4, v3
; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v5
; GFX10-NEXT: v_lshlrev_b32_e32 v5, 24, v6
; GFX10-NEXT: v_or3_b32 v0, v0, v1, v2
@@ -96,10 +96,10 @@
; GFX11-NEXT: v_lshlrev_b32_e32 v5, 8, v5
; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX11-NEXT: v_lshlrev_b32_e32 v3, 24, v3
-; GFX11-NEXT: v_and_or_b32 v0, v0, 0xff, v1
+; GFX11-NEXT: v_and_or_b32 v0, 0xff, v0, v1
; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v6
; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v7
-; GFX11-NEXT: v_and_or_b32 v4, v4, 0xff, v5
+; GFX11-NEXT: v_and_or_b32 v4, 0xff, v4, v5
; GFX11-NEXT: v_or3_b32 v0, v0, v2, v3
; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-NEXT: v_lshlrev_b32_e32 v5, 24, v6
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wqm.demote.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wqm.demote.ll
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wqm.demote.ll
@@ -951,7 +951,6 @@
; GFX9-NEXT: s_mov_b64 s[0:1], exec
; GFX9-NEXT: s_wqm_b64 exec, exec
; GFX9-NEXT: v_cvt_i32_f32_e32 v0, v0
-; GFX9-NEXT: s_mov_b32 s4, 0
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
@@ -960,10 +959,11 @@
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], exec
; GFX9-NEXT: s_cbranch_scc0 .LBB7_9
; GFX9-NEXT: ; %bb.2: ; %.demote0
-; GFX9-NEXT: s_wqm_b64 s[6:7], s[0:1]
-; GFX9-NEXT: s_and_b64 exec, exec, s[6:7]
+; GFX9-NEXT: s_wqm_b64 s[4:5], s[0:1]
+; GFX9-NEXT: s_and_b64 exec, exec, s[4:5]
; GFX9-NEXT: .LBB7_3: ; %.continue0.preheader
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX9-NEXT: s_mov_b32 s4, 0
; GFX9-NEXT: s_mov_b64 s[2:3], 0
; GFX9-NEXT: v_mov_b32_e32 v0, s4
; GFX9-NEXT: s_branch .LBB7_5
@@ -1017,19 +1017,19 @@
; GFX10-32-NEXT: s_mov_b32 s0, exec_lo
; GFX10-32-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX10-32-NEXT: v_cvt_i32_f32_e32 v0, v0
-; GFX10-32-NEXT: s_mov_b32 s1, 0
; GFX10-32-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX10-32-NEXT: s_and_saveexec_b32 s2, vcc_lo
-; GFX10-32-NEXT: s_xor_b32 s2, exec_lo, s2
+; GFX10-32-NEXT: s_and_saveexec_b32 s1, vcc_lo
+; GFX10-32-NEXT: s_xor_b32 s1, exec_lo, s1
; GFX10-32-NEXT: s_cbranch_execz .LBB7_3
; GFX10-32-NEXT: ; %bb.1: ; %.demote0
; GFX10-32-NEXT: s_andn2_b32 s0, s0, exec_lo
; GFX10-32-NEXT: s_cbranch_scc0 .LBB7_9
; GFX10-32-NEXT: ; %bb.2: ; %.demote0
-; GFX10-32-NEXT: s_wqm_b32 s3, s0
-; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s3
+; GFX10-32-NEXT: s_wqm_b32 s2, s0
+; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s2
; GFX10-32-NEXT: .LBB7_3: ; %.continue0.preheader
-; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s2
+; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX10-32-NEXT: s_mov_b32 s1, 0
; GFX10-32-NEXT: v_mov_b32_e32 v0, s1
; GFX10-32-NEXT: s_branch .LBB7_5
; GFX10-32-NEXT: .LBB7_4: ; %.continue1
@@ -1080,7 +1080,6 @@
; GFX10-64-NEXT: s_mov_b64 s[0:1], exec
; GFX10-64-NEXT: s_wqm_b64 exec, exec
; GFX10-64-NEXT: v_cvt_i32_f32_e32 v0, v0
-; GFX10-64-NEXT: s_mov_b32 s4, 0
; GFX10-64-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
; GFX10-64-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX10-64-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
@@ -1089,11 +1088,12 @@
; GFX10-64-NEXT: s_andn2_b64 s[0:1], s[0:1], exec
; GFX10-64-NEXT: s_cbranch_scc0 .LBB7_9
; GFX10-64-NEXT: ; %bb.2: ; %.demote0
-; GFX10-64-NEXT: s_wqm_b64 s[6:7], s[0:1]
-; GFX10-64-NEXT: s_and_b64 exec, exec, s[6:7]
+; GFX10-64-NEXT: s_wqm_b64 s[4:5], s[0:1]
+; GFX10-64-NEXT: s_and_b64 exec, exec, s[4:5]
; GFX10-64-NEXT: .LBB7_3: ; %.continue0.preheader
; GFX10-64-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX10-64-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-64-NEXT: s_mov_b32 s2, 0
+; GFX10-64-NEXT: v_mov_b32_e32 v0, s2
; GFX10-64-NEXT: s_mov_b64 s[2:3], 0
; GFX10-64-NEXT: s_branch .LBB7_5
; GFX10-64-NEXT: .LBB7_4: ; %.continue1
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant32bit.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant32bit.ll
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant32bit.ll
@@ -11,8 +11,8 @@
; GFX6-LABEL: load_constant32bit_vgpr_offset:
; GFX6: ; %bb.0: ; %entry
; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX6-NEXT: s_mov_b32 s2, 0
; GFX6-NEXT: v_mov_b32_e32 v1, 0
+; GFX6-NEXT: s_mov_b32 s2, 0
; GFX6-NEXT: s_mov_b32 s3, 0xf000
; GFX6-NEXT: s_mov_b64 s[0:1], 0
; GFX6-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64
@@ -59,8 +59,8 @@
; GFX6-LABEL: load_constant32bit_vgpr_v8f32:
; GFX6: ; %bb.0: ; %entry
; GFX6-NEXT: v_mov_b32_e32 v4, v0
-; GFX6-NEXT: s_mov_b32 s2, 0
; GFX6-NEXT: v_mov_b32_e32 v5, 0
+; GFX6-NEXT: s_mov_b32 s2, 0
; GFX6-NEXT: s_mov_b32 s3, 0xf000
; GFX6-NEXT: s_mov_b64 s[0:1], 0
; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[4:5], s[0:3], 0 addr64
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/mubuf-global.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/GlobalISel/mubuf-global.ll
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/mubuf-global.ll
@@ -97,8 +97,8 @@
; GFX6-NEXT: v_mov_b32_e32 v0, s4
; GFX6-NEXT: s_mov_b32 s0, s2
; GFX6-NEXT: s_mov_b32 s1, s3
-; GFX6-NEXT: s_mov_b32 s2, 0
; GFX6-NEXT: v_mov_b32_e32 v2, 0
+; GFX6-NEXT: s_mov_b32 s2, 0
; GFX6-NEXT: s_mov_b32 s3, 0xf000
; GFX6-NEXT: v_mov_b32_e32 v1, s5
; GFX6-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
@@ -111,8 +111,8 @@
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: s_mov_b32 s0, s2
; GFX7-NEXT: s_mov_b32 s1, s3
-; GFX7-NEXT: s_mov_b32 s2, 0
; GFX7-NEXT: v_mov_b32_e32 v2, 0
+; GFX7-NEXT: s_mov_b32 s2, 0
; GFX7-NEXT: s_mov_b32 s3, 0xf000
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
@@ -152,8 +152,8 @@
define amdgpu_ps void @mubuf_store_vgpr_ptr_offset4095(ptr addrspace(1) %ptr) {
; GFX6-LABEL: mubuf_store_vgpr_ptr_offset4095:
; GFX6: ; %bb.0:
-; GFX6-NEXT: s_mov_b32 s2, 0
; GFX6-NEXT: v_mov_b32_e32 v2, 0
+; GFX6-NEXT: s_mov_b32 s2, 0
; GFX6-NEXT: s_mov_b32 s3, 0xf000
; GFX6-NEXT: s_mov_b64 s[0:1], 0
; GFX6-NEXT: s_movk_i32 s4, 0x3ffc
@@ -162,8 +162,8 @@
;
; GFX7-LABEL: mubuf_store_vgpr_ptr_offset4095:
; GFX7: ; %bb.0:
-; GFX7-NEXT: s_mov_b32 s2, 0
; GFX7-NEXT: v_mov_b32_e32 v2, 0
+; GFX7-NEXT: s_mov_b32 s2, 0
; GFX7-NEXT: s_mov_b32 s3, 0xf000
; GFX7-NEXT: s_mov_b64 s[0:1], 0
; GFX7-NEXT: s_movk_i32 s4, 0x3ffc
@@ -204,8 +204,8 @@
; GFX6: ; %bb.0:
; GFX6-NEXT: s_mov_b32 s0, 4
; GFX6-NEXT: s_mov_b32 s1, s0
-; GFX6-NEXT: s_mov_b32 s2, 0
; GFX6-NEXT: v_mov_b32_e32 v2, 0
+; GFX6-NEXT: s_mov_b32 s2, 0
; GFX6-NEXT: s_mov_b32 s3, 0xf000
; GFX6-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; GFX6-NEXT: s_endpgm
@@ -214,8 +214,8 @@
; GFX7: ; %bb.0:
; GFX7-NEXT: s_mov_b32 s0, 4
; GFX7-NEXT: s_mov_b32 s1, s0
-; GFX7-NEXT: s_mov_b32 s2, 0
; GFX7-NEXT: v_mov_b32_e32 v2, 0
+; GFX7-NEXT: s_mov_b32 s2, 0
; GFX7-NEXT: s_mov_b32 s3, 0xf000
; GFX7-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; GFX7-NEXT: s_endpgm
@@ -227,8 +227,8 @@
define amdgpu_ps void @mubuf_store_vgpr_ptr_offset4096(ptr addrspace(1) %ptr) {
; GFX6-LABEL: mubuf_store_vgpr_ptr_offset4096:
; GFX6: ; %bb.0:
-; GFX6-NEXT: s_mov_b32 s2, 0
; GFX6-NEXT: v_mov_b32_e32 v2, 0
+; GFX6-NEXT: s_mov_b32 s2, 0
; GFX6-NEXT: s_mov_b32 s3, 0xf000
; GFX6-NEXT: s_mov_b64 s[0:1], 0
; GFX6-NEXT: s_movk_i32 s4, 0x4000
@@ -237,8 +237,8 @@
;
; GFX7-LABEL: mubuf_store_vgpr_ptr_offset4096:
; GFX7: ; %bb.0:
-; GFX7-NEXT: s_mov_b32 s2, 0
; GFX7-NEXT: v_mov_b32_e32 v2, 0
+; GFX7-NEXT: s_mov_b32 s2, 0
; GFX7-NEXT: s_mov_b32 s3, 0xf000
; GFX7-NEXT: s_mov_b64 s[0:1], 0
; GFX7-NEXT: s_movk_i32 s4, 0x4000
@@ -257,8 +257,8 @@
; GFX6-NEXT: v_mov_b32_e32 v0, s4
; GFX6-NEXT: s_mov_b32 s0, s2
; GFX6-NEXT: s_mov_b32 s1, s3
-; GFX6-NEXT: s_mov_b32 s2, 0
; GFX6-NEXT: v_mov_b32_e32 v2, 0
+; GFX6-NEXT: s_mov_b32 s2, 0
; GFX6-NEXT: s_mov_b32 s3, 0xf000
; GFX6-NEXT: v_mov_b32_e32 v1, s5
; GFX6-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
@@ -271,8 +271,8 @@
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: s_mov_b32 s0, s2
; GFX7-NEXT: s_mov_b32 s1, s3
-; GFX7-NEXT: s_mov_b32 s2, 0
; GFX7-NEXT: v_mov_b32_e32 v2, 0
+; GFX7-NEXT: s_mov_b32 s2, 0
; GFX7-NEXT: s_mov_b32 s3, 0xf000
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
@@ -287,8 +287,8 @@
; GFX6: ; %bb.0:
; GFX6-NEXT: s_ashr_i32 s3, s2, 31
; GFX6-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
-; GFX6-NEXT: s_mov_b32 s2, 0
; GFX6-NEXT: v_mov_b32_e32 v2, 0
+; GFX6-NEXT: s_mov_b32 s2, 0
; GFX6-NEXT: s_mov_b32 s3, 0xf000
; GFX6-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; GFX6-NEXT: s_endpgm
@@ -297,8 +297,8 @@
; GFX7: ; %bb.0:
; GFX7-NEXT: s_ashr_i32 s3, s2, 31
; GFX7-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
-; GFX7-NEXT: s_mov_b32 s2, 0
; GFX7-NEXT: v_mov_b32_e32 v2, 0
+; GFX7-NEXT: s_mov_b32 s2, 0
; GFX7-NEXT: s_mov_b32 s3, 0xf000
; GFX7-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; GFX7-NEXT: s_endpgm
@@ -312,8 +312,8 @@
; GFX6: ; %bb.0:
; GFX6-NEXT: s_ashr_i32 s3, s2, 31
; GFX6-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
-; GFX6-NEXT: s_mov_b32 s2, 0
; GFX6-NEXT: v_mov_b32_e32 v2, 0
+; GFX6-NEXT: s_mov_b32 s2, 0
; GFX6-NEXT: s_mov_b32 s3, 0xf000
; GFX6-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 offset:1024
; GFX6-NEXT: s_endpgm
@@ -322,8 +322,8 @@
; GFX7: ; %bb.0:
; GFX7-NEXT: s_ashr_i32 s3, s2, 31
; GFX7-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
-; GFX7-NEXT: s_mov_b32 s2, 0
; GFX7-NEXT: v_mov_b32_e32 v2, 0
+; GFX7-NEXT: s_mov_b32 s2, 0
; GFX7-NEXT: s_mov_b32 s3, 0xf000
; GFX7-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 offset:1024
; GFX7-NEXT: s_endpgm
@@ -338,8 +338,8 @@
; GFX6: ; %bb.0:
; GFX6-NEXT: s_ashr_i32 s3, s2, 31
; GFX6-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
-; GFX6-NEXT: s_mov_b32 s2, 0
; GFX6-NEXT: v_mov_b32_e32 v2, 0
+; GFX6-NEXT: s_mov_b32 s2, 0
; GFX6-NEXT: s_mov_b32 s3, 0xf000
; GFX6-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 offset:1024
; GFX6-NEXT: s_endpgm
@@ -348,8 +348,8 @@
; GFX7: ; %bb.0:
; GFX7-NEXT: s_ashr_i32 s3, s2, 31
; GFX7-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
-; GFX7-NEXT: s_mov_b32 s2, 0
; GFX7-NEXT: v_mov_b32_e32 v2, 0
+; GFX7-NEXT: s_mov_b32 s2, 0
; GFX7-NEXT: s_mov_b32 s3, 0xf000
; GFX7-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 offset:1024
; GFX7-NEXT: s_endpgm
@@ -366,8 +366,8 @@
; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], 2
; GFX6-NEXT: s_mov_b32 s0, s2
; GFX6-NEXT: s_mov_b32 s1, s3
-; GFX6-NEXT: s_mov_b32 s2, 0
; GFX6-NEXT: v_mov_b32_e32 v2, 0
+; GFX6-NEXT: s_mov_b32 s2, 0
; GFX6-NEXT: s_mov_b32 s3, 0xf000
; GFX6-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; GFX6-NEXT: s_endpgm
@@ -378,8 +378,8 @@
; GFX7: ; %bb.0:
; GFX7-NEXT: v_lshl_b64 v[0:1], v[0:1], 2
; GFX7-NEXT: s_mov_b32 s0, s2
; GFX7-NEXT: s_mov_b32 s1, s3
-; GFX7-NEXT: s_mov_b32 s2, 0
; GFX7-NEXT: v_mov_b32_e32 v2, 0
+; GFX7-NEXT: s_mov_b32 s2, 0
; GFX7-NEXT: s_mov_b32 s3, 0xf000
; GFX7-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; GFX7-NEXT: s_endpgm
@@ -395,8 +395,8 @@
; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], 2
; GFX6-NEXT: s_mov_b32 s0, s2
; GFX6-NEXT: s_mov_b32 s1, s3
-; GFX6-NEXT: s_mov_b32 s2, 0
; GFX6-NEXT: v_mov_b32_e32 v2, 0
+; GFX6-NEXT: s_mov_b32 s2, 0
; GFX6-NEXT: s_mov_b32 s3, 0xf000
; GFX6-NEXT: s_movk_i32 s4, 0x3ffc
; GFX6-NEXT: buffer_store_dword v2, v[0:1], s[0:3], s4 addr64
@@ -408,8 +408,8 @@
; GFX7-NEXT: v_lshl_b64 v[0:1], v[0:1], 2
; GFX7-NEXT: s_mov_b32 s0, s2
; GFX7-NEXT: s_mov_b32 s1, s3
-; GFX7-NEXT: s_mov_b32 s2, 0
; GFX7-NEXT: v_mov_b32_e32 v2, 0
+; GFX7-NEXT: s_mov_b32 s2, 0
; GFX7-NEXT: s_mov_b32 s3, 0xf000
; GFX7-NEXT: s_movk_i32 s4, 0x3ffc
; GFX7-NEXT: buffer_store_dword v2, v[0:1], s[0:3], s4 addr64
@@ -426,8 +426,8 @@
; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], 2
; GFX6-NEXT: s_mov_b32 s0, s2
; GFX6-NEXT: s_mov_b32 s1, s3
-; GFX6-NEXT: s_mov_b32 s2, 0
; GFX6-NEXT: v_mov_b32_e32 v2, 0
+; GFX6-NEXT: s_mov_b32 s2, 0
; GFX6-NEXT: s_mov_b32 s3, 0xf000
; GFX6-NEXT: s_movk_i32 s4, 0x3ffc
; GFX6-NEXT: buffer_store_dword v2, v[0:1], s[0:3], s4 addr64
@@ -439,8 +439,8 @@
; GFX7-NEXT: v_lshl_b64 v[0:1], v[0:1], 2
; GFX7-NEXT: s_mov_b32 s0, s2
; GFX7-NEXT: s_mov_b32 s1, s3
-; GFX7-NEXT: s_mov_b32 s2, 0
; GFX7-NEXT: v_mov_b32_e32 v2, 0
+; GFX7-NEXT: s_mov_b32 s2, 0
; GFX7-NEXT: s_mov_b32 s3, 0xf000
; GFX7-NEXT: s_movk_i32 s4, 0x3ffc
; GFX7-NEXT: buffer_store_dword v2, v[0:1], s[0:3], s4 addr64
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/non-entry-alloca.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/GlobalISel/non-entry-alloca.ll
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/non-entry-alloca.ll
@@ -149,6 +149,7 @@
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: s_mov_b32 s7, s33
+; GCN-NEXT: v_mov_b32_e32 v5, 0
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
; GCN-NEXT: s_mov_b32 s33, s32
; GCN-NEXT: s_addk_i32 s32, 0x400
@@ -174,8 +175,7 @@
; GCN-NEXT: global_store_dword v[0:1], v2, off
; GCN-NEXT: .LBB2_3: ; %bb.2
; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
-; GCN-NEXT: v_mov_b32_e32 v0, 0
-; GCN-NEXT: global_store_dword v[0:1], v0, off
+; GCN-NEXT: global_store_dword v[0:1], v5, off
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_addk_i32 s32, 0xfc00
; GCN-NEXT: s_mov_b32 s33, s7
@@ -213,6 +213,7 @@
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: s_mov_b32 s7, s33
; GCN-NEXT: s_add_i32 s33, s32, 0xfc0
+; GCN-NEXT: v_mov_b32_e32 v4, 0
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
; GCN-NEXT: s_and_b32 s33, s33, 0xfffff000
; GCN-NEXT: s_addk_i32 s32, 0x2000
@@ -221,11 +222,10 @@
; GCN-NEXT: ; %bb.1: ; %bb.0
; GCN-NEXT: s_add_u32 s6, s32, 0x1000
; GCN-NEXT: s_and_b32 s6, s6, 0xfffff000
-; GCN-NEXT: v_mov_b32_e32 v2, 0
-; GCN-NEXT: v_mov_b32_e32 v4, s6
-; GCN-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
-; GCN-NEXT: v_mov_b32_e32 v2, 1
-; GCN-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen offset:4
+; GCN-NEXT: v_mov_b32_e32 v2, s6
+; GCN-NEXT: v_mov_b32_e32 v5, 1
+; GCN-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen offset:4
; GCN-NEXT: v_lshlrev_b32_e32 v2, 2, v3
; GCN-NEXT: v_add_u32_e32 v2, s6, v2
; GCN-NEXT: buffer_load_dword v2, v2, s[0:3], 0 offen
@@ -235,8 +235,7 @@
; GCN-NEXT: global_store_dword v[0:1], v2, off
; GCN-NEXT: .LBB3_2: ; %bb.1
; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
-; GCN-NEXT: v_mov_b32_e32 v0, 0
-; GCN-NEXT: global_store_dword v[0:1], v0, off
+; GCN-NEXT: global_store_dword v[0:1], v4, off
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_addk_i32 s32, 0xe000
; GCN-NEXT: s_mov_b32 s33, s7
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.div.fmas.mir
===================================================================
--- llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.div.fmas.mir
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.div.fmas.mir
@@ -47,13 +47,12 @@
; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr1
; CHECK-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
; CHECK-NEXT:
[[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 - ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) - ; CHECK-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY3]](s32), [[COPY4]] - ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[COPY]](s32) - ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[COPY1]](s32) - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr(s32) = COPY [[COPY2]](s32) - ; CHECK-NEXT: [[INT:%[0-9]+]]:vgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fmas), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32), [[ICMP]](s1) + ; CHECK-NEXT: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; CHECK-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY3]](s32), [[C]] + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY [[COPY]](s32) + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[COPY1]](s32) + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[COPY2]](s32) + ; CHECK-NEXT: [[INT:%[0-9]+]]:vgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fmas), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[ICMP]](s1) %0:_(s32) = COPY $sgpr0 %1:_(s32) = COPY $sgpr1 %2:_(s32) = COPY $sgpr2 @@ -77,12 +76,11 @@ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr1 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 - ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) - ; CHECK-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY3]](s32), [[COPY4]] - ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[COPY1]](s32) - ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[COPY2]](s32) - ; CHECK-NEXT: [[INT:%[0-9]+]]:vgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fmas), [[COPY]](s32), [[COPY5]](s32), [[COPY6]](s32), [[ICMP]](s1) + ; CHECK-NEXT: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; CHECK-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY3]](s32), [[C]] + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY [[COPY1]](s32) + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[COPY2]](s32) + ; CHECK-NEXT: [[INT:%[0-9]+]]:vgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fmas), [[COPY]](s32), [[COPY4]](s32), [[COPY5]](s32), [[ICMP]](s1) %0:_(s32) = COPY $vgpr0 %1:_(s32) = COPY $sgpr0 %2:_(s32) = COPY $sgpr1 @@ -106,9 +104,8 @@ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr2 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr3 - ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) - ; CHECK-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY3]](s32), [[COPY4]] + ; CHECK-NEXT: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; CHECK-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY3]](s32), [[C]] ; CHECK-NEXT: [[INT:%[0-9]+]]:vgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fmas), [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[ICMP]](s1) %0:_(s32) = COPY $vgpr0 %1:_(s32) = COPY $vgpr1 Index: llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.raw.buffer.load.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.raw.buffer.load.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.raw.buffer.load.ll @@ -15,9 +15,8 @@ ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; 
CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr(s32) = COPY $sgpr6 - ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 - ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) - ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[COPY6]](s32), [[COPY4]], [[COPY5]], 0, 0, 0 :: (dereferenceable load (s32), align 1, addrspace 8) + ; CHECK-NEXT: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C]](s32), [[COPY4]], [[COPY5]], 0, 0, 0 :: (dereferenceable load (s32), align 1, addrspace 8) ; CHECK-NEXT: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32) ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 %val = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) @@ -37,10 +36,9 @@ ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr(s32) = COPY $sgpr6 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr(s32) = COPY $sgpr7 - ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 - ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr(s32) = COPY [[COPY4]](s32) - ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[COPY6]](s32), [[COPY7]], [[COPY5]], 0, 0, 0 :: (dereferenceable load (s32), align 1, addrspace 8) + ; CHECK-NEXT: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[COPY4]](s32) + ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C]](s32), [[COPY6]], [[COPY5]], 0, 0, 0 :: (dereferenceable load (s32), align 1, addrspace 8) ; CHECK-NEXT: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32) ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 %val = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) @@ -61,15 +59,14 @@ ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr4 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr(s32) = COPY $sgpr2 - ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 - ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) + ; CHECK-NEXT: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF ; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: ; CHECK-NEXT: successors: %bb.3(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %15, %bb.3 + ; CHECK-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %14, %bb.3 ; CHECK-NEXT: [[UV:%[0-9]+]]:vgpr_32(s32), [[UV1:%[0-9]+]]:vgpr_32(s32), [[UV2:%[0-9]+]]:vgpr_32(s32), [[UV3:%[0-9]+]]:vgpr_32(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>) ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec @@ -87,7 +84,7 @@ ; CHECK-NEXT: bb.3: ; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD 
[[BUILD_VECTOR1]](<4 x s32>), [[COPY6]](s32), [[COPY4]], [[COPY5]], 0, 0, 0 :: (dereferenceable load (s32), align 1, addrspace 8) + ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C]](s32), [[COPY4]], [[COPY5]], 0, 0, 0 :: (dereferenceable load (s32), align 1, addrspace 8) ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; CHECK-NEXT: {{ $}} @@ -117,15 +114,14 @@ ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 - ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 - ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) + ; CHECK-NEXT: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF ; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: ; CHECK-NEXT: successors: %bb.3(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %15, %bb.3 + ; CHECK-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %14, %bb.3 ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[COPY5]](s32), implicit $exec ; CHECK-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[V_READFIRSTLANE_B32_]](s32), [[COPY5]] ; CHECK-NEXT: [[INT:%[0-9]+]]:sreg_64_xexec(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.ballot), [[ICMP]](s1) @@ -134,7 +130,7 @@ ; CHECK-NEXT: bb.3: ; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[COPY6]](s32), [[COPY4]], [[V_READFIRSTLANE_B32_]], 0, 0, 0 :: (dereferenceable load (s32), align 1, addrspace 8) + ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C]](s32), [[COPY4]], [[V_READFIRSTLANE_B32_]], 0, 0, 0 :: (dereferenceable load (s32), align 1, addrspace 8) ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; CHECK-NEXT: {{ $}} @@ -164,15 +160,14 @@ ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr4 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr5 - ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 - ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) + ; CHECK-NEXT: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF ; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: ; CHECK-NEXT: successors: %bb.3(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %15, %bb.3 + ; CHECK-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %14, %bb.3 ; CHECK-NEXT: [[UV:%[0-9]+]]:vgpr_32(s32), [[UV1:%[0-9]+]]:vgpr_32(s32), [[UV2:%[0-9]+]]:vgpr_32(s32), [[UV3:%[0-9]+]]:vgpr_32(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>) ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec ; CHECK-NEXT: 
[[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec @@ -193,7 +188,7 @@ ; CHECK-NEXT: bb.3: ; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[COPY6]](s32), [[COPY4]], [[V_READFIRSTLANE_B32_4]], 0, 0, 0 :: (dereferenceable load (s32), align 1, addrspace 8) + ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C]](s32), [[COPY4]], [[V_READFIRSTLANE_B32_4]], 0, 0, 0 :: (dereferenceable load (s32), align 1, addrspace 8) ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; CHECK-NEXT: {{ $}} Index: llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.raw.ptr.buffer.load.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.raw.ptr.buffer.load.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.raw.ptr.buffer.load.ll @@ -15,9 +15,8 @@ ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr(s32) = COPY $sgpr6 ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) - ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 - ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) - ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[COPY6]](s32), [[COPY4]], [[COPY5]], 0, 0, 0 :: (dereferenceable load (s32) from %ir.rsrc, align 1, addrspace 8) + ; CHECK-NEXT: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C]](s32), [[COPY4]], [[COPY5]], 0, 0, 0 :: (dereferenceable load (s32) from %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32) ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 %val = call float @llvm.amdgcn.raw.ptr.buffer.load.f32(ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0) @@ -37,10 +36,9 @@ ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr(s32) = COPY $sgpr6 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr(s32) = COPY $sgpr7 ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) - ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 - ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr(s32) = COPY [[COPY4]](s32) - ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[COPY6]](s32), [[COPY7]], [[COPY5]], 0, 0, 0 :: (dereferenceable load (s32) from %ir.rsrc, align 1, addrspace 8) + ; CHECK-NEXT: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[COPY4]](s32) + ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C]](s32), [[COPY6]], [[COPY5]], 0, 0, 0 :: (dereferenceable load (s32) from %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32) ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 %val = call float @llvm.amdgcn.raw.ptr.buffer.load.f32(ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0) @@ -61,15 +59,14 @@ ; CHECK-NEXT: 
[[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr4 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr(s32) = COPY $sgpr2 ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) - ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 - ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) + ; CHECK-NEXT: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF ; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: ; CHECK-NEXT: successors: %bb.3(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %20, %bb.3 + ; CHECK-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %19, %bb.3 ; CHECK-NEXT: [[UV:%[0-9]+]]:vgpr_32(s32), [[UV1:%[0-9]+]]:vgpr_32(s32), [[UV2:%[0-9]+]]:vgpr_32(s32), [[UV3:%[0-9]+]]:vgpr_32(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>) ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec @@ -87,7 +84,7 @@ ; CHECK-NEXT: bb.3: ; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[COPY6]](s32), [[COPY4]], [[COPY5]], 0, 0, 0 :: (dereferenceable load (s32) from %ir.rsrc, align 1, addrspace 8) + ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C]](s32), [[COPY4]], [[COPY5]], 0, 0, 0 :: (dereferenceable load (s32) from %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; CHECK-NEXT: {{ $}} @@ -117,15 +114,14 @@ ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) - ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 - ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) + ; CHECK-NEXT: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF ; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: ; CHECK-NEXT: successors: %bb.3(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %20, %bb.3 + ; CHECK-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %19, %bb.3 ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[COPY5]](s32), implicit $exec ; CHECK-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[V_READFIRSTLANE_B32_]](s32), [[COPY5]] ; CHECK-NEXT: [[INT:%[0-9]+]]:sreg_64_xexec(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.ballot), [[ICMP]](s1) @@ -134,7 +130,7 @@ ; CHECK-NEXT: bb.3: ; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[COPY6]](s32), [[COPY4]], [[V_READFIRSTLANE_B32_]], 0, 0, 0 :: (dereferenceable load (s32) from %ir.rsrc, align 1, addrspace 8) + ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD 
[[BUILD_VECTOR]](<4 x s32>), [[C]](s32), [[COPY4]], [[V_READFIRSTLANE_B32_]], 0, 0, 0 :: (dereferenceable load (s32) from %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; CHECK-NEXT: {{ $}} @@ -164,15 +160,14 @@ ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr4 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr5 ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) - ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 - ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) + ; CHECK-NEXT: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF ; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: ; CHECK-NEXT: successors: %bb.3(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %20, %bb.3 + ; CHECK-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %19, %bb.3 ; CHECK-NEXT: [[UV:%[0-9]+]]:vgpr_32(s32), [[UV1:%[0-9]+]]:vgpr_32(s32), [[UV2:%[0-9]+]]:vgpr_32(s32), [[UV3:%[0-9]+]]:vgpr_32(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>) ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec @@ -193,7 +188,7 @@ ; CHECK-NEXT: bb.3: ; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[COPY6]](s32), [[COPY4]], [[V_READFIRSTLANE_B32_4]], 0, 0, 0 :: (dereferenceable load (s32) from %ir.rsrc, align 1, addrspace 8) + ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C]](s32), [[COPY4]], [[V_READFIRSTLANE_B32_4]], 0, 0, 0 :: (dereferenceable load (s32) from %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; CHECK-NEXT: {{ $}} Index: llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.s.buffer.load.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.s.buffer.load.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.s.buffer.load.ll @@ -640,9 +640,8 @@ ; GFX7-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5 ; GFX7-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 - ; GFX7-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4092 - ; GFX7-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) - ; GFX7-NEXT: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY5]] + ; GFX7-NEXT: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 4092 + ; GFX7-NEXT: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[C]] ; GFX7-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 ; GFX7-NEXT: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 ; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[COPY4]], [[C1]], 4092, 0, 0 :: (dereferenceable invariant load (s32)) @@ -664,9 +663,8 @@ ; GFX7-NEXT: 
[[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5 ; GFX7-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 - ; GFX7-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4095 - ; GFX7-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) - ; GFX7-NEXT: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY5]] + ; GFX7-NEXT: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 4095 + ; GFX7-NEXT: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[C]] ; GFX7-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 ; GFX7-NEXT: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 ; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[COPY4]], [[C1]], 4095, 0, 0 :: (dereferenceable invariant load (s32)) @@ -688,11 +686,11 @@ ; GFX7-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5 ; GFX7-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 - ; GFX7-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4096 - ; GFX7-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) - ; GFX7-NEXT: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY5]] - ; GFX7-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 - ; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s32)) + ; GFX7-NEXT: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 4096 + ; GFX7-NEXT: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[C]] + ; GFX7-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 + ; GFX7-NEXT: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[ADD]], [[C1]], 0, 0, 0 :: (dereferenceable invariant load (s32)) ; GFX7-NEXT: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32) ; GFX7-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 %soffset = add i32 %soffset.base, 4096 @@ -712,9 +710,8 @@ ; GFX7-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5 ; GFX7-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 - ; GFX7-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4064 - ; GFX7-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) - ; GFX7-NEXT: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY5]] + ; GFX7-NEXT: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 4064 + ; GFX7-NEXT: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[C]] ; GFX7-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 ; GFX7-NEXT: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 ; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[COPY4]], [[C1]], 4064, 0, 0 :: (dereferenceable invariant load (s128), align 4) @@ -747,12 +744,12 @@ ; GFX7-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5 ; GFX7-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 - ; GFX7-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4068 - ; GFX7-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) - ; GFX7-NEXT: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY5]] - ; GFX7-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 
- ; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s128), align 4) - ; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 16, 0, 0 :: (dereferenceable invariant load (s128), align 4) + ; GFX7-NEXT: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 4068 + ; GFX7-NEXT: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[C]] + ; GFX7-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 + ; GFX7-NEXT: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[ADD]], [[C1]], 0, 0, 0 :: (dereferenceable invariant load (s128), align 4) + ; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[ADD]], [[C1]], 16, 0, 0 :: (dereferenceable invariant load (s128), align 4) ; GFX7-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<8 x s32>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<4 x s32>), [[AMDGPU_BUFFER_LOAD1]](<4 x s32>) ; GFX7-NEXT: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32), [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32), [[UV4:%[0-9]+]]:vgpr(s32), [[UV5:%[0-9]+]]:vgpr(s32), [[UV6:%[0-9]+]]:vgpr(s32), [[UV7:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<8 x s32>) ; GFX7-NEXT: $vgpr0 = COPY [[UV]](s32) @@ -780,9 +777,8 @@ ; GFX7-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5 ; GFX7-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 - ; GFX7-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4032 - ; GFX7-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) - ; GFX7-NEXT: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY5]] + ; GFX7-NEXT: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 4032 + ; GFX7-NEXT: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[C]] ; GFX7-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 ; GFX7-NEXT: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 ; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[COPY4]], [[C1]], 4032, 0, 0 :: (dereferenceable invariant load (s128), align 4) @@ -824,14 +820,14 @@ ; GFX7-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5 ; GFX7-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 - ; GFX7-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4036 - ; GFX7-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) - ; GFX7-NEXT: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY5]] - ; GFX7-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 - ; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s128), align 4) - ; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 16, 0, 0 :: (dereferenceable invariant load (s128), align 4) - ; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD2:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 32, 0, 0 :: (dereferenceable invariant load (s128) 
from unknown-address + 16, align 4) - ; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD3:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 48, 0, 0 :: (dereferenceable invariant load (s128) from unknown-address + 48, align 4) + ; GFX7-NEXT: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 4036 + ; GFX7-NEXT: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[C]] + ; GFX7-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 + ; GFX7-NEXT: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[ADD]], [[C1]], 0, 0, 0 :: (dereferenceable invariant load (s128), align 4) + ; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[ADD]], [[C1]], 16, 0, 0 :: (dereferenceable invariant load (s128), align 4) + ; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD2:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[ADD]], [[C1]], 32, 0, 0 :: (dereferenceable invariant load (s128) from unknown-address + 16, align 4) + ; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD3:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[ADD]], [[C1]], 48, 0, 0 :: (dereferenceable invariant load (s128) from unknown-address + 48, align 4) ; GFX7-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<16 x s32>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<4 x s32>), [[AMDGPU_BUFFER_LOAD1]](<4 x s32>), [[AMDGPU_BUFFER_LOAD2]](<4 x s32>), [[AMDGPU_BUFFER_LOAD3]](<4 x s32>) ; GFX7-NEXT: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32), [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32), [[UV4:%[0-9]+]]:vgpr(s32), [[UV5:%[0-9]+]]:vgpr(s32), [[UV6:%[0-9]+]]:vgpr(s32), [[UV7:%[0-9]+]]:vgpr(s32), [[UV8:%[0-9]+]]:vgpr(s32), [[UV9:%[0-9]+]]:vgpr(s32), [[UV10:%[0-9]+]]:vgpr(s32), [[UV11:%[0-9]+]]:vgpr(s32), [[UV12:%[0-9]+]]:vgpr(s32), [[UV13:%[0-9]+]]:vgpr(s32), [[UV14:%[0-9]+]]:vgpr(s32), [[UV15:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<16 x s32>) ; GFX7-NEXT: $vgpr0 = COPY [[UV]](s32) @@ -1319,10 +1315,10 @@ ; GFX7-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr3 ; GFX7-NEXT: [[BUILD_VECTOR:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr4 - ; GFX7-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 5000 - ; GFX7-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) - ; GFX7-NEXT: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY5]] - ; GFX7-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; GFX7-NEXT: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 5000 + ; GFX7-NEXT: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[C]] + ; GFX7-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 + ; GFX7-NEXT: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 ; GFX7-NEXT: [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX7-NEXT: {{ $}} @@ -1345,8 +1341,8 @@ ; GFX7-NEXT: bb.3: ; GFX7-NEXT: successors: %bb.4, %bb.2 ; GFX7-NEXT: {{ $}} - ; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s128), align 4) - ; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 16, 0, 0 :: (dereferenceable invariant load 
(s128), align 4) + ; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C2]](s32), [[ADD]], [[C1]], 0, 0, 0 :: (dereferenceable invariant load (s128), align 4) + ; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C2]](s32), [[ADD]], [[C1]], 16, 0, 0 :: (dereferenceable invariant load (s128), align 4) ; GFX7-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; GFX7-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; GFX7-NEXT: {{ $}} @@ -1381,10 +1377,10 @@ ; GFX7-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr3 ; GFX7-NEXT: [[BUILD_VECTOR:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr4 - ; GFX7-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4076 - ; GFX7-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) - ; GFX7-NEXT: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY5]] - ; GFX7-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; GFX7-NEXT: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 4076 + ; GFX7-NEXT: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[C]] + ; GFX7-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 + ; GFX7-NEXT: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 ; GFX7-NEXT: [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX7-NEXT: {{ $}} @@ -1407,8 +1403,8 @@ ; GFX7-NEXT: bb.3: ; GFX7-NEXT: successors: %bb.4, %bb.2 ; GFX7-NEXT: {{ $}} - ; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s128), align 4) - ; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 16, 0, 0 :: (dereferenceable invariant load (s128), align 4) + ; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C2]](s32), [[ADD]], [[C1]], 0, 0, 0 :: (dereferenceable invariant load (s128), align 4) + ; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C2]](s32), [[ADD]], [[C1]], 16, 0, 0 :: (dereferenceable invariant load (s128), align 4) ; GFX7-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; GFX7-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; GFX7-NEXT: {{ $}} @@ -1443,10 +1439,10 @@ ; GFX7-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr3 ; GFX7-NEXT: [[BUILD_VECTOR:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr4 - ; GFX7-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4080 - ; GFX7-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) - ; GFX7-NEXT: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY5]] - ; GFX7-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; GFX7-NEXT: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 4080 + ; GFX7-NEXT: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[C]] + ; GFX7-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 + ; GFX7-NEXT: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 ; GFX7-NEXT: [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX7-NEXT: {{ $}} @@ -1469,8 +1465,8 @@ ; GFX7-NEXT: bb.3: ; GFX7-NEXT: 
successors: %bb.4, %bb.2 ; GFX7-NEXT: {{ $}} - ; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s128), align 4) - ; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 16, 0, 0 :: (dereferenceable invariant load (s128), align 4) + ; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C2]](s32), [[ADD]], [[C1]], 0, 0, 0 :: (dereferenceable invariant load (s128), align 4) + ; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C2]](s32), [[ADD]], [[C1]], 16, 0, 0 :: (dereferenceable invariant load (s128), align 4) ; GFX7-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; GFX7-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; GFX7-NEXT: {{ $}} @@ -1614,9 +1610,8 @@ ; GFX7-NEXT: [[COPY5:%[0-9]+]]:sgpr(s32) = COPY $sgpr6 ; GFX7-NEXT: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[COPY5]](s32) ; GFX7-NEXT: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY6]] - ; GFX7-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1024 - ; GFX7-NEXT: [[COPY7:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) - ; GFX7-NEXT: [[ADD1:%[0-9]+]]:vgpr(s32) = G_ADD [[ADD]], [[COPY7]] + ; GFX7-NEXT: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 1024 + ; GFX7-NEXT: [[ADD1:%[0-9]+]]:vgpr(s32) = G_ADD [[ADD]], [[C]] ; GFX7-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 ; GFX7-NEXT: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 ; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[ADD]], [[C1]], 1024, 0, 0 :: (dereferenceable invariant load (s32)) @@ -1642,9 +1637,8 @@ ; GFX7-NEXT: [[COPY5:%[0-9]+]]:sgpr(s32) = COPY $sgpr6 ; GFX7-NEXT: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[COPY5]](s32) ; GFX7-NEXT: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY6]], [[COPY4]] - ; GFX7-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1024 - ; GFX7-NEXT: [[COPY7:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) - ; GFX7-NEXT: [[ADD1:%[0-9]+]]:vgpr(s32) = G_ADD [[ADD]], [[COPY7]] + ; GFX7-NEXT: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 1024 + ; GFX7-NEXT: [[ADD1:%[0-9]+]]:vgpr(s32) = G_ADD [[ADD]], [[C]] ; GFX7-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 ; GFX7-NEXT: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 ; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[ADD]], [[C1]], 1024, 0, 0 :: (dereferenceable invariant load (s32)) @@ -1671,9 +1665,8 @@ ; GFX7-NEXT: [[COPY5:%[0-9]+]]:sgpr(s32) = COPY $sgpr6 ; GFX7-NEXT: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[COPY5]](s32) ; GFX7-NEXT: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY6]], [[COPY4]] - ; GFX7-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1024 - ; GFX7-NEXT: [[COPY7:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) - ; GFX7-NEXT: [[ADD1:%[0-9]+]]:vgpr(s32) = G_ADD [[ADD]], [[COPY7]] + ; GFX7-NEXT: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 1024 + ; GFX7-NEXT: [[ADD1:%[0-9]+]]:vgpr(s32) = G_ADD [[ADD]], [[C]] ; GFX7-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 ; GFX7-NEXT: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 ; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[ADD]], [[C1]], 1024, 0, 0 :: (dereferenceable invariant load (s32)) @@ -1699,9 +1692,8 @@ ; GFX7-NEXT: 
[[COPY5:%[0-9]+]]:sgpr(s32) = COPY $sgpr6 ; GFX7-NEXT: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[COPY5]](s32) ; GFX7-NEXT: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY6]] - ; GFX7-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1024 - ; GFX7-NEXT: [[COPY7:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) - ; GFX7-NEXT: [[ADD1:%[0-9]+]]:vgpr(s32) = G_ADD [[ADD]], [[COPY7]] + ; GFX7-NEXT: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 1024 + ; GFX7-NEXT: [[ADD1:%[0-9]+]]:vgpr(s32) = G_ADD [[ADD]], [[C]] ; GFX7-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 ; GFX7-NEXT: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 ; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[ADD]], [[C1]], 1024, 0, 0 :: (dereferenceable invariant load (s32)) Index: llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.s.buffer.load.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.s.buffer.load.mir +++ llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.s.buffer.load.mir @@ -62,9 +62,8 @@ ; FAST-NEXT: {{ $}} ; FAST-NEXT: [[COPY:%[0-9]+]]:sgpr(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 ; FAST-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 - ; FAST-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 -60 - ; FAST-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) - ; FAST-NEXT: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY1]], [[COPY2]] + ; FAST-NEXT: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 -60 + ; FAST-NEXT: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY1]], [[C]] ; FAST-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 ; FAST-NEXT: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 ; FAST-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[COPY]](<4 x s32>), [[C2]](s32), [[ADD]], [[C1]], 0, 0, 0 :: (dereferenceable invariant load (s32)) @@ -74,9 +73,8 @@ ; GREEDY-NEXT: {{ $}} ; GREEDY-NEXT: [[COPY:%[0-9]+]]:sgpr(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 ; GREEDY-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 - ; GREEDY-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 -60 - ; GREEDY-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) - ; GREEDY-NEXT: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY1]], [[COPY2]] + ; GREEDY-NEXT: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 -60 + ; GREEDY-NEXT: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY1]], [[C]] ; GREEDY-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 ; GREEDY-NEXT: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 ; GREEDY-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[COPY]](<4 x s32>), [[C2]](s32), [[ADD]], [[C1]], 0, 0, 0 :: (dereferenceable invariant load (s32)) Index: llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-and-s1.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-and-s1.mir +++ llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-and-s1.mir @@ -93,11 +93,10 @@ ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 - ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) - ; CHECK-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY]](s32), [[COPY2]] - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) - ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY1]](s32), [[COPY3]] + ; CHECK-NEXT: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; CHECK-NEXT: 
[[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; CHECK-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY]](s32), [[C1]] + ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY1]](s32), [[C]] ; CHECK-NEXT: [[AND:%[0-9]+]]:vcc(s1) = G_AND [[ICMP]], [[ICMP1]] %0:_(s32) = COPY $vgpr0 %1:_(s32) = COPY $vgpr1 @@ -173,10 +172,10 @@ ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 ; CHECK-NEXT: [[ICMP:%[0-9]+]]:sgpr(s32) = G_ICMP intpred(eq), [[COPY]](s32), [[C]] ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:sgpr(s1) = G_TRUNC [[ICMP]](s32) - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) - ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY1]](s32), [[COPY2]] - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vcc(s1) = COPY [[TRUNC]](s1) - ; CHECK-NEXT: [[AND:%[0-9]+]]:vcc(s1) = G_AND [[COPY3]], [[ICMP1]] + ; CHECK-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY1]](s32), [[C1]] + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vcc(s1) = COPY [[TRUNC]](s1) + ; CHECK-NEXT: [[AND:%[0-9]+]]:vcc(s1) = G_AND [[COPY2]], [[ICMP1]] %0:_(s32) = COPY $sgpr0 %1:_(s32) = COPY $vgpr0 %2:_(s32) = G_CONSTANT i32 0 @@ -197,11 +196,10 @@ ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 - ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) - ; CHECK-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY]](s32), [[COPY2]] - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) - ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY1]](s32), [[COPY3]] + ; CHECK-NEXT: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; CHECK-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; CHECK-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY]](s32), [[C1]] + ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY1]](s32), [[C]] ; CHECK-NEXT: [[AND:%[0-9]+]]:vcc(s1) = G_AND [[ICMP]], [[ICMP1]] %0:_(s32) = COPY $vgpr0 %1:_(s32) = COPY $vgpr1 Index: llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-constant.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-constant.mir +++ llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-constant.mir @@ -12,9 +12,8 @@ ; CHECK: liveins: $vgpr0_vgpr1 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 - ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) - ; CHECK-NEXT: G_STORE [[COPY1]](s32), [[COPY]](p1) :: (store (s32)) + ; CHECK-NEXT: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 1 + ; CHECK-NEXT: G_STORE [[C]](s32), [[COPY]](p1) :: (store (s32)) %0:_(p1) = COPY $vgpr0_vgpr1 %1:_(s32) = G_CONSTANT i32 1 G_STORE %1, %0 :: (store (s32)) Index: llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-extract-vector-elt.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-extract-vector-elt.mir +++ llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-extract-vector-elt.mir @@ -571,9 +571,8 @@ ; WAVE64-NEXT: {{ $}} ; WAVE64-NEXT: [[COPY:%[0-9]+]]:vgpr(<16 x s32>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; WAVE64-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr16 - ; WAVE64-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1 - ; 
WAVE64-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) - ; WAVE64-NEXT: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY1]], [[COPY2]] + ; WAVE64-NEXT: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 1 + ; WAVE64-NEXT: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY1]], [[C]] ; WAVE64-NEXT: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32), [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32), [[UV4:%[0-9]+]]:vgpr(s32), [[UV5:%[0-9]+]]:vgpr(s32), [[UV6:%[0-9]+]]:vgpr(s32), [[UV7:%[0-9]+]]:vgpr(s32), [[UV8:%[0-9]+]]:vgpr(s32), [[UV9:%[0-9]+]]:vgpr(s32), [[UV10:%[0-9]+]]:vgpr(s32), [[UV11:%[0-9]+]]:vgpr(s32), [[UV12:%[0-9]+]]:vgpr(s32), [[UV13:%[0-9]+]]:vgpr(s32), [[UV14:%[0-9]+]]:vgpr(s32), [[UV15:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[COPY]](<16 x s32>) ; WAVE64-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1 ; WAVE64-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[C1]] @@ -620,16 +619,15 @@ ; WAVE64-NEXT: [[C15:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 15 ; WAVE64-NEXT: [[ICMP14:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[C15]] ; WAVE64-NEXT: [[SELECT14:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP14]](s1), [[UV15]], [[SELECT13]] - ; WAVE64-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[SELECT14]](s32) - ; WAVE64-NEXT: $vgpr0 = COPY [[COPY3]](s32) + ; WAVE64-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[SELECT14]](s32) + ; WAVE64-NEXT: $vgpr0 = COPY [[COPY2]](s32) ; WAVE32-LABEL: name: extract_vector_elt_v16s32_vv_idx_add1 ; WAVE32: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16 ; WAVE32-NEXT: {{ $}} ; WAVE32-NEXT: [[COPY:%[0-9]+]]:vgpr(<16 x s32>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; WAVE32-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr16 - ; WAVE32-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1 - ; WAVE32-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) - ; WAVE32-NEXT: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY1]], [[COPY2]] + ; WAVE32-NEXT: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 1 + ; WAVE32-NEXT: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY1]], [[C]] ; WAVE32-NEXT: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32), [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32), [[UV4:%[0-9]+]]:vgpr(s32), [[UV5:%[0-9]+]]:vgpr(s32), [[UV6:%[0-9]+]]:vgpr(s32), [[UV7:%[0-9]+]]:vgpr(s32), [[UV8:%[0-9]+]]:vgpr(s32), [[UV9:%[0-9]+]]:vgpr(s32), [[UV10:%[0-9]+]]:vgpr(s32), [[UV11:%[0-9]+]]:vgpr(s32), [[UV12:%[0-9]+]]:vgpr(s32), [[UV13:%[0-9]+]]:vgpr(s32), [[UV14:%[0-9]+]]:vgpr(s32), [[UV15:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[COPY]](<16 x s32>) ; WAVE32-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1 ; WAVE32-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[C1]] @@ -676,8 +674,8 @@ ; WAVE32-NEXT: [[C15:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 15 ; WAVE32-NEXT: [[ICMP14:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[C15]] ; WAVE32-NEXT: [[SELECT14:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP14]](s1), [[UV15]], [[SELECT13]] - ; WAVE32-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[SELECT14]](s32) - ; WAVE32-NEXT: $vgpr0 = COPY [[COPY3]](s32) + ; WAVE32-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[SELECT14]](s32) + ; WAVE32-NEXT: $vgpr0 = COPY [[COPY2]](s32) %0:_(<16 x s32>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 %1:_(s32) = COPY $vgpr16 %2:_(s32) = G_CONSTANT i32 1 @@ -700,9 +698,8 @@ ; WAVE64-NEXT: {{ $}} ; WAVE64-NEXT: [[COPY:%[0-9]+]]:vgpr(<16 x s32>) = COPY 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
 ; WAVE64-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr16
- ; WAVE64-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 -1
- ; WAVE64-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
- ; WAVE64-NEXT: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY1]], [[COPY2]]
+ ; WAVE64-NEXT: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 -1
+ ; WAVE64-NEXT: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY1]], [[C]]
 ; WAVE64-NEXT: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32), [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32), [[UV4:%[0-9]+]]:vgpr(s32), [[UV5:%[0-9]+]]:vgpr(s32), [[UV6:%[0-9]+]]:vgpr(s32), [[UV7:%[0-9]+]]:vgpr(s32), [[UV8:%[0-9]+]]:vgpr(s32), [[UV9:%[0-9]+]]:vgpr(s32), [[UV10:%[0-9]+]]:vgpr(s32), [[UV11:%[0-9]+]]:vgpr(s32), [[UV12:%[0-9]+]]:vgpr(s32), [[UV13:%[0-9]+]]:vgpr(s32), [[UV14:%[0-9]+]]:vgpr(s32), [[UV15:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[COPY]](<16 x s32>)
 ; WAVE64-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1
 ; WAVE64-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[C1]]
@@ -749,16 +746,15 @@
 ; WAVE64-NEXT: [[C15:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 15
 ; WAVE64-NEXT: [[ICMP14:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[C15]]
 ; WAVE64-NEXT: [[SELECT14:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP14]](s1), [[UV15]], [[SELECT13]]
- ; WAVE64-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[SELECT14]](s32)
- ; WAVE64-NEXT: $vgpr0 = COPY [[COPY3]](s32)
+ ; WAVE64-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[SELECT14]](s32)
+ ; WAVE64-NEXT: $vgpr0 = COPY [[COPY2]](s32)
 ; WAVE32-LABEL: name: extract_vector_elt_v16s32_vv_idx_addm1
 ; WAVE32: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16
 ; WAVE32-NEXT: {{ $}}
 ; WAVE32-NEXT: [[COPY:%[0-9]+]]:vgpr(<16 x s32>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
 ; WAVE32-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr16
- ; WAVE32-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 -1
- ; WAVE32-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
- ; WAVE32-NEXT: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY1]], [[COPY2]]
+ ; WAVE32-NEXT: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 -1
+ ; WAVE32-NEXT: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY1]], [[C]]
 ; WAVE32-NEXT: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32), [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32), [[UV4:%[0-9]+]]:vgpr(s32), [[UV5:%[0-9]+]]:vgpr(s32), [[UV6:%[0-9]+]]:vgpr(s32), [[UV7:%[0-9]+]]:vgpr(s32), [[UV8:%[0-9]+]]:vgpr(s32), [[UV9:%[0-9]+]]:vgpr(s32), [[UV10:%[0-9]+]]:vgpr(s32), [[UV11:%[0-9]+]]:vgpr(s32), [[UV12:%[0-9]+]]:vgpr(s32), [[UV13:%[0-9]+]]:vgpr(s32), [[UV14:%[0-9]+]]:vgpr(s32), [[UV15:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[COPY]](<16 x s32>)
 ; WAVE32-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1
 ; WAVE32-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[C1]]
@@ -805,8 +801,8 @@
 ; WAVE32-NEXT: [[C15:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 15
 ; WAVE32-NEXT: [[ICMP14:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[C15]]
 ; WAVE32-NEXT: [[SELECT14:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP14]](s1), [[UV15]], [[SELECT13]]
- ; WAVE32-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[SELECT14]](s32)
- ; WAVE32-NEXT: $vgpr0 = COPY [[COPY3]](s32)
+ ; WAVE32-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[SELECT14]](s32)
+ ; WAVE32-NEXT: $vgpr0 = COPY [[COPY2]](s32)
 %0:_(<16 x s32>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
 %1:_(s32) = COPY $vgpr16
 %2:_(s32) = G_CONSTANT i32 -1
@@ -829,9 +825,8 @@
 ; WAVE64-NEXT: {{ $}}
 ; WAVE64-NEXT: [[COPY:%[0-9]+]]:vgpr(<16 x s32>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
 ; WAVE64-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr16
- ; WAVE64-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 16
- ; WAVE64-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
- ; WAVE64-NEXT: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY1]], [[COPY2]]
+ ; WAVE64-NEXT: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 16
+ ; WAVE64-NEXT: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY1]], [[C]]
 ; WAVE64-NEXT: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32), [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32), [[UV4:%[0-9]+]]:vgpr(s32), [[UV5:%[0-9]+]]:vgpr(s32), [[UV6:%[0-9]+]]:vgpr(s32), [[UV7:%[0-9]+]]:vgpr(s32), [[UV8:%[0-9]+]]:vgpr(s32), [[UV9:%[0-9]+]]:vgpr(s32), [[UV10:%[0-9]+]]:vgpr(s32), [[UV11:%[0-9]+]]:vgpr(s32), [[UV12:%[0-9]+]]:vgpr(s32), [[UV13:%[0-9]+]]:vgpr(s32), [[UV14:%[0-9]+]]:vgpr(s32), [[UV15:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[COPY]](<16 x s32>)
 ; WAVE64-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1
 ; WAVE64-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[C1]]
@@ -878,16 +873,15 @@
 ; WAVE64-NEXT: [[C15:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 15
 ; WAVE64-NEXT: [[ICMP14:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[C15]]
 ; WAVE64-NEXT: [[SELECT14:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP14]](s1), [[UV15]], [[SELECT13]]
- ; WAVE64-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[SELECT14]](s32)
- ; WAVE64-NEXT: $vgpr0 = COPY [[COPY3]](s32)
+ ; WAVE64-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[SELECT14]](s32)
+ ; WAVE64-NEXT: $vgpr0 = COPY [[COPY2]](s32)
 ; WAVE32-LABEL: name: extract_vector_elt_v16s32_vv_idx_add16
 ; WAVE32: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16
 ; WAVE32-NEXT: {{ $}}
 ; WAVE32-NEXT: [[COPY:%[0-9]+]]:vgpr(<16 x s32>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
 ; WAVE32-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr16
- ; WAVE32-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 16
- ; WAVE32-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
- ; WAVE32-NEXT: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY1]], [[COPY2]]
+ ; WAVE32-NEXT: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 16
+ ; WAVE32-NEXT: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY1]], [[C]]
 ; WAVE32-NEXT: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32), [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32), [[UV4:%[0-9]+]]:vgpr(s32), [[UV5:%[0-9]+]]:vgpr(s32), [[UV6:%[0-9]+]]:vgpr(s32), [[UV7:%[0-9]+]]:vgpr(s32), [[UV8:%[0-9]+]]:vgpr(s32), [[UV9:%[0-9]+]]:vgpr(s32), [[UV10:%[0-9]+]]:vgpr(s32), [[UV11:%[0-9]+]]:vgpr(s32), [[UV12:%[0-9]+]]:vgpr(s32), [[UV13:%[0-9]+]]:vgpr(s32), [[UV14:%[0-9]+]]:vgpr(s32), [[UV15:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[COPY]](<16 x s32>)
 ; WAVE32-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1
 ; WAVE32-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[C1]]
@@ -934,8 +928,8 @@
 ; WAVE32-NEXT: [[C15:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 15
 ; WAVE32-NEXT: [[ICMP14:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[C15]]
 ; WAVE32-NEXT: [[SELECT14:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP14]](s1), [[UV15]], [[SELECT13]]
- ; WAVE32-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[SELECT14]](s32)
- ; WAVE32-NEXT: $vgpr0 = COPY [[COPY3]](s32)
+ ; WAVE32-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[SELECT14]](s32)
+ ; WAVE32-NEXT: $vgpr0 = COPY [[COPY2]](s32)
 %0:_(<16 x s32>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
 %1:_(s32) = COPY $vgpr16
 %2:_(s32) = G_CONSTANT i32 16
@@ -958,9 +952,8 @@
 ; WAVE64-NEXT: {{ $}}
 ; WAVE64-NEXT: [[COPY:%[0-9]+]]:vgpr(<8 x s64>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
 ; WAVE64-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr16
- ; WAVE64-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1
- ; WAVE64-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
- ; WAVE64-NEXT: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY1]], [[COPY2]]
+ ; WAVE64-NEXT: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 1
+ ; WAVE64-NEXT: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY1]], [[C]]
 ; WAVE64-NEXT: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32), [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32), [[UV4:%[0-9]+]]:vgpr(s32), [[UV5:%[0-9]+]]:vgpr(s32), [[UV6:%[0-9]+]]:vgpr(s32), [[UV7:%[0-9]+]]:vgpr(s32), [[UV8:%[0-9]+]]:vgpr(s32), [[UV9:%[0-9]+]]:vgpr(s32), [[UV10:%[0-9]+]]:vgpr(s32), [[UV11:%[0-9]+]]:vgpr(s32), [[UV12:%[0-9]+]]:vgpr(s32), [[UV13:%[0-9]+]]:vgpr(s32), [[UV14:%[0-9]+]]:vgpr(s32), [[UV15:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[COPY]](<8 x s64>)
 ; WAVE64-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1
 ; WAVE64-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[C1]]
@@ -990,18 +983,17 @@
 ; WAVE64-NEXT: [[ICMP6:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[C7]]
 ; WAVE64-NEXT: [[SELECT12:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP6]](s1), [[UV14]], [[SELECT10]]
 ; WAVE64-NEXT: [[SELECT13:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP6]](s1), [[UV15]], [[SELECT11]]
- ; WAVE64-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[SELECT12]](s32)
- ; WAVE64-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY [[SELECT13]](s32)
- ; WAVE64-NEXT: [[MV:%[0-9]+]]:vgpr(s64) = G_MERGE_VALUES [[COPY3]](s32), [[COPY4]](s32)
+ ; WAVE64-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[SELECT12]](s32)
+ ; WAVE64-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[SELECT13]](s32)
+ ; WAVE64-NEXT: [[MV:%[0-9]+]]:vgpr(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
 ; WAVE64-NEXT: $vgpr0_vgpr1 = COPY [[MV]](s64)
 ; WAVE32-LABEL: name: extract_vector_elt_v8s64_vv_idx_add1
 ; WAVE32: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16
 ; WAVE32-NEXT: {{ $}}
 ; WAVE32-NEXT: [[COPY:%[0-9]+]]:vgpr(<8 x s64>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
 ; WAVE32-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr16
- ; WAVE32-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1
- ; WAVE32-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
- ; WAVE32-NEXT: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY1]], [[COPY2]]
+ ; WAVE32-NEXT: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 1
+ ; WAVE32-NEXT: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY1]], [[C]]
 ; WAVE32-NEXT: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32), [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32), [[UV4:%[0-9]+]]:vgpr(s32), [[UV5:%[0-9]+]]:vgpr(s32), [[UV6:%[0-9]+]]:vgpr(s32), [[UV7:%[0-9]+]]:vgpr(s32), [[UV8:%[0-9]+]]:vgpr(s32), [[UV9:%[0-9]+]]:vgpr(s32), [[UV10:%[0-9]+]]:vgpr(s32), [[UV11:%[0-9]+]]:vgpr(s32), [[UV12:%[0-9]+]]:vgpr(s32), [[UV13:%[0-9]+]]:vgpr(s32), [[UV14:%[0-9]+]]:vgpr(s32), [[UV15:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[COPY]](<8 x s64>)
 ; WAVE32-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1
 ; WAVE32-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[C1]]
@@ -1031,9 +1023,9 @@
 ; WAVE32-NEXT: [[ICMP6:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[C7]]
 ; WAVE32-NEXT: [[SELECT12:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP6]](s1), [[UV14]], [[SELECT10]]
 ; WAVE32-NEXT: [[SELECT13:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP6]](s1), [[UV15]], [[SELECT11]]
- ; WAVE32-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[SELECT12]](s32)
- ; WAVE32-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY [[SELECT13]](s32)
- ; WAVE32-NEXT: [[MV:%[0-9]+]]:vgpr(s64) = G_MERGE_VALUES [[COPY3]](s32), [[COPY4]](s32)
+ ; WAVE32-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[SELECT12]](s32)
+ ; WAVE32-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[SELECT13]](s32)
+ ; WAVE32-NEXT: [[MV:%[0-9]+]]:vgpr(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
 ; WAVE32-NEXT: $vgpr0_vgpr1 = COPY [[MV]](s64)
 %0:_(<8 x s64>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
 %1:_(s32) = COPY $vgpr16
@@ -1057,9 +1049,8 @@
 ; WAVE64-NEXT: {{ $}}
 ; WAVE64-NEXT: [[COPY:%[0-9]+]]:sgpr(<16 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
 ; WAVE64-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
- ; WAVE64-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1
- ; WAVE64-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
- ; WAVE64-NEXT: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY1]], [[COPY2]]
+ ; WAVE64-NEXT: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 1
+ ; WAVE64-NEXT: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY1]], [[C]]
 ; WAVE64-NEXT: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32), [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32), [[UV4:%[0-9]+]]:vgpr(s32), [[UV5:%[0-9]+]]:vgpr(s32), [[UV6:%[0-9]+]]:vgpr(s32), [[UV7:%[0-9]+]]:vgpr(s32), [[UV8:%[0-9]+]]:vgpr(s32), [[UV9:%[0-9]+]]:vgpr(s32), [[UV10:%[0-9]+]]:vgpr(s32), [[UV11:%[0-9]+]]:vgpr(s32), [[UV12:%[0-9]+]]:vgpr(s32), [[UV13:%[0-9]+]]:vgpr(s32), [[UV14:%[0-9]+]]:vgpr(s32), [[UV15:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[COPY]](<16 x s32>)
 ; WAVE64-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1
 ; WAVE64-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[C1]]
@@ -1106,16 +1097,15 @@
 ; WAVE64-NEXT: [[C15:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 15
 ; WAVE64-NEXT: [[ICMP14:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[C15]]
 ; WAVE64-NEXT: [[SELECT14:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP14]](s1), [[UV15]], [[SELECT13]]
- ; WAVE64-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[SELECT14]](s32)
- ; WAVE64-NEXT: $vgpr0 = COPY [[COPY3]](s32)
+ ; WAVE64-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[SELECT14]](s32)
+ ; WAVE64-NEXT: $vgpr0 = COPY [[COPY2]](s32)
 ; WAVE32-LABEL: name: extract_vector_elt_v16s32_sv_idx_add1
 ; WAVE32: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15, $vgpr0
 ; WAVE32-NEXT: {{ $}}
 ; WAVE32-NEXT: [[COPY:%[0-9]+]]:sgpr(<16 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
 ; WAVE32-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
- ; WAVE32-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1
- ; WAVE32-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
- ; WAVE32-NEXT: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY1]], [[COPY2]]
+ ; WAVE32-NEXT: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 1
+ ; WAVE32-NEXT: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY1]], [[C]]
 ; WAVE32-NEXT: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32), [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32), [[UV4:%[0-9]+]]:vgpr(s32), [[UV5:%[0-9]+]]:vgpr(s32), [[UV6:%[0-9]+]]:vgpr(s32), [[UV7:%[0-9]+]]:vgpr(s32), [[UV8:%[0-9]+]]:vgpr(s32), [[UV9:%[0-9]+]]:vgpr(s32), [[UV10:%[0-9]+]]:vgpr(s32), [[UV11:%[0-9]+]]:vgpr(s32), [[UV12:%[0-9]+]]:vgpr(s32), [[UV13:%[0-9]+]]:vgpr(s32), [[UV14:%[0-9]+]]:vgpr(s32), [[UV15:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[COPY]](<16 x s32>)
 ; WAVE32-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1
 ; WAVE32-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[C1]]
@@ -1162,8 +1152,8 @@
 ; WAVE32-NEXT: [[C15:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 15
 ; WAVE32-NEXT: [[ICMP14:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[C15]]
 ; WAVE32-NEXT: [[SELECT14:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP14]](s1), [[UV15]], [[SELECT13]]
- ; WAVE32-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[SELECT14]](s32)
- ; WAVE32-NEXT: $vgpr0 = COPY [[COPY3]](s32)
+ ; WAVE32-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[SELECT14]](s32)
+ ; WAVE32-NEXT: $vgpr0 = COPY [[COPY2]](s32)
 %0:_(<16 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
 %1:_(s32) = COPY $vgpr0
 %2:_(s32) = G_CONSTANT i32 1
@@ -1186,9 +1176,8 @@
 ; WAVE64-NEXT: {{ $}}
 ; WAVE64-NEXT: [[COPY:%[0-9]+]]:sgpr(<8 x s64>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
 ; WAVE64-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
- ; WAVE64-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1
- ; WAVE64-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
- ; WAVE64-NEXT: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY1]], [[COPY2]]
+ ; WAVE64-NEXT: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 1
+ ; WAVE64-NEXT: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY1]], [[C]]
 ; WAVE64-NEXT: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32), [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32), [[UV4:%[0-9]+]]:vgpr(s32), [[UV5:%[0-9]+]]:vgpr(s32), [[UV6:%[0-9]+]]:vgpr(s32), [[UV7:%[0-9]+]]:vgpr(s32), [[UV8:%[0-9]+]]:vgpr(s32), [[UV9:%[0-9]+]]:vgpr(s32), [[UV10:%[0-9]+]]:vgpr(s32), [[UV11:%[0-9]+]]:vgpr(s32), [[UV12:%[0-9]+]]:vgpr(s32), [[UV13:%[0-9]+]]:vgpr(s32), [[UV14:%[0-9]+]]:vgpr(s32), [[UV15:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[COPY]](<8 x s64>)
 ; WAVE64-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1
 ; WAVE64-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[C1]]
@@ -1218,18 +1207,17 @@
 ; WAVE64-NEXT: [[ICMP6:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[C7]]
 ; WAVE64-NEXT: [[SELECT12:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP6]](s1), [[UV14]], [[SELECT10]]
 ; WAVE64-NEXT: [[SELECT13:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP6]](s1), [[UV15]], [[SELECT11]]
- ; WAVE64-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[SELECT12]](s32)
- ; WAVE64-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY [[SELECT13]](s32)
- ; WAVE64-NEXT: [[MV:%[0-9]+]]:vgpr(s64) = G_MERGE_VALUES [[COPY3]](s32), [[COPY4]](s32)
+ ; WAVE64-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[SELECT12]](s32)
+ ; WAVE64-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[SELECT13]](s32)
+ ; WAVE64-NEXT: [[MV:%[0-9]+]]:vgpr(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
 ; WAVE64-NEXT: $vgpr0_vgpr1 = COPY [[MV]](s64)
 ; WAVE32-LABEL: name: extract_vector_elt_v8s64_sv_add1
 ; WAVE32: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15, $vgpr0
 ; WAVE32-NEXT: {{ $}}
 ; WAVE32-NEXT: [[COPY:%[0-9]+]]:sgpr(<8 x s64>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
 ; WAVE32-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
- ; WAVE32-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1
- ; WAVE32-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
- ; WAVE32-NEXT: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY1]], [[COPY2]]
+ ; WAVE32-NEXT: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 1
+ ; WAVE32-NEXT: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY1]], [[C]]
 ; WAVE32-NEXT: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32), [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32), [[UV4:%[0-9]+]]:vgpr(s32), [[UV5:%[0-9]+]]:vgpr(s32), [[UV6:%[0-9]+]]:vgpr(s32), [[UV7:%[0-9]+]]:vgpr(s32), [[UV8:%[0-9]+]]:vgpr(s32), [[UV9:%[0-9]+]]:vgpr(s32), [[UV10:%[0-9]+]]:vgpr(s32), [[UV11:%[0-9]+]]:vgpr(s32), [[UV12:%[0-9]+]]:vgpr(s32), [[UV13:%[0-9]+]]:vgpr(s32), [[UV14:%[0-9]+]]:vgpr(s32), [[UV15:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[COPY]](<8 x s64>)
 ; WAVE32-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1
 ; WAVE32-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[C1]]
@@ -1259,9 +1247,9 @@
 ; WAVE32-NEXT: [[ICMP6:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[C7]]
 ; WAVE32-NEXT: [[SELECT12:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP6]](s1), [[UV14]], [[SELECT10]]
 ; WAVE32-NEXT: [[SELECT13:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP6]](s1), [[UV15]], [[SELECT11]]
- ; WAVE32-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[SELECT12]](s32)
- ; WAVE32-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY [[SELECT13]](s32)
- ; WAVE32-NEXT: [[MV:%[0-9]+]]:vgpr(s64) = G_MERGE_VALUES [[COPY3]](s32), [[COPY4]](s32)
+ ; WAVE32-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[SELECT12]](s32)
+ ; WAVE32-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[SELECT13]](s32)
+ ; WAVE32-NEXT: [[MV:%[0-9]+]]:vgpr(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
 ; WAVE32-NEXT: $vgpr0_vgpr1 = COPY [[MV]](s64)
 %0:_(<8 x s64>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
 %1:_(s32) = COPY $vgpr0
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-or.mir
===================================================================
--- llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-or.mir
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-or.mir
@@ -119,11 +119,10 @@
 ; CHECK-NEXT: {{ $}}
 ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1
- ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
- ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
- ; CHECK-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(ne), [[COPY]](s32), [[COPY2]]
- ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
- ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:vcc(s1) = G_ICMP intpred(ne), [[COPY1]](s32), [[COPY3]]
+ ; CHECK-NEXT: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(ne), [[COPY]](s32), [[C1]]
+ ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:vcc(s1) = G_ICMP intpred(ne), [[COPY1]](s32), [[C]]
 ; CHECK-NEXT: [[OR:%[0-9]+]]:vcc(s1) = G_OR [[ICMP]], [[ICMP1]]
 ; CHECK-NEXT: S_NOP 0, implicit [[OR]](s1)
 %0:_(s32) = COPY $vgpr0
@@ -150,10 +149,10 @@
 ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
 ; CHECK-NEXT: [[ICMP:%[0-9]+]]:sgpr(s32) = G_ICMP intpred(ne), [[COPY]](s32), [[C]]
 ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:sgpr(s1) = G_TRUNC [[ICMP]](s32)
- ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
- ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:vcc(s1) = G_ICMP intpred(ne), [[COPY1]](s32), [[COPY2]]
- ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vcc(s1) = COPY [[TRUNC]](s1)
- ; CHECK-NEXT: [[OR:%[0-9]+]]:vcc(s1) = G_OR [[COPY3]], [[ICMP1]]
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:vcc(s1) = G_ICMP intpred(ne), [[COPY1]](s32), [[C1]]
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vcc(s1) = COPY [[TRUNC]](s1)
+ ; CHECK-NEXT: [[OR:%[0-9]+]]:vcc(s1) = G_OR [[COPY2]], [[ICMP1]]
 ; CHECK-NEXT: S_NOP 0, implicit [[OR]](s1)
 %0:_(s32) = COPY $sgpr0
 %1:_(s32) = COPY $vgpr0
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-phi-s1.mir
===================================================================
--- llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-phi-s1.mir
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-phi-s1.mir
@@ -265,15 +265,15 @@
 ; FAST-NEXT: bb.1:
 ; FAST-NEXT: successors: %bb.2(0x80000000)
 ; FAST-NEXT: {{ $}}
- ; FAST-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
- ; FAST-NEXT: [[ICMP2:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY1]](s32), [[COPY4]]
+ ; FAST-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+ ; FAST-NEXT: [[ICMP2:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY1]](s32), [[C1]]
 ; FAST-NEXT: G_BR %bb.2
 ; FAST-NEXT: {{ $}}
 ; FAST-NEXT: bb.2:
 ; FAST-NEXT: [[PHI:%[0-9]+]]:vcc(s1) = G_PHI [[COPY3]](s1), %bb.0, [[ICMP2]](s1), %bb.1
- ; FAST-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
- ; FAST-NEXT: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[COPY]](s32)
- ; FAST-NEXT: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[PHI]](s1), [[COPY5]], [[COPY6]]
+ ; FAST-NEXT: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+ ; FAST-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY [[COPY]](s32)
+ ; FAST-NEXT: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[PHI]](s1), [[C2]], [[COPY4]]
 ; FAST-NEXT: S_SETPC_B64 undef $sgpr30_sgpr31, implicit [[SELECT]](s32)
 ; GREEDY-LABEL: name: phi_s1_scc_vcc_sbranch
 ; GREEDY: bb.0:
@@ -296,15 +296,15 @@
 ; GREEDY-NEXT: bb.1:
 ; GREEDY-NEXT: successors: %bb.2(0x80000000)
 ; GREEDY-NEXT: {{ $}}
- ; GREEDY-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
- ; GREEDY-NEXT: [[ICMP2:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY1]](s32), [[COPY4]]
+ ; GREEDY-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+ ; GREEDY-NEXT: [[ICMP2:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY1]](s32), [[C1]]
 ; GREEDY-NEXT: G_BR %bb.2
 ; GREEDY-NEXT: {{ $}}
 ; GREEDY-NEXT: bb.2:
 ; GREEDY-NEXT: [[PHI:%[0-9]+]]:vcc(s1) = G_PHI [[COPY3]](s1), %bb.0, [[ICMP2]](s1), %bb.1
- ; GREEDY-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
- ; GREEDY-NEXT: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[COPY]](s32)
- ; GREEDY-NEXT: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[PHI]](s1), [[COPY5]], [[COPY6]]
+ ; GREEDY-NEXT: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+ ; GREEDY-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY [[COPY]](s32)
+ ; GREEDY-NEXT: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[PHI]](s1), [[C2]], [[COPY4]]
 ; GREEDY-NEXT: S_SETPC_B64 undef $sgpr30_sgpr31, implicit [[SELECT]](s32)
 bb.0:
 successors: %bb.1, %bb.2
@@ -347,8 +347,8 @@
 ; FAST-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr0
 ; FAST-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr1
 ; FAST-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
- ; FAST-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
- ; FAST-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY]](s32), [[COPY3]]
+ ; FAST-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+ ; FAST-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY]](s32), [[C1]]
 ; FAST-NEXT: [[ICMP1:%[0-9]+]]:sgpr(s32) = G_ICMP intpred(eq), [[COPY2]](s32), [[C]]
 ; FAST-NEXT: [[TRUNC:%[0-9]+]]:sgpr(s1) = G_TRUNC [[ICMP1]](s32)
 ; FAST-NEXT: [[ZEXT:%[0-9]+]]:sgpr(s32) = G_ZEXT [[TRUNC]](s1)
@@ -360,14 +360,14 @@
 ; FAST-NEXT: {{ $}}
 ; FAST-NEXT: [[ICMP2:%[0-9]+]]:sgpr(s32) = G_ICMP intpred(eq), [[COPY1]](s32), [[C]]
 ; FAST-NEXT: [[TRUNC1:%[0-9]+]]:sgpr(s1) = G_TRUNC [[ICMP2]](s32)
- ; FAST-NEXT: [[COPY4:%[0-9]+]]:vcc(s1) = COPY [[TRUNC1]](s1)
+ ; FAST-NEXT: [[COPY3:%[0-9]+]]:vcc(s1) = COPY [[TRUNC1]](s1)
 ; FAST-NEXT: G_BR %bb.2
 ; FAST-NEXT: {{ $}}
 ; FAST-NEXT: bb.2:
- ; FAST-NEXT: [[PHI:%[0-9]+]]:vcc(s1) = G_PHI [[ICMP]](s1), %bb.0, [[COPY4]](s1), %bb.1
- ; FAST-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
- ; FAST-NEXT: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[COPY1]](s32)
- ; FAST-NEXT: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[PHI]](s1), [[COPY5]], [[COPY6]]
+ ; FAST-NEXT: [[PHI:%[0-9]+]]:vcc(s1) = G_PHI [[ICMP]](s1), %bb.0, [[COPY3]](s1), %bb.1
+ ; FAST-NEXT: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+ ; FAST-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY [[COPY1]](s32)
+ ; FAST-NEXT: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[PHI]](s1), [[C2]], [[COPY4]]
 ; FAST-NEXT: S_SETPC_B64 undef $sgpr30_sgpr31, implicit [[SELECT]](s32)
 ; GREEDY-LABEL: name: phi_s1_vcc_scc_sbranch
 ; GREEDY: bb.0:
@@ -378,8 +378,8 @@
 ; GREEDY-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr0
 ; GREEDY-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr1
 ; GREEDY-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
- ; GREEDY-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
- ; GREEDY-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY]](s32), [[COPY3]]
+ ; GREEDY-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+ ; GREEDY-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY]](s32), [[C1]]
 ; GREEDY-NEXT: [[ICMP1:%[0-9]+]]:sgpr(s32) = G_ICMP intpred(eq), [[COPY2]](s32), [[C]]
 ; GREEDY-NEXT: [[TRUNC:%[0-9]+]]:sgpr(s1) = G_TRUNC [[ICMP1]](s32)
 ; GREEDY-NEXT: [[ZEXT:%[0-9]+]]:sgpr(s32) = G_ZEXT [[TRUNC]](s1)
@@ -391,14 +391,14 @@
 ; GREEDY-NEXT: {{ $}}
 ; GREEDY-NEXT: [[ICMP2:%[0-9]+]]:sgpr(s32) = G_ICMP intpred(eq), [[COPY1]](s32), [[C]]
 ; GREEDY-NEXT: [[TRUNC1:%[0-9]+]]:sgpr(s1) = G_TRUNC [[ICMP2]](s32)
- ; GREEDY-NEXT: [[COPY4:%[0-9]+]]:vcc(s1) = COPY [[TRUNC1]](s1)
+ ; GREEDY-NEXT: [[COPY3:%[0-9]+]]:vcc(s1) = COPY [[TRUNC1]](s1)
 ; GREEDY-NEXT: G_BR %bb.2
 ; GREEDY-NEXT: {{ $}}
 ; GREEDY-NEXT: bb.2:
- ; GREEDY-NEXT: [[PHI:%[0-9]+]]:vcc(s1) = G_PHI [[ICMP]](s1), %bb.0, [[COPY4]](s1), %bb.1
- ; GREEDY-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
- ; GREEDY-NEXT: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[COPY1]](s32)
- ; GREEDY-NEXT: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[PHI]](s1), [[COPY5]], [[COPY6]]
+ ; GREEDY-NEXT: [[PHI:%[0-9]+]]:vcc(s1) = G_PHI [[ICMP]](s1), %bb.0, [[COPY3]](s1), %bb.1
+ ; GREEDY-NEXT: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+ ; GREEDY-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY [[COPY1]](s32)
+ ; GREEDY-NEXT: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[PHI]](s1), [[C2]], [[COPY4]]
 ; GREEDY-NEXT: S_SETPC_B64 undef $sgpr30_sgpr31, implicit [[SELECT]](s32)
 bb.0:
 successors: %bb.1, %bb.2
@@ -441,8 +441,8 @@
 ; FAST-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1
 ; FAST-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr0
 ; FAST-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
- ; FAST-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
- ; FAST-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY]](s32), [[COPY3]]
+ ; FAST-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+ ; FAST-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY]](s32), [[C1]]
 ; FAST-NEXT: [[ICMP1:%[0-9]+]]:sgpr(s32) = G_ICMP intpred(eq), [[COPY2]](s32), [[C]]
 ; FAST-NEXT: [[TRUNC:%[0-9]+]]:sgpr(s1) = G_TRUNC [[ICMP1]](s32)
 ; FAST-NEXT: [[ZEXT:%[0-9]+]]:sgpr(s32) = G_ZEXT [[TRUNC]](s1)
@@ -452,14 +452,14 @@
 ; FAST-NEXT: bb.1:
 ; FAST-NEXT: successors: %bb.2(0x80000000)
 ; FAST-NEXT: {{ $}}
- ; FAST-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
- ; FAST-NEXT: [[ICMP2:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY1]](s32), [[COPY4]]
+ ; FAST-NEXT: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+ ; FAST-NEXT: [[ICMP2:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY1]](s32), [[C2]]
 ; FAST-NEXT: G_BR %bb.2
 ; FAST-NEXT: {{ $}}
 ; FAST-NEXT: bb.2:
 ; FAST-NEXT: [[PHI:%[0-9]+]]:vcc(s1) = G_PHI [[ICMP]](s1), %bb.0, [[ICMP2]](s1), %bb.1
- ; FAST-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
- ; FAST-NEXT: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[PHI]](s1), [[COPY5]], [[COPY]]
+ ; FAST-NEXT: [[C3:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+ ; FAST-NEXT: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[PHI]](s1), [[C3]], [[COPY]]
 ; FAST-NEXT: S_SETPC_B64 undef $sgpr30_sgpr31, implicit [[SELECT]](s32)
 ; GREEDY-LABEL: name: phi_s1_vcc_vcc_sbranch
 ; GREEDY: bb.0:
@@ -470,8 +470,8 @@
 ; GREEDY-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1
 ; GREEDY-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr0
 ; GREEDY-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
- ; GREEDY-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
- ; GREEDY-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY]](s32), [[COPY3]]
+ ; GREEDY-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+ ; GREEDY-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY]](s32), [[C1]]
 ; GREEDY-NEXT: [[ICMP1:%[0-9]+]]:sgpr(s32) = G_ICMP intpred(eq), [[COPY2]](s32), [[C]]
 ; GREEDY-NEXT: [[TRUNC:%[0-9]+]]:sgpr(s1) = G_TRUNC [[ICMP1]](s32)
 ; GREEDY-NEXT: [[ZEXT:%[0-9]+]]:sgpr(s32) = G_ZEXT [[TRUNC]](s1)
@@ -481,14 +481,14 @@
 ; GREEDY-NEXT: bb.1:
 ; GREEDY-NEXT: successors: %bb.2(0x80000000)
 ; GREEDY-NEXT: {{ $}}
- ; GREEDY-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
- ; GREEDY-NEXT: [[ICMP2:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY1]](s32), [[COPY4]]
+ ; GREEDY-NEXT: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+ ; GREEDY-NEXT: [[ICMP2:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY1]](s32), [[C2]]
 ; GREEDY-NEXT: G_BR %bb.2
 ; GREEDY-NEXT: {{ $}}
 ; GREEDY-NEXT: bb.2:
 ; GREEDY-NEXT: [[PHI:%[0-9]+]]:vcc(s1) = G_PHI [[ICMP]](s1), %bb.0, [[ICMP2]](s1), %bb.1
- ; GREEDY-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
- ; GREEDY-NEXT: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[PHI]](s1), [[COPY5]], [[COPY]]
+ ; GREEDY-NEXT: [[C3:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+ ; GREEDY-NEXT: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[PHI]](s1), [[C3]], [[COPY]]
 ; GREEDY-NEXT: S_SETPC_B64 undef $sgpr30_sgpr31, implicit [[SELECT]](s32)
 bb.0:
 successors: %bb.1, %bb.2
@@ -739,9 +739,9 @@
 ; FAST-NEXT: [[PHI:%[0-9]+]]:vgpr(s32) = G_PHI [[ANYEXT]](s32), %bb.0, [[ANYEXT1]](s32), %bb.1
 ; FAST-NEXT: [[TRUNC3:%[0-9]+]]:vgpr(s1) = G_TRUNC [[PHI]](s32)
 ; FAST-NEXT: [[COPY3:%[0-9]+]]:vcc(s1) = COPY [[TRUNC3]](s1)
- ; FAST-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
- ; FAST-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[COPY]](s32)
- ; FAST-NEXT: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[COPY3]](s1), [[COPY4]], [[COPY5]]
+ ; FAST-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+ ; FAST-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY [[COPY]](s32)
+ ; FAST-NEXT: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[COPY3]](s1), [[C1]], [[COPY4]]
 ; FAST-NEXT: S_SETPC_B64 undef $sgpr30_sgpr31, implicit [[SELECT]](s32)
 ; GREEDY-LABEL: name: phi_s1_scc_v_sbranch
 ; GREEDY: bb.0:
@@ -772,9 +772,9 @@
 ; GREEDY-NEXT: [[PHI:%[0-9]+]]:vgpr(s32) = G_PHI [[ANYEXT]](s32), %bb.0, [[ANYEXT1]](s32), %bb.1
 ; GREEDY-NEXT: [[TRUNC3:%[0-9]+]]:vgpr(s1) = G_TRUNC [[PHI]](s32)
 ; GREEDY-NEXT: [[COPY3:%[0-9]+]]:vcc(s1) = COPY [[TRUNC3]](s1)
- ; GREEDY-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
- ; GREEDY-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[COPY]](s32)
- ; GREEDY-NEXT: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[COPY3]](s1), [[COPY4]], [[COPY5]]
+ ; GREEDY-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+ ; GREEDY-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY [[COPY]](s32)
+ ; GREEDY-NEXT: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[COPY3]](s1), [[C1]], [[COPY4]]
 ; GREEDY-NEXT: S_SETPC_B64 undef $sgpr30_sgpr31, implicit [[SELECT]](s32)
 bb.0:
 successors: %bb.1, %bb.2
@@ -837,8 +837,8 @@
 ; FAST-NEXT: [[PHI:%[0-9]+]]:vgpr(s32) = G_PHI [[ANYEXT]](s32), %bb.0, [[ANYEXT1]](s32), %bb.1
 ; FAST-NEXT: [[TRUNC3:%[0-9]+]]:vgpr(s1) = G_TRUNC [[PHI]](s32)
 ; FAST-NEXT: [[COPY3:%[0-9]+]]:vcc(s1) = COPY [[TRUNC3]](s1)
- ; FAST-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
- ; FAST-NEXT: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[COPY3]](s1), [[COPY4]], [[COPY]]
+ ; FAST-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+ ; FAST-NEXT: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[COPY3]](s1), [[C1]], [[COPY]]
 ; FAST-NEXT: S_SETPC_B64 undef $sgpr30_sgpr31, implicit [[SELECT]](s32)
 ; GREEDY-LABEL: name: phi_s1_v_scc_sbranch
 ; GREEDY: bb.0:
@@ -869,8 +869,8 @@
 ; GREEDY-NEXT: [[PHI:%[0-9]+]]:vgpr(s32) = G_PHI [[ANYEXT]](s32), %bb.0, [[ANYEXT1]](s32), %bb.1
 ; GREEDY-NEXT: [[TRUNC3:%[0-9]+]]:vgpr(s1) = G_TRUNC [[PHI]](s32)
 ; GREEDY-NEXT: [[COPY3:%[0-9]+]]:vcc(s1) = COPY [[TRUNC3]](s1)
- ; GREEDY-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
- ; GREEDY-NEXT: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[COPY3]](s1), [[COPY4]], [[COPY]]
+ ; GREEDY-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+ ; GREEDY-NEXT: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[COPY3]](s1), [[C1]], [[COPY]]
 ; GREEDY-NEXT: S_SETPC_B64 undef $sgpr30_sgpr31, implicit [[SELECT]](s32)
 bb.0:
 successors: %bb.1, %bb.2
@@ -913,8 +913,8 @@
 ; FAST-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr0
 ; FAST-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr1
 ; FAST-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
- ; FAST-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
- ; FAST-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY]](s32), [[COPY3]]
+ ; FAST-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+ ; FAST-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY]](s32), [[C1]]
 ; FAST-NEXT: [[ICMP1:%[0-9]+]]:sgpr(s32) = G_ICMP intpred(eq), [[COPY2]](s32), [[C]]
 ; FAST-NEXT: [[TRUNC:%[0-9]+]]:sgpr(s1) = G_TRUNC [[ICMP1]](s32)
 ; FAST-NEXT: [[ZEXT:%[0-9]+]]:sgpr(s32) = G_ZEXT [[TRUNC]](s1)
@@ -925,13 +925,13 @@
 ; FAST-NEXT: successors: %bb.2(0x80000000)
 ; FAST-NEXT: {{ $}}
 ; FAST-NEXT: [[TRUNC1:%[0-9]+]]:sgpr(s1) = G_TRUNC [[COPY1]](s32)
- ; FAST-NEXT: [[COPY4:%[0-9]+]]:vcc(s1) = COPY [[TRUNC1]](s1)
+ ; FAST-NEXT: [[COPY3:%[0-9]+]]:vcc(s1) = COPY [[TRUNC1]](s1)
 ; FAST-NEXT: G_BR %bb.2
 ; FAST-NEXT: {{ $}}
 ; FAST-NEXT: bb.2:
- ; FAST-NEXT: [[PHI:%[0-9]+]]:vcc(s1) = G_PHI [[ICMP]](s1), %bb.0, [[COPY4]](s1), %bb.1
- ; FAST-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
- ; FAST-NEXT: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[PHI]](s1), [[COPY5]], [[COPY]]
+ ; FAST-NEXT: [[PHI:%[0-9]+]]:vcc(s1) = G_PHI [[ICMP]](s1), %bb.0, [[COPY3]](s1), %bb.1
+ ; FAST-NEXT: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+ ; FAST-NEXT: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[PHI]](s1), [[C2]], [[COPY]]
 ; FAST-NEXT: S_SETPC_B64 undef $sgpr30_sgpr31, implicit [[SELECT]](s32)
 ; GREEDY-LABEL: name: phi_s1_vcc_s_sbranch
 ; GREEDY: bb.0:
@@ -942,8 +942,8 @@
 ; GREEDY-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr0
 ; GREEDY-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr1
 ; GREEDY-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
- ; GREEDY-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
- ; GREEDY-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY]](s32), [[COPY3]]
+ ; GREEDY-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+ ; GREEDY-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY]](s32), [[C1]]
 ; GREEDY-NEXT: [[ICMP1:%[0-9]+]]:sgpr(s32) = G_ICMP intpred(eq), [[COPY2]](s32), [[C]]
 ; GREEDY-NEXT: [[TRUNC:%[0-9]+]]:sgpr(s1) = G_TRUNC [[ICMP1]](s32)
 ; GREEDY-NEXT: [[ZEXT:%[0-9]+]]:sgpr(s32) = G_ZEXT [[TRUNC]](s1)
@@ -954,13 +954,13 @@
 ; GREEDY-NEXT: successors: %bb.2(0x80000000)
 ; GREEDY-NEXT: {{ $}}
 ; GREEDY-NEXT: [[TRUNC1:%[0-9]+]]:sgpr(s1) = G_TRUNC [[COPY1]](s32)
- ; GREEDY-NEXT: [[COPY4:%[0-9]+]]:vcc(s1) = COPY [[TRUNC1]](s1)
+ ; GREEDY-NEXT: [[COPY3:%[0-9]+]]:vcc(s1) = COPY [[TRUNC1]](s1)
 ; GREEDY-NEXT: G_BR %bb.2
 ; GREEDY-NEXT: {{ $}}
 ; GREEDY-NEXT: bb.2:
- ; GREEDY-NEXT: [[PHI:%[0-9]+]]:vcc(s1) = G_PHI [[ICMP]](s1), %bb.0, [[COPY4]](s1), %bb.1
- ; GREEDY-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
- ; GREEDY-NEXT: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[PHI]](s1), [[COPY5]], [[COPY]]
+ ; GREEDY-NEXT: [[PHI:%[0-9]+]]:vcc(s1) = G_PHI [[ICMP]](s1), %bb.0, [[COPY3]](s1), %bb.1
+ ; GREEDY-NEXT: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+ ; GREEDY-NEXT: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[PHI]](s1), [[C2]], [[COPY]]
 ; GREEDY-NEXT: S_SETPC_B64 undef $sgpr30_sgpr31, implicit [[SELECT]](s32)
 bb.0:
 successors: %bb.1, %bb.2
@@ -1014,15 +1014,15 @@
 ; FAST-NEXT: bb.1:
 ; FAST-NEXT: successors: %bb.2(0x80000000)
 ; FAST-NEXT: {{ $}}
- ; FAST-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
- ; FAST-NEXT: [[ICMP1:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY1]](s32), [[COPY4]]
+ ; FAST-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+ ; FAST-NEXT: [[ICMP1:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY1]](s32), [[C1]]
 ; FAST-NEXT: G_BR %bb.2
 ; FAST-NEXT: {{ $}}
 ; FAST-NEXT: bb.2:
 ; FAST-NEXT: [[PHI:%[0-9]+]]:vcc(s1) = G_PHI [[COPY3]](s1), %bb.0, [[ICMP1]](s1), %bb.1
- ; FAST-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
- ; FAST-NEXT: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[COPY]](s32)
- ; FAST-NEXT: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[PHI]](s1), [[COPY5]], [[COPY6]]
+ ; FAST-NEXT: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+ ; FAST-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY [[COPY]](s32)
+ ; FAST-NEXT: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[PHI]](s1), [[C2]], [[COPY4]]
 ; FAST-NEXT: S_SETPC_B64 undef $sgpr30_sgpr31, implicit [[SELECT]](s32)
 ; GREEDY-LABEL: name: phi_s1_s_vcc_sbranch
 ; GREEDY: bb.0:
@@ -1044,15 +1044,15 @@
 ; GREEDY-NEXT: bb.1:
 ; GREEDY-NEXT: successors: %bb.2(0x80000000)
 ; GREEDY-NEXT: {{ $}}
- ; GREEDY-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
- ; GREEDY-NEXT: [[ICMP1:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY1]](s32), [[COPY4]]
+ ; GREEDY-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+ ; GREEDY-NEXT: [[ICMP1:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY1]](s32), [[C1]]
 ; GREEDY-NEXT: G_BR %bb.2
 ; GREEDY-NEXT: {{ $}}
 ; GREEDY-NEXT: bb.2:
 ; GREEDY-NEXT: [[PHI:%[0-9]+]]:vcc(s1) = G_PHI [[COPY3]](s1), %bb.0, [[ICMP1]](s1), %bb.1
- ; GREEDY-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
- ; GREEDY-NEXT: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[COPY]](s32)
- ; GREEDY-NEXT: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[PHI]](s1), [[COPY5]], [[COPY6]]
+ ; GREEDY-NEXT: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+ ; GREEDY-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY [[COPY]](s32)
+ ; GREEDY-NEXT: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[PHI]](s1), [[C2]], [[COPY4]]
 ; GREEDY-NEXT: S_SETPC_B64 undef $sgpr30_sgpr31, implicit [[SELECT]](s32)
 bb.0:
 successors: %bb.1, %bb.2
@@ -1095,14 +1095,14 @@
 ; FAST-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1
 ; FAST-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr0
 ; FAST-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
- ; FAST-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
- ; FAST-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY]](s32), [[COPY3]]
+ ; FAST-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+ ; FAST-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY]](s32), [[C1]]
 ; FAST-NEXT: [[ICMP1:%[0-9]+]]:sgpr(s32) = G_ICMP intpred(eq), [[COPY2]](s32), [[C]]
 ; FAST-NEXT: [[TRUNC:%[0-9]+]]:sgpr(s1) = G_TRUNC [[ICMP1]](s32)
 ; FAST-NEXT: [[ZEXT:%[0-9]+]]:sgpr(s32) = G_ZEXT [[TRUNC]](s1)
- ; FAST-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 1
- ; FAST-NEXT: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
- ; FAST-NEXT: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP]](s1), [[C1]], [[C2]]
+ ; FAST-NEXT: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 1
+ ; FAST-NEXT: [[C3:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+ ; FAST-NEXT: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP]](s1), [[C2]], [[C3]]
 ; FAST-NEXT: G_BRCOND [[ZEXT]](s32), %bb.1
 ; FAST-NEXT: G_BR %bb.2
 ; FAST-NEXT: {{ $}}
@@ -1116,9 +1116,9 @@
 ; FAST-NEXT: bb.2:
 ; FAST-NEXT: [[PHI:%[0-9]+]]:vgpr(s32) = G_PHI [[SELECT]](s32), %bb.0, [[ANYEXT]](s32), %bb.1
 ; FAST-NEXT: [[TRUNC2:%[0-9]+]]:vgpr(s1) = G_TRUNC [[PHI]](s32)
- ; FAST-NEXT: [[COPY4:%[0-9]+]]:vcc(s1) = COPY [[TRUNC2]](s1)
- ; FAST-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
- ; FAST-NEXT: [[SELECT1:%[0-9]+]]:vgpr(s32) = G_SELECT [[COPY4]](s1), [[COPY5]], [[COPY]]
+ ; FAST-NEXT: [[COPY3:%[0-9]+]]:vcc(s1) = COPY [[TRUNC2]](s1)
+ ; FAST-NEXT: [[C4:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+ ; FAST-NEXT: [[SELECT1:%[0-9]+]]:vgpr(s32) = G_SELECT [[COPY3]](s1), [[C4]], [[COPY]]
 ; FAST-NEXT: S_SETPC_B64 undef $sgpr30_sgpr31, implicit [[SELECT1]](s32)
 ; GREEDY-LABEL: name: phi_s1_vcc_v_sbranch
 ; GREEDY: bb.0:
@@ -1129,14 +1129,14 @@
 ; GREEDY-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1
 ; GREEDY-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr0
 ; GREEDY-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
- ; GREEDY-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
- ; GREEDY-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY]](s32), [[COPY3]]
+ ; GREEDY-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+ ; GREEDY-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY]](s32), [[C1]]
 ; GREEDY-NEXT: [[ICMP1:%[0-9]+]]:sgpr(s32) = G_ICMP intpred(eq), [[COPY2]](s32), [[C]]
 ; GREEDY-NEXT: [[TRUNC:%[0-9]+]]:sgpr(s1) = G_TRUNC [[ICMP1]](s32)
 ; GREEDY-NEXT: [[ZEXT:%[0-9]+]]:sgpr(s32) = G_ZEXT [[TRUNC]](s1)
- ; GREEDY-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 1
- ; GREEDY-NEXT: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
- ; GREEDY-NEXT: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP]](s1), [[C1]], [[C2]]
+ ; GREEDY-NEXT: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 1
+ ; GREEDY-NEXT: [[C3:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+ ; GREEDY-NEXT: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP]](s1), [[C2]], [[C3]]
 ; GREEDY-NEXT: G_BRCOND [[ZEXT]](s32), %bb.1
 ; GREEDY-NEXT: G_BR %bb.2
 ; GREEDY-NEXT: {{ $}}
@@ -1150,9 +1150,9 @@
 ; GREEDY-NEXT: bb.2:
 ; GREEDY-NEXT: [[PHI:%[0-9]+]]:vgpr(s32) = G_PHI [[SELECT]](s32), %bb.0, [[ANYEXT]](s32), %bb.1
 ; GREEDY-NEXT: [[TRUNC2:%[0-9]+]]:vgpr(s1) = G_TRUNC [[PHI]](s32)
- ; GREEDY-NEXT: [[COPY4:%[0-9]+]]:vcc(s1) = COPY [[TRUNC2]](s1)
- ; GREEDY-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
- ; GREEDY-NEXT: [[SELECT1:%[0-9]+]]:vgpr(s32) = G_SELECT [[COPY4]](s1), [[COPY5]], [[COPY]]
+ ; GREEDY-NEXT: [[COPY3:%[0-9]+]]:vcc(s1) = COPY [[TRUNC2]](s1)
+ ; GREEDY-NEXT: [[C4:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+ ; GREEDY-NEXT: [[SELECT1:%[0-9]+]]:vgpr(s32) = G_SELECT [[COPY3]](s1), [[C4]], [[COPY]]
 ; GREEDY-NEXT: S_SETPC_B64 undef $sgpr30_sgpr31, implicit [[SELECT1]](s32)
 bb.0:
 successors: %bb.1, %bb.2
@@ -1206,19 +1206,19 @@
 ; FAST-NEXT: bb.1:
 ; FAST-NEXT: successors: %bb.2(0x80000000)
 ; FAST-NEXT: {{ $}}
- ; FAST-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
- ; FAST-NEXT: [[ICMP1:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY1]](s32), [[COPY3]]
- ; FAST-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 1
- ; FAST-NEXT: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
- ; FAST-NEXT: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP1]](s1), [[C1]], [[C2]]
+ ; FAST-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+ ; FAST-NEXT: [[ICMP1:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY1]](s32), [[C1]]
+ ; FAST-NEXT: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 1
+ ; FAST-NEXT: [[C3:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+ ; FAST-NEXT: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP1]](s1), [[C2]], [[C3]]
 ; FAST-NEXT: G_BR %bb.2
 ; FAST-NEXT: {{ $}}
 ; FAST-NEXT: bb.2:
 ; FAST-NEXT: [[PHI:%[0-9]+]]:vgpr(s32) = G_PHI [[ANYEXT]](s32), %bb.0, [[SELECT]](s32), %bb.1
 ; FAST-NEXT: [[TRUNC2:%[0-9]+]]:vgpr(s1) = G_TRUNC [[PHI]](s32)
- ; FAST-NEXT: [[COPY4:%[0-9]+]]:vcc(s1) = COPY [[TRUNC2]](s1)
- ; FAST-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
- ; FAST-NEXT: [[SELECT1:%[0-9]+]]:vgpr(s32) = G_SELECT [[COPY4]](s1), [[COPY5]], [[COPY]]
+ ; FAST-NEXT: [[COPY3:%[0-9]+]]:vcc(s1) = COPY [[TRUNC2]](s1)
+ ; FAST-NEXT: [[C4:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+ ; FAST-NEXT: [[SELECT1:%[0-9]+]]:vgpr(s32) = G_SELECT [[COPY3]](s1), [[C4]], [[COPY]]
 ; FAST-NEXT: S_SETPC_B64 undef $sgpr30_sgpr31, implicit [[SELECT1]](s32)
 ; GREEDY-LABEL: name: phi_s1_v_vcc_sbranch
 ; GREEDY: bb.0:
@@ -1240,19 +1240,19 @@
 ; GREEDY-NEXT: bb.1:
 ; GREEDY-NEXT: successors: %bb.2(0x80000000)
 ; GREEDY-NEXT: {{ $}}
- ; GREEDY-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
- ; GREEDY-NEXT: [[ICMP1:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY1]](s32), [[COPY3]]
- ; GREEDY-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 1
- ; GREEDY-NEXT: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
- ; GREEDY-NEXT: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP1]](s1), [[C1]], [[C2]]
+ ; GREEDY-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+ ; GREEDY-NEXT: [[ICMP1:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY1]](s32), [[C1]]
+ ; GREEDY-NEXT: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 1
+ ; GREEDY-NEXT: [[C3:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+ ; GREEDY-NEXT: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP1]](s1), [[C2]], [[C3]]
 ; GREEDY-NEXT: G_BR %bb.2
 ; GREEDY-NEXT: {{ $}}
 ; GREEDY-NEXT: bb.2:
 ; GREEDY-NEXT: [[PHI:%[0-9]+]]:vgpr(s32) = G_PHI [[ANYEXT]](s32), %bb.0, [[SELECT]](s32), %bb.1
 ; GREEDY-NEXT: [[TRUNC2:%[0-9]+]]:vgpr(s1) = G_TRUNC [[PHI]](s32)
- ; GREEDY-NEXT: [[COPY4:%[0-9]+]]:vcc(s1) = COPY [[TRUNC2]](s1)
- ; GREEDY-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
- ; GREEDY-NEXT: [[SELECT1:%[0-9]+]]:vgpr(s32) = G_SELECT [[COPY4]](s1), [[COPY5]], [[COPY]]
+ ; GREEDY-NEXT: [[COPY3:%[0-9]+]]:vcc(s1) = COPY [[TRUNC2]](s1)
+ ; GREEDY-NEXT: [[C4:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+ ; GREEDY-NEXT: [[SELECT1:%[0-9]+]]:vgpr(s32) = G_SELECT [[COPY3]](s1), [[C4]], [[COPY]]
 ; GREEDY-NEXT: S_SETPC_B64 undef $sgpr30_sgpr31, implicit [[SELECT1]](s32)
 bb.0:
 successors: %bb.1, %bb.2
@@ -1314,8 +1314,8 @@
 ; FAST-NEXT: [[PHI:%[0-9]+]]:vgpr(s32) = G_PHI [[ANYEXT]](s32), %bb.0, [[ANYEXT1]](s32), %bb.1
 ; FAST-NEXT: [[TRUNC3:%[0-9]+]]:vgpr(s1) = G_TRUNC [[PHI]](s32)
 ; FAST-NEXT: [[COPY3:%[0-9]+]]:vcc(s1) = COPY [[TRUNC3]](s1)
- ; FAST-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
- ; FAST-NEXT: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[COPY3]](s1), [[COPY4]], [[COPY]]
+ ; FAST-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+ ; FAST-NEXT: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[COPY3]](s1), [[C1]], [[COPY]]
 ; FAST-NEXT: S_SETPC_B64 undef $sgpr30_sgpr31, implicit [[SELECT]](s32)
 ; GREEDY-LABEL: name: phi_s1_v_s_sbranch
 ; GREEDY: bb.0:
@@ -1345,8 +1345,8 @@
 ; GREEDY-NEXT: [[PHI:%[0-9]+]]:vgpr(s32) = G_PHI [[ANYEXT]](s32), %bb.0, [[ANYEXT1]](s32), %bb.1
 ; GREEDY-NEXT: [[TRUNC3:%[0-9]+]]:vgpr(s1) = G_TRUNC [[PHI]](s32)
 ; GREEDY-NEXT: [[COPY3:%[0-9]+]]:vcc(s1) = COPY [[TRUNC3]](s1)
- ; GREEDY-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
- ; GREEDY-NEXT: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[COPY3]](s1), [[COPY4]], [[COPY]]
+ ; GREEDY-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+ ; GREEDY-NEXT: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[COPY3]](s1), [[C1]], [[COPY]]
 ; GREEDY-NEXT: S_SETPC_B64 undef $sgpr30_sgpr31, implicit [[SELECT]](s32)
 bb.0:
 successors: %bb.1, %bb.2
@@ -1408,9 +1408,9 @@
 ; FAST-NEXT: [[PHI:%[0-9]+]]:vgpr(s32) = G_PHI [[ANYEXT]](s32), %bb.0, [[ANYEXT1]](s32), %bb.1
 ; FAST-NEXT: [[TRUNC3:%[0-9]+]]:vgpr(s1) = G_TRUNC [[PHI]](s32)
 ; FAST-NEXT: [[COPY3:%[0-9]+]]:vcc(s1) = COPY [[TRUNC3]](s1)
- ; FAST-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
- ; FAST-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[COPY]](s32)
- ; FAST-NEXT: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[COPY3]](s1), [[COPY4]], [[COPY5]]
+ ; FAST-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+ ; FAST-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY [[COPY]](s32)
+ ; FAST-NEXT: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[COPY3]](s1), [[C1]], [[COPY4]]
 ; FAST-NEXT: S_SETPC_B64 undef $sgpr30_sgpr31, implicit [[SELECT]](s32)
 ; GREEDY-LABEL: name: phi_s1_s_v_sbranch
 ; GREEDY: bb.0:
@@ -1440,9 +1440,9 @@
 ; GREEDY-NEXT: [[PHI:%[0-9]+]]:vgpr(s32) = G_PHI [[ANYEXT]](s32), %bb.0, [[ANYEXT1]](s32), %bb.1
 ; GREEDY-NEXT: [[TRUNC3:%[0-9]+]]:vgpr(s1) = G_TRUNC [[PHI]](s32)
 ; GREEDY-NEXT: [[COPY3:%[0-9]+]]:vcc(s1) = COPY [[TRUNC3]](s1)
- ; GREEDY-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
- ; GREEDY-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[COPY]](s32)
- ; GREEDY-NEXT: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[COPY3]](s1), [[COPY4]], [[COPY5]]
+ ; GREEDY-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+ ; GREEDY-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY [[COPY]](s32)
+ ; GREEDY-NEXT: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[COPY3]](s1), [[C1]], [[COPY4]]
 ; GREEDY-NEXT: S_SETPC_B64 undef $sgpr30_sgpr31, implicit [[SELECT]](s32)
 bb.0:
 successors: %bb.1, %bb.2
@@ -1504,8 +1504,8 @@
 ; FAST-NEXT: [[PHI:%[0-9]+]]:vgpr(s32) = G_PHI [[ANYEXT]](s32), %bb.0, [[ANYEXT1]](s32), %bb.1
 ; FAST-NEXT: [[TRUNC3:%[0-9]+]]:vgpr(s1) = G_TRUNC [[PHI]](s32)
 ; FAST-NEXT: [[COPY3:%[0-9]+]]:vcc(s1) = COPY [[TRUNC3]](s1)
- ; FAST-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
- ; FAST-NEXT: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[COPY3]](s1), [[COPY4]], [[COPY]]
+ ; FAST-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+ ; FAST-NEXT: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[COPY3]](s1), [[C1]], [[COPY]]
 ; FAST-NEXT: S_SETPC_B64 undef $sgpr30_sgpr31, implicit [[SELECT]](s32)
 ; GREEDY-LABEL: name: phi_s1_v_v_sbranch
 ; GREEDY: bb.0:
@@ -1535,8 +1535,8 @@
 ; GREEDY-NEXT: [[PHI:%[0-9]+]]:vgpr(s32) = G_PHI [[ANYEXT]](s32), %bb.0, [[ANYEXT1]](s32), %bb.1
 ; GREEDY-NEXT: [[TRUNC3:%[0-9]+]]:vgpr(s1) = G_TRUNC [[PHI]](s32)
 ; GREEDY-NEXT: [[COPY3:%[0-9]+]]:vcc(s1) = COPY [[TRUNC3]](s1)
- ; GREEDY-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
- ; GREEDY-NEXT: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[COPY3]](s1), [[COPY4]], [[COPY]]
+ ; GREEDY-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+ ; GREEDY-NEXT: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[COPY3]](s1), [[C1]], [[COPY]]
 ; GREEDY-NEXT: S_SETPC_B64 undef $sgpr30_sgpr31, implicit [[SELECT]](s32)
 bb.0:
 successors: %bb.1, %bb.2
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-phi.mir
===================================================================
--- llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-phi.mir
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-phi.mir
@@ -339,20 +339,19 @@
 ; FAST-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0
 ; FAST-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr1
 ; FAST-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
- ; FAST-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
- ; FAST-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
- ; FAST-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY2]](s32), [[COPY3]]
+ ; FAST-NEXT: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+ ; FAST-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY2]](s32), [[C]]
 ; FAST-NEXT: G_BRCOND [[ICMP]](s1), %bb.1
 ; FAST-NEXT: G_BR %bb.2
 ; FAST-NEXT: {{ $}}
 ; FAST-NEXT: bb.1:
 ; FAST-NEXT: successors: %bb.2(0x80000000)
 ; FAST-NEXT: {{ $}}
- ; FAST-NEXT: [[COPY4:%[0-9]+]]:sgpr(s32) = COPY [[COPY1]](s32)
+ ; FAST-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY [[COPY1]](s32)
 ; FAST-NEXT: G_BR %bb.2
 ; FAST-NEXT: {{ $}}
 ; FAST-NEXT: bb.2:
- ; FAST-NEXT: [[PHI:%[0-9]+]]:sgpr(s32) = G_PHI [[COPY]](s32), %bb.0, [[COPY4]](s32), %bb.1
+ ; FAST-NEXT: [[PHI:%[0-9]+]]:sgpr(s32) = G_PHI [[COPY]](s32), %bb.0, [[COPY3]](s32), %bb.1
 ; FAST-NEXT: $sgpr0 = COPY [[PHI]](s32)
 ; FAST-NEXT: S_SETPC_B64 undef $sgpr30_sgpr31
 ; GREEDY-LABEL: name: phi_s32_ss_vcc_sbranch
@@ -363,20 +362,19 @@
 ; GREEDY-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0
 ; GREEDY-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr1
 ; GREEDY-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
- ; GREEDY-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
- ; GREEDY-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
- ; GREEDY-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY2]](s32), [[COPY3]]
+ ; GREEDY-NEXT: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+ ; GREEDY-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY2]](s32), [[C]]
 ; GREEDY-NEXT: G_BRCOND [[ICMP]](s1), %bb.1
 ; GREEDY-NEXT: G_BR %bb.2
 ; GREEDY-NEXT: {{ $}}
 ; GREEDY-NEXT: bb.1:
 ; GREEDY-NEXT: successors: %bb.2(0x80000000)
 ; GREEDY-NEXT: {{ $}}
- ; GREEDY-NEXT: [[COPY4:%[0-9]+]]:sgpr(s32) = COPY [[COPY1]](s32)
+ ; GREEDY-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY [[COPY1]](s32)
 ; GREEDY-NEXT: G_BR %bb.2
 ; GREEDY-NEXT: {{ $}}
 ; GREEDY-NEXT: bb.2:
- ; GREEDY-NEXT: [[PHI:%[0-9]+]]:sgpr(s32) = G_PHI [[COPY]](s32), %bb.0, [[COPY4]](s32), %bb.1
+ ; GREEDY-NEXT: [[PHI:%[0-9]+]]:sgpr(s32) = G_PHI [[COPY]](s32), %bb.0, [[COPY3]](s32), %bb.1
 ; GREEDY-NEXT: $sgpr0 = COPY [[PHI]](s32)
 ; GREEDY-NEXT: S_SETPC_B64 undef $sgpr30_sgpr31
 bb.0:
@@ -418,20 +416,19 @@
 ; FAST-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0
 ; FAST-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
 ; FAST-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr1
- ; FAST-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
- ; FAST-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
- ; FAST-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY2]](s32), [[COPY3]]
+ ; FAST-NEXT: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+ ; FAST-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY2]](s32), [[C]]
 ; FAST-NEXT: G_BRCOND [[ICMP]](s1), %bb.1
 ; FAST-NEXT: G_BR %bb.2
 ; FAST-NEXT: {{ $}}
 ; FAST-NEXT: bb.1:
 ; FAST-NEXT: successors: %bb.2(0x80000000)
 ; FAST-NEXT: {{ $}}
- ; FAST-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY [[COPY1]](s32)
+ ; FAST-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[COPY1]](s32)
 ; FAST-NEXT: G_BR %bb.2
 ; FAST-NEXT: {{ $}}
 ; FAST-NEXT: bb.2:
- ; FAST-NEXT: [[PHI:%[0-9]+]]:vgpr(s32) = G_PHI [[COPY]](s32), %bb.0, [[COPY4]](s32), %bb.1
+ ; FAST-NEXT: [[PHI:%[0-9]+]]:vgpr(s32) = G_PHI [[COPY]](s32), %bb.0, [[COPY3]](s32), %bb.1
 ; FAST-NEXT: $vgpr0 = COPY [[PHI]](s32)
 ; FAST-NEXT: S_SETPC_B64 undef $sgpr30_sgpr31
 ; GREEDY-LABEL: name: phi_s32_sv_vcc_sbranch
@@ -442,20 +439,19 @@
 ; GREEDY-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0
 ; GREEDY-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
 ; GREEDY-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr1
- ; GREEDY-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
- ; GREEDY-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
- ; GREEDY-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY2]](s32), [[COPY3]]
+ ; GREEDY-NEXT: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+ ; GREEDY-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY2]](s32), [[C]]
 ; GREEDY-NEXT: G_BRCOND [[ICMP]](s1), %bb.1
 ; GREEDY-NEXT: G_BR %bb.2
 ; GREEDY-NEXT: {{ $}}
 ; GREEDY-NEXT: bb.1:
 ; GREEDY-NEXT: successors: %bb.2(0x80000000)
 ; GREEDY-NEXT: {{ $}}
- ; GREEDY-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY [[COPY1]](s32)
+ ; GREEDY-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[COPY1]](s32)
 ; GREEDY-NEXT: G_BR %bb.2
 ; GREEDY-NEXT: {{ $}}
 ; GREEDY-NEXT: bb.2:
- ; GREEDY-NEXT: [[PHI:%[0-9]+]]:vgpr(s32) = G_PHI [[COPY]](s32), %bb.0, [[COPY4]](s32), %bb.1
+ ; GREEDY-NEXT: [[PHI:%[0-9]+]]:vgpr(s32) = G_PHI [[COPY]](s32), %bb.0, [[COPY3]](s32), %bb.1
 ; GREEDY-NEXT: $vgpr0 = COPY [[PHI]](s32)
 ; GREEDY-NEXT: S_SETPC_B64 undef $sgpr30_sgpr31
 bb.0:
@@ -497,20 +493,19 @@
 ; FAST-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
 ; FAST-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr0
 ; FAST-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr1
- ; FAST-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
- ; FAST-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
- ; FAST-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY2]](s32), [[COPY3]]
+ ; FAST-NEXT: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+ ; FAST-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY2]](s32), [[C]]
 ; FAST-NEXT: G_BRCOND [[ICMP]](s1), %bb.1
 ; FAST-NEXT: G_BR %bb.2
 ; FAST-NEXT: {{ $}}
 ; FAST-NEXT: bb.1:
 ; FAST-NEXT: successors: %bb.2(0x80000000)
 ; FAST-NEXT: {{ $}}
- ; FAST-NEXT: [[COPY4:%[0-9]+]]:sgpr(s32) = COPY [[COPY1]](s32)
+ ; FAST-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY [[COPY1]](s32)
 ; FAST-NEXT: G_BR %bb.2
 ; FAST-NEXT: {{ $}}
 ; FAST-NEXT: bb.2:
- ; FAST-NEXT: [[PHI:%[0-9]+]]:vgpr(s32) = G_PHI [[COPY]](s32), %bb.0, [[COPY4]](s32), %bb.1
+ ; FAST-NEXT: [[PHI:%[0-9]+]]:vgpr(s32) = G_PHI [[COPY]](s32), %bb.0, [[COPY3]](s32), %bb.1
 ; FAST-NEXT: $vgpr0 = COPY [[PHI]](s32)
 ; FAST-NEXT: S_SETPC_B64 undef $sgpr30_sgpr31
 ; GREEDY-LABEL: name: phi_s32_vs_vcc_sbranch
@@ -521,20 +516,19 @@
 ; GREEDY-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
 ; GREEDY-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr0
 ; GREEDY-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr1
- ; GREEDY-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
- ; GREEDY-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
- ; GREEDY-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY2]](s32), [[COPY3]]
+ ; GREEDY-NEXT: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+ ; GREEDY-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY2]](s32), [[C]]
 ; GREEDY-NEXT: G_BRCOND [[ICMP]](s1), %bb.1
 ; GREEDY-NEXT: G_BR %bb.2
 ; GREEDY-NEXT: {{ $}}
 ; GREEDY-NEXT: bb.1:
 ; GREEDY-NEXT: successors: %bb.2(0x80000000)
 ; GREEDY-NEXT: {{ $}}
- ; GREEDY-NEXT: [[COPY4:%[0-9]+]]:sgpr(s32) = COPY [[COPY1]](s32)
+ ; GREEDY-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY [[COPY1]](s32)
 ; GREEDY-NEXT: G_BR %bb.2
 ; GREEDY-NEXT: {{ $}}
 ; GREEDY-NEXT: bb.2:
- ; GREEDY-NEXT: [[PHI:%[0-9]+]]:vgpr(s32) = G_PHI [[COPY]](s32), %bb.0, [[COPY4]](s32), %bb.1
+ ; GREEDY-NEXT: [[PHI:%[0-9]+]]:vgpr(s32) = G_PHI [[COPY]](s32), %bb.0, [[COPY3]](s32), %bb.1
 ; GREEDY-NEXT: $vgpr0 = COPY [[PHI]](s32)
 ; GREEDY-NEXT: S_SETPC_B64 undef $sgpr30_sgpr31
 bb.0:
@@ -576,20 +570,19 @@
 ; FAST-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
 ; FAST-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1
 ; FAST-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr2
- ; FAST-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
- ; FAST-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
- ; FAST-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY2]](s32), [[COPY3]]
+ ; FAST-NEXT: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+ ; FAST-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY2]](s32), [[C]]
 ; FAST-NEXT: G_BRCOND [[ICMP]](s1), %bb.1
 ; FAST-NEXT: G_BR %bb.2
 ; FAST-NEXT: {{ $}}
 ; FAST-NEXT: bb.1:
 ; FAST-NEXT: successors: %bb.2(0x80000000)
 ; FAST-NEXT: {{ $}}
- ; FAST-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY [[COPY1]](s32)
+ ; FAST-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[COPY1]](s32)
 ; FAST-NEXT: G_BR %bb.2
 ; FAST-NEXT: {{ $}}
 ; FAST-NEXT: bb.2:
- ; FAST-NEXT: [[PHI:%[0-9]+]]:vgpr(s32) = G_PHI [[COPY]](s32), %bb.0, [[COPY4]](s32), %bb.1
+ ; FAST-NEXT: [[PHI:%[0-9]+]]:vgpr(s32) = G_PHI [[COPY]](s32), %bb.0, [[COPY3]](s32), %bb.1
 ; FAST-NEXT: $vgpr0 = COPY [[PHI]](s32)
 ; FAST-NEXT: S_SETPC_B64 undef $sgpr30_sgpr31
 ; GREEDY-LABEL: name: phi_s32_vv_vcc_sbranch
@@ -600,20 +593,19 @@
 ; GREEDY-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
 ; GREEDY-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1
 ; GREEDY-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr2
- ; GREEDY-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
- ; GREEDY-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
- ; GREEDY-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY2]](s32), [[COPY3]]
+ ; GREEDY-NEXT: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+ ; GREEDY-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY2]](s32), [[C]]
 ; GREEDY-NEXT: G_BRCOND [[ICMP]](s1), %bb.1
 ; GREEDY-NEXT: G_BR %bb.2
 ; GREEDY-NEXT: {{ $}}
 ; GREEDY-NEXT: bb.1:
 ; GREEDY-NEXT: successors: %bb.2(0x80000000)
 ; GREEDY-NEXT: {{ $}}
- ; GREEDY-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY [[COPY1]](s32)
+ ; GREEDY-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[COPY1]](s32)
 ; GREEDY-NEXT: G_BR %bb.2
 ; GREEDY-NEXT: {{ $}}
 ; GREEDY-NEXT: bb.2:
- ; GREEDY-NEXT: [[PHI:%[0-9]+]]:vgpr(s32) = G_PHI [[COPY]](s32), %bb.0, [[COPY4]](s32), %bb.1
+ ; GREEDY-NEXT: [[PHI:%[0-9]+]]:vgpr(s32) = G_PHI [[COPY]](s32), %bb.0, [[COPY3]](s32), %bb.1
 ; GREEDY-NEXT: $vgpr0 = COPY [[PHI]](s32)
 ; GREEDY-NEXT: S_SETPC_B64 undef $sgpr30_sgpr31
 bb.0:
@@ -904,15 +896,15 @@
 ; FAST-NEXT: bb.1:
 ; FAST-NEXT: successors: %bb.2(0x80000000)
 ; FAST-NEXT: {{ $}}
- ; FAST-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
- ; FAST-NEXT: [[ICMP2:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY1]](s32), [[COPY4]]
+ ; FAST-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+ ; FAST-NEXT: [[ICMP2:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY1]](s32), [[C1]]
 ; FAST-NEXT: G_BR %bb.2
 ; FAST-NEXT: {{ $}}
 ; FAST-NEXT: bb.2:
 ; FAST-NEXT: [[PHI:%[0-9]+]]:vcc(s1) = G_PHI [[COPY3]](s1), %bb.0, [[ICMP2]](s1), %bb.1
- ; FAST-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
- ; FAST-NEXT: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[COPY]](s32)
- ; FAST-NEXT: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[PHI]](s1), [[COPY5]], [[COPY6]]
+ ; FAST-NEXT: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+ ; FAST-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY [[COPY]](s32)
+ ; FAST-NEXT: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[PHI]](s1), [[C2]], [[COPY4]]
 ; FAST-NEXT: S_SETPC_B64 undef $sgpr30_sgpr31, implicit [[SELECT]](s32)
 ; GREEDY-LABEL: name: phi_s1_scc_vcc_sbranch
 ; GREEDY: bb.0:
@@ -935,15 +927,15 @@
 ; GREEDY-NEXT: bb.1:
 ; GREEDY-NEXT: successors: %bb.2(0x80000000)
 ; GREEDY-NEXT: {{ $}}
- ; GREEDY-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
- ; GREEDY-NEXT: [[ICMP2:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY1]](s32), [[COPY4]]
+ ; GREEDY-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+ ; GREEDY-NEXT: [[ICMP2:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY1]](s32), [[C1]]
 ; GREEDY-NEXT: G_BR %bb.2
 ; GREEDY-NEXT: {{ $}}
 ; GREEDY-NEXT: bb.2:
 ; GREEDY-NEXT: [[PHI:%[0-9]+]]:vcc(s1) = G_PHI [[COPY3]](s1), %bb.0, [[ICMP2]](s1), %bb.1
- ; GREEDY-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
- ; GREEDY-NEXT: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[COPY]](s32)
- ; GREEDY-NEXT: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[PHI]](s1), [[COPY5]], [[COPY6]]
+ ; GREEDY-NEXT: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+ ; GREEDY-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY [[COPY]](s32)
+ ; GREEDY-NEXT: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[PHI]](s1), [[C2]], [[COPY4]]
 ; GREEDY-NEXT: S_SETPC_B64 undef $sgpr30_sgpr31, implicit [[SELECT]](s32)
 bb.0:
 successors: %bb.1, %bb.2
@@ -986,8 +978,8 @@
 ; FAST-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr0
 ; FAST-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr1
 ; FAST-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
- ; FAST-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
- ; FAST-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY]](s32), [[COPY3]]
+ ; FAST-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+ ; FAST-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY]](s32), [[C1]]
 ; FAST-NEXT: [[ICMP1:%[0-9]+]]:sgpr(s32) = G_ICMP intpred(eq), [[COPY2]](s32), [[C]]
 ; FAST-NEXT: [[TRUNC:%[0-9]+]]:sgpr(s1) = G_TRUNC [[ICMP1]](s32)
 ; FAST-NEXT: [[ZEXT:%[0-9]+]]:sgpr(s32) = G_ZEXT [[TRUNC]](s1)
@@ -999,14 +991,14 @@
 ; FAST-NEXT: {{ $}}
 ; FAST-NEXT: [[ICMP2:%[0-9]+]]:sgpr(s32) = G_ICMP intpred(eq), [[COPY1]](s32), [[C]]
 ; FAST-NEXT: [[TRUNC1:%[0-9]+]]:sgpr(s1) = G_TRUNC [[ICMP2]](s32)
- ; FAST-NEXT: [[COPY4:%[0-9]+]]:vcc(s1) = COPY [[TRUNC1]](s1)
+ ; FAST-NEXT: [[COPY3:%[0-9]+]]:vcc(s1) = COPY [[TRUNC1]](s1)
 ; FAST-NEXT: G_BR %bb.2
 ; FAST-NEXT: {{ $}}
 ; FAST-NEXT: bb.2:
- ; FAST-NEXT: [[PHI:%[0-9]+]]:vcc(s1) = G_PHI [[ICMP]](s1), %bb.0, [[COPY4]](s1), %bb.1
- ; FAST-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
- ; FAST-NEXT: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[COPY1]](s32)
- ; FAST-NEXT: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[PHI]](s1), [[COPY5]], [[COPY6]]
+ ; FAST-NEXT: [[PHI:%[0-9]+]]:vcc(s1) = G_PHI [[ICMP]](s1), %bb.0, [[COPY3]](s1), %bb.1
+ ; FAST-NEXT: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+ ; FAST-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY [[COPY1]](s32)
+ ; FAST-NEXT: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[PHI]](s1), [[C2]], [[COPY4]]
 ; FAST-NEXT: S_SETPC_B64 undef $sgpr30_sgpr31, implicit [[SELECT]](s32)
 ; GREEDY-LABEL: name: phi_s1_vcc_scc_sbranch
 ; GREEDY: bb.0:
@@ -1017,8 +1009,8 @@
 ; GREEDY-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr0
 ; GREEDY-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr1
 ; GREEDY-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
- ; GREEDY-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
- ; GREEDY-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY]](s32), [[COPY3]]
+ ; GREEDY-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+ ; GREEDY-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY]](s32), [[C1]]
 ; GREEDY-NEXT: [[ICMP1:%[0-9]+]]:sgpr(s32) = G_ICMP intpred(eq), [[COPY2]](s32), [[C]]
 ; GREEDY-NEXT: [[TRUNC:%[0-9]+]]:sgpr(s1) = G_TRUNC [[ICMP1]](s32)
 ; GREEDY-NEXT: [[ZEXT:%[0-9]+]]:sgpr(s32) = G_ZEXT [[TRUNC]](s1)
@@ -1030,14 +1022,14 @@
 ; GREEDY-NEXT: {{ $}}
 ; GREEDY-NEXT: [[ICMP2:%[0-9]+]]:sgpr(s32) = G_ICMP intpred(eq), [[COPY1]](s32), [[C]]
 ; GREEDY-NEXT: [[TRUNC1:%[0-9]+]]:sgpr(s1) = G_TRUNC [[ICMP2]](s32)
- ; GREEDY-NEXT: [[COPY4:%[0-9]+]]:vcc(s1) = COPY [[TRUNC1]](s1)
+ ; GREEDY-NEXT: [[COPY3:%[0-9]+]]:vcc(s1) = COPY [[TRUNC1]](s1)
 ; GREEDY-NEXT: G_BR %bb.2
 ; GREEDY-NEXT: {{ $}}
 ; GREEDY-NEXT: bb.2:
- ; GREEDY-NEXT: [[PHI:%[0-9]+]]:vcc(s1) = G_PHI [[ICMP]](s1), %bb.0, [[COPY4]](s1), %bb.1
- ; GREEDY-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
- ; GREEDY-NEXT: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[COPY1]](s32)
- ; GREEDY-NEXT: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[PHI]](s1), [[COPY5]], [[COPY6]]
+ ; GREEDY-NEXT: [[PHI:%[0-9]+]]:vcc(s1) = G_PHI [[ICMP]](s1), %bb.0, [[COPY3]](s1), %bb.1
+ ; GREEDY-NEXT: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+ ; GREEDY-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY [[COPY1]](s32)
+ ; GREEDY-NEXT: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[PHI]](s1), [[C2]], [[COPY4]]
 ; GREEDY-NEXT: S_SETPC_B64 undef $sgpr30_sgpr31, implicit [[SELECT]](s32)
 bb.0:
 successors: %bb.1, %bb.2
@@ -1080,8 +1072,8 @@
 ; FAST-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1
 ; FAST-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr0
 ; FAST-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
- ; FAST-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
- ; FAST-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY]](s32), [[COPY3]]
+ ; FAST-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+ ; FAST-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY]](s32), [[C1]]
 ; FAST-NEXT: [[ICMP1:%[0-9]+]]:sgpr(s32) = G_ICMP intpred(eq), [[COPY2]](s32), [[C]]
 ; FAST-NEXT: [[TRUNC:%[0-9]+]]:sgpr(s1) = G_TRUNC [[ICMP1]](s32)
 ; FAST-NEXT: [[ZEXT:%[0-9]+]]:sgpr(s32) = G_ZEXT [[TRUNC]](s1)
@@ -1091,14 +1083,14 @@
 ; FAST-NEXT: bb.1:
 ; FAST-NEXT: successors: %bb.2(0x80000000)
 ; FAST-NEXT: {{ $}}
- ; FAST-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
- ; FAST-NEXT: [[ICMP2:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY1]](s32), [[COPY4]]
+ ; FAST-NEXT: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+ ; FAST-NEXT: [[ICMP2:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY1]](s32), [[C2]]
 ; FAST-NEXT: G_BR %bb.2
 ; FAST-NEXT: {{ $}}
 ; FAST-NEXT: bb.2:
 ; FAST-NEXT: [[PHI:%[0-9]+]]:vcc(s1) = G_PHI [[ICMP]](s1), %bb.0, [[ICMP2]](s1), %bb.1
- ; FAST-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
- ; FAST-NEXT: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[PHI]](s1), [[COPY5]], [[COPY]]
+ ; FAST-NEXT: [[C3:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+ ; FAST-NEXT: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[PHI]](s1), [[C3]], [[COPY]]
 ; FAST-NEXT: S_SETPC_B64 undef $sgpr30_sgpr31, implicit [[SELECT]](s32)
 ; GREEDY-LABEL: name: phi_s1_vcc_vcc_sbranch
 ; GREEDY: bb.0:
@@ -1109,8 +1101,8 @@
 ; GREEDY-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1
 ; GREEDY-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr0
 ; GREEDY-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
- ; GREEDY-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
- ; GREEDY-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY]](s32), [[COPY3]]
+ ; GREEDY-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+ ; GREEDY-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY]](s32), [[C1]]
 ; GREEDY-NEXT: [[ICMP1:%[0-9]+]]:sgpr(s32) = G_ICMP intpred(eq), [[COPY2]](s32), [[C]]
 ; GREEDY-NEXT: [[TRUNC:%[0-9]+]]:sgpr(s1) = G_TRUNC [[ICMP1]](s32)
 ; GREEDY-NEXT: [[ZEXT:%[0-9]+]]:sgpr(s32) = G_ZEXT [[TRUNC]](s1)
@@ -1120,14 +1112,14 @@
 ; GREEDY-NEXT: bb.1:
 ; GREEDY-NEXT: successors: %bb.2(0x80000000)
 ; GREEDY-NEXT: {{ $}}
- ; GREEDY-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
- ; GREEDY-NEXT: [[ICMP2:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY1]](s32), [[COPY4]]
+ ; GREEDY-NEXT: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+ ; GREEDY-NEXT: [[ICMP2:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY1]](s32), [[C2]]
 ; GREEDY-NEXT: G_BR %bb.2
 ; GREEDY-NEXT: {{ $}}
 ; GREEDY-NEXT: bb.2:
 ; GREEDY-NEXT: [[PHI:%[0-9]+]]:vcc(s1) = G_PHI [[ICMP]](s1), %bb.0, [[ICMP2]](s1), %bb.1
- ; GREEDY-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
- ; GREEDY-NEXT: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[PHI]](s1), [[COPY5]], [[COPY]]
+ ; GREEDY-NEXT: [[C3:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+ ; GREEDY-NEXT: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[PHI]](s1), [[C3]], [[COPY]]
 ; GREEDY-NEXT: S_SETPC_B64 undef $sgpr30_sgpr31, implicit [[SELECT]](s32)
 bb.0:
 successors: %bb.1, %bb.2
@@ -1378,9 +1370,9 @@
 ; FAST-NEXT: [[PHI:%[0-9]+]]:vgpr(s32) = G_PHI [[ANYEXT]](s32), %bb.0, [[ANYEXT1]](s32), %bb.1
 ; FAST-NEXT: [[TRUNC3:%[0-9]+]]:vgpr(s1) = G_TRUNC [[PHI]](s32)
 ; FAST-NEXT: [[COPY3:%[0-9]+]]:vcc(s1) = COPY [[TRUNC3]](s1)
- ; FAST-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
- ; FAST-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[COPY]](s32)
- ; FAST-NEXT: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[COPY3]](s1), [[COPY4]], [[COPY5]]
+ ; FAST-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+ ; FAST-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY [[COPY]](s32)
+ ; FAST-NEXT: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[COPY3]](s1), [[C1]], [[COPY4]]
 ; FAST-NEXT: S_SETPC_B64 undef $sgpr30_sgpr31, implicit [[SELECT]](s32)
 ; GREEDY-LABEL: name: phi_s1_scc_v_sbranch
 ; GREEDY: bb.0:
@@ -1411,9 +1403,9 @@
 ; GREEDY-NEXT: [[PHI:%[0-9]+]]:vgpr(s32) = G_PHI [[ANYEXT]](s32), %bb.0, [[ANYEXT1]](s32), %bb.1
 ; GREEDY-NEXT: [[TRUNC3:%[0-9]+]]:vgpr(s1) = G_TRUNC [[PHI]](s32)
 ; GREEDY-NEXT: [[COPY3:%[0-9]+]]:vcc(s1) = COPY [[TRUNC3]](s1)
- ; GREEDY-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
- ; GREEDY-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[COPY]](s32)
- ; GREEDY-NEXT: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[COPY3]](s1), [[COPY4]], [[COPY5]]
+ ; GREEDY-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+ ; GREEDY-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY [[COPY]](s32)
+ ; GREEDY-NEXT: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[COPY3]](s1), [[C1]], [[COPY4]]
 ; GREEDY-NEXT: S_SETPC_B64 undef $sgpr30_sgpr31, implicit [[SELECT]](s32)
 bb.0:
 successors: %bb.1, %bb.2
@@ -1476,8 +1468,8 @@
 ; FAST-NEXT: [[PHI:%[0-9]+]]:vgpr(s32) = G_PHI [[ANYEXT]](s32), %bb.0, [[ANYEXT1]](s32), %bb.1
 ; FAST-NEXT: [[TRUNC3:%[0-9]+]]:vgpr(s1) = G_TRUNC [[PHI]](s32)
 ; FAST-NEXT: [[COPY3:%[0-9]+]]:vcc(s1) = COPY [[TRUNC3]](s1)
- ; FAST-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
- ; FAST-NEXT: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[COPY3]](s1), [[COPY4]], [[COPY]]
+ ; FAST-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+ ; FAST-NEXT: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[COPY3]](s1), [[C1]], [[COPY]]
 ; FAST-NEXT: S_SETPC_B64 undef $sgpr30_sgpr31, implicit [[SELECT]](s32)
 ; GREEDY-LABEL: name: phi_s1_v_scc_sbranch
 ; GREEDY: bb.0:
@@ -1508,8 +1500,8 @@
 ; GREEDY-NEXT: [[PHI:%[0-9]+]]:vgpr(s32) = G_PHI [[ANYEXT]](s32), %bb.0, [[ANYEXT1]](s32), %bb.1
 ; GREEDY-NEXT: [[TRUNC3:%[0-9]+]]:vgpr(s1) = G_TRUNC [[PHI]](s32)
 ; GREEDY-NEXT: [[COPY3:%[0-9]+]]:vcc(s1) = COPY [[TRUNC3]](s1)
- ; GREEDY-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
- ; GREEDY-NEXT: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[COPY3]](s1), [[COPY4]], [[COPY]]
+ ; GREEDY-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+ ; GREEDY-NEXT: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[COPY3]](s1), [[C1]], [[COPY]]
 ; GREEDY-NEXT: S_SETPC_B64 undef $sgpr30_sgpr31, implicit [[SELECT]](s32)
 bb.0:
 successors: %bb.1, %bb.2
@@ -1552,8 +1544,8 @@
 ; FAST-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr0
 ; FAST-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr1
 ; FAST-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
- ; FAST-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
- ; FAST-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY]](s32), [[COPY3]]
+ ; FAST-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+ ; FAST-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY]](s32), [[C1]]
 ; FAST-NEXT: [[ICMP1:%[0-9]+]]:sgpr(s32) = G_ICMP intpred(eq), [[COPY2]](s32), [[C]]
 ; FAST-NEXT: [[TRUNC:%[0-9]+]]:sgpr(s1) = G_TRUNC [[ICMP1]](s32)
 ; FAST-NEXT: [[ZEXT:%[0-9]+]]:sgpr(s32) = G_ZEXT [[TRUNC]](s1)
@@ -1564,13 +1556,13 @@
 ; FAST-NEXT: successors: %bb.2(0x80000000)
 ; FAST-NEXT: {{ $}}
 ; FAST-NEXT: [[TRUNC1:%[0-9]+]]:sgpr(s1) = G_TRUNC [[COPY1]](s32)
- ; FAST-NEXT: [[COPY4:%[0-9]+]]:vcc(s1) = COPY [[TRUNC1]](s1)
+ ; FAST-NEXT: [[COPY3:%[0-9]+]]:vcc(s1) = COPY [[TRUNC1]](s1)
 ; FAST-NEXT: G_BR %bb.2
 ; FAST-NEXT: {{ $}}
 ; FAST-NEXT: bb.2:
- ; FAST-NEXT:
[[PHI:%[0-9]+]]:vcc(s1) = G_PHI [[ICMP]](s1), %bb.0, [[COPY4]](s1), %bb.1 - ; FAST-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) - ; FAST-NEXT: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[PHI]](s1), [[COPY5]], [[COPY]] + ; FAST-NEXT: [[PHI:%[0-9]+]]:vcc(s1) = G_PHI [[ICMP]](s1), %bb.0, [[COPY3]](s1), %bb.1 + ; FAST-NEXT: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; FAST-NEXT: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[PHI]](s1), [[C2]], [[COPY]] ; FAST-NEXT: S_SETPC_B64 undef $sgpr30_sgpr31, implicit [[SELECT]](s32) ; GREEDY-LABEL: name: phi_s1_vcc_s_sbranch ; GREEDY: bb.0: @@ -1581,8 +1573,8 @@ ; GREEDY-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 ; GREEDY-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr1 ; GREEDY-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 - ; GREEDY-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) - ; GREEDY-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY]](s32), [[COPY3]] + ; GREEDY-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; GREEDY-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY]](s32), [[C1]] ; GREEDY-NEXT: [[ICMP1:%[0-9]+]]:sgpr(s32) = G_ICMP intpred(eq), [[COPY2]](s32), [[C]] ; GREEDY-NEXT: [[TRUNC:%[0-9]+]]:sgpr(s1) = G_TRUNC [[ICMP1]](s32) ; GREEDY-NEXT: [[ZEXT:%[0-9]+]]:sgpr(s32) = G_ZEXT [[TRUNC]](s1) @@ -1593,13 +1585,13 @@ ; GREEDY-NEXT: successors: %bb.2(0x80000000) ; GREEDY-NEXT: {{ $}} ; GREEDY-NEXT: [[TRUNC1:%[0-9]+]]:sgpr(s1) = G_TRUNC [[COPY1]](s32) - ; GREEDY-NEXT: [[COPY4:%[0-9]+]]:vcc(s1) = COPY [[TRUNC1]](s1) + ; GREEDY-NEXT: [[COPY3:%[0-9]+]]:vcc(s1) = COPY [[TRUNC1]](s1) ; GREEDY-NEXT: G_BR %bb.2 ; GREEDY-NEXT: {{ $}} ; GREEDY-NEXT: bb.2: - ; GREEDY-NEXT: [[PHI:%[0-9]+]]:vcc(s1) = G_PHI [[ICMP]](s1), %bb.0, [[COPY4]](s1), %bb.1 - ; GREEDY-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) - ; GREEDY-NEXT: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[PHI]](s1), [[COPY5]], [[COPY]] + ; GREEDY-NEXT: [[PHI:%[0-9]+]]:vcc(s1) = G_PHI [[ICMP]](s1), %bb.0, [[COPY3]](s1), %bb.1 + ; GREEDY-NEXT: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; GREEDY-NEXT: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[PHI]](s1), [[C2]], [[COPY]] ; GREEDY-NEXT: S_SETPC_B64 undef $sgpr30_sgpr31, implicit [[SELECT]](s32) bb.0: successors: %bb.1, %bb.2 @@ -1653,15 +1645,15 @@ ; FAST-NEXT: bb.1: ; FAST-NEXT: successors: %bb.2(0x80000000) ; FAST-NEXT: {{ $}} - ; FAST-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) - ; FAST-NEXT: [[ICMP1:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY1]](s32), [[COPY4]] + ; FAST-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; FAST-NEXT: [[ICMP1:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY1]](s32), [[C1]] ; FAST-NEXT: G_BR %bb.2 ; FAST-NEXT: {{ $}} ; FAST-NEXT: bb.2: ; FAST-NEXT: [[PHI:%[0-9]+]]:vcc(s1) = G_PHI [[COPY3]](s1), %bb.0, [[ICMP1]](s1), %bb.1 - ; FAST-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) - ; FAST-NEXT: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[COPY]](s32) - ; FAST-NEXT: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[PHI]](s1), [[COPY5]], [[COPY6]] + ; FAST-NEXT: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; FAST-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY [[COPY]](s32) + ; FAST-NEXT: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[PHI]](s1), [[C2]], [[COPY4]] ; FAST-NEXT: S_SETPC_B64 undef $sgpr30_sgpr31, implicit [[SELECT]](s32) ; GREEDY-LABEL: name: phi_s1_s_vcc_sbranch ; GREEDY: bb.0: @@ -1683,15 +1675,15 @@ ; GREEDY-NEXT: bb.1: ; GREEDY-NEXT: successors: %bb.2(0x80000000) ; GREEDY-NEXT: {{ $}} - ; GREEDY-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) - ; GREEDY-NEXT: 
[[ICMP1:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY1]](s32), [[COPY4]] + ; GREEDY-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; GREEDY-NEXT: [[ICMP1:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY1]](s32), [[C1]] ; GREEDY-NEXT: G_BR %bb.2 ; GREEDY-NEXT: {{ $}} ; GREEDY-NEXT: bb.2: ; GREEDY-NEXT: [[PHI:%[0-9]+]]:vcc(s1) = G_PHI [[COPY3]](s1), %bb.0, [[ICMP1]](s1), %bb.1 - ; GREEDY-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) - ; GREEDY-NEXT: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[COPY]](s32) - ; GREEDY-NEXT: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[PHI]](s1), [[COPY5]], [[COPY6]] + ; GREEDY-NEXT: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; GREEDY-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY [[COPY]](s32) + ; GREEDY-NEXT: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[PHI]](s1), [[C2]], [[COPY4]] ; GREEDY-NEXT: S_SETPC_B64 undef $sgpr30_sgpr31, implicit [[SELECT]](s32) bb.0: successors: %bb.1, %bb.2 @@ -1734,14 +1726,14 @@ ; FAST-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 ; FAST-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 ; FAST-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 - ; FAST-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) - ; FAST-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY]](s32), [[COPY3]] + ; FAST-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; FAST-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY]](s32), [[C1]] ; FAST-NEXT: [[ICMP1:%[0-9]+]]:sgpr(s32) = G_ICMP intpred(eq), [[COPY2]](s32), [[C]] ; FAST-NEXT: [[TRUNC:%[0-9]+]]:sgpr(s1) = G_TRUNC [[ICMP1]](s32) ; FAST-NEXT: [[ZEXT:%[0-9]+]]:sgpr(s32) = G_ZEXT [[TRUNC]](s1) - ; FAST-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 1 - ; FAST-NEXT: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 - ; FAST-NEXT: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP]](s1), [[C1]], [[C2]] + ; FAST-NEXT: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 1 + ; FAST-NEXT: [[C3:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; FAST-NEXT: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP]](s1), [[C2]], [[C3]] ; FAST-NEXT: G_BRCOND [[ZEXT]](s32), %bb.1 ; FAST-NEXT: G_BR %bb.2 ; FAST-NEXT: {{ $}} @@ -1755,9 +1747,9 @@ ; FAST-NEXT: bb.2: ; FAST-NEXT: [[PHI:%[0-9]+]]:vgpr(s32) = G_PHI [[SELECT]](s32), %bb.0, [[ANYEXT]](s32), %bb.1 ; FAST-NEXT: [[TRUNC2:%[0-9]+]]:vgpr(s1) = G_TRUNC [[PHI]](s32) - ; FAST-NEXT: [[COPY4:%[0-9]+]]:vcc(s1) = COPY [[TRUNC2]](s1) - ; FAST-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) - ; FAST-NEXT: [[SELECT1:%[0-9]+]]:vgpr(s32) = G_SELECT [[COPY4]](s1), [[COPY5]], [[COPY]] + ; FAST-NEXT: [[COPY3:%[0-9]+]]:vcc(s1) = COPY [[TRUNC2]](s1) + ; FAST-NEXT: [[C4:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; FAST-NEXT: [[SELECT1:%[0-9]+]]:vgpr(s32) = G_SELECT [[COPY3]](s1), [[C4]], [[COPY]] ; FAST-NEXT: S_SETPC_B64 undef $sgpr30_sgpr31, implicit [[SELECT1]](s32) ; GREEDY-LABEL: name: phi_s1_vcc_v_sbranch ; GREEDY: bb.0: @@ -1768,14 +1760,14 @@ ; GREEDY-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 ; GREEDY-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 ; GREEDY-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 - ; GREEDY-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) - ; GREEDY-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY]](s32), [[COPY3]] + ; GREEDY-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; GREEDY-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY]](s32), [[C1]] ; GREEDY-NEXT: [[ICMP1:%[0-9]+]]:sgpr(s32) = G_ICMP intpred(eq), [[COPY2]](s32), [[C]] ; GREEDY-NEXT: [[TRUNC:%[0-9]+]]:sgpr(s1) = G_TRUNC [[ICMP1]](s32) ; GREEDY-NEXT: [[ZEXT:%[0-9]+]]:sgpr(s32) 
= G_ZEXT [[TRUNC]](s1) - ; GREEDY-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 1 - ; GREEDY-NEXT: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 - ; GREEDY-NEXT: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP]](s1), [[C1]], [[C2]] + ; GREEDY-NEXT: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 1 + ; GREEDY-NEXT: [[C3:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; GREEDY-NEXT: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP]](s1), [[C2]], [[C3]] ; GREEDY-NEXT: G_BRCOND [[ZEXT]](s32), %bb.1 ; GREEDY-NEXT: G_BR %bb.2 ; GREEDY-NEXT: {{ $}} @@ -1789,9 +1781,9 @@ ; GREEDY-NEXT: bb.2: ; GREEDY-NEXT: [[PHI:%[0-9]+]]:vgpr(s32) = G_PHI [[SELECT]](s32), %bb.0, [[ANYEXT]](s32), %bb.1 ; GREEDY-NEXT: [[TRUNC2:%[0-9]+]]:vgpr(s1) = G_TRUNC [[PHI]](s32) - ; GREEDY-NEXT: [[COPY4:%[0-9]+]]:vcc(s1) = COPY [[TRUNC2]](s1) - ; GREEDY-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) - ; GREEDY-NEXT: [[SELECT1:%[0-9]+]]:vgpr(s32) = G_SELECT [[COPY4]](s1), [[COPY5]], [[COPY]] + ; GREEDY-NEXT: [[COPY3:%[0-9]+]]:vcc(s1) = COPY [[TRUNC2]](s1) + ; GREEDY-NEXT: [[C4:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; GREEDY-NEXT: [[SELECT1:%[0-9]+]]:vgpr(s32) = G_SELECT [[COPY3]](s1), [[C4]], [[COPY]] ; GREEDY-NEXT: S_SETPC_B64 undef $sgpr30_sgpr31, implicit [[SELECT1]](s32) bb.0: successors: %bb.1, %bb.2 @@ -1845,19 +1837,19 @@ ; FAST-NEXT: bb.1: ; FAST-NEXT: successors: %bb.2(0x80000000) ; FAST-NEXT: {{ $}} - ; FAST-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) - ; FAST-NEXT: [[ICMP1:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY1]](s32), [[COPY3]] - ; FAST-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 1 - ; FAST-NEXT: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 - ; FAST-NEXT: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP1]](s1), [[C1]], [[C2]] + ; FAST-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; FAST-NEXT: [[ICMP1:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY1]](s32), [[C1]] + ; FAST-NEXT: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 1 + ; FAST-NEXT: [[C3:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; FAST-NEXT: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP1]](s1), [[C2]], [[C3]] ; FAST-NEXT: G_BR %bb.2 ; FAST-NEXT: {{ $}} ; FAST-NEXT: bb.2: ; FAST-NEXT: [[PHI:%[0-9]+]]:vgpr(s32) = G_PHI [[ANYEXT]](s32), %bb.0, [[SELECT]](s32), %bb.1 ; FAST-NEXT: [[TRUNC2:%[0-9]+]]:vgpr(s1) = G_TRUNC [[PHI]](s32) - ; FAST-NEXT: [[COPY4:%[0-9]+]]:vcc(s1) = COPY [[TRUNC2]](s1) - ; FAST-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) - ; FAST-NEXT: [[SELECT1:%[0-9]+]]:vgpr(s32) = G_SELECT [[COPY4]](s1), [[COPY5]], [[COPY]] + ; FAST-NEXT: [[COPY3:%[0-9]+]]:vcc(s1) = COPY [[TRUNC2]](s1) + ; FAST-NEXT: [[C4:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; FAST-NEXT: [[SELECT1:%[0-9]+]]:vgpr(s32) = G_SELECT [[COPY3]](s1), [[C4]], [[COPY]] ; FAST-NEXT: S_SETPC_B64 undef $sgpr30_sgpr31, implicit [[SELECT1]](s32) ; GREEDY-LABEL: name: phi_s1_v_vcc_sbranch ; GREEDY: bb.0: @@ -1879,19 +1871,19 @@ ; GREEDY-NEXT: bb.1: ; GREEDY-NEXT: successors: %bb.2(0x80000000) ; GREEDY-NEXT: {{ $}} - ; GREEDY-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) - ; GREEDY-NEXT: [[ICMP1:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY1]](s32), [[COPY3]] - ; GREEDY-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 1 - ; GREEDY-NEXT: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 - ; GREEDY-NEXT: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP1]](s1), [[C1]], [[C2]] + ; GREEDY-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; GREEDY-NEXT: [[ICMP1:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY1]](s32), [[C1]] + ; GREEDY-NEXT: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 1 + ; 
GREEDY-NEXT: [[C3:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; GREEDY-NEXT: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP1]](s1), [[C2]], [[C3]] ; GREEDY-NEXT: G_BR %bb.2 ; GREEDY-NEXT: {{ $}} ; GREEDY-NEXT: bb.2: ; GREEDY-NEXT: [[PHI:%[0-9]+]]:vgpr(s32) = G_PHI [[ANYEXT]](s32), %bb.0, [[SELECT]](s32), %bb.1 ; GREEDY-NEXT: [[TRUNC2:%[0-9]+]]:vgpr(s1) = G_TRUNC [[PHI]](s32) - ; GREEDY-NEXT: [[COPY4:%[0-9]+]]:vcc(s1) = COPY [[TRUNC2]](s1) - ; GREEDY-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) - ; GREEDY-NEXT: [[SELECT1:%[0-9]+]]:vgpr(s32) = G_SELECT [[COPY4]](s1), [[COPY5]], [[COPY]] + ; GREEDY-NEXT: [[COPY3:%[0-9]+]]:vcc(s1) = COPY [[TRUNC2]](s1) + ; GREEDY-NEXT: [[C4:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; GREEDY-NEXT: [[SELECT1:%[0-9]+]]:vgpr(s32) = G_SELECT [[COPY3]](s1), [[C4]], [[COPY]] ; GREEDY-NEXT: S_SETPC_B64 undef $sgpr30_sgpr31, implicit [[SELECT1]](s32) bb.0: successors: %bb.1, %bb.2 @@ -1953,8 +1945,8 @@ ; FAST-NEXT: [[PHI:%[0-9]+]]:vgpr(s32) = G_PHI [[ANYEXT]](s32), %bb.0, [[ANYEXT1]](s32), %bb.1 ; FAST-NEXT: [[TRUNC3:%[0-9]+]]:vgpr(s1) = G_TRUNC [[PHI]](s32) ; FAST-NEXT: [[COPY3:%[0-9]+]]:vcc(s1) = COPY [[TRUNC3]](s1) - ; FAST-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) - ; FAST-NEXT: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[COPY3]](s1), [[COPY4]], [[COPY]] + ; FAST-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; FAST-NEXT: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[COPY3]](s1), [[C1]], [[COPY]] ; FAST-NEXT: S_SETPC_B64 undef $sgpr30_sgpr31, implicit [[SELECT]](s32) ; GREEDY-LABEL: name: phi_s1_v_s_sbranch ; GREEDY: bb.0: @@ -1984,8 +1976,8 @@ ; GREEDY-NEXT: [[PHI:%[0-9]+]]:vgpr(s32) = G_PHI [[ANYEXT]](s32), %bb.0, [[ANYEXT1]](s32), %bb.1 ; GREEDY-NEXT: [[TRUNC3:%[0-9]+]]:vgpr(s1) = G_TRUNC [[PHI]](s32) ; GREEDY-NEXT: [[COPY3:%[0-9]+]]:vcc(s1) = COPY [[TRUNC3]](s1) - ; GREEDY-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) - ; GREEDY-NEXT: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[COPY3]](s1), [[COPY4]], [[COPY]] + ; GREEDY-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; GREEDY-NEXT: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[COPY3]](s1), [[C1]], [[COPY]] ; GREEDY-NEXT: S_SETPC_B64 undef $sgpr30_sgpr31, implicit [[SELECT]](s32) bb.0: successors: %bb.1, %bb.2 @@ -2047,9 +2039,9 @@ ; FAST-NEXT: [[PHI:%[0-9]+]]:vgpr(s32) = G_PHI [[ANYEXT]](s32), %bb.0, [[ANYEXT1]](s32), %bb.1 ; FAST-NEXT: [[TRUNC3:%[0-9]+]]:vgpr(s1) = G_TRUNC [[PHI]](s32) ; FAST-NEXT: [[COPY3:%[0-9]+]]:vcc(s1) = COPY [[TRUNC3]](s1) - ; FAST-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) - ; FAST-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[COPY]](s32) - ; FAST-NEXT: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[COPY3]](s1), [[COPY4]], [[COPY5]] + ; FAST-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; FAST-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY [[COPY]](s32) + ; FAST-NEXT: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[COPY3]](s1), [[C1]], [[COPY4]] ; FAST-NEXT: S_SETPC_B64 undef $sgpr30_sgpr31, implicit [[SELECT]](s32) ; GREEDY-LABEL: name: phi_s1_s_v_sbranch ; GREEDY: bb.0: @@ -2079,9 +2071,9 @@ ; GREEDY-NEXT: [[PHI:%[0-9]+]]:vgpr(s32) = G_PHI [[ANYEXT]](s32), %bb.0, [[ANYEXT1]](s32), %bb.1 ; GREEDY-NEXT: [[TRUNC3:%[0-9]+]]:vgpr(s1) = G_TRUNC [[PHI]](s32) ; GREEDY-NEXT: [[COPY3:%[0-9]+]]:vcc(s1) = COPY [[TRUNC3]](s1) - ; GREEDY-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) - ; GREEDY-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[COPY]](s32) - ; GREEDY-NEXT: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[COPY3]](s1), [[COPY4]], [[COPY5]] + ; GREEDY-NEXT: [[C1:%[0-9]+]]:vgpr(s32) 
= G_CONSTANT i32 0 + ; GREEDY-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY [[COPY]](s32) + ; GREEDY-NEXT: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[COPY3]](s1), [[C1]], [[COPY4]] ; GREEDY-NEXT: S_SETPC_B64 undef $sgpr30_sgpr31, implicit [[SELECT]](s32) bb.0: successors: %bb.1, %bb.2 @@ -2143,8 +2135,8 @@ ; FAST-NEXT: [[PHI:%[0-9]+]]:vgpr(s32) = G_PHI [[ANYEXT]](s32), %bb.0, [[ANYEXT1]](s32), %bb.1 ; FAST-NEXT: [[TRUNC3:%[0-9]+]]:vgpr(s1) = G_TRUNC [[PHI]](s32) ; FAST-NEXT: [[COPY3:%[0-9]+]]:vcc(s1) = COPY [[TRUNC3]](s1) - ; FAST-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) - ; FAST-NEXT: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[COPY3]](s1), [[COPY4]], [[COPY]] + ; FAST-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; FAST-NEXT: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[COPY3]](s1), [[C1]], [[COPY]] ; FAST-NEXT: S_SETPC_B64 undef $sgpr30_sgpr31, implicit [[SELECT]](s32) ; GREEDY-LABEL: name: phi_s1_v_v_sbranch ; GREEDY: bb.0: @@ -2174,8 +2166,8 @@ ; GREEDY-NEXT: [[PHI:%[0-9]+]]:vgpr(s32) = G_PHI [[ANYEXT]](s32), %bb.0, [[ANYEXT1]](s32), %bb.1 ; GREEDY-NEXT: [[TRUNC3:%[0-9]+]]:vgpr(s1) = G_TRUNC [[PHI]](s32) ; GREEDY-NEXT: [[COPY3:%[0-9]+]]:vcc(s1) = COPY [[TRUNC3]](s1) - ; GREEDY-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) - ; GREEDY-NEXT: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[COPY3]](s1), [[COPY4]], [[COPY]] + ; GREEDY-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; GREEDY-NEXT: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[COPY3]](s1), [[C1]], [[COPY]] ; GREEDY-NEXT: S_SETPC_B64 undef $sgpr30_sgpr31, implicit [[SELECT]](s32) bb.0: successors: %bb.1, %bb.2 Index: llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-sbfx.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-sbfx.mir +++ llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-sbfx.mir @@ -41,11 +41,9 @@ ; CHECK: liveins: $vgpr0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 - ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 10 - ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[C1]](s32) - ; CHECK-NEXT: [[SBFX:%[0-9]+]]:vgpr(s32) = G_SBFX [[COPY]], [[COPY1]](s32), [[COPY2]] + ; CHECK-NEXT: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 10 + ; CHECK-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 4 + ; CHECK-NEXT: [[SBFX:%[0-9]+]]:vgpr(s32) = G_SBFX [[COPY]], [[C]](s32), [[C1]] ; CHECK-NEXT: $vgpr0 = COPY [[SBFX]](s32) %0:_(s32) = COPY $vgpr0 %1:_(s32) = G_CONSTANT i32 10 @@ -151,14 +149,12 @@ ; CHECK: liveins: $vgpr0_vgpr1 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s64) = COPY $vgpr0_vgpr1 - ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 31 - ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[C1]](s32) - ; CHECK-NEXT: [[ASHR:%[0-9]+]]:vgpr(s64) = G_ASHR [[COPY]], [[COPY1]](s32) + ; CHECK-NEXT: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 31 + ; CHECK-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 4 + ; CHECK-NEXT: [[ASHR:%[0-9]+]]:vgpr(s64) = G_ASHR [[COPY]], [[C]](s32) ; CHECK-NEXT: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[ASHR]](s64) ; CHECK-NEXT: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 - ; CHECK-NEXT: [[SBFX:%[0-9]+]]:vgpr(s32) = G_SBFX [[UV]], [[C2]](s32), [[COPY2]] + ; CHECK-NEXT: [[SBFX:%[0-9]+]]:vgpr(s32) = 
G_SBFX [[UV]], [[C2]](s32), [[C1]] ; CHECK-NEXT: [[C3:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 31 ; CHECK-NEXT: [[ASHR1:%[0-9]+]]:vgpr(s32) = G_ASHR [[SBFX]], [[C3]](s32) ; CHECK-NEXT: [[MV:%[0-9]+]]:vgpr(s64) = G_MERGE_VALUES [[SBFX]](s32), [[ASHR1]](s32) @@ -182,11 +178,9 @@ ; CHECK: liveins: $vgpr0_vgpr1 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s64) = COPY $vgpr0_vgpr1 - ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 8 - ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 40 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[C1]](s32) - ; CHECK-NEXT: [[ASHR:%[0-9]+]]:vgpr(s64) = G_ASHR [[COPY]], [[COPY1]](s32) + ; CHECK-NEXT: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 8 + ; CHECK-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 40 + ; CHECK-NEXT: [[ASHR:%[0-9]+]]:vgpr(s64) = G_ASHR [[COPY]], [[C]](s32) ; CHECK-NEXT: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[ASHR]](s64) ; CHECK-NEXT: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 ; CHECK-NEXT: [[C3:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 8 Index: llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-ubfx.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-ubfx.mir +++ llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-ubfx.mir @@ -41,11 +41,9 @@ ; CHECK: liveins: $vgpr0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 - ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 10 - ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[C1]](s32) - ; CHECK-NEXT: [[UBFX:%[0-9]+]]:vgpr(s32) = G_UBFX [[COPY]], [[COPY1]](s32), [[COPY2]] + ; CHECK-NEXT: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 10 + ; CHECK-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 4 + ; CHECK-NEXT: [[UBFX:%[0-9]+]]:vgpr(s32) = G_UBFX [[COPY]], [[C]](s32), [[C1]] ; CHECK-NEXT: $vgpr0 = COPY [[UBFX]](s32) %0:_(s32) = COPY $vgpr0 %1:_(s32) = G_CONSTANT i32 10 @@ -151,14 +149,12 @@ ; CHECK: liveins: $vgpr0_vgpr1 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s64) = COPY $vgpr0_vgpr1 - ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 31 - ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[C1]](s32) - ; CHECK-NEXT: [[LSHR:%[0-9]+]]:vgpr(s64) = G_LSHR [[COPY]], [[COPY1]](s32) + ; CHECK-NEXT: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 31 + ; CHECK-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 4 + ; CHECK-NEXT: [[LSHR:%[0-9]+]]:vgpr(s64) = G_LSHR [[COPY]], [[C]](s32) ; CHECK-NEXT: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[LSHR]](s64) ; CHECK-NEXT: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 - ; CHECK-NEXT: [[UBFX:%[0-9]+]]:vgpr(s32) = G_UBFX [[UV]], [[C2]](s32), [[COPY2]] + ; CHECK-NEXT: [[UBFX:%[0-9]+]]:vgpr(s32) = G_UBFX [[UV]], [[C2]](s32), [[C1]] ; CHECK-NEXT: [[MV:%[0-9]+]]:vgpr(s64) = G_MERGE_VALUES [[UBFX]](s32), [[C2]](s32) ; CHECK-NEXT: $vgpr0_vgpr1 = COPY [[MV]](s64) %0:_(s64) = COPY $vgpr0_vgpr1 @@ -180,11 +176,9 @@ ; CHECK: liveins: $vgpr0_vgpr1 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s64) = COPY $vgpr0_vgpr1 - ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 8 - ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 40 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) 
= COPY [[C]](s32) - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[C1]](s32) - ; CHECK-NEXT: [[LSHR:%[0-9]+]]:vgpr(s64) = G_LSHR [[COPY]], [[COPY1]](s32) + ; CHECK-NEXT: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 8 + ; CHECK-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 40 + ; CHECK-NEXT: [[LSHR:%[0-9]+]]:vgpr(s64) = G_LSHR [[COPY]], [[C]](s32) ; CHECK-NEXT: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[LSHR]](s64) ; CHECK-NEXT: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 ; CHECK-NEXT: [[C3:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 8 Index: llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-waterfall-agpr.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-waterfall-agpr.mir +++ llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-waterfall-agpr.mir @@ -21,25 +21,24 @@ ; CHECK-NEXT: %rsrc:sgpr(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 ; CHECK-NEXT: %agpr:agpr(s32) = COPY $agpr0 ; CHECK-NEXT: %voffset:vgpr(s32) = COPY $vgpr1 - ; CHECK-NEXT: %zero:sgpr(s32) = G_CONSTANT i32 0 - ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY %zero(s32) + ; CHECK-NEXT: %zero:vgpr(s32) = G_CONSTANT i32 0 ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY %agpr(s32) + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY %agpr(s32) ; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: .1: ; CHECK-NEXT: successors: %bb.2(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.0, %9, %bb.2 - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[COPY1]](s32), implicit $exec - ; CHECK-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[V_READFIRSTLANE_B32_]](s32), [[COPY1]] + ; CHECK-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.0, %8, %bb.2 + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[COPY]](s32), implicit $exec + ; CHECK-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[V_READFIRSTLANE_B32_]](s32), [[COPY]] ; CHECK-NEXT: [[INT:%[0-9]+]]:sreg_64_xexec(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.ballot), [[ICMP]](s1) ; CHECK-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[INT]](s64), implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: .2: ; CHECK-NEXT: successors: %bb.3(0x40000000), %bb.1(0x40000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: G_AMDGPU_BUFFER_STORE %val(s32), %rsrc(<4 x s32>), [[COPY]](s32), %voffset, [[V_READFIRSTLANE_B32_]], 0, 0, 0 :: (dereferenceable store (s32), addrspace 4) + ; CHECK-NEXT: G_AMDGPU_BUFFER_STORE %val(s32), %rsrc(<4 x s32>), %zero(s32), %voffset, [[V_READFIRSTLANE_B32_]], 0, 0, 0 :: (dereferenceable store (s32), addrspace 4) ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec ; CHECK-NEXT: {{ $}} Index: llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-xor.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-xor.mir +++ llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-xor.mir @@ -119,11 +119,10 @@ ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 - ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) 
= COPY [[C]](s32) - ; CHECK-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(ne), [[COPY]](s32), [[COPY2]] - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) - ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:vcc(s1) = G_ICMP intpred(ne), [[COPY1]](s32), [[COPY3]] + ; CHECK-NEXT: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; CHECK-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; CHECK-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(ne), [[COPY]](s32), [[C1]] + ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:vcc(s1) = G_ICMP intpred(ne), [[COPY1]](s32), [[C]] ; CHECK-NEXT: [[XOR:%[0-9]+]]:vcc(s1) = G_XOR [[ICMP]], [[ICMP1]] ; CHECK-NEXT: S_NOP 0, implicit [[XOR]](s1) %0:_(s32) = COPY $vgpr0 @@ -150,10 +149,10 @@ ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 ; CHECK-NEXT: [[ICMP:%[0-9]+]]:sgpr(s32) = G_ICMP intpred(ne), [[COPY]](s32), [[C]] ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:sgpr(s1) = G_TRUNC [[ICMP]](s32) - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) - ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:vcc(s1) = G_ICMP intpred(ne), [[COPY1]](s32), [[COPY2]] - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vcc(s1) = COPY [[TRUNC]](s1) - ; CHECK-NEXT: [[XOR:%[0-9]+]]:vcc(s1) = G_XOR [[COPY3]], [[ICMP1]] + ; CHECK-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:vcc(s1) = G_ICMP intpred(ne), [[COPY1]](s32), [[C1]] + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vcc(s1) = COPY [[TRUNC]](s1) + ; CHECK-NEXT: [[XOR:%[0-9]+]]:vcc(s1) = G_XOR [[COPY2]], [[ICMP1]] ; CHECK-NEXT: S_NOP 0, implicit [[XOR]](s1) %0:_(s32) = COPY $sgpr0 %1:_(s32) = COPY $vgpr0 @@ -830,13 +829,12 @@ ; CHECK: liveins: $vgpr0, $vgpr1 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 - ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) - ; CHECK-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(ne), [[COPY]](s32), [[COPY1]] + ; CHECK-NEXT: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; CHECK-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(ne), [[COPY]](s32), [[C]] ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1 ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:sgpr(s1) = G_TRUNC [[C1]](s32) - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vcc(s1) = COPY [[TRUNC]](s1) - ; CHECK-NEXT: [[XOR:%[0-9]+]]:vcc(s1) = G_XOR [[ICMP]], [[COPY2]] + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vcc(s1) = COPY [[TRUNC]](s1) + ; CHECK-NEXT: [[XOR:%[0-9]+]]:vcc(s1) = G_XOR [[ICMP]], [[COPY1]] ; CHECK-NEXT: S_NOP 0, implicit [[XOR]](s1) %0:_(s32) = COPY $vgpr0 %1:_(s32) = G_CONSTANT i32 0 Index: llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll @@ -233,14 +233,12 @@ ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v2, 8, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 24, v0 -; GFX6-NEXT: s_brev_b32 s5, 1 ; GFX6-NEXT: v_min_i32_e32 v5, 0, v0 ; GFX6-NEXT: v_lshrrev_b32_e32 v3, 8, v1 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 24, v1 -; GFX6-NEXT: s_brev_b32 s4, -2 ; GFX6-NEXT: v_max_i32_e32 v4, 0, v0 -; GFX6-NEXT: v_sub_i32_e32 v5, vcc, s5, v5 -; GFX6-NEXT: v_sub_i32_e32 v4, vcc, s4, v4 +; GFX6-NEXT: v_sub_i32_e32 v5, vcc, 0x80000000, v5 +; GFX6-NEXT: v_sub_i32_e32 v4, vcc, 0x7fffffff, v4 ; GFX6-NEXT: v_max_i32_e32 v1, v5, v1 ; GFX6-NEXT: v_min_i32_e32 v1, v1, v4 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 @@ -248,8 +246,8 @@ ; GFX6-NEXT: v_min_i32_e32 v4, 0, v1 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 24, v3 ; GFX6-NEXT: 
v_max_i32_e32 v3, 0, v1 -; GFX6-NEXT: v_sub_i32_e32 v4, vcc, s5, v4 -; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s4, v3 +; GFX6-NEXT: v_sub_i32_e32 v4, vcc, 0x80000000, v4 +; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 0x7fffffff, v3 ; GFX6-NEXT: v_max_i32_e32 v2, v4, v2 ; GFX6-NEXT: v_min_i32_e32 v2, v2, v3 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v2 @@ -302,8 +300,8 @@ ; GFX9-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1] ; GFX9-NEXT: v_pk_add_i16 v0, v0, v1 clamp ; GFX9-NEXT: v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1] -; GFX9-NEXT: s_movk_i32 s4, 0xff -; GFX9-NEXT: v_and_b32_sdwa v1, v0, s4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v1, 0xff +; GFX9-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -315,14 +313,14 @@ ; GFX10-NEXT: v_lshrrev_b32_e32 v3, 8, v1 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX10-NEXT: s_movk_i32 s4, 0xff ; GFX10-NEXT: v_lshl_or_b32 v0, v2, 16, v0 ; GFX10-NEXT: v_lshl_or_b32 v1, v3, 16, v1 ; GFX10-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1] ; GFX10-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1] ; GFX10-NEXT: v_pk_add_i16 v0, v0, v1 clamp +; GFX10-NEXT: v_mov_b32_e32 v1, 0xff ; GFX10-NEXT: v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1] -; GFX10-NEXT: v_and_b32_sdwa v1, v0, s4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -444,8 +442,8 @@ ; GFX9-NEXT: v_mov_b32_e32 v0, s1 ; GFX9-NEXT: v_pk_add_i16 v0, s0, v0 clamp ; GFX9-NEXT: v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1] -; GFX9-NEXT: s_movk_i32 s0, 0xff -; GFX9-NEXT: v_and_b32_sdwa v1, v0, s0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v1, 0xff +; GFX9-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_readfirstlane_b32 s0, v0 ; GFX9-NEXT: ; return to shader part epilog @@ -464,10 +462,10 @@ ; GFX10-NEXT: s_lshl_b32 s3, s3, 8 ; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s2 ; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s3 +; GFX10-NEXT: v_mov_b32_e32 v1, 0xff ; GFX10-NEXT: v_pk_add_i16 v0, s0, s1 clamp -; GFX10-NEXT: s_movk_i32 s0, 0xff ; GFX10-NEXT: v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1] -; GFX10-NEXT: v_and_b32_sdwa v1, v0, s0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-NEXT: ; return to shader part epilog @@ -510,47 +508,45 @@ ; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v0 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 24, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 24, v0 -; GFX6-NEXT: s_brev_b32 s5, 1 -; GFX6-NEXT: v_min_i32_e32 v10, 0, v0 +; GFX6-NEXT: v_bfrev_b32_e32 v9, 1 +; GFX6-NEXT: v_min_i32_e32 v11, 0, v0 ; GFX6-NEXT: v_lshrrev_b32_e32 v5, 8, v1 ; GFX6-NEXT: v_lshrrev_b32_e32 v6, 16, v1 
; GFX6-NEXT: v_lshrrev_b32_e32 v7, 24, v1 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 24, v1 -; GFX6-NEXT: s_brev_b32 s4, -2 -; GFX6-NEXT: v_max_i32_e32 v8, 0, v0 -; GFX6-NEXT: v_sub_i32_e32 v10, vcc, s5, v10 -; GFX6-NEXT: v_sub_i32_e32 v8, vcc, s4, v8 -; GFX6-NEXT: v_max_i32_e32 v1, v10, v1 -; GFX6-NEXT: v_min_i32_e32 v1, v1, v8 +; GFX6-NEXT: v_max_i32_e32 v10, 0, v0 +; GFX6-NEXT: v_sub_i32_e32 v11, vcc, v9, v11 +; GFX6-NEXT: v_sub_i32_e32 v10, vcc, 0x7fffffff, v10 +; GFX6-NEXT: v_max_i32_e32 v1, v11, v1 +; GFX6-NEXT: v_min_i32_e32 v1, v1, v10 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 24, v2 -; GFX6-NEXT: v_min_i32_e32 v8, 0, v1 +; GFX6-NEXT: v_min_i32_e32 v10, 0, v1 +; GFX6-NEXT: v_bfrev_b32_e32 v8, -2 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 24, v5 ; GFX6-NEXT: v_max_i32_e32 v5, 0, v1 -; GFX6-NEXT: v_sub_i32_e32 v8, vcc, s5, v8 -; GFX6-NEXT: v_sub_i32_e32 v5, vcc, s4, v5 -; GFX6-NEXT: v_max_i32_e32 v2, v8, v2 +; GFX6-NEXT: v_sub_i32_e32 v10, vcc, v9, v10 +; GFX6-NEXT: v_sub_i32_e32 v5, vcc, v8, v5 +; GFX6-NEXT: v_max_i32_e32 v2, v10, v2 ; GFX6-NEXT: v_min_i32_e32 v2, v2, v5 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 24, v3 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 24, v6 ; GFX6-NEXT: v_min_i32_e32 v6, 0, v2 -; GFX6-NEXT: v_bfrev_b32_e32 v9, -2 ; GFX6-NEXT: v_max_i32_e32 v5, 0, v2 -; GFX6-NEXT: v_sub_i32_e32 v6, vcc, s5, v6 -; GFX6-NEXT: v_sub_i32_e32 v5, vcc, v9, v5 +; GFX6-NEXT: v_sub_i32_e32 v6, vcc, v9, v6 +; GFX6-NEXT: v_sub_i32_e32 v5, vcc, v8, v5 ; GFX6-NEXT: v_max_i32_e32 v3, v6, v3 ; GFX6-NEXT: v_min_i32_e32 v3, v3, v5 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 24, v4 -; GFX6-NEXT: v_bfrev_b32_e32 v11, 1 ; GFX6-NEXT: v_min_i32_e32 v6, 0, v3 ; GFX6-NEXT: v_ashrrev_i32_e32 v1, 24, v1 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 24, v7 ; GFX6-NEXT: v_max_i32_e32 v5, 0, v3 -; GFX6-NEXT: v_sub_i32_e32 v6, vcc, v11, v6 +; GFX6-NEXT: v_sub_i32_e32 v6, vcc, v9, v6 ; GFX6-NEXT: v_ashrrev_i32_e32 v0, 24, v0 -; GFX6-NEXT: v_sub_i32_e32 v5, vcc, v9, v5 +; GFX6-NEXT: v_sub_i32_e32 v5, vcc, v8, v5 ; GFX6-NEXT: v_max_i32_e32 v4, v6, v4 ; GFX6-NEXT: v_and_b32_e32 v1, 0xff, v1 ; GFX6-NEXT: v_ashrrev_i32_e32 v2, 24, v2 @@ -643,11 +639,11 @@ ; GFX9-NEXT: v_pk_add_i16 v2, v2, v3 clamp ; GFX9-NEXT: v_pk_add_i16 v0, v0, v1 clamp ; GFX9-NEXT: v_pk_ashrrev_i16 v1, 8, v2 op_sel_hi:[0,1] -; GFX9-NEXT: v_mov_b32_e32 v2, 8 +; GFX9-NEXT: v_mov_b32_e32 v3, 8 ; GFX9-NEXT: v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1] -; GFX9-NEXT: s_movk_i32 s4, 0xff -; GFX9-NEXT: v_lshlrev_b32_sdwa v2, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_and_or_b32 v1, v1, s4, v2 +; GFX9-NEXT: v_mov_b32_e32 v2, 0xff +; GFX9-NEXT: v_lshlrev_b32_sdwa v3, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_and_or_b32 v1, v1, v2, v3 ; GFX9-NEXT: v_and_b32_e32 v2, 0xff, v0 ; GFX9-NEXT: v_mov_b32_e32 v3, 24 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 @@ -682,7 +678,7 @@ ; GFX10-NEXT: v_lshlrev_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_and_b32_e32 v3, 0xff, v0 ; GFX10-NEXT: v_lshlrev_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_and_or_b32 v1, v2, 0xff, v1 +; GFX10-NEXT: v_and_or_b32 v1, 0xff, v2, v1 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 ; GFX10-NEXT: v_or3_b32 v0, v1, v2, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -715,7 +711,7 @@ ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 8, 
v2 ; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 24, v0 -; GFX11-NEXT: v_and_or_b32 v1, v1, 0xff, v2 +; GFX11-NEXT: v_and_or_b32 v1, 0xff, v1, v2 ; GFX11-NEXT: v_or3_b32 v0, v1, v3, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %lhs = bitcast i32 %lhs.arg to <4 x i8> @@ -873,46 +869,46 @@ ; ; GFX9-LABEL: s_saddsat_v4i8: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_lshr_b32 s3, s0, 8 +; GFX9-NEXT: s_lshr_b32 s2, s0, 8 +; GFX9-NEXT: s_lshr_b32 s3, s0, 16 +; GFX9-NEXT: s_lshr_b32 s4, s0, 24 +; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s2 +; GFX9-NEXT: s_pack_ll_b32_b16 s2, s3, s4 ; GFX9-NEXT: s_lshr_b32 s4, s0, 16 -; GFX9-NEXT: s_lshr_b32 s6, s0, 24 -; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s3 -; GFX9-NEXT: s_pack_ll_b32_b16 s3, s4, s6 -; GFX9-NEXT: s_lshr_b32 s6, s0, 16 ; GFX9-NEXT: s_lshl_b32 s0, s0, 0x80008 -; GFX9-NEXT: s_lshl_b32 s6, s6, 8 -; GFX9-NEXT: s_lshr_b32 s7, s1, 8 -; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s6 -; GFX9-NEXT: s_lshr_b32 s6, s3, 16 -; GFX9-NEXT: s_lshr_b32 s8, s1, 16 -; GFX9-NEXT: s_lshr_b32 s9, s1, 24 -; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s7 -; GFX9-NEXT: s_lshl_b32 s3, s3, 0x80008 -; GFX9-NEXT: s_lshl_b32 s6, s6, 8 -; GFX9-NEXT: s_pack_ll_b32_b16 s3, s3, s6 +; GFX9-NEXT: s_lshl_b32 s4, s4, 8 +; GFX9-NEXT: s_lshr_b32 s5, s1, 8 +; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s4 +; GFX9-NEXT: s_lshr_b32 s4, s2, 16 ; GFX9-NEXT: s_lshr_b32 s6, s1, 16 -; GFX9-NEXT: s_pack_ll_b32_b16 s4, s8, s9 +; GFX9-NEXT: s_lshr_b32 s7, s1, 24 +; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s5 +; GFX9-NEXT: s_lshl_b32 s2, s2, 0x80008 +; GFX9-NEXT: s_lshl_b32 s4, s4, 8 +; GFX9-NEXT: s_pack_ll_b32_b16 s2, s2, s4 +; GFX9-NEXT: s_lshr_b32 s4, s1, 16 +; GFX9-NEXT: s_pack_ll_b32_b16 s3, s6, s7 ; GFX9-NEXT: s_lshl_b32 s1, s1, 0x80008 -; GFX9-NEXT: s_lshl_b32 s6, s6, 8 -; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s6 -; GFX9-NEXT: s_lshr_b32 s6, s4, 16 -; GFX9-NEXT: s_lshl_b32 s4, s4, 0x80008 -; GFX9-NEXT: s_lshl_b32 s6, s6, 8 -; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s6 +; GFX9-NEXT: s_lshl_b32 s4, s4, 8 +; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s4 +; GFX9-NEXT: s_lshr_b32 s4, s3, 16 +; GFX9-NEXT: s_lshl_b32 s3, s3, 0x80008 +; GFX9-NEXT: s_lshl_b32 s4, s4, 8 +; GFX9-NEXT: s_pack_ll_b32_b16 s3, s3, s4 ; GFX9-NEXT: v_mov_b32_e32 v0, s1 ; GFX9-NEXT: v_pk_add_i16 v0, s0, v0 clamp -; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: s_mov_b32 s2, 8 -; GFX9-NEXT: v_pk_add_i16 v1, s3, v1 clamp +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_pk_add_i16 v1, s2, v1 clamp ; GFX9-NEXT: v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1] +; GFX9-NEXT: v_mov_b32_e32 v3, 8 ; GFX9-NEXT: v_pk_ashrrev_i16 v1, 8, v1 op_sel_hi:[0,1] -; GFX9-NEXT: s_movk_i32 s0, 0xff -; GFX9-NEXT: v_lshlrev_b32_sdwa v2, s2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: s_mov_b32 s5, 24 -; GFX9-NEXT: v_and_or_b32 v0, v0, s0, v2 +; GFX9-NEXT: v_mov_b32_e32 v2, 0xff +; GFX9-NEXT: v_lshlrev_b32_sdwa v3, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_and_or_b32 v0, v0, v2, v3 ; GFX9-NEXT: v_and_b32_e32 v2, 0xff, v1 +; GFX9-NEXT: v_mov_b32_e32 v3, 24 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX9-NEXT: v_lshlrev_b32_sdwa v1, s5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_lshlrev_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX9-NEXT: v_or3_b32 v0, v0, v2, v1 ; GFX9-NEXT: v_readfirstlane_b32 s0, v0 ; GFX9-NEXT: ; return to shader part epilog @@ -947,14 +943,14 @@ ; GFX10-NEXT: s_pack_ll_b32_b16 
s3, s3, s5 ; GFX10-NEXT: v_pk_add_i16 v0, s0, s1 clamp ; GFX10-NEXT: v_pk_add_i16 v1, s2, s3 clamp -; GFX10-NEXT: s_mov_b32 s0, 8 +; GFX10-NEXT: v_mov_b32_e32 v2, 8 +; GFX10-NEXT: v_mov_b32_e32 v4, 24 ; GFX10-NEXT: v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1] ; GFX10-NEXT: v_pk_ashrrev_i16 v1, 8, v1 op_sel_hi:[0,1] -; GFX10-NEXT: v_lshlrev_b32_sdwa v2, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_and_b32_e32 v3, 0xff, v1 -; GFX10-NEXT: s_mov_b32 s0, 24 -; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_and_or_b32 v0, v0, 0xff, v2 +; GFX10-NEXT: v_lshlrev_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX10-NEXT: v_and_or_b32 v0, 0xff, v0, v2 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 ; GFX10-NEXT: v_or3_b32 v0, v0, v2, v1 ; GFX10-NEXT: v_readfirstlane_b32 s0, v0 @@ -994,7 +990,7 @@ ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v1 ; GFX11-NEXT: v_bfe_u32 v1, v1, 16, 8 -; GFX11-NEXT: v_and_or_b32 v0, v0, 0xff, v2 +; GFX11-NEXT: v_and_or_b32 v0, 0xff, v0, v2 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 24, v1 ; GFX11-NEXT: v_or3_b32 v0, v0, v2, v1 @@ -1273,19 +1269,17 @@ ; GFX6-LABEL: v_saddsat_v2i32: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_brev_b32 s5, 1 ; GFX6-NEXT: v_min_i32_e32 v5, 0, v0 -; GFX6-NEXT: s_brev_b32 s4, -2 ; GFX6-NEXT: v_max_i32_e32 v4, 0, v0 -; GFX6-NEXT: v_sub_i32_e32 v5, vcc, s5, v5 -; GFX6-NEXT: v_sub_i32_e32 v4, vcc, s4, v4 +; GFX6-NEXT: v_sub_i32_e32 v5, vcc, 0x80000000, v5 +; GFX6-NEXT: v_sub_i32_e32 v4, vcc, 0x7fffffff, v4 ; GFX6-NEXT: v_max_i32_e32 v2, v5, v2 ; GFX6-NEXT: v_min_i32_e32 v2, v2, v4 ; GFX6-NEXT: v_min_i32_e32 v4, 0, v1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GFX6-NEXT: v_max_i32_e32 v2, 0, v1 -; GFX6-NEXT: v_sub_i32_e32 v4, vcc, s5, v4 -; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s4, v2 +; GFX6-NEXT: v_sub_i32_e32 v4, vcc, 0x80000000, v4 +; GFX6-NEXT: v_sub_i32_e32 v2, vcc, 0x7fffffff, v2 ; GFX6-NEXT: v_max_i32_e32 v3, v4, v3 ; GFX6-NEXT: v_min_i32_e32 v2, v3, v2 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v2 @@ -1294,19 +1288,17 @@ ; GFX8-LABEL: v_saddsat_v2i32: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_brev_b32 s5, 1 ; GFX8-NEXT: v_min_i32_e32 v5, 0, v0 -; GFX8-NEXT: s_brev_b32 s4, -2 ; GFX8-NEXT: v_max_i32_e32 v4, 0, v0 -; GFX8-NEXT: v_sub_u32_e32 v5, vcc, s5, v5 -; GFX8-NEXT: v_sub_u32_e32 v4, vcc, s4, v4 +; GFX8-NEXT: v_sub_u32_e32 v5, vcc, 0x80000000, v5 +; GFX8-NEXT: v_sub_u32_e32 v4, vcc, 0x7fffffff, v4 ; GFX8-NEXT: v_max_i32_e32 v2, v5, v2 ; GFX8-NEXT: v_min_i32_e32 v2, v2, v4 ; GFX8-NEXT: v_min_i32_e32 v4, 0, v1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; GFX8-NEXT: v_max_i32_e32 v2, 0, v1 -; GFX8-NEXT: v_sub_u32_e32 v4, vcc, s5, v4 -; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s4, v2 +; GFX8-NEXT: v_sub_u32_e32 v4, vcc, 0x80000000, v4 +; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 0x7fffffff, v2 ; GFX8-NEXT: v_max_i32_e32 v3, v4, v3 ; GFX8-NEXT: v_min_i32_e32 v2, v3, v2 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2 @@ -1392,26 +1384,25 @@ ; GFX6-LABEL: v_saddsat_v3i32: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_brev_b32 s5, 1 -; GFX6-NEXT: v_min_i32_e32 v7, 0, v0 -; GFX6-NEXT: s_brev_b32 s4, -2 -; 
GFX6-NEXT: v_max_i32_e32 v6, 0, v0 -; GFX6-NEXT: v_sub_i32_e32 v7, vcc, s5, v7 -; GFX6-NEXT: v_sub_i32_e32 v6, vcc, s4, v6 -; GFX6-NEXT: v_max_i32_e32 v3, v7, v3 -; GFX6-NEXT: v_min_i32_e32 v3, v3, v6 -; GFX6-NEXT: v_min_i32_e32 v6, 0, v1 +; GFX6-NEXT: v_min_i32_e32 v8, 0, v0 +; GFX6-NEXT: v_max_i32_e32 v7, 0, v0 +; GFX6-NEXT: v_sub_i32_e32 v8, vcc, 0x80000000, v8 +; GFX6-NEXT: v_sub_i32_e32 v7, vcc, 0x7fffffff, v7 +; GFX6-NEXT: v_max_i32_e32 v3, v8, v3 +; GFX6-NEXT: v_min_i32_e32 v3, v3, v7 +; GFX6-NEXT: v_bfrev_b32_e32 v6, -2 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v3 ; GFX6-NEXT: v_max_i32_e32 v3, 0, v1 -; GFX6-NEXT: v_sub_i32_e32 v6, vcc, s5, v6 -; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s4, v3 +; GFX6-NEXT: v_sub_i32_e32 v3, vcc, v6, v3 +; GFX6-NEXT: v_min_i32_e32 v6, 0, v1 +; GFX6-NEXT: v_sub_i32_e32 v6, vcc, 0x80000000, v6 ; GFX6-NEXT: v_max_i32_e32 v4, v6, v4 ; GFX6-NEXT: v_min_i32_e32 v3, v4, v3 ; GFX6-NEXT: v_min_i32_e32 v4, 0, v2 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v3 ; GFX6-NEXT: v_max_i32_e32 v3, 0, v2 -; GFX6-NEXT: v_sub_i32_e32 v4, vcc, s5, v4 -; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s4, v3 +; GFX6-NEXT: v_sub_i32_e32 v4, vcc, 0x80000000, v4 +; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 0x7fffffff, v3 ; GFX6-NEXT: v_max_i32_e32 v4, v4, v5 ; GFX6-NEXT: v_min_i32_e32 v3, v4, v3 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 @@ -1420,26 +1411,25 @@ ; GFX8-LABEL: v_saddsat_v3i32: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_brev_b32 s5, 1 -; GFX8-NEXT: v_min_i32_e32 v7, 0, v0 -; GFX8-NEXT: s_brev_b32 s4, -2 -; GFX8-NEXT: v_max_i32_e32 v6, 0, v0 -; GFX8-NEXT: v_sub_u32_e32 v7, vcc, s5, v7 -; GFX8-NEXT: v_sub_u32_e32 v6, vcc, s4, v6 -; GFX8-NEXT: v_max_i32_e32 v3, v7, v3 -; GFX8-NEXT: v_min_i32_e32 v3, v3, v6 -; GFX8-NEXT: v_min_i32_e32 v6, 0, v1 +; GFX8-NEXT: v_min_i32_e32 v8, 0, v0 +; GFX8-NEXT: v_max_i32_e32 v7, 0, v0 +; GFX8-NEXT: v_sub_u32_e32 v8, vcc, 0x80000000, v8 +; GFX8-NEXT: v_sub_u32_e32 v7, vcc, 0x7fffffff, v7 +; GFX8-NEXT: v_max_i32_e32 v3, v8, v3 +; GFX8-NEXT: v_min_i32_e32 v3, v3, v7 +; GFX8-NEXT: v_bfrev_b32_e32 v6, -2 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v3 ; GFX8-NEXT: v_max_i32_e32 v3, 0, v1 -; GFX8-NEXT: v_sub_u32_e32 v6, vcc, s5, v6 -; GFX8-NEXT: v_sub_u32_e32 v3, vcc, s4, v3 +; GFX8-NEXT: v_sub_u32_e32 v3, vcc, v6, v3 +; GFX8-NEXT: v_min_i32_e32 v6, 0, v1 +; GFX8-NEXT: v_sub_u32_e32 v6, vcc, 0x80000000, v6 ; GFX8-NEXT: v_max_i32_e32 v4, v6, v4 ; GFX8-NEXT: v_min_i32_e32 v3, v4, v3 ; GFX8-NEXT: v_min_i32_e32 v4, 0, v2 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v3 ; GFX8-NEXT: v_max_i32_e32 v3, 0, v2 -; GFX8-NEXT: v_sub_u32_e32 v4, vcc, s5, v4 -; GFX8-NEXT: v_sub_u32_e32 v3, vcc, s4, v3 +; GFX8-NEXT: v_sub_u32_e32 v4, vcc, 0x80000000, v4 +; GFX8-NEXT: v_sub_u32_e32 v3, vcc, 0x7fffffff, v3 ; GFX8-NEXT: v_max_i32_e32 v4, v4, v5 ; GFX8-NEXT: v_min_i32_e32 v3, v4, v3 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v3 @@ -1546,26 +1536,26 @@ ; GFX6-LABEL: v_saddsat_v4i32: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_brev_b32 s5, 1 -; GFX6-NEXT: v_min_i32_e32 v9, 0, v0 -; GFX6-NEXT: s_brev_b32 s4, -2 -; GFX6-NEXT: v_max_i32_e32 v8, 0, v0 -; GFX6-NEXT: v_sub_i32_e32 v9, vcc, s5, v9 -; GFX6-NEXT: v_sub_i32_e32 v8, vcc, s4, v8 +; GFX6-NEXT: v_bfrev_b32_e32 v9, 1 +; GFX6-NEXT: v_min_i32_e32 v11, 0, v0 +; GFX6-NEXT: v_max_i32_e32 v10, 0, v0 +; GFX6-NEXT: v_sub_i32_e32 v9, vcc, v9, v11 +; GFX6-NEXT: v_sub_i32_e32 v10, vcc, 0x7fffffff, v10 ; GFX6-NEXT: v_max_i32_e32 v4, v9, v4 -; GFX6-NEXT: v_min_i32_e32 v4, v4, v8 -; 
GFX6-NEXT: v_min_i32_e32 v8, 0, v1 +; GFX6-NEXT: v_min_i32_e32 v4, v4, v10 +; GFX6-NEXT: v_min_i32_e32 v9, 0, v1 +; GFX6-NEXT: v_bfrev_b32_e32 v8, -2 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v4 ; GFX6-NEXT: v_max_i32_e32 v4, 0, v1 -; GFX6-NEXT: v_sub_i32_e32 v8, vcc, s5, v8 -; GFX6-NEXT: v_sub_i32_e32 v4, vcc, s4, v4 -; GFX6-NEXT: v_max_i32_e32 v5, v8, v5 +; GFX6-NEXT: v_sub_i32_e32 v9, vcc, 0x80000000, v9 +; GFX6-NEXT: v_sub_i32_e32 v4, vcc, v8, v4 +; GFX6-NEXT: v_max_i32_e32 v5, v9, v5 ; GFX6-NEXT: v_min_i32_e32 v4, v5, v4 ; GFX6-NEXT: v_min_i32_e32 v5, 0, v2 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v4 ; GFX6-NEXT: v_max_i32_e32 v4, 0, v2 -; GFX6-NEXT: v_sub_i32_e32 v5, vcc, s5, v5 -; GFX6-NEXT: v_sub_i32_e32 v4, vcc, s4, v4 +; GFX6-NEXT: v_sub_i32_e32 v5, vcc, 0x80000000, v5 +; GFX6-NEXT: v_sub_i32_e32 v4, vcc, v8, v4 ; GFX6-NEXT: v_max_i32_e32 v5, v5, v6 ; GFX6-NEXT: v_min_i32_e32 v4, v5, v4 ; GFX6-NEXT: v_min_i32_e32 v5, 0, v3 @@ -1581,26 +1571,26 @@ ; GFX8-LABEL: v_saddsat_v4i32: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_brev_b32 s5, 1 -; GFX8-NEXT: v_min_i32_e32 v9, 0, v0 -; GFX8-NEXT: s_brev_b32 s4, -2 -; GFX8-NEXT: v_max_i32_e32 v8, 0, v0 -; GFX8-NEXT: v_sub_u32_e32 v9, vcc, s5, v9 -; GFX8-NEXT: v_sub_u32_e32 v8, vcc, s4, v8 +; GFX8-NEXT: v_bfrev_b32_e32 v9, 1 +; GFX8-NEXT: v_min_i32_e32 v11, 0, v0 +; GFX8-NEXT: v_max_i32_e32 v10, 0, v0 +; GFX8-NEXT: v_sub_u32_e32 v9, vcc, v9, v11 +; GFX8-NEXT: v_sub_u32_e32 v10, vcc, 0x7fffffff, v10 ; GFX8-NEXT: v_max_i32_e32 v4, v9, v4 -; GFX8-NEXT: v_min_i32_e32 v4, v4, v8 -; GFX8-NEXT: v_min_i32_e32 v8, 0, v1 +; GFX8-NEXT: v_min_i32_e32 v4, v4, v10 +; GFX8-NEXT: v_min_i32_e32 v9, 0, v1 +; GFX8-NEXT: v_bfrev_b32_e32 v8, -2 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v4 ; GFX8-NEXT: v_max_i32_e32 v4, 0, v1 -; GFX8-NEXT: v_sub_u32_e32 v8, vcc, s5, v8 -; GFX8-NEXT: v_sub_u32_e32 v4, vcc, s4, v4 -; GFX8-NEXT: v_max_i32_e32 v5, v8, v5 +; GFX8-NEXT: v_sub_u32_e32 v9, vcc, 0x80000000, v9 +; GFX8-NEXT: v_sub_u32_e32 v4, vcc, v8, v4 +; GFX8-NEXT: v_max_i32_e32 v5, v9, v5 ; GFX8-NEXT: v_min_i32_e32 v4, v5, v4 ; GFX8-NEXT: v_min_i32_e32 v5, 0, v2 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v4 ; GFX8-NEXT: v_max_i32_e32 v4, 0, v2 -; GFX8-NEXT: v_sub_u32_e32 v5, vcc, s5, v5 -; GFX8-NEXT: v_sub_u32_e32 v4, vcc, s4, v4 +; GFX8-NEXT: v_sub_u32_e32 v5, vcc, 0x80000000, v5 +; GFX8-NEXT: v_sub_u32_e32 v4, vcc, v8, v4 ; GFX8-NEXT: v_max_i32_e32 v5, v5, v6 ; GFX8-NEXT: v_min_i32_e32 v4, v5, v4 ; GFX8-NEXT: v_min_i32_e32 v5, 0, v3 @@ -1735,34 +1725,33 @@ ; GFX6-LABEL: v_saddsat_v5i32: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_brev_b32 s5, 1 -; GFX6-NEXT: v_min_i32_e32 v12, 0, v0 -; GFX6-NEXT: s_brev_b32 s4, -2 -; GFX6-NEXT: v_max_i32_e32 v10, 0, v0 -; GFX6-NEXT: v_sub_i32_e32 v12, vcc, s5, v12 -; GFX6-NEXT: v_sub_i32_e32 v10, vcc, s4, v10 -; GFX6-NEXT: v_max_i32_e32 v5, v12, v5 -; GFX6-NEXT: v_min_i32_e32 v5, v5, v10 -; GFX6-NEXT: v_min_i32_e32 v10, 0, v1 +; GFX6-NEXT: v_bfrev_b32_e32 v11, 1 +; GFX6-NEXT: v_min_i32_e32 v13, 0, v0 +; GFX6-NEXT: v_max_i32_e32 v12, 0, v0 +; GFX6-NEXT: v_sub_i32_e32 v13, vcc, v11, v13 +; GFX6-NEXT: v_sub_i32_e32 v12, vcc, 0x7fffffff, v12 +; GFX6-NEXT: v_max_i32_e32 v5, v13, v5 +; GFX6-NEXT: v_min_i32_e32 v5, v5, v12 +; GFX6-NEXT: v_min_i32_e32 v12, 0, v1 +; GFX6-NEXT: v_bfrev_b32_e32 v10, -2 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v5 ; GFX6-NEXT: v_max_i32_e32 v5, 0, v1 -; GFX6-NEXT: v_sub_i32_e32 v10, vcc, s5, v10 -; GFX6-NEXT: v_sub_i32_e32 v5, vcc, s4, v5 
-; GFX6-NEXT: v_max_i32_e32 v6, v10, v6 +; GFX6-NEXT: v_sub_i32_e32 v11, vcc, v11, v12 +; GFX6-NEXT: v_sub_i32_e32 v5, vcc, v10, v5 +; GFX6-NEXT: v_max_i32_e32 v6, v11, v6 ; GFX6-NEXT: v_min_i32_e32 v5, v6, v5 ; GFX6-NEXT: v_min_i32_e32 v6, 0, v2 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v5 ; GFX6-NEXT: v_max_i32_e32 v5, 0, v2 -; GFX6-NEXT: v_sub_i32_e32 v6, vcc, s5, v6 -; GFX6-NEXT: v_sub_i32_e32 v5, vcc, s4, v5 +; GFX6-NEXT: v_sub_i32_e32 v6, vcc, 0x80000000, v6 +; GFX6-NEXT: v_sub_i32_e32 v5, vcc, v10, v5 ; GFX6-NEXT: v_max_i32_e32 v6, v6, v7 ; GFX6-NEXT: v_min_i32_e32 v5, v6, v5 ; GFX6-NEXT: v_min_i32_e32 v6, 0, v3 -; GFX6-NEXT: v_bfrev_b32_e32 v11, -2 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v5 ; GFX6-NEXT: v_max_i32_e32 v5, 0, v3 ; GFX6-NEXT: v_sub_i32_e32 v6, vcc, 0x80000000, v6 -; GFX6-NEXT: v_sub_i32_e32 v5, vcc, v11, v5 +; GFX6-NEXT: v_sub_i32_e32 v5, vcc, v10, v5 ; GFX6-NEXT: v_max_i32_e32 v6, v6, v8 ; GFX6-NEXT: v_min_i32_e32 v5, v6, v5 ; GFX6-NEXT: v_min_i32_e32 v6, 0, v4 @@ -1778,34 +1767,33 @@ ; GFX8-LABEL: v_saddsat_v5i32: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_brev_b32 s5, 1 -; GFX8-NEXT: v_min_i32_e32 v12, 0, v0 -; GFX8-NEXT: s_brev_b32 s4, -2 -; GFX8-NEXT: v_max_i32_e32 v10, 0, v0 -; GFX8-NEXT: v_sub_u32_e32 v12, vcc, s5, v12 -; GFX8-NEXT: v_sub_u32_e32 v10, vcc, s4, v10 -; GFX8-NEXT: v_max_i32_e32 v5, v12, v5 -; GFX8-NEXT: v_min_i32_e32 v5, v5, v10 -; GFX8-NEXT: v_min_i32_e32 v10, 0, v1 +; GFX8-NEXT: v_bfrev_b32_e32 v11, 1 +; GFX8-NEXT: v_min_i32_e32 v13, 0, v0 +; GFX8-NEXT: v_max_i32_e32 v12, 0, v0 +; GFX8-NEXT: v_sub_u32_e32 v13, vcc, v11, v13 +; GFX8-NEXT: v_sub_u32_e32 v12, vcc, 0x7fffffff, v12 +; GFX8-NEXT: v_max_i32_e32 v5, v13, v5 +; GFX8-NEXT: v_min_i32_e32 v5, v5, v12 +; GFX8-NEXT: v_min_i32_e32 v12, 0, v1 +; GFX8-NEXT: v_bfrev_b32_e32 v10, -2 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v5 ; GFX8-NEXT: v_max_i32_e32 v5, 0, v1 -; GFX8-NEXT: v_sub_u32_e32 v10, vcc, s5, v10 -; GFX8-NEXT: v_sub_u32_e32 v5, vcc, s4, v5 -; GFX8-NEXT: v_max_i32_e32 v6, v10, v6 +; GFX8-NEXT: v_sub_u32_e32 v11, vcc, v11, v12 +; GFX8-NEXT: v_sub_u32_e32 v5, vcc, v10, v5 +; GFX8-NEXT: v_max_i32_e32 v6, v11, v6 ; GFX8-NEXT: v_min_i32_e32 v5, v6, v5 ; GFX8-NEXT: v_min_i32_e32 v6, 0, v2 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v5 ; GFX8-NEXT: v_max_i32_e32 v5, 0, v2 -; GFX8-NEXT: v_sub_u32_e32 v6, vcc, s5, v6 -; GFX8-NEXT: v_sub_u32_e32 v5, vcc, s4, v5 +; GFX8-NEXT: v_sub_u32_e32 v6, vcc, 0x80000000, v6 +; GFX8-NEXT: v_sub_u32_e32 v5, vcc, v10, v5 ; GFX8-NEXT: v_max_i32_e32 v6, v6, v7 ; GFX8-NEXT: v_min_i32_e32 v5, v6, v5 ; GFX8-NEXT: v_min_i32_e32 v6, 0, v3 -; GFX8-NEXT: v_bfrev_b32_e32 v11, -2 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v5 ; GFX8-NEXT: v_max_i32_e32 v5, 0, v3 ; GFX8-NEXT: v_sub_u32_e32 v6, vcc, 0x80000000, v6 -; GFX8-NEXT: v_sub_u32_e32 v5, vcc, v11, v5 +; GFX8-NEXT: v_sub_u32_e32 v5, vcc, v10, v5 ; GFX8-NEXT: v_max_i32_e32 v6, v6, v8 ; GFX8-NEXT: v_min_i32_e32 v5, v6, v5 ; GFX8-NEXT: v_min_i32_e32 v6, 0, v4 @@ -1961,246 +1949,242 @@ ; GFX6-LABEL: v_saddsat_v16i32: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_brev_b32 s4, 1 -; GFX6-NEXT: v_min_i32_e32 v31, 0, v0 -; GFX6-NEXT: v_sub_i32_e32 v31, vcc, s4, v31 -; GFX6-NEXT: v_max_i32_e32 v16, v31, v16 -; GFX6-NEXT: s_brev_b32 s5, -2 -; GFX6-NEXT: v_max_i32_e32 v31, 0, v0 -; GFX6-NEXT: v_sub_i32_e32 v31, vcc, s5, v31 -; GFX6-NEXT: v_min_i32_e32 v16, v16, v31 +; GFX6-NEXT: v_bfrev_b32_e32 v31, 1 +; GFX6-NEXT: v_min_i32_e32 v32, 0, v0 +; GFX6-NEXT: 
v_sub_i32_e32 v32, vcc, v31, v32 +; GFX6-NEXT: v_max_i32_e32 v16, v32, v16 +; GFX6-NEXT: v_bfrev_b32_e32 v32, -2 +; GFX6-NEXT: v_max_i32_e32 v33, 0, v0 +; GFX6-NEXT: v_sub_i32_e32 v33, vcc, v32, v33 +; GFX6-NEXT: v_min_i32_e32 v16, v16, v33 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v16 ; GFX6-NEXT: v_min_i32_e32 v16, 0, v1 -; GFX6-NEXT: v_sub_i32_e32 v16, vcc, s4, v16 +; GFX6-NEXT: v_sub_i32_e32 v16, vcc, v31, v16 ; GFX6-NEXT: v_max_i32_e32 v16, v16, v17 ; GFX6-NEXT: v_max_i32_e32 v17, 0, v1 -; GFX6-NEXT: v_sub_i32_e32 v17, vcc, s5, v17 +; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v32, v17 ; GFX6-NEXT: v_min_i32_e32 v16, v16, v17 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v16 ; GFX6-NEXT: v_min_i32_e32 v16, 0, v2 -; GFX6-NEXT: v_sub_i32_e32 v16, vcc, s4, v16 +; GFX6-NEXT: v_sub_i32_e32 v16, vcc, v31, v16 ; GFX6-NEXT: v_max_i32_e32 v17, 0, v2 ; GFX6-NEXT: v_max_i32_e32 v16, v16, v18 -; GFX6-NEXT: v_sub_i32_e32 v17, vcc, s5, v17 +; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v32, v17 ; GFX6-NEXT: v_min_i32_e32 v16, v16, v17 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v16 -; GFX6-NEXT: v_bfrev_b32_e32 v16, 1 -; GFX6-NEXT: v_min_i32_e32 v17, 0, v3 -; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v16, v17 -; GFX6-NEXT: v_max_i32_e32 v17, v17, v19 -; GFX6-NEXT: v_bfrev_b32_e32 v18, -2 -; GFX6-NEXT: v_max_i32_e32 v19, 0, v3 -; GFX6-NEXT: v_sub_i32_e32 v19, vcc, v18, v19 -; GFX6-NEXT: v_min_i32_e32 v17, v17, v19 -; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v17 -; GFX6-NEXT: v_min_i32_e32 v17, 0, v4 -; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v16, v17 -; GFX6-NEXT: v_max_i32_e32 v19, 0, v4 -; GFX6-NEXT: v_max_i32_e32 v17, v17, v20 -; GFX6-NEXT: v_sub_i32_e32 v19, vcc, v18, v19 -; GFX6-NEXT: v_min_i32_e32 v17, v17, v19 -; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v17 -; GFX6-NEXT: v_min_i32_e32 v17, 0, v5 -; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v16, v17 -; GFX6-NEXT: v_max_i32_e32 v19, 0, v5 -; GFX6-NEXT: v_max_i32_e32 v17, v17, v21 -; GFX6-NEXT: v_sub_i32_e32 v19, vcc, v18, v19 -; GFX6-NEXT: v_min_i32_e32 v17, v17, v19 -; GFX6-NEXT: v_add_i32_e32 v5, vcc, v5, v17 -; GFX6-NEXT: v_min_i32_e32 v17, 0, v6 -; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v16, v17 -; GFX6-NEXT: v_max_i32_e32 v19, 0, v6 -; GFX6-NEXT: v_max_i32_e32 v17, v17, v22 -; GFX6-NEXT: v_sub_i32_e32 v19, vcc, v18, v19 -; GFX6-NEXT: v_min_i32_e32 v17, v17, v19 -; GFX6-NEXT: buffer_load_dword v19, off, s[0:3], s32 -; GFX6-NEXT: v_add_i32_e32 v6, vcc, v6, v17 -; GFX6-NEXT: v_min_i32_e32 v17, 0, v7 -; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v16, v17 -; GFX6-NEXT: v_max_i32_e32 v20, 0, v7 -; GFX6-NEXT: v_max_i32_e32 v17, v17, v23 -; GFX6-NEXT: v_sub_i32_e32 v20, vcc, v18, v20 -; GFX6-NEXT: v_min_i32_e32 v17, v17, v20 -; GFX6-NEXT: v_min_i32_e32 v20, 0, v8 -; GFX6-NEXT: v_add_i32_e32 v7, vcc, v7, v17 -; GFX6-NEXT: v_max_i32_e32 v17, 0, v8 -; GFX6-NEXT: v_sub_i32_e32 v20, vcc, v16, v20 -; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v18, v17 -; GFX6-NEXT: v_max_i32_e32 v20, v20, v24 -; GFX6-NEXT: v_min_i32_e32 v17, v20, v17 -; GFX6-NEXT: v_min_i32_e32 v20, 0, v9 -; GFX6-NEXT: v_add_i32_e32 v8, vcc, v8, v17 -; GFX6-NEXT: v_max_i32_e32 v17, 0, v9 -; GFX6-NEXT: v_sub_i32_e32 v20, vcc, v16, v20 -; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v18, v17 -; GFX6-NEXT: v_max_i32_e32 v20, v20, v25 -; GFX6-NEXT: v_min_i32_e32 v17, v20, v17 -; GFX6-NEXT: v_min_i32_e32 v20, 0, v10 -; GFX6-NEXT: v_add_i32_e32 v9, vcc, v9, v17 -; GFX6-NEXT: v_max_i32_e32 v17, 0, v10 -; GFX6-NEXT: v_sub_i32_e32 v20, vcc, v16, v20 -; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v18, v17 -; GFX6-NEXT: v_max_i32_e32 v20, v20, v26 -; GFX6-NEXT: v_min_i32_e32 v17, v20, v17 
-; GFX6-NEXT: v_min_i32_e32 v20, 0, v11 -; GFX6-NEXT: v_add_i32_e32 v10, vcc, v10, v17 -; GFX6-NEXT: v_max_i32_e32 v17, 0, v11 -; GFX6-NEXT: v_sub_i32_e32 v20, vcc, v16, v20 -; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v18, v17 -; GFX6-NEXT: v_max_i32_e32 v20, v20, v27 -; GFX6-NEXT: v_min_i32_e32 v17, v20, v17 -; GFX6-NEXT: v_min_i32_e32 v20, 0, v12 -; GFX6-NEXT: v_add_i32_e32 v11, vcc, v11, v17 -; GFX6-NEXT: v_max_i32_e32 v17, 0, v12 -; GFX6-NEXT: v_sub_i32_e32 v20, vcc, v16, v20 -; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v18, v17 -; GFX6-NEXT: v_max_i32_e32 v20, v20, v28 -; GFX6-NEXT: v_min_i32_e32 v17, v20, v17 -; GFX6-NEXT: v_min_i32_e32 v20, 0, v13 -; GFX6-NEXT: v_add_i32_e32 v12, vcc, v12, v17 -; GFX6-NEXT: v_max_i32_e32 v17, 0, v13 -; GFX6-NEXT: v_sub_i32_e32 v20, vcc, v16, v20 -; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v18, v17 -; GFX6-NEXT: v_max_i32_e32 v20, v20, v29 -; GFX6-NEXT: v_min_i32_e32 v17, v20, v17 -; GFX6-NEXT: v_min_i32_e32 v20, 0, v14 -; GFX6-NEXT: v_add_i32_e32 v13, vcc, v13, v17 -; GFX6-NEXT: v_max_i32_e32 v17, 0, v14 -; GFX6-NEXT: v_sub_i32_e32 v20, vcc, v16, v20 -; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v18, v17 -; GFX6-NEXT: v_max_i32_e32 v20, v20, v30 -; GFX6-NEXT: v_min_i32_e32 v17, v20, v17 -; GFX6-NEXT: v_add_i32_e32 v14, vcc, v14, v17 -; GFX6-NEXT: v_max_i32_e32 v17, 0, v15 -; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v18, v17 -; GFX6-NEXT: v_min_i32_e32 v18, 0, v15 -; GFX6-NEXT: v_sub_i32_e32 v16, vcc, v16, v18 -; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_min_i32_e32 v16, 0, v3 +; GFX6-NEXT: v_sub_i32_e32 v16, vcc, v31, v16 +; GFX6-NEXT: v_max_i32_e32 v17, 0, v3 ; GFX6-NEXT: v_max_i32_e32 v16, v16, v19 +; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v32, v17 +; GFX6-NEXT: v_min_i32_e32 v16, v16, v17 +; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v16 +; GFX6-NEXT: v_min_i32_e32 v16, 0, v4 +; GFX6-NEXT: v_sub_i32_e32 v16, vcc, v31, v16 +; GFX6-NEXT: v_max_i32_e32 v17, 0, v4 +; GFX6-NEXT: v_max_i32_e32 v16, v16, v20 +; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v32, v17 ; GFX6-NEXT: v_min_i32_e32 v16, v16, v17 +; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v16 +; GFX6-NEXT: v_min_i32_e32 v16, 0, v5 +; GFX6-NEXT: v_sub_i32_e32 v16, vcc, v31, v16 +; GFX6-NEXT: v_max_i32_e32 v17, 0, v5 +; GFX6-NEXT: v_max_i32_e32 v16, v16, v21 +; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v32, v17 +; GFX6-NEXT: v_min_i32_e32 v16, v16, v17 +; GFX6-NEXT: v_add_i32_e32 v5, vcc, v5, v16 +; GFX6-NEXT: v_min_i32_e32 v16, 0, v6 +; GFX6-NEXT: v_sub_i32_e32 v16, vcc, v31, v16 +; GFX6-NEXT: v_max_i32_e32 v17, 0, v6 +; GFX6-NEXT: v_max_i32_e32 v16, v16, v22 +; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v32, v17 +; GFX6-NEXT: v_min_i32_e32 v16, v16, v17 +; GFX6-NEXT: buffer_load_dword v17, off, s[0:3], s32 +; GFX6-NEXT: v_add_i32_e32 v6, vcc, v6, v16 +; GFX6-NEXT: v_min_i32_e32 v16, 0, v7 +; GFX6-NEXT: v_sub_i32_e32 v16, vcc, v31, v16 +; GFX6-NEXT: v_max_i32_e32 v18, 0, v7 +; GFX6-NEXT: v_max_i32_e32 v16, v16, v23 +; GFX6-NEXT: v_sub_i32_e32 v18, vcc, v32, v18 +; GFX6-NEXT: v_min_i32_e32 v16, v16, v18 +; GFX6-NEXT: v_min_i32_e32 v18, 0, v8 +; GFX6-NEXT: v_add_i32_e32 v7, vcc, v7, v16 +; GFX6-NEXT: v_max_i32_e32 v16, 0, v8 +; GFX6-NEXT: v_sub_i32_e32 v18, vcc, v31, v18 +; GFX6-NEXT: v_sub_i32_e32 v16, vcc, v32, v16 +; GFX6-NEXT: v_max_i32_e32 v18, v18, v24 +; GFX6-NEXT: v_min_i32_e32 v16, v18, v16 +; GFX6-NEXT: v_min_i32_e32 v18, 0, v9 +; GFX6-NEXT: v_add_i32_e32 v8, vcc, v8, v16 +; GFX6-NEXT: v_max_i32_e32 v16, 0, v9 +; GFX6-NEXT: v_sub_i32_e32 v18, vcc, v31, v18 +; GFX6-NEXT: v_sub_i32_e32 v16, vcc, v32, v16 +; GFX6-NEXT: v_max_i32_e32 v18, v18, v25 +; 
GFX6-NEXT: v_min_i32_e32 v16, v18, v16 +; GFX6-NEXT: v_min_i32_e32 v18, 0, v10 +; GFX6-NEXT: v_add_i32_e32 v9, vcc, v9, v16 +; GFX6-NEXT: v_max_i32_e32 v16, 0, v10 +; GFX6-NEXT: v_sub_i32_e32 v18, vcc, v31, v18 +; GFX6-NEXT: v_sub_i32_e32 v16, vcc, v32, v16 +; GFX6-NEXT: v_max_i32_e32 v18, v18, v26 +; GFX6-NEXT: v_min_i32_e32 v16, v18, v16 +; GFX6-NEXT: v_min_i32_e32 v18, 0, v11 +; GFX6-NEXT: v_add_i32_e32 v10, vcc, v10, v16 +; GFX6-NEXT: v_max_i32_e32 v16, 0, v11 +; GFX6-NEXT: v_sub_i32_e32 v18, vcc, v31, v18 +; GFX6-NEXT: v_sub_i32_e32 v16, vcc, v32, v16 +; GFX6-NEXT: v_max_i32_e32 v18, v18, v27 +; GFX6-NEXT: v_min_i32_e32 v16, v18, v16 +; GFX6-NEXT: v_min_i32_e32 v18, 0, v12 +; GFX6-NEXT: v_add_i32_e32 v11, vcc, v11, v16 +; GFX6-NEXT: v_max_i32_e32 v16, 0, v12 +; GFX6-NEXT: v_sub_i32_e32 v18, vcc, v31, v18 +; GFX6-NEXT: v_sub_i32_e32 v16, vcc, v32, v16 +; GFX6-NEXT: v_max_i32_e32 v18, v18, v28 +; GFX6-NEXT: v_min_i32_e32 v16, v18, v16 +; GFX6-NEXT: v_min_i32_e32 v18, 0, v13 +; GFX6-NEXT: v_add_i32_e32 v12, vcc, v12, v16 +; GFX6-NEXT: v_max_i32_e32 v16, 0, v13 +; GFX6-NEXT: v_sub_i32_e32 v18, vcc, v31, v18 +; GFX6-NEXT: v_sub_i32_e32 v16, vcc, v32, v16 +; GFX6-NEXT: v_max_i32_e32 v18, v18, v29 +; GFX6-NEXT: v_min_i32_e32 v16, v18, v16 +; GFX6-NEXT: v_min_i32_e32 v18, 0, v14 +; GFX6-NEXT: v_add_i32_e32 v13, vcc, v13, v16 +; GFX6-NEXT: v_max_i32_e32 v16, 0, v14 +; GFX6-NEXT: v_sub_i32_e32 v18, vcc, v31, v18 +; GFX6-NEXT: v_sub_i32_e32 v16, vcc, v32, v16 +; GFX6-NEXT: v_max_i32_e32 v18, v18, v30 +; GFX6-NEXT: v_min_i32_e32 v16, v18, v16 +; GFX6-NEXT: v_min_i32_e32 v18, 0, v15 +; GFX6-NEXT: v_add_i32_e32 v14, vcc, v14, v16 +; GFX6-NEXT: v_max_i32_e32 v16, 0, v15 +; GFX6-NEXT: v_sub_i32_e32 v18, vcc, v31, v18 +; GFX6-NEXT: v_sub_i32_e32 v16, vcc, v32, v16 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_max_i32_e32 v17, v18, v17 +; GFX6-NEXT: v_min_i32_e32 v16, v17, v16 ; GFX6-NEXT: v_add_i32_e32 v15, vcc, v15, v16 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_saddsat_v16i32: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_brev_b32 s4, 1 -; GFX8-NEXT: v_min_i32_e32 v31, 0, v0 -; GFX8-NEXT: v_sub_u32_e32 v31, vcc, s4, v31 -; GFX8-NEXT: v_max_i32_e32 v16, v31, v16 -; GFX8-NEXT: s_brev_b32 s5, -2 -; GFX8-NEXT: v_max_i32_e32 v31, 0, v0 -; GFX8-NEXT: v_sub_u32_e32 v31, vcc, s5, v31 -; GFX8-NEXT: v_min_i32_e32 v16, v16, v31 +; GFX8-NEXT: v_bfrev_b32_e32 v31, 1 +; GFX8-NEXT: v_min_i32_e32 v32, 0, v0 +; GFX8-NEXT: v_sub_u32_e32 v32, vcc, v31, v32 +; GFX8-NEXT: v_max_i32_e32 v16, v32, v16 +; GFX8-NEXT: v_bfrev_b32_e32 v32, -2 +; GFX8-NEXT: v_max_i32_e32 v33, 0, v0 +; GFX8-NEXT: v_sub_u32_e32 v33, vcc, v32, v33 +; GFX8-NEXT: v_min_i32_e32 v16, v16, v33 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v16 ; GFX8-NEXT: v_min_i32_e32 v16, 0, v1 -; GFX8-NEXT: v_sub_u32_e32 v16, vcc, s4, v16 +; GFX8-NEXT: v_sub_u32_e32 v16, vcc, v31, v16 ; GFX8-NEXT: v_max_i32_e32 v16, v16, v17 ; GFX8-NEXT: v_max_i32_e32 v17, 0, v1 -; GFX8-NEXT: v_sub_u32_e32 v17, vcc, s5, v17 +; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v32, v17 ; GFX8-NEXT: v_min_i32_e32 v16, v16, v17 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v16 ; GFX8-NEXT: v_min_i32_e32 v16, 0, v2 -; GFX8-NEXT: v_sub_u32_e32 v16, vcc, s4, v16 +; GFX8-NEXT: v_sub_u32_e32 v16, vcc, v31, v16 ; GFX8-NEXT: v_max_i32_e32 v17, 0, v2 ; GFX8-NEXT: v_max_i32_e32 v16, v16, v18 -; GFX8-NEXT: v_sub_u32_e32 v17, vcc, s5, v17 +; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v32, v17 ; GFX8-NEXT: v_min_i32_e32 v16, v16, v17 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v16 -; 
GFX8-NEXT: v_bfrev_b32_e32 v16, 1 -; GFX8-NEXT: v_min_i32_e32 v17, 0, v3 -; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v16, v17 -; GFX8-NEXT: v_max_i32_e32 v17, v17, v19 -; GFX8-NEXT: v_bfrev_b32_e32 v18, -2 -; GFX8-NEXT: v_max_i32_e32 v19, 0, v3 -; GFX8-NEXT: v_sub_u32_e32 v19, vcc, v18, v19 -; GFX8-NEXT: v_min_i32_e32 v17, v17, v19 -; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v17 -; GFX8-NEXT: v_min_i32_e32 v17, 0, v4 -; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v16, v17 -; GFX8-NEXT: v_max_i32_e32 v19, 0, v4 -; GFX8-NEXT: v_max_i32_e32 v17, v17, v20 -; GFX8-NEXT: v_sub_u32_e32 v19, vcc, v18, v19 -; GFX8-NEXT: v_min_i32_e32 v17, v17, v19 -; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v17 -; GFX8-NEXT: v_min_i32_e32 v17, 0, v5 -; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v16, v17 -; GFX8-NEXT: v_max_i32_e32 v19, 0, v5 -; GFX8-NEXT: v_max_i32_e32 v17, v17, v21 -; GFX8-NEXT: v_sub_u32_e32 v19, vcc, v18, v19 -; GFX8-NEXT: v_min_i32_e32 v17, v17, v19 -; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v17 -; GFX8-NEXT: v_min_i32_e32 v17, 0, v6 -; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v16, v17 -; GFX8-NEXT: v_max_i32_e32 v19, 0, v6 -; GFX8-NEXT: v_max_i32_e32 v17, v17, v22 -; GFX8-NEXT: v_sub_u32_e32 v19, vcc, v18, v19 -; GFX8-NEXT: v_min_i32_e32 v17, v17, v19 -; GFX8-NEXT: buffer_load_dword v19, off, s[0:3], s32 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v17 -; GFX8-NEXT: v_min_i32_e32 v17, 0, v7 -; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v16, v17 -; GFX8-NEXT: v_max_i32_e32 v20, 0, v7 -; GFX8-NEXT: v_max_i32_e32 v17, v17, v23 -; GFX8-NEXT: v_sub_u32_e32 v20, vcc, v18, v20 -; GFX8-NEXT: v_min_i32_e32 v17, v17, v20 -; GFX8-NEXT: v_min_i32_e32 v20, 0, v8 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v17 -; GFX8-NEXT: v_max_i32_e32 v17, 0, v8 -; GFX8-NEXT: v_sub_u32_e32 v20, vcc, v16, v20 -; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v18, v17 -; GFX8-NEXT: v_max_i32_e32 v20, v20, v24 -; GFX8-NEXT: v_min_i32_e32 v17, v20, v17 -; GFX8-NEXT: v_min_i32_e32 v20, 0, v9 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v17 -; GFX8-NEXT: v_max_i32_e32 v17, 0, v9 -; GFX8-NEXT: v_sub_u32_e32 v20, vcc, v16, v20 -; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v18, v17 -; GFX8-NEXT: v_max_i32_e32 v20, v20, v25 -; GFX8-NEXT: v_min_i32_e32 v17, v20, v17 -; GFX8-NEXT: v_min_i32_e32 v20, 0, v10 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v17 -; GFX8-NEXT: v_max_i32_e32 v17, 0, v10 -; GFX8-NEXT: v_sub_u32_e32 v20, vcc, v16, v20 -; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v18, v17 -; GFX8-NEXT: v_max_i32_e32 v20, v20, v26 -; GFX8-NEXT: v_min_i32_e32 v17, v20, v17 -; GFX8-NEXT: v_min_i32_e32 v20, 0, v11 -; GFX8-NEXT: v_add_u32_e32 v10, vcc, v10, v17 -; GFX8-NEXT: v_max_i32_e32 v17, 0, v11 -; GFX8-NEXT: v_sub_u32_e32 v20, vcc, v16, v20 -; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v18, v17 -; GFX8-NEXT: v_max_i32_e32 v20, v20, v27 -; GFX8-NEXT: v_min_i32_e32 v17, v20, v17 -; GFX8-NEXT: v_min_i32_e32 v20, 0, v12 -; GFX8-NEXT: v_add_u32_e32 v11, vcc, v11, v17 -; GFX8-NEXT: v_max_i32_e32 v17, 0, v12 -; GFX8-NEXT: v_sub_u32_e32 v20, vcc, v16, v20 -; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v18, v17 -; GFX8-NEXT: v_max_i32_e32 v20, v20, v28 -; GFX8-NEXT: v_min_i32_e32 v17, v20, v17 -; GFX8-NEXT: v_min_i32_e32 v20, 0, v13 -; GFX8-NEXT: v_add_u32_e32 v12, vcc, v12, v17 -; GFX8-NEXT: v_max_i32_e32 v17, 0, v13 -; GFX8-NEXT: v_sub_u32_e32 v20, vcc, v16, v20 -; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v18, v17 -; GFX8-NEXT: v_max_i32_e32 v20, v20, v29 -; GFX8-NEXT: v_min_i32_e32 v17, v20, v17 -; GFX8-NEXT: v_min_i32_e32 v20, 0, v14 -; GFX8-NEXT: v_add_u32_e32 v13, vcc, v13, v17 -; GFX8-NEXT: v_max_i32_e32 v17, 0, v14 -; GFX8-NEXT: 
v_sub_u32_e32 v20, vcc, v16, v20 -; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v18, v17 -; GFX8-NEXT: v_max_i32_e32 v20, v20, v30 -; GFX8-NEXT: v_min_i32_e32 v17, v20, v17 -; GFX8-NEXT: v_add_u32_e32 v14, vcc, v14, v17 -; GFX8-NEXT: v_max_i32_e32 v17, 0, v15 -; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v18, v17 -; GFX8-NEXT: v_min_i32_e32 v18, 0, v15 -; GFX8-NEXT: v_sub_u32_e32 v16, vcc, v16, v18 -; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_min_i32_e32 v16, 0, v3 +; GFX8-NEXT: v_sub_u32_e32 v16, vcc, v31, v16 +; GFX8-NEXT: v_max_i32_e32 v17, 0, v3 ; GFX8-NEXT: v_max_i32_e32 v16, v16, v19 +; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v32, v17 +; GFX8-NEXT: v_min_i32_e32 v16, v16, v17 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v16 +; GFX8-NEXT: v_min_i32_e32 v16, 0, v4 +; GFX8-NEXT: v_sub_u32_e32 v16, vcc, v31, v16 +; GFX8-NEXT: v_max_i32_e32 v17, 0, v4 +; GFX8-NEXT: v_max_i32_e32 v16, v16, v20 +; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v32, v17 +; GFX8-NEXT: v_min_i32_e32 v16, v16, v17 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v16 +; GFX8-NEXT: v_min_i32_e32 v16, 0, v5 +; GFX8-NEXT: v_sub_u32_e32 v16, vcc, v31, v16 +; GFX8-NEXT: v_max_i32_e32 v17, 0, v5 +; GFX8-NEXT: v_max_i32_e32 v16, v16, v21 +; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v32, v17 ; GFX8-NEXT: v_min_i32_e32 v16, v16, v17 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v16 +; GFX8-NEXT: v_min_i32_e32 v16, 0, v6 +; GFX8-NEXT: v_sub_u32_e32 v16, vcc, v31, v16 +; GFX8-NEXT: v_max_i32_e32 v17, 0, v6 +; GFX8-NEXT: v_max_i32_e32 v16, v16, v22 +; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v32, v17 +; GFX8-NEXT: v_min_i32_e32 v16, v16, v17 +; GFX8-NEXT: buffer_load_dword v17, off, s[0:3], s32 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v16 +; GFX8-NEXT: v_min_i32_e32 v16, 0, v7 +; GFX8-NEXT: v_sub_u32_e32 v16, vcc, v31, v16 +; GFX8-NEXT: v_max_i32_e32 v18, 0, v7 +; GFX8-NEXT: v_max_i32_e32 v16, v16, v23 +; GFX8-NEXT: v_sub_u32_e32 v18, vcc, v32, v18 +; GFX8-NEXT: v_min_i32_e32 v16, v16, v18 +; GFX8-NEXT: v_min_i32_e32 v18, 0, v8 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v16 +; GFX8-NEXT: v_max_i32_e32 v16, 0, v8 +; GFX8-NEXT: v_sub_u32_e32 v18, vcc, v31, v18 +; GFX8-NEXT: v_sub_u32_e32 v16, vcc, v32, v16 +; GFX8-NEXT: v_max_i32_e32 v18, v18, v24 +; GFX8-NEXT: v_min_i32_e32 v16, v18, v16 +; GFX8-NEXT: v_min_i32_e32 v18, 0, v9 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v16 +; GFX8-NEXT: v_max_i32_e32 v16, 0, v9 +; GFX8-NEXT: v_sub_u32_e32 v18, vcc, v31, v18 +; GFX8-NEXT: v_sub_u32_e32 v16, vcc, v32, v16 +; GFX8-NEXT: v_max_i32_e32 v18, v18, v25 +; GFX8-NEXT: v_min_i32_e32 v16, v18, v16 +; GFX8-NEXT: v_min_i32_e32 v18, 0, v10 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v16 +; GFX8-NEXT: v_max_i32_e32 v16, 0, v10 +; GFX8-NEXT: v_sub_u32_e32 v18, vcc, v31, v18 +; GFX8-NEXT: v_sub_u32_e32 v16, vcc, v32, v16 +; GFX8-NEXT: v_max_i32_e32 v18, v18, v26 +; GFX8-NEXT: v_min_i32_e32 v16, v18, v16 +; GFX8-NEXT: v_min_i32_e32 v18, 0, v11 +; GFX8-NEXT: v_add_u32_e32 v10, vcc, v10, v16 +; GFX8-NEXT: v_max_i32_e32 v16, 0, v11 +; GFX8-NEXT: v_sub_u32_e32 v18, vcc, v31, v18 +; GFX8-NEXT: v_sub_u32_e32 v16, vcc, v32, v16 +; GFX8-NEXT: v_max_i32_e32 v18, v18, v27 +; GFX8-NEXT: v_min_i32_e32 v16, v18, v16 +; GFX8-NEXT: v_min_i32_e32 v18, 0, v12 +; GFX8-NEXT: v_add_u32_e32 v11, vcc, v11, v16 +; GFX8-NEXT: v_max_i32_e32 v16, 0, v12 +; GFX8-NEXT: v_sub_u32_e32 v18, vcc, v31, v18 +; GFX8-NEXT: v_sub_u32_e32 v16, vcc, v32, v16 +; GFX8-NEXT: v_max_i32_e32 v18, v18, v28 +; GFX8-NEXT: v_min_i32_e32 v16, v18, v16 +; GFX8-NEXT: v_min_i32_e32 v18, 0, v13 +; GFX8-NEXT: v_add_u32_e32 v12, vcc, v12, v16 +; GFX8-NEXT: 
v_max_i32_e32 v16, 0, v13 +; GFX8-NEXT: v_sub_u32_e32 v18, vcc, v31, v18 +; GFX8-NEXT: v_sub_u32_e32 v16, vcc, v32, v16 +; GFX8-NEXT: v_max_i32_e32 v18, v18, v29 +; GFX8-NEXT: v_min_i32_e32 v16, v18, v16 +; GFX8-NEXT: v_min_i32_e32 v18, 0, v14 +; GFX8-NEXT: v_add_u32_e32 v13, vcc, v13, v16 +; GFX8-NEXT: v_max_i32_e32 v16, 0, v14 +; GFX8-NEXT: v_sub_u32_e32 v18, vcc, v31, v18 +; GFX8-NEXT: v_sub_u32_e32 v16, vcc, v32, v16 +; GFX8-NEXT: v_max_i32_e32 v18, v18, v30 +; GFX8-NEXT: v_min_i32_e32 v16, v18, v16 +; GFX8-NEXT: v_min_i32_e32 v18, 0, v15 +; GFX8-NEXT: v_add_u32_e32 v14, vcc, v14, v16 +; GFX8-NEXT: v_max_i32_e32 v16, 0, v15 +; GFX8-NEXT: v_sub_u32_e32 v18, vcc, v31, v18 +; GFX8-NEXT: v_sub_u32_e32 v16, vcc, v32, v16 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_max_i32_e32 v17, v18, v17 +; GFX8-NEXT: v_min_i32_e32 v16, v17, v16 ; GFX8-NEXT: v_add_u32_e32 v15, vcc, v15, v16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -2781,13 +2765,11 @@ ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: s_brev_b32 s5, 1 ; GFX6-NEXT: v_min_i32_e32 v5, 0, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: s_brev_b32 s4, -2 ; GFX6-NEXT: v_max_i32_e32 v4, 0, v0 -; GFX6-NEXT: v_sub_i32_e32 v5, vcc, s5, v5 -; GFX6-NEXT: v_sub_i32_e32 v4, vcc, s4, v4 +; GFX6-NEXT: v_sub_i32_e32 v5, vcc, 0x80000000, v5 +; GFX6-NEXT: v_sub_i32_e32 v4, vcc, 0x7fffffff, v4 ; GFX6-NEXT: v_max_i32_e32 v2, v5, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_min_i32_e32 v2, v2, v4 @@ -2795,8 +2777,8 @@ ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v3 ; GFX6-NEXT: v_max_i32_e32 v3, 0, v1 -; GFX6-NEXT: v_sub_i32_e32 v4, vcc, s5, v4 -; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s4, v3 +; GFX6-NEXT: v_sub_i32_e32 v4, vcc, 0x80000000, v4 +; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 0x7fffffff, v3 ; GFX6-NEXT: v_max_i32_e32 v2, v4, v2 ; GFX6-NEXT: v_min_i32_e32 v2, v2, v3 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v2 @@ -2994,13 +2976,11 @@ ; GFX6-LABEL: saddsat_v2i16_vs: ; GFX6: ; %bb.0: ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: s_brev_b32 s3, 1 ; GFX6-NEXT: v_min_i32_e32 v3, 0, v0 ; GFX6-NEXT: s_lshl_b32 s0, s0, 16 -; GFX6-NEXT: s_brev_b32 s2, -2 ; GFX6-NEXT: v_max_i32_e32 v2, 0, v0 -; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s3, v3 -; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s2, v2 +; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 0x80000000, v3 +; GFX6-NEXT: v_sub_i32_e32 v2, vcc, 0x7fffffff, v2 ; GFX6-NEXT: v_max_i32_e32 v3, s0, v3 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_min_i32_e32 v2, v3, v2 @@ -3008,8 +2988,8 @@ ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GFX6-NEXT: s_lshl_b32 s0, s1, 16 ; GFX6-NEXT: v_max_i32_e32 v2, 0, v1 -; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s3, v3 -; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s2, v2 +; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 0x80000000, v3 +; GFX6-NEXT: v_sub_i32_e32 v2, vcc, 0x7fffffff, v2 ; GFX6-NEXT: v_max_i32_e32 v3, s0, v3 ; GFX6-NEXT: v_min_i32_e32 v2, v3, v2 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v2 @@ -3072,42 +3052,40 @@ ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: s_brev_b32 s5, 1 -; GFX6-NEXT: v_min_i32_e32 v10, 0, v0 +; GFX6-NEXT: v_bfrev_b32_e32 v9, 1 +; GFX6-NEXT: v_min_i32_e32 v11, 0, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: s_brev_b32 s4, -2 -; GFX6-NEXT: v_max_i32_e32 v8, 0, v0 -; GFX6-NEXT: v_sub_i32_e32 v10, vcc, s5, v10 -; GFX6-NEXT: v_sub_i32_e32 v8, vcc, s4, v8 -; GFX6-NEXT: 
v_max_i32_e32 v4, v10, v4 +; GFX6-NEXT: v_max_i32_e32 v10, 0, v0 +; GFX6-NEXT: v_sub_i32_e32 v11, vcc, v9, v11 +; GFX6-NEXT: v_sub_i32_e32 v10, vcc, 0x7fffffff, v10 +; GFX6-NEXT: v_max_i32_e32 v4, v11, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_min_i32_e32 v4, v4, v8 -; GFX6-NEXT: v_min_i32_e32 v8, 0, v1 +; GFX6-NEXT: v_min_i32_e32 v4, v4, v10 +; GFX6-NEXT: v_min_i32_e32 v10, 0, v1 +; GFX6-NEXT: v_bfrev_b32_e32 v8, -2 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v5 ; GFX6-NEXT: v_max_i32_e32 v5, 0, v1 -; GFX6-NEXT: v_sub_i32_e32 v8, vcc, s5, v8 -; GFX6-NEXT: v_sub_i32_e32 v5, vcc, s4, v5 -; GFX6-NEXT: v_max_i32_e32 v4, v8, v4 +; GFX6-NEXT: v_sub_i32_e32 v10, vcc, v9, v10 +; GFX6-NEXT: v_sub_i32_e32 v5, vcc, v8, v5 +; GFX6-NEXT: v_max_i32_e32 v4, v10, v4 ; GFX6-NEXT: v_min_i32_e32 v4, v4, v5 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v6 ; GFX6-NEXT: v_min_i32_e32 v6, 0, v2 -; GFX6-NEXT: v_bfrev_b32_e32 v9, -2 ; GFX6-NEXT: v_max_i32_e32 v5, 0, v2 -; GFX6-NEXT: v_sub_i32_e32 v6, vcc, s5, v6 -; GFX6-NEXT: v_sub_i32_e32 v5, vcc, v9, v5 +; GFX6-NEXT: v_sub_i32_e32 v6, vcc, v9, v6 +; GFX6-NEXT: v_sub_i32_e32 v5, vcc, v8, v5 ; GFX6-NEXT: v_max_i32_e32 v4, v6, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_bfrev_b32_e32 v11, 1 ; GFX6-NEXT: v_min_i32_e32 v4, v4, v5 ; GFX6-NEXT: v_min_i32_e32 v6, 0, v3 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v7 ; GFX6-NEXT: v_max_i32_e32 v5, 0, v3 -; GFX6-NEXT: v_sub_i32_e32 v6, vcc, v11, v6 -; GFX6-NEXT: v_sub_i32_e32 v5, vcc, v9, v5 +; GFX6-NEXT: v_sub_i32_e32 v6, vcc, v9, v6 +; GFX6-NEXT: v_sub_i32_e32 v5, vcc, v8, v5 ; GFX6-NEXT: v_max_i32_e32 v4, v6, v4 ; GFX6-NEXT: v_ashrrev_i32_e32 v1, 16, v1 ; GFX6-NEXT: v_min_i32_e32 v4, v4, v5 @@ -3337,42 +3315,40 @@ ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: s_brev_b32 s5, 1 -; GFX6-NEXT: v_min_i32_e32 v14, 0, v0 +; GFX6-NEXT: v_bfrev_b32_e32 v13, 1 +; GFX6-NEXT: v_min_i32_e32 v15, 0, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX6-NEXT: s_brev_b32 s4, -2 -; GFX6-NEXT: v_max_i32_e32 v12, 0, v0 -; GFX6-NEXT: v_sub_i32_e32 v14, vcc, s5, v14 -; GFX6-NEXT: v_sub_i32_e32 v12, vcc, s4, v12 -; GFX6-NEXT: v_max_i32_e32 v6, v14, v6 +; GFX6-NEXT: v_bfrev_b32_e32 v12, -2 +; GFX6-NEXT: v_max_i32_e32 v14, 0, v0 +; GFX6-NEXT: v_sub_i32_e32 v15, vcc, v13, v15 +; GFX6-NEXT: v_sub_i32_e32 v14, vcc, v12, v14 +; GFX6-NEXT: v_max_i32_e32 v6, v15, v6 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_min_i32_e32 v6, v6, v12 -; GFX6-NEXT: v_min_i32_e32 v12, 0, v1 +; GFX6-NEXT: v_min_i32_e32 v6, v6, v14 +; GFX6-NEXT: v_min_i32_e32 v14, 0, v1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v6 ; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v7 ; GFX6-NEXT: v_max_i32_e32 v7, 0, v1 -; GFX6-NEXT: v_sub_i32_e32 v12, vcc, s5, v12 -; GFX6-NEXT: v_sub_i32_e32 v7, vcc, s4, v7 -; GFX6-NEXT: v_max_i32_e32 v6, v12, v6 +; GFX6-NEXT: v_sub_i32_e32 v14, vcc, v13, v14 +; GFX6-NEXT: v_sub_i32_e32 v7, vcc, v12, v7 +; GFX6-NEXT: v_max_i32_e32 v6, v14, v6 ; GFX6-NEXT: v_min_i32_e32 v6, v6, v7 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v6 ; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v8 ; GFX6-NEXT: v_min_i32_e32 v8, 0, v2 -; GFX6-NEXT: v_bfrev_b32_e32 v13, -2 ; GFX6-NEXT: v_max_i32_e32 v7, 0, v2 -; GFX6-NEXT: v_sub_i32_e32 v8, vcc, s5, v8 -; GFX6-NEXT: v_sub_i32_e32 v7, 
vcc, v13, v7 +; GFX6-NEXT: v_sub_i32_e32 v8, vcc, v13, v8 +; GFX6-NEXT: v_sub_i32_e32 v7, vcc, v12, v7 ; GFX6-NEXT: v_max_i32_e32 v6, v8, v6 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_bfrev_b32_e32 v15, 1 ; GFX6-NEXT: v_min_i32_e32 v6, v6, v7 ; GFX6-NEXT: v_min_i32_e32 v8, 0, v3 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v6 ; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v9 ; GFX6-NEXT: v_max_i32_e32 v7, 0, v3 -; GFX6-NEXT: v_sub_i32_e32 v8, vcc, v15, v8 -; GFX6-NEXT: v_sub_i32_e32 v7, vcc, v13, v7 +; GFX6-NEXT: v_sub_i32_e32 v8, vcc, v13, v8 +; GFX6-NEXT: v_sub_i32_e32 v7, vcc, v12, v7 ; GFX6-NEXT: v_max_i32_e32 v6, v8, v6 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX6-NEXT: v_min_i32_e32 v6, v6, v7 @@ -3380,8 +3356,8 @@ ; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v6 ; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v10 ; GFX6-NEXT: v_max_i32_e32 v7, 0, v4 -; GFX6-NEXT: v_sub_i32_e32 v8, vcc, v15, v8 -; GFX6-NEXT: v_sub_i32_e32 v7, vcc, v13, v7 +; GFX6-NEXT: v_sub_i32_e32 v8, vcc, v13, v8 +; GFX6-NEXT: v_sub_i32_e32 v7, vcc, v12, v7 ; GFX6-NEXT: v_max_i32_e32 v6, v8, v6 ; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; GFX6-NEXT: v_min_i32_e32 v6, v6, v7 @@ -3389,9 +3365,9 @@ ; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v6 ; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v11 ; GFX6-NEXT: v_max_i32_e32 v7, 0, v5 -; GFX6-NEXT: v_sub_i32_e32 v8, vcc, v15, v8 +; GFX6-NEXT: v_sub_i32_e32 v8, vcc, v13, v8 ; GFX6-NEXT: v_ashrrev_i32_e32 v1, 16, v1 -; GFX6-NEXT: v_sub_i32_e32 v7, vcc, v13, v7 +; GFX6-NEXT: v_sub_i32_e32 v7, vcc, v12, v7 ; GFX6-NEXT: v_max_i32_e32 v6, v8, v6 ; GFX6-NEXT: v_ashrrev_i32_e32 v0, 16, v0 ; GFX6-NEXT: v_min_i32_e32 v6, v6, v7 @@ -3692,42 +3668,40 @@ ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: s_brev_b32 s5, 1 -; GFX6-NEXT: v_min_i32_e32 v18, 0, v0 +; GFX6-NEXT: v_bfrev_b32_e32 v17, 1 +; GFX6-NEXT: v_min_i32_e32 v19, 0, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GFX6-NEXT: s_brev_b32 s4, -2 -; GFX6-NEXT: v_max_i32_e32 v16, 0, v0 -; GFX6-NEXT: v_sub_i32_e32 v18, vcc, s5, v18 -; GFX6-NEXT: v_sub_i32_e32 v16, vcc, s4, v16 -; GFX6-NEXT: v_max_i32_e32 v8, v18, v8 +; GFX6-NEXT: v_bfrev_b32_e32 v16, -2 +; GFX6-NEXT: v_max_i32_e32 v18, 0, v0 +; GFX6-NEXT: v_sub_i32_e32 v19, vcc, v17, v19 +; GFX6-NEXT: v_sub_i32_e32 v18, vcc, v16, v18 +; GFX6-NEXT: v_max_i32_e32 v8, v19, v8 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_min_i32_e32 v8, v8, v16 -; GFX6-NEXT: v_min_i32_e32 v16, 0, v1 +; GFX6-NEXT: v_min_i32_e32 v8, v8, v18 +; GFX6-NEXT: v_min_i32_e32 v18, 0, v1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v8 ; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v9 ; GFX6-NEXT: v_max_i32_e32 v9, 0, v1 -; GFX6-NEXT: v_sub_i32_e32 v16, vcc, s5, v16 -; GFX6-NEXT: v_sub_i32_e32 v9, vcc, s4, v9 -; GFX6-NEXT: v_max_i32_e32 v8, v16, v8 +; GFX6-NEXT: v_sub_i32_e32 v18, vcc, v17, v18 +; GFX6-NEXT: v_sub_i32_e32 v9, vcc, v16, v9 +; GFX6-NEXT: v_max_i32_e32 v8, v18, v8 ; GFX6-NEXT: v_min_i32_e32 v8, v8, v9 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v8 ; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v10 ; GFX6-NEXT: v_min_i32_e32 v10, 0, v2 -; GFX6-NEXT: v_bfrev_b32_e32 v17, -2 ; GFX6-NEXT: v_max_i32_e32 v9, 0, v2 -; GFX6-NEXT: v_sub_i32_e32 v10, vcc, s5, v10 -; GFX6-NEXT: v_sub_i32_e32 v9, vcc, v17, v9 +; GFX6-NEXT: v_sub_i32_e32 v10, vcc, v17, v10 +; GFX6-NEXT: v_sub_i32_e32 v9, vcc, v16, v9 ; GFX6-NEXT: v_max_i32_e32 v8, v10, v8 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_bfrev_b32_e32 v19, 1 ; 
GFX6-NEXT: v_min_i32_e32 v8, v8, v9 ; GFX6-NEXT: v_min_i32_e32 v10, 0, v3 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v8 ; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v11 ; GFX6-NEXT: v_max_i32_e32 v9, 0, v3 -; GFX6-NEXT: v_sub_i32_e32 v10, vcc, v19, v10 -; GFX6-NEXT: v_sub_i32_e32 v9, vcc, v17, v9 +; GFX6-NEXT: v_sub_i32_e32 v10, vcc, v17, v10 +; GFX6-NEXT: v_sub_i32_e32 v9, vcc, v16, v9 ; GFX6-NEXT: v_max_i32_e32 v8, v10, v8 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX6-NEXT: v_min_i32_e32 v8, v8, v9 @@ -3735,8 +3709,8 @@ ; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v8 ; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v12 ; GFX6-NEXT: v_max_i32_e32 v9, 0, v4 -; GFX6-NEXT: v_sub_i32_e32 v10, vcc, v19, v10 -; GFX6-NEXT: v_sub_i32_e32 v9, vcc, v17, v9 +; GFX6-NEXT: v_sub_i32_e32 v10, vcc, v17, v10 +; GFX6-NEXT: v_sub_i32_e32 v9, vcc, v16, v9 ; GFX6-NEXT: v_max_i32_e32 v8, v10, v8 ; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; GFX6-NEXT: v_min_i32_e32 v8, v8, v9 @@ -3744,8 +3718,8 @@ ; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v8 ; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v13 ; GFX6-NEXT: v_max_i32_e32 v9, 0, v5 -; GFX6-NEXT: v_sub_i32_e32 v10, vcc, v19, v10 -; GFX6-NEXT: v_sub_i32_e32 v9, vcc, v17, v9 +; GFX6-NEXT: v_sub_i32_e32 v10, vcc, v17, v10 +; GFX6-NEXT: v_sub_i32_e32 v9, vcc, v16, v9 ; GFX6-NEXT: v_max_i32_e32 v8, v10, v8 ; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; GFX6-NEXT: v_min_i32_e32 v8, v8, v9 @@ -3753,8 +3727,8 @@ ; GFX6-NEXT: v_add_i32_e32 v5, vcc, v5, v8 ; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v14 ; GFX6-NEXT: v_max_i32_e32 v9, 0, v6 -; GFX6-NEXT: v_sub_i32_e32 v10, vcc, v19, v10 -; GFX6-NEXT: v_sub_i32_e32 v9, vcc, v17, v9 +; GFX6-NEXT: v_sub_i32_e32 v10, vcc, v17, v10 +; GFX6-NEXT: v_sub_i32_e32 v9, vcc, v16, v9 ; GFX6-NEXT: v_max_i32_e32 v8, v10, v8 ; GFX6-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; GFX6-NEXT: v_min_i32_e32 v8, v8, v9 @@ -3763,9 +3737,9 @@ ; GFX6-NEXT: v_add_i32_e32 v6, vcc, v6, v8 ; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v15 ; GFX6-NEXT: v_max_i32_e32 v9, 0, v7 -; GFX6-NEXT: v_sub_i32_e32 v10, vcc, v19, v10 +; GFX6-NEXT: v_sub_i32_e32 v10, vcc, v17, v10 ; GFX6-NEXT: v_ashrrev_i32_e32 v0, 16, v0 -; GFX6-NEXT: v_sub_i32_e32 v9, vcc, v17, v9 +; GFX6-NEXT: v_sub_i32_e32 v9, vcc, v16, v9 ; GFX6-NEXT: v_max_i32_e32 v8, v10, v8 ; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX6-NEXT: v_ashrrev_i32_e32 v2, 16, v2 Index: llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i32.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i32.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i32.ll @@ -278,30 +278,30 @@ ; CHECK-LABEL: v_sdiv_i32_pow2k_denom: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_movk_i32 s6, 0x1000 -; CHECK-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; CHECK-NEXT: v_rcp_iflag_f32_e32 v2, 0x45800000 -; CHECK-NEXT: v_mov_b32_e32 v3, 0xfffff000 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; CHECK-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 -; CHECK-NEXT: v_xor_b32_e32 v0, v0, v1 -; CHECK-NEXT: v_cvt_u32_f32_e32 v2, v2 -; CHECK-NEXT: v_mul_lo_u32 v3, v2, v3 -; CHECK-NEXT: v_mul_hi_u32 v3, v2, v3 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3 -; CHECK-NEXT: v_mul_hi_u32 v2, v0, v2 -; CHECK-NEXT: v_lshlrev_b32_e32 v3, 12, v2 -; CHECK-NEXT: v_add_i32_e32 v4, vcc, 1, v2 -; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v3 -; CHECK-NEXT: v_cmp_le_u32_e64 s[4:5], s6, v0 -; CHECK-NEXT: v_cndmask_b32_e64 v2, v2, v4, s[4:5] -; CHECK-NEXT: v_subrev_i32_e32 v3, vcc, s6, v0 -; CHECK-NEXT: v_cndmask_b32_e64 v0, v0, v3, s[4:5] -; CHECK-NEXT: 
v_add_i32_e32 v3, vcc, 1, v2 -; CHECK-NEXT: v_cmp_le_u32_e32 vcc, s6, v0 -; CHECK-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; CHECK-NEXT: v_xor_b32_e32 v0, v0, v1 -; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v1 +; CHECK-NEXT: v_mov_b32_e32 v1, 0x1000 +; CHECK-NEXT: v_ashrrev_i32_e32 v2, 31, v0 +; CHECK-NEXT: v_rcp_iflag_f32_e32 v3, 0x45800000 +; CHECK-NEXT: v_mov_b32_e32 v4, 0xfffff000 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; CHECK-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 +; CHECK-NEXT: v_xor_b32_e32 v0, v0, v2 +; CHECK-NEXT: v_cvt_u32_f32_e32 v3, v3 +; CHECK-NEXT: v_mul_lo_u32 v4, v3, v4 +; CHECK-NEXT: v_mul_hi_u32 v4, v3, v4 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v4 +; CHECK-NEXT: v_mul_hi_u32 v3, v0, v3 +; CHECK-NEXT: v_lshlrev_b32_e32 v4, 12, v3 +; CHECK-NEXT: v_add_i32_e32 v5, vcc, 1, v3 +; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v4 +; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v1 +; CHECK-NEXT: v_cndmask_b32_e64 v3, v3, v5, s[4:5] +; CHECK-NEXT: v_subrev_i32_e32 v4, vcc, 0x1000, v0 +; CHECK-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[4:5] +; CHECK-NEXT: v_add_i32_e32 v4, vcc, 1, v3 +; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v0, v1 +; CHECK-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc +; CHECK-NEXT: v_xor_b32_e32 v0, v0, v2 +; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 ; CHECK-NEXT: s_setpc_b64 s[30:31] %result = sdiv i32 %num, 4096 ret i32 %result @@ -321,17 +321,12 @@ ; GISEL-NEXT: v_xor_b32_e32 v0, v0, v2 ; GISEL-NEXT: v_rcp_iflag_f32_e32 v4, v4 ; GISEL-NEXT: v_xor_b32_e32 v1, v1, v3 -; GISEL-NEXT: v_mul_f32_e32 v5, 0x4f7ffffe, v4 ; GISEL-NEXT: v_mul_f32_e32 v4, 0x4f7ffffe, v4 -; GISEL-NEXT: v_cvt_u32_f32_e32 v5, v5 ; GISEL-NEXT: v_cvt_u32_f32_e32 v4, v4 -; GISEL-NEXT: v_mul_lo_u32 v6, s4, v5 -; GISEL-NEXT: v_mul_lo_u32 v7, s4, v4 -; GISEL-NEXT: v_mul_hi_u32 v6, v5, v6 -; GISEL-NEXT: v_mul_hi_u32 v7, v4, v7 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v6 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v7 -; GISEL-NEXT: v_mul_hi_u32 v5, v0, v5 +; GISEL-NEXT: v_mul_lo_u32 v5, s4, v4 +; GISEL-NEXT: v_mul_hi_u32 v5, v4, v5 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v5 +; GISEL-NEXT: v_mul_hi_u32 v5, v0, v4 ; GISEL-NEXT: v_mul_hi_u32 v4, v1, v4 ; GISEL-NEXT: v_mul_lo_u32 v6, v5, s8 ; GISEL-NEXT: v_add_i32_e32 v7, vcc, 1, v5 @@ -362,53 +357,45 @@ ; CGP-LABEL: v_sdiv_v2i32_pow2k_denom: ; CGP: ; %bb.0: ; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CGP-NEXT: s_movk_i32 s8, 0x1000 -; CGP-NEXT: v_ashrrev_i32_e32 v2, 31, v0 -; CGP-NEXT: v_rcp_iflag_f32_e32 v3, 0x45800000 -; CGP-NEXT: s_movk_i32 s4, 0xf000 -; CGP-NEXT: v_mov_b32_e32 v4, 0xfffff000 -; CGP-NEXT: v_mov_b32_e32 v5, 0x1000 +; CGP-NEXT: v_mov_b32_e32 v2, 0x1000 +; CGP-NEXT: v_ashrrev_i32_e32 v3, 31, v0 +; CGP-NEXT: v_rcp_iflag_f32_e32 v4, 0x45800000 +; CGP-NEXT: v_mov_b32_e32 v5, 0xfffff000 ; CGP-NEXT: v_ashrrev_i32_e32 v6, 31, v1 -; CGP-NEXT: v_rcp_iflag_f32_e32 v7, 0x45800000 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; CGP-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v3 +; CGP-NEXT: v_mul_f32_e32 v4, 0x4f7ffffe, v4 ; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v6 -; CGP-NEXT: v_mul_f32_e32 v7, 0x4f7ffffe, v7 -; CGP-NEXT: v_xor_b32_e32 v0, v0, v2 -; CGP-NEXT: v_cvt_u32_f32_e32 v3, v3 +; CGP-NEXT: v_xor_b32_e32 v0, v0, v3 +; CGP-NEXT: v_cvt_u32_f32_e32 v4, v4 ; CGP-NEXT: v_xor_b32_e32 v1, v1, v6 -; CGP-NEXT: v_cvt_u32_f32_e32 v7, v7 -; CGP-NEXT: v_mul_lo_u32 v8, v3, s4 -; CGP-NEXT: v_mul_lo_u32 v4, v7, v4 -; CGP-NEXT: v_mul_hi_u32 v8, v3, v8 -; CGP-NEXT: v_mul_hi_u32 v4, v7, v4 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v8 -; 
CGP-NEXT: v_add_i32_e32 v4, vcc, v7, v4 -; CGP-NEXT: v_mul_hi_u32 v3, v0, v3 +; CGP-NEXT: v_mul_lo_u32 v5, v4, v5 +; CGP-NEXT: v_mul_hi_u32 v5, v4, v5 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v5 +; CGP-NEXT: v_mul_hi_u32 v5, v0, v4 ; CGP-NEXT: v_mul_hi_u32 v4, v1, v4 -; CGP-NEXT: v_lshlrev_b32_e32 v7, 12, v3 -; CGP-NEXT: v_add_i32_e32 v8, vcc, 1, v3 +; CGP-NEXT: v_lshlrev_b32_e32 v7, 12, v5 +; CGP-NEXT: v_add_i32_e32 v8, vcc, 1, v5 ; CGP-NEXT: v_lshlrev_b32_e32 v9, 12, v4 ; CGP-NEXT: v_add_i32_e32 v10, vcc, 1, v4 ; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v7 ; CGP-NEXT: v_sub_i32_e32 v1, vcc, v1, v9 -; CGP-NEXT: v_cmp_le_u32_e64 s[4:5], s8, v0 -; CGP-NEXT: v_cndmask_b32_e64 v3, v3, v8, s[4:5] -; CGP-NEXT: v_subrev_i32_e32 v7, vcc, s8, v0 -; CGP-NEXT: v_cmp_ge_u32_e64 s[6:7], v1, v5 +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v2 +; CGP-NEXT: v_cndmask_b32_e64 v5, v5, v8, s[4:5] +; CGP-NEXT: v_sub_i32_e32 v7, vcc, v0, v2 +; CGP-NEXT: v_cmp_ge_u32_e64 s[6:7], v1, v2 ; CGP-NEXT: v_cndmask_b32_e64 v4, v4, v10, s[6:7] ; CGP-NEXT: v_subrev_i32_e32 v8, vcc, 0x1000, v1 ; CGP-NEXT: v_cndmask_b32_e64 v0, v0, v7, s[4:5] -; CGP-NEXT: v_add_i32_e32 v7, vcc, 1, v3 +; CGP-NEXT: v_add_i32_e32 v7, vcc, 1, v5 ; CGP-NEXT: v_cndmask_b32_e64 v1, v1, v8, s[6:7] ; CGP-NEXT: v_add_i32_e32 v8, vcc, 1, v4 -; CGP-NEXT: v_cmp_le_u32_e32 vcc, s8, v0 -; CGP-NEXT: v_cndmask_b32_e32 v0, v3, v7, vcc -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v5 +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2 +; CGP-NEXT: v_cndmask_b32_e32 v0, v5, v7, vcc +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v2 ; CGP-NEXT: v_cndmask_b32_e32 v1, v4, v8, vcc -; CGP-NEXT: v_xor_b32_e32 v0, v0, v2 +; CGP-NEXT: v_xor_b32_e32 v0, v0, v3 ; CGP-NEXT: v_xor_b32_e32 v1, v1, v6 -; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 +; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v3 ; CGP-NEXT: v_sub_i32_e32 v1, vcc, v1, v6 ; CGP-NEXT: s_setpc_b64 s[30:31] %result = sdiv <2 x i32> %num, @@ -419,30 +406,30 @@ ; CHECK-LABEL: v_sdiv_i32_oddk_denom: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_mov_b32 s6, 0x12d8fb -; CHECK-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; CHECK-NEXT: v_rcp_iflag_f32_e32 v2, 0x4996c7d8 -; CHECK-NEXT: v_mov_b32_e32 v3, 0xffed2705 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; CHECK-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 -; CHECK-NEXT: v_xor_b32_e32 v0, v0, v1 -; CHECK-NEXT: v_cvt_u32_f32_e32 v2, v2 -; CHECK-NEXT: v_mul_lo_u32 v3, v2, v3 -; CHECK-NEXT: v_mul_hi_u32 v3, v2, v3 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3 -; CHECK-NEXT: v_mul_hi_u32 v2, v0, v2 -; CHECK-NEXT: v_mul_lo_u32 v3, v2, s6 -; CHECK-NEXT: v_add_i32_e32 v4, vcc, 1, v2 -; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v3 -; CHECK-NEXT: v_cmp_le_u32_e64 s[4:5], s6, v0 -; CHECK-NEXT: v_cndmask_b32_e64 v2, v2, v4, s[4:5] -; CHECK-NEXT: v_subrev_i32_e32 v3, vcc, s6, v0 -; CHECK-NEXT: v_cndmask_b32_e64 v0, v0, v3, s[4:5] -; CHECK-NEXT: v_add_i32_e32 v3, vcc, 1, v2 -; CHECK-NEXT: v_cmp_le_u32_e32 vcc, s6, v0 -; CHECK-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; CHECK-NEXT: v_xor_b32_e32 v0, v0, v1 -; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v1 +; CHECK-NEXT: v_mov_b32_e32 v1, 0x12d8fb +; CHECK-NEXT: v_ashrrev_i32_e32 v2, 31, v0 +; CHECK-NEXT: v_rcp_iflag_f32_e32 v3, 0x4996c7d8 +; CHECK-NEXT: v_mov_b32_e32 v4, 0xffed2705 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; CHECK-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 +; CHECK-NEXT: v_xor_b32_e32 v0, v0, v2 +; CHECK-NEXT: v_cvt_u32_f32_e32 v3, v3 +; CHECK-NEXT: v_mul_lo_u32 v4, v3, v4 +; CHECK-NEXT: v_mul_hi_u32 v4, v3, v4 +; CHECK-NEXT: 
v_add_i32_e32 v3, vcc, v3, v4 +; CHECK-NEXT: v_mul_hi_u32 v3, v0, v3 +; CHECK-NEXT: v_mul_lo_u32 v4, v3, v1 +; CHECK-NEXT: v_add_i32_e32 v5, vcc, 1, v3 +; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v4 +; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v1 +; CHECK-NEXT: v_cndmask_b32_e64 v3, v3, v5, s[4:5] +; CHECK-NEXT: v_subrev_i32_e32 v4, vcc, 0x12d8fb, v0 +; CHECK-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[4:5] +; CHECK-NEXT: v_add_i32_e32 v4, vcc, 1, v3 +; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v0, v1 +; CHECK-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc +; CHECK-NEXT: v_xor_b32_e32 v0, v0, v2 +; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 ; CHECK-NEXT: s_setpc_b64 s[30:31] %result = sdiv i32 %num, 1235195 ret i32 %result @@ -462,17 +449,12 @@ ; GISEL-NEXT: v_xor_b32_e32 v0, v0, v2 ; GISEL-NEXT: v_rcp_iflag_f32_e32 v4, v4 ; GISEL-NEXT: v_xor_b32_e32 v1, v1, v3 -; GISEL-NEXT: v_mul_f32_e32 v5, 0x4f7ffffe, v4 ; GISEL-NEXT: v_mul_f32_e32 v4, 0x4f7ffffe, v4 -; GISEL-NEXT: v_cvt_u32_f32_e32 v5, v5 ; GISEL-NEXT: v_cvt_u32_f32_e32 v4, v4 -; GISEL-NEXT: v_mul_lo_u32 v6, s4, v5 -; GISEL-NEXT: v_mul_lo_u32 v7, s4, v4 -; GISEL-NEXT: v_mul_hi_u32 v6, v5, v6 -; GISEL-NEXT: v_mul_hi_u32 v7, v4, v7 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v6 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v7 -; GISEL-NEXT: v_mul_hi_u32 v5, v0, v5 +; GISEL-NEXT: v_mul_lo_u32 v5, s4, v4 +; GISEL-NEXT: v_mul_hi_u32 v5, v4, v5 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v5 +; GISEL-NEXT: v_mul_hi_u32 v5, v0, v4 ; GISEL-NEXT: v_mul_hi_u32 v4, v1, v4 ; GISEL-NEXT: v_mul_lo_u32 v6, v5, s8 ; GISEL-NEXT: v_add_i32_e32 v7, vcc, 1, v5 @@ -503,53 +485,45 @@ ; CGP-LABEL: v_sdiv_v2i32_oddk_denom: ; CGP: ; %bb.0: ; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CGP-NEXT: s_mov_b32 s8, 0x12d8fb -; CGP-NEXT: v_ashrrev_i32_e32 v2, 31, v0 -; CGP-NEXT: v_rcp_iflag_f32_e32 v3, 0x4996c7d8 -; CGP-NEXT: s_mov_b32 s4, 0xffed2705 -; CGP-NEXT: v_mov_b32_e32 v4, 0xffed2705 -; CGP-NEXT: v_mov_b32_e32 v5, 0x12d8fb +; CGP-NEXT: v_mov_b32_e32 v2, 0x12d8fb +; CGP-NEXT: v_ashrrev_i32_e32 v3, 31, v0 +; CGP-NEXT: v_rcp_iflag_f32_e32 v4, 0x4996c7d8 +; CGP-NEXT: v_mov_b32_e32 v5, 0xffed2705 ; CGP-NEXT: v_ashrrev_i32_e32 v6, 31, v1 -; CGP-NEXT: v_rcp_iflag_f32_e32 v7, 0x4996c7d8 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; CGP-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v3 +; CGP-NEXT: v_mul_f32_e32 v4, 0x4f7ffffe, v4 ; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v6 -; CGP-NEXT: v_mul_f32_e32 v7, 0x4f7ffffe, v7 -; CGP-NEXT: v_xor_b32_e32 v0, v0, v2 -; CGP-NEXT: v_cvt_u32_f32_e32 v3, v3 +; CGP-NEXT: v_xor_b32_e32 v0, v0, v3 +; CGP-NEXT: v_cvt_u32_f32_e32 v4, v4 ; CGP-NEXT: v_xor_b32_e32 v1, v1, v6 -; CGP-NEXT: v_cvt_u32_f32_e32 v7, v7 -; CGP-NEXT: v_mul_lo_u32 v8, v3, s4 -; CGP-NEXT: v_mul_lo_u32 v4, v7, v4 -; CGP-NEXT: v_mul_hi_u32 v8, v3, v8 -; CGP-NEXT: v_mul_hi_u32 v4, v7, v4 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v8 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v7, v4 -; CGP-NEXT: v_mul_hi_u32 v3, v0, v3 +; CGP-NEXT: v_mul_lo_u32 v5, v4, v5 +; CGP-NEXT: v_mul_hi_u32 v5, v4, v5 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v5 +; CGP-NEXT: v_mul_hi_u32 v5, v0, v4 ; CGP-NEXT: v_mul_hi_u32 v4, v1, v4 -; CGP-NEXT: v_mul_lo_u32 v7, v3, s8 -; CGP-NEXT: v_add_i32_e32 v8, vcc, 1, v3 -; CGP-NEXT: v_mul_lo_u32 v9, v4, v5 +; CGP-NEXT: v_mul_lo_u32 v7, v5, v2 +; CGP-NEXT: v_add_i32_e32 v8, vcc, 1, v5 +; CGP-NEXT: v_mul_lo_u32 v9, v4, v2 ; CGP-NEXT: v_add_i32_e32 v10, vcc, 1, v4 ; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v7 ; CGP-NEXT: v_sub_i32_e32 v1, vcc, v1, v9 -; CGP-NEXT: 
v_cmp_le_u32_e64 s[4:5], s8, v0 -; CGP-NEXT: v_cndmask_b32_e64 v3, v3, v8, s[4:5] -; CGP-NEXT: v_subrev_i32_e32 v7, vcc, s8, v0 -; CGP-NEXT: v_cmp_ge_u32_e64 s[6:7], v1, v5 +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v2 +; CGP-NEXT: v_cndmask_b32_e64 v5, v5, v8, s[4:5] +; CGP-NEXT: v_sub_i32_e32 v7, vcc, v0, v2 +; CGP-NEXT: v_cmp_ge_u32_e64 s[6:7], v1, v2 ; CGP-NEXT: v_cndmask_b32_e64 v4, v4, v10, s[6:7] ; CGP-NEXT: v_subrev_i32_e32 v8, vcc, 0x12d8fb, v1 ; CGP-NEXT: v_cndmask_b32_e64 v0, v0, v7, s[4:5] -; CGP-NEXT: v_add_i32_e32 v7, vcc, 1, v3 +; CGP-NEXT: v_add_i32_e32 v7, vcc, 1, v5 ; CGP-NEXT: v_cndmask_b32_e64 v1, v1, v8, s[6:7] ; CGP-NEXT: v_add_i32_e32 v8, vcc, 1, v4 -; CGP-NEXT: v_cmp_le_u32_e32 vcc, s8, v0 -; CGP-NEXT: v_cndmask_b32_e32 v0, v3, v7, vcc -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v5 +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2 +; CGP-NEXT: v_cndmask_b32_e32 v0, v5, v7, vcc +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v2 ; CGP-NEXT: v_cndmask_b32_e32 v1, v4, v8, vcc -; CGP-NEXT: v_xor_b32_e32 v0, v0, v2 +; CGP-NEXT: v_xor_b32_e32 v0, v0, v3 ; CGP-NEXT: v_xor_b32_e32 v1, v1, v6 -; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 +; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v3 ; CGP-NEXT: v_sub_i32_e32 v1, vcc, v1, v6 ; CGP-NEXT: s_setpc_b64 s[30:31] %result = sdiv <2 x i32> %num, Index: llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll @@ -169,17 +169,17 @@ ; CHECK-NEXT: v_mul_hi_u32 v1, v0, v1 ; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; CHECK-NEXT: v_mul_hi_u32 v0, v4, v0 -; CHECK-NEXT: v_mul_lo_u32 v1, v0, v2 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, 1, v0 -; CHECK-NEXT: v_sub_i32_e32 v1, vcc, v4, v1 -; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v1, v2 -; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; CHECK-NEXT: v_sub_i32_e64 v3, s[4:5], v1, v2 -; CHECK-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; CHECK-NEXT: v_add_i32_e32 v3, vcc, 1, v0 -; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v1, v2 -; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; CHECK-NEXT: v_mov_b32_e32 v1, 0 +; CHECK-NEXT: v_mul_lo_u32 v3, v0, v2 +; CHECK-NEXT: v_add_i32_e32 v5, vcc, 1, v0 +; CHECK-NEXT: v_sub_i32_e32 v3, vcc, v4, v3 +; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v3, v2 +; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc +; CHECK-NEXT: v_sub_i32_e64 v4, s[4:5], v3, v2 +; CHECK-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc +; CHECK-NEXT: v_add_i32_e32 v4, vcc, 1, v0 +; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v3, v2 +; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc ; CHECK-NEXT: s_or_b64 exec, exec, s[6:7] ; CHECK-NEXT: s_setpc_b64 s[30:31] %result = sdiv i64 %num, %den @@ -803,17 +803,17 @@ ; CGP-NEXT: v_mul_hi_u32 v1, v0, v1 ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; CGP-NEXT: v_mul_hi_u32 v0, v10, v0 -; CGP-NEXT: v_mul_lo_u32 v1, v0, v4 -; CGP-NEXT: v_add_i32_e32 v2, vcc, 1, v0 -; CGP-NEXT: v_sub_i32_e32 v1, vcc, v10, v1 -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v4 -; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; CGP-NEXT: v_sub_i32_e64 v2, s[4:5], v1, v4 -; CGP-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; CGP-NEXT: v_add_i32_e32 v2, vcc, 1, v0 -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v4 -; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; CGP-NEXT: v_mov_b32_e32 v1, 0 +; CGP-NEXT: v_mul_lo_u32 v2, v0, v4 +; CGP-NEXT: v_add_i32_e32 v3, vcc, 1, v0 +; CGP-NEXT: v_sub_i32_e32 v2, vcc, v10, v2 +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v2, v4 +; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc +; CGP-NEXT: 
v_sub_i32_e64 v3, s[4:5], v2, v4 +; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc +; CGP-NEXT: v_add_i32_e32 v3, vcc, 1, v0 +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v2, v4 +; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; CGP-NEXT: .LBB2_4: ; CGP-NEXT: s_or_b64 exec, exec, s[6:7] ; CGP-NEXT: v_or_b32_e32 v3, v9, v7 @@ -975,17 +975,17 @@ ; CGP-NEXT: v_mul_hi_u32 v3, v2, v3 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; CGP-NEXT: v_mul_hi_u32 v2, v8, v2 -; CGP-NEXT: v_mul_lo_u32 v3, v2, v6 -; CGP-NEXT: v_add_i32_e32 v4, vcc, 1, v2 -; CGP-NEXT: v_sub_i32_e32 v3, vcc, v8, v3 -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v3, v6 -; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; CGP-NEXT: v_sub_i32_e64 v4, s[4:5], v3, v6 -; CGP-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, 1, v2 -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v3, v6 -; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; CGP-NEXT: v_mov_b32_e32 v3, 0 +; CGP-NEXT: v_mul_lo_u32 v4, v2, v6 +; CGP-NEXT: v_add_i32_e32 v5, vcc, 1, v2 +; CGP-NEXT: v_sub_i32_e32 v4, vcc, v8, v4 +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v4, v6 +; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc +; CGP-NEXT: v_sub_i32_e64 v5, s[4:5], v4, v6 +; CGP-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc +; CGP-NEXT: v_add_i32_e32 v5, vcc, 1, v2 +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v4, v6 +; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc ; CGP-NEXT: s_or_b64 exec, exec, s[6:7] ; CGP-NEXT: s_setpc_b64 s[30:31] %result = sdiv <2 x i64> %num, %den @@ -998,7 +998,8 @@ ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: v_cvt_f32_u32_e32 v2, 0x1000 ; CHECK-NEXT: v_cvt_f32_ubyte0_e32 v3, 0 -; CHECK-NEXT: s_movk_i32 s6, 0xf000 +; CHECK-NEXT: v_mov_b32_e32 v6, 0xfffff000 +; CHECK-NEXT: s_bfe_i32 s6, 1, 0x10000 ; CHECK-NEXT: v_mac_f32_e32 v2, 0x4f800000, v3 ; CHECK-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; CHECK-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2 @@ -1006,122 +1007,117 @@ ; CHECK-NEXT: v_trunc_f32_e32 v4, v3 ; CHECK-NEXT: v_mac_f32_e32 v2, 0xcf800000, v4 ; CHECK-NEXT: v_cvt_u32_f32_e32 v5, v2 -; CHECK-NEXT: v_mov_b32_e32 v2, 0xfffff000 -; CHECK-NEXT: v_cvt_u32_f32_e32 v6, v4 -; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v2, v5, 0 -; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], s6, v6, v[3:4] -; CHECK-NEXT: v_mul_hi_u32 v7, v5, v2 +; CHECK-NEXT: v_cvt_u32_f32_e32 v7, v4 +; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v5, 0 +; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v7, v[3:4] +; CHECK-NEXT: v_mul_hi_u32 v8, v5, v2 ; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], -1, v5, v[3:4] -; CHECK-NEXT: v_mul_lo_u32 v4, v6, v2 -; CHECK-NEXT: v_mul_hi_u32 v2, v6, v2 -; CHECK-NEXT: v_mul_lo_u32 v8, v5, v3 -; CHECK-NEXT: v_mul_lo_u32 v9, v6, v3 -; CHECK-NEXT: v_mul_hi_u32 v10, v5, v3 -; CHECK-NEXT: v_mul_hi_u32 v3, v6, v3 +; CHECK-NEXT: v_mul_lo_u32 v4, v7, v2 +; CHECK-NEXT: v_mul_hi_u32 v2, v7, v2 +; CHECK-NEXT: v_mul_lo_u32 v9, v5, v3 +; CHECK-NEXT: v_mul_lo_u32 v10, v7, v3 +; CHECK-NEXT: v_mul_hi_u32 v11, v5, v3 +; CHECK-NEXT: v_mul_hi_u32 v3, v7, v3 +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v9 +; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v10, v2 +; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v8 -; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v7 ; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v8, v4 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v9, v2 -; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v10 +; CHECK-NEXT: 
v_add_i32_e32 v4, vcc, v9, v4 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v11 ; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v7, vcc, v7, v8 +; CHECK-NEXT: v_add_i32_e32 v8, vcc, v10, v8 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v7, v4 +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v8, v4 ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v4 ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v2 -; CHECK-NEXT: v_addc_u32_e32 v6, vcc, v6, v3, vcc -; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], s6, v5, 0 -; CHECK-NEXT: v_ashrrev_i32_e32 v7, 31, v1 -; CHECK-NEXT: v_add_i32_e32 v8, vcc, v0, v7 -; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], s6, v6, v[3:4] -; CHECK-NEXT: v_addc_u32_e32 v9, vcc, v1, v7, vcc -; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[4:5], -1, v5, v[3:4] -; CHECK-NEXT: v_xor_b32_e32 v3, v8, v7 -; CHECK-NEXT: v_mul_lo_u32 v1, v6, v2 -; CHECK-NEXT: v_mul_lo_u32 v8, v5, v0 -; CHECK-NEXT: v_xor_b32_e32 v4, v9, v7 -; CHECK-NEXT: v_mul_hi_u32 v9, v5, v2 -; CHECK-NEXT: v_mul_hi_u32 v2, v6, v2 +; CHECK-NEXT: v_addc_u32_e32 v7, vcc, v7, v3, vcc +; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v5, 0 +; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v7, v[3:4] +; CHECK-NEXT: v_ashrrev_i32_e32 v6, 31, v1 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v6 +; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], -1, v5, v[3:4] +; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v1, v6, vcc +; CHECK-NEXT: v_xor_b32_e32 v4, v0, v6 +; CHECK-NEXT: v_mul_lo_u32 v0, v7, v2 +; CHECK-NEXT: v_mul_lo_u32 v8, v5, v3 +; CHECK-NEXT: v_xor_b32_e32 v9, v1, v6 +; CHECK-NEXT: v_mul_hi_u32 v1, v5, v2 +; CHECK-NEXT: v_mul_hi_u32 v2, v7, v2 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v8 +; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; CHECK-NEXT: v_mul_lo_u32 v1, v7, v3 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v8, v0 +; CHECK-NEXT: v_mul_hi_u32 v8, v5, v3 +; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v2 +; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v8 ; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v9 -; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; CHECK-NEXT: v_mul_lo_u32 v9, v6, v0 -; CHECK-NEXT: v_add_i32_e32 v1, vcc, v8, v1 -; CHECK-NEXT: v_mul_hi_u32 v8, v5, v0 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v9, v2 -; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v8 -; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v8, vcc, v9, v8 -; CHECK-NEXT: v_mul_hi_u32 v0, v6, v0 +; CHECK-NEXT: v_mul_hi_u32 v3, v7, v3 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v1, v0 +; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v1, vcc, v2, v1 +; CHECK-NEXT: v_add_i32_e32 v1, vcc, v3, v1 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v5, v0 +; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v7, v1, vcc +; CHECK-NEXT: v_mul_lo_u32 v2, v9, v0 +; CHECK-NEXT: v_mul_lo_u32 v3, v4, v1 +; CHECK-NEXT: v_mul_hi_u32 v7, v4, v0 +; CHECK-NEXT: v_mul_hi_u32 v0, v9, v0 +; CHECK-NEXT: v_mov_b32_e32 v5, 0x1000 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3 +; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v7 ; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v8, v2 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; CHECK-NEXT: v_add_i32_e32 v1, vcc, v5, v1 -; CHECK-NEXT: v_addc_u32_e32 v0, vcc, v6, v0, vcc -; CHECK-NEXT: 
v_mul_lo_u32 v2, v4, v1 -; CHECK-NEXT: v_mul_lo_u32 v5, v3, v0 -; CHECK-NEXT: v_mul_hi_u32 v8, v3, v1 -; CHECK-NEXT: v_mul_hi_u32 v1, v4, v1 -; CHECK-NEXT: s_movk_i32 s6, 0x1000 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v5 -; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v8 -; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CHECK-NEXT: v_mul_lo_u32 v8, v4, v0 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v5, v2 -; CHECK-NEXT: v_mul_hi_u32 v5, v3, v0 -; CHECK-NEXT: v_add_i32_e32 v1, vcc, v8, v1 -; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v5 -; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v8, v5 -; CHECK-NEXT: v_add_i32_e32 v8, vcc, v1, v2 -; CHECK-NEXT: v_mul_hi_u32 v9, v4, v0 -; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s6, v8, 0 +; CHECK-NEXT: v_mul_lo_u32 v7, v9, v1 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; CHECK-NEXT: v_mul_hi_u32 v3, v4, v1 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v7, v0 +; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v3 +; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v7, v3 +; CHECK-NEXT: v_add_i32_e32 v7, vcc, v0, v2 +; CHECK-NEXT: v_mul_hi_u32 v8, v9, v1 +; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v7, 0 ; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v5, v2 -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v9, v2 -; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[4:5], s6, v5, v[1:2] -; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v3, v0 -; CHECK-NEXT: v_subb_u32_e64 v2, s[4:5], v4, v1, vcc -; CHECK-NEXT: v_sub_i32_e64 v1, s[4:5], v4, v1 -; CHECK-NEXT: v_mov_b32_e32 v6, 0x1000 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v8, v2 +; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v5, v3, v[1:2] +; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v4, v0 +; CHECK-NEXT: v_subb_u32_e64 v2, s[4:5], v9, v1, vcc +; CHECK-NEXT: v_sub_i32_e64 v1, s[4:5], v9, v1 ; CHECK-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc -; CHECK-NEXT: s_bfe_i32 s6, 1, 0x10000 -; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v6 -; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v6 -; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[4:5] -; CHECK-NEXT: v_mov_b32_e32 v4, s6 +; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v5 +; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v5 +; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[4:5] +; CHECK-NEXT: v_mov_b32_e32 v8, s6 ; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v2 ; CHECK-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc -; CHECK-NEXT: v_cndmask_b32_e64 v2, v4, v3, s[4:5] -; CHECK-NEXT: v_add_i32_e32 v3, vcc, 1, v8 -; CHECK-NEXT: v_addc_u32_e32 v4, vcc, 0, v5, vcc -; CHECK-NEXT: s_bfe_i32 s4, 1, 0x10000 -; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v0, v6 +; CHECK-NEXT: v_cndmask_b32_e64 v2, v8, v4, s[4:5] +; CHECK-NEXT: v_add_i32_e32 v4, vcc, 1, v7 +; CHECK-NEXT: v_addc_u32_e32 v9, vcc, 0, v3, vcc +; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v0, v5 ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc -; CHECK-NEXT: v_mov_b32_e32 v6, s4 ; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; CHECK-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc -; CHECK-NEXT: v_add_i32_e32 v1, vcc, 1, v3 -; CHECK-NEXT: v_addc_u32_e32 v6, vcc, 0, v4, vcc +; CHECK-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc +; CHECK-NEXT: v_add_i32_e32 v1, vcc, 1, v4 +; CHECK-NEXT: v_addc_u32_e32 v5, vcc, 0, v9, vcc ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; CHECK-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc -; CHECK-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc +; 
CHECK-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc +; CHECK-NEXT: v_cndmask_b32_e32 v1, v9, v5, vcc ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; CHECK-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc -; CHECK-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc -; CHECK-NEXT: v_xor_b32_e32 v0, v0, v7 -; CHECK-NEXT: v_xor_b32_e32 v1, v1, v7 -; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v7 -; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v7, vcc +; CHECK-NEXT: v_cndmask_b32_e32 v0, v7, v0, vcc +; CHECK-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; CHECK-NEXT: v_xor_b32_e32 v0, v0, v6 +; CHECK-NEXT: v_xor_b32_e32 v1, v1, v6 +; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v6 +; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v6, vcc ; CHECK-NEXT: s_setpc_b64 s[30:31] %result = sdiv i64 %num, 4096 ret i64 %result @@ -1400,253 +1396,236 @@ ; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CGP-NEXT: v_cvt_f32_u32_e32 v4, 0x1000 ; CGP-NEXT: v_cvt_f32_ubyte0_e32 v5, 0 -; CGP-NEXT: s_movk_i32 s6, 0xf000 -; CGP-NEXT: s_movk_i32 s7, 0x1000 +; CGP-NEXT: s_bfe_i32 s6, 1, 0x10000 ; CGP-NEXT: v_mac_f32_e32 v4, 0x4f800000, v5 ; CGP-NEXT: v_rcp_iflag_f32_e32 v4, v4 -; CGP-NEXT: s_bfe_i32 s8, 1, 0x10000 ; CGP-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 ; CGP-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4 -; CGP-NEXT: v_trunc_f32_e32 v6, v5 -; CGP-NEXT: v_mac_f32_e32 v4, 0xcf800000, v6 -; CGP-NEXT: v_cvt_u32_f32_e32 v7, v4 -; CGP-NEXT: v_mov_b32_e32 v4, 0xfffff000 -; CGP-NEXT: v_cvt_u32_f32_e32 v8, v6 -; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v4, v7, 0 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], s6, v8, v[5:6] -; CGP-NEXT: v_mul_hi_u32 v9, v7, v4 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], -1, v7, v[5:6] -; CGP-NEXT: v_mul_lo_u32 v6, v8, v4 -; CGP-NEXT: v_mul_hi_u32 v4, v8, v4 -; CGP-NEXT: v_mul_lo_u32 v10, v7, v5 -; CGP-NEXT: v_mul_lo_u32 v11, v8, v5 -; CGP-NEXT: v_mul_hi_u32 v12, v7, v5 -; CGP-NEXT: v_mul_hi_u32 v5, v8, v5 -; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v10 -; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v9 -; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v6, vcc, v10, v6 +; CGP-NEXT: v_trunc_f32_e32 v7, v5 +; CGP-NEXT: v_mac_f32_e32 v4, 0xcf800000, v7 +; CGP-NEXT: v_cvt_u32_f32_e32 v6, v4 +; CGP-NEXT: v_mov_b32_e32 v5, 0xfffff000 +; CGP-NEXT: v_cvt_u32_f32_e32 v8, v7 +; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v5, v6, 0 +; CGP-NEXT: v_mov_b32_e32 v4, v10 +; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v5, v8, v[4:5] +; CGP-NEXT: v_mul_lo_u32 v4, v8, v9 +; CGP-NEXT: v_mul_hi_u32 v7, v6, v9 +; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], -1, v6, v[10:11] +; CGP-NEXT: v_mul_hi_u32 v9, v8, v9 +; CGP-NEXT: v_mul_lo_u32 v11, v6, v10 +; CGP-NEXT: v_mul_lo_u32 v12, v8, v10 +; CGP-NEXT: v_mul_hi_u32 v13, v6, v10 +; CGP-NEXT: v_mul_hi_u32 v10, v8, v10 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v11 +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v7 +; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v4, vcc, v11, v4 +; CGP-NEXT: v_add_i32_e32 v7, vcc, v12, v9 +; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v13 +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v11 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v7, v4 +; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v7, vcc, v9, v7 +; CGP-NEXT: v_add_i32_e32 v7, vcc, v10, v7 +; CGP-NEXT: v_add_i32_e32 v12, vcc, v6, v4 +; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v5, v12, 0 +; CGP-NEXT: v_addc_u32_e32 v13, vcc, v8, v7, vcc +; 
CGP-NEXT: v_mov_b32_e32 v4, v10 +; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v5, v13, v[4:5] +; CGP-NEXT: v_ashrrev_i32_e32 v7, 31, v1 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v7 +; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], -1, v12, v[10:11] +; CGP-NEXT: v_addc_u32_e32 v1, vcc, v1, v7, vcc +; CGP-NEXT: v_xor_b32_e32 v11, v0, v7 +; CGP-NEXT: v_mul_lo_u32 v0, v13, v9 +; CGP-NEXT: v_mul_lo_u32 v4, v12, v10 +; CGP-NEXT: v_xor_b32_e32 v14, v1, v7 +; CGP-NEXT: v_mul_hi_u32 v1, v12, v9 +; CGP-NEXT: v_mul_hi_u32 v9, v13, v9 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v4 +; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; CGP-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; CGP-NEXT: v_mul_lo_u32 v1, v13, v10 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v4, v0 +; CGP-NEXT: v_mul_hi_u32 v4, v12, v10 +; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v9 ; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v12 -; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v10 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v6 -; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v6, vcc, v9, v6 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v6 -; CGP-NEXT: v_add_i32_e32 v9, vcc, v7, v4 -; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], s6, v9, 0 -; CGP-NEXT: v_addc_u32_e32 v10, vcc, v8, v5, vcc -; CGP-NEXT: v_ashrrev_i32_e32 v5, 31, v1 -; CGP-NEXT: v_mov_b32_e32 v4, v7 -; CGP-NEXT: v_mad_u64_u32 v[7:8], s[4:5], s6, v10, v[4:5] -; CGP-NEXT: v_add_i32_e32 v4, vcc, v0, v5 -; CGP-NEXT: v_addc_u32_e32 v11, vcc, v1, v5, vcc -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], -1, v9, v[7:8] -; CGP-NEXT: v_xor_b32_e32 v8, v4, v5 -; CGP-NEXT: v_mul_lo_u32 v1, v10, v6 -; CGP-NEXT: v_mul_lo_u32 v4, v9, v0 -; CGP-NEXT: v_mul_hi_u32 v7, v9, v6 -; CGP-NEXT: v_mul_hi_u32 v6, v10, v6 -; CGP-NEXT: v_xor_b32_e32 v11, v11, v5 ; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v4 ; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v7 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v9, v4 +; CGP-NEXT: v_mul_hi_u32 v9, v13, v10 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v1, v0 ; CGP-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v7, v10, v0 ; CGP-NEXT: v_add_i32_e32 v1, vcc, v4, v1 -; CGP-NEXT: v_mul_hi_u32 v4, v9, v0 -; CGP-NEXT: v_add_i32_e32 v6, vcc, v7, v6 -; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v6, v4 -; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v6, vcc, v7, v6 -; CGP-NEXT: v_mul_hi_u32 v0, v10, v0 -; CGP-NEXT: v_add_i32_e32 v1, vcc, v4, v1 -; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v6, v4 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v4 -; CGP-NEXT: v_add_i32_e32 v1, vcc, v9, v1 -; CGP-NEXT: v_addc_u32_e32 v0, vcc, v10, v0, vcc -; CGP-NEXT: v_mul_lo_u32 v6, v11, v1 -; CGP-NEXT: v_mul_lo_u32 v7, v8, v0 -; CGP-NEXT: v_mul_hi_u32 v9, v8, v1 -; CGP-NEXT: v_mul_hi_u32 v1, v11, v1 -; CGP-NEXT: v_mul_hi_u32 v10, v11, v0 -; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v7 -; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v9 -; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v9, v11, v0 -; CGP-NEXT: v_add_i32_e32 v6, vcc, v7, v6 -; CGP-NEXT: v_mul_hi_u32 v7, v8, v0 ; CGP-NEXT: v_add_i32_e32 v1, vcc, v9, v1 -; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v7 -; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v7, vcc, v9, v7 -; CGP-NEXT: v_add_i32_e32 v9, vcc, v1, 
v6 -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s7, v9, 0 -; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v6, vcc, v7, v6 -; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v6 -; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], s7, v10, v[1:2] -; CGP-NEXT: v_sub_i32_e32 v0, vcc, v8, v0 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v12, v0 +; CGP-NEXT: v_addc_u32_e32 v1, vcc, v13, v1, vcc +; CGP-NEXT: v_mul_lo_u32 v9, v14, v0 +; CGP-NEXT: v_mul_lo_u32 v10, v11, v1 +; CGP-NEXT: v_mul_hi_u32 v12, v11, v0 +; CGP-NEXT: v_mul_hi_u32 v0, v14, v0 ; CGP-NEXT: v_mov_b32_e32 v4, 0x1000 -; CGP-NEXT: v_subb_u32_e64 v1, s[4:5], v11, v6, vcc -; CGP-NEXT: v_sub_i32_e64 v6, s[4:5], v11, v6 +; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v10 +; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v12 +; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; CGP-NEXT: v_mul_lo_u32 v12, v14, v1 +; CGP-NEXT: v_add_i32_e32 v9, vcc, v10, v9 +; CGP-NEXT: v_mul_hi_u32 v10, v11, v1 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v12, v0 +; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v10 +; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v10, vcc, v12, v10 +; CGP-NEXT: v_add_i32_e32 v12, vcc, v0, v9 +; CGP-NEXT: v_mul_hi_u32 v13, v14, v1 +; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v12, 0 +; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v9, vcc, v10, v9 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v9 +; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v4, v13, v[1:2] +; CGP-NEXT: v_sub_i32_e32 v0, vcc, v11, v0 +; CGP-NEXT: v_subb_u32_e64 v1, s[4:5], v14, v9, vcc +; CGP-NEXT: v_sub_i32_e64 v9, s[4:5], v14, v9 ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v4 -; CGP-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5] +; CGP-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5] ; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v1 -; CGP-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v6, vcc -; CGP-NEXT: v_cvt_f32_u32_e32 v6, 0x1000 -; CGP-NEXT: v_mov_b32_e32 v8, s8 -; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v4 -; CGP-NEXT: v_cndmask_b32_e64 v8, v8, v7, s[4:5] -; CGP-NEXT: v_subbrev_u32_e32 v7, vcc, 0, v1, vcc -; CGP-NEXT: v_cvt_f32_ubyte0_e32 v1, 0 -; CGP-NEXT: v_mac_f32_e32 v6, 0x4f800000, v1 -; CGP-NEXT: v_rcp_iflag_f32_e32 v1, v6 -; CGP-NEXT: v_add_i32_e32 v11, vcc, 1, v9 -; CGP-NEXT: v_addc_u32_e32 v12, vcc, 0, v10, vcc -; CGP-NEXT: v_mul_f32_e32 v1, 0x5f7ffffc, v1 -; CGP-NEXT: v_mul_f32_e32 v6, 0x2f800000, v1 -; CGP-NEXT: v_trunc_f32_e32 v6, v6 -; CGP-NEXT: v_mac_f32_e32 v1, 0xcf800000, v6 -; CGP-NEXT: v_cvt_u32_f32_e32 v13, v1 -; CGP-NEXT: s_bfe_i32 s4, 1, 0x10000 -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v4 -; CGP-NEXT: v_mov_b32_e32 v15, s4 -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s6, v13, 0 -; CGP-NEXT: v_cvt_u32_f32_e32 v16, v6 -; CGP-NEXT: v_cndmask_b32_e64 v14, 0, -1, vcc -; CGP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7 -; CGP-NEXT: v_cndmask_b32_e32 v14, v15, v14, vcc -; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], s6, v16, v[1:2] -; CGP-NEXT: v_add_i32_e32 v1, vcc, 1, v11 -; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], -1, v13, v[6:7] -; CGP-NEXT: v_addc_u32_e32 v15, vcc, 0, v12, vcc -; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 -; CGP-NEXT: v_cndmask_b32_e32 v7, v11, v1, vcc -; CGP-NEXT: v_mul_lo_u32 v1, v16, v0 -; CGP-NEXT: v_mul_lo_u32 v11, v13, v6 -; CGP-NEXT: v_mul_hi_u32 v14, v13, v0 -; CGP-NEXT: v_cndmask_b32_e32 v12, v12, v15, vcc -; CGP-NEXT: v_mul_hi_u32 v0, v16, v0 -; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v11 -; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v14 +; 
CGP-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v9, vcc +; CGP-NEXT: v_mov_b32_e32 v11, s6 +; CGP-NEXT: v_sub_i32_e32 v9, vcc, v0, v4 +; CGP-NEXT: v_cndmask_b32_e64 v14, v11, v10, s[4:5] +; CGP-NEXT: v_subbrev_u32_e32 v10, vcc, 0, v1, vcc +; CGP-NEXT: v_add_i32_e32 v15, vcc, 1, v12 +; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v6, 0 +; CGP-NEXT: v_addc_u32_e32 v16, vcc, 0, v13, vcc +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v9, v4 +; CGP-NEXT: v_cndmask_b32_e64 v9, 0, -1, vcc +; CGP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v10 +; CGP-NEXT: v_cndmask_b32_e32 v17, v11, v9, vcc +; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v5, v8, v[1:2] +; CGP-NEXT: v_add_i32_e32 v1, vcc, 1, v15 +; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], -1, v6, v[9:10] +; CGP-NEXT: v_addc_u32_e32 v18, vcc, 0, v16, vcc +; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v17 +; CGP-NEXT: v_cndmask_b32_e32 v10, v15, v1, vcc +; CGP-NEXT: v_mul_lo_u32 v1, v8, v0 +; CGP-NEXT: v_mul_lo_u32 v15, v6, v9 +; CGP-NEXT: v_mul_hi_u32 v17, v6, v0 +; CGP-NEXT: v_cndmask_b32_e32 v16, v16, v18, vcc +; CGP-NEXT: v_mul_hi_u32 v0, v8, v0 +; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v15 +; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v17 ; CGP-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v14, v16, v6 -; CGP-NEXT: v_add_i32_e32 v1, vcc, v11, v1 -; CGP-NEXT: v_mul_hi_u32 v11, v13, v6 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v14, v0 -; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v11 -; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v11, vcc, v14, v11 -; CGP-NEXT: v_mul_hi_u32 v6, v16, v6 +; CGP-NEXT: v_mul_lo_u32 v17, v8, v9 +; CGP-NEXT: v_add_i32_e32 v1, vcc, v15, v1 +; CGP-NEXT: v_mul_hi_u32 v15, v6, v9 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v17, v0 +; CGP-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v15 +; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v15, vcc, v17, v15 +; CGP-NEXT: v_mul_hi_u32 v9, v8, v9 ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; CGP-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v1, vcc, v11, v1 -; CGP-NEXT: v_add_i32_e32 v1, vcc, v6, v1 -; CGP-NEXT: v_add_i32_e32 v11, vcc, v13, v0 -; CGP-NEXT: v_addc_u32_e32 v13, vcc, v16, v1, vcc -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s6, v11, 0 -; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; CGP-NEXT: v_cndmask_b32_e32 v6, v9, v7, vcc -; CGP-NEXT: v_xor_b32_e32 v9, v6, v5 -; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], s6, v13, v[1:2] -; CGP-NEXT: v_cndmask_b32_e32 v8, v10, v12, vcc -; CGP-NEXT: v_xor_b32_e32 v1, v8, v5 -; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], -1, v11, v[6:7] -; CGP-NEXT: v_ashrrev_i32_e32 v8, 31, v3 -; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v8 -; CGP-NEXT: v_addc_u32_e32 v3, vcc, v3, v8, vcc -; CGP-NEXT: v_xor_b32_e32 v7, v2, v8 -; CGP-NEXT: v_mul_lo_u32 v2, v13, v0 -; CGP-NEXT: v_mul_lo_u32 v10, v11, v6 -; CGP-NEXT: v_xor_b32_e32 v12, v3, v8 -; CGP-NEXT: v_mul_hi_u32 v3, v11, v0 -; CGP-NEXT: v_mul_hi_u32 v0, v13, v0 +; CGP-NEXT: v_add_i32_e32 v1, vcc, v15, v1 +; CGP-NEXT: v_add_i32_e32 v1, vcc, v9, v1 +; CGP-NEXT: v_add_i32_e32 v9, vcc, v6, v0 +; CGP-NEXT: v_addc_u32_e32 v8, vcc, v8, v1, vcc +; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v9, 0 +; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; CGP-NEXT: v_cndmask_b32_e32 v6, v12, v10, vcc +; CGP-NEXT: v_xor_b32_e32 v12, v6, v7 +; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v5, v8, v[1:2] +; CGP-NEXT: v_cndmask_b32_e32 v10, v13, v16, vcc +; CGP-NEXT: v_xor_b32_e32 v1, v10, v7 +; CGP-NEXT: 
v_mad_u64_u32 v[5:6], s[4:5], -1, v9, v[5:6] +; CGP-NEXT: v_ashrrev_i32_e32 v10, 31, v3 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v10 -; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; CGP-NEXT: v_addc_u32_e32 v3, vcc, v3, v10, vcc +; CGP-NEXT: v_xor_b32_e32 v13, v2, v10 +; CGP-NEXT: v_mul_lo_u32 v2, v8, v0 +; CGP-NEXT: v_mul_lo_u32 v6, v9, v5 +; CGP-NEXT: v_xor_b32_e32 v14, v3, v10 +; CGP-NEXT: v_mul_hi_u32 v3, v9, v0 +; CGP-NEXT: v_mul_hi_u32 v0, v8, v0 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v6 +; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; CGP-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v3, v13, v6 -; CGP-NEXT: v_add_i32_e32 v2, vcc, v10, v2 -; CGP-NEXT: v_mul_hi_u32 v10, v11, v6 +; CGP-NEXT: v_mul_lo_u32 v3, v8, v5 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v6, v2 +; CGP-NEXT: v_mul_hi_u32 v6, v9, v5 ; CGP-NEXT: v_add_i32_e32 v0, vcc, v3, v0 ; CGP-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v10 -; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v10 -; CGP-NEXT: v_mul_hi_u32 v6, v13, v6 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v6 +; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v6 +; CGP-NEXT: v_mul_hi_u32 v5, v8, v5 ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; CGP-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; CGP-NEXT: v_add_i32_e32 v2, vcc, v6, v2 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v11, v0 -; CGP-NEXT: v_addc_u32_e32 v2, vcc, v13, v2, vcc -; CGP-NEXT: v_mul_lo_u32 v6, v12, v3 -; CGP-NEXT: v_mul_lo_u32 v10, v7, v2 -; CGP-NEXT: v_sub_i32_e32 v0, vcc, v9, v5 -; CGP-NEXT: v_subb_u32_e32 v1, vcc, v1, v5, vcc -; CGP-NEXT: v_mul_hi_u32 v5, v7, v3 -; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v10 -; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v5 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v5, v2 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v9, v0 +; CGP-NEXT: v_addc_u32_e32 v2, vcc, v8, v2, vcc +; CGP-NEXT: v_mul_lo_u32 v5, v14, v3 +; CGP-NEXT: v_mul_lo_u32 v6, v13, v2 +; CGP-NEXT: v_sub_i32_e32 v0, vcc, v12, v7 +; CGP-NEXT: v_subb_u32_e32 v1, vcc, v1, v7, vcc +; CGP-NEXT: v_mul_hi_u32 v7, v13, v3 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v6 +; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v7 ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v6, v12, v2 -; CGP-NEXT: v_mul_hi_u32 v3, v12, v3 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v9, v5 -; CGP-NEXT: v_mul_hi_u32 v9, v7, v2 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v6, v3 +; CGP-NEXT: v_mul_lo_u32 v7, v14, v2 +; CGP-NEXT: v_mul_hi_u32 v3, v14, v3 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v5 +; CGP-NEXT: v_mul_hi_u32 v6, v13, v2 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v7, v3 +; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v6 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v9 -; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v9 -; CGP-NEXT: v_add_i32_e32 v9, vcc, v3, v5 -; CGP-NEXT: v_mul_hi_u32 v10, v12, v2 -; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], s7, v9, 0 +; CGP-NEXT: v_add_i32_e32 v6, vcc, v7, v6 +; CGP-NEXT: v_add_i32_e32 v7, vcc, v3, v5 +; CGP-NEXT: v_mul_hi_u32 v8, v14, v2 +; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v7, 0 ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v5 -; CGP-NEXT: v_mad_u64_u32 v[5:6], 
s[4:5], s7, v10, v[3:4]
-; CGP-NEXT: v_sub_i32_e32 v2, vcc, v7, v2
-; CGP-NEXT: v_subb_u32_e64 v3, s[4:5], v12, v5, vcc
-; CGP-NEXT: v_sub_i32_e64 v5, s[4:5], v12, v5
+; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v5
+; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v8, v[3:4]
+; CGP-NEXT: v_sub_i32_e32 v2, vcc, v13, v2
+; CGP-NEXT: v_subb_u32_e64 v3, s[4:5], v14, v5, vcc
+; CGP-NEXT: v_sub_i32_e64 v5, s[4:5], v14, v5
; CGP-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc
-; CGP-NEXT: s_bfe_i32 s6, 1, 0x10000
; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v4
; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v4
; CGP-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5]
-; CGP-NEXT: v_mov_b32_e32 v7, s6
; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v3
; CGP-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc
-; CGP-NEXT: v_cndmask_b32_e64 v3, v7, v6, s[4:5]
-; CGP-NEXT: v_add_i32_e32 v6, vcc, 1, v9
-; CGP-NEXT: v_addc_u32_e32 v7, vcc, 0, v10, vcc
-; CGP-NEXT: s_bfe_i32 s4, 1, 0x10000
+; CGP-NEXT: v_cndmask_b32_e64 v3, v11, v6, s[4:5]
+; CGP-NEXT: v_add_i32_e32 v6, vcc, 1, v7
+; CGP-NEXT: v_addc_u32_e32 v9, vcc, 0, v8, vcc
; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v2, v4
; CGP-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc
-; CGP-NEXT: v_mov_b32_e32 v4, s4
; CGP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5
-; CGP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
+; CGP-NEXT: v_cndmask_b32_e32 v2, v11, v2, vcc
; CGP-NEXT: v_add_i32_e32 v4, vcc, 1, v6
-; CGP-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc
+; CGP-NEXT: v_addc_u32_e32 v5, vcc, 0, v9, vcc
; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
; CGP-NEXT: v_cndmask_b32_e32 v2, v6, v4, vcc
-; CGP-NEXT: v_cndmask_b32_e32 v4, v7, v5, vcc
+; CGP-NEXT: v_cndmask_b32_e32 v4, v9, v5, vcc
; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3
-; CGP-NEXT: v_cndmask_b32_e32 v2, v9, v2, vcc
-; CGP-NEXT: v_cndmask_b32_e32 v3, v10, v4, vcc
-; CGP-NEXT: v_xor_b32_e32 v2, v2, v8
-; CGP-NEXT: v_xor_b32_e32 v3, v3, v8
-; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v8
-; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v8, vcc
+; CGP-NEXT: v_cndmask_b32_e32 v2, v7, v2, vcc
+; CGP-NEXT: v_cndmask_b32_e32 v3, v8, v4, vcc
+; CGP-NEXT: v_xor_b32_e32 v2, v2, v10
+; CGP-NEXT: v_xor_b32_e32 v3, v3, v10
+; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v10
+; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v10, vcc
; CGP-NEXT: s_setpc_b64 s[30:31]
%result = sdiv <2 x i64> %num, <i64 4096, i64 4096>
ret <2 x i64> %result
@@ -1658,7 +1637,8 @@
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: v_cvt_f32_u32_e32 v2, 0x12d8fb
; CHECK-NEXT: v_cvt_f32_ubyte0_e32 v3, 0
-; CHECK-NEXT: s_mov_b32 s6, 0xffed2705
+; CHECK-NEXT: v_mov_b32_e32 v6, 0xffed2705
+; CHECK-NEXT: s_bfe_i32 s6, 1, 0x10000
; CHECK-NEXT: v_mac_f32_e32 v2, 0x4f800000, v3
; CHECK-NEXT: v_rcp_iflag_f32_e32 v2, v2
; CHECK-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2
@@ -1666,122 +1646,117 @@
; CHECK-NEXT: v_trunc_f32_e32 v4, v3
; CHECK-NEXT: v_mac_f32_e32 v2, 0xcf800000, v4
; CHECK-NEXT: v_cvt_u32_f32_e32 v5, v2
-; CHECK-NEXT: v_mov_b32_e32 v2, 0xffed2705
-; CHECK-NEXT: v_cvt_u32_f32_e32 v6, v4
-; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v2, v5, 0
-; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], s6, v6, v[3:4]
-; CHECK-NEXT: v_mul_hi_u32 v7, v5, v2
+; CHECK-NEXT: v_cvt_u32_f32_e32 v7, v4
+; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v5, 0
+; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v7, v[3:4]
+; CHECK-NEXT: v_mul_hi_u32 v8, v5, v2
; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], -1, v5, v[3:4]
-; CHECK-NEXT: v_mul_lo_u32 v4, v6, v2
-; CHECK-NEXT: v_mul_hi_u32 v2, v6, v2
-; CHECK-NEXT: v_mul_lo_u32 v8, v5, v3
-; CHECK-NEXT: v_mul_lo_u32 v9, v6, v3
-; CHECK-NEXT: v_mul_hi_u32 
v10, v5, v3 -; CHECK-NEXT: v_mul_hi_u32 v3, v6, v3 +; CHECK-NEXT: v_mul_lo_u32 v4, v7, v2 +; CHECK-NEXT: v_mul_hi_u32 v2, v7, v2 +; CHECK-NEXT: v_mul_lo_u32 v9, v5, v3 +; CHECK-NEXT: v_mul_lo_u32 v10, v7, v3 +; CHECK-NEXT: v_mul_hi_u32 v11, v5, v3 +; CHECK-NEXT: v_mul_hi_u32 v3, v7, v3 +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v9 +; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v10, v2 +; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v8 -; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v7 ; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v8, v4 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v9, v2 -; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v10 +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v9, v4 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v11 ; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v7, vcc, v7, v8 +; CHECK-NEXT: v_add_i32_e32 v8, vcc, v10, v8 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v7, v4 +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v8, v4 ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v4 ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v2 -; CHECK-NEXT: v_addc_u32_e32 v6, vcc, v6, v3, vcc -; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], s6, v5, 0 -; CHECK-NEXT: v_ashrrev_i32_e32 v7, 31, v1 -; CHECK-NEXT: v_add_i32_e32 v8, vcc, v0, v7 -; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], s6, v6, v[3:4] -; CHECK-NEXT: v_addc_u32_e32 v9, vcc, v1, v7, vcc -; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[4:5], -1, v5, v[3:4] -; CHECK-NEXT: v_xor_b32_e32 v3, v8, v7 -; CHECK-NEXT: v_mul_lo_u32 v1, v6, v2 -; CHECK-NEXT: v_mul_lo_u32 v8, v5, v0 -; CHECK-NEXT: v_xor_b32_e32 v4, v9, v7 -; CHECK-NEXT: v_mul_hi_u32 v9, v5, v2 -; CHECK-NEXT: v_mul_hi_u32 v2, v6, v2 +; CHECK-NEXT: v_addc_u32_e32 v7, vcc, v7, v3, vcc +; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v5, 0 +; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v7, v[3:4] +; CHECK-NEXT: v_ashrrev_i32_e32 v6, 31, v1 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v6 +; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], -1, v5, v[3:4] +; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v1, v6, vcc +; CHECK-NEXT: v_xor_b32_e32 v4, v0, v6 +; CHECK-NEXT: v_mul_lo_u32 v0, v7, v2 +; CHECK-NEXT: v_mul_lo_u32 v8, v5, v3 +; CHECK-NEXT: v_xor_b32_e32 v9, v1, v6 +; CHECK-NEXT: v_mul_hi_u32 v1, v5, v2 +; CHECK-NEXT: v_mul_hi_u32 v2, v7, v2 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v8 +; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; CHECK-NEXT: v_mul_lo_u32 v1, v7, v3 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v8, v0 +; CHECK-NEXT: v_mul_hi_u32 v8, v5, v3 +; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v2 +; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v8 ; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v9 -; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; CHECK-NEXT: v_mul_lo_u32 v9, v6, v0 -; CHECK-NEXT: v_add_i32_e32 v1, vcc, v8, v1 -; CHECK-NEXT: v_mul_hi_u32 v8, v5, v0 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v9, v2 -; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v8 -; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v8, vcc, v9, v8 -; CHECK-NEXT: v_mul_hi_u32 v0, v6, v0 +; CHECK-NEXT: v_mul_hi_u32 v3, v7, v3 +; 
CHECK-NEXT: v_add_i32_e32 v0, vcc, v1, v0 +; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v1, vcc, v2, v1 +; CHECK-NEXT: v_add_i32_e32 v1, vcc, v3, v1 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v5, v0 +; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v7, v1, vcc +; CHECK-NEXT: v_mul_lo_u32 v2, v9, v0 +; CHECK-NEXT: v_mul_lo_u32 v3, v4, v1 +; CHECK-NEXT: v_mul_hi_u32 v7, v4, v0 +; CHECK-NEXT: v_mul_hi_u32 v0, v9, v0 +; CHECK-NEXT: v_mov_b32_e32 v5, 0x12d8fb +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3 +; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v7 ; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v8, v2 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; CHECK-NEXT: v_add_i32_e32 v1, vcc, v5, v1 -; CHECK-NEXT: v_addc_u32_e32 v0, vcc, v6, v0, vcc -; CHECK-NEXT: v_mul_lo_u32 v2, v4, v1 -; CHECK-NEXT: v_mul_lo_u32 v5, v3, v0 -; CHECK-NEXT: v_mul_hi_u32 v8, v3, v1 -; CHECK-NEXT: v_mul_hi_u32 v1, v4, v1 -; CHECK-NEXT: s_mov_b32 s6, 0x12d8fb -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v5 -; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v8 -; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CHECK-NEXT: v_mul_lo_u32 v8, v4, v0 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v5, v2 -; CHECK-NEXT: v_mul_hi_u32 v5, v3, v0 -; CHECK-NEXT: v_add_i32_e32 v1, vcc, v8, v1 -; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v5 -; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v8, v5 -; CHECK-NEXT: v_add_i32_e32 v8, vcc, v1, v2 -; CHECK-NEXT: v_mul_hi_u32 v9, v4, v0 -; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s6, v8, 0 +; CHECK-NEXT: v_mul_lo_u32 v7, v9, v1 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; CHECK-NEXT: v_mul_hi_u32 v3, v4, v1 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v7, v0 +; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v3 +; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v7, v3 +; CHECK-NEXT: v_add_i32_e32 v7, vcc, v0, v2 +; CHECK-NEXT: v_mul_hi_u32 v8, v9, v1 +; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v7, 0 ; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v5, v2 -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v9, v2 -; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[4:5], s6, v5, v[1:2] -; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v3, v0 -; CHECK-NEXT: v_subb_u32_e64 v2, s[4:5], v4, v1, vcc -; CHECK-NEXT: v_sub_i32_e64 v1, s[4:5], v4, v1 -; CHECK-NEXT: v_mov_b32_e32 v6, 0x12d8fb +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v8, v2 +; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v5, v3, v[1:2] +; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v4, v0 +; CHECK-NEXT: v_subb_u32_e64 v2, s[4:5], v9, v1, vcc +; CHECK-NEXT: v_sub_i32_e64 v1, s[4:5], v9, v1 ; CHECK-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc -; CHECK-NEXT: s_bfe_i32 s6, 1, 0x10000 -; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v6 -; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v6 -; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[4:5] -; CHECK-NEXT: v_mov_b32_e32 v4, s6 +; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v5 +; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v5 +; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[4:5] +; CHECK-NEXT: v_mov_b32_e32 v8, s6 ; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v2 ; CHECK-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc -; CHECK-NEXT: v_cndmask_b32_e64 v2, v4, v3, s[4:5] -; CHECK-NEXT: v_add_i32_e32 v3, vcc, 1, v8 -; CHECK-NEXT: 
v_addc_u32_e32 v4, vcc, 0, v5, vcc -; CHECK-NEXT: s_bfe_i32 s4, 1, 0x10000 -; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v0, v6 +; CHECK-NEXT: v_cndmask_b32_e64 v2, v8, v4, s[4:5] +; CHECK-NEXT: v_add_i32_e32 v4, vcc, 1, v7 +; CHECK-NEXT: v_addc_u32_e32 v9, vcc, 0, v3, vcc +; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v0, v5 ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc -; CHECK-NEXT: v_mov_b32_e32 v6, s4 ; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; CHECK-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc -; CHECK-NEXT: v_add_i32_e32 v1, vcc, 1, v3 -; CHECK-NEXT: v_addc_u32_e32 v6, vcc, 0, v4, vcc +; CHECK-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc +; CHECK-NEXT: v_add_i32_e32 v1, vcc, 1, v4 +; CHECK-NEXT: v_addc_u32_e32 v5, vcc, 0, v9, vcc ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; CHECK-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc -; CHECK-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc +; CHECK-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc +; CHECK-NEXT: v_cndmask_b32_e32 v1, v9, v5, vcc ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; CHECK-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc -; CHECK-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc -; CHECK-NEXT: v_xor_b32_e32 v0, v0, v7 -; CHECK-NEXT: v_xor_b32_e32 v1, v1, v7 -; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v7 -; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v7, vcc +; CHECK-NEXT: v_cndmask_b32_e32 v0, v7, v0, vcc +; CHECK-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; CHECK-NEXT: v_xor_b32_e32 v0, v0, v6 +; CHECK-NEXT: v_xor_b32_e32 v1, v1, v6 +; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v6 +; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v6, vcc ; CHECK-NEXT: s_setpc_b64 s[30:31] %result = sdiv i64 %num, 1235195 ret i64 %result @@ -2060,253 +2035,236 @@ ; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CGP-NEXT: v_cvt_f32_u32_e32 v4, 0x12d8fb ; CGP-NEXT: v_cvt_f32_ubyte0_e32 v5, 0 -; CGP-NEXT: s_mov_b32 s6, 0xffed2705 -; CGP-NEXT: s_mov_b32 s7, 0x12d8fb +; CGP-NEXT: s_bfe_i32 s6, 1, 0x10000 ; CGP-NEXT: v_mac_f32_e32 v4, 0x4f800000, v5 ; CGP-NEXT: v_rcp_iflag_f32_e32 v4, v4 -; CGP-NEXT: s_bfe_i32 s8, 1, 0x10000 ; CGP-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 ; CGP-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4 -; CGP-NEXT: v_trunc_f32_e32 v6, v5 -; CGP-NEXT: v_mac_f32_e32 v4, 0xcf800000, v6 -; CGP-NEXT: v_cvt_u32_f32_e32 v7, v4 -; CGP-NEXT: v_mov_b32_e32 v4, 0xffed2705 -; CGP-NEXT: v_cvt_u32_f32_e32 v8, v6 -; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v4, v7, 0 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], s6, v8, v[5:6] -; CGP-NEXT: v_mul_hi_u32 v9, v7, v4 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], -1, v7, v[5:6] -; CGP-NEXT: v_mul_lo_u32 v6, v8, v4 -; CGP-NEXT: v_mul_hi_u32 v4, v8, v4 -; CGP-NEXT: v_mul_lo_u32 v10, v7, v5 -; CGP-NEXT: v_mul_lo_u32 v11, v8, v5 -; CGP-NEXT: v_mul_hi_u32 v12, v7, v5 -; CGP-NEXT: v_mul_hi_u32 v5, v8, v5 -; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v10 -; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v9 -; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v6, vcc, v10, v6 +; CGP-NEXT: v_trunc_f32_e32 v7, v5 +; CGP-NEXT: v_mac_f32_e32 v4, 0xcf800000, v7 +; CGP-NEXT: v_cvt_u32_f32_e32 v6, v4 +; CGP-NEXT: v_mov_b32_e32 v5, 0xffed2705 +; CGP-NEXT: v_cvt_u32_f32_e32 v8, v7 +; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v5, v6, 0 +; CGP-NEXT: v_mov_b32_e32 v4, v10 +; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v5, v8, v[4:5] +; CGP-NEXT: v_mul_lo_u32 v4, v8, v9 +; CGP-NEXT: v_mul_hi_u32 v7, v6, v9 +; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], -1, v6, v[10:11] +; CGP-NEXT: v_mul_hi_u32 v9, v8, v9 +; CGP-NEXT: v_mul_lo_u32 v11, v6, v10 +; CGP-NEXT: 
v_mul_lo_u32 v12, v8, v10 +; CGP-NEXT: v_mul_hi_u32 v13, v6, v10 +; CGP-NEXT: v_mul_hi_u32 v10, v8, v10 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v11 +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v7 +; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v4, vcc, v11, v4 +; CGP-NEXT: v_add_i32_e32 v7, vcc, v12, v9 +; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v13 +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v11 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v7, v4 +; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v7, vcc, v9, v7 +; CGP-NEXT: v_add_i32_e32 v7, vcc, v10, v7 +; CGP-NEXT: v_add_i32_e32 v12, vcc, v6, v4 +; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v5, v12, 0 +; CGP-NEXT: v_addc_u32_e32 v13, vcc, v8, v7, vcc +; CGP-NEXT: v_mov_b32_e32 v4, v10 +; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v5, v13, v[4:5] +; CGP-NEXT: v_ashrrev_i32_e32 v7, 31, v1 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v7 +; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], -1, v12, v[10:11] +; CGP-NEXT: v_addc_u32_e32 v1, vcc, v1, v7, vcc +; CGP-NEXT: v_xor_b32_e32 v11, v0, v7 +; CGP-NEXT: v_mul_lo_u32 v0, v13, v9 +; CGP-NEXT: v_mul_lo_u32 v4, v12, v10 +; CGP-NEXT: v_xor_b32_e32 v14, v1, v7 +; CGP-NEXT: v_mul_hi_u32 v1, v12, v9 +; CGP-NEXT: v_mul_hi_u32 v9, v13, v9 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v4 +; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; CGP-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; CGP-NEXT: v_mul_lo_u32 v1, v13, v10 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v4, v0 +; CGP-NEXT: v_mul_hi_u32 v4, v12, v10 +; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v9 ; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v12 -; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v10 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v6 -; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v6, vcc, v9, v6 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v6 -; CGP-NEXT: v_add_i32_e32 v9, vcc, v7, v4 -; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], s6, v9, 0 -; CGP-NEXT: v_addc_u32_e32 v10, vcc, v8, v5, vcc -; CGP-NEXT: v_ashrrev_i32_e32 v5, 31, v1 -; CGP-NEXT: v_mov_b32_e32 v4, v7 -; CGP-NEXT: v_mad_u64_u32 v[7:8], s[4:5], s6, v10, v[4:5] -; CGP-NEXT: v_add_i32_e32 v4, vcc, v0, v5 -; CGP-NEXT: v_addc_u32_e32 v11, vcc, v1, v5, vcc -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], -1, v9, v[7:8] -; CGP-NEXT: v_xor_b32_e32 v8, v4, v5 -; CGP-NEXT: v_mul_lo_u32 v1, v10, v6 -; CGP-NEXT: v_mul_lo_u32 v4, v9, v0 -; CGP-NEXT: v_mul_hi_u32 v7, v9, v6 -; CGP-NEXT: v_mul_hi_u32 v6, v10, v6 -; CGP-NEXT: v_xor_b32_e32 v11, v11, v5 ; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v4 ; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v7 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v9, v4 +; CGP-NEXT: v_mul_hi_u32 v9, v13, v10 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v1, v0 ; CGP-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v7, v10, v0 -; CGP-NEXT: v_add_i32_e32 v1, vcc, v4, v1 -; CGP-NEXT: v_mul_hi_u32 v4, v9, v0 -; CGP-NEXT: v_add_i32_e32 v6, vcc, v7, v6 -; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v6, v4 -; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v6, vcc, v7, v6 -; CGP-NEXT: v_mul_hi_u32 v0, v10, v0 ; CGP-NEXT: v_add_i32_e32 v1, vcc, v4, v1 -; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v6, v4 -; 
CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v4 -; CGP-NEXT: v_add_i32_e32 v1, vcc, v9, v1 -; CGP-NEXT: v_addc_u32_e32 v0, vcc, v10, v0, vcc -; CGP-NEXT: v_mul_lo_u32 v6, v11, v1 -; CGP-NEXT: v_mul_lo_u32 v7, v8, v0 -; CGP-NEXT: v_mul_hi_u32 v9, v8, v1 -; CGP-NEXT: v_mul_hi_u32 v1, v11, v1 -; CGP-NEXT: v_mul_hi_u32 v10, v11, v0 -; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v7 -; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v9 -; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v9, v11, v0 -; CGP-NEXT: v_add_i32_e32 v6, vcc, v7, v6 -; CGP-NEXT: v_mul_hi_u32 v7, v8, v0 ; CGP-NEXT: v_add_i32_e32 v1, vcc, v9, v1 -; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v7 -; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v7, vcc, v9, v7 -; CGP-NEXT: v_add_i32_e32 v9, vcc, v1, v6 -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s7, v9, 0 -; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v6, vcc, v7, v6 -; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v6 -; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], s7, v10, v[1:2] -; CGP-NEXT: v_sub_i32_e32 v0, vcc, v8, v0 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v12, v0 +; CGP-NEXT: v_addc_u32_e32 v1, vcc, v13, v1, vcc +; CGP-NEXT: v_mul_lo_u32 v9, v14, v0 +; CGP-NEXT: v_mul_lo_u32 v10, v11, v1 +; CGP-NEXT: v_mul_hi_u32 v12, v11, v0 +; CGP-NEXT: v_mul_hi_u32 v0, v14, v0 ; CGP-NEXT: v_mov_b32_e32 v4, 0x12d8fb -; CGP-NEXT: v_subb_u32_e64 v1, s[4:5], v11, v6, vcc -; CGP-NEXT: v_sub_i32_e64 v6, s[4:5], v11, v6 +; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v10 +; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v12 +; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; CGP-NEXT: v_mul_lo_u32 v12, v14, v1 +; CGP-NEXT: v_add_i32_e32 v9, vcc, v10, v9 +; CGP-NEXT: v_mul_hi_u32 v10, v11, v1 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v12, v0 +; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v10 +; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v10, vcc, v12, v10 +; CGP-NEXT: v_add_i32_e32 v12, vcc, v0, v9 +; CGP-NEXT: v_mul_hi_u32 v13, v14, v1 +; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v12, 0 +; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v9, vcc, v10, v9 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v9 +; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v4, v13, v[1:2] +; CGP-NEXT: v_sub_i32_e32 v0, vcc, v11, v0 +; CGP-NEXT: v_subb_u32_e64 v1, s[4:5], v14, v9, vcc +; CGP-NEXT: v_sub_i32_e64 v9, s[4:5], v14, v9 ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v4 -; CGP-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5] +; CGP-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5] ; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v1 -; CGP-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v6, vcc -; CGP-NEXT: v_cvt_f32_u32_e32 v6, 0x12d8fb -; CGP-NEXT: v_mov_b32_e32 v8, s8 -; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v4 -; CGP-NEXT: v_cndmask_b32_e64 v8, v8, v7, s[4:5] -; CGP-NEXT: v_subbrev_u32_e32 v7, vcc, 0, v1, vcc -; CGP-NEXT: v_cvt_f32_ubyte0_e32 v1, 0 -; CGP-NEXT: v_mac_f32_e32 v6, 0x4f800000, v1 -; CGP-NEXT: v_rcp_iflag_f32_e32 v1, v6 -; CGP-NEXT: v_add_i32_e32 v11, vcc, 1, v9 -; CGP-NEXT: v_addc_u32_e32 v12, vcc, 0, v10, vcc -; CGP-NEXT: v_mul_f32_e32 v1, 0x5f7ffffc, v1 -; CGP-NEXT: v_mul_f32_e32 v6, 0x2f800000, v1 -; CGP-NEXT: v_trunc_f32_e32 v6, v6 -; CGP-NEXT: v_mac_f32_e32 v1, 0xcf800000, v6 -; CGP-NEXT: v_cvt_u32_f32_e32 v13, v1 -; CGP-NEXT: s_bfe_i32 s4, 1, 0x10000 -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v4 -; CGP-NEXT: v_mov_b32_e32 
v15, s4 -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s6, v13, 0 -; CGP-NEXT: v_cvt_u32_f32_e32 v16, v6 -; CGP-NEXT: v_cndmask_b32_e64 v14, 0, -1, vcc -; CGP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7 -; CGP-NEXT: v_cndmask_b32_e32 v14, v15, v14, vcc -; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], s6, v16, v[1:2] -; CGP-NEXT: v_add_i32_e32 v1, vcc, 1, v11 -; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], -1, v13, v[6:7] -; CGP-NEXT: v_addc_u32_e32 v15, vcc, 0, v12, vcc -; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 -; CGP-NEXT: v_cndmask_b32_e32 v7, v11, v1, vcc -; CGP-NEXT: v_mul_lo_u32 v1, v16, v0 -; CGP-NEXT: v_mul_lo_u32 v11, v13, v6 -; CGP-NEXT: v_mul_hi_u32 v14, v13, v0 -; CGP-NEXT: v_cndmask_b32_e32 v12, v12, v15, vcc -; CGP-NEXT: v_mul_hi_u32 v0, v16, v0 -; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v11 -; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v14 +; CGP-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v9, vcc +; CGP-NEXT: v_mov_b32_e32 v11, s6 +; CGP-NEXT: v_sub_i32_e32 v9, vcc, v0, v4 +; CGP-NEXT: v_cndmask_b32_e64 v14, v11, v10, s[4:5] +; CGP-NEXT: v_subbrev_u32_e32 v10, vcc, 0, v1, vcc +; CGP-NEXT: v_add_i32_e32 v15, vcc, 1, v12 +; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v6, 0 +; CGP-NEXT: v_addc_u32_e32 v16, vcc, 0, v13, vcc +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v9, v4 +; CGP-NEXT: v_cndmask_b32_e64 v9, 0, -1, vcc +; CGP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v10 +; CGP-NEXT: v_cndmask_b32_e32 v17, v11, v9, vcc +; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v5, v8, v[1:2] +; CGP-NEXT: v_add_i32_e32 v1, vcc, 1, v15 +; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], -1, v6, v[9:10] +; CGP-NEXT: v_addc_u32_e32 v18, vcc, 0, v16, vcc +; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v17 +; CGP-NEXT: v_cndmask_b32_e32 v10, v15, v1, vcc +; CGP-NEXT: v_mul_lo_u32 v1, v8, v0 +; CGP-NEXT: v_mul_lo_u32 v15, v6, v9 +; CGP-NEXT: v_mul_hi_u32 v17, v6, v0 +; CGP-NEXT: v_cndmask_b32_e32 v16, v16, v18, vcc +; CGP-NEXT: v_mul_hi_u32 v0, v8, v0 +; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v15 +; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v17 ; CGP-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v14, v16, v6 -; CGP-NEXT: v_add_i32_e32 v1, vcc, v11, v1 -; CGP-NEXT: v_mul_hi_u32 v11, v13, v6 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v14, v0 -; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v11 -; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v11, vcc, v14, v11 -; CGP-NEXT: v_mul_hi_u32 v6, v16, v6 +; CGP-NEXT: v_mul_lo_u32 v17, v8, v9 +; CGP-NEXT: v_add_i32_e32 v1, vcc, v15, v1 +; CGP-NEXT: v_mul_hi_u32 v15, v6, v9 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v17, v0 +; CGP-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v15 +; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v15, vcc, v17, v15 +; CGP-NEXT: v_mul_hi_u32 v9, v8, v9 ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; CGP-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v1, vcc, v11, v1 -; CGP-NEXT: v_add_i32_e32 v1, vcc, v6, v1 -; CGP-NEXT: v_add_i32_e32 v11, vcc, v13, v0 -; CGP-NEXT: v_addc_u32_e32 v13, vcc, v16, v1, vcc -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s6, v11, 0 -; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; CGP-NEXT: v_cndmask_b32_e32 v6, v9, v7, vcc -; CGP-NEXT: v_xor_b32_e32 v9, v6, v5 -; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], s6, v13, v[1:2] -; CGP-NEXT: v_cndmask_b32_e32 v8, v10, v12, vcc -; CGP-NEXT: v_xor_b32_e32 v1, v8, v5 -; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], -1, v11, v[6:7] -; CGP-NEXT: 
v_ashrrev_i32_e32 v8, 31, v3 -; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v8 -; CGP-NEXT: v_addc_u32_e32 v3, vcc, v3, v8, vcc -; CGP-NEXT: v_xor_b32_e32 v7, v2, v8 -; CGP-NEXT: v_mul_lo_u32 v2, v13, v0 -; CGP-NEXT: v_mul_lo_u32 v10, v11, v6 -; CGP-NEXT: v_xor_b32_e32 v12, v3, v8 -; CGP-NEXT: v_mul_hi_u32 v3, v11, v0 -; CGP-NEXT: v_mul_hi_u32 v0, v13, v0 +; CGP-NEXT: v_add_i32_e32 v1, vcc, v15, v1 +; CGP-NEXT: v_add_i32_e32 v1, vcc, v9, v1 +; CGP-NEXT: v_add_i32_e32 v9, vcc, v6, v0 +; CGP-NEXT: v_addc_u32_e32 v8, vcc, v8, v1, vcc +; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v9, 0 +; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; CGP-NEXT: v_cndmask_b32_e32 v6, v12, v10, vcc +; CGP-NEXT: v_xor_b32_e32 v12, v6, v7 +; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v5, v8, v[1:2] +; CGP-NEXT: v_cndmask_b32_e32 v10, v13, v16, vcc +; CGP-NEXT: v_xor_b32_e32 v1, v10, v7 +; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], -1, v9, v[5:6] +; CGP-NEXT: v_ashrrev_i32_e32 v10, 31, v3 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v10 -; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; CGP-NEXT: v_addc_u32_e32 v3, vcc, v3, v10, vcc +; CGP-NEXT: v_xor_b32_e32 v13, v2, v10 +; CGP-NEXT: v_mul_lo_u32 v2, v8, v0 +; CGP-NEXT: v_mul_lo_u32 v6, v9, v5 +; CGP-NEXT: v_xor_b32_e32 v14, v3, v10 +; CGP-NEXT: v_mul_hi_u32 v3, v9, v0 +; CGP-NEXT: v_mul_hi_u32 v0, v8, v0 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v6 +; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; CGP-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v3, v13, v6 -; CGP-NEXT: v_add_i32_e32 v2, vcc, v10, v2 -; CGP-NEXT: v_mul_hi_u32 v10, v11, v6 +; CGP-NEXT: v_mul_lo_u32 v3, v8, v5 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v6, v2 +; CGP-NEXT: v_mul_hi_u32 v6, v9, v5 ; CGP-NEXT: v_add_i32_e32 v0, vcc, v3, v0 ; CGP-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v10 -; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v10 -; CGP-NEXT: v_mul_hi_u32 v6, v13, v6 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v6 +; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v6 +; CGP-NEXT: v_mul_hi_u32 v5, v8, v5 ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; CGP-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; CGP-NEXT: v_add_i32_e32 v2, vcc, v6, v2 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v11, v0 -; CGP-NEXT: v_addc_u32_e32 v2, vcc, v13, v2, vcc -; CGP-NEXT: v_mul_lo_u32 v6, v12, v3 -; CGP-NEXT: v_mul_lo_u32 v10, v7, v2 -; CGP-NEXT: v_sub_i32_e32 v0, vcc, v9, v5 -; CGP-NEXT: v_subb_u32_e32 v1, vcc, v1, v5, vcc -; CGP-NEXT: v_mul_hi_u32 v5, v7, v3 -; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v10 -; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v5 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v5, v2 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v9, v0 +; CGP-NEXT: v_addc_u32_e32 v2, vcc, v8, v2, vcc +; CGP-NEXT: v_mul_lo_u32 v5, v14, v3 +; CGP-NEXT: v_mul_lo_u32 v6, v13, v2 +; CGP-NEXT: v_sub_i32_e32 v0, vcc, v12, v7 +; CGP-NEXT: v_subb_u32_e32 v1, vcc, v1, v7, vcc +; CGP-NEXT: v_mul_hi_u32 v7, v13, v3 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v6 +; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v7 ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v6, v12, v2 -; CGP-NEXT: v_mul_hi_u32 v3, v12, v3 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v9, v5 -; CGP-NEXT: v_mul_hi_u32 v9, v7, v2 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v6, v3 +; CGP-NEXT: v_mul_lo_u32 v7, v14, v2 +; CGP-NEXT: v_mul_hi_u32 v3, 
v14, v3
+; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v5
+; CGP-NEXT: v_mul_hi_u32 v6, v13, v2
+; CGP-NEXT: v_add_i32_e32 v3, vcc, v7, v3
+; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v6
; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v9
-; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v9
-; CGP-NEXT: v_add_i32_e32 v9, vcc, v3, v5
-; CGP-NEXT: v_mul_hi_u32 v10, v12, v2
-; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], s7, v9, 0
+; CGP-NEXT: v_add_i32_e32 v6, vcc, v7, v6
+; CGP-NEXT: v_add_i32_e32 v7, vcc, v3, v5
+; CGP-NEXT: v_mul_hi_u32 v8, v14, v2
+; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v7, 0
; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v5
-; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v5
-; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], s7, v10, v[3:4]
-; CGP-NEXT: v_sub_i32_e32 v2, vcc, v7, v2
-; CGP-NEXT: v_subb_u32_e64 v3, s[4:5], v12, v5, vcc
-; CGP-NEXT: v_sub_i32_e64 v5, s[4:5], v12, v5
+; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v5
+; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v8, v[3:4]
+; CGP-NEXT: v_sub_i32_e32 v2, vcc, v13, v2
+; CGP-NEXT: v_subb_u32_e64 v3, s[4:5], v14, v5, vcc
+; CGP-NEXT: v_sub_i32_e64 v5, s[4:5], v14, v5
; CGP-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc
-; CGP-NEXT: s_bfe_i32 s6, 1, 0x10000
; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v4
; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v4
; CGP-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5]
-; CGP-NEXT: v_mov_b32_e32 v7, s6
; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v3
; CGP-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc
-; CGP-NEXT: v_cndmask_b32_e64 v3, v7, v6, s[4:5]
-; CGP-NEXT: v_add_i32_e32 v6, vcc, 1, v9
-; CGP-NEXT: v_addc_u32_e32 v7, vcc, 0, v10, vcc
-; CGP-NEXT: s_bfe_i32 s4, 1, 0x10000
+; CGP-NEXT: v_cndmask_b32_e64 v3, v11, v6, s[4:5]
+; CGP-NEXT: v_add_i32_e32 v6, vcc, 1, v7
+; CGP-NEXT: v_addc_u32_e32 v9, vcc, 0, v8, vcc
; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v2, v4
; CGP-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc
-; CGP-NEXT: v_mov_b32_e32 v4, s4
; CGP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5
-; CGP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
+; CGP-NEXT: v_cndmask_b32_e32 v2, v11, v2, vcc
; CGP-NEXT: v_add_i32_e32 v4, vcc, 1, v6
-; CGP-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc
+; CGP-NEXT: v_addc_u32_e32 v5, vcc, 0, v9, vcc
; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
; CGP-NEXT: v_cndmask_b32_e32 v2, v6, v4, vcc
-; CGP-NEXT: v_cndmask_b32_e32 v4, v7, v5, vcc
+; CGP-NEXT: v_cndmask_b32_e32 v4, v9, v5, vcc
; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3
-; CGP-NEXT: v_cndmask_b32_e32 v2, v9, v2, vcc
-; CGP-NEXT: v_cndmask_b32_e32 v3, v10, v4, vcc
-; CGP-NEXT: v_xor_b32_e32 v2, v2, v8
-; CGP-NEXT: v_xor_b32_e32 v3, v3, v8
-; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v8
-; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v8, vcc
+; CGP-NEXT: v_cndmask_b32_e32 v2, v7, v2, vcc
+; CGP-NEXT: v_cndmask_b32_e32 v3, v8, v4, vcc
+; CGP-NEXT: v_xor_b32_e32 v2, v2, v10
+; CGP-NEXT: v_xor_b32_e32 v3, v3, v10
+; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v10
+; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v10, vcc
; CGP-NEXT: s_setpc_b64 s[30:31]
%result = sdiv <2 x i64> %num, <i64 1235195, i64 1235195>
ret <2 x i64> %result
@@ -2479,17 +2437,17 @@
; CHECK-NEXT: v_mul_hi_u32 v1, v0, v1
; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v1
; CHECK-NEXT: v_mul_hi_u32 v0, v3, v0
-; CHECK-NEXT: v_mul_lo_u32 v1, v0, v5
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, 1, v0
-; CHECK-NEXT: v_sub_i32_e32 v1, vcc, v3, v1
-; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v1, v5
-; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
-; CHECK-NEXT: v_sub_i32_e64 v2, s[4:5], v1, v5
-; CHECK-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, 1, v0
-; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v1, v5
-; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; CHECK-NEXT: v_mov_b32_e32 v1, 0
+; CHECK-NEXT: v_mul_lo_u32 v2, v0, v5
+; CHECK-NEXT: v_add_i32_e32 v4, vcc, 1, v0
+; CHECK-NEXT: v_sub_i32_e32 v2, vcc, v3, v2
+; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v2, v5
+; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
+; CHECK-NEXT: v_sub_i32_e64 v3, s[4:5], v2, v5
+; CHECK-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc
+; CHECK-NEXT: v_add_i32_e32 v3, vcc, 1, v0
+; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v2, v5
+; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
; CHECK-NEXT: s_or_b64 exec, exec, s[6:7]
; CHECK-NEXT: s_setpc_b64 s[30:31]
%shl.y = shl i64 4096, %y
@@ -2935,17 +2893,17 @@
; CGP-NEXT: v_mul_hi_u32 v1, v0, v1
; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v1
; CGP-NEXT: v_mul_hi_u32 v0, v8, v0
-; CGP-NEXT: v_mul_lo_u32 v1, v0, v2
-; CGP-NEXT: v_add_i32_e32 v3, vcc, 1, v0
-; CGP-NEXT: v_sub_i32_e32 v1, vcc, v8, v1
-; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v2
-; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
-; CGP-NEXT: v_sub_i32_e64 v3, s[4:5], v1, v2
-; CGP-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
-; CGP-NEXT: v_add_i32_e32 v3, vcc, 1, v0
-; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v2
-; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
; CGP-NEXT: v_mov_b32_e32 v1, 0
+; CGP-NEXT: v_mul_lo_u32 v3, v0, v2
+; CGP-NEXT: v_add_i32_e32 v4, vcc, 1, v0
+; CGP-NEXT: v_sub_i32_e32 v3, vcc, v8, v3
+; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v3, v2
+; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
+; CGP-NEXT: v_sub_i32_e64 v4, s[4:5], v3, v2
+; CGP-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc
+; CGP-NEXT: v_add_i32_e32 v4, vcc, 1, v0
+; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v3, v2
+; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
; CGP-NEXT: .LBB8_4:
; CGP-NEXT: s_or_b64 exec, exec, s[8:9]
; CGP-NEXT: v_or_b32_e32 v3, v7, v10
@@ -3107,17 +3065,17 @@
; CGP-NEXT: v_mul_hi_u32 v3, v2, v3
; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v3
; CGP-NEXT: v_mul_hi_u32 v2, v5, v2
-; CGP-NEXT: v_mul_lo_u32 v3, v2, v9
-; CGP-NEXT: v_add_i32_e32 v4, vcc, 1, v2
-; CGP-NEXT: v_sub_i32_e32 v3, vcc, v5, v3
-; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v3, v9
-; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
-; CGP-NEXT: v_sub_i32_e64 v4, s[4:5], v3, v9
-; CGP-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc
-; CGP-NEXT: v_add_i32_e32 v4, vcc, 1, v2
-; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v3, v9
-; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
; CGP-NEXT: v_mov_b32_e32 v3, 0
+; CGP-NEXT: v_mul_lo_u32 v4, v2, v9
+; CGP-NEXT: v_add_i32_e32 v6, vcc, 1, v2
+; CGP-NEXT: v_sub_i32_e32 v4, vcc, v5, v4
+; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v4, v9
+; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc
+; CGP-NEXT: v_sub_i32_e64 v5, s[4:5], v4, v9
+; CGP-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc
+; CGP-NEXT: v_add_i32_e32 v5, vcc, 1, v2
+; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v4, v9
+; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc
; CGP-NEXT: s_or_b64 exec, exec, s[6:7]
; CGP-NEXT: s_setpc_b64 s[30:31]
%shl.y = shl <2 x i64> <i64 4096, i64 4096>, %y
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll
@@ -277,16 +277,15 @@
; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11
; GFX8-NEXT: v_cndmask_b32_e32 v9, v9, v12, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v10, v10, v13, vcc
-; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6
-; GFX8-NEXT: 
v_cmp_ne_u32_e64 s[0:1], 0, v11 -; GFX8-NEXT: v_cndmask_b32_e32 v4, v4, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v5, v7, v5, s[0:1] -; GFX8-NEXT: v_cndmask_b32_e64 v1, v8, v1, s[0:1] +; GFX8-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v6 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v4, v4, v9, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v10, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v5, v0, v5, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v1, s[0:1] ; GFX8-NEXT: s_xor_b64 s[0:1], s[2:3], s[12:13] -; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v10, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v5, v0, v5, vcc ; GFX8-NEXT: v_xor_b32_e32 v0, s0, v4 -; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v1, vcc ; GFX8-NEXT: v_xor_b32_e32 v1, s1, v3 ; GFX8-NEXT: v_mov_b32_e32 v3, s1 ; GFX8-NEXT: v_subrev_u32_e32 v0, vcc, s0, v0 @@ -358,7 +357,7 @@ ; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v1, vcc ; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s14, v3, 0 -; GFX9-NEXT: v_mov_b32_e32 v7, s9 +; GFX9-NEXT: v_mov_b32_e32 v7, 0 ; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s14, v4, v[1:2] ; GFX9-NEXT: v_mul_hi_u32 v6, v3, v0 ; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s15, v3, v[1:2] @@ -408,7 +407,7 @@ ; GFX9-NEXT: v_mov_b32_e32 v6, s11 ; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s10, v0 ; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s9, v5, v[1:2] -; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: v_mov_b32_e32 v4, s9 ; GFX9-NEXT: v_subb_co_u32_e64 v2, s[0:1], v6, v1, vcc ; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v2 ; GFX9-NEXT: v_sub_u32_e32 v1, s11, v1 @@ -416,7 +415,7 @@ ; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v0 ; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[0:1] ; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v2 -; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v7, vcc +; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v4, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v6, v6, v8, s[0:1] ; GFX9-NEXT: v_subrev_co_u32_e32 v8, vcc, s8, v0 ; GFX9-NEXT: v_subbrev_co_u32_e64 v9, s[0:1], 0, v1, vcc @@ -425,10 +424,10 @@ ; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v9 ; GFX9-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[0:1] ; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v8 -; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v7, vcc +; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v4, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[0:1] ; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v9 -; GFX9-NEXT: v_subrev_co_u32_e32 v7, vcc, s8, v8 +; GFX9-NEXT: v_subrev_co_u32_e32 v4, vcc, s8, v8 ; GFX9-NEXT: v_cndmask_b32_e64 v12, v12, v13, s[0:1] ; GFX9-NEXT: v_add_co_u32_e64 v13, s[0:1], 1, v10 ; GFX9-NEXT: v_subbrev_co_u32_e32 v1, vcc, 0, v1, vcc @@ -436,27 +435,26 @@ ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 ; GFX9-NEXT: v_cndmask_b32_e32 v10, v10, v13, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v11, v11, v14, vcc -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 -; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v12 -; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v10, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v6, v8, v7, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v1, v9, v1, s[0:1] +; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v6 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v5, v5, v10, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v11, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v4, v0, v4, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v1, s[0:1] ; GFX9-NEXT: s_xor_b64 s[0:1], s[2:3], s[12:13] -; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v11, vcc -; GFX9-NEXT: 
v_cndmask_b32_e32 v6, v0, v6, vcc ; GFX9-NEXT: v_xor_b32_e32 v0, s0, v5 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v1, vcc ; GFX9-NEXT: v_xor_b32_e32 v1, s1, v3 ; GFX9-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NEXT: v_subrev_co_u32_e32 v0, vcc, s0, v0 ; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v3, vcc -; GFX9-NEXT: v_xor_b32_e32 v3, s2, v6 -; GFX9-NEXT: v_xor_b32_e32 v5, s2, v2 -; GFX9-NEXT: v_mov_b32_e32 v6, s2 +; GFX9-NEXT: v_xor_b32_e32 v3, s2, v4 +; GFX9-NEXT: v_xor_b32_e32 v4, s2, v2 +; GFX9-NEXT: v_mov_b32_e32 v5, s2 ; GFX9-NEXT: v_subrev_co_u32_e32 v2, vcc, s2, v3 -; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v5, v6, vcc -; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[4:5] -; GFX9-NEXT: global_store_dwordx2 v4, v[2:3], s[6:7] +; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v4, v5, vcc +; GFX9-NEXT: global_store_dwordx2 v7, v[0:1], s[4:5] +; GFX9-NEXT: global_store_dwordx2 v7, v[2:3], s[6:7] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: sdivrem_i64: @@ -559,47 +557,46 @@ ; GFX10-NEXT: v_add_co_u32 v2, s10, v2, v0 ; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s10 ; GFX10-NEXT: v_mul_lo_u32 v4, s9, v2 -; GFX10-NEXT: v_add_co_u32 v6, vcc_lo, v2, 1 ; GFX10-NEXT: v_add3_u32 v3, v3, v0, v1 ; GFX10-NEXT: v_mad_u64_u32 v[0:1], s10, s8, v2, 0 ; GFX10-NEXT: v_mul_lo_u32 v5, s8, v3 -; GFX10-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, 0, v3, vcc_lo ; GFX10-NEXT: v_add3_u32 v1, v1, v5, v4 -; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, v6, 1 -; GFX10-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v7, vcc_lo -; GFX10-NEXT: v_sub_nc_u32_e32 v8, s1, v1 +; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, v2, 1 +; GFX10-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v3, vcc_lo +; GFX10-NEXT: v_sub_nc_u32_e32 v6, s1, v1 ; GFX10-NEXT: v_sub_co_u32 v0, vcc_lo, s0, v0 ; GFX10-NEXT: v_sub_co_ci_u32_e64 v1, s0, s1, v1, vcc_lo -; GFX10-NEXT: v_subrev_co_ci_u32_e32 v8, vcc_lo, s9, v8, vcc_lo +; GFX10-NEXT: v_subrev_co_ci_u32_e32 v6, vcc_lo, s9, v6, vcc_lo ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s8, v0 -; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, -1, vcc_lo -; GFX10-NEXT: v_sub_co_u32 v10, vcc_lo, v0, s8 -; GFX10-NEXT: v_subrev_co_ci_u32_e64 v11, s0, 0, v8, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc_lo +; GFX10-NEXT: v_sub_co_u32 v8, vcc_lo, v0, s8 +; GFX10-NEXT: v_subrev_co_ci_u32_e64 v9, s0, 0, v6, vcc_lo ; GFX10-NEXT: v_cmp_le_u32_e64 s0, s9, v1 -; GFX10-NEXT: v_subrev_co_ci_u32_e32 v8, vcc_lo, s9, v8, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, s9, v11 +; GFX10-NEXT: v_subrev_co_ci_u32_e32 v6, vcc_lo, s9, v6, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v10, 0, -1, s0 +; GFX10-NEXT: v_cmp_le_u32_e64 s0, s8, v8 +; GFX10-NEXT: v_cndmask_b32_e64 v11, 0, -1, s0 +; GFX10-NEXT: v_cmp_le_u32_e64 s0, s9, v9 ; GFX10-NEXT: v_cndmask_b32_e64 v12, 0, -1, s0 -; GFX10-NEXT: v_cmp_le_u32_e64 s0, s8, v10 -; GFX10-NEXT: v_cndmask_b32_e64 v13, 0, -1, s0 -; GFX10-NEXT: v_cmp_le_u32_e64 s0, s9, v11 -; GFX10-NEXT: v_cndmask_b32_e64 v14, 0, -1, s0 +; GFX10-NEXT: v_add_co_u32 v13, s0, v4, 1 +; GFX10-NEXT: v_add_co_ci_u32_e64 v14, s0, 0, v5, s0 +; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s9, v9 +; GFX10-NEXT: v_cndmask_b32_e64 v11, v12, v11, s0 ; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s9, v1 -; GFX10-NEXT: v_cndmask_b32_e64 v9, v12, v9, s0 -; GFX10-NEXT: v_cndmask_b32_e32 v12, v14, v13, vcc_lo -; GFX10-NEXT: v_sub_co_u32 v13, vcc_lo, v10, s8 -; GFX10-NEXT: v_subrev_co_ci_u32_e32 v8, vcc_lo, 0, v8, vcc_lo -; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v12 -; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, v12 -; GFX10-NEXT: v_cmp_ne_u32_e64 s1, 0, v9 +; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v11 +; GFX10-NEXT: 
v_cndmask_b32_e64 v7, v10, v7, s0 +; GFX10-NEXT: v_sub_co_u32 v10, s0, v8, s8 +; GFX10-NEXT: v_subrev_co_ci_u32_e64 v6, s0, 0, v6, s0 +; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v13, vcc_lo +; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, v7 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v5, v14, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v7, v8, v10, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v6, v9, v6, vcc_lo ; GFX10-NEXT: s_xor_b64 s[8:9], s[2:3], s[12:13] -; GFX10-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v6, v10, v13, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v7, v11, v8, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v4, s1 -; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v5, s1 -; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v6, s1 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v7, s1 +; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v4, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v5, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v7, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v6, s0 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: v_xor_b32_e32 v2, s8, v2 ; GFX10-NEXT: v_xor_b32_e32 v3, s9, v3 @@ -1423,40 +1420,40 @@ ; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 ; GFX8-NEXT: v_cndmask_b32_e32 v5, v1, v12, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v10, v10, v13, vcc +; GFX8-NEXT: s_mov_b32 s13, s12 ; GFX8-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; GFX8-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 -; GFX8-NEXT: v_trunc_f32_e32 v12, v1 -; GFX8-NEXT: v_mul_f32_e32 v1, 0xcf800000, v12 +; GFX8-NEXT: v_trunc_f32_e32 v11, v1 +; GFX8-NEXT: v_mul_f32_e32 v1, 0xcf800000, v11 ; GFX8-NEXT: v_add_f32_e32 v0, v1, v0 -; GFX8-NEXT: v_cvt_u32_f32_e32 v13, v0 -; GFX8-NEXT: s_mov_b32 s13, s12 +; GFX8-NEXT: v_cvt_u32_f32_e32 v12, v0 ; GFX8-NEXT: s_xor_b64 s[6:7], s[0:1], s[12:13] ; GFX8-NEXT: s_sub_u32 s5, 0, s2 -; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s5, v13, 0 -; GFX8-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc -; GFX8-NEXT: v_cvt_u32_f32_e32 v5, v12 ; GFX8-NEXT: s_subb_u32 s20, 0, s3 -; GFX8-NEXT: v_cndmask_b32_e32 v10, v3, v10, vcc -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s5, v5, v[1:2] -; GFX8-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v11 -; GFX8-NEXT: v_cndmask_b32_e64 v3, v8, v15, s[0:1] -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[14:15], s20, v13, v[1:2] -; GFX8-NEXT: v_cndmask_b32_e64 v2, v9, v16, s[0:1] -; GFX8-NEXT: v_cndmask_b32_e32 v7, v7, v3, vcc +; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s5, v12, 0 +; GFX8-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v2 +; GFX8-NEXT: v_cndmask_b32_e64 v4, v4, v5, s[0:1] +; GFX8-NEXT: v_cvt_u32_f32_e32 v5, v11 +; GFX8-NEXT: v_cndmask_b32_e32 v10, v10, v13, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v10, v3, v10, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e32 v3, v8, v15, vcc +; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[14:15], s5, v5, v[1:2] +; GFX8-NEXT: v_cndmask_b32_e64 v7, v7, v3, s[0:1] ; GFX8-NEXT: v_mul_lo_u32 v3, v5, v0 -; GFX8-NEXT: v_mul_lo_u32 v8, v13, v1 -; GFX8-NEXT: v_cndmask_b32_e32 v6, v6, v2, vcc -; GFX8-NEXT: v_mul_hi_u32 v2, v13, v0 +; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[14:15], s20, v12, v[1:2] +; GFX8-NEXT: v_cndmask_b32_e32 v2, v9, v16, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v6, v6, v2, s[0:1] +; GFX8-NEXT: v_mul_lo_u32 v8, v12, v1 +; GFX8-NEXT: v_mul_hi_u32 v2, v12, v0 ; GFX8-NEXT: v_mul_hi_u32 v0, v5, v0 +; GFX8-NEXT: v_xor_b32_e32 v9, s19, v10 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v8 ; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2 ; GFX8-NEXT: 
v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX8-NEXT: v_mul_lo_u32 v3, v5, v1 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v8, v2 -; GFX8-NEXT: v_mul_hi_u32 v8, v13, v1 +; GFX8-NEXT: v_mul_hi_u32 v8, v12, v1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v3, v0 ; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v8 @@ -1467,16 +1464,15 @@ ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v13, v0 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v12, v0 ; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s5, v8, 0 ; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v5, v1, vcc ; GFX8-NEXT: v_xor_b32_e32 v1, s18, v4 ; GFX8-NEXT: v_mov_b32_e32 v0, v3 ; GFX8-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s5, v5, v[0:1] -; GFX8-NEXT: v_xor_b32_e32 v9, s19, v10 ; GFX8-NEXT: v_mov_b32_e32 v10, s19 -; GFX8-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s20, v8, v[3:4] ; GFX8-NEXT: v_subrev_u32_e32 v0, vcc, s18, v1 +; GFX8-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s20, v8, v[3:4] ; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v9, v10, vcc ; GFX8-NEXT: v_xor_b32_e32 v4, s4, v7 ; GFX8-NEXT: v_mul_lo_u32 v7, v5, v2 @@ -1642,7 +1638,6 @@ ; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v1, vcc ; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s18, v3, 0 -; GFX9-NEXT: v_mov_b32_e32 v7, s7 ; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s18, v4, v[1:2] ; GFX9-NEXT: v_mul_hi_u32 v6, v3, v0 ; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s19, v3, v[1:2] @@ -1674,7 +1669,7 @@ ; GFX9-NEXT: v_mul_lo_u32 v3, s12, v1 ; GFX9-NEXT: v_mul_hi_u32 v4, s12, v0 ; GFX9-NEXT: v_mul_hi_u32 v0, s13, v0 -; GFX9-NEXT: v_mul_hi_u32 v6, s13, v1 +; GFX9-NEXT: v_mul_hi_u32 v7, s13, v1 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v3 ; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 @@ -1688,159 +1683,158 @@ ; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v0, v2 ; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s6, v5, 0 -; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; GFX9-NEXT: v_add_u32_e32 v3, v4, v3 -; GFX9-NEXT: v_add3_u32 v4, v3, v0, v6 -; GFX9-NEXT: v_mov_b32_e32 v0, v2 -; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s6, v4, v[0:1] -; GFX9-NEXT: v_mov_b32_e32 v6, s13 +; GFX9-NEXT: v_add3_u32 v4, v3, v6, v7 +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s6, v4, v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v7, s13 ; GFX9-NEXT: v_sub_co_u32_e32 v8, vcc, s12, v1 ; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s7, v5, v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v6, s7 ; GFX9-NEXT: s_ashr_i32 s12, s15, 31 -; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: v_subb_co_u32_e64 v6, s[0:1], v6, v2, vcc +; GFX9-NEXT: v_subb_co_u32_e64 v7, s[0:1], v7, v2, vcc ; GFX9-NEXT: v_sub_u32_e32 v1, s13, v2 -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s7, v6 +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s7, v7 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[0:1] ; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s6, v8 -; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v7, vcc +; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v6, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s7, v6 -; GFX9-NEXT: v_subrev_co_u32_e32 v9, vcc, s6, v8 -; GFX9-NEXT: v_cndmask_b32_e64 v3, v2, v3, s[0:1] -; GFX9-NEXT: v_subbrev_co_u32_e64 v10, s[0:1], 0, v1, vcc -; GFX9-NEXT: v_add_co_u32_e64 v2, s[0:1], 1, v5 -; GFX9-NEXT: v_addc_co_u32_e64 v11, s[0:1], 0, v4, s[0:1] -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s7, v10 -; 
GFX9-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s6, v9 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s7, v7 +; GFX9-NEXT: v_subrev_co_u32_e32 v10, vcc, s6, v8 +; GFX9-NEXT: v_cndmask_b32_e64 v9, v2, v3, s[0:1] +; GFX9-NEXT: v_subbrev_co_u32_e64 v11, s[0:1], 0, v1, vcc +; GFX9-NEXT: v_add_co_u32_e64 v3, s[0:1], 1, v5 +; GFX9-NEXT: v_addc_co_u32_e64 v12, s[0:1], 0, v4, s[0:1] +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s7, v11 +; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[0:1] +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s6, v10 ; GFX9-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s7, v10 -; GFX9-NEXT: v_cndmask_b32_e64 v12, v12, v13, s[0:1] -; GFX9-NEXT: v_add_co_u32_e64 v13, s[0:1], 1, v2 -; GFX9-NEXT: v_addc_co_u32_e64 v14, s[0:1], 0, v11, s[0:1] +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s7, v11 +; GFX9-NEXT: v_cndmask_b32_e64 v13, v2, v13, s[0:1] +; GFX9-NEXT: v_add_co_u32_e64 v14, s[0:1], 1, v3 +; GFX9-NEXT: v_addc_co_u32_e64 v15, s[0:1], 0, v12, s[0:1] ; GFX9-NEXT: s_add_u32 s0, s14, s12 ; GFX9-NEXT: s_addc_u32 s1, s15, s12 ; GFX9-NEXT: s_add_u32 s2, s2, s16 ; GFX9-NEXT: s_addc_u32 s3, s3, s16 ; GFX9-NEXT: s_xor_b64 s[2:3], s[2:3], s[16:17] -; GFX9-NEXT: v_cvt_f32_u32_e32 v15, s3 -; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v7, vcc -; GFX9-NEXT: v_cvt_f32_u32_e32 v7, s2 -; GFX9-NEXT: v_subrev_co_u32_e32 v16, vcc, s6, v9 -; GFX9-NEXT: v_subbrev_co_u32_e32 v17, vcc, 0, v1, vcc -; GFX9-NEXT: v_mul_f32_e32 v1, 0x4f800000, v15 -; GFX9-NEXT: v_add_f32_e32 v1, v1, v7 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1 -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 -; GFX9-NEXT: v_cndmask_b32_e32 v7, v2, v13, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v11, v11, v14, vcc -; GFX9-NEXT: v_mul_f32_e32 v1, 0x5f7ffffc, v1 +; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s3 +; GFX9-NEXT: v_cvt_f32_u32_e32 v16, s2 +; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v6, vcc +; GFX9-NEXT: v_mul_f32_e32 v2, 0x4f800000, v2 +; GFX9-NEXT: v_add_f32_e32 v2, v2, v16 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v2 +; GFX9-NEXT: v_subrev_co_u32_e32 v6, vcc, s6, v10 +; GFX9-NEXT: v_subbrev_co_u32_e32 v16, vcc, 0, v1, vcc +; GFX9-NEXT: v_mul_f32_e32 v1, 0x5f7ffffc, v2 ; GFX9-NEXT: v_mul_f32_e32 v2, 0x2f800000, v1 -; GFX9-NEXT: v_trunc_f32_e32 v13, v2 -; GFX9-NEXT: v_mul_f32_e32 v2, 0xcf800000, v13 +; GFX9-NEXT: v_trunc_f32_e32 v17, v2 +; GFX9-NEXT: v_mul_f32_e32 v2, 0xcf800000, v17 ; GFX9-NEXT: v_add_f32_e32 v1, v2, v1 -; GFX9-NEXT: v_cvt_u32_f32_e32 v14, v1 +; GFX9-NEXT: v_cvt_u32_f32_e32 v18, v1 ; GFX9-NEXT: s_mov_b32 s13, s12 ; GFX9-NEXT: s_xor_b64 s[6:7], s[0:1], s[12:13] ; GFX9-NEXT: s_sub_u32 s5, 0, s2 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s5, v14, 0 -; GFX9-NEXT: v_cvt_u32_f32_e32 v13, v13 -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 -; GFX9-NEXT: s_subb_u32 s14, 0, s3 -; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc -; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s5, v13, v[2:3] -; GFX9-NEXT: v_cndmask_b32_e32 v7, v4, v11, vcc -; GFX9-NEXT: v_mul_hi_u32 v11, v14, v1 -; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s14, v14, v[2:3] -; GFX9-NEXT: v_mul_lo_u32 v3, v13, v1 -; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v12 -; GFX9-NEXT: v_mul_lo_u32 v4, v14, v2 -; GFX9-NEXT: v_cndmask_b32_e64 v9, v9, v16, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v10, v10, v17, s[0:1] -; GFX9-NEXT: v_mul_hi_u32 v1, v13, v1 -; GFX9-NEXT: v_add_co_u32_e64 v3, s[0:1], v3, v4 -; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[0:1] -; GFX9-NEXT: v_add_co_u32_e64 v3, s[0:1], v3, v11 -; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[0:1] -; GFX9-NEXT: 
v_mul_lo_u32 v11, v13, v2 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s5, v18, 0 +; GFX9-NEXT: v_cndmask_b32_e32 v13, v3, v14, vcc +; GFX9-NEXT: v_cvt_u32_f32_e32 v14, v17 +; GFX9-NEXT: s_subb_u32 s20, 0, s3 +; GFX9-NEXT: v_cndmask_b32_e32 v12, v12, v15, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v6, v10, v6, vcc +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s5, v14, v[2:3] +; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v9 +; GFX9-NEXT: v_cndmask_b32_e64 v9, v4, v12, s[0:1] +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[14:15], s20, v18, v[2:3] +; GFX9-NEXT: v_mul_lo_u32 v3, v14, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v10, v11, v16, vcc +; GFX9-NEXT: v_mul_lo_u32 v4, v18, v2 +; GFX9-NEXT: v_mul_hi_u32 v11, v18, v1 +; GFX9-NEXT: v_mul_hi_u32 v1, v14, v1 +; GFX9-NEXT: v_cndmask_b32_e64 v5, v5, v13, s[0:1] +; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v4 +; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v11 +; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; GFX9-NEXT: v_mul_lo_u32 v11, v14, v2 ; GFX9-NEXT: v_add_u32_e32 v3, v4, v3 -; GFX9-NEXT: v_mul_hi_u32 v4, v14, v2 -; GFX9-NEXT: v_mul_hi_u32 v2, v13, v2 -; GFX9-NEXT: v_add_co_u32_e64 v1, s[0:1], v11, v1 -; GFX9-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[0:1] -; GFX9-NEXT: v_add_co_u32_e64 v1, s[0:1], v1, v4 -; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[0:1] -; GFX9-NEXT: v_add_co_u32_e64 v1, s[0:1], v1, v3 +; GFX9-NEXT: v_mul_hi_u32 v4, v18, v2 +; GFX9-NEXT: v_mul_hi_u32 v2, v14, v2 +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v11, v1 +; GFX9-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v4 +; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v3 ; GFX9-NEXT: v_add_u32_e32 v4, v11, v4 -; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v11, vcc, v18, v1 ; GFX9-NEXT: v_add3_u32 v2, v4, v3, v2 -; GFX9-NEXT: v_add_co_u32_e64 v11, s[0:1], v14, v1 -; GFX9-NEXT: v_addc_co_u32_e64 v12, s[0:1], v13, v2, s[0:1] -; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s5, v11, 0 -; GFX9-NEXT: v_cndmask_b32_e32 v8, v8, v9, vcc -; GFX9-NEXT: v_xor_b32_e32 v9, s18, v5 +; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[14:15], s5, v11, 0 +; GFX9-NEXT: v_addc_co_u32_e32 v12, vcc, v14, v2, vcc ; GFX9-NEXT: v_mov_b32_e32 v1, v4 +; GFX9-NEXT: v_cndmask_b32_e64 v6, v8, v6, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v7, v7, v10, s[0:1] ; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s5, v12, v[1:2] -; GFX9-NEXT: v_cndmask_b32_e32 v6, v6, v10, vcc -; GFX9-NEXT: v_xor_b32_e32 v7, s19, v7 -; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s14, v11, v[1:2] +; GFX9-NEXT: v_xor_b32_e32 v8, s18, v5 +; GFX9-NEXT: v_xor_b32_e32 v9, s19, v9 +; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s20, v11, v[1:2] ; GFX9-NEXT: v_mov_b32_e32 v10, s19 -; GFX9-NEXT: v_subrev_co_u32_e32 v1, vcc, s18, v9 -; GFX9-NEXT: v_subb_co_u32_e32 v2, vcc, v7, v10, vcc -; GFX9-NEXT: v_xor_b32_e32 v5, s4, v8 -; GFX9-NEXT: v_mul_lo_u32 v7, v12, v3 +; GFX9-NEXT: v_subrev_co_u32_e32 v1, vcc, s18, v8 +; GFX9-NEXT: v_xor_b32_e32 v5, s4, v6 +; GFX9-NEXT: v_mul_lo_u32 v6, v12, v3 ; GFX9-NEXT: v_mul_lo_u32 v8, v11, v4 +; GFX9-NEXT: v_subb_co_u32_e32 v2, vcc, v9, v10, vcc ; GFX9-NEXT: v_mul_hi_u32 v9, v11, v3 -; GFX9-NEXT: v_mul_hi_u32 v3, v12, v3 -; GFX9-NEXT: v_xor_b32_e32 v6, s4, v6 -; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v7, v8 +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v6, v8 ; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v7, 
v9 -; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v6, v9 +; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; GFX9-NEXT: v_mul_lo_u32 v9, v12, v4 -; GFX9-NEXT: v_add_u32_e32 v7, v8, v7 +; GFX9-NEXT: v_mul_hi_u32 v3, v12, v3 +; GFX9-NEXT: v_add_u32_e32 v6, v8, v6 ; GFX9-NEXT: v_mul_hi_u32 v8, v11, v4 ; GFX9-NEXT: v_mul_hi_u32 v4, v12, v4 ; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v9, v3 ; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v8 ; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v7 +; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v6 ; GFX9-NEXT: v_add_u32_e32 v8, v9, v8 -; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GFX9-NEXT: v_add3_u32 v4, v8, v7, v4 +; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; GFX9-NEXT: v_add3_u32 v4, v8, v6, v4 ; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v11, v3 ; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v12, v4, vcc -; GFX9-NEXT: v_mul_lo_u32 v7, s7, v3 +; GFX9-NEXT: v_mul_lo_u32 v6, s7, v3 ; GFX9-NEXT: v_mul_lo_u32 v8, s6, v4 ; GFX9-NEXT: v_mul_hi_u32 v10, s6, v3 ; GFX9-NEXT: v_mul_hi_u32 v3, s7, v3 -; GFX9-NEXT: v_mul_hi_u32 v12, s7, v4 -; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v7, v8 +; GFX9-NEXT: v_mul_hi_u32 v13, s7, v4 +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v6, v8 ; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v7, v10 -; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v6, v10 +; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; GFX9-NEXT: v_mul_lo_u32 v10, s7, v4 -; GFX9-NEXT: v_add_u32_e32 v7, v8, v7 +; GFX9-NEXT: v_add_u32_e32 v6, v8, v6 ; GFX9-NEXT: v_mul_hi_u32 v8, s6, v4 -; GFX9-NEXT: v_mov_b32_e32 v9, s4 +; GFX9-NEXT: v_xor_b32_e32 v7, s4, v7 ; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v10, v3 ; GFX9-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v8 ; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v11, vcc, v3, v7 +; GFX9-NEXT: v_add_co_u32_e32 v11, vcc, v3, v6 ; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s2, v11, 0 -; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GFX9-NEXT: v_mov_b32_e32 v9, s4 +; GFX9-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc ; GFX9-NEXT: v_subrev_co_u32_e32 v5, vcc, s4, v5 -; GFX9-NEXT: v_add_u32_e32 v8, v10, v8 -; GFX9-NEXT: v_subb_co_u32_e32 v6, vcc, v6, v9, vcc -; GFX9-NEXT: v_add3_u32 v9, v8, v7, v12 +; GFX9-NEXT: v_subb_co_u32_e32 v6, vcc, v7, v9, vcc +; GFX9-NEXT: v_add_u32_e32 v7, v10, v8 +; GFX9-NEXT: v_add3_u32 v9, v7, v12, v13 ; GFX9-NEXT: v_mad_u64_u32 v[7:8], s[0:1], s2, v9, v[4:5] ; GFX9-NEXT: v_mov_b32_e32 v10, s7 ; GFX9-NEXT: v_sub_co_u32_e32 v3, vcc, s6, v3 ; GFX9-NEXT: v_mad_u64_u32 v[7:8], s[0:1], s3, v11, v[7:8] ; GFX9-NEXT: v_mov_b32_e32 v4, s3 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_subb_co_u32_e64 v8, s[0:1], v10, v7, vcc ; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s3, v8 ; GFX9-NEXT: v_sub_u32_e32 v7, s7, v7 @@ -1994,170 +1988,169 @@ ; GFX10-NEXT: v_add_nc_u32_e32 v10, v13, v1 ; GFX10-NEXT: v_mad_u64_u32 v[0:1], s23, s20, v6, 0 ; GFX10-NEXT: v_add_co_u32 v7, s23, v9, v8 +; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, 1, s23 ; GFX10-NEXT: v_mul_lo_u32 v9, s21, v6 ; GFX10-NEXT: v_mul_lo_u32 v11, s20, v5 -; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, 1, s23 ; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, v4, v7 ; GFX10-NEXT: v_add3_u32 v2, v10, v8, v2 ; GFX10-NEXT: v_mul_lo_u32 v8, v5, v0 -; GFX10-NEXT: v_add3_u32 v7, v1, v11, v9 ; GFX10-NEXT: v_mul_hi_u32 v10, v6, v0 ; GFX10-NEXT: v_mul_hi_u32 v0, v5, v0 +; 
GFX10-NEXT: v_add3_u32 v7, v1, v11, v9 ; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v2, vcc_lo -; GFX10-NEXT: v_mul_lo_u32 v12, v6, v7 ; GFX10-NEXT: v_mad_u64_u32 v[1:2], s20, s5, v4, 0 +; GFX10-NEXT: v_mul_lo_u32 v12, v6, v7 ; GFX10-NEXT: v_mul_lo_u32 v9, s22, v4 ; GFX10-NEXT: v_mul_lo_u32 v11, s5, v3 ; GFX10-NEXT: v_mul_lo_u32 v13, v5, v7 ; GFX10-NEXT: v_mul_hi_u32 v14, v6, v7 ; GFX10-NEXT: v_mul_hi_u32 v7, v5, v7 -; GFX10-NEXT: v_add_co_u32 v8, s5, v8, v12 ; GFX10-NEXT: v_mul_lo_u32 v15, v3, v1 ; GFX10-NEXT: v_mul_hi_u32 v16, v4, v1 +; GFX10-NEXT: v_add_co_u32 v8, s5, v8, v12 ; GFX10-NEXT: v_add3_u32 v2, v2, v11, v9 ; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, 1, s5 ; GFX10-NEXT: v_add_co_u32 v0, s5, v13, v0 ; GFX10-NEXT: v_cndmask_b32_e64 v11, 0, 1, s5 ; GFX10-NEXT: v_add_co_u32 v8, s5, v8, v10 +; GFX10-NEXT: v_mul_lo_u32 v12, v4, v2 ; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, 1, s5 ; GFX10-NEXT: v_add_co_u32 v0, s5, v0, v14 ; GFX10-NEXT: v_cndmask_b32_e64 v10, 0, 1, s5 -; GFX10-NEXT: v_mul_lo_u32 v12, v4, v2 -; GFX10-NEXT: v_add_nc_u32_e32 v8, v9, v8 ; GFX10-NEXT: v_mul_hi_u32 v1, v3, v1 +; GFX10-NEXT: v_add_nc_u32_e32 v8, v9, v8 ; GFX10-NEXT: v_mul_lo_u32 v13, v3, v2 -; GFX10-NEXT: v_add_nc_u32_e32 v10, v11, v10 -; GFX10-NEXT: v_mul_hi_u32 v9, v4, v2 +; GFX10-NEXT: v_mul_hi_u32 v14, v4, v2 +; GFX10-NEXT: v_add_nc_u32_e32 v9, v11, v10 +; GFX10-NEXT: v_add_co_u32 v10, s5, v15, v12 +; GFX10-NEXT: v_cndmask_b32_e64 v11, 0, 1, s5 ; GFX10-NEXT: v_add_co_u32 v0, s5, v0, v8 ; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, 1, s5 -; GFX10-NEXT: v_add_co_u32 v11, s5, v15, v12 -; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v6, v0 -; GFX10-NEXT: v_add3_u32 v7, v10, v8, v7 -; GFX10-NEXT: v_cndmask_b32_e64 v12, 0, 1, s5 ; GFX10-NEXT: v_add_co_u32 v1, s5, v13, v1 -; GFX10-NEXT: v_cndmask_b32_e64 v13, 0, 1, s5 -; GFX10-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, v5, v7, vcc_lo -; GFX10-NEXT: v_add_co_u32 v8, s5, v11, v16 +; GFX10-NEXT: v_cndmask_b32_e64 v12, 0, 1, s5 +; GFX10-NEXT: v_add_co_u32 v10, s5, v10, v16 +; GFX10-NEXT: v_add3_u32 v7, v9, v8, v7 +; GFX10-NEXT: v_cndmask_b32_e64 v10, 0, 1, s5 +; GFX10-NEXT: v_add_co_u32 v1, s5, v1, v14 +; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v6, v0 ; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, 1, s5 -; GFX10-NEXT: v_add_co_u32 v1, s5, v1, v9 -; GFX10-NEXT: v_mul_lo_u32 v7, s1, v0 -; GFX10-NEXT: v_mul_lo_u32 v9, s0, v5 -; GFX10-NEXT: v_mul_hi_u32 v10, s1, v0 -; GFX10-NEXT: v_mul_hi_u32 v0, s0, v0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, v5, v7, vcc_lo +; GFX10-NEXT: v_add_nc_u32_e32 v6, v11, v10 +; GFX10-NEXT: v_mul_hi_u32 v9, s0, v0 +; GFX10-NEXT: v_add_nc_u32_e32 v7, v12, v8 +; GFX10-NEXT: v_mul_lo_u32 v8, s1, v0 +; GFX10-NEXT: v_mul_lo_u32 v10, s0, v5 +; GFX10-NEXT: v_mul_hi_u32 v0, s1, v0 ; GFX10-NEXT: v_mul_lo_u32 v11, s1, v5 +; GFX10-NEXT: v_add_co_u32 v1, s5, v1, v6 ; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s5 -; GFX10-NEXT: v_add_nc_u32_e32 v8, v12, v8 ; GFX10-NEXT: v_mul_hi_u32 v12, s0, v5 -; GFX10-NEXT: v_mul_hi_u32 v5, s1, v5 -; GFX10-NEXT: v_add_co_u32 v7, s5, v7, v9 -; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, 1, s5 -; GFX10-NEXT: v_add_co_u32 v10, s5, v11, v10 -; GFX10-NEXT: v_add_co_u32 v0, s20, v7, v0 -; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s20 -; GFX10-NEXT: v_cndmask_b32_e64 v7, 0, 1, s5 -; GFX10-NEXT: v_add_co_u32 v10, s5, v10, v12 +; GFX10-NEXT: v_mul_hi_u32 v2, v3, v2 +; GFX10-NEXT: v_add_co_u32 v8, s5, v8, v10 +; GFX10-NEXT: v_cndmask_b32_e64 v10, 0, 1, s5 +; GFX10-NEXT: v_add_co_u32 v0, s5, v11, v0 ; GFX10-NEXT: v_cndmask_b32_e64 v11, 0, 1, s5 -; GFX10-NEXT: 
v_add_nc_u32_e32 v0, v9, v0 -; GFX10-NEXT: v_add_co_u32 v8, s5, v1, v8 -; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s5 -; GFX10-NEXT: v_add_nc_u32_e32 v7, v7, v11 -; GFX10-NEXT: v_add_co_u32 v9, s5, v10, v0 +; GFX10-NEXT: v_add_co_u32 v8, s5, v8, v9 +; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, 1, s5 +; GFX10-NEXT: v_add_co_u32 v0, s5, v0, v12 +; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, 1, s5 +; GFX10-NEXT: v_mul_hi_u32 v5, s1, v5 +; GFX10-NEXT: v_add_nc_u32_e32 v8, v10, v8 +; GFX10-NEXT: v_add3_u32 v2, v7, v6, v2 +; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, v4, v1 +; GFX10-NEXT: v_add_nc_u32_e32 v6, v11, v9 +; GFX10-NEXT: v_add_co_u32 v7, s5, v0, v8 ; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s5 -; GFX10-NEXT: v_mul_hi_u32 v2, v3, v2 -; GFX10-NEXT: v_add_nc_u32_e32 v6, v13, v6 -; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, v4, v8 -; GFX10-NEXT: v_add3_u32 v5, v7, v0, v5 -; GFX10-NEXT: v_mul_hi_u32 v8, s14, v4 -; GFX10-NEXT: v_add3_u32 v2, v6, v1, v2 -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s5, s6, v9, 0 -; GFX10-NEXT: v_mul_lo_u32 v6, s7, v9 -; GFX10-NEXT: v_mul_lo_u32 v7, s6, v5 ; GFX10-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, v3, v2, vcc_lo +; GFX10-NEXT: v_mul_lo_u32 v8, s7, v7 ; GFX10-NEXT: v_mul_lo_u32 v3, s15, v4 +; GFX10-NEXT: v_add3_u32 v5, v6, v0, v5 +; GFX10-NEXT: v_mad_u64_u32 v[0:1], s5, s6, v7, 0 +; GFX10-NEXT: v_mul_lo_u32 v6, s14, v2 +; GFX10-NEXT: v_mul_hi_u32 v10, s14, v4 +; GFX10-NEXT: v_mul_lo_u32 v9, s6, v5 ; GFX10-NEXT: v_mul_hi_u32 v4, s15, v4 -; GFX10-NEXT: v_mul_lo_u32 v10, s14, v2 ; GFX10-NEXT: v_mul_lo_u32 v11, s15, v2 -; GFX10-NEXT: v_add3_u32 v1, v1, v7, v6 -; GFX10-NEXT: v_add_co_u32 v6, vcc_lo, v9, 1 -; GFX10-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, 0, v5, vcc_lo -; GFX10-NEXT: v_sub_nc_u32_e32 v12, s1, v1 -; GFX10-NEXT: v_sub_co_u32 v13, vcc_lo, s0, v0 -; GFX10-NEXT: v_sub_co_ci_u32_e64 v14, s0, s1, v1, vcc_lo -; GFX10-NEXT: v_subrev_co_ci_u32_e32 v0, vcc_lo, s7, v12, vcc_lo -; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s6, v13 -; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo -; GFX10-NEXT: v_sub_co_u32 v12, vcc_lo, v13, s6 -; GFX10-NEXT: v_subrev_co_ci_u32_e64 v15, s0, 0, v0, vcc_lo -; GFX10-NEXT: v_cmp_le_u32_e64 s0, s7, v14 -; GFX10-NEXT: v_subrev_co_ci_u32_e32 v0, vcc_lo, s7, v0, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v16, 0, -1, s0 -; GFX10-NEXT: v_cmp_le_u32_e64 s0, s6, v12 -; GFX10-NEXT: v_cndmask_b32_e64 v17, 0, -1, s0 +; GFX10-NEXT: v_add_co_u32 v3, s5, v3, v6 +; GFX10-NEXT: v_add3_u32 v1, v1, v9, v8 +; GFX10-NEXT: v_sub_co_u32 v6, vcc_lo, s0, v0 +; GFX10-NEXT: v_add_co_u32 v0, s0, v7, 1 +; GFX10-NEXT: v_sub_nc_u32_e32 v9, s1, v1 +; GFX10-NEXT: v_add_co_ci_u32_e64 v8, s0, 0, v5, s0 +; GFX10-NEXT: v_sub_co_ci_u32_e64 v12, s0, s1, v1, vcc_lo +; GFX10-NEXT: v_subrev_co_ci_u32_e32 v9, vcc_lo, s7, v9, vcc_lo +; GFX10-NEXT: v_cmp_le_u32_e64 s0, s6, v6 +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s7, v12 +; GFX10-NEXT: v_cndmask_b32_e64 v17, 0, 1, s5 +; GFX10-NEXT: v_add_co_u32 v4, s1, v11, v4 +; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, -1, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v13, 0, -1, vcc_lo +; GFX10-NEXT: v_sub_co_u32 v14, vcc_lo, v6, s6 +; GFX10-NEXT: v_subrev_co_ci_u32_e64 v15, s0, 0, v9, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s7, v12 +; GFX10-NEXT: v_subrev_co_ci_u32_e32 v9, vcc_lo, s7, v9, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v1, v13, v1, s0 +; GFX10-NEXT: v_cmp_le_u32_e64 s0, s6, v14 +; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v1 +; GFX10-NEXT: v_cndmask_b32_e64 v13, 0, -1, s0 ; GFX10-NEXT: v_cmp_le_u32_e64 s0, s7, v15 -; GFX10-NEXT: v_cndmask_b32_e64 v18, 0, -1, s0 -; GFX10-NEXT: 
v_add_co_u32 v19, s0, v6, 1 -; GFX10-NEXT: v_add_co_ci_u32_e64 v20, s0, 0, v7, s0 -; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s7, v14 -; GFX10-NEXT: v_cndmask_b32_e64 v16, v16, v1, s0 -; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s7, v15 -; GFX10-NEXT: v_cndmask_b32_e64 v17, v18, v17, s0 -; GFX10-NEXT: v_add_co_u32 v1, s0, v3, v10 +; GFX10-NEXT: v_cndmask_b32_e64 v16, 0, -1, s0 +; GFX10-NEXT: v_add_co_u32 v3, s0, v3, v10 ; GFX10-NEXT: v_mul_hi_u32 v10, s14, v2 ; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0 -; GFX10-NEXT: v_add_co_u32 v4, s0, v11, v4 -; GFX10-NEXT: v_add_co_u32 v1, s1, v1, v8 -; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s1 -; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, 1, s0 +; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s7, v15 +; GFX10-NEXT: v_mul_hi_u32 v2, s15, v2 +; GFX10-NEXT: v_add_nc_u32_e32 v3, v17, v3 +; GFX10-NEXT: v_cndmask_b32_e64 v11, v16, v13, s0 ; GFX10-NEXT: v_add_co_u32 v4, s0, v4, v10 +; GFX10-NEXT: v_cndmask_b32_e64 v13, 0, 1, s1 ; GFX10-NEXT: v_cndmask_b32_e64 v10, 0, 1, s0 -; GFX10-NEXT: v_add_nc_u32_e32 v1, v3, v1 -; GFX10-NEXT: v_mul_hi_u32 v2, s15, v2 -; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v17 -; GFX10-NEXT: v_add_nc_u32_e32 v3, v8, v10 -; GFX10-NEXT: v_add_co_u32 v4, s0, v4, v1 -; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0 -; GFX10-NEXT: v_sub_co_u32 v8, s0, v12, s6 -; GFX10-NEXT: v_subrev_co_ci_u32_e64 v10, s0, 0, v0, s0 -; GFX10-NEXT: v_add3_u32 v2, v3, v1, v2 -; GFX10-NEXT: v_cndmask_b32_e32 v3, v6, v19, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v6, v7, v20, vcc_lo -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s0, s2, v4, 0 -; GFX10-NEXT: v_mul_lo_u32 v7, s2, v2 -; GFX10-NEXT: v_mul_lo_u32 v11, s3, v4 -; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, v17 -; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16 -; GFX10-NEXT: v_mov_b32_e32 v16, 0 -; GFX10-NEXT: v_cndmask_b32_e64 v8, v12, v8, s0 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v5, v6, vcc_lo -; GFX10-NEXT: v_add3_u32 v1, v1, v7, v11 -; GFX10-NEXT: v_cndmask_b32_e64 v6, v15, v10, s0 -; GFX10-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v7, v13, v8, vcc_lo +; GFX10-NEXT: v_add_co_u32 v16, s0, v0, 1 +; GFX10-NEXT: v_add_co_ci_u32_e64 v17, s0, 0, v8, s0 +; GFX10-NEXT: v_add_co_u32 v3, s0, v4, v3 +; GFX10-NEXT: v_add_nc_u32_e32 v10, v13, v10 +; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s0 +; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, v11 +; GFX10-NEXT: v_mul_lo_u32 v13, s3, v3 +; GFX10-NEXT: v_add3_u32 v2, v10, v4, v2 +; GFX10-NEXT: v_cndmask_b32_e64 v11, v0, v16, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v4, v8, v17, s0 +; GFX10-NEXT: v_mad_u64_u32 v[0:1], s1, s2, v3, 0 +; GFX10-NEXT: v_mul_lo_u32 v8, s2, v2 +; GFX10-NEXT: v_sub_co_u32 v10, s1, v14, s6 +; GFX10-NEXT: v_subrev_co_ci_u32_e64 v9, s1, 0, v9, s1 +; GFX10-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v10, v14, v10, s0 +; GFX10-NEXT: v_cndmask_b32_e32 v7, v7, v11, vcc_lo +; GFX10-NEXT: v_add3_u32 v1, v1, v8, v13 +; GFX10-NEXT: v_cndmask_b32_e64 v5, v15, v9, s0 ; GFX10-NEXT: v_sub_co_u32 v8, s0, s14, v0 +; GFX10-NEXT: v_cndmask_b32_e32 v6, v6, v10, vcc_lo ; GFX10-NEXT: v_sub_co_ci_u32_e64 v9, s1, s15, v1, s0 -; GFX10-NEXT: v_cndmask_b32_e32 v6, v14, v6, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v5, v12, v5, vcc_lo ; GFX10-NEXT: v_sub_nc_u32_e32 v1, s15, v1 -; GFX10-NEXT: v_xor_b32_e32 v0, s18, v3 +; GFX10-NEXT: v_xor_b32_e32 v0, s18, v7 ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s3, v9 -; GFX10-NEXT: v_xor_b32_e32 v3, s19, v5 -; GFX10-NEXT: v_xor_b32_e32 v6, s4, v6 -; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc_lo +; GFX10-NEXT: v_xor_b32_e32 v4, s19, v4 +; 
GFX10-NEXT: v_xor_b32_e32 v5, s4, v5 +; GFX10-NEXT: v_mov_b32_e32 v16, 0 +; GFX10-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc_lo ; GFX10-NEXT: v_subrev_co_ci_u32_e64 v10, vcc_lo, s3, v1, s0 ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s2, v8 ; GFX10-NEXT: v_cndmask_b32_e64 v11, 0, -1, vcc_lo ; GFX10-NEXT: v_sub_co_u32 v12, vcc_lo, v8, s2 ; GFX10-NEXT: v_subrev_co_ci_u32_e64 v13, s0, 0, v10, vcc_lo ; GFX10-NEXT: v_sub_co_u32 v0, s0, v0, s18 -; GFX10-NEXT: v_subrev_co_ci_u32_e64 v1, s0, s19, v3, s0 +; GFX10-NEXT: v_subrev_co_ci_u32_e64 v1, s0, s19, v4, s0 ; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s3, v9 -; GFX10-NEXT: v_xor_b32_e32 v3, s4, v7 +; GFX10-NEXT: v_xor_b32_e32 v4, s4, v6 ; GFX10-NEXT: v_subrev_co_ci_u32_e32 v10, vcc_lo, s3, v10, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, v11, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v6, v7, v11, s0 ; GFX10-NEXT: v_cmp_le_u32_e64 s0, s3, v13 ; GFX10-NEXT: v_cndmask_b32_e64 v7, 0, -1, s0 ; GFX10-NEXT: v_cmp_le_u32_e64 s0, s2, v12 ; GFX10-NEXT: v_cndmask_b32_e64 v11, 0, -1, s0 -; GFX10-NEXT: v_add_co_u32 v14, s0, v4, 1 +; GFX10-NEXT: v_add_co_u32 v14, s0, v3, 1 ; GFX10-NEXT: v_add_co_ci_u32_e64 v15, s0, 0, v2, s0 ; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s3, v13 ; GFX10-NEXT: v_cndmask_b32_e64 v7, v7, v11, s0 @@ -2167,24 +2160,24 @@ ; GFX10-NEXT: v_sub_co_u32 v7, s0, v12, s2 ; GFX10-NEXT: v_subrev_co_ci_u32_e64 v10, s0, 0, v10, s0 ; GFX10-NEXT: v_cndmask_b32_e32 v11, v14, v11, vcc_lo -; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, v5 +; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, v6 ; GFX10-NEXT: v_cndmask_b32_e32 v14, v15, v17, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v5, v12, v7, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v6, v12, v7, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v7, v13, v10, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v10, v4, v11, s0 +; GFX10-NEXT: v_sub_co_u32 v4, vcc_lo, v4, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v11, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v14, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v8, v8, v5, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v6, v8, v6, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v7, v9, v7, s0 ; GFX10-NEXT: s_xor_b64 s[0:1], s[12:13], s[16:17] -; GFX10-NEXT: v_sub_co_u32 v4, vcc_lo, v3, s4 -; GFX10-NEXT: v_xor_b32_e32 v3, s0, v10 -; GFX10-NEXT: v_subrev_co_ci_u32_e32 v5, vcc_lo, s4, v6, vcc_lo -; GFX10-NEXT: v_xor_b32_e32 v6, s1, v2 -; GFX10-NEXT: v_xor_b32_e32 v8, s12, v8 +; GFX10-NEXT: v_subrev_co_ci_u32_e32 v5, vcc_lo, s4, v5, vcc_lo +; GFX10-NEXT: v_xor_b32_e32 v3, s0, v3 +; GFX10-NEXT: v_xor_b32_e32 v8, s1, v2 +; GFX10-NEXT: v_xor_b32_e32 v6, s12, v6 ; GFX10-NEXT: v_xor_b32_e32 v7, s12, v7 ; GFX10-NEXT: v_sub_co_u32 v2, vcc_lo, v3, s0 -; GFX10-NEXT: v_subrev_co_ci_u32_e32 v3, vcc_lo, s1, v6, vcc_lo -; GFX10-NEXT: v_sub_co_u32 v6, vcc_lo, v8, s12 +; GFX10-NEXT: v_subrev_co_ci_u32_e32 v3, vcc_lo, s1, v8, vcc_lo +; GFX10-NEXT: v_sub_co_u32 v6, vcc_lo, v6, s12 ; GFX10-NEXT: v_subrev_co_ci_u32_e32 v7, vcc_lo, s12, v7, vcc_lo ; GFX10-NEXT: global_store_dwordx4 v16, v[0:3], s[8:9] ; GFX10-NEXT: global_store_dwordx4 v16, v[4:7], s[10:11] @@ -2520,17 +2513,16 @@ ; GFX10-NEXT: s_load_dword s0, s[4:5], 0x10 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_bfe_i32 s1, s0, 0x80018 -; GFX10-NEXT: s_bfe_i32 s2, s0, 0x80010 -; GFX10-NEXT: s_ashr_i32 s3, s1, 31 -; GFX10-NEXT: s_ashr_i32 s8, s2, 31 -; GFX10-NEXT: s_add_i32 s1, s1, s3 -; GFX10-NEXT: s_add_i32 s2, s2, s8 -; GFX10-NEXT: s_xor_b32 s1, s1, s3 -; GFX10-NEXT: s_xor_b32 s2, s2, s8 +; GFX10-NEXT: s_bfe_i32 s3, s0, 0x80010 +; GFX10-NEXT: s_ashr_i32 s2, s1, 31 +; GFX10-NEXT: s_ashr_i32 s8, s3, 31 +; GFX10-NEXT: 
s_add_i32 s1, s1, s2 +; GFX10-NEXT: s_add_i32 s3, s3, s8 +; GFX10-NEXT: s_xor_b32 s1, s1, s2 +; GFX10-NEXT: s_xor_b32 s3, s3, s8 ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s1 -; GFX10-NEXT: v_cvt_f32_u32_e32 v1, s2 +; GFX10-NEXT: v_cvt_f32_u32_e32 v1, s3 ; GFX10-NEXT: s_sub_i32 s6, 0, s1 -; GFX10-NEXT: s_sub_i32 s7, 0, s2 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v1, v1 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 @@ -2538,59 +2530,60 @@ ; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX10-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX10-NEXT: v_mul_lo_u32 v2, s6, v0 -; GFX10-NEXT: v_mul_lo_u32 v3, s7, v1 -; GFX10-NEXT: s_sext_i32_i8 s6, s0 -; GFX10-NEXT: s_bfe_i32 s0, s0, 0x80008 +; GFX10-NEXT: s_sub_i32 s6, 0, s3 +; GFX10-NEXT: v_mul_lo_u32 v3, s6, v1 +; GFX10-NEXT: s_bfe_i32 s6, s0, 0x80008 +; GFX10-NEXT: s_sext_i32_i8 s0, s0 ; GFX10-NEXT: s_ashr_i32 s9, s6, 31 ; GFX10-NEXT: s_ashr_i32 s10, s0, 31 +; GFX10-NEXT: v_mul_hi_u32 v2, v0, v2 ; GFX10-NEXT: s_add_i32 s6, s6, s9 ; GFX10-NEXT: s_add_i32 s0, s0, s10 -; GFX10-NEXT: v_mul_hi_u32 v2, v0, v2 ; GFX10-NEXT: v_mul_hi_u32 v3, v1, v3 -; GFX10-NEXT: s_xor_b32 s0, s0, s10 ; GFX10-NEXT: s_xor_b32 s6, s6, s9 +; GFX10-NEXT: s_xor_b32 s0, s0, s10 ; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v2 ; GFX10-NEXT: v_add_nc_u32_e32 v1, v1, v3 -; GFX10-NEXT: v_mul_hi_u32 v0, s0, v0 -; GFX10-NEXT: v_mul_hi_u32 v1, s6, v1 +; GFX10-NEXT: v_mul_hi_u32 v0, s6, v0 +; GFX10-NEXT: v_mul_hi_u32 v1, s0, v1 ; GFX10-NEXT: v_mul_lo_u32 v2, v0, s1 -; GFX10-NEXT: v_mul_lo_u32 v3, v1, s2 -; GFX10-NEXT: v_add_nc_u32_e32 v5, 1, v0 -; GFX10-NEXT: v_add_nc_u32_e32 v4, 1, v1 -; GFX10-NEXT: v_sub_nc_u32_e32 v2, s0, v2 -; GFX10-NEXT: v_sub_nc_u32_e32 v3, s6, v3 +; GFX10-NEXT: v_add_nc_u32_e32 v4, 1, v0 +; GFX10-NEXT: v_mul_lo_u32 v3, v1, s3 +; GFX10-NEXT: v_add_nc_u32_e32 v6, 1, v1 +; GFX10-NEXT: v_sub_nc_u32_e32 v2, s6, v2 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 +; GFX10-NEXT: v_sub_nc_u32_e32 v3, s0, v3 +; GFX10-NEXT: v_subrev_nc_u32_e32 v5, s1, v2 ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s1, v2 -; GFX10-NEXT: v_subrev_nc_u32_e32 v6, s1, v2 -; GFX10-NEXT: v_subrev_nc_u32_e32 v7, s2, v3 -; GFX10-NEXT: v_cmp_le_u32_e64 s0, s2, v3 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v4, s0 +; GFX10-NEXT: v_cmp_le_u32_e64 s0, s3, v3 +; GFX10-NEXT: v_subrev_nc_u32_e32 v7, s3, v3 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v6, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v7, s0 -; GFX10-NEXT: v_add_nc_u32_e32 v5, 1, v0 +; GFX10-NEXT: v_add_nc_u32_e32 v4, 1, v0 ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s1, v2 -; GFX10-NEXT: v_subrev_nc_u32_e32 v6, s1, v2 -; GFX10-NEXT: v_add_nc_u32_e32 v4, 1, v1 -; GFX10-NEXT: v_cmp_le_u32_e64 s0, s2, v3 -; GFX10-NEXT: v_subrev_nc_u32_e32 v7, s2, v3 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc_lo -; GFX10-NEXT: s_xor_b32 s1, s10, s3 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v4, s0 +; GFX10-NEXT: v_subrev_nc_u32_e32 v5, s1, v2 +; GFX10-NEXT: v_add_nc_u32_e32 v6, 1, v1 +; GFX10-NEXT: v_cmp_le_u32_e64 s0, s3, v3 +; GFX10-NEXT: v_subrev_nc_u32_e32 v7, s3, v3 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc_lo +; GFX10-NEXT: s_xor_b32 s1, s9, s2 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v6, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v7, s0 ; 
GFX10-NEXT: v_xor_b32_e32 v0, s1, v0 -; GFX10-NEXT: v_xor_b32_e32 v2, s10, v2 -; GFX10-NEXT: s_xor_b32 s0, s9, s8 +; GFX10-NEXT: v_xor_b32_e32 v2, s9, v2 +; GFX10-NEXT: s_xor_b32 s0, s10, s8 +; GFX10-NEXT: v_mov_b32_e32 v4, 0xff ; GFX10-NEXT: v_xor_b32_e32 v1, s0, v1 ; GFX10-NEXT: v_subrev_nc_u32_e32 v0, s1, v0 -; GFX10-NEXT: v_xor_b32_e32 v3, s9, v3 -; GFX10-NEXT: v_subrev_nc_u32_e32 v2, s10, v2 -; GFX10-NEXT: s_movk_i32 s1, 0xff +; GFX10-NEXT: v_xor_b32_e32 v3, s10, v3 +; GFX10-NEXT: v_subrev_nc_u32_e32 v2, s9, v2 ; GFX10-NEXT: v_subrev_nc_u32_e32 v1, s0, v1 -; GFX10-NEXT: v_and_b32_sdwa v0, v0, s1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s9, v3 -; GFX10-NEXT: v_and_b32_sdwa v2, v2, s1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_and_b32_sdwa v0, v0, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s10, v3 +; GFX10-NEXT: v_and_b32_sdwa v2, v2, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX10-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD Index: llvm/test/CodeGen/AMDGPU/GlobalISel/select-to-fmin-fmax.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/select-to-fmin-fmax.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/select-to-fmin-fmax.ll @@ -45,21 +45,20 @@ ; GCN-LABEL: test_v4s16: ; GCN: ; %bb.0: ; %entry ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s6, 0 -; GCN-NEXT: v_cmp_gt_f16_e32 vcc, 0, v0 ; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GCN-NEXT: v_cndmask_b32_e64 v4, v0, 0, vcc -; GCN-NEXT: v_cmp_lt_f16_sdwa s[4:5], v0, s6 src0_sel:WORD_1 src1_sel:DWORD -; GCN-NEXT: v_cmp_gt_f16_e32 vcc, 0, v1 +; GCN-NEXT: v_cmp_gt_f16_e32 vcc, 0, v0 +; GCN-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc +; GCN-NEXT: v_cmp_gt_f16_e32 vcc, 0, v2 ; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; GCN-NEXT: v_cndmask_b32_e64 v0, v2, 0, s[4:5] -; GCN-NEXT: v_cndmask_b32_e64 v2, v1, 0, vcc -; GCN-NEXT: v_cmp_lt_f16_sdwa s[4:5], v1, s6 src0_sel:WORD_1 src1_sel:DWORD -; GCN-NEXT: v_cndmask_b32_e64 v1, v3, 0, s[4:5] -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v4 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_lshl_or_b32 v0, v0, 16, v3 -; GCN-NEXT: v_lshl_or_b32 v1, v1, 16, v2 +; GCN-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc +; GCN-NEXT: v_cmp_gt_f16_e32 vcc, 0, v1 +; GCN-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc +; GCN-NEXT: v_cmp_gt_f16_e32 vcc, 0, v3 +; GCN-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc +; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GCN-NEXT: v_lshl_or_b32 v0, v2, 16, v0 +; GCN-NEXT: v_lshl_or_b32 v1, v3, 16, v1 ; GCN-NEXT: s_setpc_b64 s[30:31] entry: %fcmp = fcmp olt <4 x half> %a, zeroinitializer @@ -71,35 +70,34 @@ ; GCN-LABEL: test_v8s16: ; GCN: ; %bb.0: ; %entry ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s6, 0 -; GCN-NEXT: v_cmp_gt_f16_e32 vcc, 0, v0 ; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v0 -; GCN-NEXT: v_cndmask_b32_e64 v8, v0, 0, vcc -; GCN-NEXT: v_cmp_lt_f16_sdwa s[4:5], v0, s6 src0_sel:WORD_1 src1_sel:DWORD -; GCN-NEXT: v_cmp_gt_f16_e32 vcc, 0, v1 +; GCN-NEXT: v_cmp_gt_f16_e32 vcc, 0, v0 +; GCN-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc +; GCN-NEXT: v_cmp_gt_f16_e32 vcc, 0, v4 
; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v1 -; GCN-NEXT: v_cndmask_b32_e64 v0, v4, 0, s[4:5] -; GCN-NEXT: v_cndmask_b32_e64 v4, v1, 0, vcc -; GCN-NEXT: v_cmp_lt_f16_sdwa s[4:5], v1, s6 src0_sel:WORD_1 src1_sel:DWORD -; GCN-NEXT: v_cmp_gt_f16_e32 vcc, 0, v2 +; GCN-NEXT: v_cndmask_b32_e64 v4, v4, 0, vcc +; GCN-NEXT: v_cmp_gt_f16_e32 vcc, 0, v1 +; GCN-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc +; GCN-NEXT: v_cmp_gt_f16_e32 vcc, 0, v5 ; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v2 -; GCN-NEXT: v_cndmask_b32_e64 v1, v5, 0, s[4:5] -; GCN-NEXT: v_cndmask_b32_e64 v5, v2, 0, vcc -; GCN-NEXT: v_cmp_lt_f16_sdwa s[4:5], v2, s6 src0_sel:WORD_1 src1_sel:DWORD -; GCN-NEXT: v_cmp_gt_f16_e32 vcc, 0, v3 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GCN-NEXT: v_cndmask_b32_e64 v5, v5, 0, vcc +; GCN-NEXT: v_cmp_gt_f16_e32 vcc, 0, v2 +; GCN-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc +; GCN-NEXT: v_cmp_gt_f16_e32 vcc, 0, v6 ; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v3 -; GCN-NEXT: v_cndmask_b32_e64 v2, v6, 0, s[4:5] -; GCN-NEXT: v_cndmask_b32_e64 v6, v3, 0, vcc -; GCN-NEXT: v_cmp_lt_f16_sdwa s[4:5], v3, s6 src0_sel:WORD_1 src1_sel:DWORD -; GCN-NEXT: v_lshl_or_b32 v1, v1, 16, v4 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v5 -; GCN-NEXT: v_cndmask_b32_e64 v3, v7, 0, s[4:5] -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v8 -; GCN-NEXT: v_lshl_or_b32 v2, v2, 16, v4 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v6 -; GCN-NEXT: v_lshl_or_b32 v0, v0, 16, v7 -; GCN-NEXT: v_lshl_or_b32 v3, v3, 16, v4 +; GCN-NEXT: v_cndmask_b32_e64 v6, v6, 0, vcc +; GCN-NEXT: v_cmp_gt_f16_e32 vcc, 0, v3 +; GCN-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc +; GCN-NEXT: v_cmp_gt_f16_e32 vcc, 0, v7 +; GCN-NEXT: v_cndmask_b32_e64 v7, v7, 0, vcc +; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GCN-NEXT: v_lshl_or_b32 v0, v4, 16, v0 +; GCN-NEXT: v_lshl_or_b32 v1, v5, 16, v1 +; GCN-NEXT: v_lshl_or_b32 v2, v6, 16, v2 +; GCN-NEXT: v_lshl_or_b32 v3, v7, 16, v3 ; GCN-NEXT: s_setpc_b64 s[30:31] entry: %fcmp = fcmp olt <8 x half> %a, zeroinitializer Index: llvm/test/CodeGen/AMDGPU/GlobalISel/sext_inreg.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/sext_inreg.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/sext_inreg.ll @@ -1016,7 +1016,6 @@ ; GFX8-NEXT: v_or_b32_e32 v1, v4, v1 ; GFX8-NEXT: v_ashrrev_i16_e32 v4, 11, v7 ; GFX8-NEXT: v_ashrrev_i16_sdwa v2, v5, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_mov_b32_e32 v5, 11 ; GFX8-NEXT: v_or_b32_e32 v2, v4, v2 ; GFX8-NEXT: v_ashrrev_i16_e32 v4, 11, v8 ; GFX8-NEXT: v_ashrrev_i16_sdwa v3, v5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD Index: llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll @@ -1798,21 +1798,21 @@ ; GCN-LABEL: s_shl_i65_33: ; GCN: ; %bb.0: ; GCN-NEXT: s_lshl_b32 s4, s0, 1 -; GCN-NEXT: s_mov_b32 s6, 0 -; GCN-NEXT: s_lshl_b32 s7, s2, 1 +; GCN-NEXT: s_lshl_b32 s3, s2, 1 +; GCN-NEXT: s_mov_b32 s2, 0 ; GCN-NEXT: s_lshr_b64 s[0:1], s[0:1], 31 -; GCN-NEXT: s_or_b64 s[2:3], s[6:7], s[0:1] +; GCN-NEXT: s_or_b64 s[2:3], s[2:3], s[0:1] ; GCN-NEXT: s_mov_b32 s0, 0 ; GCN-NEXT: s_mov_b32 s1, s4 ; GCN-NEXT: ; return to shader part epilog ; ; GFX10PLUS-LABEL: s_shl_i65_33: ; GFX10PLUS: ; %bb.0: -; GFX10PLUS-NEXT: s_mov_b32 s4, 0 -; GFX10PLUS-NEXT: 
s_lshl_b32 s5, s2, 1 -; GFX10PLUS-NEXT: s_lshr_b64 s[2:3], s[0:1], 31 +; GFX10PLUS-NEXT: s_lshl_b32 s3, s2, 1 +; GFX10PLUS-NEXT: s_mov_b32 s2, 0 +; GFX10PLUS-NEXT: s_lshr_b64 s[4:5], s[0:1], 31 ; GFX10PLUS-NEXT: s_lshl_b32 s1, s0, 1 -; GFX10PLUS-NEXT: s_or_b64 s[2:3], s[4:5], s[2:3] +; GFX10PLUS-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5] ; GFX10PLUS-NEXT: s_mov_b32 s0, 0 ; GFX10PLUS-NEXT: ; return to shader part epilog %result = shl i65 %value, 33 Index: llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i32.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i32.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i32.ll @@ -254,28 +254,28 @@ ; CHECK-LABEL: v_srem_i32_pow2k_denom: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_movk_i32 s4, 0x1000 -; CHECK-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; CHECK-NEXT: v_rcp_iflag_f32_e32 v2, 0x45800000 -; CHECK-NEXT: v_mov_b32_e32 v3, 0xfffff000 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; CHECK-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 -; CHECK-NEXT: v_xor_b32_e32 v0, v0, v1 -; CHECK-NEXT: v_cvt_u32_f32_e32 v2, v2 -; CHECK-NEXT: v_mul_lo_u32 v3, v2, v3 -; CHECK-NEXT: v_mul_hi_u32 v3, v2, v3 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3 -; CHECK-NEXT: v_mul_hi_u32 v2, v0, v2 -; CHECK-NEXT: v_lshlrev_b32_e32 v2, 12, v2 +; CHECK-NEXT: v_mov_b32_e32 v1, 0x1000 +; CHECK-NEXT: v_ashrrev_i32_e32 v2, 31, v0 +; CHECK-NEXT: v_rcp_iflag_f32_e32 v3, 0x45800000 +; CHECK-NEXT: v_mov_b32_e32 v4, 0xfffff000 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; CHECK-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 +; CHECK-NEXT: v_xor_b32_e32 v0, v0, v2 +; CHECK-NEXT: v_cvt_u32_f32_e32 v3, v3 +; CHECK-NEXT: v_mul_lo_u32 v4, v3, v4 +; CHECK-NEXT: v_mul_hi_u32 v4, v3, v4 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v4 +; CHECK-NEXT: v_mul_hi_u32 v3, v0, v3 +; CHECK-NEXT: v_lshlrev_b32_e32 v3, 12, v3 +; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v3 +; CHECK-NEXT: v_subrev_i32_e32 v3, vcc, 0x1000, v0 +; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v0, v1 +; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc +; CHECK-NEXT: v_subrev_i32_e32 v3, vcc, 0x1000, v0 +; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v0, v1 +; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc +; CHECK-NEXT: v_xor_b32_e32 v0, v0, v2 ; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 -; CHECK-NEXT: v_subrev_i32_e32 v2, vcc, s4, v0 -; CHECK-NEXT: v_cmp_le_u32_e32 vcc, s4, v0 -; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; CHECK-NEXT: v_subrev_i32_e32 v2, vcc, s4, v0 -; CHECK-NEXT: v_cmp_le_u32_e32 vcc, s4, v0 -; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; CHECK-NEXT: v_xor_b32_e32 v0, v0, v1 -; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v1 ; CHECK-NEXT: s_setpc_b64 s[30:31] %result = srem i32 %num, 4096 ret i32 %result @@ -295,17 +295,12 @@ ; GISEL-NEXT: v_xor_b32_e32 v0, v0, v2 ; GISEL-NEXT: v_rcp_iflag_f32_e32 v4, v4 ; GISEL-NEXT: v_xor_b32_e32 v1, v1, v3 -; GISEL-NEXT: v_mul_f32_e32 v5, 0x4f7ffffe, v4 ; GISEL-NEXT: v_mul_f32_e32 v4, 0x4f7ffffe, v4 -; GISEL-NEXT: v_cvt_u32_f32_e32 v5, v5 ; GISEL-NEXT: v_cvt_u32_f32_e32 v4, v4 -; GISEL-NEXT: v_mul_lo_u32 v6, s5, v5 -; GISEL-NEXT: v_mul_lo_u32 v7, s5, v4 -; GISEL-NEXT: v_mul_hi_u32 v6, v5, v6 -; GISEL-NEXT: v_mul_hi_u32 v7, v4, v7 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v6 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v7 -; GISEL-NEXT: v_mul_hi_u32 v5, v0, v5 +; GISEL-NEXT: v_mul_lo_u32 v5, s5, v4 +; GISEL-NEXT: v_mul_hi_u32 v5, v4, v5 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v5 +; GISEL-NEXT: v_mul_hi_u32 v5, v0, v4 ; 
GISEL-NEXT: v_mul_hi_u32 v4, v1, v4 ; GISEL-NEXT: v_mul_lo_u32 v5, v5, s4 ; GISEL-NEXT: v_mul_lo_u32 v4, v4, s4 @@ -332,49 +327,41 @@ ; CGP-LABEL: v_srem_v2i32_pow2k_denom: ; CGP: ; %bb.0: ; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CGP-NEXT: s_movk_i32 s4, 0x1000 -; CGP-NEXT: v_ashrrev_i32_e32 v2, 31, v0 -; CGP-NEXT: v_rcp_iflag_f32_e32 v3, 0x45800000 -; CGP-NEXT: s_movk_i32 s5, 0xf000 -; CGP-NEXT: v_mov_b32_e32 v4, 0xfffff000 -; CGP-NEXT: v_mov_b32_e32 v5, 0x1000 +; CGP-NEXT: v_mov_b32_e32 v2, 0x1000 +; CGP-NEXT: v_ashrrev_i32_e32 v3, 31, v0 +; CGP-NEXT: v_rcp_iflag_f32_e32 v4, 0x45800000 +; CGP-NEXT: v_mov_b32_e32 v5, 0xfffff000 ; CGP-NEXT: v_ashrrev_i32_e32 v6, 31, v1 -; CGP-NEXT: v_rcp_iflag_f32_e32 v7, 0x45800000 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; CGP-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v3 +; CGP-NEXT: v_mul_f32_e32 v4, 0x4f7ffffe, v4 ; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v6 -; CGP-NEXT: v_mul_f32_e32 v7, 0x4f7ffffe, v7 -; CGP-NEXT: v_xor_b32_e32 v0, v0, v2 -; CGP-NEXT: v_cvt_u32_f32_e32 v3, v3 +; CGP-NEXT: v_xor_b32_e32 v0, v0, v3 +; CGP-NEXT: v_cvt_u32_f32_e32 v4, v4 ; CGP-NEXT: v_xor_b32_e32 v1, v1, v6 -; CGP-NEXT: v_cvt_u32_f32_e32 v7, v7 -; CGP-NEXT: v_mul_lo_u32 v8, v3, s5 -; CGP-NEXT: v_mul_lo_u32 v4, v7, v4 -; CGP-NEXT: v_mul_hi_u32 v8, v3, v8 -; CGP-NEXT: v_mul_hi_u32 v4, v7, v4 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v8 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v7, v4 -; CGP-NEXT: v_mul_hi_u32 v3, v0, v3 +; CGP-NEXT: v_mul_lo_u32 v5, v4, v5 +; CGP-NEXT: v_mul_hi_u32 v5, v4, v5 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v5 +; CGP-NEXT: v_mul_hi_u32 v5, v0, v4 ; CGP-NEXT: v_mul_hi_u32 v4, v1, v4 -; CGP-NEXT: v_lshlrev_b32_e32 v3, 12, v3 +; CGP-NEXT: v_lshlrev_b32_e32 v5, 12, v5 ; CGP-NEXT: v_lshlrev_b32_e32 v4, 12, v4 -; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v3 +; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v5 ; CGP-NEXT: v_sub_i32_e32 v1, vcc, v1, v4 -; CGP-NEXT: v_subrev_i32_e32 v3, vcc, s4, v0 -; CGP-NEXT: v_subrev_i32_e32 v4, vcc, 0x1000, v1 -; CGP-NEXT: v_cmp_le_u32_e32 vcc, s4, v0 -; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v5 -; CGP-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc -; CGP-NEXT: v_subrev_i32_e32 v3, vcc, s4, v0 -; CGP-NEXT: v_subrev_i32_e32 v4, vcc, 0x1000, v1 -; CGP-NEXT: v_cmp_le_u32_e32 vcc, s4, v0 -; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v5 -; CGP-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc -; CGP-NEXT: v_xor_b32_e32 v0, v0, v2 +; CGP-NEXT: v_sub_i32_e32 v4, vcc, v0, v2 +; CGP-NEXT: v_subrev_i32_e32 v5, vcc, 0x1000, v1 +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2 +; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v2 +; CGP-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc +; CGP-NEXT: v_sub_i32_e32 v4, vcc, v0, v2 +; CGP-NEXT: v_subrev_i32_e32 v5, vcc, 0x1000, v1 +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2 +; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v2 +; CGP-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc +; CGP-NEXT: v_xor_b32_e32 v0, v0, v3 ; CGP-NEXT: v_xor_b32_e32 v1, v1, v6 -; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 +; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v3 ; CGP-NEXT: v_sub_i32_e32 v1, vcc, v1, v6 ; CGP-NEXT: s_setpc_b64 s[30:31] %result = srem <2 x i32> %num, @@ -385,28 +372,28 @@ ; CHECK-LABEL: v_srem_i32_oddk_denom: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_mov_b32 s4, 0x12d8fb -; CHECK-NEXT: v_ashrrev_i32_e32 v1, 31, 
v0 -; CHECK-NEXT: v_rcp_iflag_f32_e32 v2, 0x4996c7d8 -; CHECK-NEXT: v_mov_b32_e32 v3, 0xffed2705 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; CHECK-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 -; CHECK-NEXT: v_xor_b32_e32 v0, v0, v1 -; CHECK-NEXT: v_cvt_u32_f32_e32 v2, v2 -; CHECK-NEXT: v_mul_lo_u32 v3, v2, v3 -; CHECK-NEXT: v_mul_hi_u32 v3, v2, v3 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3 -; CHECK-NEXT: v_mul_hi_u32 v2, v0, v2 -; CHECK-NEXT: v_mul_lo_u32 v2, v2, s4 +; CHECK-NEXT: v_mov_b32_e32 v1, 0x12d8fb +; CHECK-NEXT: v_ashrrev_i32_e32 v2, 31, v0 +; CHECK-NEXT: v_rcp_iflag_f32_e32 v3, 0x4996c7d8 +; CHECK-NEXT: v_mov_b32_e32 v4, 0xffed2705 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; CHECK-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 +; CHECK-NEXT: v_xor_b32_e32 v0, v0, v2 +; CHECK-NEXT: v_cvt_u32_f32_e32 v3, v3 +; CHECK-NEXT: v_mul_lo_u32 v4, v3, v4 +; CHECK-NEXT: v_mul_hi_u32 v4, v3, v4 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v4 +; CHECK-NEXT: v_mul_hi_u32 v3, v0, v3 +; CHECK-NEXT: v_mul_lo_u32 v3, v3, v1 +; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v3 +; CHECK-NEXT: v_subrev_i32_e32 v3, vcc, 0x12d8fb, v0 +; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v0, v1 +; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc +; CHECK-NEXT: v_subrev_i32_e32 v3, vcc, 0x12d8fb, v0 +; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v0, v1 +; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc +; CHECK-NEXT: v_xor_b32_e32 v0, v0, v2 ; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 -; CHECK-NEXT: v_subrev_i32_e32 v2, vcc, s4, v0 -; CHECK-NEXT: v_cmp_le_u32_e32 vcc, s4, v0 -; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; CHECK-NEXT: v_subrev_i32_e32 v2, vcc, s4, v0 -; CHECK-NEXT: v_cmp_le_u32_e32 vcc, s4, v0 -; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; CHECK-NEXT: v_xor_b32_e32 v0, v0, v1 -; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v1 ; CHECK-NEXT: s_setpc_b64 s[30:31] %result = srem i32 %num, 1235195 ret i32 %result @@ -426,17 +413,12 @@ ; GISEL-NEXT: v_xor_b32_e32 v0, v0, v2 ; GISEL-NEXT: v_rcp_iflag_f32_e32 v4, v4 ; GISEL-NEXT: v_xor_b32_e32 v1, v1, v3 -; GISEL-NEXT: v_mul_f32_e32 v5, 0x4f7ffffe, v4 ; GISEL-NEXT: v_mul_f32_e32 v4, 0x4f7ffffe, v4 -; GISEL-NEXT: v_cvt_u32_f32_e32 v5, v5 ; GISEL-NEXT: v_cvt_u32_f32_e32 v4, v4 -; GISEL-NEXT: v_mul_lo_u32 v6, s5, v5 -; GISEL-NEXT: v_mul_lo_u32 v7, s5, v4 -; GISEL-NEXT: v_mul_hi_u32 v6, v5, v6 -; GISEL-NEXT: v_mul_hi_u32 v7, v4, v7 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v6 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v7 -; GISEL-NEXT: v_mul_hi_u32 v5, v0, v5 +; GISEL-NEXT: v_mul_lo_u32 v5, s5, v4 +; GISEL-NEXT: v_mul_hi_u32 v5, v4, v5 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v5 +; GISEL-NEXT: v_mul_hi_u32 v5, v0, v4 ; GISEL-NEXT: v_mul_hi_u32 v4, v1, v4 ; GISEL-NEXT: v_mul_lo_u32 v5, v5, s4 ; GISEL-NEXT: v_mul_lo_u32 v4, v4, s4 @@ -463,49 +445,41 @@ ; CGP-LABEL: v_srem_v2i32_oddk_denom: ; CGP: ; %bb.0: ; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CGP-NEXT: s_mov_b32 s4, 0x12d8fb -; CGP-NEXT: v_ashrrev_i32_e32 v2, 31, v0 -; CGP-NEXT: v_rcp_iflag_f32_e32 v3, 0x4996c7d8 -; CGP-NEXT: s_mov_b32 s5, 0xffed2705 -; CGP-NEXT: v_mov_b32_e32 v4, 0xffed2705 -; CGP-NEXT: v_mov_b32_e32 v5, 0x12d8fb +; CGP-NEXT: v_mov_b32_e32 v2, 0x12d8fb +; CGP-NEXT: v_ashrrev_i32_e32 v3, 31, v0 +; CGP-NEXT: v_rcp_iflag_f32_e32 v4, 0x4996c7d8 +; CGP-NEXT: v_mov_b32_e32 v5, 0xffed2705 ; CGP-NEXT: v_ashrrev_i32_e32 v6, 31, v1 -; CGP-NEXT: v_rcp_iflag_f32_e32 v7, 0x4996c7d8 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; CGP-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v3 +; CGP-NEXT: 
v_mul_f32_e32 v4, 0x4f7ffffe, v4 ; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v6 -; CGP-NEXT: v_mul_f32_e32 v7, 0x4f7ffffe, v7 -; CGP-NEXT: v_xor_b32_e32 v0, v0, v2 -; CGP-NEXT: v_cvt_u32_f32_e32 v3, v3 +; CGP-NEXT: v_xor_b32_e32 v0, v0, v3 +; CGP-NEXT: v_cvt_u32_f32_e32 v4, v4 ; CGP-NEXT: v_xor_b32_e32 v1, v1, v6 -; CGP-NEXT: v_cvt_u32_f32_e32 v7, v7 -; CGP-NEXT: v_mul_lo_u32 v8, v3, s5 -; CGP-NEXT: v_mul_lo_u32 v4, v7, v4 -; CGP-NEXT: v_mul_hi_u32 v8, v3, v8 -; CGP-NEXT: v_mul_hi_u32 v4, v7, v4 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v8 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v7, v4 -; CGP-NEXT: v_mul_hi_u32 v3, v0, v3 +; CGP-NEXT: v_mul_lo_u32 v5, v4, v5 +; CGP-NEXT: v_mul_hi_u32 v5, v4, v5 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v5 +; CGP-NEXT: v_mul_hi_u32 v5, v0, v4 ; CGP-NEXT: v_mul_hi_u32 v4, v1, v4 -; CGP-NEXT: v_mul_lo_u32 v3, v3, s4 -; CGP-NEXT: v_mul_lo_u32 v4, v4, s4 -; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v3 +; CGP-NEXT: v_mul_lo_u32 v5, v5, v2 +; CGP-NEXT: v_mul_lo_u32 v4, v4, v2 +; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v5 ; CGP-NEXT: v_sub_i32_e32 v1, vcc, v1, v4 -; CGP-NEXT: v_subrev_i32_e32 v3, vcc, s4, v0 -; CGP-NEXT: v_subrev_i32_e32 v4, vcc, 0x12d8fb, v1 -; CGP-NEXT: v_cmp_le_u32_e32 vcc, s4, v0 -; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v5 -; CGP-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc -; CGP-NEXT: v_subrev_i32_e32 v3, vcc, s4, v0 -; CGP-NEXT: v_subrev_i32_e32 v4, vcc, 0x12d8fb, v1 -; CGP-NEXT: v_cmp_le_u32_e32 vcc, s4, v0 -; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v5 -; CGP-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc -; CGP-NEXT: v_xor_b32_e32 v0, v0, v2 +; CGP-NEXT: v_sub_i32_e32 v4, vcc, v0, v2 +; CGP-NEXT: v_subrev_i32_e32 v5, vcc, 0x12d8fb, v1 +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2 +; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v2 +; CGP-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc +; CGP-NEXT: v_sub_i32_e32 v4, vcc, v0, v2 +; CGP-NEXT: v_subrev_i32_e32 v5, vcc, 0x12d8fb, v1 +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2 +; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v2 +; CGP-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc +; CGP-NEXT: v_xor_b32_e32 v0, v0, v3 ; CGP-NEXT: v_xor_b32_e32 v1, v1, v6 -; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 +; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v3 ; CGP-NEXT: v_sub_i32_e32 v1, vcc, v1, v6 ; CGP-NEXT: s_setpc_b64 s[30:31] %result = srem <2 x i32> %num, Index: llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll @@ -167,15 +167,15 @@ ; CHECK-NEXT: v_mul_hi_u32 v1, v0, v1 ; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; CHECK-NEXT: v_mul_hi_u32 v0, v4, v0 +; CHECK-NEXT: v_mov_b32_e32 v1, 0 ; CHECK-NEXT: v_mul_lo_u32 v0, v0, v2 ; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v4, v0 -; CHECK-NEXT: v_sub_i32_e32 v1, vcc, v0, v2 +; CHECK-NEXT: v_sub_i32_e32 v3, vcc, v0, v2 ; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2 -; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; CHECK-NEXT: v_sub_i32_e32 v1, vcc, v0, v2 +; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc +; CHECK-NEXT: v_sub_i32_e32 v3, vcc, v0, v2 ; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2 -; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; CHECK-NEXT: v_mov_b32_e32 v1, 0 +; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; CHECK-NEXT: s_or_b64 exec, exec, s[4:5] ; CHECK-NEXT: s_setpc_b64 s[30:31] 
%result = srem i64 %num, %den @@ -790,15 +790,15 @@ ; CGP-NEXT: v_mul_hi_u32 v1, v0, v1 ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; CGP-NEXT: v_mul_hi_u32 v0, v10, v0 +; CGP-NEXT: v_mov_b32_e32 v1, 0 ; CGP-NEXT: v_mul_lo_u32 v0, v0, v4 ; CGP-NEXT: v_sub_i32_e32 v0, vcc, v10, v0 -; CGP-NEXT: v_sub_i32_e32 v1, vcc, v0, v4 +; CGP-NEXT: v_sub_i32_e32 v2, vcc, v0, v4 ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v4 -; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; CGP-NEXT: v_sub_i32_e32 v1, vcc, v0, v4 +; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; CGP-NEXT: v_sub_i32_e32 v2, vcc, v0, v4 ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v4 -; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; CGP-NEXT: v_mov_b32_e32 v1, 0 +; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; CGP-NEXT: .LBB2_4: ; CGP-NEXT: s_or_b64 exec, exec, s[4:5] ; CGP-NEXT: v_or_b32_e32 v3, v9, v7 @@ -958,15 +958,15 @@ ; CGP-NEXT: v_mul_hi_u32 v3, v2, v3 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; CGP-NEXT: v_mul_hi_u32 v2, v8, v2 +; CGP-NEXT: v_mov_b32_e32 v3, 0 ; CGP-NEXT: v_mul_lo_u32 v2, v2, v6 ; CGP-NEXT: v_sub_i32_e32 v2, vcc, v8, v2 -; CGP-NEXT: v_sub_i32_e32 v3, vcc, v2, v6 +; CGP-NEXT: v_sub_i32_e32 v4, vcc, v2, v6 ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v2, v6 -; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc -; CGP-NEXT: v_sub_i32_e32 v3, vcc, v2, v6 +; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; CGP-NEXT: v_sub_i32_e32 v4, vcc, v2, v6 ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v2, v6 -; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc -; CGP-NEXT: v_mov_b32_e32 v3, 0 +; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; CGP-NEXT: s_or_b64 exec, exec, s[4:5] ; CGP-NEXT: s_setpc_b64 s[30:31] %result = srem <2 x i64> %num, %den @@ -979,7 +979,8 @@ ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: v_cvt_f32_u32_e32 v2, 0x1000 ; CHECK-NEXT: v_cvt_f32_ubyte0_e32 v3, 0 -; CHECK-NEXT: s_movk_i32 s6, 0xf000 +; CHECK-NEXT: v_mov_b32_e32 v6, 0xfffff000 +; CHECK-NEXT: s_bfe_i32 s6, 1, 0x10000 ; CHECK-NEXT: v_mac_f32_e32 v2, 0x4f800000, v3 ; CHECK-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; CHECK-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2 @@ -987,118 +988,115 @@ ; CHECK-NEXT: v_trunc_f32_e32 v4, v3 ; CHECK-NEXT: v_mac_f32_e32 v2, 0xcf800000, v4 ; CHECK-NEXT: v_cvt_u32_f32_e32 v5, v2 -; CHECK-NEXT: v_mov_b32_e32 v2, 0xfffff000 -; CHECK-NEXT: v_cvt_u32_f32_e32 v6, v4 -; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v2, v5, 0 -; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], s6, v6, v[3:4] -; CHECK-NEXT: v_mul_hi_u32 v7, v5, v2 +; CHECK-NEXT: v_cvt_u32_f32_e32 v7, v4 +; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v5, 0 +; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v7, v[3:4] +; CHECK-NEXT: v_mul_hi_u32 v8, v5, v2 ; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], -1, v5, v[3:4] -; CHECK-NEXT: v_mul_lo_u32 v4, v6, v2 -; CHECK-NEXT: v_mul_hi_u32 v2, v6, v2 -; CHECK-NEXT: v_mul_lo_u32 v8, v5, v3 -; CHECK-NEXT: v_mul_lo_u32 v9, v6, v3 -; CHECK-NEXT: v_mul_hi_u32 v10, v5, v3 -; CHECK-NEXT: v_mul_hi_u32 v3, v6, v3 +; CHECK-NEXT: v_mul_lo_u32 v4, v7, v2 +; CHECK-NEXT: v_mul_hi_u32 v2, v7, v2 +; CHECK-NEXT: v_mul_lo_u32 v9, v5, v3 +; CHECK-NEXT: v_mul_lo_u32 v10, v7, v3 +; CHECK-NEXT: v_mul_hi_u32 v11, v5, v3 +; CHECK-NEXT: v_mul_hi_u32 v3, v7, v3 +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v9 +; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v10, v2 +; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v8 -; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v7 ; CHECK-NEXT: 
v_cndmask_b32_e64 v4, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v8, v4 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v9, v2 -; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v10 +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v9, v4 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v11 ; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v7, vcc, v7, v8 +; CHECK-NEXT: v_add_i32_e32 v8, vcc, v10, v8 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v7, v4 +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v8, v4 ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v4 ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v2 -; CHECK-NEXT: v_addc_u32_e32 v6, vcc, v6, v3, vcc -; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], s6, v5, 0 -; CHECK-NEXT: v_ashrrev_i32_e32 v7, 31, v1 -; CHECK-NEXT: v_add_i32_e32 v8, vcc, v0, v7 -; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], s6, v6, v[3:4] -; CHECK-NEXT: v_addc_u32_e32 v9, vcc, v1, v7, vcc -; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[4:5], -1, v5, v[3:4] -; CHECK-NEXT: v_xor_b32_e32 v3, v8, v7 -; CHECK-NEXT: v_mul_lo_u32 v1, v6, v2 -; CHECK-NEXT: v_mul_lo_u32 v8, v5, v0 -; CHECK-NEXT: v_xor_b32_e32 v4, v9, v7 -; CHECK-NEXT: v_mul_hi_u32 v9, v5, v2 -; CHECK-NEXT: v_mul_hi_u32 v2, v6, v2 +; CHECK-NEXT: v_addc_u32_e32 v7, vcc, v7, v3, vcc +; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v5, 0 +; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v7, v[3:4] +; CHECK-NEXT: v_ashrrev_i32_e32 v6, 31, v1 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v6 +; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], -1, v5, v[3:4] +; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v1, v6, vcc +; CHECK-NEXT: v_xor_b32_e32 v4, v0, v6 +; CHECK-NEXT: v_mul_lo_u32 v0, v7, v2 +; CHECK-NEXT: v_mul_lo_u32 v8, v5, v3 +; CHECK-NEXT: v_xor_b32_e32 v9, v1, v6 +; CHECK-NEXT: v_mul_hi_u32 v1, v5, v2 +; CHECK-NEXT: v_mul_hi_u32 v2, v7, v2 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v8 +; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; CHECK-NEXT: v_mul_lo_u32 v1, v7, v3 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v8, v0 +; CHECK-NEXT: v_mul_hi_u32 v8, v5, v3 +; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v2 +; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v8 ; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v9 -; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; CHECK-NEXT: v_mul_lo_u32 v9, v6, v0 -; CHECK-NEXT: v_add_i32_e32 v1, vcc, v8, v1 -; CHECK-NEXT: v_mul_hi_u32 v8, v5, v0 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v9, v2 -; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v8 -; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v8, vcc, v9, v8 -; CHECK-NEXT: v_mul_hi_u32 v0, v6, v0 +; CHECK-NEXT: v_mul_hi_u32 v3, v7, v3 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v1, v0 +; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v1, vcc, v2, v1 +; CHECK-NEXT: v_add_i32_e32 v1, vcc, v3, v1 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v5, v0 +; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v7, v1, vcc +; CHECK-NEXT: v_mul_lo_u32 v2, v9, v0 +; CHECK-NEXT: v_mul_lo_u32 v3, v4, v1 +; CHECK-NEXT: v_mul_hi_u32 v7, v4, v0 +; CHECK-NEXT: v_mul_hi_u32 v0, v9, v0 +; CHECK-NEXT: v_mov_b32_e32 v5, 0x1000 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3 +; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v7 ; CHECK-NEXT: 
v_cndmask_b32_e64 v2, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v8, v2 +; CHECK-NEXT: v_mul_lo_u32 v7, v9, v1 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; CHECK-NEXT: v_mul_hi_u32 v3, v4, v1 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v7, v0 +; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v3 +; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v7, v3 ; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; CHECK-NEXT: v_add_i32_e32 v1, vcc, v5, v1 -; CHECK-NEXT: v_addc_u32_e32 v0, vcc, v6, v0, vcc -; CHECK-NEXT: v_mul_lo_u32 v2, v4, v1 -; CHECK-NEXT: v_mul_lo_u32 v5, v3, v0 -; CHECK-NEXT: v_mul_hi_u32 v8, v3, v1 -; CHECK-NEXT: v_mul_hi_u32 v1, v4, v1 -; CHECK-NEXT: s_movk_i32 s6, 0x1000 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v5 -; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v8 +; CHECK-NEXT: v_mul_hi_u32 v7, v9, v1 +; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v0, 0 ; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CHECK-NEXT: v_mul_lo_u32 v8, v4, v0 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v5, v2 -; CHECK-NEXT: v_mul_hi_u32 v5, v3, v0 -; CHECK-NEXT: v_add_i32_e32 v1, vcc, v8, v1 -; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v5 -; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v8, v5 -; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v2 -; CHECK-NEXT: v_mul_hi_u32 v8, v4, v0 -; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s6, v1, 0 -; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v5, v2 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v8, v2 -; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[4:5], s6, v2, v[1:2] -; CHECK-NEXT: v_sub_i32_e64 v0, s[4:5], v3, v0 -; CHECK-NEXT: v_mov_b32_e32 v6, 0x1000 -; CHECK-NEXT: v_subb_u32_e64 v2, vcc, v4, v1, s[4:5] -; CHECK-NEXT: v_sub_i32_e32 v1, vcc, v4, v1 -; CHECK-NEXT: s_bfe_i32 s6, 1, 0x10000 -; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v0, v6 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v7, v2 +; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v5, v2, v[1:2] +; CHECK-NEXT: v_sub_i32_e64 v0, s[4:5], v4, v0 +; CHECK-NEXT: v_subb_u32_e64 v2, vcc, v9, v1, s[4:5] +; CHECK-NEXT: v_sub_i32_e32 v1, vcc, v9, v1 +; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v0, v5 ; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc ; CHECK-NEXT: v_mov_b32_e32 v4, s6 ; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; CHECK-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc -; CHECK-NEXT: v_sub_i32_e32 v5, vcc, v0, v6 +; CHECK-NEXT: v_sub_i32_e32 v7, vcc, v0, v5 ; CHECK-NEXT: v_subbrev_u32_e64 v1, s[4:5], 0, v1, s[4:5] ; CHECK-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc -; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v5, v6 -; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc +; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v7, v5 +; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc ; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; CHECK-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc -; CHECK-NEXT: v_subrev_i32_e32 v6, vcc, 0x1000, v5 +; CHECK-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc +; CHECK-NEXT: v_subrev_i32_e32 v5, vcc, 0x1000, v7 ; CHECK-NEXT: v_subbrev_u32_e32 v8, vcc, 0, v1, vcc ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; CHECK-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; CHECK-NEXT: v_cndmask_b32_e32 v4, v7, v5, vcc ; CHECK-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 ; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc ; CHECK-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc -; CHECK-NEXT: v_xor_b32_e32 v0, v0, 
v7 -; CHECK-NEXT: v_xor_b32_e32 v1, v1, v7 -; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v7 -; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v7, vcc +; CHECK-NEXT: v_xor_b32_e32 v0, v0, v6 +; CHECK-NEXT: v_xor_b32_e32 v1, v1, v6 +; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v6 +; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v6, vcc ; CHECK-NEXT: s_setpc_b64 s[30:31] %result = srem i64 %num, 4096 ret i64 %result @@ -1375,243 +1373,232 @@ ; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CGP-NEXT: v_cvt_f32_u32_e32 v4, 0x1000 ; CGP-NEXT: v_cvt_f32_ubyte0_e32 v5, 0 -; CGP-NEXT: s_movk_i32 s6, 0xf000 -; CGP-NEXT: s_movk_i32 s7, 0x1000 +; CGP-NEXT: s_bfe_i32 s6, 1, 0x10000 ; CGP-NEXT: v_mac_f32_e32 v4, 0x4f800000, v5 ; CGP-NEXT: v_rcp_iflag_f32_e32 v4, v4 -; CGP-NEXT: s_bfe_i32 s8, 1, 0x10000 ; CGP-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 ; CGP-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4 -; CGP-NEXT: v_trunc_f32_e32 v6, v5 -; CGP-NEXT: v_mac_f32_e32 v4, 0xcf800000, v6 -; CGP-NEXT: v_cvt_u32_f32_e32 v7, v4 -; CGP-NEXT: v_mov_b32_e32 v4, 0xfffff000 -; CGP-NEXT: v_cvt_u32_f32_e32 v8, v6 -; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v4, v7, 0 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], s6, v8, v[5:6] -; CGP-NEXT: v_mul_hi_u32 v9, v7, v4 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], -1, v7, v[5:6] -; CGP-NEXT: v_mul_lo_u32 v6, v8, v4 -; CGP-NEXT: v_mul_hi_u32 v4, v8, v4 -; CGP-NEXT: v_mul_lo_u32 v10, v7, v5 -; CGP-NEXT: v_mul_lo_u32 v11, v8, v5 -; CGP-NEXT: v_mul_hi_u32 v12, v7, v5 -; CGP-NEXT: v_mul_hi_u32 v5, v8, v5 -; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v10 -; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v9 -; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v6, vcc, v10, v6 +; CGP-NEXT: v_trunc_f32_e32 v7, v5 +; CGP-NEXT: v_mac_f32_e32 v4, 0xcf800000, v7 +; CGP-NEXT: v_cvt_u32_f32_e32 v6, v4 +; CGP-NEXT: v_mov_b32_e32 v5, 0xfffff000 +; CGP-NEXT: v_cvt_u32_f32_e32 v8, v7 +; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v5, v6, 0 +; CGP-NEXT: v_mov_b32_e32 v4, v10 +; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v5, v8, v[4:5] +; CGP-NEXT: v_mul_lo_u32 v4, v8, v9 +; CGP-NEXT: v_mul_hi_u32 v7, v6, v9 +; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], -1, v6, v[10:11] +; CGP-NEXT: v_mul_hi_u32 v9, v8, v9 +; CGP-NEXT: v_mul_lo_u32 v11, v6, v10 +; CGP-NEXT: v_mul_lo_u32 v12, v8, v10 +; CGP-NEXT: v_mul_hi_u32 v13, v6, v10 +; CGP-NEXT: v_mul_hi_u32 v10, v8, v10 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v11 +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v7 +; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v4, vcc, v11, v4 +; CGP-NEXT: v_add_i32_e32 v7, vcc, v12, v9 +; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v13 +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v11 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v7, v4 +; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v7, vcc, v9, v7 +; CGP-NEXT: v_add_i32_e32 v7, vcc, v10, v7 +; CGP-NEXT: v_add_i32_e32 v12, vcc, v6, v4 +; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v5, v12, 0 +; CGP-NEXT: v_addc_u32_e32 v13, vcc, v8, v7, vcc +; CGP-NEXT: v_mov_b32_e32 v4, v10 +; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v5, v13, v[4:5] +; CGP-NEXT: v_ashrrev_i32_e32 v7, 31, v1 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v7 +; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], -1, v12, v[10:11] +; CGP-NEXT: v_addc_u32_e32 v1, vcc, v1, v7, vcc +; CGP-NEXT: v_xor_b32_e32 v11, v0, v7 +; CGP-NEXT: v_mul_lo_u32 v0, v13, v9 +; 
CGP-NEXT: v_mul_lo_u32 v4, v12, v10 +; CGP-NEXT: v_xor_b32_e32 v14, v1, v7 +; CGP-NEXT: v_mul_hi_u32 v1, v12, v9 +; CGP-NEXT: v_mul_hi_u32 v9, v13, v9 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v4 +; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; CGP-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; CGP-NEXT: v_mul_lo_u32 v1, v13, v10 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v4, v0 +; CGP-NEXT: v_mul_hi_u32 v4, v12, v10 +; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v9 ; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v12 -; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v10 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v6 -; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v6, vcc, v9, v6 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v6 -; CGP-NEXT: v_add_i32_e32 v9, vcc, v7, v4 -; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], s6, v9, 0 -; CGP-NEXT: v_addc_u32_e32 v10, vcc, v8, v5, vcc -; CGP-NEXT: v_ashrrev_i32_e32 v5, 31, v1 -; CGP-NEXT: v_mov_b32_e32 v4, v7 -; CGP-NEXT: v_mad_u64_u32 v[7:8], s[4:5], s6, v10, v[4:5] -; CGP-NEXT: v_add_i32_e32 v4, vcc, v0, v5 -; CGP-NEXT: v_addc_u32_e32 v11, vcc, v1, v5, vcc -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], -1, v9, v[7:8] -; CGP-NEXT: v_xor_b32_e32 v8, v4, v5 -; CGP-NEXT: v_mul_lo_u32 v1, v10, v6 -; CGP-NEXT: v_mul_lo_u32 v4, v9, v0 -; CGP-NEXT: v_mul_hi_u32 v7, v9, v6 -; CGP-NEXT: v_mul_hi_u32 v6, v10, v6 -; CGP-NEXT: v_xor_b32_e32 v11, v11, v5 ; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v4 ; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v7 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v9, v4 +; CGP-NEXT: v_mul_hi_u32 v9, v13, v10 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v1, v0 ; CGP-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v7, v10, v0 ; CGP-NEXT: v_add_i32_e32 v1, vcc, v4, v1 -; CGP-NEXT: v_mul_hi_u32 v4, v9, v0 -; CGP-NEXT: v_add_i32_e32 v6, vcc, v7, v6 -; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v6, v4 -; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v6, vcc, v7, v6 -; CGP-NEXT: v_mul_hi_u32 v0, v10, v0 -; CGP-NEXT: v_add_i32_e32 v1, vcc, v4, v1 -; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v6, v4 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v4 ; CGP-NEXT: v_add_i32_e32 v1, vcc, v9, v1 -; CGP-NEXT: v_addc_u32_e32 v0, vcc, v10, v0, vcc -; CGP-NEXT: v_mul_lo_u32 v6, v11, v1 -; CGP-NEXT: v_mul_lo_u32 v7, v8, v0 -; CGP-NEXT: v_mul_hi_u32 v9, v8, v1 -; CGP-NEXT: v_mul_hi_u32 v1, v11, v1 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v12, v0 +; CGP-NEXT: v_addc_u32_e32 v1, vcc, v13, v1, vcc +; CGP-NEXT: v_mul_lo_u32 v9, v14, v0 +; CGP-NEXT: v_mul_lo_u32 v10, v11, v1 +; CGP-NEXT: v_mul_hi_u32 v12, v11, v0 +; CGP-NEXT: v_mul_hi_u32 v0, v14, v0 ; CGP-NEXT: v_mov_b32_e32 v4, 0x1000 -; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v7 -; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v9 -; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v9, v11, v0 -; CGP-NEXT: v_add_i32_e32 v6, vcc, v7, v6 -; CGP-NEXT: v_mul_hi_u32 v7, v8, v0 -; CGP-NEXT: v_add_i32_e32 v1, vcc, v9, v1 +; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v10 +; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v12 ; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v7 -; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v7, vcc, v9, v7 -; CGP-NEXT: v_add_i32_e32 v1, 
vcc, v1, v6 -; CGP-NEXT: v_mul_hi_u32 v9, v11, v0 -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s7, v1, 0 -; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v6, vcc, v7, v6 -; CGP-NEXT: v_add_i32_e32 v6, vcc, v9, v6 -; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], s7, v6, v[1:2] -; CGP-NEXT: v_sub_i32_e32 v9, vcc, v8, v0 -; CGP-NEXT: v_subb_u32_e64 v10, s[4:5], v11, v6, vcc -; CGP-NEXT: v_sub_i32_e64 v0, s[4:5], v11, v6 -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v4 -; CGP-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[4:5] -; CGP-NEXT: v_mov_b32_e32 v6, s8 -; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v10 -; CGP-NEXT: v_cndmask_b32_e64 v11, v6, v1, s[4:5] -; CGP-NEXT: v_cvt_f32_u32_e32 v1, 0x1000 -; CGP-NEXT: v_cvt_f32_ubyte0_e32 v7, 0 -; CGP-NEXT: v_subbrev_u32_e32 v0, vcc, 0, v0, vcc -; CGP-NEXT: v_mac_f32_e32 v1, 0x4f800000, v7 -; CGP-NEXT: v_rcp_iflag_f32_e32 v1, v1 -; CGP-NEXT: v_sub_i32_e32 v12, vcc, v9, v4 -; CGP-NEXT: v_subbrev_u32_e32 v13, vcc, 0, v0, vcc -; CGP-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v1 -; CGP-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 -; CGP-NEXT: v_trunc_f32_e32 v7, v1 -; CGP-NEXT: v_mac_f32_e32 v0, 0xcf800000, v7 -; CGP-NEXT: v_cvt_u32_f32_e32 v14, v0 -; CGP-NEXT: v_cvt_u32_f32_e32 v15, v7 -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v12, v4 -; CGP-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s6, v14, 0 -; CGP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v13 -; CGP-NEXT: v_cndmask_b32_e32 v16, v6, v8, vcc -; CGP-NEXT: v_mad_u64_u32 v[7:8], s[4:5], s6, v15, v[1:2] -; CGP-NEXT: v_sub_i32_e32 v1, vcc, v12, v4 -; CGP-NEXT: v_mad_u64_u32 v[7:8], s[4:5], -1, v14, v[7:8] -; CGP-NEXT: v_subbrev_u32_e32 v17, vcc, 0, v13, vcc -; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; CGP-NEXT: v_cndmask_b32_e32 v8, v12, v1, vcc -; CGP-NEXT: v_mul_lo_u32 v1, v15, v0 -; CGP-NEXT: v_mul_lo_u32 v12, v14, v7 -; CGP-NEXT: v_mul_hi_u32 v16, v14, v0 -; CGP-NEXT: v_cndmask_b32_e32 v13, v13, v17, vcc -; CGP-NEXT: v_mul_hi_u32 v0, v15, v0 -; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v12 +; CGP-NEXT: v_mul_lo_u32 v12, v14, v1 +; CGP-NEXT: v_add_i32_e32 v9, vcc, v10, v9 +; CGP-NEXT: v_mul_hi_u32 v10, v11, v1 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v12, v0 ; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v16 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v10 +; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v10, vcc, v12, v10 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v9 +; CGP-NEXT: v_mul_hi_u32 v12, v14, v1 +; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v0, 0 +; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v9, vcc, v10, v9 +; CGP-NEXT: v_add_i32_e32 v9, vcc, v12, v9 +; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v4, v9, v[1:2] +; CGP-NEXT: v_sub_i32_e32 v11, vcc, v11, v0 +; CGP-NEXT: v_subb_u32_e64 v12, s[4:5], v14, v9, vcc +; CGP-NEXT: v_sub_i32_e64 v0, s[4:5], v14, v9 +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v11, v4 +; CGP-NEXT: v_subbrev_u32_e32 v0, vcc, 0, v0, vcc +; CGP-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[4:5] +; CGP-NEXT: v_mov_b32_e32 v13, s6 +; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v12 +; CGP-NEXT: v_sub_i32_e32 v15, vcc, v11, v4 +; CGP-NEXT: v_cndmask_b32_e64 v14, v13, v1, s[4:5] +; CGP-NEXT: v_subbrev_u32_e32 v16, vcc, 0, v0, vcc +; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v6, 0 +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v15, v4 +; CGP-NEXT: v_cndmask_b32_e64 v9, 0, -1, vcc +; CGP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v16 +; CGP-NEXT: v_cndmask_b32_e32 v17, v13, v9, vcc +; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v5, v8, v[1:2] +; 
CGP-NEXT: v_sub_i32_e32 v1, vcc, v15, v4 +; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], -1, v6, v[9:10] +; CGP-NEXT: v_subbrev_u32_e32 v18, vcc, 0, v16, vcc +; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v17 +; CGP-NEXT: v_cndmask_b32_e32 v10, v15, v1, vcc +; CGP-NEXT: v_mul_lo_u32 v1, v8, v0 +; CGP-NEXT: v_mul_lo_u32 v15, v6, v9 +; CGP-NEXT: v_mul_hi_u32 v17, v6, v0 +; CGP-NEXT: v_cndmask_b32_e32 v16, v16, v18, vcc +; CGP-NEXT: v_mul_hi_u32 v0, v8, v0 +; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v15 +; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v17 ; CGP-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v16, v15, v7 -; CGP-NEXT: v_add_i32_e32 v1, vcc, v12, v1 -; CGP-NEXT: v_mul_hi_u32 v12, v14, v7 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v16, v0 -; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v12 -; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v12, vcc, v16, v12 -; CGP-NEXT: v_mul_hi_u32 v7, v15, v7 +; CGP-NEXT: v_mul_lo_u32 v17, v8, v9 +; CGP-NEXT: v_add_i32_e32 v1, vcc, v15, v1 +; CGP-NEXT: v_mul_hi_u32 v15, v6, v9 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v17, v0 +; CGP-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v15 +; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v15, vcc, v17, v15 +; CGP-NEXT: v_mul_hi_u32 v9, v8, v9 ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; CGP-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v1, vcc, v12, v1 -; CGP-NEXT: v_add_i32_e32 v1, vcc, v7, v1 -; CGP-NEXT: v_add_i32_e32 v12, vcc, v14, v0 -; CGP-NEXT: v_addc_u32_e32 v14, vcc, v15, v1, vcc -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s6, v12, 0 -; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 -; CGP-NEXT: v_cndmask_b32_e32 v7, v9, v8, vcc -; CGP-NEXT: v_cndmask_b32_e32 v9, v10, v13, vcc -; CGP-NEXT: v_xor_b32_e32 v10, v7, v5 -; CGP-NEXT: v_mad_u64_u32 v[7:8], s[4:5], s6, v14, v[1:2] -; CGP-NEXT: v_xor_b32_e32 v1, v9, v5 -; CGP-NEXT: v_ashrrev_i32_e32 v9, 31, v3 -; CGP-NEXT: v_mad_u64_u32 v[7:8], s[4:5], -1, v12, v[7:8] -; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v9 -; CGP-NEXT: v_addc_u32_e32 v3, vcc, v3, v9, vcc -; CGP-NEXT: v_xor_b32_e32 v11, v2, v9 -; CGP-NEXT: v_mul_lo_u32 v2, v14, v0 -; CGP-NEXT: v_mul_lo_u32 v8, v12, v7 -; CGP-NEXT: v_xor_b32_e32 v13, v3, v9 -; CGP-NEXT: v_mul_hi_u32 v3, v12, v0 -; CGP-NEXT: v_mul_hi_u32 v0, v14, v0 -; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v8 -; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v1, vcc, v15, v1 +; CGP-NEXT: v_add_i32_e32 v1, vcc, v9, v1 +; CGP-NEXT: v_add_i32_e32 v9, vcc, v6, v0 +; CGP-NEXT: v_addc_u32_e32 v8, vcc, v8, v1, vcc +; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v9, 0 +; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; CGP-NEXT: v_cndmask_b32_e32 v6, v11, v10, vcc +; CGP-NEXT: v_xor_b32_e32 v11, v6, v7 +; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v5, v8, v[1:2] +; CGP-NEXT: v_cndmask_b32_e32 v10, v12, v16, vcc +; CGP-NEXT: v_xor_b32_e32 v1, v10, v7 +; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], -1, v9, v[5:6] +; CGP-NEXT: v_ashrrev_i32_e32 v10, 31, v3 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v10 +; CGP-NEXT: v_addc_u32_e32 v3, vcc, v3, v10, vcc +; CGP-NEXT: v_xor_b32_e32 v12, v2, v10 +; CGP-NEXT: v_mul_lo_u32 v2, v8, v0 +; CGP-NEXT: v_mul_lo_u32 v6, v9, v5 +; CGP-NEXT: v_xor_b32_e32 v14, v3, v10 +; CGP-NEXT: v_mul_hi_u32 v3, v9, v0 +; CGP-NEXT: v_mul_hi_u32 v0, v8, v0 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v6 +; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v2, vcc, 
v2, v3 ; CGP-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v3, v14, v7 -; CGP-NEXT: v_add_i32_e32 v2, vcc, v8, v2 -; CGP-NEXT: v_mul_hi_u32 v8, v12, v7 +; CGP-NEXT: v_mul_lo_u32 v3, v8, v5 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v6, v2 +; CGP-NEXT: v_mul_hi_u32 v6, v9, v5 ; CGP-NEXT: v_add_i32_e32 v0, vcc, v3, v0 ; CGP-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v8 -; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v8 -; CGP-NEXT: v_mul_hi_u32 v7, v14, v7 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v6 +; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v6 +; CGP-NEXT: v_mul_hi_u32 v5, v8, v5 ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; CGP-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; CGP-NEXT: v_add_i32_e32 v2, vcc, v7, v2 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v12, v0 -; CGP-NEXT: v_addc_u32_e32 v2, vcc, v14, v2, vcc -; CGP-NEXT: v_mul_lo_u32 v7, v13, v3 -; CGP-NEXT: v_mul_lo_u32 v8, v11, v2 -; CGP-NEXT: v_sub_i32_e32 v0, vcc, v10, v5 -; CGP-NEXT: v_subb_u32_e32 v1, vcc, v1, v5, vcc -; CGP-NEXT: v_mul_hi_u32 v5, v11, v3 -; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v8 -; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v7, v5 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v5, v2 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v9, v0 +; CGP-NEXT: v_addc_u32_e32 v2, vcc, v8, v2, vcc +; CGP-NEXT: v_mul_lo_u32 v5, v14, v3 +; CGP-NEXT: v_mul_lo_u32 v6, v12, v2 +; CGP-NEXT: v_sub_i32_e32 v0, vcc, v11, v7 +; CGP-NEXT: v_subb_u32_e32 v1, vcc, v1, v7, vcc +; CGP-NEXT: v_mul_hi_u32 v7, v12, v3 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v6 +; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v7 ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v7, v13, v2 -; CGP-NEXT: v_mul_hi_u32 v3, v13, v3 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v8, v5 -; CGP-NEXT: v_mul_hi_u32 v8, v11, v2 +; CGP-NEXT: v_mul_lo_u32 v7, v14, v2 +; CGP-NEXT: v_mul_hi_u32 v3, v14, v3 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v5 +; CGP-NEXT: v_mul_hi_u32 v6, v12, v2 ; CGP-NEXT: v_add_i32_e32 v3, vcc, v7, v3 ; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v8 -; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v8 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v6 +; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v6, vcc, v7, v6 ; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v5 -; CGP-NEXT: v_mul_hi_u32 v8, v13, v2 -; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], s7, v3, 0 +; CGP-NEXT: v_mul_hi_u32 v7, v14, v2 +; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v3, 0 ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v5 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v7, v5 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v8, v5 -; CGP-NEXT: v_mad_u64_u32 v[7:8], s[4:5], s7, v5, v[3:4] -; CGP-NEXT: v_sub_i32_e32 v2, vcc, v11, v2 -; CGP-NEXT: v_sub_i32_e64 v5, s[4:5], v13, v7 -; CGP-NEXT: v_subb_u32_e64 v3, s[4:5], v13, v7, vcc +; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v5, v[3:4] +; CGP-NEXT: v_sub_i32_e32 v2, vcc, v12, v2 +; CGP-NEXT: v_subb_u32_e64 v3, s[4:5], v14, v5, vcc +; CGP-NEXT: v_sub_i32_e64 v5, s[4:5], v14, v5 ; CGP-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc -; CGP-NEXT: v_sub_i32_e32 v8, vcc, v2, v4 +; CGP-NEXT: v_sub_i32_e32 v7, vcc, v2, v4 ; CGP-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v7, v4 +; CGP-NEXT: v_cndmask_b32_e64 v8, 
0, -1, vcc
+; CGP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5
; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v4
-; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v8, v4
-; CGP-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5]
+; CGP-NEXT: v_cndmask_b32_e32 v8, v13, v8, vcc
+; CGP-NEXT: v_sub_i32_e32 v4, vcc, v7, v4
+; CGP-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5]
; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v3
-; CGP-NEXT: v_cndmask_b32_e64 v10, 0, -1, vcc
-; CGP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5
-; CGP-NEXT: v_cndmask_b32_e64 v7, v6, v7, s[4:5]
-; CGP-NEXT: v_cndmask_b32_e32 v6, v6, v10, vcc
-; CGP-NEXT: v_sub_i32_e32 v4, vcc, v8, v4
-; CGP-NEXT: v_subbrev_u32_e32 v10, vcc, 0, v5, vcc
+; CGP-NEXT: v_subbrev_u32_e32 v9, vcc, 0, v5, vcc
+; CGP-NEXT: v_cndmask_b32_e64 v6, v13, v6, s[4:5]
+; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8
+; CGP-NEXT: v_cndmask_b32_e32 v4, v7, v4, vcc
+; CGP-NEXT: v_cndmask_b32_e32 v5, v5, v9, vcc
; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6
-; CGP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc
-; CGP-NEXT: v_cndmask_b32_e32 v5, v5, v10, vcc
-; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7
; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
; CGP-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc
-; CGP-NEXT: v_xor_b32_e32 v2, v2, v9
-; CGP-NEXT: v_xor_b32_e32 v3, v3, v9
-; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v9
-; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v9, vcc
+; CGP-NEXT: v_xor_b32_e32 v2, v2, v10
+; CGP-NEXT: v_xor_b32_e32 v3, v3, v10
+; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v10
+; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v10, vcc
; CGP-NEXT: s_setpc_b64 s[30:31]
%result = srem <2 x i64> %num, <i64 4096, i64 4096>
ret <2 x i64> %result
@@ -1623,7 +1610,8 @@
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: v_cvt_f32_u32_e32 v2, 0x12d8fb
; CHECK-NEXT: v_cvt_f32_ubyte0_e32 v3, 0
-; CHECK-NEXT: s_mov_b32 s6, 0xffed2705
+; CHECK-NEXT: v_mov_b32_e32 v6, 0xffed2705
+; CHECK-NEXT: s_bfe_i32 s6, 1, 0x10000
; CHECK-NEXT: v_mac_f32_e32 v2, 0x4f800000, v3
; CHECK-NEXT: v_rcp_iflag_f32_e32 v2, v2
; CHECK-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2
@@ -1631,118 +1619,115 @@
; CHECK-NEXT: v_trunc_f32_e32 v4, v3
; CHECK-NEXT: v_mac_f32_e32 v2, 0xcf800000, v4
; CHECK-NEXT: v_cvt_u32_f32_e32 v5, v2
-; CHECK-NEXT: v_mov_b32_e32 v2, 0xffed2705
-; CHECK-NEXT: v_cvt_u32_f32_e32 v6, v4
-; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v2, v5, 0
-; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], s6, v6, v[3:4]
-; CHECK-NEXT: v_mul_hi_u32 v7, v5, v2
+; CHECK-NEXT: v_cvt_u32_f32_e32 v7, v4
+; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v5, 0
+; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v7, v[3:4]
+; CHECK-NEXT: v_mul_hi_u32 v8, v5, v2
; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], -1, v5, v[3:4]
-; CHECK-NEXT: v_mul_lo_u32 v4, v6, v2
-; CHECK-NEXT: v_mul_hi_u32 v2, v6, v2
-; CHECK-NEXT: v_mul_lo_u32 v8, v5, v3
-; CHECK-NEXT: v_mul_lo_u32 v9, v6, v3
-; CHECK-NEXT: v_mul_hi_u32 v10, v5, v3
-; CHECK-NEXT: v_mul_hi_u32 v3, v6, v3
+; CHECK-NEXT: v_mul_lo_u32 v4, v7, v2
+; CHECK-NEXT: v_mul_hi_u32 v2, v7, v2
+; CHECK-NEXT: v_mul_lo_u32 v9, v5, v3
+; CHECK-NEXT: v_mul_lo_u32 v10, v7, v3
+; CHECK-NEXT: v_mul_hi_u32 v11, v5, v3
+; CHECK-NEXT: v_mul_hi_u32 v3, v7, v3
+; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v9
+; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v2, vcc, v10, v2
+; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v8
-; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v7
; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v4, vcc, v8, v4
-; CHECK-NEXT: 
v_add_i32_e32 v2, vcc, v9, v2 -; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v10 +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v9, v4 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v11 ; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v7, vcc, v7, v8 +; CHECK-NEXT: v_add_i32_e32 v8, vcc, v10, v8 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v7, v4 +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v8, v4 ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v4 ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v2 -; CHECK-NEXT: v_addc_u32_e32 v6, vcc, v6, v3, vcc -; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], s6, v5, 0 -; CHECK-NEXT: v_ashrrev_i32_e32 v7, 31, v1 -; CHECK-NEXT: v_add_i32_e32 v8, vcc, v0, v7 -; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], s6, v6, v[3:4] -; CHECK-NEXT: v_addc_u32_e32 v9, vcc, v1, v7, vcc -; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[4:5], -1, v5, v[3:4] -; CHECK-NEXT: v_xor_b32_e32 v3, v8, v7 -; CHECK-NEXT: v_mul_lo_u32 v1, v6, v2 -; CHECK-NEXT: v_mul_lo_u32 v8, v5, v0 -; CHECK-NEXT: v_xor_b32_e32 v4, v9, v7 -; CHECK-NEXT: v_mul_hi_u32 v9, v5, v2 -; CHECK-NEXT: v_mul_hi_u32 v2, v6, v2 +; CHECK-NEXT: v_addc_u32_e32 v7, vcc, v7, v3, vcc +; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v5, 0 +; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v7, v[3:4] +; CHECK-NEXT: v_ashrrev_i32_e32 v6, 31, v1 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v6 +; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], -1, v5, v[3:4] +; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v1, v6, vcc +; CHECK-NEXT: v_xor_b32_e32 v4, v0, v6 +; CHECK-NEXT: v_mul_lo_u32 v0, v7, v2 +; CHECK-NEXT: v_mul_lo_u32 v8, v5, v3 +; CHECK-NEXT: v_xor_b32_e32 v9, v1, v6 +; CHECK-NEXT: v_mul_hi_u32 v1, v5, v2 +; CHECK-NEXT: v_mul_hi_u32 v2, v7, v2 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v8 +; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; CHECK-NEXT: v_mul_lo_u32 v1, v7, v3 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v8, v0 +; CHECK-NEXT: v_mul_hi_u32 v8, v5, v3 +; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v2 +; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v8 ; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v9 -; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; CHECK-NEXT: v_mul_lo_u32 v9, v6, v0 -; CHECK-NEXT: v_add_i32_e32 v1, vcc, v8, v1 -; CHECK-NEXT: v_mul_hi_u32 v8, v5, v0 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v9, v2 -; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v8 -; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v8, vcc, v9, v8 -; CHECK-NEXT: v_mul_hi_u32 v0, v6, v0 +; CHECK-NEXT: v_mul_hi_u32 v3, v7, v3 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v1, v0 +; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v1, vcc, v2, v1 +; CHECK-NEXT: v_add_i32_e32 v1, vcc, v3, v1 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v5, v0 +; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v7, v1, vcc +; CHECK-NEXT: v_mul_lo_u32 v2, v9, v0 +; CHECK-NEXT: v_mul_lo_u32 v3, v4, v1 +; CHECK-NEXT: v_mul_hi_u32 v7, v4, v0 +; CHECK-NEXT: v_mul_hi_u32 v0, v9, v0 +; CHECK-NEXT: v_mov_b32_e32 v5, 0x12d8fb +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3 +; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v7 ; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v8, v2 +; CHECK-NEXT: 
v_mul_lo_u32 v7, v9, v1 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; CHECK-NEXT: v_mul_hi_u32 v3, v4, v1 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v7, v0 +; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v3 +; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v7, v3 ; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; CHECK-NEXT: v_add_i32_e32 v1, vcc, v5, v1 -; CHECK-NEXT: v_addc_u32_e32 v0, vcc, v6, v0, vcc -; CHECK-NEXT: v_mul_lo_u32 v2, v4, v1 -; CHECK-NEXT: v_mul_lo_u32 v5, v3, v0 -; CHECK-NEXT: v_mul_hi_u32 v8, v3, v1 -; CHECK-NEXT: v_mul_hi_u32 v1, v4, v1 -; CHECK-NEXT: s_mov_b32 s6, 0x12d8fb -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v5 -; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v8 -; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CHECK-NEXT: v_mul_lo_u32 v8, v4, v0 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v5, v2 -; CHECK-NEXT: v_mul_hi_u32 v5, v3, v0 -; CHECK-NEXT: v_add_i32_e32 v1, vcc, v8, v1 -; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v5 -; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v8, v5 -; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v2 -; CHECK-NEXT: v_mul_hi_u32 v8, v4, v0 -; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s6, v1, 0 +; CHECK-NEXT: v_mul_hi_u32 v7, v9, v1 +; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v0, 0 ; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v5, v2 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v8, v2 -; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[4:5], s6, v2, v[1:2] -; CHECK-NEXT: v_sub_i32_e64 v0, s[4:5], v3, v0 -; CHECK-NEXT: v_mov_b32_e32 v6, 0x12d8fb -; CHECK-NEXT: v_subb_u32_e64 v2, vcc, v4, v1, s[4:5] -; CHECK-NEXT: v_sub_i32_e32 v1, vcc, v4, v1 -; CHECK-NEXT: s_bfe_i32 s6, 1, 0x10000 -; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v0, v6 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v7, v2 +; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v5, v2, v[1:2] +; CHECK-NEXT: v_sub_i32_e64 v0, s[4:5], v4, v0 +; CHECK-NEXT: v_subb_u32_e64 v2, vcc, v9, v1, s[4:5] +; CHECK-NEXT: v_sub_i32_e32 v1, vcc, v9, v1 +; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v0, v5 ; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc ; CHECK-NEXT: v_mov_b32_e32 v4, s6 ; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; CHECK-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc -; CHECK-NEXT: v_sub_i32_e32 v5, vcc, v0, v6 +; CHECK-NEXT: v_sub_i32_e32 v7, vcc, v0, v5 ; CHECK-NEXT: v_subbrev_u32_e64 v1, s[4:5], 0, v1, s[4:5] ; CHECK-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc -; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v5, v6 -; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc +; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v7, v5 +; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc ; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; CHECK-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc -; CHECK-NEXT: v_subrev_i32_e32 v6, vcc, 0x12d8fb, v5 +; CHECK-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc +; CHECK-NEXT: v_subrev_i32_e32 v5, vcc, 0x12d8fb, v7 ; CHECK-NEXT: v_subbrev_u32_e32 v8, vcc, 0, v1, vcc ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; CHECK-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; CHECK-NEXT: v_cndmask_b32_e32 v4, v7, v5, vcc ; CHECK-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 ; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc ; CHECK-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc -; CHECK-NEXT: v_xor_b32_e32 v0, v0, v7 -; CHECK-NEXT: v_xor_b32_e32 v1, v1, v7 -; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, 
v7 -; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v7, vcc +; CHECK-NEXT: v_xor_b32_e32 v0, v0, v6 +; CHECK-NEXT: v_xor_b32_e32 v1, v1, v6 +; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v6 +; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v6, vcc ; CHECK-NEXT: s_setpc_b64 s[30:31] %result = srem i64 %num, 1235195 ret i64 %result @@ -2019,243 +2004,232 @@ ; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CGP-NEXT: v_cvt_f32_u32_e32 v4, 0x12d8fb ; CGP-NEXT: v_cvt_f32_ubyte0_e32 v5, 0 -; CGP-NEXT: s_mov_b32 s6, 0xffed2705 -; CGP-NEXT: s_mov_b32 s7, 0x12d8fb +; CGP-NEXT: s_bfe_i32 s6, 1, 0x10000 ; CGP-NEXT: v_mac_f32_e32 v4, 0x4f800000, v5 ; CGP-NEXT: v_rcp_iflag_f32_e32 v4, v4 -; CGP-NEXT: s_bfe_i32 s8, 1, 0x10000 ; CGP-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 ; CGP-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4 -; CGP-NEXT: v_trunc_f32_e32 v6, v5 -; CGP-NEXT: v_mac_f32_e32 v4, 0xcf800000, v6 -; CGP-NEXT: v_cvt_u32_f32_e32 v7, v4 -; CGP-NEXT: v_mov_b32_e32 v4, 0xffed2705 -; CGP-NEXT: v_cvt_u32_f32_e32 v8, v6 -; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v4, v7, 0 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], s6, v8, v[5:6] -; CGP-NEXT: v_mul_hi_u32 v9, v7, v4 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], -1, v7, v[5:6] -; CGP-NEXT: v_mul_lo_u32 v6, v8, v4 -; CGP-NEXT: v_mul_hi_u32 v4, v8, v4 -; CGP-NEXT: v_mul_lo_u32 v10, v7, v5 -; CGP-NEXT: v_mul_lo_u32 v11, v8, v5 -; CGP-NEXT: v_mul_hi_u32 v12, v7, v5 -; CGP-NEXT: v_mul_hi_u32 v5, v8, v5 -; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v10 -; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v9 -; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v6, vcc, v10, v6 +; CGP-NEXT: v_trunc_f32_e32 v7, v5 +; CGP-NEXT: v_mac_f32_e32 v4, 0xcf800000, v7 +; CGP-NEXT: v_cvt_u32_f32_e32 v6, v4 +; CGP-NEXT: v_mov_b32_e32 v5, 0xffed2705 +; CGP-NEXT: v_cvt_u32_f32_e32 v8, v7 +; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v5, v6, 0 +; CGP-NEXT: v_mov_b32_e32 v4, v10 +; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v5, v8, v[4:5] +; CGP-NEXT: v_mul_lo_u32 v4, v8, v9 +; CGP-NEXT: v_mul_hi_u32 v7, v6, v9 +; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], -1, v6, v[10:11] +; CGP-NEXT: v_mul_hi_u32 v9, v8, v9 +; CGP-NEXT: v_mul_lo_u32 v11, v6, v10 +; CGP-NEXT: v_mul_lo_u32 v12, v8, v10 +; CGP-NEXT: v_mul_hi_u32 v13, v6, v10 +; CGP-NEXT: v_mul_hi_u32 v10, v8, v10 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v11 +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v7 +; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v4, vcc, v11, v4 +; CGP-NEXT: v_add_i32_e32 v7, vcc, v12, v9 +; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v13 +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v11 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v7, v4 +; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v7, vcc, v9, v7 +; CGP-NEXT: v_add_i32_e32 v7, vcc, v10, v7 +; CGP-NEXT: v_add_i32_e32 v12, vcc, v6, v4 +; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v5, v12, 0 +; CGP-NEXT: v_addc_u32_e32 v13, vcc, v8, v7, vcc +; CGP-NEXT: v_mov_b32_e32 v4, v10 +; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v5, v13, v[4:5] +; CGP-NEXT: v_ashrrev_i32_e32 v7, 31, v1 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v7 +; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], -1, v12, v[10:11] +; CGP-NEXT: v_addc_u32_e32 v1, vcc, v1, v7, vcc +; CGP-NEXT: v_xor_b32_e32 v11, v0, v7 +; CGP-NEXT: v_mul_lo_u32 v0, v13, v9 +; CGP-NEXT: v_mul_lo_u32 v4, v12, v10 +; CGP-NEXT: v_xor_b32_e32 v14, v1, v7 +; 
CGP-NEXT: v_mul_hi_u32 v1, v12, v9 +; CGP-NEXT: v_mul_hi_u32 v9, v13, v9 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v4 +; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; CGP-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; CGP-NEXT: v_mul_lo_u32 v1, v13, v10 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v4, v0 +; CGP-NEXT: v_mul_hi_u32 v4, v12, v10 +; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v9 ; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v12 -; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v10 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v6 -; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v6, vcc, v9, v6 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v6 -; CGP-NEXT: v_add_i32_e32 v9, vcc, v7, v4 -; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], s6, v9, 0 -; CGP-NEXT: v_addc_u32_e32 v10, vcc, v8, v5, vcc -; CGP-NEXT: v_ashrrev_i32_e32 v5, 31, v1 -; CGP-NEXT: v_mov_b32_e32 v4, v7 -; CGP-NEXT: v_mad_u64_u32 v[7:8], s[4:5], s6, v10, v[4:5] -; CGP-NEXT: v_add_i32_e32 v4, vcc, v0, v5 -; CGP-NEXT: v_addc_u32_e32 v11, vcc, v1, v5, vcc -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], -1, v9, v[7:8] -; CGP-NEXT: v_xor_b32_e32 v8, v4, v5 -; CGP-NEXT: v_mul_lo_u32 v1, v10, v6 -; CGP-NEXT: v_mul_lo_u32 v4, v9, v0 -; CGP-NEXT: v_mul_hi_u32 v7, v9, v6 -; CGP-NEXT: v_mul_hi_u32 v6, v10, v6 -; CGP-NEXT: v_xor_b32_e32 v11, v11, v5 ; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v4 ; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v7 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v9, v4 +; CGP-NEXT: v_mul_hi_u32 v9, v13, v10 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v1, v0 ; CGP-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v7, v10, v0 -; CGP-NEXT: v_add_i32_e32 v1, vcc, v4, v1 -; CGP-NEXT: v_mul_hi_u32 v4, v9, v0 -; CGP-NEXT: v_add_i32_e32 v6, vcc, v7, v6 -; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v6, v4 -; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v6, vcc, v7, v6 -; CGP-NEXT: v_mul_hi_u32 v0, v10, v0 ; CGP-NEXT: v_add_i32_e32 v1, vcc, v4, v1 -; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v6, v4 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v4 ; CGP-NEXT: v_add_i32_e32 v1, vcc, v9, v1 -; CGP-NEXT: v_addc_u32_e32 v0, vcc, v10, v0, vcc -; CGP-NEXT: v_mul_lo_u32 v6, v11, v1 -; CGP-NEXT: v_mul_lo_u32 v7, v8, v0 -; CGP-NEXT: v_mul_hi_u32 v9, v8, v1 -; CGP-NEXT: v_mul_hi_u32 v1, v11, v1 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v12, v0 +; CGP-NEXT: v_addc_u32_e32 v1, vcc, v13, v1, vcc +; CGP-NEXT: v_mul_lo_u32 v9, v14, v0 +; CGP-NEXT: v_mul_lo_u32 v10, v11, v1 +; CGP-NEXT: v_mul_hi_u32 v12, v11, v0 +; CGP-NEXT: v_mul_hi_u32 v0, v14, v0 ; CGP-NEXT: v_mov_b32_e32 v4, 0x12d8fb -; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v7 -; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v9 -; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v9, v11, v0 -; CGP-NEXT: v_add_i32_e32 v6, vcc, v7, v6 -; CGP-NEXT: v_mul_hi_u32 v7, v8, v0 -; CGP-NEXT: v_add_i32_e32 v1, vcc, v9, v1 +; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v10 +; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v12 ; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v7 -; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v7, vcc, v9, v7 -; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v6 -; CGP-NEXT: v_mul_hi_u32 v9, v11, v0 -; CGP-NEXT: v_mad_u64_u32 
v[0:1], s[4:5], s7, v1, 0 -; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v6, vcc, v7, v6 -; CGP-NEXT: v_add_i32_e32 v6, vcc, v9, v6 -; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], s7, v6, v[1:2] -; CGP-NEXT: v_sub_i32_e32 v9, vcc, v8, v0 -; CGP-NEXT: v_subb_u32_e64 v10, s[4:5], v11, v6, vcc -; CGP-NEXT: v_sub_i32_e64 v0, s[4:5], v11, v6 -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v4 -; CGP-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[4:5] -; CGP-NEXT: v_mov_b32_e32 v6, s8 -; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v10 -; CGP-NEXT: v_cndmask_b32_e64 v11, v6, v1, s[4:5] -; CGP-NEXT: v_cvt_f32_u32_e32 v1, 0x12d8fb -; CGP-NEXT: v_cvt_f32_ubyte0_e32 v7, 0 -; CGP-NEXT: v_subbrev_u32_e32 v0, vcc, 0, v0, vcc -; CGP-NEXT: v_mac_f32_e32 v1, 0x4f800000, v7 -; CGP-NEXT: v_rcp_iflag_f32_e32 v1, v1 -; CGP-NEXT: v_sub_i32_e32 v12, vcc, v9, v4 -; CGP-NEXT: v_subbrev_u32_e32 v13, vcc, 0, v0, vcc -; CGP-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v1 -; CGP-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 -; CGP-NEXT: v_trunc_f32_e32 v7, v1 -; CGP-NEXT: v_mac_f32_e32 v0, 0xcf800000, v7 -; CGP-NEXT: v_cvt_u32_f32_e32 v14, v0 -; CGP-NEXT: v_cvt_u32_f32_e32 v15, v7 -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v12, v4 -; CGP-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s6, v14, 0 -; CGP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v13 -; CGP-NEXT: v_cndmask_b32_e32 v16, v6, v8, vcc -; CGP-NEXT: v_mad_u64_u32 v[7:8], s[4:5], s6, v15, v[1:2] -; CGP-NEXT: v_sub_i32_e32 v1, vcc, v12, v4 -; CGP-NEXT: v_mad_u64_u32 v[7:8], s[4:5], -1, v14, v[7:8] -; CGP-NEXT: v_subbrev_u32_e32 v17, vcc, 0, v13, vcc -; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; CGP-NEXT: v_cndmask_b32_e32 v8, v12, v1, vcc -; CGP-NEXT: v_mul_lo_u32 v1, v15, v0 -; CGP-NEXT: v_mul_lo_u32 v12, v14, v7 -; CGP-NEXT: v_mul_hi_u32 v16, v14, v0 -; CGP-NEXT: v_cndmask_b32_e32 v13, v13, v17, vcc -; CGP-NEXT: v_mul_hi_u32 v0, v15, v0 -; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v12 +; CGP-NEXT: v_mul_lo_u32 v12, v14, v1 +; CGP-NEXT: v_add_i32_e32 v9, vcc, v10, v9 +; CGP-NEXT: v_mul_hi_u32 v10, v11, v1 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v12, v0 ; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v16 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v10 +; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v10, vcc, v12, v10 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v9 +; CGP-NEXT: v_mul_hi_u32 v12, v14, v1 +; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v0, 0 +; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v9, vcc, v10, v9 +; CGP-NEXT: v_add_i32_e32 v9, vcc, v12, v9 +; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v4, v9, v[1:2] +; CGP-NEXT: v_sub_i32_e32 v11, vcc, v11, v0 +; CGP-NEXT: v_subb_u32_e64 v12, s[4:5], v14, v9, vcc +; CGP-NEXT: v_sub_i32_e64 v0, s[4:5], v14, v9 +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v11, v4 +; CGP-NEXT: v_subbrev_u32_e32 v0, vcc, 0, v0, vcc +; CGP-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[4:5] +; CGP-NEXT: v_mov_b32_e32 v13, s6 +; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v12 +; CGP-NEXT: v_sub_i32_e32 v15, vcc, v11, v4 +; CGP-NEXT: v_cndmask_b32_e64 v14, v13, v1, s[4:5] +; CGP-NEXT: v_subbrev_u32_e32 v16, vcc, 0, v0, vcc +; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v6, 0 +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v15, v4 +; CGP-NEXT: v_cndmask_b32_e64 v9, 0, -1, vcc +; CGP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v16 +; CGP-NEXT: v_cndmask_b32_e32 v17, v13, v9, vcc +; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v5, v8, v[1:2] +; CGP-NEXT: v_sub_i32_e32 v1, vcc, v15, v4 +; CGP-NEXT: v_mad_u64_u32 v[9:10], 
s[4:5], -1, v6, v[9:10] +; CGP-NEXT: v_subbrev_u32_e32 v18, vcc, 0, v16, vcc +; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v17 +; CGP-NEXT: v_cndmask_b32_e32 v10, v15, v1, vcc +; CGP-NEXT: v_mul_lo_u32 v1, v8, v0 +; CGP-NEXT: v_mul_lo_u32 v15, v6, v9 +; CGP-NEXT: v_mul_hi_u32 v17, v6, v0 +; CGP-NEXT: v_cndmask_b32_e32 v16, v16, v18, vcc +; CGP-NEXT: v_mul_hi_u32 v0, v8, v0 +; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v15 +; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v17 ; CGP-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v16, v15, v7 -; CGP-NEXT: v_add_i32_e32 v1, vcc, v12, v1 -; CGP-NEXT: v_mul_hi_u32 v12, v14, v7 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v16, v0 -; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v12 -; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v12, vcc, v16, v12 -; CGP-NEXT: v_mul_hi_u32 v7, v15, v7 +; CGP-NEXT: v_mul_lo_u32 v17, v8, v9 +; CGP-NEXT: v_add_i32_e32 v1, vcc, v15, v1 +; CGP-NEXT: v_mul_hi_u32 v15, v6, v9 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v17, v0 +; CGP-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v15 +; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v15, vcc, v17, v15 +; CGP-NEXT: v_mul_hi_u32 v9, v8, v9 ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; CGP-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v1, vcc, v12, v1 -; CGP-NEXT: v_add_i32_e32 v1, vcc, v7, v1 -; CGP-NEXT: v_add_i32_e32 v12, vcc, v14, v0 -; CGP-NEXT: v_addc_u32_e32 v14, vcc, v15, v1, vcc -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s6, v12, 0 -; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 -; CGP-NEXT: v_cndmask_b32_e32 v7, v9, v8, vcc -; CGP-NEXT: v_cndmask_b32_e32 v9, v10, v13, vcc -; CGP-NEXT: v_xor_b32_e32 v10, v7, v5 -; CGP-NEXT: v_mad_u64_u32 v[7:8], s[4:5], s6, v14, v[1:2] -; CGP-NEXT: v_xor_b32_e32 v1, v9, v5 -; CGP-NEXT: v_ashrrev_i32_e32 v9, 31, v3 -; CGP-NEXT: v_mad_u64_u32 v[7:8], s[4:5], -1, v12, v[7:8] -; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v9 -; CGP-NEXT: v_addc_u32_e32 v3, vcc, v3, v9, vcc -; CGP-NEXT: v_xor_b32_e32 v11, v2, v9 -; CGP-NEXT: v_mul_lo_u32 v2, v14, v0 -; CGP-NEXT: v_mul_lo_u32 v8, v12, v7 -; CGP-NEXT: v_xor_b32_e32 v13, v3, v9 -; CGP-NEXT: v_mul_hi_u32 v3, v12, v0 -; CGP-NEXT: v_mul_hi_u32 v0, v14, v0 -; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v8 -; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v1, vcc, v15, v1 +; CGP-NEXT: v_add_i32_e32 v1, vcc, v9, v1 +; CGP-NEXT: v_add_i32_e32 v9, vcc, v6, v0 +; CGP-NEXT: v_addc_u32_e32 v8, vcc, v8, v1, vcc +; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v9, 0 +; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; CGP-NEXT: v_cndmask_b32_e32 v6, v11, v10, vcc +; CGP-NEXT: v_xor_b32_e32 v11, v6, v7 +; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v5, v8, v[1:2] +; CGP-NEXT: v_cndmask_b32_e32 v10, v12, v16, vcc +; CGP-NEXT: v_xor_b32_e32 v1, v10, v7 +; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], -1, v9, v[5:6] +; CGP-NEXT: v_ashrrev_i32_e32 v10, 31, v3 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v10 +; CGP-NEXT: v_addc_u32_e32 v3, vcc, v3, v10, vcc +; CGP-NEXT: v_xor_b32_e32 v12, v2, v10 +; CGP-NEXT: v_mul_lo_u32 v2, v8, v0 +; CGP-NEXT: v_mul_lo_u32 v6, v9, v5 +; CGP-NEXT: v_xor_b32_e32 v14, v3, v10 +; CGP-NEXT: v_mul_hi_u32 v3, v9, v0 +; CGP-NEXT: v_mul_hi_u32 v0, v8, v0 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v6 +; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; CGP-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 
v3, v14, v7 -; CGP-NEXT: v_add_i32_e32 v2, vcc, v8, v2 -; CGP-NEXT: v_mul_hi_u32 v8, v12, v7 +; CGP-NEXT: v_mul_lo_u32 v3, v8, v5 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v6, v2 +; CGP-NEXT: v_mul_hi_u32 v6, v9, v5 ; CGP-NEXT: v_add_i32_e32 v0, vcc, v3, v0 ; CGP-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v8 -; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v8 -; CGP-NEXT: v_mul_hi_u32 v7, v14, v7 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v6 +; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v6 +; CGP-NEXT: v_mul_hi_u32 v5, v8, v5 ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; CGP-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; CGP-NEXT: v_add_i32_e32 v2, vcc, v7, v2 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v12, v0 -; CGP-NEXT: v_addc_u32_e32 v2, vcc, v14, v2, vcc -; CGP-NEXT: v_mul_lo_u32 v7, v13, v3 -; CGP-NEXT: v_mul_lo_u32 v8, v11, v2 -; CGP-NEXT: v_sub_i32_e32 v0, vcc, v10, v5 -; CGP-NEXT: v_subb_u32_e32 v1, vcc, v1, v5, vcc -; CGP-NEXT: v_mul_hi_u32 v5, v11, v3 -; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v8 -; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v7, v5 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v5, v2 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v9, v0 +; CGP-NEXT: v_addc_u32_e32 v2, vcc, v8, v2, vcc +; CGP-NEXT: v_mul_lo_u32 v5, v14, v3 +; CGP-NEXT: v_mul_lo_u32 v6, v12, v2 +; CGP-NEXT: v_sub_i32_e32 v0, vcc, v11, v7 +; CGP-NEXT: v_subb_u32_e32 v1, vcc, v1, v7, vcc +; CGP-NEXT: v_mul_hi_u32 v7, v12, v3 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v6 +; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v7 ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v7, v13, v2 -; CGP-NEXT: v_mul_hi_u32 v3, v13, v3 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v8, v5 -; CGP-NEXT: v_mul_hi_u32 v8, v11, v2 +; CGP-NEXT: v_mul_lo_u32 v7, v14, v2 +; CGP-NEXT: v_mul_hi_u32 v3, v14, v3 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v5 +; CGP-NEXT: v_mul_hi_u32 v6, v12, v2 ; CGP-NEXT: v_add_i32_e32 v3, vcc, v7, v3 ; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v8 -; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v8 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v6 +; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v6, vcc, v7, v6 ; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v5 -; CGP-NEXT: v_mul_hi_u32 v8, v13, v2 -; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], s7, v3, 0 +; CGP-NEXT: v_mul_hi_u32 v7, v14, v2 +; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v3, 0 ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v5 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v7, v5 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v8, v5 -; CGP-NEXT: v_mad_u64_u32 v[7:8], s[4:5], s7, v5, v[3:4] -; CGP-NEXT: v_sub_i32_e32 v2, vcc, v11, v2 -; CGP-NEXT: v_sub_i32_e64 v5, s[4:5], v13, v7 -; CGP-NEXT: v_subb_u32_e64 v3, s[4:5], v13, v7, vcc +; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v5, v[3:4] +; CGP-NEXT: v_sub_i32_e32 v2, vcc, v12, v2 +; CGP-NEXT: v_subb_u32_e64 v3, s[4:5], v14, v5, vcc +; CGP-NEXT: v_sub_i32_e64 v5, s[4:5], v14, v5 ; CGP-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc -; CGP-NEXT: v_sub_i32_e32 v8, vcc, v2, v4 +; CGP-NEXT: v_sub_i32_e32 v7, vcc, v2, v4 ; CGP-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v7, v4 +; CGP-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc +; CGP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5 ; CGP-NEXT: 
v_cmp_ge_u32_e64 s[4:5], v2, v4
-; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v8, v4
-; CGP-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5]
+; CGP-NEXT: v_cndmask_b32_e32 v8, v13, v8, vcc
+; CGP-NEXT: v_sub_i32_e32 v4, vcc, v7, v4
+; CGP-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5]
; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v3
-; CGP-NEXT: v_cndmask_b32_e64 v10, 0, -1, vcc
-; CGP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5
-; CGP-NEXT: v_cndmask_b32_e64 v7, v6, v7, s[4:5]
-; CGP-NEXT: v_cndmask_b32_e32 v6, v6, v10, vcc
-; CGP-NEXT: v_sub_i32_e32 v4, vcc, v8, v4
-; CGP-NEXT: v_subbrev_u32_e32 v10, vcc, 0, v5, vcc
+; CGP-NEXT: v_subbrev_u32_e32 v9, vcc, 0, v5, vcc
+; CGP-NEXT: v_cndmask_b32_e64 v6, v13, v6, s[4:5]
+; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8
+; CGP-NEXT: v_cndmask_b32_e32 v4, v7, v4, vcc
+; CGP-NEXT: v_cndmask_b32_e32 v5, v5, v9, vcc
; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6
-; CGP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc
-; CGP-NEXT: v_cndmask_b32_e32 v5, v5, v10, vcc
-; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7
; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
; CGP-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc
-; CGP-NEXT: v_xor_b32_e32 v2, v2, v9
-; CGP-NEXT: v_xor_b32_e32 v3, v3, v9
-; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v9
-; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v9, vcc
+; CGP-NEXT: v_xor_b32_e32 v2, v2, v10
+; CGP-NEXT: v_xor_b32_e32 v3, v3, v10
+; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v10
+; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v10, vcc
; CGP-NEXT: s_setpc_b64 s[30:31]
%result = srem <2 x i64> %num, <i64 1235195, i64 1235195>
ret <2 x i64> %result
@@ -2426,15 +2400,15 @@
; CHECK-NEXT: v_mul_hi_u32 v1, v0, v1
; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v1
; CHECK-NEXT: v_mul_hi_u32 v0, v3, v0
+; CHECK-NEXT: v_mov_b32_e32 v1, 0
; CHECK-NEXT: v_mul_lo_u32 v0, v0, v5
; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v3, v0
-; CHECK-NEXT: v_sub_i32_e32 v1, vcc, v0, v5
+; CHECK-NEXT: v_sub_i32_e32 v2, vcc, v0, v5
; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v0, v5
-; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
-; CHECK-NEXT: v_sub_i32_e32 v1, vcc, v0, v5
+; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; CHECK-NEXT: v_sub_i32_e32 v2, vcc, v0, v5
; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v0, v5
-; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
-; CHECK-NEXT: v_mov_b32_e32 v1, 0
+; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; CHECK-NEXT: s_or_b64 exec, exec, s[4:5]
; CHECK-NEXT: s_setpc_b64 s[30:31]
%shl.y = shl i64 4096, %y
@@ -2875,15 +2849,15 @@
; CGP-NEXT: v_mul_hi_u32 v1, v0, v1
; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v1
; CGP-NEXT: v_mul_hi_u32 v0, v8, v0
+; CGP-NEXT: v_mov_b32_e32 v1, 0
; CGP-NEXT: v_mul_lo_u32 v0, v0, v2
; CGP-NEXT: v_sub_i32_e32 v0, vcc, v8, v0
-; CGP-NEXT: v_sub_i32_e32 v1, vcc, v0, v2
+; CGP-NEXT: v_sub_i32_e32 v3, vcc, v0, v2
; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2
-; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
-; CGP-NEXT: v_sub_i32_e32 v1, vcc, v0, v2
+; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
+; CGP-NEXT: v_sub_i32_e32 v3, vcc, v0, v2
; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2
-; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
-; CGP-NEXT: v_mov_b32_e32 v1, 0
+; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
; CGP-NEXT: .LBB8_4:
; CGP-NEXT: s_or_b64 exec, exec, s[4:5]
; CGP-NEXT: v_or_b32_e32 v3, v7, v10
@@ -3043,15 +3017,15 @@
; CGP-NEXT: v_mul_hi_u32 v3, v2, v3
; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v3
; CGP-NEXT: v_mul_hi_u32 v2, v5, v2
+; CGP-NEXT: v_mov_b32_e32 v3, 0
; CGP-NEXT: v_mul_lo_u32 v2, v2, v9
; CGP-NEXT: v_sub_i32_e32 v2, vcc, v5, v2
-; CGP-NEXT: v_sub_i32_e32 v3, vcc, v2, v9
+; CGP-NEXT: v_sub_i32_e32 v4, vcc, v2, v9
; CGP-NEXT: 
v_cmp_ge_u32_e32 vcc, v2, v9
-; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc
-; CGP-NEXT: v_sub_i32_e32 v3, vcc, v2, v9
+; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
+; CGP-NEXT: v_sub_i32_e32 v4, vcc, v2, v9
; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v2, v9
-; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc
-; CGP-NEXT: v_mov_b32_e32 v3, 0
+; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
; CGP-NEXT: s_or_b64 exec, exec, s[4:5]
; CGP-NEXT: s_setpc_b64 s[30:31]
%shl.y = shl <2 x i64> <i64 4096, i64 4096>, %y
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll
@@ -233,23 +233,21 @@
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_lshrrev_b32_e32 v2, 8, v0
; GFX6-NEXT: v_lshlrev_b32_e32 v0, 24, v0
-; GFX6-NEXT: s_brev_b32 s4, -2
; GFX6-NEXT: v_max_i32_e32 v4, -1, v0
; GFX6-NEXT: v_lshrrev_b32_e32 v3, 8, v1
; GFX6-NEXT: v_lshlrev_b32_e32 v1, 24, v1
-; GFX6-NEXT: s_brev_b32 s5, 1
-; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, s4, v4
+; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, 0x7fffffff, v4
; GFX6-NEXT: v_min_i32_e32 v5, -1, v0
-; GFX6-NEXT: v_subrev_i32_e32 v5, vcc, s5, v5
+; GFX6-NEXT: v_subrev_i32_e32 v5, vcc, 0x80000000, v5
; GFX6-NEXT: v_max_i32_e32 v1, v4, v1
; GFX6-NEXT: v_min_i32_e32 v1, v1, v5
; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v1
; GFX6-NEXT: v_lshlrev_b32_e32 v1, 24, v2
; GFX6-NEXT: v_lshlrev_b32_e32 v2, 24, v3
; GFX6-NEXT: v_max_i32_e32 v3, -1, v1
-; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s4, v3
+; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, 0x7fffffff, v3
; GFX6-NEXT: v_min_i32_e32 v4, -1, v1
-; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, s5, v4
+; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, 0x80000000, v4
; GFX6-NEXT: v_max_i32_e32 v2, v3, v2
; GFX6-NEXT: v_min_i32_e32 v2, v2, v4
; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v2
@@ -302,8 +300,8 @@
; GFX9-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1]
; GFX9-NEXT: v_pk_sub_i16 v0, v0, v1 clamp
; GFX9-NEXT: v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1]
-; GFX9-NEXT: s_movk_i32 s4, 0xff
-; GFX9-NEXT: v_and_b32_sdwa v1, v0, s4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT: v_mov_b32_e32 v1, 0xff
+; GFX9-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
@@ -315,14 +313,14 @@
; GFX10-NEXT: v_lshrrev_b32_e32 v3, 8, v1
; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX10-NEXT: s_movk_i32 s4, 0xff
; GFX10-NEXT: v_lshl_or_b32 v0, v2, 16, v0
; GFX10-NEXT: v_lshl_or_b32 v1, v3, 16, v1
; GFX10-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1]
; GFX10-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1]
; GFX10-NEXT: v_pk_sub_i16 v0, v0, v1 clamp
+; GFX10-NEXT: v_mov_b32_e32 v1, 0xff
; GFX10-NEXT: v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1]
-; GFX10-NEXT: v_and_b32_sdwa v1, v0, s4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
@@ -444,8 +442,8 @@
; GFX9-NEXT: v_mov_b32_e32 v0, s1
; GFX9-NEXT: v_pk_sub_i16 v0, s0, v0 clamp
; GFX9-NEXT: v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1]
-; GFX9-NEXT: s_movk_i32 
s0, 0xff -; GFX9-NEXT: v_and_b32_sdwa v1, v0, s0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v1, 0xff +; GFX9-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_readfirstlane_b32 s0, v0 ; GFX9-NEXT: ; return to shader part epilog @@ -464,10 +462,10 @@ ; GFX10-NEXT: s_lshl_b32 s3, s3, 8 ; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s2 ; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s3 +; GFX10-NEXT: v_mov_b32_e32 v1, 0xff ; GFX10-NEXT: v_pk_sub_i16 v0, s0, s1 clamp -; GFX10-NEXT: s_movk_i32 s0, 0xff ; GFX10-NEXT: v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1] -; GFX10-NEXT: v_and_b32_sdwa v1, v0, s0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-NEXT: ; return to shader part epilog @@ -510,47 +508,45 @@ ; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v0 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 24, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 24, v0 -; GFX6-NEXT: s_brev_b32 s4, -2 -; GFX6-NEXT: v_max_i32_e32 v8, -1, v0 +; GFX6-NEXT: v_max_i32_e32 v10, -1, v0 ; GFX6-NEXT: v_lshrrev_b32_e32 v5, 8, v1 ; GFX6-NEXT: v_lshrrev_b32_e32 v6, 16, v1 ; GFX6-NEXT: v_lshrrev_b32_e32 v7, 24, v1 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 24, v1 -; GFX6-NEXT: s_brev_b32 s5, 1 -; GFX6-NEXT: v_subrev_i32_e32 v8, vcc, s4, v8 -; GFX6-NEXT: v_min_i32_e32 v10, -1, v0 -; GFX6-NEXT: v_subrev_i32_e32 v10, vcc, s5, v10 -; GFX6-NEXT: v_max_i32_e32 v1, v8, v1 -; GFX6-NEXT: v_min_i32_e32 v1, v1, v10 +; GFX6-NEXT: v_bfrev_b32_e32 v9, 1 +; GFX6-NEXT: v_subrev_i32_e32 v10, vcc, 0x7fffffff, v10 +; GFX6-NEXT: v_min_i32_e32 v11, -1, v0 +; GFX6-NEXT: v_sub_i32_e32 v11, vcc, v11, v9 +; GFX6-NEXT: v_max_i32_e32 v1, v10, v1 +; GFX6-NEXT: v_min_i32_e32 v1, v1, v11 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v1 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 24, v2 +; GFX6-NEXT: v_bfrev_b32_e32 v8, -2 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 24, v5 ; GFX6-NEXT: v_max_i32_e32 v5, -1, v1 -; GFX6-NEXT: v_subrev_i32_e32 v5, vcc, s4, v5 -; GFX6-NEXT: v_min_i32_e32 v8, -1, v1 -; GFX6-NEXT: v_subrev_i32_e32 v8, vcc, s5, v8 +; GFX6-NEXT: v_sub_i32_e32 v5, vcc, v5, v8 +; GFX6-NEXT: v_min_i32_e32 v10, -1, v1 +; GFX6-NEXT: v_sub_i32_e32 v10, vcc, v10, v9 ; GFX6-NEXT: v_max_i32_e32 v2, v5, v2 -; GFX6-NEXT: v_min_i32_e32 v2, v2, v8 +; GFX6-NEXT: v_min_i32_e32 v2, v2, v10 ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 24, v3 -; GFX6-NEXT: v_bfrev_b32_e32 v9, -2 ; GFX6-NEXT: v_max_i32_e32 v5, -1, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 24, v6 -; GFX6-NEXT: v_sub_i32_e32 v5, vcc, v5, v9 +; GFX6-NEXT: v_sub_i32_e32 v5, vcc, v5, v8 ; GFX6-NEXT: v_min_i32_e32 v6, -1, v2 -; GFX6-NEXT: v_subrev_i32_e32 v6, vcc, s5, v6 +; GFX6-NEXT: v_sub_i32_e32 v6, vcc, v6, v9 ; GFX6-NEXT: v_max_i32_e32 v3, v5, v3 ; GFX6-NEXT: v_min_i32_e32 v3, v3, v6 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v3 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 24, v4 ; GFX6-NEXT: v_max_i32_e32 v5, -1, v3 -; GFX6-NEXT: v_bfrev_b32_e32 v11, 1 ; GFX6-NEXT: v_ashrrev_i32_e32 v1, 24, v1 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 24, v7 -; GFX6-NEXT: v_sub_i32_e32 v5, vcc, v5, v9 +; GFX6-NEXT: v_sub_i32_e32 v5, vcc, v5, v8 ; GFX6-NEXT: v_min_i32_e32 v6, -1, 
v3 ; GFX6-NEXT: v_ashrrev_i32_e32 v0, 24, v0 -; GFX6-NEXT: v_sub_i32_e32 v6, vcc, v6, v11 +; GFX6-NEXT: v_sub_i32_e32 v6, vcc, v6, v9 ; GFX6-NEXT: v_max_i32_e32 v4, v5, v4 ; GFX6-NEXT: v_and_b32_e32 v1, 0xff, v1 ; GFX6-NEXT: v_ashrrev_i32_e32 v2, 24, v2 @@ -643,11 +639,11 @@ ; GFX9-NEXT: v_pk_sub_i16 v2, v2, v3 clamp ; GFX9-NEXT: v_pk_sub_i16 v0, v0, v1 clamp ; GFX9-NEXT: v_pk_ashrrev_i16 v1, 8, v2 op_sel_hi:[0,1] -; GFX9-NEXT: v_mov_b32_e32 v2, 8 +; GFX9-NEXT: v_mov_b32_e32 v3, 8 ; GFX9-NEXT: v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1] -; GFX9-NEXT: s_movk_i32 s4, 0xff -; GFX9-NEXT: v_lshlrev_b32_sdwa v2, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_and_or_b32 v1, v1, s4, v2 +; GFX9-NEXT: v_mov_b32_e32 v2, 0xff +; GFX9-NEXT: v_lshlrev_b32_sdwa v3, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_and_or_b32 v1, v1, v2, v3 ; GFX9-NEXT: v_and_b32_e32 v2, 0xff, v0 ; GFX9-NEXT: v_mov_b32_e32 v3, 24 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 @@ -682,7 +678,7 @@ ; GFX10-NEXT: v_lshlrev_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_and_b32_e32 v3, 0xff, v0 ; GFX10-NEXT: v_lshlrev_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_and_or_b32 v1, v2, 0xff, v1 +; GFX10-NEXT: v_and_or_b32 v1, 0xff, v2, v1 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 ; GFX10-NEXT: v_or3_b32 v0, v1, v2, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -715,7 +711,7 @@ ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 24, v0 -; GFX11-NEXT: v_and_or_b32 v1, v1, 0xff, v2 +; GFX11-NEXT: v_and_or_b32 v1, 0xff, v1, v2 ; GFX11-NEXT: v_or3_b32 v0, v1, v3, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %lhs = bitcast i32 %lhs.arg to <4 x i8> @@ -873,46 +869,46 @@ ; ; GFX9-LABEL: s_ssubsat_v4i8: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_lshr_b32 s3, s0, 8 +; GFX9-NEXT: s_lshr_b32 s2, s0, 8 +; GFX9-NEXT: s_lshr_b32 s3, s0, 16 +; GFX9-NEXT: s_lshr_b32 s4, s0, 24 +; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s2 +; GFX9-NEXT: s_pack_ll_b32_b16 s2, s3, s4 ; GFX9-NEXT: s_lshr_b32 s4, s0, 16 -; GFX9-NEXT: s_lshr_b32 s6, s0, 24 -; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s3 -; GFX9-NEXT: s_pack_ll_b32_b16 s3, s4, s6 -; GFX9-NEXT: s_lshr_b32 s6, s0, 16 ; GFX9-NEXT: s_lshl_b32 s0, s0, 0x80008 -; GFX9-NEXT: s_lshl_b32 s6, s6, 8 -; GFX9-NEXT: s_lshr_b32 s7, s1, 8 -; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s6 -; GFX9-NEXT: s_lshr_b32 s6, s3, 16 -; GFX9-NEXT: s_lshr_b32 s8, s1, 16 -; GFX9-NEXT: s_lshr_b32 s9, s1, 24 -; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s7 -; GFX9-NEXT: s_lshl_b32 s3, s3, 0x80008 -; GFX9-NEXT: s_lshl_b32 s6, s6, 8 -; GFX9-NEXT: s_pack_ll_b32_b16 s3, s3, s6 +; GFX9-NEXT: s_lshl_b32 s4, s4, 8 +; GFX9-NEXT: s_lshr_b32 s5, s1, 8 +; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s4 +; GFX9-NEXT: s_lshr_b32 s4, s2, 16 ; GFX9-NEXT: s_lshr_b32 s6, s1, 16 -; GFX9-NEXT: s_pack_ll_b32_b16 s4, s8, s9 +; GFX9-NEXT: s_lshr_b32 s7, s1, 24 +; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s5 +; GFX9-NEXT: s_lshl_b32 s2, s2, 0x80008 +; GFX9-NEXT: s_lshl_b32 s4, s4, 8 +; GFX9-NEXT: s_pack_ll_b32_b16 s2, s2, s4 +; GFX9-NEXT: s_lshr_b32 s4, s1, 16 +; GFX9-NEXT: s_pack_ll_b32_b16 s3, s6, s7 ; GFX9-NEXT: s_lshl_b32 s1, s1, 0x80008 -; GFX9-NEXT: s_lshl_b32 s6, s6, 8 -; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s6 -; GFX9-NEXT: s_lshr_b32 s6, s4, 16 -; GFX9-NEXT: s_lshl_b32 s4, s4, 0x80008 -; GFX9-NEXT: s_lshl_b32 s6, s6, 8 -; GFX9-NEXT: 
s_pack_ll_b32_b16 s4, s4, s6 +; GFX9-NEXT: s_lshl_b32 s4, s4, 8 +; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s4 +; GFX9-NEXT: s_lshr_b32 s4, s3, 16 +; GFX9-NEXT: s_lshl_b32 s3, s3, 0x80008 +; GFX9-NEXT: s_lshl_b32 s4, s4, 8 +; GFX9-NEXT: s_pack_ll_b32_b16 s3, s3, s4 ; GFX9-NEXT: v_mov_b32_e32 v0, s1 ; GFX9-NEXT: v_pk_sub_i16 v0, s0, v0 clamp -; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: s_mov_b32 s2, 8 -; GFX9-NEXT: v_pk_sub_i16 v1, s3, v1 clamp +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_pk_sub_i16 v1, s2, v1 clamp ; GFX9-NEXT: v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1] +; GFX9-NEXT: v_mov_b32_e32 v3, 8 ; GFX9-NEXT: v_pk_ashrrev_i16 v1, 8, v1 op_sel_hi:[0,1] -; GFX9-NEXT: s_movk_i32 s0, 0xff -; GFX9-NEXT: v_lshlrev_b32_sdwa v2, s2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: s_mov_b32 s5, 24 -; GFX9-NEXT: v_and_or_b32 v0, v0, s0, v2 +; GFX9-NEXT: v_mov_b32_e32 v2, 0xff +; GFX9-NEXT: v_lshlrev_b32_sdwa v3, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_and_or_b32 v0, v0, v2, v3 ; GFX9-NEXT: v_and_b32_e32 v2, 0xff, v1 +; GFX9-NEXT: v_mov_b32_e32 v3, 24 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX9-NEXT: v_lshlrev_b32_sdwa v1, s5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_lshlrev_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX9-NEXT: v_or3_b32 v0, v0, v2, v1 ; GFX9-NEXT: v_readfirstlane_b32 s0, v0 ; GFX9-NEXT: ; return to shader part epilog @@ -947,14 +943,14 @@ ; GFX10-NEXT: s_pack_ll_b32_b16 s3, s3, s5 ; GFX10-NEXT: v_pk_sub_i16 v0, s0, s1 clamp ; GFX10-NEXT: v_pk_sub_i16 v1, s2, s3 clamp -; GFX10-NEXT: s_mov_b32 s0, 8 +; GFX10-NEXT: v_mov_b32_e32 v2, 8 +; GFX10-NEXT: v_mov_b32_e32 v4, 24 ; GFX10-NEXT: v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1] ; GFX10-NEXT: v_pk_ashrrev_i16 v1, 8, v1 op_sel_hi:[0,1] -; GFX10-NEXT: v_lshlrev_b32_sdwa v2, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_and_b32_e32 v3, 0xff, v1 -; GFX10-NEXT: s_mov_b32 s0, 24 -; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_and_or_b32 v0, v0, 0xff, v2 +; GFX10-NEXT: v_lshlrev_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX10-NEXT: v_and_or_b32 v0, 0xff, v0, v2 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 ; GFX10-NEXT: v_or3_b32 v0, v0, v2, v1 ; GFX10-NEXT: v_readfirstlane_b32 s0, v0 @@ -994,7 +990,7 @@ ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v1 ; GFX11-NEXT: v_bfe_u32 v1, v1, 16, 8 -; GFX11-NEXT: v_and_or_b32 v0, v0, 0xff, v2 +; GFX11-NEXT: v_and_or_b32 v0, 0xff, v0, v2 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 24, v1 ; GFX11-NEXT: v_or3_b32 v0, v0, v2, v1 @@ -1273,19 +1269,17 @@ ; GFX6-LABEL: v_ssubsat_v2i32: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_brev_b32 s4, -2 ; GFX6-NEXT: v_max_i32_e32 v4, -1, v0 -; GFX6-NEXT: s_brev_b32 s5, 1 -; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, s4, v4 +; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, 0x7fffffff, v4 ; GFX6-NEXT: v_min_i32_e32 v5, -1, v0 -; GFX6-NEXT: v_subrev_i32_e32 v5, vcc, s5, v5 +; GFX6-NEXT: v_subrev_i32_e32 v5, vcc, 0x80000000, v5 ; GFX6-NEXT: v_max_i32_e32 v2, v4, v2 ; GFX6-NEXT: v_min_i32_e32 v2, v2, v5 ; 
GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 ; GFX6-NEXT: v_max_i32_e32 v2, -1, v1 -; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s4, v2 +; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, 0x7fffffff, v2 ; GFX6-NEXT: v_min_i32_e32 v4, -1, v1 -; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, s5, v4 +; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, 0x80000000, v4 ; GFX6-NEXT: v_max_i32_e32 v2, v2, v3 ; GFX6-NEXT: v_min_i32_e32 v2, v2, v4 ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v2 @@ -1294,19 +1288,17 @@ ; GFX8-LABEL: v_ssubsat_v2i32: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_brev_b32 s4, -2 ; GFX8-NEXT: v_max_i32_e32 v4, -1, v0 -; GFX8-NEXT: s_brev_b32 s5, 1 -; GFX8-NEXT: v_subrev_u32_e32 v4, vcc, s4, v4 +; GFX8-NEXT: v_subrev_u32_e32 v4, vcc, 0x7fffffff, v4 ; GFX8-NEXT: v_min_i32_e32 v5, -1, v0 -; GFX8-NEXT: v_subrev_u32_e32 v5, vcc, s5, v5 +; GFX8-NEXT: v_subrev_u32_e32 v5, vcc, 0x80000000, v5 ; GFX8-NEXT: v_max_i32_e32 v2, v4, v2 ; GFX8-NEXT: v_min_i32_e32 v2, v2, v5 ; GFX8-NEXT: v_sub_u32_e32 v0, vcc, v0, v2 ; GFX8-NEXT: v_max_i32_e32 v2, -1, v1 -; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, s4, v2 +; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, 0x7fffffff, v2 ; GFX8-NEXT: v_min_i32_e32 v4, -1, v1 -; GFX8-NEXT: v_subrev_u32_e32 v4, vcc, s5, v4 +; GFX8-NEXT: v_subrev_u32_e32 v4, vcc, 0x80000000, v4 ; GFX8-NEXT: v_max_i32_e32 v2, v2, v3 ; GFX8-NEXT: v_min_i32_e32 v2, v2, v4 ; GFX8-NEXT: v_sub_u32_e32 v1, vcc, v1, v2 @@ -1392,26 +1384,25 @@ ; GFX6-LABEL: v_ssubsat_v3i32: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_brev_b32 s4, -2 -; GFX6-NEXT: v_max_i32_e32 v6, -1, v0 -; GFX6-NEXT: s_brev_b32 s5, 1 -; GFX6-NEXT: v_subrev_i32_e32 v6, vcc, s4, v6 -; GFX6-NEXT: v_min_i32_e32 v7, -1, v0 -; GFX6-NEXT: v_subrev_i32_e32 v7, vcc, s5, v7 -; GFX6-NEXT: v_max_i32_e32 v3, v6, v3 -; GFX6-NEXT: v_min_i32_e32 v3, v3, v7 +; GFX6-NEXT: v_max_i32_e32 v7, -1, v0 +; GFX6-NEXT: v_subrev_i32_e32 v7, vcc, 0x7fffffff, v7 +; GFX6-NEXT: v_min_i32_e32 v8, -1, v0 +; GFX6-NEXT: v_subrev_i32_e32 v8, vcc, 0x80000000, v8 +; GFX6-NEXT: v_max_i32_e32 v3, v7, v3 +; GFX6-NEXT: v_min_i32_e32 v3, v3, v8 +; GFX6-NEXT: v_bfrev_b32_e32 v6, -2 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v3 ; GFX6-NEXT: v_max_i32_e32 v3, -1, v1 -; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s4, v3 +; GFX6-NEXT: v_sub_i32_e32 v3, vcc, v3, v6 ; GFX6-NEXT: v_min_i32_e32 v6, -1, v1 -; GFX6-NEXT: v_subrev_i32_e32 v6, vcc, s5, v6 +; GFX6-NEXT: v_subrev_i32_e32 v6, vcc, 0x80000000, v6 ; GFX6-NEXT: v_max_i32_e32 v3, v3, v4 ; GFX6-NEXT: v_min_i32_e32 v3, v3, v6 ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v3 ; GFX6-NEXT: v_max_i32_e32 v3, -1, v2 -; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s4, v3 +; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, 0x7fffffff, v3 ; GFX6-NEXT: v_min_i32_e32 v4, -1, v2 -; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, s5, v4 +; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, 0x80000000, v4 ; GFX6-NEXT: v_max_i32_e32 v3, v3, v5 ; GFX6-NEXT: v_min_i32_e32 v3, v3, v4 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v3 @@ -1420,26 +1411,25 @@ ; GFX8-LABEL: v_ssubsat_v3i32: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_brev_b32 s4, -2 -; GFX8-NEXT: v_max_i32_e32 v6, -1, v0 -; GFX8-NEXT: s_brev_b32 s5, 1 -; GFX8-NEXT: v_subrev_u32_e32 v6, vcc, s4, v6 -; GFX8-NEXT: v_min_i32_e32 v7, -1, v0 -; GFX8-NEXT: v_subrev_u32_e32 v7, vcc, s5, v7 -; GFX8-NEXT: v_max_i32_e32 v3, v6, v3 -; GFX8-NEXT: v_min_i32_e32 v3, v3, v7 +; GFX8-NEXT: v_max_i32_e32 v7, -1, v0 +; GFX8-NEXT: v_subrev_u32_e32 v7, vcc, 0x7fffffff, v7 +; GFX8-NEXT: 
v_min_i32_e32 v8, -1, v0 +; GFX8-NEXT: v_subrev_u32_e32 v8, vcc, 0x80000000, v8 +; GFX8-NEXT: v_max_i32_e32 v3, v7, v3 +; GFX8-NEXT: v_min_i32_e32 v3, v3, v8 +; GFX8-NEXT: v_bfrev_b32_e32 v6, -2 ; GFX8-NEXT: v_sub_u32_e32 v0, vcc, v0, v3 ; GFX8-NEXT: v_max_i32_e32 v3, -1, v1 -; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, s4, v3 +; GFX8-NEXT: v_sub_u32_e32 v3, vcc, v3, v6 ; GFX8-NEXT: v_min_i32_e32 v6, -1, v1 -; GFX8-NEXT: v_subrev_u32_e32 v6, vcc, s5, v6 +; GFX8-NEXT: v_subrev_u32_e32 v6, vcc, 0x80000000, v6 ; GFX8-NEXT: v_max_i32_e32 v3, v3, v4 ; GFX8-NEXT: v_min_i32_e32 v3, v3, v6 ; GFX8-NEXT: v_sub_u32_e32 v1, vcc, v1, v3 ; GFX8-NEXT: v_max_i32_e32 v3, -1, v2 -; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, s4, v3 +; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, 0x7fffffff, v3 ; GFX8-NEXT: v_min_i32_e32 v4, -1, v2 -; GFX8-NEXT: v_subrev_u32_e32 v4, vcc, s5, v4 +; GFX8-NEXT: v_subrev_u32_e32 v4, vcc, 0x80000000, v4 ; GFX8-NEXT: v_max_i32_e32 v3, v3, v5 ; GFX8-NEXT: v_min_i32_e32 v3, v3, v4 ; GFX8-NEXT: v_sub_u32_e32 v2, vcc, v2, v3 @@ -1546,26 +1536,26 @@ ; GFX6-LABEL: v_ssubsat_v4i32: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_brev_b32 s4, -2 -; GFX6-NEXT: v_max_i32_e32 v8, -1, v0 -; GFX6-NEXT: s_brev_b32 s5, 1 -; GFX6-NEXT: v_subrev_i32_e32 v8, vcc, s4, v8 -; GFX6-NEXT: v_min_i32_e32 v9, -1, v0 -; GFX6-NEXT: v_subrev_i32_e32 v9, vcc, s5, v9 -; GFX6-NEXT: v_max_i32_e32 v4, v8, v4 +; GFX6-NEXT: v_max_i32_e32 v10, -1, v0 +; GFX6-NEXT: v_bfrev_b32_e32 v9, 1 +; GFX6-NEXT: v_subrev_i32_e32 v10, vcc, 0x7fffffff, v10 +; GFX6-NEXT: v_min_i32_e32 v11, -1, v0 +; GFX6-NEXT: v_sub_i32_e32 v9, vcc, v11, v9 +; GFX6-NEXT: v_max_i32_e32 v4, v10, v4 ; GFX6-NEXT: v_min_i32_e32 v4, v4, v9 +; GFX6-NEXT: v_bfrev_b32_e32 v8, -2 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v4 ; GFX6-NEXT: v_max_i32_e32 v4, -1, v1 -; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, s4, v4 -; GFX6-NEXT: v_min_i32_e32 v8, -1, v1 -; GFX6-NEXT: v_subrev_i32_e32 v8, vcc, s5, v8 +; GFX6-NEXT: v_sub_i32_e32 v4, vcc, v4, v8 +; GFX6-NEXT: v_min_i32_e32 v9, -1, v1 +; GFX6-NEXT: v_subrev_i32_e32 v9, vcc, 0x80000000, v9 ; GFX6-NEXT: v_max_i32_e32 v4, v4, v5 -; GFX6-NEXT: v_min_i32_e32 v4, v4, v8 +; GFX6-NEXT: v_min_i32_e32 v4, v4, v9 ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v4 ; GFX6-NEXT: v_max_i32_e32 v4, -1, v2 -; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, s4, v4 +; GFX6-NEXT: v_sub_i32_e32 v4, vcc, v4, v8 ; GFX6-NEXT: v_min_i32_e32 v5, -1, v2 -; GFX6-NEXT: v_subrev_i32_e32 v5, vcc, s5, v5 +; GFX6-NEXT: v_subrev_i32_e32 v5, vcc, 0x80000000, v5 ; GFX6-NEXT: v_max_i32_e32 v4, v4, v6 ; GFX6-NEXT: v_min_i32_e32 v4, v4, v5 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v4 @@ -1581,26 +1571,26 @@ ; GFX8-LABEL: v_ssubsat_v4i32: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_brev_b32 s4, -2 -; GFX8-NEXT: v_max_i32_e32 v8, -1, v0 -; GFX8-NEXT: s_brev_b32 s5, 1 -; GFX8-NEXT: v_subrev_u32_e32 v8, vcc, s4, v8 -; GFX8-NEXT: v_min_i32_e32 v9, -1, v0 -; GFX8-NEXT: v_subrev_u32_e32 v9, vcc, s5, v9 -; GFX8-NEXT: v_max_i32_e32 v4, v8, v4 +; GFX8-NEXT: v_max_i32_e32 v10, -1, v0 +; GFX8-NEXT: v_bfrev_b32_e32 v9, 1 +; GFX8-NEXT: v_subrev_u32_e32 v10, vcc, 0x7fffffff, v10 +; GFX8-NEXT: v_min_i32_e32 v11, -1, v0 +; GFX8-NEXT: v_sub_u32_e32 v9, vcc, v11, v9 +; GFX8-NEXT: v_max_i32_e32 v4, v10, v4 ; GFX8-NEXT: v_min_i32_e32 v4, v4, v9 +; GFX8-NEXT: v_bfrev_b32_e32 v8, -2 ; GFX8-NEXT: v_sub_u32_e32 v0, vcc, v0, v4 ; GFX8-NEXT: v_max_i32_e32 v4, -1, v1 -; GFX8-NEXT: v_subrev_u32_e32 v4, vcc, s4, v4 -; GFX8-NEXT: v_min_i32_e32 v8, -1, v1 
-; GFX8-NEXT: v_subrev_u32_e32 v8, vcc, s5, v8 +; GFX8-NEXT: v_sub_u32_e32 v4, vcc, v4, v8 +; GFX8-NEXT: v_min_i32_e32 v9, -1, v1 +; GFX8-NEXT: v_subrev_u32_e32 v9, vcc, 0x80000000, v9 ; GFX8-NEXT: v_max_i32_e32 v4, v4, v5 -; GFX8-NEXT: v_min_i32_e32 v4, v4, v8 +; GFX8-NEXT: v_min_i32_e32 v4, v4, v9 ; GFX8-NEXT: v_sub_u32_e32 v1, vcc, v1, v4 ; GFX8-NEXT: v_max_i32_e32 v4, -1, v2 -; GFX8-NEXT: v_subrev_u32_e32 v4, vcc, s4, v4 +; GFX8-NEXT: v_sub_u32_e32 v4, vcc, v4, v8 ; GFX8-NEXT: v_min_i32_e32 v5, -1, v2 -; GFX8-NEXT: v_subrev_u32_e32 v5, vcc, s5, v5 +; GFX8-NEXT: v_subrev_u32_e32 v5, vcc, 0x80000000, v5 ; GFX8-NEXT: v_max_i32_e32 v4, v4, v6 ; GFX8-NEXT: v_min_i32_e32 v4, v4, v5 ; GFX8-NEXT: v_sub_u32_e32 v2, vcc, v2, v4 @@ -1735,32 +1725,31 @@ ; GFX6-LABEL: v_ssubsat_v5i32: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_brev_b32 s4, -2 -; GFX6-NEXT: v_max_i32_e32 v10, -1, v0 -; GFX6-NEXT: s_brev_b32 s5, 1 -; GFX6-NEXT: v_subrev_i32_e32 v10, vcc, s4, v10 -; GFX6-NEXT: v_min_i32_e32 v12, -1, v0 -; GFX6-NEXT: v_subrev_i32_e32 v12, vcc, s5, v12 -; GFX6-NEXT: v_max_i32_e32 v5, v10, v5 -; GFX6-NEXT: v_min_i32_e32 v5, v5, v12 +; GFX6-NEXT: v_max_i32_e32 v12, -1, v0 +; GFX6-NEXT: v_bfrev_b32_e32 v11, 1 +; GFX6-NEXT: v_subrev_i32_e32 v12, vcc, 0x7fffffff, v12 +; GFX6-NEXT: v_min_i32_e32 v13, -1, v0 +; GFX6-NEXT: v_sub_i32_e32 v13, vcc, v13, v11 +; GFX6-NEXT: v_max_i32_e32 v5, v12, v5 +; GFX6-NEXT: v_min_i32_e32 v5, v5, v13 +; GFX6-NEXT: v_bfrev_b32_e32 v10, -2 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v5 ; GFX6-NEXT: v_max_i32_e32 v5, -1, v1 -; GFX6-NEXT: v_subrev_i32_e32 v5, vcc, s4, v5 -; GFX6-NEXT: v_min_i32_e32 v10, -1, v1 -; GFX6-NEXT: v_subrev_i32_e32 v10, vcc, s5, v10 +; GFX6-NEXT: v_sub_i32_e32 v5, vcc, v5, v10 +; GFX6-NEXT: v_min_i32_e32 v12, -1, v1 +; GFX6-NEXT: v_sub_i32_e32 v11, vcc, v12, v11 ; GFX6-NEXT: v_max_i32_e32 v5, v5, v6 -; GFX6-NEXT: v_min_i32_e32 v5, v5, v10 +; GFX6-NEXT: v_min_i32_e32 v5, v5, v11 ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v5 ; GFX6-NEXT: v_max_i32_e32 v5, -1, v2 -; GFX6-NEXT: v_subrev_i32_e32 v5, vcc, s4, v5 +; GFX6-NEXT: v_sub_i32_e32 v5, vcc, v5, v10 ; GFX6-NEXT: v_min_i32_e32 v6, -1, v2 -; GFX6-NEXT: v_subrev_i32_e32 v6, vcc, s5, v6 +; GFX6-NEXT: v_subrev_i32_e32 v6, vcc, 0x80000000, v6 ; GFX6-NEXT: v_max_i32_e32 v5, v5, v7 ; GFX6-NEXT: v_min_i32_e32 v5, v5, v6 -; GFX6-NEXT: v_bfrev_b32_e32 v11, -2 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v5 ; GFX6-NEXT: v_max_i32_e32 v5, -1, v3 -; GFX6-NEXT: v_sub_i32_e32 v5, vcc, v5, v11 +; GFX6-NEXT: v_sub_i32_e32 v5, vcc, v5, v10 ; GFX6-NEXT: v_min_i32_e32 v6, -1, v3 ; GFX6-NEXT: v_subrev_i32_e32 v6, vcc, 0x80000000, v6 ; GFX6-NEXT: v_max_i32_e32 v5, v5, v8 @@ -1778,32 +1767,31 @@ ; GFX8-LABEL: v_ssubsat_v5i32: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_brev_b32 s4, -2 -; GFX8-NEXT: v_max_i32_e32 v10, -1, v0 -; GFX8-NEXT: s_brev_b32 s5, 1 -; GFX8-NEXT: v_subrev_u32_e32 v10, vcc, s4, v10 -; GFX8-NEXT: v_min_i32_e32 v12, -1, v0 -; GFX8-NEXT: v_subrev_u32_e32 v12, vcc, s5, v12 -; GFX8-NEXT: v_max_i32_e32 v5, v10, v5 -; GFX8-NEXT: v_min_i32_e32 v5, v5, v12 +; GFX8-NEXT: v_max_i32_e32 v12, -1, v0 +; GFX8-NEXT: v_bfrev_b32_e32 v11, 1 +; GFX8-NEXT: v_subrev_u32_e32 v12, vcc, 0x7fffffff, v12 +; GFX8-NEXT: v_min_i32_e32 v13, -1, v0 +; GFX8-NEXT: v_sub_u32_e32 v13, vcc, v13, v11 +; GFX8-NEXT: v_max_i32_e32 v5, v12, v5 +; GFX8-NEXT: v_min_i32_e32 v5, v5, v13 +; GFX8-NEXT: v_bfrev_b32_e32 v10, -2 ; GFX8-NEXT: v_sub_u32_e32 v0, vcc, v0, v5 ; GFX8-NEXT: 
v_max_i32_e32 v5, -1, v1 -; GFX8-NEXT: v_subrev_u32_e32 v5, vcc, s4, v5 -; GFX8-NEXT: v_min_i32_e32 v10, -1, v1 -; GFX8-NEXT: v_subrev_u32_e32 v10, vcc, s5, v10 +; GFX8-NEXT: v_sub_u32_e32 v5, vcc, v5, v10 +; GFX8-NEXT: v_min_i32_e32 v12, -1, v1 +; GFX8-NEXT: v_sub_u32_e32 v11, vcc, v12, v11 ; GFX8-NEXT: v_max_i32_e32 v5, v5, v6 -; GFX8-NEXT: v_min_i32_e32 v5, v5, v10 +; GFX8-NEXT: v_min_i32_e32 v5, v5, v11 ; GFX8-NEXT: v_sub_u32_e32 v1, vcc, v1, v5 ; GFX8-NEXT: v_max_i32_e32 v5, -1, v2 -; GFX8-NEXT: v_subrev_u32_e32 v5, vcc, s4, v5 +; GFX8-NEXT: v_sub_u32_e32 v5, vcc, v5, v10 ; GFX8-NEXT: v_min_i32_e32 v6, -1, v2 -; GFX8-NEXT: v_subrev_u32_e32 v6, vcc, s5, v6 +; GFX8-NEXT: v_subrev_u32_e32 v6, vcc, 0x80000000, v6 ; GFX8-NEXT: v_max_i32_e32 v5, v5, v7 ; GFX8-NEXT: v_min_i32_e32 v5, v5, v6 -; GFX8-NEXT: v_bfrev_b32_e32 v11, -2 ; GFX8-NEXT: v_sub_u32_e32 v2, vcc, v2, v5 ; GFX8-NEXT: v_max_i32_e32 v5, -1, v3 -; GFX8-NEXT: v_sub_u32_e32 v5, vcc, v5, v11 +; GFX8-NEXT: v_sub_u32_e32 v5, vcc, v5, v10 ; GFX8-NEXT: v_min_i32_e32 v6, -1, v3 ; GFX8-NEXT: v_subrev_u32_e32 v6, vcc, 0x80000000, v6 ; GFX8-NEXT: v_max_i32_e32 v5, v5, v8 @@ -1961,246 +1949,242 @@ ; GFX6-LABEL: v_ssubsat_v16i32: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_brev_b32 s4, -2 -; GFX6-NEXT: v_max_i32_e32 v31, -1, v0 -; GFX6-NEXT: v_subrev_i32_e32 v31, vcc, s4, v31 -; GFX6-NEXT: v_max_i32_e32 v16, v31, v16 -; GFX6-NEXT: s_brev_b32 s5, 1 -; GFX6-NEXT: v_min_i32_e32 v31, -1, v0 -; GFX6-NEXT: v_subrev_i32_e32 v31, vcc, s5, v31 -; GFX6-NEXT: v_min_i32_e32 v16, v16, v31 +; GFX6-NEXT: v_bfrev_b32_e32 v31, -2 +; GFX6-NEXT: v_max_i32_e32 v32, -1, v0 +; GFX6-NEXT: v_sub_i32_e32 v32, vcc, v32, v31 +; GFX6-NEXT: v_max_i32_e32 v16, v32, v16 +; GFX6-NEXT: v_bfrev_b32_e32 v32, 1 +; GFX6-NEXT: v_min_i32_e32 v33, -1, v0 +; GFX6-NEXT: v_sub_i32_e32 v33, vcc, v33, v32 +; GFX6-NEXT: v_min_i32_e32 v16, v16, v33 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v16 ; GFX6-NEXT: v_max_i32_e32 v16, -1, v1 -; GFX6-NEXT: v_subrev_i32_e32 v16, vcc, s4, v16 +; GFX6-NEXT: v_sub_i32_e32 v16, vcc, v16, v31 ; GFX6-NEXT: v_max_i32_e32 v16, v16, v17 ; GFX6-NEXT: v_min_i32_e32 v17, -1, v1 -; GFX6-NEXT: v_subrev_i32_e32 v17, vcc, s5, v17 +; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v17, v32 ; GFX6-NEXT: v_min_i32_e32 v16, v16, v17 ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v16 ; GFX6-NEXT: v_max_i32_e32 v16, -1, v2 -; GFX6-NEXT: v_subrev_i32_e32 v16, vcc, s4, v16 +; GFX6-NEXT: v_sub_i32_e32 v16, vcc, v16, v31 ; GFX6-NEXT: v_min_i32_e32 v17, -1, v2 ; GFX6-NEXT: v_max_i32_e32 v16, v16, v18 -; GFX6-NEXT: v_subrev_i32_e32 v17, vcc, s5, v17 +; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v17, v32 ; GFX6-NEXT: v_min_i32_e32 v16, v16, v17 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v16 -; GFX6-NEXT: v_bfrev_b32_e32 v16, -2 -; GFX6-NEXT: v_max_i32_e32 v17, -1, v3 -; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v17, v16 -; GFX6-NEXT: v_max_i32_e32 v17, v17, v19 -; GFX6-NEXT: v_bfrev_b32_e32 v18, 1 -; GFX6-NEXT: v_min_i32_e32 v19, -1, v3 -; GFX6-NEXT: v_sub_i32_e32 v19, vcc, v19, v18 -; GFX6-NEXT: v_min_i32_e32 v17, v17, v19 -; GFX6-NEXT: v_sub_i32_e32 v3, vcc, v3, v17 -; GFX6-NEXT: v_max_i32_e32 v17, -1, v4 -; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v17, v16 -; GFX6-NEXT: v_min_i32_e32 v19, -1, v4 -; GFX6-NEXT: v_max_i32_e32 v17, v17, v20 -; GFX6-NEXT: v_sub_i32_e32 v19, vcc, v19, v18 -; GFX6-NEXT: v_min_i32_e32 v17, v17, v19 -; GFX6-NEXT: v_sub_i32_e32 v4, vcc, v4, v17 -; GFX6-NEXT: v_max_i32_e32 v17, -1, v5 -; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v17, v16 -; GFX6-NEXT: 
v_min_i32_e32 v19, -1, v5 -; GFX6-NEXT: v_max_i32_e32 v17, v17, v21 -; GFX6-NEXT: v_sub_i32_e32 v19, vcc, v19, v18 -; GFX6-NEXT: v_min_i32_e32 v17, v17, v19 -; GFX6-NEXT: v_sub_i32_e32 v5, vcc, v5, v17 -; GFX6-NEXT: v_max_i32_e32 v17, -1, v6 -; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v17, v16 -; GFX6-NEXT: v_min_i32_e32 v19, -1, v6 -; GFX6-NEXT: v_max_i32_e32 v17, v17, v22 -; GFX6-NEXT: v_sub_i32_e32 v19, vcc, v19, v18 -; GFX6-NEXT: v_min_i32_e32 v17, v17, v19 -; GFX6-NEXT: buffer_load_dword v19, off, s[0:3], s32 -; GFX6-NEXT: v_sub_i32_e32 v6, vcc, v6, v17 -; GFX6-NEXT: v_max_i32_e32 v17, -1, v7 -; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v17, v16 -; GFX6-NEXT: v_min_i32_e32 v20, -1, v7 -; GFX6-NEXT: v_max_i32_e32 v17, v17, v23 -; GFX6-NEXT: v_sub_i32_e32 v20, vcc, v20, v18 -; GFX6-NEXT: v_min_i32_e32 v17, v17, v20 -; GFX6-NEXT: v_sub_i32_e32 v7, vcc, v7, v17 -; GFX6-NEXT: v_max_i32_e32 v17, -1, v8 -; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v17, v16 -; GFX6-NEXT: v_min_i32_e32 v20, -1, v8 -; GFX6-NEXT: v_sub_i32_e32 v20, vcc, v20, v18 -; GFX6-NEXT: v_max_i32_e32 v17, v17, v24 -; GFX6-NEXT: v_min_i32_e32 v17, v17, v20 -; GFX6-NEXT: v_sub_i32_e32 v8, vcc, v8, v17 -; GFX6-NEXT: v_max_i32_e32 v17, -1, v9 -; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v17, v16 -; GFX6-NEXT: v_min_i32_e32 v20, -1, v9 -; GFX6-NEXT: v_sub_i32_e32 v20, vcc, v20, v18 -; GFX6-NEXT: v_max_i32_e32 v17, v17, v25 -; GFX6-NEXT: v_min_i32_e32 v17, v17, v20 -; GFX6-NEXT: v_sub_i32_e32 v9, vcc, v9, v17 -; GFX6-NEXT: v_max_i32_e32 v17, -1, v10 -; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v17, v16 -; GFX6-NEXT: v_min_i32_e32 v20, -1, v10 -; GFX6-NEXT: v_sub_i32_e32 v20, vcc, v20, v18 -; GFX6-NEXT: v_max_i32_e32 v17, v17, v26 -; GFX6-NEXT: v_min_i32_e32 v17, v17, v20 -; GFX6-NEXT: v_sub_i32_e32 v10, vcc, v10, v17 -; GFX6-NEXT: v_max_i32_e32 v17, -1, v11 -; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v17, v16 -; GFX6-NEXT: v_min_i32_e32 v20, -1, v11 -; GFX6-NEXT: v_sub_i32_e32 v20, vcc, v20, v18 -; GFX6-NEXT: v_max_i32_e32 v17, v17, v27 -; GFX6-NEXT: v_min_i32_e32 v17, v17, v20 -; GFX6-NEXT: v_sub_i32_e32 v11, vcc, v11, v17 -; GFX6-NEXT: v_max_i32_e32 v17, -1, v12 -; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v17, v16 -; GFX6-NEXT: v_min_i32_e32 v20, -1, v12 -; GFX6-NEXT: v_sub_i32_e32 v20, vcc, v20, v18 -; GFX6-NEXT: v_max_i32_e32 v17, v17, v28 -; GFX6-NEXT: v_min_i32_e32 v17, v17, v20 -; GFX6-NEXT: v_sub_i32_e32 v12, vcc, v12, v17 -; GFX6-NEXT: v_max_i32_e32 v17, -1, v13 -; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v17, v16 -; GFX6-NEXT: v_min_i32_e32 v20, -1, v13 -; GFX6-NEXT: v_sub_i32_e32 v20, vcc, v20, v18 -; GFX6-NEXT: v_max_i32_e32 v17, v17, v29 -; GFX6-NEXT: v_min_i32_e32 v17, v17, v20 -; GFX6-NEXT: v_sub_i32_e32 v13, vcc, v13, v17 -; GFX6-NEXT: v_max_i32_e32 v17, -1, v14 -; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v17, v16 -; GFX6-NEXT: v_min_i32_e32 v20, -1, v14 -; GFX6-NEXT: v_sub_i32_e32 v20, vcc, v20, v18 -; GFX6-NEXT: v_max_i32_e32 v17, v17, v30 -; GFX6-NEXT: v_min_i32_e32 v17, v17, v20 -; GFX6-NEXT: v_sub_i32_e32 v14, vcc, v14, v17 -; GFX6-NEXT: v_max_i32_e32 v17, -1, v15 -; GFX6-NEXT: v_sub_i32_e32 v16, vcc, v17, v16 -; GFX6-NEXT: v_min_i32_e32 v17, -1, v15 -; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v17, v18 -; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_max_i32_e32 v16, -1, v3 +; GFX6-NEXT: v_sub_i32_e32 v16, vcc, v16, v31 +; GFX6-NEXT: v_min_i32_e32 v17, -1, v3 ; GFX6-NEXT: v_max_i32_e32 v16, v16, v19 +; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v17, v32 +; GFX6-NEXT: v_min_i32_e32 v16, v16, v17 +; GFX6-NEXT: v_sub_i32_e32 v3, vcc, v3, v16 +; GFX6-NEXT: v_max_i32_e32 v16, -1, v4 
+; GFX6-NEXT: v_sub_i32_e32 v16, vcc, v16, v31 +; GFX6-NEXT: v_min_i32_e32 v17, -1, v4 +; GFX6-NEXT: v_max_i32_e32 v16, v16, v20 +; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v17, v32 +; GFX6-NEXT: v_min_i32_e32 v16, v16, v17 +; GFX6-NEXT: v_sub_i32_e32 v4, vcc, v4, v16 +; GFX6-NEXT: v_max_i32_e32 v16, -1, v5 +; GFX6-NEXT: v_sub_i32_e32 v16, vcc, v16, v31 +; GFX6-NEXT: v_min_i32_e32 v17, -1, v5 +; GFX6-NEXT: v_max_i32_e32 v16, v16, v21 +; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v17, v32 +; GFX6-NEXT: v_min_i32_e32 v16, v16, v17 +; GFX6-NEXT: v_sub_i32_e32 v5, vcc, v5, v16 +; GFX6-NEXT: v_max_i32_e32 v16, -1, v6 +; GFX6-NEXT: v_sub_i32_e32 v16, vcc, v16, v31 +; GFX6-NEXT: v_min_i32_e32 v17, -1, v6 +; GFX6-NEXT: v_max_i32_e32 v16, v16, v22 +; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v17, v32 ; GFX6-NEXT: v_min_i32_e32 v16, v16, v17 +; GFX6-NEXT: buffer_load_dword v17, off, s[0:3], s32 +; GFX6-NEXT: v_sub_i32_e32 v6, vcc, v6, v16 +; GFX6-NEXT: v_max_i32_e32 v16, -1, v7 +; GFX6-NEXT: v_sub_i32_e32 v16, vcc, v16, v31 +; GFX6-NEXT: v_min_i32_e32 v18, -1, v7 +; GFX6-NEXT: v_max_i32_e32 v16, v16, v23 +; GFX6-NEXT: v_sub_i32_e32 v18, vcc, v18, v32 +; GFX6-NEXT: v_min_i32_e32 v16, v16, v18 +; GFX6-NEXT: v_sub_i32_e32 v7, vcc, v7, v16 +; GFX6-NEXT: v_max_i32_e32 v16, -1, v8 +; GFX6-NEXT: v_sub_i32_e32 v16, vcc, v16, v31 +; GFX6-NEXT: v_min_i32_e32 v18, -1, v8 +; GFX6-NEXT: v_sub_i32_e32 v18, vcc, v18, v32 +; GFX6-NEXT: v_max_i32_e32 v16, v16, v24 +; GFX6-NEXT: v_min_i32_e32 v16, v16, v18 +; GFX6-NEXT: v_sub_i32_e32 v8, vcc, v8, v16 +; GFX6-NEXT: v_max_i32_e32 v16, -1, v9 +; GFX6-NEXT: v_sub_i32_e32 v16, vcc, v16, v31 +; GFX6-NEXT: v_min_i32_e32 v18, -1, v9 +; GFX6-NEXT: v_sub_i32_e32 v18, vcc, v18, v32 +; GFX6-NEXT: v_max_i32_e32 v16, v16, v25 +; GFX6-NEXT: v_min_i32_e32 v16, v16, v18 +; GFX6-NEXT: v_sub_i32_e32 v9, vcc, v9, v16 +; GFX6-NEXT: v_max_i32_e32 v16, -1, v10 +; GFX6-NEXT: v_sub_i32_e32 v16, vcc, v16, v31 +; GFX6-NEXT: v_min_i32_e32 v18, -1, v10 +; GFX6-NEXT: v_sub_i32_e32 v18, vcc, v18, v32 +; GFX6-NEXT: v_max_i32_e32 v16, v16, v26 +; GFX6-NEXT: v_min_i32_e32 v16, v16, v18 +; GFX6-NEXT: v_sub_i32_e32 v10, vcc, v10, v16 +; GFX6-NEXT: v_max_i32_e32 v16, -1, v11 +; GFX6-NEXT: v_sub_i32_e32 v16, vcc, v16, v31 +; GFX6-NEXT: v_min_i32_e32 v18, -1, v11 +; GFX6-NEXT: v_sub_i32_e32 v18, vcc, v18, v32 +; GFX6-NEXT: v_max_i32_e32 v16, v16, v27 +; GFX6-NEXT: v_min_i32_e32 v16, v16, v18 +; GFX6-NEXT: v_sub_i32_e32 v11, vcc, v11, v16 +; GFX6-NEXT: v_max_i32_e32 v16, -1, v12 +; GFX6-NEXT: v_sub_i32_e32 v16, vcc, v16, v31 +; GFX6-NEXT: v_min_i32_e32 v18, -1, v12 +; GFX6-NEXT: v_sub_i32_e32 v18, vcc, v18, v32 +; GFX6-NEXT: v_max_i32_e32 v16, v16, v28 +; GFX6-NEXT: v_min_i32_e32 v16, v16, v18 +; GFX6-NEXT: v_sub_i32_e32 v12, vcc, v12, v16 +; GFX6-NEXT: v_max_i32_e32 v16, -1, v13 +; GFX6-NEXT: v_sub_i32_e32 v16, vcc, v16, v31 +; GFX6-NEXT: v_min_i32_e32 v18, -1, v13 +; GFX6-NEXT: v_sub_i32_e32 v18, vcc, v18, v32 +; GFX6-NEXT: v_max_i32_e32 v16, v16, v29 +; GFX6-NEXT: v_min_i32_e32 v16, v16, v18 +; GFX6-NEXT: v_sub_i32_e32 v13, vcc, v13, v16 +; GFX6-NEXT: v_max_i32_e32 v16, -1, v14 +; GFX6-NEXT: v_sub_i32_e32 v16, vcc, v16, v31 +; GFX6-NEXT: v_min_i32_e32 v18, -1, v14 +; GFX6-NEXT: v_sub_i32_e32 v18, vcc, v18, v32 +; GFX6-NEXT: v_max_i32_e32 v16, v16, v30 +; GFX6-NEXT: v_min_i32_e32 v16, v16, v18 +; GFX6-NEXT: v_sub_i32_e32 v14, vcc, v14, v16 +; GFX6-NEXT: v_max_i32_e32 v16, -1, v15 +; GFX6-NEXT: v_sub_i32_e32 v16, vcc, v16, v31 +; GFX6-NEXT: v_min_i32_e32 v18, -1, v15 +; GFX6-NEXT: v_sub_i32_e32 v18, vcc, v18, v32 +; 
GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_max_i32_e32 v16, v16, v17 +; GFX6-NEXT: v_min_i32_e32 v16, v16, v18 ; GFX6-NEXT: v_sub_i32_e32 v15, vcc, v15, v16 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_ssubsat_v16i32: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_brev_b32 s4, -2 -; GFX8-NEXT: v_max_i32_e32 v31, -1, v0 -; GFX8-NEXT: v_subrev_u32_e32 v31, vcc, s4, v31 -; GFX8-NEXT: v_max_i32_e32 v16, v31, v16 -; GFX8-NEXT: s_brev_b32 s5, 1 -; GFX8-NEXT: v_min_i32_e32 v31, -1, v0 -; GFX8-NEXT: v_subrev_u32_e32 v31, vcc, s5, v31 -; GFX8-NEXT: v_min_i32_e32 v16, v16, v31 +; GFX8-NEXT: v_bfrev_b32_e32 v31, -2 +; GFX8-NEXT: v_max_i32_e32 v32, -1, v0 +; GFX8-NEXT: v_sub_u32_e32 v32, vcc, v32, v31 +; GFX8-NEXT: v_max_i32_e32 v16, v32, v16 +; GFX8-NEXT: v_bfrev_b32_e32 v32, 1 +; GFX8-NEXT: v_min_i32_e32 v33, -1, v0 +; GFX8-NEXT: v_sub_u32_e32 v33, vcc, v33, v32 +; GFX8-NEXT: v_min_i32_e32 v16, v16, v33 ; GFX8-NEXT: v_sub_u32_e32 v0, vcc, v0, v16 ; GFX8-NEXT: v_max_i32_e32 v16, -1, v1 -; GFX8-NEXT: v_subrev_u32_e32 v16, vcc, s4, v16 +; GFX8-NEXT: v_sub_u32_e32 v16, vcc, v16, v31 ; GFX8-NEXT: v_max_i32_e32 v16, v16, v17 ; GFX8-NEXT: v_min_i32_e32 v17, -1, v1 -; GFX8-NEXT: v_subrev_u32_e32 v17, vcc, s5, v17 +; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v17, v32 ; GFX8-NEXT: v_min_i32_e32 v16, v16, v17 ; GFX8-NEXT: v_sub_u32_e32 v1, vcc, v1, v16 ; GFX8-NEXT: v_max_i32_e32 v16, -1, v2 -; GFX8-NEXT: v_subrev_u32_e32 v16, vcc, s4, v16 +; GFX8-NEXT: v_sub_u32_e32 v16, vcc, v16, v31 ; GFX8-NEXT: v_min_i32_e32 v17, -1, v2 ; GFX8-NEXT: v_max_i32_e32 v16, v16, v18 -; GFX8-NEXT: v_subrev_u32_e32 v17, vcc, s5, v17 +; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v17, v32 ; GFX8-NEXT: v_min_i32_e32 v16, v16, v17 ; GFX8-NEXT: v_sub_u32_e32 v2, vcc, v2, v16 -; GFX8-NEXT: v_bfrev_b32_e32 v16, -2 -; GFX8-NEXT: v_max_i32_e32 v17, -1, v3 -; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v17, v16 -; GFX8-NEXT: v_max_i32_e32 v17, v17, v19 -; GFX8-NEXT: v_bfrev_b32_e32 v18, 1 -; GFX8-NEXT: v_min_i32_e32 v19, -1, v3 -; GFX8-NEXT: v_sub_u32_e32 v19, vcc, v19, v18 -; GFX8-NEXT: v_min_i32_e32 v17, v17, v19 -; GFX8-NEXT: v_sub_u32_e32 v3, vcc, v3, v17 -; GFX8-NEXT: v_max_i32_e32 v17, -1, v4 -; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v17, v16 -; GFX8-NEXT: v_min_i32_e32 v19, -1, v4 -; GFX8-NEXT: v_max_i32_e32 v17, v17, v20 -; GFX8-NEXT: v_sub_u32_e32 v19, vcc, v19, v18 -; GFX8-NEXT: v_min_i32_e32 v17, v17, v19 -; GFX8-NEXT: v_sub_u32_e32 v4, vcc, v4, v17 -; GFX8-NEXT: v_max_i32_e32 v17, -1, v5 -; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v17, v16 -; GFX8-NEXT: v_min_i32_e32 v19, -1, v5 -; GFX8-NEXT: v_max_i32_e32 v17, v17, v21 -; GFX8-NEXT: v_sub_u32_e32 v19, vcc, v19, v18 -; GFX8-NEXT: v_min_i32_e32 v17, v17, v19 -; GFX8-NEXT: v_sub_u32_e32 v5, vcc, v5, v17 -; GFX8-NEXT: v_max_i32_e32 v17, -1, v6 -; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v17, v16 -; GFX8-NEXT: v_min_i32_e32 v19, -1, v6 -; GFX8-NEXT: v_max_i32_e32 v17, v17, v22 -; GFX8-NEXT: v_sub_u32_e32 v19, vcc, v19, v18 -; GFX8-NEXT: v_min_i32_e32 v17, v17, v19 -; GFX8-NEXT: buffer_load_dword v19, off, s[0:3], s32 -; GFX8-NEXT: v_sub_u32_e32 v6, vcc, v6, v17 -; GFX8-NEXT: v_max_i32_e32 v17, -1, v7 -; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v17, v16 -; GFX8-NEXT: v_min_i32_e32 v20, -1, v7 -; GFX8-NEXT: v_max_i32_e32 v17, v17, v23 -; GFX8-NEXT: v_sub_u32_e32 v20, vcc, v20, v18 -; GFX8-NEXT: v_min_i32_e32 v17, v17, v20 -; GFX8-NEXT: v_sub_u32_e32 v7, vcc, v7, v17 -; GFX8-NEXT: v_max_i32_e32 v17, -1, v8 -; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v17, v16 -; GFX8-NEXT: v_min_i32_e32 
v20, -1, v8 -; GFX8-NEXT: v_sub_u32_e32 v20, vcc, v20, v18 -; GFX8-NEXT: v_max_i32_e32 v17, v17, v24 -; GFX8-NEXT: v_min_i32_e32 v17, v17, v20 -; GFX8-NEXT: v_sub_u32_e32 v8, vcc, v8, v17 -; GFX8-NEXT: v_max_i32_e32 v17, -1, v9 -; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v17, v16 -; GFX8-NEXT: v_min_i32_e32 v20, -1, v9 -; GFX8-NEXT: v_sub_u32_e32 v20, vcc, v20, v18 -; GFX8-NEXT: v_max_i32_e32 v17, v17, v25 -; GFX8-NEXT: v_min_i32_e32 v17, v17, v20 -; GFX8-NEXT: v_sub_u32_e32 v9, vcc, v9, v17 -; GFX8-NEXT: v_max_i32_e32 v17, -1, v10 -; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v17, v16 -; GFX8-NEXT: v_min_i32_e32 v20, -1, v10 -; GFX8-NEXT: v_sub_u32_e32 v20, vcc, v20, v18 -; GFX8-NEXT: v_max_i32_e32 v17, v17, v26 -; GFX8-NEXT: v_min_i32_e32 v17, v17, v20 -; GFX8-NEXT: v_sub_u32_e32 v10, vcc, v10, v17 -; GFX8-NEXT: v_max_i32_e32 v17, -1, v11 -; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v17, v16 -; GFX8-NEXT: v_min_i32_e32 v20, -1, v11 -; GFX8-NEXT: v_sub_u32_e32 v20, vcc, v20, v18 -; GFX8-NEXT: v_max_i32_e32 v17, v17, v27 -; GFX8-NEXT: v_min_i32_e32 v17, v17, v20 -; GFX8-NEXT: v_sub_u32_e32 v11, vcc, v11, v17 -; GFX8-NEXT: v_max_i32_e32 v17, -1, v12 -; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v17, v16 -; GFX8-NEXT: v_min_i32_e32 v20, -1, v12 -; GFX8-NEXT: v_sub_u32_e32 v20, vcc, v20, v18 -; GFX8-NEXT: v_max_i32_e32 v17, v17, v28 -; GFX8-NEXT: v_min_i32_e32 v17, v17, v20 -; GFX8-NEXT: v_sub_u32_e32 v12, vcc, v12, v17 -; GFX8-NEXT: v_max_i32_e32 v17, -1, v13 -; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v17, v16 -; GFX8-NEXT: v_min_i32_e32 v20, -1, v13 -; GFX8-NEXT: v_sub_u32_e32 v20, vcc, v20, v18 -; GFX8-NEXT: v_max_i32_e32 v17, v17, v29 -; GFX8-NEXT: v_min_i32_e32 v17, v17, v20 -; GFX8-NEXT: v_sub_u32_e32 v13, vcc, v13, v17 -; GFX8-NEXT: v_max_i32_e32 v17, -1, v14 -; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v17, v16 -; GFX8-NEXT: v_min_i32_e32 v20, -1, v14 -; GFX8-NEXT: v_sub_u32_e32 v20, vcc, v20, v18 -; GFX8-NEXT: v_max_i32_e32 v17, v17, v30 -; GFX8-NEXT: v_min_i32_e32 v17, v17, v20 -; GFX8-NEXT: v_sub_u32_e32 v14, vcc, v14, v17 -; GFX8-NEXT: v_max_i32_e32 v17, -1, v15 -; GFX8-NEXT: v_sub_u32_e32 v16, vcc, v17, v16 -; GFX8-NEXT: v_min_i32_e32 v17, -1, v15 -; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v17, v18 -; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_max_i32_e32 v16, -1, v3 +; GFX8-NEXT: v_sub_u32_e32 v16, vcc, v16, v31 +; GFX8-NEXT: v_min_i32_e32 v17, -1, v3 ; GFX8-NEXT: v_max_i32_e32 v16, v16, v19 +; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v17, v32 +; GFX8-NEXT: v_min_i32_e32 v16, v16, v17 +; GFX8-NEXT: v_sub_u32_e32 v3, vcc, v3, v16 +; GFX8-NEXT: v_max_i32_e32 v16, -1, v4 +; GFX8-NEXT: v_sub_u32_e32 v16, vcc, v16, v31 +; GFX8-NEXT: v_min_i32_e32 v17, -1, v4 +; GFX8-NEXT: v_max_i32_e32 v16, v16, v20 +; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v17, v32 +; GFX8-NEXT: v_min_i32_e32 v16, v16, v17 +; GFX8-NEXT: v_sub_u32_e32 v4, vcc, v4, v16 +; GFX8-NEXT: v_max_i32_e32 v16, -1, v5 +; GFX8-NEXT: v_sub_u32_e32 v16, vcc, v16, v31 +; GFX8-NEXT: v_min_i32_e32 v17, -1, v5 +; GFX8-NEXT: v_max_i32_e32 v16, v16, v21 +; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v17, v32 ; GFX8-NEXT: v_min_i32_e32 v16, v16, v17 +; GFX8-NEXT: v_sub_u32_e32 v5, vcc, v5, v16 +; GFX8-NEXT: v_max_i32_e32 v16, -1, v6 +; GFX8-NEXT: v_sub_u32_e32 v16, vcc, v16, v31 +; GFX8-NEXT: v_min_i32_e32 v17, -1, v6 +; GFX8-NEXT: v_max_i32_e32 v16, v16, v22 +; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v17, v32 +; GFX8-NEXT: v_min_i32_e32 v16, v16, v17 +; GFX8-NEXT: buffer_load_dword v17, off, s[0:3], s32 +; GFX8-NEXT: v_sub_u32_e32 v6, vcc, v6, v16 +; GFX8-NEXT: v_max_i32_e32 v16, -1, v7 +; GFX8-NEXT: 
v_sub_u32_e32 v16, vcc, v16, v31 +; GFX8-NEXT: v_min_i32_e32 v18, -1, v7 +; GFX8-NEXT: v_max_i32_e32 v16, v16, v23 +; GFX8-NEXT: v_sub_u32_e32 v18, vcc, v18, v32 +; GFX8-NEXT: v_min_i32_e32 v16, v16, v18 +; GFX8-NEXT: v_sub_u32_e32 v7, vcc, v7, v16 +; GFX8-NEXT: v_max_i32_e32 v16, -1, v8 +; GFX8-NEXT: v_sub_u32_e32 v16, vcc, v16, v31 +; GFX8-NEXT: v_min_i32_e32 v18, -1, v8 +; GFX8-NEXT: v_sub_u32_e32 v18, vcc, v18, v32 +; GFX8-NEXT: v_max_i32_e32 v16, v16, v24 +; GFX8-NEXT: v_min_i32_e32 v16, v16, v18 +; GFX8-NEXT: v_sub_u32_e32 v8, vcc, v8, v16 +; GFX8-NEXT: v_max_i32_e32 v16, -1, v9 +; GFX8-NEXT: v_sub_u32_e32 v16, vcc, v16, v31 +; GFX8-NEXT: v_min_i32_e32 v18, -1, v9 +; GFX8-NEXT: v_sub_u32_e32 v18, vcc, v18, v32 +; GFX8-NEXT: v_max_i32_e32 v16, v16, v25 +; GFX8-NEXT: v_min_i32_e32 v16, v16, v18 +; GFX8-NEXT: v_sub_u32_e32 v9, vcc, v9, v16 +; GFX8-NEXT: v_max_i32_e32 v16, -1, v10 +; GFX8-NEXT: v_sub_u32_e32 v16, vcc, v16, v31 +; GFX8-NEXT: v_min_i32_e32 v18, -1, v10 +; GFX8-NEXT: v_sub_u32_e32 v18, vcc, v18, v32 +; GFX8-NEXT: v_max_i32_e32 v16, v16, v26 +; GFX8-NEXT: v_min_i32_e32 v16, v16, v18 +; GFX8-NEXT: v_sub_u32_e32 v10, vcc, v10, v16 +; GFX8-NEXT: v_max_i32_e32 v16, -1, v11 +; GFX8-NEXT: v_sub_u32_e32 v16, vcc, v16, v31 +; GFX8-NEXT: v_min_i32_e32 v18, -1, v11 +; GFX8-NEXT: v_sub_u32_e32 v18, vcc, v18, v32 +; GFX8-NEXT: v_max_i32_e32 v16, v16, v27 +; GFX8-NEXT: v_min_i32_e32 v16, v16, v18 +; GFX8-NEXT: v_sub_u32_e32 v11, vcc, v11, v16 +; GFX8-NEXT: v_max_i32_e32 v16, -1, v12 +; GFX8-NEXT: v_sub_u32_e32 v16, vcc, v16, v31 +; GFX8-NEXT: v_min_i32_e32 v18, -1, v12 +; GFX8-NEXT: v_sub_u32_e32 v18, vcc, v18, v32 +; GFX8-NEXT: v_max_i32_e32 v16, v16, v28 +; GFX8-NEXT: v_min_i32_e32 v16, v16, v18 +; GFX8-NEXT: v_sub_u32_e32 v12, vcc, v12, v16 +; GFX8-NEXT: v_max_i32_e32 v16, -1, v13 +; GFX8-NEXT: v_sub_u32_e32 v16, vcc, v16, v31 +; GFX8-NEXT: v_min_i32_e32 v18, -1, v13 +; GFX8-NEXT: v_sub_u32_e32 v18, vcc, v18, v32 +; GFX8-NEXT: v_max_i32_e32 v16, v16, v29 +; GFX8-NEXT: v_min_i32_e32 v16, v16, v18 +; GFX8-NEXT: v_sub_u32_e32 v13, vcc, v13, v16 +; GFX8-NEXT: v_max_i32_e32 v16, -1, v14 +; GFX8-NEXT: v_sub_u32_e32 v16, vcc, v16, v31 +; GFX8-NEXT: v_min_i32_e32 v18, -1, v14 +; GFX8-NEXT: v_sub_u32_e32 v18, vcc, v18, v32 +; GFX8-NEXT: v_max_i32_e32 v16, v16, v30 +; GFX8-NEXT: v_min_i32_e32 v16, v16, v18 +; GFX8-NEXT: v_sub_u32_e32 v14, vcc, v14, v16 +; GFX8-NEXT: v_max_i32_e32 v16, -1, v15 +; GFX8-NEXT: v_sub_u32_e32 v16, vcc, v16, v31 +; GFX8-NEXT: v_min_i32_e32 v18, -1, v15 +; GFX8-NEXT: v_sub_u32_e32 v18, vcc, v18, v32 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_max_i32_e32 v16, v16, v17 +; GFX8-NEXT: v_min_i32_e32 v16, v16, v18 ; GFX8-NEXT: v_sub_u32_e32 v15, vcc, v15, v16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -2781,22 +2765,20 @@ ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: s_brev_b32 s4, -2 ; GFX6-NEXT: v_max_i32_e32 v4, -1, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: s_brev_b32 s5, 1 -; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, s4, v4 +; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, 0x7fffffff, v4 ; GFX6-NEXT: v_min_i32_e32 v5, -1, v0 -; GFX6-NEXT: v_subrev_i32_e32 v5, vcc, s5, v5 +; GFX6-NEXT: v_subrev_i32_e32 v5, vcc, 0x80000000, v5 ; GFX6-NEXT: v_max_i32_e32 v2, v4, v2 ; GFX6-NEXT: v_min_i32_e32 v2, v2, v5 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v3 ; GFX6-NEXT: v_max_i32_e32 v3, -1, v1 -; GFX6-NEXT: 
v_subrev_i32_e32 v3, vcc, s4, v3 +; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, 0x7fffffff, v3 ; GFX6-NEXT: v_min_i32_e32 v4, -1, v1 -; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, s5, v4 +; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, 0x80000000, v4 ; GFX6-NEXT: v_max_i32_e32 v2, v3, v2 ; GFX6-NEXT: v_min_i32_e32 v2, v2, v4 ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v2 @@ -2994,22 +2976,20 @@ ; GFX6-LABEL: ssubsat_v2i16_vs: ; GFX6: ; %bb.0: ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: s_brev_b32 s2, -2 ; GFX6-NEXT: v_max_i32_e32 v2, -1, v0 ; GFX6-NEXT: s_lshl_b32 s0, s0, 16 -; GFX6-NEXT: s_brev_b32 s3, 1 -; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s2, v2 +; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, 0x7fffffff, v2 ; GFX6-NEXT: v_min_i32_e32 v3, -1, v0 -; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s3, v3 +; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, 0x80000000, v3 ; GFX6-NEXT: v_max_i32_e32 v2, s0, v2 ; GFX6-NEXT: v_min_i32_e32 v2, v2, v3 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 ; GFX6-NEXT: v_max_i32_e32 v2, -1, v1 ; GFX6-NEXT: s_lshl_b32 s0, s1, 16 -; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s2, v2 +; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, 0x7fffffff, v2 ; GFX6-NEXT: v_min_i32_e32 v3, -1, v1 -; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s3, v3 +; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, 0x80000000, v3 ; GFX6-NEXT: v_max_i32_e32 v2, s0, v2 ; GFX6-NEXT: v_min_i32_e32 v2, v2, v3 ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v2 @@ -3072,42 +3052,40 @@ ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: s_brev_b32 s4, -2 -; GFX6-NEXT: v_max_i32_e32 v8, -1, v0 +; GFX6-NEXT: v_max_i32_e32 v10, -1, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: s_brev_b32 s5, 1 -; GFX6-NEXT: v_subrev_i32_e32 v8, vcc, s4, v8 -; GFX6-NEXT: v_min_i32_e32 v10, -1, v0 -; GFX6-NEXT: v_subrev_i32_e32 v10, vcc, s5, v10 -; GFX6-NEXT: v_max_i32_e32 v4, v8, v4 -; GFX6-NEXT: v_min_i32_e32 v4, v4, v10 +; GFX6-NEXT: v_bfrev_b32_e32 v9, 1 +; GFX6-NEXT: v_subrev_i32_e32 v10, vcc, 0x7fffffff, v10 +; GFX6-NEXT: v_min_i32_e32 v11, -1, v0 +; GFX6-NEXT: v_sub_i32_e32 v11, vcc, v11, v9 +; GFX6-NEXT: v_max_i32_e32 v4, v10, v4 +; GFX6-NEXT: v_min_i32_e32 v4, v4, v11 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_bfrev_b32_e32 v8, -2 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v5 ; GFX6-NEXT: v_max_i32_e32 v5, -1, v1 -; GFX6-NEXT: v_subrev_i32_e32 v5, vcc, s4, v5 -; GFX6-NEXT: v_min_i32_e32 v8, -1, v1 -; GFX6-NEXT: v_subrev_i32_e32 v8, vcc, s5, v8 +; GFX6-NEXT: v_sub_i32_e32 v5, vcc, v5, v8 +; GFX6-NEXT: v_min_i32_e32 v10, -1, v1 +; GFX6-NEXT: v_sub_i32_e32 v10, vcc, v10, v9 ; GFX6-NEXT: v_max_i32_e32 v4, v5, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_bfrev_b32_e32 v9, -2 -; GFX6-NEXT: v_min_i32_e32 v4, v4, v8 +; GFX6-NEXT: v_min_i32_e32 v4, v4, v10 ; GFX6-NEXT: v_max_i32_e32 v5, -1, v2 ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v6 -; GFX6-NEXT: v_sub_i32_e32 v5, vcc, v5, v9 +; GFX6-NEXT: v_sub_i32_e32 v5, vcc, v5, v8 ; GFX6-NEXT: v_min_i32_e32 v6, -1, v2 -; GFX6-NEXT: v_subrev_i32_e32 v6, vcc, s5, v6 +; GFX6-NEXT: v_sub_i32_e32 v6, vcc, v6, v9 ; GFX6-NEXT: v_max_i32_e32 v4, v5, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX6-NEXT: v_min_i32_e32 v4, v4, v6 ; GFX6-NEXT: v_max_i32_e32 v5, -1, v3 -; GFX6-NEXT: v_bfrev_b32_e32 v11, 1 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v7 -; GFX6-NEXT: v_sub_i32_e32 v5, vcc, v5, v9 +; 
GFX6-NEXT: v_sub_i32_e32 v5, vcc, v5, v8 ; GFX6-NEXT: v_min_i32_e32 v6, -1, v3 -; GFX6-NEXT: v_sub_i32_e32 v6, vcc, v6, v11 +; GFX6-NEXT: v_sub_i32_e32 v6, vcc, v6, v9 ; GFX6-NEXT: v_max_i32_e32 v4, v5, v4 ; GFX6-NEXT: v_ashrrev_i32_e32 v1, 16, v1 ; GFX6-NEXT: v_min_i32_e32 v4, v4, v6 @@ -3337,61 +3315,59 @@ ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: s_brev_b32 s4, -2 -; GFX6-NEXT: v_max_i32_e32 v12, -1, v0 +; GFX6-NEXT: v_bfrev_b32_e32 v12, -2 +; GFX6-NEXT: v_max_i32_e32 v14, -1, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX6-NEXT: s_brev_b32 s5, 1 -; GFX6-NEXT: v_subrev_i32_e32 v12, vcc, s4, v12 -; GFX6-NEXT: v_min_i32_e32 v14, -1, v0 -; GFX6-NEXT: v_subrev_i32_e32 v14, vcc, s5, v14 -; GFX6-NEXT: v_max_i32_e32 v6, v12, v6 -; GFX6-NEXT: v_min_i32_e32 v6, v6, v14 +; GFX6-NEXT: v_bfrev_b32_e32 v13, 1 +; GFX6-NEXT: v_sub_i32_e32 v14, vcc, v14, v12 +; GFX6-NEXT: v_min_i32_e32 v15, -1, v0 +; GFX6-NEXT: v_sub_i32_e32 v15, vcc, v15, v13 +; GFX6-NEXT: v_max_i32_e32 v6, v14, v6 +; GFX6-NEXT: v_min_i32_e32 v6, v6, v15 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v6 ; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v7 ; GFX6-NEXT: v_max_i32_e32 v7, -1, v1 -; GFX6-NEXT: v_subrev_i32_e32 v7, vcc, s4, v7 -; GFX6-NEXT: v_min_i32_e32 v12, -1, v1 -; GFX6-NEXT: v_subrev_i32_e32 v12, vcc, s5, v12 +; GFX6-NEXT: v_sub_i32_e32 v7, vcc, v7, v12 +; GFX6-NEXT: v_min_i32_e32 v14, -1, v1 +; GFX6-NEXT: v_sub_i32_e32 v14, vcc, v14, v13 ; GFX6-NEXT: v_max_i32_e32 v6, v7, v6 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_bfrev_b32_e32 v13, -2 -; GFX6-NEXT: v_min_i32_e32 v6, v6, v12 +; GFX6-NEXT: v_min_i32_e32 v6, v6, v14 ; GFX6-NEXT: v_max_i32_e32 v7, -1, v2 ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v6 ; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v8 -; GFX6-NEXT: v_sub_i32_e32 v7, vcc, v7, v13 +; GFX6-NEXT: v_sub_i32_e32 v7, vcc, v7, v12 ; GFX6-NEXT: v_min_i32_e32 v8, -1, v2 -; GFX6-NEXT: v_subrev_i32_e32 v8, vcc, s5, v8 +; GFX6-NEXT: v_sub_i32_e32 v8, vcc, v8, v13 ; GFX6-NEXT: v_max_i32_e32 v6, v7, v6 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX6-NEXT: v_min_i32_e32 v6, v6, v8 ; GFX6-NEXT: v_max_i32_e32 v7, -1, v3 -; GFX6-NEXT: v_bfrev_b32_e32 v15, 1 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v6 ; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v9 -; GFX6-NEXT: v_sub_i32_e32 v7, vcc, v7, v13 +; GFX6-NEXT: v_sub_i32_e32 v7, vcc, v7, v12 ; GFX6-NEXT: v_min_i32_e32 v8, -1, v3 -; GFX6-NEXT: v_sub_i32_e32 v8, vcc, v8, v15 +; GFX6-NEXT: v_sub_i32_e32 v8, vcc, v8, v13 ; GFX6-NEXT: v_max_i32_e32 v6, v7, v6 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX6-NEXT: v_min_i32_e32 v6, v6, v8 ; GFX6-NEXT: v_max_i32_e32 v7, -1, v4 ; GFX6-NEXT: v_sub_i32_e32 v3, vcc, v3, v6 ; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v10 -; GFX6-NEXT: v_sub_i32_e32 v7, vcc, v7, v13 +; GFX6-NEXT: v_sub_i32_e32 v7, vcc, v7, v12 ; GFX6-NEXT: v_min_i32_e32 v8, -1, v4 -; GFX6-NEXT: v_sub_i32_e32 v8, vcc, v8, v15 +; GFX6-NEXT: v_sub_i32_e32 v8, vcc, v8, v13 ; GFX6-NEXT: v_max_i32_e32 v6, v7, v6 ; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; GFX6-NEXT: v_min_i32_e32 v6, v6, v8 ; GFX6-NEXT: v_max_i32_e32 v7, -1, v5 ; GFX6-NEXT: v_sub_i32_e32 v4, vcc, v4, v6 ; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v11 -; GFX6-NEXT: v_sub_i32_e32 v7, vcc, v7, v13 +; GFX6-NEXT: v_sub_i32_e32 v7, vcc, v7, v12 ; GFX6-NEXT: v_min_i32_e32 v8, -1, v5 ; GFX6-NEXT: v_ashrrev_i32_e32 v1, 16, v1 -; GFX6-NEXT: v_sub_i32_e32 v8, vcc, v8, v15 +; GFX6-NEXT: v_sub_i32_e32 v8, vcc, v8, v13 ; 
GFX6-NEXT: v_max_i32_e32 v6, v7, v6 ; GFX6-NEXT: v_ashrrev_i32_e32 v0, 16, v0 ; GFX6-NEXT: v_min_i32_e32 v6, v6, v8 @@ -3692,69 +3668,67 @@ ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: s_brev_b32 s4, -2 -; GFX6-NEXT: v_max_i32_e32 v16, -1, v0 +; GFX6-NEXT: v_bfrev_b32_e32 v16, -2 +; GFX6-NEXT: v_max_i32_e32 v18, -1, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GFX6-NEXT: s_brev_b32 s5, 1 -; GFX6-NEXT: v_subrev_i32_e32 v16, vcc, s4, v16 -; GFX6-NEXT: v_min_i32_e32 v18, -1, v0 -; GFX6-NEXT: v_subrev_i32_e32 v18, vcc, s5, v18 -; GFX6-NEXT: v_max_i32_e32 v8, v16, v8 -; GFX6-NEXT: v_min_i32_e32 v8, v8, v18 +; GFX6-NEXT: v_bfrev_b32_e32 v17, 1 +; GFX6-NEXT: v_sub_i32_e32 v18, vcc, v18, v16 +; GFX6-NEXT: v_min_i32_e32 v19, -1, v0 +; GFX6-NEXT: v_sub_i32_e32 v19, vcc, v19, v17 +; GFX6-NEXT: v_max_i32_e32 v8, v18, v8 +; GFX6-NEXT: v_min_i32_e32 v8, v8, v19 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v8 ; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v9 ; GFX6-NEXT: v_max_i32_e32 v9, -1, v1 -; GFX6-NEXT: v_subrev_i32_e32 v9, vcc, s4, v9 -; GFX6-NEXT: v_min_i32_e32 v16, -1, v1 -; GFX6-NEXT: v_subrev_i32_e32 v16, vcc, s5, v16 +; GFX6-NEXT: v_sub_i32_e32 v9, vcc, v9, v16 +; GFX6-NEXT: v_min_i32_e32 v18, -1, v1 +; GFX6-NEXT: v_sub_i32_e32 v18, vcc, v18, v17 ; GFX6-NEXT: v_max_i32_e32 v8, v9, v8 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_bfrev_b32_e32 v17, -2 -; GFX6-NEXT: v_min_i32_e32 v8, v8, v16 +; GFX6-NEXT: v_min_i32_e32 v8, v8, v18 ; GFX6-NEXT: v_max_i32_e32 v9, -1, v2 ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v8 ; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v10 -; GFX6-NEXT: v_sub_i32_e32 v9, vcc, v9, v17 +; GFX6-NEXT: v_sub_i32_e32 v9, vcc, v9, v16 ; GFX6-NEXT: v_min_i32_e32 v10, -1, v2 -; GFX6-NEXT: v_subrev_i32_e32 v10, vcc, s5, v10 +; GFX6-NEXT: v_sub_i32_e32 v10, vcc, v10, v17 ; GFX6-NEXT: v_max_i32_e32 v8, v9, v8 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX6-NEXT: v_min_i32_e32 v8, v8, v10 ; GFX6-NEXT: v_max_i32_e32 v9, -1, v3 -; GFX6-NEXT: v_bfrev_b32_e32 v19, 1 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v8 ; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v11 -; GFX6-NEXT: v_sub_i32_e32 v9, vcc, v9, v17 +; GFX6-NEXT: v_sub_i32_e32 v9, vcc, v9, v16 ; GFX6-NEXT: v_min_i32_e32 v10, -1, v3 -; GFX6-NEXT: v_sub_i32_e32 v10, vcc, v10, v19 +; GFX6-NEXT: v_sub_i32_e32 v10, vcc, v10, v17 ; GFX6-NEXT: v_max_i32_e32 v8, v9, v8 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX6-NEXT: v_min_i32_e32 v8, v8, v10 ; GFX6-NEXT: v_max_i32_e32 v9, -1, v4 ; GFX6-NEXT: v_sub_i32_e32 v3, vcc, v3, v8 ; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v12 -; GFX6-NEXT: v_sub_i32_e32 v9, vcc, v9, v17 +; GFX6-NEXT: v_sub_i32_e32 v9, vcc, v9, v16 ; GFX6-NEXT: v_min_i32_e32 v10, -1, v4 -; GFX6-NEXT: v_sub_i32_e32 v10, vcc, v10, v19 +; GFX6-NEXT: v_sub_i32_e32 v10, vcc, v10, v17 ; GFX6-NEXT: v_max_i32_e32 v8, v9, v8 ; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; GFX6-NEXT: v_min_i32_e32 v8, v8, v10 ; GFX6-NEXT: v_max_i32_e32 v9, -1, v5 ; GFX6-NEXT: v_sub_i32_e32 v4, vcc, v4, v8 ; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v13 -; GFX6-NEXT: v_sub_i32_e32 v9, vcc, v9, v17 +; GFX6-NEXT: v_sub_i32_e32 v9, vcc, v9, v16 ; GFX6-NEXT: v_min_i32_e32 v10, -1, v5 -; GFX6-NEXT: v_sub_i32_e32 v10, vcc, v10, v19 +; GFX6-NEXT: v_sub_i32_e32 v10, vcc, v10, v17 ; GFX6-NEXT: v_max_i32_e32 v8, v9, v8 ; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; GFX6-NEXT: v_min_i32_e32 v8, v8, v10 ; GFX6-NEXT: v_max_i32_e32 v9, -1, v6 ; GFX6-NEXT: v_sub_i32_e32 v5, vcc, 
v5, v8 ; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v14 -; GFX6-NEXT: v_sub_i32_e32 v9, vcc, v9, v17 +; GFX6-NEXT: v_sub_i32_e32 v9, vcc, v9, v16 ; GFX6-NEXT: v_min_i32_e32 v10, -1, v6 -; GFX6-NEXT: v_sub_i32_e32 v10, vcc, v10, v19 +; GFX6-NEXT: v_sub_i32_e32 v10, vcc, v10, v17 ; GFX6-NEXT: v_max_i32_e32 v8, v9, v8 ; GFX6-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; GFX6-NEXT: v_min_i32_e32 v8, v8, v10 @@ -3762,10 +3736,10 @@ ; GFX6-NEXT: v_ashrrev_i32_e32 v1, 16, v1 ; GFX6-NEXT: v_sub_i32_e32 v6, vcc, v6, v8 ; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v15 -; GFX6-NEXT: v_sub_i32_e32 v9, vcc, v9, v17 +; GFX6-NEXT: v_sub_i32_e32 v9, vcc, v9, v16 ; GFX6-NEXT: v_min_i32_e32 v10, -1, v7 ; GFX6-NEXT: v_ashrrev_i32_e32 v0, 16, v0 -; GFX6-NEXT: v_sub_i32_e32 v10, vcc, v10, v19 +; GFX6-NEXT: v_sub_i32_e32 v10, vcc, v10, v17 ; GFX6-NEXT: v_max_i32_e32 v8, v9, v8 ; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX6-NEXT: v_ashrrev_i32_e32 v2, 16, v2 Index: llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll @@ -227,8 +227,8 @@ ; GFX9-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1] ; GFX9-NEXT: v_pk_add_u16 v0, v0, v1 clamp ; GFX9-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1] -; GFX9-NEXT: s_movk_i32 s4, 0xff -; GFX9-NEXT: v_and_b32_sdwa v1, v0, s4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v1, 0xff +; GFX9-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -240,14 +240,14 @@ ; GFX10-NEXT: v_lshrrev_b32_e32 v3, 8, v1 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX10-NEXT: s_movk_i32 s4, 0xff ; GFX10-NEXT: v_lshl_or_b32 v0, v2, 16, v0 ; GFX10-NEXT: v_lshl_or_b32 v1, v3, 16, v1 ; GFX10-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1] ; GFX10-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1] ; GFX10-NEXT: v_pk_add_u16 v0, v0, v1 clamp +; GFX10-NEXT: v_mov_b32_e32 v1, 0xff ; GFX10-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1] -; GFX10-NEXT: v_and_b32_sdwa v1, v0, s4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -334,8 +334,8 @@ ; GFX9-NEXT: v_mov_b32_e32 v0, s1 ; GFX9-NEXT: v_pk_add_u16 v0, s0, v0 clamp ; GFX9-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1] -; GFX9-NEXT: s_movk_i32 s0, 0xff -; GFX9-NEXT: v_and_b32_sdwa v1, v0, s0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v1, 0xff +; GFX9-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_readfirstlane_b32 s0, v0 ; GFX9-NEXT: ; return to shader part epilog @@ -354,10 +354,10 @@ ; GFX10-NEXT: s_lshl_b32 s3, s3, 8 ; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s2 ; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s3 +; GFX10-NEXT: v_mov_b32_e32 v1, 0xff ; GFX10-NEXT: v_pk_add_u16 v0, s0, s1 clamp -; GFX10-NEXT: s_movk_i32 s0, 0xff ; GFX10-NEXT: 
v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1] -; GFX10-NEXT: v_and_b32_sdwa v1, v0, s0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-NEXT: ; return to shader part epilog @@ -483,11 +483,11 @@ ; GFX9-NEXT: v_pk_add_u16 v2, v2, v3 clamp ; GFX9-NEXT: v_pk_add_u16 v0, v0, v1 clamp ; GFX9-NEXT: v_pk_lshrrev_b16 v1, 8, v2 op_sel_hi:[0,1] -; GFX9-NEXT: v_mov_b32_e32 v2, 8 +; GFX9-NEXT: v_mov_b32_e32 v3, 8 ; GFX9-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1] -; GFX9-NEXT: s_movk_i32 s4, 0xff -; GFX9-NEXT: v_lshlrev_b32_sdwa v2, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_and_or_b32 v1, v1, s4, v2 +; GFX9-NEXT: v_mov_b32_e32 v2, 0xff +; GFX9-NEXT: v_lshlrev_b32_sdwa v3, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_and_or_b32 v1, v1, v2, v3 ; GFX9-NEXT: v_and_b32_e32 v2, 0xff, v0 ; GFX9-NEXT: v_mov_b32_e32 v3, 24 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 @@ -522,7 +522,7 @@ ; GFX10-NEXT: v_lshlrev_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_and_b32_e32 v3, 0xff, v0 ; GFX10-NEXT: v_lshlrev_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_and_or_b32 v1, v2, 0xff, v1 +; GFX10-NEXT: v_and_or_b32 v1, 0xff, v2, v1 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 ; GFX10-NEXT: v_or3_b32 v0, v1, v2, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -555,7 +555,7 @@ ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 24, v0 -; GFX11-NEXT: v_and_or_b32 v1, v1, 0xff, v2 +; GFX11-NEXT: v_and_or_b32 v1, 0xff, v1, v2 ; GFX11-NEXT: v_or3_b32 v0, v1, v3, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %lhs = bitcast i32 %lhs.arg to <4 x i8> @@ -644,46 +644,46 @@ ; ; GFX9-LABEL: s_uaddsat_v4i8: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_lshr_b32 s3, s0, 8 +; GFX9-NEXT: s_lshr_b32 s2, s0, 8 +; GFX9-NEXT: s_lshr_b32 s3, s0, 16 +; GFX9-NEXT: s_lshr_b32 s4, s0, 24 +; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s2 +; GFX9-NEXT: s_pack_ll_b32_b16 s2, s3, s4 ; GFX9-NEXT: s_lshr_b32 s4, s0, 16 -; GFX9-NEXT: s_lshr_b32 s6, s0, 24 -; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s3 -; GFX9-NEXT: s_pack_ll_b32_b16 s3, s4, s6 -; GFX9-NEXT: s_lshr_b32 s6, s0, 16 ; GFX9-NEXT: s_lshl_b32 s0, s0, 0x80008 -; GFX9-NEXT: s_lshl_b32 s6, s6, 8 -; GFX9-NEXT: s_lshr_b32 s7, s1, 8 -; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s6 -; GFX9-NEXT: s_lshr_b32 s6, s3, 16 -; GFX9-NEXT: s_lshr_b32 s8, s1, 16 -; GFX9-NEXT: s_lshr_b32 s9, s1, 24 -; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s7 -; GFX9-NEXT: s_lshl_b32 s3, s3, 0x80008 -; GFX9-NEXT: s_lshl_b32 s6, s6, 8 -; GFX9-NEXT: s_pack_ll_b32_b16 s3, s3, s6 +; GFX9-NEXT: s_lshl_b32 s4, s4, 8 +; GFX9-NEXT: s_lshr_b32 s5, s1, 8 +; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s4 +; GFX9-NEXT: s_lshr_b32 s4, s2, 16 ; GFX9-NEXT: s_lshr_b32 s6, s1, 16 -; GFX9-NEXT: s_pack_ll_b32_b16 s4, s8, s9 +; GFX9-NEXT: s_lshr_b32 s7, s1, 24 +; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s5 +; GFX9-NEXT: s_lshl_b32 s2, s2, 0x80008 +; GFX9-NEXT: s_lshl_b32 s4, s4, 8 +; GFX9-NEXT: s_pack_ll_b32_b16 s2, s2, s4 +; GFX9-NEXT: s_lshr_b32 s4, s1, 16 +; GFX9-NEXT: s_pack_ll_b32_b16 s3, s6, s7 ; GFX9-NEXT: s_lshl_b32 s1, s1, 0x80008 -; GFX9-NEXT: 
s_lshl_b32 s6, s6, 8
-; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s6
-; GFX9-NEXT: s_lshr_b32 s6, s4, 16
-; GFX9-NEXT: s_lshl_b32 s4, s4, 0x80008
-; GFX9-NEXT: s_lshl_b32 s6, s6, 8
-; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s6
+; GFX9-NEXT: s_lshl_b32 s4, s4, 8
+; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s4
+; GFX9-NEXT: s_lshr_b32 s4, s3, 16
+; GFX9-NEXT: s_lshl_b32 s3, s3, 0x80008
+; GFX9-NEXT: s_lshl_b32 s4, s4, 8
+; GFX9-NEXT: s_pack_ll_b32_b16 s3, s3, s4
 ; GFX9-NEXT: v_mov_b32_e32 v0, s1
 ; GFX9-NEXT: v_pk_add_u16 v0, s0, v0 clamp
-; GFX9-NEXT: v_mov_b32_e32 v1, s4
-; GFX9-NEXT: s_mov_b32 s2, 8
-; GFX9-NEXT: v_pk_add_u16 v1, s3, v1 clamp
+; GFX9-NEXT: v_mov_b32_e32 v1, s3
+; GFX9-NEXT: v_pk_add_u16 v1, s2, v1 clamp
 ; GFX9-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1]
+; GFX9-NEXT: v_mov_b32_e32 v3, 8
 ; GFX9-NEXT: v_pk_lshrrev_b16 v1, 8, v1 op_sel_hi:[0,1]
-; GFX9-NEXT: s_movk_i32 s0, 0xff
-; GFX9-NEXT: v_lshlrev_b32_sdwa v2, s2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX9-NEXT: s_mov_b32 s5, 24
-; GFX9-NEXT: v_and_or_b32 v0, v0, s0, v2
+; GFX9-NEXT: v_mov_b32_e32 v2, 0xff
+; GFX9-NEXT: v_lshlrev_b32_sdwa v3, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX9-NEXT: v_and_or_b32 v0, v0, v2, v3
 ; GFX9-NEXT: v_and_b32_e32 v2, 0xff, v1
+; GFX9-NEXT: v_mov_b32_e32 v3, 24
 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX9-NEXT: v_lshlrev_b32_sdwa v1, s5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX9-NEXT: v_lshlrev_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX9-NEXT: v_or3_b32 v0, v0, v2, v1
 ; GFX9-NEXT: v_readfirstlane_b32 s0, v0
 ; GFX9-NEXT: ; return to shader part epilog
@@ -718,14 +718,14 @@
 ; GFX10-NEXT: s_pack_ll_b32_b16 s3, s3, s5
 ; GFX10-NEXT: v_pk_add_u16 v0, s0, s1 clamp
 ; GFX10-NEXT: v_pk_add_u16 v1, s2, s3 clamp
-; GFX10-NEXT: s_mov_b32 s0, 8
+; GFX10-NEXT: v_mov_b32_e32 v2, 8
+; GFX10-NEXT: v_mov_b32_e32 v4, 24
 ; GFX10-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1]
 ; GFX10-NEXT: v_pk_lshrrev_b16 v1, 8, v1 op_sel_hi:[0,1]
-; GFX10-NEXT: v_lshlrev_b32_sdwa v2, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX10-NEXT: v_and_b32_e32 v3, 0xff, v1
-; GFX10-NEXT: s_mov_b32 s0, 24
-; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX10-NEXT: v_and_or_b32 v0, v0, 0xff, v2
+; GFX10-NEXT: v_lshlrev_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX10-NEXT: v_and_or_b32 v0, 0xff, v0, v2
 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3
 ; GFX10-NEXT: v_or3_b32 v0, v0, v2, v1
 ; GFX10-NEXT: v_readfirstlane_b32 s0, v0
@@ -765,7 +765,7 @@
 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 8, v2
 ; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v1
 ; GFX11-NEXT: v_bfe_u32 v1, v1, 16, 8
-; GFX11-NEXT: v_and_or_b32 v0, v0, 0xff, v2
+; GFX11-NEXT: v_and_or_b32 v0, 0xff, v0, v2
 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3
 ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 24, v1
 ; GFX11-NEXT: v_or3_b32 v0, v0, v2, v1
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i32.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i32.ll
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i32.ll
@@ -242,15 +242,15 @@
 ; CHECK-LABEL: v_udiv_v2i32_oddk_denom:
 ; CHECK: ; %bb.0:
 ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: s_mov_b32 s4, 0xb2a50881
-; CHECK-NEXT: v_mul_hi_u32 v2, v0, s4
-; CHECK-NEXT: v_mul_hi_u32 v3, v1, s4
-; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v2
-; CHECK-NEXT: v_sub_i32_e32 v1, vcc, v1, v3
+; CHECK-NEXT: v_mov_b32_e32 v2, 0xb2a50881
+; CHECK-NEXT: v_mul_hi_u32 v3, v0, v2
+; CHECK-NEXT: v_mul_hi_u32 v2, v1, v2
+; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v3
+; CHECK-NEXT: v_sub_i32_e32 v1, vcc, v1, v2
 ; CHECK-NEXT: v_lshrrev_b32_e32 v0, 1, v0
 ; CHECK-NEXT: v_lshrrev_b32_e32 v1, 1, v1
-; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v2
-; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v3
+; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v3
+; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v2
 ; CHECK-NEXT: v_lshrrev_b32_e32 v0, 20, v0
 ; CHECK-NEXT: v_lshrrev_b32_e32 v1, 20, v1
 ; CHECK-NEXT: s_setpc_b64 s[30:31]
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll
@@ -156,24 +156,24 @@
 ; CHECK-NEXT: s_cbranch_execz .LBB0_2
 ; CHECK-NEXT: .LBB0_4:
 ; CHECK-NEXT: v_rcp_iflag_f32_e32 v0, v6
-; CHECK-NEXT: v_sub_i32_e32 v1, vcc, 0, v2
+; CHECK-NEXT: v_mov_b32_e32 v1, 0
+; CHECK-NEXT: v_sub_i32_e32 v3, vcc, 0, v2
 ; CHECK-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; CHECK-NEXT: v_cvt_u32_f32_e32 v0, v0
-; CHECK-NEXT: v_mul_lo_u32 v1, v1, v0
-; CHECK-NEXT: v_mul_hi_u32 v1, v0, v1
-; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v1
+; CHECK-NEXT: v_mul_lo_u32 v3, v3, v0
+; CHECK-NEXT: v_mul_hi_u32 v3, v0, v3
+; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v3
 ; CHECK-NEXT: v_mul_hi_u32 v0, v4, v0
-; CHECK-NEXT: v_mul_lo_u32 v1, v0, v2
-; CHECK-NEXT: v_add_i32_e32 v3, vcc, 1, v0
-; CHECK-NEXT: v_sub_i32_e32 v1, vcc, v4, v1
-; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v1, v2
-; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
-; CHECK-NEXT: v_sub_i32_e64 v3, s[4:5], v1, v2
-; CHECK-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
-; CHECK-NEXT: v_add_i32_e32 v3, vcc, 1, v0
-; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v1, v2
-; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
-; CHECK-NEXT: v_mov_b32_e32 v1, 0
+; CHECK-NEXT: v_mul_lo_u32 v3, v0, v2
+; CHECK-NEXT: v_add_i32_e32 v5, vcc, 1, v0
+; CHECK-NEXT: v_sub_i32_e32 v3, vcc, v4, v3
+; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v3, v2
+; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc
+; CHECK-NEXT: v_sub_i32_e64 v4, s[4:5], v3, v2
+; CHECK-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc
+; CHECK-NEXT: v_add_i32_e32 v4, vcc, 1, v0
+; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v3, v2
+; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
 ; CHECK-NEXT: s_or_b64 exec, exec, s[6:7]
 ; CHECK-NEXT: s_setpc_b64 s[30:31]
 %result = udiv i64 %num, %den
@@ -765,24 +765,24 @@
 ; CGP-NEXT: s_cbranch_execz .LBB2_4
 ; CGP-NEXT: ; %bb.3:
 ; CGP-NEXT: v_rcp_iflag_f32_e32 v0, v2
-; CGP-NEXT: v_sub_i32_e32 v1, vcc, 0, v4
+; CGP-NEXT: v_mov_b32_e32 v1, 0
+; CGP-NEXT: v_sub_i32_e32 v2, vcc, 0, v4
 ; CGP-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; CGP-NEXT: v_cvt_u32_f32_e32 v0, v0
-; CGP-NEXT: v_mul_lo_u32 v1, v1, v0
-; CGP-NEXT: v_mul_hi_u32 v1, v0, v1
-; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v1
+; CGP-NEXT: v_mul_lo_u32 v2, v2, v0
+; CGP-NEXT: v_mul_hi_u32 v2, v0, v2
+; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v2
 ; CGP-NEXT: v_mul_hi_u32 v0, v10, v0
-; CGP-NEXT: v_mul_lo_u32 v1, v0, v4
-; CGP-NEXT: v_add_i32_e32 v2, vcc, 1, v0
-; CGP-NEXT: v_sub_i32_e32 v1, vcc, v10, v1
-; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v4
-; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
-; CGP-NEXT: v_sub_i32_e64 v2, s[4:5], v1, v4
-; CGP-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
-; CGP-NEXT: v_add_i32_e32 v2, vcc, 1, v0
-; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v4
-; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
-; CGP-NEXT: v_mov_b32_e32 v1, 0
+; CGP-NEXT: v_mul_lo_u32 v2, v0, v4
+; CGP-NEXT: v_add_i32_e32 v3, vcc, 1, v0
+; CGP-NEXT: v_sub_i32_e32 v2, vcc, v10, v2
+; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v2, v4
+; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
+; CGP-NEXT: v_sub_i32_e64 v3, s[4:5], v2, v4
+; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc
+; CGP-NEXT: v_add_i32_e32 v3, vcc, 1, v0
+; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v2, v4
+; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
 ; CGP-NEXT: .LBB2_4:
 ; CGP-NEXT: s_or_b64 exec, exec, s[6:7]
 ; CGP-NEXT: v_or_b32_e32 v3, v9, v7
@@ -931,24 +931,24 @@
 ; CGP-NEXT: s_cbranch_execz .LBB2_6
 ; CGP-NEXT: .LBB2_8:
 ; CGP-NEXT: v_rcp_iflag_f32_e32 v2, v4
-; CGP-NEXT: v_sub_i32_e32 v3, vcc, 0, v6
+; CGP-NEXT: v_mov_b32_e32 v3, 0
+; CGP-NEXT: v_sub_i32_e32 v4, vcc, 0, v6
 ; CGP-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2
 ; CGP-NEXT: v_cvt_u32_f32_e32 v2, v2
-; CGP-NEXT: v_mul_lo_u32 v3, v3, v2
-; CGP-NEXT: v_mul_hi_u32 v3, v2, v3
-; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v3
+; CGP-NEXT: v_mul_lo_u32 v4, v4, v2
+; CGP-NEXT: v_mul_hi_u32 v4, v2, v4
+; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v4
 ; CGP-NEXT: v_mul_hi_u32 v2, v8, v2
-; CGP-NEXT: v_mul_lo_u32 v3, v2, v6
-; CGP-NEXT: v_add_i32_e32 v4, vcc, 1, v2
-; CGP-NEXT: v_sub_i32_e32 v3, vcc, v8, v3
-; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v3, v6
-; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
-; CGP-NEXT: v_sub_i32_e64 v4, s[4:5], v3, v6
-; CGP-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc
-; CGP-NEXT: v_add_i32_e32 v4, vcc, 1, v2
-; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v3, v6
-; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
-; CGP-NEXT: v_mov_b32_e32 v3, 0
+; CGP-NEXT: v_mul_lo_u32 v4, v2, v6
+; CGP-NEXT: v_add_i32_e32 v5, vcc, 1, v2
+; CGP-NEXT: v_sub_i32_e32 v4, vcc, v8, v4
+; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v4, v6
+; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc
+; CGP-NEXT: v_sub_i32_e64 v5, s[4:5], v4, v6
+; CGP-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc
+; CGP-NEXT: v_add_i32_e32 v5, vcc, 1, v2
+; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v4, v6
+; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc
 ; CGP-NEXT: s_or_b64 exec, exec, s[6:7]
 ; CGP-NEXT: s_setpc_b64 s[30:31]
 %result = udiv <2 x i64> %num, %den
@@ -980,28 +980,28 @@
 ; CHECK-LABEL: v_udiv_i64_oddk_denom:
 ; CHECK: ; %bb.0:
 ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: s_mov_b32 s4, 0x1fb03c31
-; CHECK-NEXT: s_mov_b32 s5, 0xd9528440
-; CHECK-NEXT: v_mul_lo_u32 v2, v1, s4
-; CHECK-NEXT: v_mul_lo_u32 v3, v0, s5
-; CHECK-NEXT: v_mul_hi_u32 v4, v0, s4
-; CHECK-NEXT: v_mul_lo_u32 v5, v1, s5
-; CHECK-NEXT: v_mul_hi_u32 v6, v1, s4
-; CHECK-NEXT: v_mul_hi_u32 v0, v0, s5
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3
-; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v6
-; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v4
-; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v0, vcc, v5, v0
+; CHECK-NEXT: v_mov_b32_e32 v2, 0x1fb03c31
+; CHECK-NEXT: v_mov_b32_e32 v3, 0xd9528440
+; CHECK-NEXT: v_mul_lo_u32 v4, v1, v2
+; CHECK-NEXT: v_mul_lo_u32 v5, v0, v3
+; CHECK-NEXT: v_mul_hi_u32 v6, v0, v2
+; CHECK-NEXT: v_mul_lo_u32 v7, v1, v3
+; CHECK-NEXT: v_mul_hi_u32 v2, v1, v2
+; CHECK-NEXT: v_mul_hi_u32 v0, v0, v3
+; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v5
+; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v2, vcc, v7, v2
+; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v6
 ; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v2
-; CHECK-NEXT: v_add_i32_e32 v3, vcc, v6, v4
-; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v2
+; CHECK-NEXT: v_add_i32_e32 v0, vcc, v2, v0
 ; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v2
-; CHECK-NEXT: v_mul_hi_u32 v1, v1, s5
+; CHECK-NEXT: v_add_i32_e32 v4, vcc, v5, v4
+; CHECK-NEXT: v_add_i32_e32 v2, vcc, v7, v2
+; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v4
+; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v4
+; CHECK-NEXT: v_mul_hi_u32 v1, v1, v3
 ; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v2
 ; CHECK-NEXT: v_lshr_b64 v[0:1], v[0:1], 20
 ; CHECK-NEXT: s_setpc_b64 s[30:31]
@@ -1013,50 +1013,50 @@
 ; CHECK-LABEL: v_udiv_v2i64_oddk_denom:
 ; CHECK: ; %bb.0:
 ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: s_mov_b32 s4, 0x1fb03c31
-; CHECK-NEXT: s_mov_b32 s5, 0xd9528440
-; CHECK-NEXT: v_mul_lo_u32 v4, v1, s4
-; CHECK-NEXT: v_mul_lo_u32 v5, v0, s5
-; CHECK-NEXT: v_mul_hi_u32 v6, v0, s4
-; CHECK-NEXT: v_mul_lo_u32 v7, v1, s5
-; CHECK-NEXT: v_mul_hi_u32 v8, v1, s4
-; CHECK-NEXT: v_mul_hi_u32 v0, v0, s5
-; CHECK-NEXT: v_mul_hi_u32 v1, v1, s5
-; CHECK-NEXT: v_mul_lo_u32 v9, v3, s4
-; CHECK-NEXT: v_mul_lo_u32 v10, v2, s5
-; CHECK-NEXT: v_mul_hi_u32 v11, v2, s4
-; CHECK-NEXT: v_mul_lo_u32 v12, v3, s5
-; CHECK-NEXT: v_mul_hi_u32 v13, v3, s4
-; CHECK-NEXT: v_mul_hi_u32 v2, v2, s5
-; CHECK-NEXT: v_mul_hi_u32 v3, v3, s5
-; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v5
+; CHECK-NEXT: v_mov_b32_e32 v4, 0x1fb03c31
+; CHECK-NEXT: v_mov_b32_e32 v5, 0xd9528440
+; CHECK-NEXT: v_mul_lo_u32 v6, v1, v4
+; CHECK-NEXT: v_mul_lo_u32 v7, v0, v5
+; CHECK-NEXT: v_mul_hi_u32 v8, v0, v4
+; CHECK-NEXT: v_mul_lo_u32 v9, v1, v5
+; CHECK-NEXT: v_mul_hi_u32 v10, v1, v4
+; CHECK-NEXT: v_mul_hi_u32 v0, v0, v5
+; CHECK-NEXT: v_mul_hi_u32 v1, v1, v5
+; CHECK-NEXT: v_mul_lo_u32 v11, v3, v4
+; CHECK-NEXT: v_mul_lo_u32 v12, v2, v5
+; CHECK-NEXT: v_mul_hi_u32 v13, v2, v4
+; CHECK-NEXT: v_mul_lo_u32 v14, v3, v5
+; CHECK-NEXT: v_mul_hi_u32 v4, v3, v4
+; CHECK-NEXT: v_mul_hi_u32 v2, v2, v5
+; CHECK-NEXT: v_mul_hi_u32 v3, v3, v5
+; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v7
+; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v7, vcc, v9, v10
+; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v10, vcc, v11, v12
+; CHECK-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v4, vcc, v14, v4
+; CHECK-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v8
 ; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v7, vcc, v7, v8
-; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v9, vcc, v9, v10
-; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v12, vcc, v12, v13
-; CHECK-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v6
-; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
 ; CHECK-NEXT: v_add_i32_e32 v0, vcc, v7, v0
-; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v7, vcc, v9, v11
 ; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, v12, v2
-; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v4, vcc, v5, v4
-; CHECK-NEXT: v_add_i32_e32 v5, vcc, v8, v6
-; CHECK-NEXT: v_add_i32_e32 v6, vcc, v10, v7
-; CHECK-NEXT: v_add_i32_e32 v7, vcc, v13, v9
-; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v4
+; CHECK-NEXT: v_add_i32_e32 v8, vcc, v10, v13
+; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v2, vcc, v4, v2
 ; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v6
-; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v4, vcc, v5, v4
-; CHECK-NEXT: v_add_i32_e32 v5, vcc, v7, v6
-; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v4
-; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v5
+; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v5
+; CHECK-NEXT: v_add_i32_e32 v6, vcc, v9, v7
+; CHECK-NEXT: v_add_i32_e32 v7, vcc, v11, v8
+; CHECK-NEXT: v_add_i32_e32 v4, vcc, v12, v4
+; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v5
+; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v7
+; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v5
+; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v7
+; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v5
+; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v4
 ; CHECK-NEXT: v_lshr_b64 v[0:1], v[0:1], 20
 ; CHECK-NEXT: v_lshr_b64 v[2:3], v[2:3], 20
 ; CHECK-NEXT: s_setpc_b64 s[30:31]
@@ -1218,24 +1218,24 @@
 ; CHECK-NEXT: s_cbranch_execz .LBB7_2
 ; CHECK-NEXT: .LBB7_4:
 ; CHECK-NEXT: v_rcp_iflag_f32_e32 v0, v2
-; CHECK-NEXT: v_sub_i32_e32 v1, vcc, 0, v5
+; CHECK-NEXT: v_mov_b32_e32 v1, 0
+; CHECK-NEXT: v_sub_i32_e32 v2, vcc, 0, v5
 ; CHECK-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; CHECK-NEXT: v_cvt_u32_f32_e32 v0, v0
-; CHECK-NEXT: v_mul_lo_u32 v1, v1, v0
-; CHECK-NEXT: v_mul_hi_u32 v1, v0, v1
-; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v1
+; CHECK-NEXT: v_mul_lo_u32 v2, v2, v0
+; CHECK-NEXT: v_mul_hi_u32 v2, v0, v2
+; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v2
 ; CHECK-NEXT: v_mul_hi_u32 v0, v3, v0
-; CHECK-NEXT: v_mul_lo_u32 v1, v0, v5
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, 1, v0
-; CHECK-NEXT: v_sub_i32_e32 v1, vcc, v3, v1
-; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v1, v5
-; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
-; CHECK-NEXT: v_sub_i32_e64 v2, s[4:5], v1, v5
-; CHECK-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, 1, v0
-; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v1, v5
-; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
-; CHECK-NEXT: v_mov_b32_e32 v1, 0
+; CHECK-NEXT: v_mul_lo_u32 v2, v0, v5
+; CHECK-NEXT: v_add_i32_e32 v4, vcc, 1, v0
+; CHECK-NEXT: v_sub_i32_e32 v2, vcc, v3, v2
+; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v2, v5
+; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
+; CHECK-NEXT: v_sub_i32_e64 v3, s[4:5], v2, v5
+; CHECK-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc
+; CHECK-NEXT: v_add_i32_e32 v3, vcc, 1, v0
+; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v2, v5
+; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
 ; CHECK-NEXT: s_or_b64 exec, exec, s[6:7]
 ; CHECK-NEXT: s_setpc_b64 s[30:31]
 %shl.y = shl i64 4096, %y
@@ -1654,24 +1654,24 @@
 ; CGP-NEXT: s_cbranch_execz .LBB8_4
 ; CGP-NEXT: ; %bb.3:
 ; CGP-NEXT: v_rcp_iflag_f32_e32 v0, v4
-; CGP-NEXT: v_sub_i32_e32 v1, vcc, 0, v2
+; CGP-NEXT: v_mov_b32_e32 v1, 0
+; CGP-NEXT: v_sub_i32_e32 v3, vcc, 0, v2
 ; CGP-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; CGP-NEXT: v_cvt_u32_f32_e32 v0, v0
-; CGP-NEXT: v_mul_lo_u32 v1, v1, v0
-; CGP-NEXT: v_mul_hi_u32 v1, v0, v1
-; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v1
+; CGP-NEXT: v_mul_lo_u32 v3, v3, v0
+; CGP-NEXT: v_mul_hi_u32 v3, v0, v3
+; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v3
 ; CGP-NEXT: v_mul_hi_u32 v0, v8, v0
-; CGP-NEXT: v_mul_lo_u32 v1, v0, v2
-; CGP-NEXT: v_add_i32_e32 v3, vcc, 1, v0
-; CGP-NEXT: v_sub_i32_e32 v1, vcc, v8, v1
-; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v2
-; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
-; CGP-NEXT: v_sub_i32_e64 v3, s[4:5], v1, v2
-; CGP-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
-; CGP-NEXT: v_add_i32_e32 v3, vcc, 1, v0
-; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v2
-; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
-; CGP-NEXT: v_mov_b32_e32 v1, 0
+; CGP-NEXT: v_mul_lo_u32 v3, v0, v2
+; CGP-NEXT: v_add_i32_e32 v4, vcc, 1, v0
+; CGP-NEXT: v_sub_i32_e32 v3, vcc, v8, v3
+; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v3, v2
+; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
+; CGP-NEXT: v_sub_i32_e64 v4, s[4:5], v3, v2
+; CGP-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc
+; CGP-NEXT: v_add_i32_e32 v4, vcc, 1, v0
+; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v3, v2
+; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
 ; CGP-NEXT: .LBB8_4:
 ; CGP-NEXT: s_or_b64 exec, exec, s[8:9]
 ; CGP-NEXT: v_or_b32_e32 v3, v7, v10
@@ -1820,24 +1820,24 @@
 ; CGP-NEXT: s_cbranch_execz .LBB8_6
 ; CGP-NEXT: .LBB8_8:
 ; CGP-NEXT: v_rcp_iflag_f32_e32 v2, v4
-; CGP-NEXT: v_sub_i32_e32 v3, vcc, 0, v9
+; CGP-NEXT: v_mov_b32_e32 v3, 0
+; CGP-NEXT: v_sub_i32_e32 v4, vcc, 0, v9
 ; CGP-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2
 ; CGP-NEXT: v_cvt_u32_f32_e32 v2, v2
-; CGP-NEXT: v_mul_lo_u32 v3, v3, v2
-; CGP-NEXT: v_mul_hi_u32 v3, v2, v3
-; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v3
+; CGP-NEXT: v_mul_lo_u32 v4, v4, v2
+; CGP-NEXT: v_mul_hi_u32 v4, v2, v4
+; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v4
 ; CGP-NEXT: v_mul_hi_u32 v2, v5, v2
-; CGP-NEXT: v_mul_lo_u32 v3, v2, v9
-; CGP-NEXT: v_add_i32_e32 v4, vcc, 1, v2
-; CGP-NEXT: v_sub_i32_e32 v3, vcc, v5, v3
-; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v3, v9
-; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
-; CGP-NEXT: v_sub_i32_e64 v4, s[4:5], v3, v9
-; CGP-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc
-; CGP-NEXT: v_add_i32_e32 v4, vcc, 1, v2
-; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v3, v9
-; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
-; CGP-NEXT: v_mov_b32_e32 v3, 0
+; CGP-NEXT: v_mul_lo_u32 v4, v2, v9
+; CGP-NEXT: v_add_i32_e32 v6, vcc, 1, v2
+; CGP-NEXT: v_sub_i32_e32 v4, vcc, v5, v4
+; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v4, v9
+; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc
+; CGP-NEXT: v_sub_i32_e64 v5, s[4:5], v4, v9
+; CGP-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc
+; CGP-NEXT: v_add_i32_e32 v5, vcc, 1, v2
+; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v4, v9
+; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc
 ; CGP-NEXT: s_or_b64 exec, exec, s[6:7]
 ; CGP-NEXT: s_setpc_b64 s[30:31]
 %shl.y = shl <2 x i64> <i64 4096, i64 4096>, %y
@@ -1905,255 +1905,249 @@
 ; GISEL-NEXT: v_and_b32_e32 v2, 0xffffff, v2
 ; GISEL-NEXT: v_and_b32_e32 v1, 0xffffff, v4
 ; GISEL-NEXT: v_and_b32_e32 v0, 0xffffff, v6
-; GISEL-NEXT: v_cvt_f32_ubyte0_e32 v4, 0
-; GISEL-NEXT: s_bfe_i32 s12, 1, 0x10000
-; GISEL-NEXT: s_bfe_i32 s13, 1, 0x10000
-; GISEL-NEXT: s_bfe_i32 s14, 1, 0x10000
-; GISEL-NEXT: s_bfe_i32 s15, 1, 0x10000
-; GISEL-NEXT: v_cvt_f32_u32_e32 v5, v1
-; GISEL-NEXT: v_sub_i32_e32 v6, vcc, 0, v1
-; GISEL-NEXT: v_cvt_f32_u32_e32 v7, v0
-; GISEL-NEXT: v_sub_i32_e64 v8, s[4:5], 0, v0
-; GISEL-NEXT: v_subb_u32_e64 v9, s[6:7], 0, 0, vcc
-; GISEL-NEXT: v_subb_u32_e64 v10, s[4:5], 0, 0, s[4:5]
-; GISEL-NEXT: v_mac_f32_e32 v5, 0x4f800000, v4
-; GISEL-NEXT: v_mac_f32_e32 v7, 0x4f800000, v4
-; GISEL-NEXT: v_rcp_iflag_f32_e32 v4, v5
-; GISEL-NEXT: v_rcp_iflag_f32_e32 v5, v7
-; GISEL-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4
+; GISEL-NEXT: v_cvt_f32_ubyte0_e32 v5, 0
+; GISEL-NEXT: s_bfe_i32 s6, 1, 0x10000
+; GISEL-NEXT: v_cvt_f32_u32_e32 v6, v1
+; GISEL-NEXT: v_sub_i32_e32 v7, vcc, 0, v1
+; GISEL-NEXT: v_subb_u32_e64 v8, s[4:5], 0, 0, vcc
+; GISEL-NEXT: v_mov_b32_e32 v4, s6
+; GISEL-NEXT: v_cvt_f32_u32_e32 v9, v0
+; GISEL-NEXT: v_sub_i32_e32 v10, vcc, 0, v0
+; GISEL-NEXT: v_subb_u32_e64 v11, s[4:5], 0, 0, vcc
+; GISEL-NEXT: v_mac_f32_e32 v6, 0x4f800000, v5
+; GISEL-NEXT: v_mac_f32_e32 v9, 0x4f800000, v5
+; GISEL-NEXT: v_rcp_iflag_f32_e32 v5, v6
+; GISEL-NEXT: v_rcp_iflag_f32_e32 v6, v9
 ; GISEL-NEXT: v_mul_f32_e32 v5, 0x5f7ffffc, v5
-; GISEL-NEXT: v_mul_f32_e32 v7, 0x2f800000, v4
-; GISEL-NEXT: v_mul_f32_e32 v11, 0x2f800000, v5
-; GISEL-NEXT: v_trunc_f32_e32 v7, v7
-; GISEL-NEXT: v_trunc_f32_e32 v11, v11
-; GISEL-NEXT: v_mac_f32_e32 v4, 0xcf800000, v7
-; GISEL-NEXT: v_cvt_u32_f32_e32 v7, v7
-; GISEL-NEXT: v_mac_f32_e32 v5, 0xcf800000, v11
-; GISEL-NEXT: v_cvt_u32_f32_e32 v11, v11
-; GISEL-NEXT: v_cvt_u32_f32_e32 v12, v4
-; GISEL-NEXT: v_mul_lo_u32 v4, v6, v7
+; GISEL-NEXT: v_mul_f32_e32 v6, 0x5f7ffffc, v6
+; GISEL-NEXT: v_mul_f32_e32 v9, 0x2f800000, v5
+; GISEL-NEXT: v_mul_f32_e32 v12, 0x2f800000, v6
+; GISEL-NEXT: v_trunc_f32_e32 v9, v9
+; GISEL-NEXT: v_trunc_f32_e32 v12, v12
+; GISEL-NEXT: v_mac_f32_e32 v5, 0xcf800000, v9
+; GISEL-NEXT: v_cvt_u32_f32_e32 v9, v9
+; GISEL-NEXT: v_mac_f32_e32 v6, 0xcf800000, v12
+; GISEL-NEXT: v_cvt_u32_f32_e32 v12, v12
 ; GISEL-NEXT: v_cvt_u32_f32_e32 v5, v5
-; GISEL-NEXT: v_mul_lo_u32 v13, v8, v11
-; GISEL-NEXT: v_mul_lo_u32 v14, v6, v12
-; GISEL-NEXT: v_mul_lo_u32 v15, v9, v12
-; GISEL-NEXT: v_mul_hi_u32 v16, v6, v12
-; GISEL-NEXT: v_mul_lo_u32 v17, v8, v5
-; GISEL-NEXT: v_mul_lo_u32 v18, v10, v5
-; GISEL-NEXT: v_mul_hi_u32 v19, v8, v5
-; GISEL-NEXT: v_add_i32_e32 v4, vcc, v15, v4
-; GISEL-NEXT: v_mul_lo_u32 v15, v7, v14
-; GISEL-NEXT: v_mul_hi_u32 v20, v12, v14
-; GISEL-NEXT: v_mul_hi_u32 v14, v7, v14
-; GISEL-NEXT: v_add_i32_e32 v13, vcc, v18, v13
-; GISEL-NEXT: v_mul_lo_u32 v18, v11, v17
-; GISEL-NEXT: v_add_i32_e32 v16, vcc, v4, v16
-; GISEL-NEXT: v_mul_hi_u32 v4, v5, v17
-; GISEL-NEXT: v_mul_hi_u32 v17, v11, v17
-; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v19
-; GISEL-NEXT: v_mul_lo_u32 v19, v5, v13
+; GISEL-NEXT: v_mul_lo_u32 v13, v7, v9
+; GISEL-NEXT: v_cvt_u32_f32_e32 v6, v6
+; GISEL-NEXT: v_mul_lo_u32 v14, v10, v12
+; GISEL-NEXT: v_mul_lo_u32 v15, v7, v5
+; GISEL-NEXT: v_mul_lo_u32 v16, v8, v5
+; GISEL-NEXT: v_mul_hi_u32 v17, v7, v5
+; GISEL-NEXT: v_mul_lo_u32 v18, v10, v6
+; GISEL-NEXT: v_mul_lo_u32 v19, v11, v6
+; GISEL-NEXT: v_mul_hi_u32 v20, v10, v6
+; GISEL-NEXT: v_add_i32_e32 v13, vcc, v16, v13
+; GISEL-NEXT: v_mul_lo_u32 v16, v9, v15
+; GISEL-NEXT: v_mul_hi_u32 v21, v5, v15
+; GISEL-NEXT: v_mul_hi_u32 v15, v9, v15
+; GISEL-NEXT: v_add_i32_e32 v14, vcc, v19, v14
+; GISEL-NEXT: v_mul_lo_u32 v19, v12, v18
+; GISEL-NEXT: v_mul_hi_u32 v22, v6, v18
+; GISEL-NEXT: v_mul_hi_u32 v18, v12, v18
+; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v17
+; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v20
+; GISEL-NEXT: v_mul_lo_u32 v17, v5, v13
+; GISEL-NEXT: v_mul_lo_u32 v20, v9, v13
+; GISEL-NEXT: v_mul_hi_u32 v23, v5, v13
+; GISEL-NEXT: v_mul_hi_u32 v13, v9, v13
+; GISEL-NEXT: v_mul_lo_u32 v24, v6, v14
+; GISEL-NEXT: v_mul_lo_u32 v25, v12, v14
+; GISEL-NEXT: v_mul_hi_u32 v26, v6, v14
+; GISEL-NEXT: v_mul_hi_u32 v14, v12, v14
+; GISEL-NEXT: v_add_i32_e32 v16, vcc, v16, v17
+; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v15, vcc, v20, v15
+; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v19, vcc, v19, v24
+; GISEL-NEXT: v_cndmask_b32_e64 v24, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v18, vcc, v25, v18
+; GISEL-NEXT: v_cndmask_b32_e64 v25, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v16, vcc, v16, v21
+; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v23
+; GISEL-NEXT: v_cndmask_b32_e64 v21, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v19, vcc, v19, v22
+; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v18, vcc, v18, v26
+; GISEL-NEXT: v_cndmask_b32_e64 v22, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v16, vcc, v17, v16
+; GISEL-NEXT: v_add_i32_e32 v17, vcc, v20, v21
+; GISEL-NEXT: v_add_i32_e32 v19, vcc, v24, v19
+; GISEL-NEXT: v_add_i32_e32 v20, vcc, v25, v22
+; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v16
+; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc
 ; GISEL-NEXT: v_add_i32_e32 v18, vcc, v18, v19
-; GISEL-NEXT: v_mul_lo_u32 v19, v12, v16
-; GISEL-NEXT: v_add_i32_e64 v4, s[4:5], v18, v4
-; GISEL-NEXT: v_mul_lo_u32 v4, v7, v16
-; GISEL-NEXT: v_mul_hi_u32 v18, v12, v16
-; GISEL-NEXT: v_add_i32_e64 v15, s[6:7], v15, v19
-; GISEL-NEXT: v_mul_lo_u32 v19, v11, v13
-; GISEL-NEXT: v_add_i32_e64 v4, s[8:9], v4, v14
-; GISEL-NEXT: v_mul_hi_u32 v14, v5, v13
-; GISEL-NEXT: v_add_i32_e64 v17, s[10:11], v19, v17
 ; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v20
-; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[6:7]
-; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[8:9]
-; GISEL-NEXT: v_add_i32_e64 v4, s[6:7], v4, v18
-; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[10:11]
-; GISEL-NEXT: v_add_i32_e64 v14, s[8:9], v17, v14
+; GISEL-NEXT: v_add_i32_e32 v16, vcc, v17, v16
+; GISEL-NEXT: v_add_i32_e32 v17, vcc, v20, v19
+; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v16
+; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v17
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v15
+; GISEL-NEXT: v_addc_u32_e32 v9, vcc, v9, v13, vcc
+; GISEL-NEXT: v_mul_lo_u32 v13, v7, v5
+; GISEL-NEXT: v_mul_lo_u32 v8, v8, v5
+; GISEL-NEXT: v_mul_hi_u32 v15, v7, v5
+; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v18
+; GISEL-NEXT: v_addc_u32_e32 v12, vcc, v12, v14, vcc
+; GISEL-NEXT: v_mul_lo_u32 v14, v10, v6
+; GISEL-NEXT: v_mul_lo_u32 v11, v11, v6
+; GISEL-NEXT: v_mul_hi_u32 v16, v10, v6
+; GISEL-NEXT: v_mul_lo_u32 v7, v7, v9
+; GISEL-NEXT: v_mul_lo_u32 v17, v9, v13
+; GISEL-NEXT: v_mul_hi_u32 v18, v5, v13
+; GISEL-NEXT: v_mul_hi_u32 v13, v9, v13
+; GISEL-NEXT: v_mul_lo_u32 v10, v10, v12
+; GISEL-NEXT: v_mul_lo_u32 v19, v12, v14
+; GISEL-NEXT: v_mul_hi_u32 v20, v6, v14
+; GISEL-NEXT: v_mul_hi_u32 v14, v12, v14
+; GISEL-NEXT: v_add_i32_e32 v7, vcc, v8, v7
+; GISEL-NEXT: v_add_i32_e32 v8, vcc, v11, v10
+; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v15
+; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v16
+; GISEL-NEXT: v_mul_lo_u32 v10, v5, v7
+; GISEL-NEXT: v_mul_lo_u32 v11, v9, v7
+; GISEL-NEXT: v_mul_hi_u32 v15, v5, v7
+; GISEL-NEXT: v_mul_hi_u32 v7, v9, v7
+; GISEL-NEXT: v_mul_lo_u32 v16, v6, v8
+; GISEL-NEXT: v_mul_lo_u32 v21, v12, v8
+; GISEL-NEXT: v_mul_hi_u32 v22, v6, v8
+; GISEL-NEXT: v_mul_hi_u32 v8, v12, v8
+; GISEL-NEXT: v_add_i32_e32 v10, vcc, v17, v10
 ; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v17
-; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, s[6:7]
-; GISEL-NEXT: v_add_i32_e32 v17, vcc, v20, v17
-; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[4:5]
-; GISEL-NEXT: v_add_i32_e32 v19, vcc, v19, v20
-; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[8:9]
-; GISEL-NEXT: v_add_i32_e32 v18, vcc, v18, v20
-; GISEL-NEXT: v_mov_b32_e32 v20, s12
-; GISEL-NEXT: v_add_i32_e32 v15, vcc, v4, v15
-; GISEL-NEXT: v_mov_b32_e32 v4, s13
-; GISEL-NEXT: v_add_i32_e64 v14, s[4:5], v14, v19
-; GISEL-NEXT: v_mov_b32_e32 v19, s14
-; GISEL-NEXT: v_add_i32_e64 v12, s[6:7], v12, v15
-; GISEL-NEXT: v_mov_b32_e32 v15, s15
-; GISEL-NEXT: v_mul_hi_u32 v16, v7, v16
-; GISEL-NEXT: v_mul_hi_u32 v13, v11, v13
-; GISEL-NEXT: v_add_i32_e64 v5, s[8:9], v5, v14
-; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v14, vcc, v17, v14
-; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, s[4:5]
-; GISEL-NEXT: v_add_i32_e32 v17, vcc, v18, v17
-; GISEL-NEXT: v_mul_lo_u32 v18, v6, v12
-; GISEL-NEXT: v_mul_lo_u32 v9, v9, v12
-; GISEL-NEXT: v_add_i32_e32 v14, vcc, v16, v14
-; GISEL-NEXT: v_mul_hi_u32 v16, v6, v12
-; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v17
-; GISEL-NEXT: v_mul_lo_u32 v17, v8, v5
-; GISEL-NEXT: v_mul_lo_u32 v10, v10, v5
-; GISEL-NEXT: v_addc_u32_e64 v7, vcc, v7, v14, s[6:7]
-; GISEL-NEXT: v_mul_hi_u32 v14, v8, v5
-; GISEL-NEXT: v_addc_u32_e64 v11, vcc, v11, v13, s[8:9]
-; GISEL-NEXT: v_mul_hi_u32 v13, v12, v18
-; GISEL-NEXT: v_mul_lo_u32 v6, v6, v7
-; GISEL-NEXT: v_add_i32_e32 v6, vcc, v9, v6
-; GISEL-NEXT: v_mul_hi_u32 v9, v5, v17
-; GISEL-NEXT: v_mul_lo_u32 v8, v8, v11
-; GISEL-NEXT: v_add_i32_e32 v8, vcc, v10, v8
-; GISEL-NEXT: v_mul_lo_u32 v10, v7, v18
-; GISEL-NEXT: v_mul_hi_u32 v18, v7, v18
-; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v16
-; GISEL-NEXT: v_mul_lo_u32 v16, v11, v17
-; GISEL-NEXT: v_mul_hi_u32 v17, v11, v17
-; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v14
-; GISEL-NEXT: v_mul_lo_u32 v14, v12, v6
-; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v14
-; GISEL-NEXT: v_mul_lo_u32 v14, v7, v6
-; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], v10, v13
-; GISEL-NEXT: v_mul_hi_u32 v10, v12, v6
-; GISEL-NEXT: v_mul_hi_u32 v6, v7, v6
-; GISEL-NEXT: v_mul_lo_u32 v13, v5, v8
-; GISEL-NEXT: v_add_i32_e64 v14, s[6:7], v14, v18
-; GISEL-NEXT: v_mul_lo_u32 v18, v11, v8
-; GISEL-NEXT: v_add_i32_e64 v13, s[8:9], v16, v13
-; GISEL-NEXT: v_mul_hi_u32 v16, v5, v8
-; GISEL-NEXT: v_mul_hi_u32 v8, v11, v8
-; GISEL-NEXT: v_add_i32_e64 v17, s[10:11], v18, v17
-; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v9, vcc, v13, v9
-; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[6:7]
-; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[8:9]
-; GISEL-NEXT: v_add_i32_e64 v10, s[6:7], v14, v10
-; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[10:11]
-; GISEL-NEXT: v_add_i32_e64 v16, s[8:9], v17, v16
-; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, s[4:5]
-; GISEL-NEXT: v_add_i32_e64 v17, s[4:5], v18, v17
-; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[6:7]
-; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v9, v18
-; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v18
-; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[8:9]
-; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v18
-; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v17
-; GISEL-NEXT: v_add_i32_e64 v13, s[4:5], v16, v13
+; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13
+; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v16, vcc, v19, v16
+; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v14, vcc, v21, v14
+; GISEL-NEXT: v_cndmask_b32_e64 v21, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v18
+; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v15
+; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v16, vcc, v16, v20
 ; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc
-; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, s[4:5]
-; GISEL-NEXT: v_add_i32_e32 v10, vcc, v12, v10
-; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v5, v13
-; GISEL-NEXT: v_add_i32_e64 v9, s[6:7], v9, v16
-; GISEL-NEXT: v_add_i32_e64 v12, s[6:7], v14, v17
-; GISEL-NEXT: v_mul_lo_u32 v13, 0, v10
-; GISEL-NEXT: v_mul_hi_u32 v14, v3, v10
-; GISEL-NEXT: v_mul_hi_u32 v10, 0, v10
-; GISEL-NEXT: v_mul_lo_u32 v16, 0, v5
-; GISEL-NEXT: v_mul_hi_u32 v17, v2, v5
+; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v22
+; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v10, vcc, v17, v10
+; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v15
+; GISEL-NEXT: v_add_i32_e32 v15, vcc, v19, v16
+; GISEL-NEXT: v_add_i32_e32 v16, vcc, v21, v18
+; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10
+; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v15
+; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v11, vcc, v13, v11
+; GISEL-NEXT: v_add_i32_e32 v13, vcc, v16, v15
+; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v11
+; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v13
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v10
+; GISEL-NEXT: v_addc_u32_e32 v7, vcc, v9, v7, vcc
+; GISEL-NEXT: v_mul_lo_u32 v9, 0, v5
+; GISEL-NEXT: v_mul_hi_u32 v10, v3, v5
 ; GISEL-NEXT: v_mul_hi_u32 v5, 0, v5
-; GISEL-NEXT: v_add_i32_e64 v6, s[6:7], v6, v9
-; GISEL-NEXT: v_add_i32_e64 v8, s[6:7], v8, v12
-; GISEL-NEXT: v_addc_u32_e32 v6, vcc, v7, v6, vcc
-; GISEL-NEXT: v_addc_u32_e64 v7, vcc, v11, v8, s[4:5]
-; GISEL-NEXT: v_mul_lo_u32 v8, v3, v6
-; GISEL-NEXT: v_mul_lo_u32 v9, 0, v6
-; GISEL-NEXT: v_mul_hi_u32 v11, v3, v6
+; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v14
+; GISEL-NEXT: v_addc_u32_e32 v8, vcc, v12, v8, vcc
+; GISEL-NEXT: v_mul_lo_u32 v11, 0, v6
+; GISEL-NEXT: v_mul_hi_u32 v12, v2, v6
 ; GISEL-NEXT: v_mul_hi_u32 v6, 0, v6
-; GISEL-NEXT: v_mul_lo_u32 v12, v2, v7
-; GISEL-NEXT: v_mul_lo_u32 v18, 0, v7
-; GISEL-NEXT: v_add_i32_e32 v8, vcc, v13, v8
-; GISEL-NEXT: v_mul_hi_u32 v13, v2, v7
+; GISEL-NEXT: v_mul_lo_u32 v13, v3, v7
+; GISEL-NEXT: v_mul_lo_u32 v14, 0, v7
+; GISEL-NEXT: v_mul_hi_u32 v15, v3, v7
 ; GISEL-NEXT: v_mul_hi_u32 v7, 0, v7
-; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v9, v10
-; GISEL-NEXT: v_add_i32_e64 v10, s[6:7], v16, v12
-; GISEL-NEXT: v_add_i32_e64 v5, s[8:9], v18, v5
-; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
-; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[4:5]
-; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[6:7]
-; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v14
-; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[8:9]
-; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v9, v11
-; GISEL-NEXT: v_add_i32_e64 v10, s[6:7], v10, v17
-; GISEL-NEXT: v_add_i32_e64 v5, s[8:9], v5, v13
-; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
-; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[4:5]
-; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[6:7]
-; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[8:9]
-; GISEL-NEXT: v_add_i32_e32 v10, vcc, v12, v10
-; GISEL-NEXT: v_add_i32_e32 v11, vcc, v16, v11
-; GISEL-NEXT: v_add_i32_e32 v12, vcc, v18, v13
-; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v14
+; GISEL-NEXT: v_mul_lo_u32 v16, v2, v8
+; GISEL-NEXT: v_mul_lo_u32 v17, 0, v8
+; GISEL-NEXT: v_mul_hi_u32 v18, v2, v8
+; GISEL-NEXT: v_mul_hi_u32 v8, 0, v8
+; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v13
+; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v14, v5
+; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v16
+; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v6, vcc, v17, v6
+; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc
 ; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v10
-; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v5, v12
+; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v15
 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
-; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5]
-; GISEL-NEXT: v_mul_lo_u32 v13, v1, v9
-; GISEL-NEXT: v_mul_lo_u32 v14, 0, v9
-; GISEL-NEXT: v_mul_hi_u32 v16, v1, v9
-; GISEL-NEXT: v_mul_lo_u32 v17, v0, v5
-; GISEL-NEXT: v_mul_lo_u32 v18, 0, v5
-; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10
-; GISEL-NEXT: v_mul_hi_u32 v11, v0, v5
-; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v12
-; GISEL-NEXT: v_add_i32_e32 v12, vcc, 1, v9
-; GISEL-NEXT: v_sub_i32_e64 v3, s[4:5], v3, v13
-; GISEL-NEXT: v_add_i32_e64 v13, s[6:7], 1, v5
-; GISEL-NEXT: v_sub_i32_e64 v2, s[8:9], v2, v17
-; GISEL-NEXT: v_add_i32_e64 v17, s[10:11], 1, v12
-; GISEL-NEXT: v_add_i32_e64 v6, s[12:13], v6, v10
-; GISEL-NEXT: v_add_i32_e64 v10, s[12:13], 1, v13
-; GISEL-NEXT: v_add_i32_e64 v7, s[14:15], v7, v8
-; GISEL-NEXT: v_cmp_ge_u32_e64 s[14:15], v3, v1
-; GISEL-NEXT: v_cmp_ge_u32_e64 s[16:17], v2, v0
-; GISEL-NEXT: v_sub_i32_e64 v3, s[18:19], v3, v1
-; GISEL-NEXT: v_sub_i32_e64 v2, s[20:21], v2, v0
-; GISEL-NEXT: v_mul_lo_u32 v8, v1, v6
-; GISEL-NEXT: v_cmp_ge_u32_e64 s[22:23], v3, v1
-; GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v6, vcc
-; GISEL-NEXT: v_mul_lo_u32 v3, v0, v7
+; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12
+; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v18
+; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v9, vcc, v13, v9
+; GISEL-NEXT: v_add_i32_e32 v10, vcc, v14, v10
+; GISEL-NEXT: v_add_i32_e32 v11, vcc, v16, v11
+; GISEL-NEXT: v_add_i32_e32 v12, vcc, v17, v12
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v9
+; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v11
+; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v9
+; GISEL-NEXT: v_mul_lo_u32 v10, v1, v5
+; GISEL-NEXT: v_mul_lo_u32 v13, 0, v5
+; GISEL-NEXT: v_mul_hi_u32 v14, v1, v5
+; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11
+; GISEL-NEXT: v_mul_lo_u32 v12, v0, v6
+; GISEL-NEXT: v_mul_lo_u32 v15, 0, v6
+; GISEL-NEXT: v_mul_hi_u32 v16, v0, v6
+; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v9
+; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v11
+; GISEL-NEXT: v_mul_lo_u32 v9, v1, v7
+; GISEL-NEXT: v_add_i32_e32 v11, vcc, 1, v5
+; GISEL-NEXT: v_addc_u32_e32 v17, vcc, 0, v7, vcc
+; GISEL-NEXT: v_mul_lo_u32 v18, v0, v8
+; GISEL-NEXT: v_add_i32_e32 v19, vcc, 1, v6
+; GISEL-NEXT: v_addc_u32_e32 v20, vcc, 0, v8, vcc
+; GISEL-NEXT: v_add_i32_e32 v9, vcc, v13, v9
+; GISEL-NEXT: v_add_i32_e32 v13, vcc, 1, v11
+; GISEL-NEXT: v_addc_u32_e32 v21, vcc, 0, v17, vcc
+; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v18
+; GISEL-NEXT: v_add_i32_e32 v18, vcc, 1, v19
+; GISEL-NEXT: v_addc_u32_e32 v22, vcc, 0, v20, vcc
+; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v14
+; GISEL-NEXT: v_add_i32_e32 v14, vcc, v15, v16
+; GISEL-NEXT: v_sub_i32_e32 v3, vcc, v3, v10
+; GISEL-NEXT: v_subb_u32_e64 v10, s[4:5], 0, v9, vcc
+; GISEL-NEXT: v_sub_i32_e64 v9, s[4:5], 0, v9
+; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v3, v1
+; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, -1, s[4:5]
+; GISEL-NEXT: v_sub_i32_e64 v2, s[4:5], v2, v12
+; GISEL-NEXT: v_subb_u32_e64 v12, s[6:7], 0, v14, s[4:5]
+; GISEL-NEXT: v_sub_i32_e64 v14, s[6:7], 0, v14
+; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v2, v0
+; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, -1, s[6:7]
+; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v10
+; GISEL-NEXT: v_cndmask_b32_e64 v10, v4, v15, s[6:7]
+; GISEL-NEXT: v_subbrev_u32_e32 v9, vcc, 0, v9, vcc
+; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v12
+; GISEL-NEXT: v_cndmask_b32_e32 v12, v4, v16, vcc
+; GISEL-NEXT: v_subbrev_u32_e64 v14, vcc, 0, v14, s[4:5]
+; GISEL-NEXT: v_sub_i32_e32 v3, vcc, v3, v1
+; GISEL-NEXT: v_subbrev_u32_e32 v9, vcc, 0, v9, vcc
+; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v3, v1
+; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
+; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v0
+; GISEL-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v14, vcc
 ; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v2, v0
-; GISEL-NEXT: v_addc_u32_e64 v0, s[6:7], 0, v7, s[6:7]
-; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[14:15]
-; GISEL-NEXT: v_add_i32_e64 v8, s[6:7], v14, v8
-; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, -1, s[16:17]
-; GISEL-NEXT: v_add_i32_e64 v3, s[6:7], v18, v3
-; GISEL-NEXT: v_addc_u32_e64 v18, s[6:7], 0, v1, s[10:11]
-; GISEL-NEXT: v_add_i32_e64 v8, s[6:7], v8, v16
-; GISEL-NEXT: v_addc_u32_e64 v16, s[6:7], 0, v0, s[12:13]
-; GISEL-NEXT: v_add_i32_e64 v3, s[6:7], v3, v11
-; GISEL-NEXT: v_subb_u32_e64 v11, s[6:7], 0, v8, s[4:5]
-; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v11
-; GISEL-NEXT: v_subb_u32_e64 v11, s[10:11], 0, v3, s[8:9]
-; GISEL-NEXT: v_cmp_eq_u32_e64 s[10:11], 0, v11
-; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[22:23]
-; GISEL-NEXT: v_cndmask_b32_e64 v2, v20, v2, s[6:7]
-; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, -1, vcc
-; GISEL-NEXT: v_sub_i32_e32 v8, vcc, 0, v8
-; GISEL-NEXT: v_sub_i32_e32 v3, vcc, 0, v3
-; GISEL-NEXT: v_subbrev_u32_e64 v8, vcc, 0, v8, s[4:5]
-; GISEL-NEXT: v_subbrev_u32_e64 v3, vcc, 0, v3, s[8:9]
-; GISEL-NEXT: v_cndmask_b32_e64 v14, v19, v14, s[10:11]
-; GISEL-NEXT: v_subbrev_u32_e64 v8, vcc, 0, v8, s[18:19]
-; GISEL-NEXT: v_subbrev_u32_e64 v3, vcc, 0, v3, s[20:21]
-; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v8
-; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v3
-; GISEL-NEXT: v_cmp_ne_u32_e64 s[6:7], 0, v2
-; GISEL-NEXT: v_cmp_ne_u32_e64 s[8:9], 0, v14
-; GISEL-NEXT: v_cndmask_b32_e32 v2, v4, v11, vcc
-; GISEL-NEXT: v_cndmask_b32_e64 v3, v15, v20, s[4:5]
-; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
-; GISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v3
-; GISEL-NEXT: v_cndmask_b32_e32 v2, v12, v17, vcc
-; GISEL-NEXT: v_cndmask_b32_e64 v3, v13, v10, s[4:5]
-; GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v18, vcc
-; GISEL-NEXT: v_cndmask_b32_e64 v4, v0, v16, s[4:5]
-; GISEL-NEXT: v_cndmask_b32_e64 v0, v9, v2, s[6:7]
-; GISEL-NEXT: v_cndmask_b32_e64 v2, v5, v3, s[8:9]
-; GISEL-NEXT: v_cndmask_b32_e64 v1, v6, v1, s[6:7]
-; GISEL-NEXT: v_cndmask_b32_e64 v3, v7, v4, s[8:9]
+; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
+; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v9
+; GISEL-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc
+; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3
+; GISEL-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
+; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
+; GISEL-NEXT: v_cndmask_b32_e32 v1, v11, v13, vcc
+; GISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v0
+; GISEL-NEXT: v_cndmask_b32_e64 v2, v19, v18, s[4:5]
+; GISEL-NEXT: v_cndmask_b32_e32 v3, v17, v21, vcc
+; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10
+; GISEL-NEXT: v_cndmask_b32_e32 v0, v5, v1, vcc
+; GISEL-NEXT: v_cndmask_b32_e64 v4, v20, v22, s[4:5]
+; GISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v12
+; GISEL-NEXT: v_cndmask_b32_e64 v2, v6, v2, s[4:5]
+; GISEL-NEXT: v_cndmask_b32_e32 v1, v7, v3, vcc
+; GISEL-NEXT: v_cndmask_b32_e64 v3, v8, v4, s[4:5]
 ; GISEL-NEXT: s_setpc_b64 s[30:31]
 ;
 ; CGP-LABEL: v_udiv_v2i64_24bit:
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll
@@ -234,14 +234,13 @@
 ; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11
 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v9, v12, vcc
 ; GFX8-NEXT: v_cndmask_b32_e32 v9, v10, v13, vcc
-; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
-; GFX8-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v11
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v9, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v3, v7, v5, s[0:1]
-; GFX8-NEXT: v_cndmask_b32_e64 v4, v8, v14, s[0:1]
-; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v3, v6, v4, vcc
+; GFX8-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v1
+; GFX8-NEXT: v_cndmask_b32_e64 v0, v4, v0, s[0:1]
+; GFX8-NEXT: v_cndmask_b32_e64 v1, v3, v9, s[0:1]
+; GFX8-NEXT: v_cndmask_b32_e32 v3, v7, v5, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v4, v8, v14, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1]
+; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v4, s[0:1]
 ; GFX8-NEXT: v_mov_b32_e32 v4, s4
 ; GFX8-NEXT: v_mov_b32_e32 v5, s5
 ; GFX8-NEXT: flat_store_dwordx2 v[4:5], v[0:1]
@@ -294,7 +293,7 @@
 ; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v0
 ; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v1, vcc
 ; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v3, 0
-; GFX9-NEXT: v_mov_b32_e32 v7, s11
+; GFX9-NEXT: v_mov_b32_e32 v7, 0
 ; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s2, v4, v[1:2]
 ; GFX9-NEXT: v_mul_hi_u32 v6, v3, v0
 ; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s3, v3, v[1:2]
@@ -342,7 +341,7 @@
 ; GFX9-NEXT: v_add3_u32 v3, v3, v2, v6
 ; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s10, v3, v[1:2]
 ; GFX9-NEXT: v_mov_b32_e32 v6, s9
-; GFX9-NEXT: v_mov_b32_e32 v4, 0
+; GFX9-NEXT: v_mov_b32_e32 v4, s11
 ; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s11, v5, v[1:2]
 ; GFX9-NEXT: v_sub_co_u32_e32 v2, vcc, s8, v0
 ; GFX9-NEXT: v_subb_co_u32_e64 v6, s[0:1], v6, v1, vcc
@@ -352,7 +351,7 @@
 ; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s10, v2
 ; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[0:1]
 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s11, v6
-; GFX9-NEXT: v_subb_co_u32_e32 v0, vcc, v0, v7, vcc
+; GFX9-NEXT: v_subb_co_u32_e32 v0, vcc, v0, v4, vcc
 ; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v8, s[0:1]
 ; GFX9-NEXT: v_subrev_co_u32_e32 v8, vcc, s10, v2
 ; GFX9-NEXT: v_subbrev_co_u32_e64 v9, s[0:1], 0, v0, vcc
@@ -361,10 +360,10 @@
 ; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s11, v9
 ; GFX9-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[0:1]
 ; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s10, v8
-; GFX9-NEXT: v_subb_co_u32_e32 v0, vcc, v0, v7, vcc
+; GFX9-NEXT: v_subb_co_u32_e32 v0, vcc, v0, v4, vcc
 ; GFX9-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[0:1]
 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s11, v9
-; GFX9-NEXT: v_subrev_co_u32_e32 v7, vcc, s10, v8
+; GFX9-NEXT: v_subrev_co_u32_e32 v4, vcc, s10, v8
 ; GFX9-NEXT: v_cndmask_b32_e64 v12, v12, v13, s[0:1]
 ; GFX9-NEXT: v_add_co_u32_e64 v13, s[0:1], 1, v10
 ; GFX9-NEXT: v_subbrev_co_u32_e32 v15, vcc, 0, v0, vcc
@@ -372,16 +371,15 @@
 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12
 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v10, v13, vcc
 ; GFX9-NEXT: v_cndmask_b32_e32 v10, v11, v14, vcc
-; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
-; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v12
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v10, vcc
-; GFX9-NEXT: v_cndmask_b32_e64 v3, v8, v7, s[0:1]
-; GFX9-NEXT: v_cndmask_b32_e64 v5, v9, v15, s[0:1]
-; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v3, v6, v5, vcc
-; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[4:5]
-; GFX9-NEXT: global_store_dwordx2 v4, v[2:3], s[6:7]
+; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v1
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v5, v0, s[0:1]
+; GFX9-NEXT: v_cndmask_b32_e64 v1, v3, v10, s[0:1]
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v8, v4, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v4, v9, v15, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1]
+; GFX9-NEXT: v_cndmask_b32_e64 v3, v6, v4, s[0:1]
+; GFX9-NEXT: global_store_dwordx2 v7, v[0:1], s[4:5]
+; GFX9-NEXT: global_store_dwordx2 v7, v[2:3], s[6:7]
 ; GFX9-NEXT: s_endpgm
 ;
 ; GFX10-LABEL: udivrem_i64:
@@ -474,49 +472,48 @@
 ; GFX10-NEXT: v_add_co_u32 v2, s0, v2, v0
 ; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
 ; GFX10-NEXT: v_mul_lo_u32 v4, s11, v2
-; GFX10-NEXT: v_add_co_u32 v6, vcc_lo, v2, 1
 ; GFX10-NEXT: v_add3_u32 v3, v3, v0, v1
 ; GFX10-NEXT: v_mad_u64_u32 v[0:1], s0, s10, v2, 0
 ; GFX10-NEXT: v_mul_lo_u32 v5, s10, v3
-; GFX10-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, 0, v3, vcc_lo
 ; GFX10-NEXT: v_add3_u32 v1, v1, v5, v4
-; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, v6, 1
-; GFX10-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v7, vcc_lo
-; GFX10-NEXT: v_sub_nc_u32_e32 v8, s9, v1
-; GFX10-NEXT: v_sub_co_u32 v9, vcc_lo, s8, v0
-; GFX10-NEXT: v_sub_co_ci_u32_e64 v10, s0, s9, v1, vcc_lo
-; GFX10-NEXT: v_subrev_co_ci_u32_e32 v0, vcc_lo, s11, v8, vcc_lo
-; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s10, v9
+; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, v2, 1
+; GFX10-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v3, vcc_lo
+; GFX10-NEXT: v_sub_nc_u32_e32 v6, s9, v1
+; GFX10-NEXT: v_sub_co_u32 v7, vcc_lo, s8, v0
+; GFX10-NEXT: v_sub_co_ci_u32_e64 v8, s0, s9, v1, vcc_lo
+; GFX10-NEXT: v_subrev_co_ci_u32_e32 v0, vcc_lo, s11, v6, vcc_lo
+; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s10, v7
 ; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo
-; GFX10-NEXT: v_sub_co_u32 v8, vcc_lo, v9, s10
-; GFX10-NEXT: v_subrev_co_ci_u32_e64 v11, s0, 0, v0, vcc_lo
-; GFX10-NEXT: v_cmp_le_u32_e64 s0, s11, v10
+; GFX10-NEXT: v_sub_co_u32 v6, vcc_lo, v7, s10
+; GFX10-NEXT: v_subrev_co_ci_u32_e64 v9, s0, 0, v0, vcc_lo
+; GFX10-NEXT: v_cmp_le_u32_e64 s0, s11, v8
 ; GFX10-NEXT: v_subrev_co_ci_u32_e32 v0, vcc_lo, s11, v0, vcc_lo
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, s11, v11
+; GFX10-NEXT: v_cndmask_b32_e64 v10, 0, -1, s0
+; GFX10-NEXT: v_cmp_le_u32_e64 s0, s10, v6
+; GFX10-NEXT: v_cndmask_b32_e64 v11, 0, -1, s0
+; GFX10-NEXT: v_cmp_le_u32_e64 s0, s11, v9
 ; GFX10-NEXT: v_cndmask_b32_e64 v12, 0, -1, s0
-; GFX10-NEXT: v_cmp_le_u32_e64 s0, s10, v8
-; GFX10-NEXT: v_cndmask_b32_e64 v13, 0, -1, s0
-; GFX10-NEXT: v_cmp_le_u32_e64 s0, s11, v11
-; GFX10-NEXT: v_cndmask_b32_e64 v14, 0, -1, s0
-; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s11, v10
-; GFX10-NEXT: v_cndmask_b32_e64 v1, v12, v1, s0
-; GFX10-NEXT: v_cndmask_b32_e32 v12, v14, v13, vcc_lo
-; GFX10-NEXT: v_sub_co_u32 v13, vcc_lo, v8, s10
-; GFX10-NEXT: v_subrev_co_ci_u32_e32 v0, vcc_lo, 0, v0, vcc_lo
-; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v12
-; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, v12
-; GFX10-NEXT: v_cmp_ne_u32_e64 s1, 0, v1
-; GFX10-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e32 v1, v7, v5, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v6, v8, v13, s0
-; GFX10-NEXT: v_cndmask_b32_e64 v5, v11, v0, s0
-; GFX10-NEXT: v_mov_b32_e32 v7, 0
-; GFX10-NEXT: v_cndmask_b32_e64 v0, v2, v4, s1
-; GFX10-NEXT: v_cndmask_b32_e64 v1, v3, v1, s1
-; GFX10-NEXT: v_cndmask_b32_e64 v2, v9, v6, s1
-; GFX10-NEXT: v_cndmask_b32_e64 v3, v10, v5, s1
-; GFX10-NEXT: global_store_dwordx2 v7, v[0:1], s[4:5]
-; GFX10-NEXT: global_store_dwordx2 v7, v[2:3], s[6:7]
+; GFX10-NEXT: v_add_co_u32 v13, s0, v4, 1
+; GFX10-NEXT: v_add_co_ci_u32_e64 v14, s0, 0, v5, s0
+; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s11, v9
+; GFX10-NEXT: v_cndmask_b32_e64 v11, v12, v11, s0
+; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s11, v8
+; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v11
+; GFX10-NEXT: v_cndmask_b32_e64 v1, v10, v1, s0
+; GFX10-NEXT: v_sub_co_u32 v10, s0, v6, s10
+; GFX10-NEXT: v_subrev_co_ci_u32_e64 v0, s0, 0, v0, s0
+; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v13, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v5, v5, v14, vcc_lo
+; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, v1
+; GFX10-NEXT: v_cndmask_b32_e32 v6, v6, v10, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v9, v9, v0, vcc_lo
+; GFX10-NEXT: v_mov_b32_e32 v10, 0
+; GFX10-NEXT: v_cndmask_b32_e64 v0, v2, v4, s0
+; GFX10-NEXT: v_cndmask_b32_e64 v1, v3, v5, s0
+; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v6, s0
+; GFX10-NEXT: v_cndmask_b32_e64 v3, v8, v9, s0
+; GFX10-NEXT: global_store_dwordx2 v10, v[0:1], s[4:5]
+; GFX10-NEXT: global_store_dwordx2 v10, v[2:3], s[6:7]
 ; GFX10-NEXT: s_endpgm
 %div = udiv i64 %x, %y
 store i64 %div, ptr addrspace(1) %out0
@@ -1151,40 +1148,39 @@
 ; GFX8-NEXT: v_mov_b32_e32 v0, v3
 ; GFX8-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s2, v14, v[0:1]
 ; GFX8-NEXT: v_cndmask_b32_e32 v12, v13, v18, vcc
-; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9
-; GFX8-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s3, v15, v[3:4]
-; GFX8-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v16
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v5, v1, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v12, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v4, v10, v19, s[0:1]
+; GFX8-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v9
+; GFX8-NEXT: v_mad_u64_u32 v[3:4], s[2:3], s3, v15, v[3:4]
+; GFX8-NEXT: v_cndmask_b32_e64 v0, v5, v1, s[0:1]
+; GFX8-NEXT: v_cndmask_b32_e64 v1, v6, v12, s[0:1]
+; GFX8-NEXT: v_cndmask_b32_e32 v4, v10, v19, vcc
 ; GFX8-NEXT: v_mul_lo_u32 v6, v14, v2
 ; GFX8-NEXT: v_mul_lo_u32 v9, v15, v3
-; GFX8-NEXT: v_cndmask_b32_e32 v4, v7, v4, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v4, v7, v4, s[0:1]
 ; GFX8-NEXT: v_mul_hi_u32 v7, v15, v2
-; GFX8-NEXT: v_cndmask_b32_e64 v5, v11, v20, s[0:1]
-; GFX8-NEXT: v_add_u32_e64 v6, s[0:1], v6, v9
-; GFX8-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[0:1]
-; GFX8-NEXT: v_add_u32_e64 v6, s[0:1], v6, v7
-; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[0:1]
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v11, v20, vcc
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v9
+; GFX8-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v7
+; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
 ; GFX8-NEXT: v_mul_lo_u32 v7, v14, v3
 ; GFX8-NEXT: v_mul_hi_u32 v2, v14, v2
-; GFX8-NEXT: v_add_u32_e64 v6, s[0:1], v9, v6
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, v9, v6
 ; GFX8-NEXT: v_mul_hi_u32 v9, v15, v3
-; GFX8-NEXT: v_add_u32_e64 v2, s[0:1], v7, v2
-; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[0:1]
-; GFX8-NEXT: v_add_u32_e64 v2, s[0:1], v2, v9
-; GFX8-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[0:1]
-; GFX8-NEXT: v_add_u32_e64 v7, s[0:1], v7, v9
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, v7, v2
+; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v9
+; GFX8-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v9
 ; GFX8-NEXT: v_mul_hi_u32 v3, v14, v3
-; GFX8-NEXT: v_add_u32_e64 v2, s[0:1], v2, v6
-; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[0:1]
-; GFX8-NEXT: v_add_u32_e64 v6, s[0:1], v7, v6
-; GFX8-NEXT: v_add_u32_e64 v3, s[0:1], v3, v6
-; GFX8-NEXT: v_add_u32_e64 v2, s[0:1], v15, v2
-; GFX8-NEXT: v_addc_u32_e64 v3, s[0:1], v14, v3, s[0:1]
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v6
+; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, v7, v6
+; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v6
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, v15, v2
+; GFX8-NEXT: v_addc_u32_e32 v3, vcc, v14, v3, vcc
 ; GFX8-NEXT: v_mul_lo_u32 v6, s11, v2
 ; GFX8-NEXT: v_mul_lo_u32 v7, s10, v3
-; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v5, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v5, v8, v5, s[0:1]
 ; GFX8-NEXT: v_mul_hi_u32 v8, s10, v2
 ; GFX8-NEXT: v_mul_hi_u32 v2, s11, v2
 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v7
@@ -1298,7 +1294,6 @@
 ; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v0
 ; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v1, vcc
 ; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v3, 0
-; GFX9-NEXT: v_mov_b32_e32 v7, s13
 ; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s2, v4, v[1:2]
 ; GFX9-NEXT: v_mul_hi_u32 v6, v3, v0
 ; GFX9-NEXT: s_sub_u32 s2, 0, s14
@@ -1330,7 +1325,7 @@
 ; GFX9-NEXT: v_mul_lo_u32 v3, s8, v1
 ; GFX9-NEXT: v_mul_hi_u32 v4, s8, v0
 ; GFX9-NEXT: v_mul_hi_u32 v0, s9, v0
-; GFX9-NEXT: v_mul_hi_u32 v5, s9, v1
+; GFX9-NEXT: v_mul_hi_u32 v7, s9, v1
 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v3
 ; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4
@@ -1344,152 +1339,151 @@
 ; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
 ; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v0, v2
 ; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s12, v6, 0
-; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
 ; GFX9-NEXT: v_add_u32_e32 v3, v4, v3
-; GFX9-NEXT: v_add3_u32 v8, v3, v0, v5
-; GFX9-NEXT: v_mov_b32_e32 v0, v2
-; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s12, v8, v[0:1]
-; GFX9-NEXT: v_mov_b32_e32 v4, s9
-; GFX9-NEXT: v_sub_co_u32_e32 v9, vcc, s8, v1
+; GFX9-NEXT: v_add3_u32 v7, v3, v5, v7
+; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s12, v7, v[2:3]
+; GFX9-NEXT: v_mov_b32_e32 v5, s9
+; GFX9-NEXT: v_sub_co_u32_e32 v8, vcc, s8, v1
 ; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s13, v6, v[2:3]
+; GFX9-NEXT: v_mov_b32_e32 v4, s13
 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
-; GFX9-NEXT: v_subb_co_u32_e64 v10, s[0:1], v4, v2, vcc
-; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s13, v10
+; GFX9-NEXT: v_subb_co_u32_e64 v9, s[0:1], v5, v2, vcc
+; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s13, v9
 ; GFX9-NEXT: v_sub_u32_e32 v1, s9, v2
 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[0:1]
-; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s12, v9
+; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s12, v8
 ; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[0:1]
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s13, v10
-; GFX9-NEXT: v_cndmask_b32_e64 v11, v2, v3, s[0:1]
+; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s13, v9
+; GFX9-NEXT: v_cndmask_b32_e64 v10, v2, v3, s[0:1]
 ; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s15
-; GFX9-NEXT: v_subb_co_u32_e32 v4, vcc, v1, v7, vcc
+; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v1, v4, vcc
 ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s14
 ; GFX9-NEXT: v_mul_f32_e32 v2, 0x4f800000, v2
-; GFX9-NEXT: v_subrev_co_u32_e32 v12, vcc, s12, v9
+; GFX9-NEXT: v_subrev_co_u32_e32 v11, vcc, s12, v8
 ; GFX9-NEXT: v_add_f32_e32 v1, v2, v1
 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1
-; GFX9-NEXT: v_subbrev_co_u32_e64 v13, s[0:1], 0, v4, vcc
-; GFX9-NEXT: v_add_co_u32_e64 v5, s[0:1], 1, v6
+; GFX9-NEXT: v_subbrev_co_u32_e64 v12, s[0:1], 0, v5, vcc
+; GFX9-NEXT: v_add_co_u32_e64 v13, s[0:1], 1, v6
 ; GFX9-NEXT: v_mul_f32_e32 v1, 0x5f7ffffc, v1
 ; GFX9-NEXT: v_mul_f32_e32 v2, 0x2f800000, v1
 ; GFX9-NEXT: v_trunc_f32_e32 v15, v2
 ; GFX9-NEXT: v_mul_f32_e32 v2, 0xcf800000, v15
 ; GFX9-NEXT: v_add_f32_e32 v1, v2, v1
 ; GFX9-NEXT: v_cvt_u32_f32_e32 v16, v1
-; GFX9-NEXT: v_addc_co_u32_e64 v14, s[0:1], 0, v8, s[0:1]
-; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s13, v13
+; GFX9-NEXT: v_addc_co_u32_e64 v14, s[0:1], 0, v7, s[0:1]
+; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s13, v12
 ; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[0:1]
-; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s12, v12
+; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s12, v11
 ; GFX9-NEXT: v_cndmask_b32_e64 v17, 0, -1, s[0:1]
 ; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s2, v16, 0
 ; GFX9-NEXT: v_cvt_u32_f32_e32 v15, v15
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s13, v13
+; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s13, v12
 ; GFX9-NEXT: v_cndmask_b32_e64 v17, v3, v17, s[0:1]
 ; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v15, v[2:3]
-; GFX9-NEXT: v_add_co_u32_e64 v18, s[0:1], 1, v5
+; GFX9-NEXT: v_add_co_u32_e64 v18, s[0:1], 1, v13
 ; GFX9-NEXT: v_addc_co_u32_e64 v19, s[0:1], 0, v14, s[0:1]
 ; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s3, v16, v[2:3]
-; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v4, v7, vcc
+; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v5, v4, vcc
 ; GFX9-NEXT: v_mul_lo_u32 v4, v15, v1
-; GFX9-NEXT: v_mul_lo_u32 v7, v16, v2
-; GFX9-NEXT: v_subrev_co_u32_e32 v20, vcc, s12, v12
+; GFX9-NEXT: v_mul_lo_u32 v5, v16, v2
+; GFX9-NEXT: v_subrev_co_u32_e32 v20, vcc, s12, v11
 ; GFX9-NEXT: v_subbrev_co_u32_e32 v21, vcc, 0, v3, vcc
 ; GFX9-NEXT: v_mul_hi_u32 v3, v16, v1
-; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v7
-; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v5
+; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
 ; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v4, v3
 ; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
 ; GFX9-NEXT: v_mul_lo_u32 v4, v15, v2
 ; GFX9-NEXT: v_mul_hi_u32 v1, v15, v1
-; GFX9-NEXT: v_add_u32_e32 v3, v7, v3
-; GFX9-NEXT: v_mul_hi_u32 v7, v16, v2
+; GFX9-NEXT: v_add_u32_e32 v3, v5, v3
+; GFX9-NEXT: v_mul_hi_u32 v5, v16, v2
 ; GFX9-NEXT: v_mul_hi_u32 v2, v15, v2
 ; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v4, v1
 ; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v7
-; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v5
+; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
 ; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v3
-; GFX9-NEXT: v_add_u32_e32 v4, v4, v7
+; GFX9-NEXT: v_add_u32_e32 v4, v4, v5
 ; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v16, v1
+; GFX9-NEXT: v_add_co_u32_e32 v16, vcc, v16, v1
 ; GFX9-NEXT: v_add3_u32 v2, v4, v3, v2
-; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s2, v7, 0
+; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s2, v16, 0
 ; GFX9-NEXT: v_addc_co_u32_e32 v15, vcc, v15, v2, vcc
 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v17
-; GFX9-NEXT: v_cndmask_b32_e32 v2, v5, v18, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v13, v18, vcc
 ; GFX9-NEXT: v_mov_b32_e32 v1, v4
 ; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s2, v15, v[1:2]
-; GFX9-NEXT: v_cndmask_b32_e32 v14, v14, v19, vcc
-; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11
-; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s3, v7, v[4:5]
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v2, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v2, v8, v14, vcc
-; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v17
+; GFX9-NEXT: v_cndmask_b32_e32 v13, v14, v19, vcc
+; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v10
+; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[2:3], s3, v16, v[4:5]
+; GFX9-NEXT: v_cndmask_b32_e64 v1, v6, v2, s[0:1]
+; GFX9-NEXT: v_cndmask_b32_e64 v2, v7, v13, s[0:1]
 ; GFX9-NEXT: v_mul_lo_u32 v6, v15, v3
-; GFX9-NEXT: v_mul_lo_u32 v8, v7, v4
-; GFX9-NEXT: v_cndmask_b32_e64 v5, v12, v20, s[0:1]
-; GFX9-NEXT: v_mul_hi_u32 v12, v7, v3
-; GFX9-NEXT: v_cndmask_b32_e64 v11, v13, v21, s[0:1]
-; GFX9-NEXT: v_add_co_u32_e64 v6, s[0:1], v6, v8
-; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[0:1]
-; GFX9-NEXT: v_add_co_u32_e64 v6, s[0:1], v6, v12
-; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[0:1]
-; GFX9-NEXT: v_mul_lo_u32 v12, v15, v4
+; GFX9-NEXT: v_mul_lo_u32 v7, v16, v4
+; GFX9-NEXT: v_cndmask_b32_e32 v5, v11, v20, vcc
+; GFX9-NEXT: v_mul_hi_u32 v11, v16, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v10, v12, v21, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v6, v7
+; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v6, v11
+; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
+; GFX9-NEXT: v_mul_lo_u32 v11, v15, v4
 ; GFX9-NEXT: v_mul_hi_u32 v3, v15, v3
-; GFX9-NEXT: v_add_u32_e32 v6, v8, v6
-; GFX9-NEXT: v_mul_hi_u32 v8, v7, v4
+; GFX9-NEXT: v_add_u32_e32 v6, v7, v6
+; GFX9-NEXT: v_mul_hi_u32 v7, v16, v4
 ; GFX9-NEXT: v_mul_hi_u32 v4, v15, v4
-; GFX9-NEXT: v_add_co_u32_e64 v3, s[0:1], v12, v3
-; GFX9-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[0:1]
-; GFX9-NEXT: v_add_co_u32_e64 v3, s[0:1], v3, v8
-; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[0:1]
-; GFX9-NEXT: v_add_co_u32_e64 v3, s[0:1], v3, v6
-; GFX9-NEXT: v_add_u32_e32 v8, v12, v8
-; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[0:1]
-; GFX9-NEXT: v_add3_u32 v4, v8, v6, v4
-; GFX9-NEXT: v_add_co_u32_e64 v3, s[0:1], v7, v3
-; GFX9-NEXT: v_addc_co_u32_e64 v4, s[0:1], v15, v4, s[0:1]
+; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v11, v3
+; GFX9-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v7
+; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v6
+; GFX9-NEXT: v_add_u32_e32 v7, v11, v7
+; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
+; GFX9-NEXT: v_add3_u32 v4, v7, v6, v4
+; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v16, v3
+; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v15, v4, vcc
 ; GFX9-NEXT: v_mul_lo_u32 v6, s11, v3
 ; GFX9-NEXT: v_mul_lo_u32 v7, s10, v4
+; GFX9-NEXT: v_cndmask_b32_e64 v5, v8, v5, s[0:1]
 ; GFX9-NEXT: v_mul_hi_u32 v8, s10, v3
 ; GFX9-NEXT: v_mul_hi_u32 v3, s11, v3
-; GFX9-NEXT: v_cndmask_b32_e32 v5, v9, v5, vcc
-; GFX9-NEXT: v_add_co_u32_e64 v6, s[0:1], v6, v7
-; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[0:1]
-; GFX9-NEXT: v_add_co_u32_e64 v6, s[0:1], v6, v8
-; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[0:1]
+; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v6, v7
+; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v6, v8
+; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
 ; GFX9-NEXT: v_mul_lo_u32 v8, s11, v4
 ; GFX9-NEXT: v_add_u32_e32 v6, v7, v6
 ; GFX9-NEXT: v_mul_hi_u32 v7, s10, v4
 ; GFX9-NEXT: v_mul_hi_u32 v13, s11, v4
-; GFX9-NEXT: v_add_co_u32_e64 v3, s[0:1], v8, v3
-; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[0:1]
-; GFX9-NEXT: v_add_co_u32_e64 v3, s[0:1], v3, v7
-; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[0:1]
-; GFX9-NEXT: v_add_co_u32_e64 v9, s[0:1], v3, v6
-; GFX9-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[0:1]
-; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s14, v9, 0
+; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v8, v3
+; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v7
+; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v11, vcc, v3, v6
+; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[2:3], s14, v11, 0
+; GFX9-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
 ; GFX9-NEXT: v_add_u32_e32 v7, v8, v7
-; GFX9-NEXT: v_cndmask_b32_e32 v6, v10, v11, vcc
-; GFX9-NEXT: v_add3_u32 v10, v7, v12, v13
-; GFX9-NEXT: v_mad_u64_u32 v[7:8], s[0:1], s14, v10, v[4:5]
-; GFX9-NEXT: v_mov_b32_e32 v11, s11
+; GFX9-NEXT: v_cndmask_b32_e64 v6, v9, v10, s[0:1]
+; GFX9-NEXT: v_add3_u32 v9, v7, v12, v13
+; GFX9-NEXT: v_mad_u64_u32 v[7:8], s[0:1], s14, v9, v[4:5]
+; GFX9-NEXT: v_mov_b32_e32 v10, s11
 ; GFX9-NEXT: v_mov_b32_e32 v4, s15
-; GFX9-NEXT: v_mad_u64_u32 v[7:8], s[0:1], s15, v9, v[7:8]
+; GFX9-NEXT: v_mad_u64_u32 v[7:8], s[0:1], s15, v11, v[7:8]
 ; GFX9-NEXT: v_sub_co_u32_e32 v8, vcc, s10, v3
-; GFX9-NEXT: v_subb_co_u32_e64 v11, s[0:1], v11, v7, vcc
-; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s15, v11
+; GFX9-NEXT: v_subb_co_u32_e64 v10, s[0:1], v10, v7, vcc
+; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s15, v10
 ; GFX9-NEXT: v_sub_u32_e32 v3, s11, v7
 ; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[0:1]
 ; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s14, v8
 ; GFX9-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[0:1]
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s15, v11
+; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s15, v10
 ; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v3, v4, vcc
 ; GFX9-NEXT: v_cndmask_b32_e64 v7, v7, v12, s[0:1]
 ; GFX9-NEXT: v_subrev_co_u32_e32 v12, vcc, s14, v8
 ; GFX9-NEXT: v_subbrev_co_u32_e64 v13, s[0:1], 0, v3, vcc
-; GFX9-NEXT: v_add_co_u32_e64 v14, s[0:1], 1, v9
-; GFX9-NEXT: v_addc_co_u32_e64 v15, s[0:1], 0, v10, s[0:1]
+; GFX9-NEXT: v_add_co_u32_e64 v14, s[0:1], 1, v11
+; GFX9-NEXT: v_addc_co_u32_e64 v15, s[0:1], 0, v9, s[0:1]
 ; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s15, v13
 ; GFX9-NEXT: v_cndmask_b32_e64 v16, 0, -1, s[0:1]
 ; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s14, v12
@@ -1505,12 +1499,12 @@
 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v14, v17, vcc
 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v15, v18, vcc
 ; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v7
-; GFX9-NEXT: v_cndmask_b32_e64 v3, v9, v3, s[0:1]
-; GFX9-NEXT: v_cndmask_b32_e64 v4, v10, v4, s[0:1]
+; GFX9-NEXT: v_cndmask_b32_e64 v3, v11, v3, s[0:1]
+; GFX9-NEXT: v_cndmask_b32_e64 v4, v9, v4, s[0:1]
 ; GFX9-NEXT: v_cndmask_b32_e32 v7, v12, v19, vcc
 ; GFX9-NEXT: v_cndmask_b32_e32 v9, v13, v20, vcc
 ; GFX9-NEXT: v_cndmask_b32_e64 v7, v8, v7, s[0:1]
-; GFX9-NEXT: v_cndmask_b32_e64 v8, v11, v9, s[0:1]
+; GFX9-NEXT: v_cndmask_b32_e64 v8, v10, v9, s[0:1]
 ; GFX9-NEXT: global_store_dwordx4 v0, v[1:4], s[4:5]
 ; GFX9-NEXT: global_store_dwordx4 v0, v[5:8], s[6:7]
 ; GFX9-NEXT: s_endpgm
@@ -1704,7 +1698,6 @@
 ; GFX10-NEXT: v_sub_co_u32 v8, vcc_lo, v9, s12
 ; GFX10-NEXT: v_subrev_co_ci_u32_e64 v12, s0, 0, v0, vcc_lo
 ; GFX10-NEXT: v_cmp_le_u32_e64 s0, s13, v11
-; GFX10-NEXT: v_subrev_co_ci_u32_e32 v0, vcc_lo, s13, v0, vcc_lo
 ; GFX10-NEXT: v_cndmask_b32_e64 v13, 0, -1, s0
 ; GFX10-NEXT: v_cmp_le_u32_e64 s0, s12, v8
 ; GFX10-NEXT: v_cndmask_b32_e64 v14, 0, -1, s0
@@ -1716,63 +1709,63 @@
 ; GFX10-NEXT: v_cndmask_b32_e64 v13, v13, v1, s0
 ; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s13, v12
 ; GFX10-NEXT: v_cndmask_b32_e64 v14, v15, v14, s0
-; GFX10-NEXT: v_add_co_u32 v6, s0, v2, v6
+; GFX10-NEXT: v_add_co_u32 v2, s0, v2, v6
 ; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0
-; GFX10-NEXT: v_add_co_u32 v15, s0, v16, 1
-; GFX10-NEXT: v_add_co_ci_u32_e64 v18, s0, 0, v17, s0
+; GFX10-NEXT: v_add_co_u32 v6, s0, v16, 1
+; GFX10-NEXT: v_add_co_ci_u32_e64 v15, s0, 0, v17, s0
 ; GFX10-NEXT: v_add3_u32 v3, v7, v1, v3
-; GFX10-NEXT: v_mad_u64_u32 v[1:2], s0, s14, v6, 0
-; GFX10-NEXT: v_mul_lo_u32 v19, s15, v6
+; GFX10-NEXT: v_subrev_co_ci_u32_e32 v7, vcc_lo, s13, v0, vcc_lo
+; GFX10-NEXT: v_mad_u64_u32 v[0:1], s0, s14, v2, 0
+; GFX10-NEXT: v_mul_lo_u32 v18, s14, v3
+; GFX10-NEXT: v_mul_lo_u32 v19, s15, v2
 ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14
-; GFX10-NEXT: v_mul_lo_u32 v7, s14, v3
-; GFX10-NEXT: v_cndmask_b32_e32 v15, v16, v15, vcc_lo
-; GFX10-NEXT: v_sub_co_u32 v16, s0, v8, s12
-; GFX10-NEXT: v_subrev_co_ci_u32_e64 v20, s0, 0, v0, s0
-; GFX10-NEXT: v_add3_u32 v2, v2, v7, v19
-; GFX10-NEXT: v_sub_co_u32 v7, s0, s10, v1
-; GFX10-NEXT: v_cndmask_b32_e32 v17, v17, v18, vcc_lo
-; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v13
-; GFX10-NEXT: v_sub_co_ci_u32_e64 v13, s1, s11, v2, s0
-; GFX10-NEXT: v_sub_nc_u32_e32 v2, s11, v2
-; GFX10-NEXT: v_cmp_ne_u32_e64 s1, 0, v14
-; GFX10-NEXT: v_cndmask_b32_e32 v0, v4, v15, vcc_lo
-; GFX10-NEXT: v_cmp_le_u32_e64 s2, s15, v13
-; GFX10-NEXT: v_cndmask_b32_e32 v1, v5, v17, vcc_lo
-; GFX10-NEXT: v_subrev_co_ci_u32_e64 v2, s0, s15, v2, s0
-; GFX10-NEXT: v_cmp_le_u32_e64 s0, s14, v7
-; GFX10-NEXT: v_cndmask_b32_e64 v4, v8, v16, s1
-; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, -1, s2
-; GFX10-NEXT: v_cndmask_b32_e64 v12, v12, v20, s1
-; GFX10-NEXT: v_cmp_eq_u32_e64 s1, s15, v13
-; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, -1, s0
-; GFX10-NEXT: v_sub_co_u32 v14, s0, v7, s14
-; GFX10-NEXT: v_subrev_co_ci_u32_e64 v15, s2, 0, v2, s0
-; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, v8, s1
-; GFX10-NEXT: v_cndmask_b32_e32 v4, v9, v4, vcc_lo
-; GFX10-NEXT: v_subrev_co_ci_u32_e64 v2, s0, s15, v2, s0
-; GFX10-NEXT: v_cmp_le_u32_e64 s1, s15, v15
-; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, -1, s1
-; GFX10-NEXT: v_cmp_le_u32_e64 s1, s14, v14
-; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, -1, s1
-; GFX10-NEXT: v_add_co_u32 v16, s1, v6, 1
-; GFX10-NEXT: v_add_co_ci_u32_e64 v17, s1, 0, v3, s1
-; GFX10-NEXT: v_cmp_eq_u32_e64 s1, s15, v15
-; GFX10-NEXT: v_cndmask_b32_e64 v8, v8, v9, s1
-; GFX10-NEXT: v_add_co_u32 v9, s1, v16, 1
-; GFX10-NEXT: v_add_co_ci_u32_e64 v18, s1, 0, v17, s1
-; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, v8
-; GFX10-NEXT: v_sub_co_u32 v8, s1, v14, s14
-; GFX10-NEXT: v_subrev_co_ci_u32_e64 v2, s1, 0, v2, s1
-; GFX10-NEXT: v_cndmask_b32_e64 v9, v16, v9, s0
-; GFX10-NEXT: v_cndmask_b32_e64 v16, v17, v18, s0
+; GFX10-NEXT: v_sub_co_u32 v14, s0, v8, s12
+; GFX10-NEXT: v_subrev_co_ci_u32_e64 v7, s0, 0, v7, s0
+; GFX10-NEXT: v_cndmask_b32_e32 v6, v16, v6, vcc_lo
+; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, v13
+; GFX10-NEXT: v_add3_u32 v16, v1, v18, v19
+; GFX10-NEXT: v_cndmask_b32_e32 v15, v17, v15, vcc_lo
+; GFX10-NEXT: v_sub_co_u32 v13, s1, s10, v0
+; GFX10-NEXT: v_cndmask_b32_e64 v0, v4, v6, s0
+; GFX10-NEXT: v_sub_nc_u32_e32 v4, s11, v16
+; GFX10-NEXT: v_sub_co_ci_u32_e64 v17, s2, s11, v16, s1
+; GFX10-NEXT: v_cndmask_b32_e64 v1, v5, v15, s0
+; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v14, vcc_lo
+; GFX10-NEXT: v_subrev_co_ci_u32_e64 v8, s1, s15, v4, s1
+; GFX10-NEXT: v_cmp_le_u32_e64 s1, s14, v13
+; GFX10-NEXT: v_cmp_le_u32_e64 s2, s15, v17
+; GFX10-NEXT: v_cndmask_b32_e32 v7, v12, v7, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, s15, v17
+; GFX10-NEXT: v_cndmask_b32_e64 v4, v9, v5, s0
+; GFX10-NEXT: v_cndmask_b32_e64 v14, 0, -1, s1
+; GFX10-NEXT: v_sub_co_u32 v15, s1, v13, s14
+; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, -1, s2
+; GFX10-NEXT: v_subrev_co_ci_u32_e64 v16, s2, 0, v8, s1
+; GFX10-NEXT: v_cndmask_b32_e32 v5, v6, v14, vcc_lo
+; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s15, v16
+; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc_lo
+; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s14, v15
+; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, -1, vcc_lo
+; GFX10-NEXT: v_add_co_u32 v12, vcc_lo, v2, 1
+; GFX10-NEXT: v_add_co_ci_u32_e32 v14, vcc_lo, 0, v3,
vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, s15, v16 +; GFX10-NEXT: v_cndmask_b32_e32 v6, v6, v9, vcc_lo +; GFX10-NEXT: v_add_co_u32 v9, vcc_lo, v12, 1 +; GFX10-NEXT: v_add_co_ci_u32_e32 v18, vcc_lo, 0, v14, vcc_lo +; GFX10-NEXT: v_subrev_co_ci_u32_e64 v8, vcc_lo, s15, v8, s1 +; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6 +; GFX10-NEXT: v_sub_co_u32 v6, s1, v15, s14 +; GFX10-NEXT: v_subrev_co_ci_u32_e64 v8, s1, 0, v8, s1 +; GFX10-NEXT: v_cndmask_b32_e32 v9, v12, v9, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v12, v14, v18, vcc_lo ; GFX10-NEXT: v_cmp_ne_u32_e64 s1, 0, v5 -; GFX10-NEXT: v_cndmask_b32_e64 v8, v14, v8, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v14, v15, v2, s0 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v11, v12, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v2, v6, v9, s1 -; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v16, s1 -; GFX10-NEXT: v_cndmask_b32_e64 v6, v7, v8, s1 -; GFX10-NEXT: v_cndmask_b32_e64 v7, v13, v14, s1 +; GFX10-NEXT: v_cndmask_b32_e32 v6, v15, v6, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v8, v16, v8, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v5, v11, v7, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v9, s1 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v12, s1 +; GFX10-NEXT: v_cndmask_b32_e64 v6, v13, v6, s1 +; GFX10-NEXT: v_cndmask_b32_e64 v7, v17, v8, s1 ; GFX10-NEXT: global_store_dwordx4 v10, v[0:3], s[4:5] ; GFX10-NEXT: global_store_dwordx4 v10, v[4:7], s[6:7] ; GFX10-NEXT: s_endpgm @@ -2034,17 +2027,17 @@ ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v1, s1 ; GFX10-NEXT: s_sub_i32 s3, 0, s2 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 +; GFX10-NEXT: s_sub_i32 s6, 0, s1 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v1, v1 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX10-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 ; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX10-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX10-NEXT: v_mul_lo_u32 v2, s3, v0 -; GFX10-NEXT: s_sub_i32 s3, 0, s1 -; GFX10-NEXT: v_mul_lo_u32 v3, s3, v1 ; GFX10-NEXT: s_bfe_u32 s3, s0, 0x80008 ; GFX10-NEXT: s_and_b32 s0, s0, 0xff +; GFX10-NEXT: v_mul_lo_u32 v3, s6, v1 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 ; GFX10-NEXT: v_mul_hi_u32 v2, v0, v2 ; GFX10-NEXT: v_mul_hi_u32 v3, v1, v3 ; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v2 @@ -2054,30 +2047,30 @@ ; GFX10-NEXT: v_mul_lo_u32 v2, v0, s2 ; GFX10-NEXT: v_add_nc_u32_e32 v4, 1, v0 ; GFX10-NEXT: v_mul_lo_u32 v3, v1, s1 -; GFX10-NEXT: v_add_nc_u32_e32 v6, 1, v1 +; GFX10-NEXT: v_add_nc_u32_e32 v5, 1, v1 ; GFX10-NEXT: v_sub_nc_u32_e32 v2, s3, v2 ; GFX10-NEXT: v_sub_nc_u32_e32 v3, s0, v3 -; GFX10-NEXT: v_subrev_nc_u32_e32 v5, s2, v2 +; GFX10-NEXT: v_subrev_nc_u32_e32 v6, s2, v2 ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s2, v2 ; GFX10-NEXT: v_cmp_le_u32_e64 s0, s1, v3 ; GFX10-NEXT: v_subrev_nc_u32_e32 v7, s1, v3 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v6, s0 +; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v5, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v7, s0 ; GFX10-NEXT: v_add_nc_u32_e32 v4, 1, v0 ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s2, v2 -; GFX10-NEXT: v_subrev_nc_u32_e32 v5, s2, v2 -; GFX10-NEXT: v_add_nc_u32_e32 v6, 1, v1 +; GFX10-NEXT: v_subrev_nc_u32_e32 v6, s2, v2 +; GFX10-NEXT: v_add_nc_u32_e32 v5, 1, v1 ; GFX10-NEXT: v_cmp_le_u32_e64 s0, s1, v3 ; GFX10-NEXT: v_subrev_nc_u32_e32 v7, s1, v3 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc_lo -; GFX10-NEXT: s_movk_i32 
s1, 0xff -; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v6, s0 +; GFX10-NEXT: v_mov_b32_e32 v4, 0xff +; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v5, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v7, s0 -; GFX10-NEXT: v_and_b32_sdwa v0, v0, s1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: v_and_b32_sdwa v2, v2, s1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_and_b32_sdwa v0, v0, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_and_b32_sdwa v2, v2, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX10-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD Index: llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i32.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i32.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i32.ll @@ -211,23 +211,23 @@ ; CHECK-LABEL: v_urem_i32_oddk_denom: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_mov_b32 s4, 0x12d8fb -; CHECK-NEXT: v_rcp_iflag_f32_e32 v1, 0x4996c7d8 -; CHECK-NEXT: v_mov_b32_e32 v2, 0xffed2705 -; CHECK-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 -; CHECK-NEXT: v_cvt_u32_f32_e32 v1, v1 -; CHECK-NEXT: v_mul_lo_u32 v2, v1, v2 -; CHECK-NEXT: v_mul_hi_u32 v2, v1, v2 -; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v2 -; CHECK-NEXT: v_mul_hi_u32 v1, v0, v1 -; CHECK-NEXT: v_mul_lo_u32 v1, v1, s4 -; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v1 -; CHECK-NEXT: v_subrev_i32_e32 v1, vcc, s4, v0 -; CHECK-NEXT: v_cmp_le_u32_e32 vcc, s4, v0 -; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; CHECK-NEXT: v_subrev_i32_e32 v1, vcc, s4, v0 -; CHECK-NEXT: v_cmp_le_u32_e32 vcc, s4, v0 -; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; CHECK-NEXT: v_mov_b32_e32 v1, 0x12d8fb +; CHECK-NEXT: v_rcp_iflag_f32_e32 v2, 0x4996c7d8 +; CHECK-NEXT: v_mov_b32_e32 v3, 0xffed2705 +; CHECK-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 +; CHECK-NEXT: v_cvt_u32_f32_e32 v2, v2 +; CHECK-NEXT: v_mul_lo_u32 v3, v2, v3 +; CHECK-NEXT: v_mul_hi_u32 v3, v2, v3 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3 +; CHECK-NEXT: v_mul_hi_u32 v2, v0, v2 +; CHECK-NEXT: v_mul_lo_u32 v2, v2, v1 +; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 +; CHECK-NEXT: v_subrev_i32_e32 v2, vcc, 0x12d8fb, v0 +; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v0, v1 +; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; CHECK-NEXT: v_subrev_i32_e32 v2, vcc, 0x12d8fb, v0 +; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v0, v1 +; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; CHECK-NEXT: s_setpc_b64 s[30:31] %result = urem i32 %num, 1235195 ret i32 %result @@ -237,7 +237,6 @@ ; GISEL-LABEL: v_urem_v2i32_oddk_denom: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: s_mov_b32 s4, 0x12d8fb ; GISEL-NEXT: v_mov_b32_e32 v2, 0x12d8fb ; GISEL-NEXT: v_cvt_f32_u32_e32 v3, 0x12d8fb ; GISEL-NEXT: v_mov_b32_e32 v4, 0xffed2705 @@ -249,19 +248,19 @@ ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v4 ; GISEL-NEXT: v_mul_hi_u32 v4, v0, v3 ; GISEL-NEXT: v_mul_hi_u32 v3, v1, v3 -; GISEL-NEXT: v_mul_lo_u32 v4, v4, s4 +; GISEL-NEXT: v_mul_lo_u32 v4, v4, v2 ; GISEL-NEXT: v_mul_lo_u32 v3, v3, v2 ; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v4 ; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v1, v3 -; GISEL-NEXT: v_subrev_i32_e32 
v3, vcc, s4, v0 +; GISEL-NEXT: v_subrev_i32_e32 v3, vcc, 0x12d8fb, v0 ; GISEL-NEXT: v_subrev_i32_e32 v4, vcc, 0x12d8fb, v1 -; GISEL-NEXT: v_cmp_le_u32_e32 vcc, s4, v0 +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2 ; GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v1, v2 ; GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc -; GISEL-NEXT: v_subrev_i32_e32 v3, vcc, s4, v0 +; GISEL-NEXT: v_subrev_i32_e32 v3, vcc, 0x12d8fb, v0 ; GISEL-NEXT: v_subrev_i32_e32 v4, vcc, 0x12d8fb, v1 -; GISEL-NEXT: v_cmp_le_u32_e32 vcc, s4, v0 +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2 ; GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v1, v2 ; GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc @@ -270,32 +269,32 @@ ; CGP-LABEL: v_urem_v2i32_oddk_denom: ; CGP: ; %bb.0: ; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CGP-NEXT: s_mov_b32 s4, 0x12d8fb -; CGP-NEXT: v_rcp_iflag_f32_e32 v2, 0x4996c7d8 -; CGP-NEXT: s_mov_b32 s5, 0xffed2705 -; CGP-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 -; CGP-NEXT: v_cvt_u32_f32_e32 v2, v2 -; CGP-NEXT: v_mul_lo_u32 v3, v2, s5 -; CGP-NEXT: v_mul_hi_u32 v3, v2, v3 -; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v3 -; CGP-NEXT: v_mul_hi_u32 v3, v0, v2 -; CGP-NEXT: v_mul_hi_u32 v2, v1, v2 -; CGP-NEXT: v_mul_lo_u32 v3, v3, s4 -; CGP-NEXT: v_mul_lo_u32 v2, v2, s4 -; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v3 -; CGP-NEXT: v_sub_i32_e32 v1, vcc, v1, v2 -; CGP-NEXT: v_subrev_i32_e32 v2, vcc, s4, v0 -; CGP-NEXT: v_subrev_i32_e32 v3, vcc, s4, v1 -; CGP-NEXT: v_cmp_le_u32_e32 vcc, s4, v0 -; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; CGP-NEXT: v_cmp_le_u32_e32 vcc, s4, v1 -; CGP-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; CGP-NEXT: v_subrev_i32_e32 v2, vcc, s4, v0 -; CGP-NEXT: v_subrev_i32_e32 v3, vcc, 0x12d8fb, v1 -; CGP-NEXT: v_cmp_le_u32_e32 vcc, s4, v0 -; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; CGP-NEXT: v_cmp_le_u32_e32 vcc, s4, v1 -; CGP-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; CGP-NEXT: v_mov_b32_e32 v2, 0x12d8fb +; CGP-NEXT: v_rcp_iflag_f32_e32 v3, 0x4996c7d8 +; CGP-NEXT: v_mov_b32_e32 v4, 0xffed2705 +; CGP-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 +; CGP-NEXT: v_cvt_u32_f32_e32 v3, v3 +; CGP-NEXT: v_mul_lo_u32 v4, v3, v4 +; CGP-NEXT: v_mul_hi_u32 v4, v3, v4 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v4 +; CGP-NEXT: v_mul_hi_u32 v4, v0, v3 +; CGP-NEXT: v_mul_hi_u32 v3, v1, v3 +; CGP-NEXT: v_mul_lo_u32 v4, v4, v2 +; CGP-NEXT: v_mul_lo_u32 v3, v3, v2 +; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v4 +; CGP-NEXT: v_sub_i32_e32 v1, vcc, v1, v3 +; CGP-NEXT: v_subrev_i32_e32 v3, vcc, 0x12d8fb, v0 +; CGP-NEXT: v_subrev_i32_e32 v4, vcc, 0x12d8fb, v1 +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2 +; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v2 +; CGP-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc +; CGP-NEXT: v_subrev_i32_e32 v3, vcc, 0x12d8fb, v0 +; CGP-NEXT: v_subrev_i32_e32 v4, vcc, 0x12d8fb, v1 +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2 +; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v2 +; CGP-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc ; CGP-NEXT: s_setpc_b64 s[30:31] %result = urem <2 x i32> %num, <i32 1235195, i32 1235195> ret <2 x i32> %result Index: llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll @@ -155,22 +155,22 @@ ; CHECK-NEXT: s_cbranch_execz .LBB0_2 ; CHECK-NEXT: .LBB0_4: ; CHECK-NEXT: v_rcp_iflag_f32_e32 v0, v6 -; CHECK-NEXT: 
v_sub_i32_e32 v1, vcc, 0, v2 +; CHECK-NEXT: v_mov_b32_e32 v1, 0 +; CHECK-NEXT: v_sub_i32_e32 v3, vcc, 0, v2 ; CHECK-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; CHECK-NEXT: v_cvt_u32_f32_e32 v0, v0 -; CHECK-NEXT: v_mul_lo_u32 v1, v1, v0 -; CHECK-NEXT: v_mul_hi_u32 v1, v0, v1 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; CHECK-NEXT: v_mul_lo_u32 v3, v3, v0 +; CHECK-NEXT: v_mul_hi_u32 v3, v0, v3 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v3 ; CHECK-NEXT: v_mul_hi_u32 v0, v4, v0 ; CHECK-NEXT: v_mul_lo_u32 v0, v0, v2 ; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v4, v0 -; CHECK-NEXT: v_sub_i32_e32 v1, vcc, v0, v2 +; CHECK-NEXT: v_sub_i32_e32 v3, vcc, v0, v2 ; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2 -; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; CHECK-NEXT: v_sub_i32_e32 v1, vcc, v0, v2 +; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc +; CHECK-NEXT: v_sub_i32_e32 v3, vcc, v0, v2 ; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2 -; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; CHECK-NEXT: v_mov_b32_e32 v1, 0 +; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; CHECK-NEXT: s_or_b64 exec, exec, s[4:5] ; CHECK-NEXT: s_setpc_b64 s[30:31] %result = urem i64 %num, %den @@ -756,22 +756,22 @@ ; CGP-NEXT: s_cbranch_execz .LBB2_4 ; CGP-NEXT: ; %bb.3: ; CGP-NEXT: v_rcp_iflag_f32_e32 v0, v2 -; CGP-NEXT: v_sub_i32_e32 v1, vcc, 0, v4 +; CGP-NEXT: v_mov_b32_e32 v1, 0 +; CGP-NEXT: v_sub_i32_e32 v2, vcc, 0, v4 ; CGP-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; CGP-NEXT: v_cvt_u32_f32_e32 v0, v0 -; CGP-NEXT: v_mul_lo_u32 v1, v1, v0 -; CGP-NEXT: v_mul_hi_u32 v1, v0, v1 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; CGP-NEXT: v_mul_lo_u32 v2, v2, v0 +; CGP-NEXT: v_mul_hi_u32 v2, v0, v2 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; CGP-NEXT: v_mul_hi_u32 v0, v10, v0 ; CGP-NEXT: v_mul_lo_u32 v0, v0, v4 ; CGP-NEXT: v_sub_i32_e32 v0, vcc, v10, v0 -; CGP-NEXT: v_sub_i32_e32 v1, vcc, v0, v4 +; CGP-NEXT: v_sub_i32_e32 v2, vcc, v0, v4 ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v4 -; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; CGP-NEXT: v_sub_i32_e32 v1, vcc, v0, v4 +; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; CGP-NEXT: v_sub_i32_e32 v2, vcc, v0, v4 ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v4 -; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; CGP-NEXT: v_mov_b32_e32 v1, 0 +; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; CGP-NEXT: .LBB2_4: ; CGP-NEXT: s_or_b64 exec, exec, s[4:5] ; CGP-NEXT: v_or_b32_e32 v3, v9, v7 @@ -919,22 +919,22 @@ ; CGP-NEXT: s_cbranch_execz .LBB2_6 ; CGP-NEXT: .LBB2_8: ; CGP-NEXT: v_rcp_iflag_f32_e32 v2, v4 -; CGP-NEXT: v_sub_i32_e32 v3, vcc, 0, v6 +; CGP-NEXT: v_mov_b32_e32 v3, 0 +; CGP-NEXT: v_sub_i32_e32 v4, vcc, 0, v6 ; CGP-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 ; CGP-NEXT: v_cvt_u32_f32_e32 v2, v2 -; CGP-NEXT: v_mul_lo_u32 v3, v3, v2 -; CGP-NEXT: v_mul_hi_u32 v3, v2, v3 -; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v3 +; CGP-NEXT: v_mul_lo_u32 v4, v4, v2 +; CGP-NEXT: v_mul_hi_u32 v4, v2, v4 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; CGP-NEXT: v_mul_hi_u32 v2, v8, v2 ; CGP-NEXT: v_mul_lo_u32 v2, v2, v6 ; CGP-NEXT: v_sub_i32_e32 v2, vcc, v8, v2 -; CGP-NEXT: v_sub_i32_e32 v3, vcc, v2, v6 +; CGP-NEXT: v_sub_i32_e32 v4, vcc, v2, v6 ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v2, v6 -; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc -; CGP-NEXT: v_sub_i32_e32 v3, vcc, v2, v6 +; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; CGP-NEXT: v_sub_i32_e32 v4, vcc, v2, v6 ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v2, v6 -; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc -; CGP-NEXT: v_mov_b32_e32 v3, 0 +; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; CGP-NEXT: 
s_or_b64 exec, exec, s[4:5] ; CGP-NEXT: s_setpc_b64 s[30:31] %result = urem <2 x i64> %num, %den @@ -969,77 +969,76 @@ ; CHECK-LABEL: v_urem_i64_oddk_denom: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_mov_b32 s4, 0x12d8fb ; CHECK-NEXT: v_mov_b32_e32 v2, 0x12d8fb ; CHECK-NEXT: v_cvt_f32_u32_e32 v3, 0x12d8fb ; CHECK-NEXT: v_cvt_f32_ubyte0_e32 v4, 0 -; CHECK-NEXT: s_mov_b32 s5, 0xffed2705 -; CHECK-NEXT: s_bfe_i32 s6, 1, 0x10000 +; CHECK-NEXT: v_mov_b32_e32 v5, 0xffed2705 +; CHECK-NEXT: s_bfe_i32 s4, 1, 0x10000 ; CHECK-NEXT: v_mac_f32_e32 v3, 0x4f800000, v4 -; CHECK-NEXT: v_mov_b32_e32 v4, s6 +; CHECK-NEXT: v_mov_b32_e32 v4, s4 ; CHECK-NEXT: v_rcp_iflag_f32_e32 v3, v3 ; CHECK-NEXT: v_mul_f32_e32 v3, 0x5f7ffffc, v3 -; CHECK-NEXT: v_mul_f32_e32 v5, 0x2f800000, v3 -; CHECK-NEXT: v_trunc_f32_e32 v5, v5 -; CHECK-NEXT: v_mac_f32_e32 v3, 0xcf800000, v5 -; CHECK-NEXT: v_cvt_u32_f32_e32 v5, v5 +; CHECK-NEXT: v_mul_f32_e32 v6, 0x2f800000, v3 +; CHECK-NEXT: v_trunc_f32_e32 v6, v6 +; CHECK-NEXT: v_mac_f32_e32 v3, 0xcf800000, v6 +; CHECK-NEXT: v_cvt_u32_f32_e32 v6, v6 ; CHECK-NEXT: v_cvt_u32_f32_e32 v3, v3 -; CHECK-NEXT: v_mul_lo_u32 v6, v5, s5 -; CHECK-NEXT: v_mul_lo_u32 v7, v3, s5 -; CHECK-NEXT: v_mul_hi_u32 v8, s5, v3 -; CHECK-NEXT: v_sub_i32_e32 v6, vcc, v6, v3 -; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v8 -; CHECK-NEXT: v_mul_lo_u32 v8, v5, v7 -; CHECK-NEXT: v_mul_hi_u32 v9, v3, v7 -; CHECK-NEXT: v_mul_hi_u32 v7, v5, v7 -; CHECK-NEXT: v_mul_lo_u32 v10, v3, v6 -; CHECK-NEXT: v_mul_lo_u32 v11, v5, v6 -; CHECK-NEXT: v_mul_hi_u32 v12, v3, v6 -; CHECK-NEXT: v_mul_hi_u32 v6, v5, v6 -; CHECK-NEXT: v_add_i32_e32 v8, vcc, v8, v10 -; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v7, vcc, v11, v7 +; CHECK-NEXT: v_mul_lo_u32 v7, v6, v5 +; CHECK-NEXT: v_mul_lo_u32 v8, v3, v5 +; CHECK-NEXT: v_mul_hi_u32 v9, v5, v3 +; CHECK-NEXT: v_sub_i32_e32 v7, vcc, v7, v3 +; CHECK-NEXT: v_add_i32_e32 v7, vcc, v7, v9 +; CHECK-NEXT: v_mul_lo_u32 v9, v6, v8 +; CHECK-NEXT: v_mul_hi_u32 v10, v3, v8 +; CHECK-NEXT: v_mul_hi_u32 v8, v6, v8 +; CHECK-NEXT: v_mul_lo_u32 v11, v3, v7 +; CHECK-NEXT: v_mul_lo_u32 v12, v6, v7 +; CHECK-NEXT: v_mul_hi_u32 v13, v3, v7 +; CHECK-NEXT: v_mul_hi_u32 v7, v6, v7 +; CHECK-NEXT: v_add_i32_e32 v9, vcc, v9, v11 ; CHECK-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v8, vcc, v8, v9 -; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v7, vcc, v7, v12 +; CHECK-NEXT: v_add_i32_e32 v8, vcc, v12, v8 +; CHECK-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v9, vcc, v9, v10 ; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v8, vcc, v10, v8 +; CHECK-NEXT: v_add_i32_e32 v8, vcc, v8, v13 +; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v9, vcc, v11, v9 -; CHECK-NEXT: v_add_i32_e32 v7, vcc, v7, v8 -; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v8, vcc, v9, v8 -; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v8 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v7 -; CHECK-NEXT: v_addc_u32_e32 v5, vcc, v5, v6, vcc -; CHECK-NEXT: v_mul_lo_u32 v6, v3, s5 -; CHECK-NEXT: v_mul_hi_u32 v7, s5, v3 -; CHECK-NEXT: v_mul_lo_u32 v8, v5, s5 -; CHECK-NEXT: v_mul_lo_u32 v9, v5, v6 -; CHECK-NEXT: v_mul_hi_u32 v10, v3, v6 -; CHECK-NEXT: v_mul_hi_u32 v6, v5, v6 -; CHECK-NEXT: v_sub_i32_e32 v8, vcc, v8, v3 -; CHECK-NEXT: v_add_i32_e32 v7, vcc, v8, v7 -; CHECK-NEXT: v_mul_lo_u32 v8, v3, v7 -; CHECK-NEXT: v_mul_lo_u32 v11, v5, v7 -; 
CHECK-NEXT: v_mul_hi_u32 v12, v3, v7 -; CHECK-NEXT: v_mul_hi_u32 v7, v5, v7 +; CHECK-NEXT: v_add_i32_e32 v10, vcc, v12, v10 +; CHECK-NEXT: v_add_i32_e32 v8, vcc, v8, v9 +; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v9, vcc, v10, v9 +; CHECK-NEXT: v_add_i32_e32 v7, vcc, v7, v9 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v8 +; CHECK-NEXT: v_addc_u32_e32 v6, vcc, v6, v7, vcc +; CHECK-NEXT: v_mul_lo_u32 v7, v3, v5 +; CHECK-NEXT: v_mul_hi_u32 v8, v5, v3 +; CHECK-NEXT: v_mul_lo_u32 v5, v6, v5 +; CHECK-NEXT: v_mul_lo_u32 v9, v6, v7 +; CHECK-NEXT: v_mul_hi_u32 v10, v3, v7 +; CHECK-NEXT: v_mul_hi_u32 v7, v6, v7 +; CHECK-NEXT: v_sub_i32_e32 v5, vcc, v5, v3 +; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v8 +; CHECK-NEXT: v_mul_lo_u32 v8, v3, v5 +; CHECK-NEXT: v_mul_lo_u32 v11, v6, v5 +; CHECK-NEXT: v_mul_hi_u32 v12, v3, v5 +; CHECK-NEXT: v_mul_hi_u32 v5, v6, v5 ; CHECK-NEXT: v_add_i32_e32 v8, vcc, v9, v8 ; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v6, vcc, v11, v6 +; CHECK-NEXT: v_add_i32_e32 v7, vcc, v11, v7 ; CHECK-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v8, vcc, v8, v10 ; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v12 +; CHECK-NEXT: v_add_i32_e32 v7, vcc, v7, v12 ; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v8, vcc, v9, v8 ; CHECK-NEXT: v_add_i32_e32 v9, vcc, v11, v10 -; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v8 +; CHECK-NEXT: v_add_i32_e32 v7, vcc, v7, v8 ; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v8, vcc, v9, v8 -; CHECK-NEXT: v_add_i32_e32 v7, vcc, v7, v8 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v6 -; CHECK-NEXT: v_addc_u32_e32 v5, vcc, v5, v7, vcc +; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v8 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v7 +; CHECK-NEXT: v_addc_u32_e32 v5, vcc, v6, v5, vcc ; CHECK-NEXT: v_mul_lo_u32 v6, v1, v3 ; CHECK-NEXT: v_mul_hi_u32 v7, v0, v3 ; CHECK-NEXT: v_mul_hi_u32 v3, v1, v3 @@ -1060,10 +1059,10 @@ ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v6 ; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v6, vcc, v7, v6 -; CHECK-NEXT: v_mul_lo_u32 v7, v3, s4 -; CHECK-NEXT: v_mul_hi_u32 v3, s4, v3 +; CHECK-NEXT: v_mul_lo_u32 v7, v3, v2 +; CHECK-NEXT: v_mul_hi_u32 v3, v2, v3 ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v6 -; CHECK-NEXT: v_mul_lo_u32 v5, v5, s4 +; CHECK-NEXT: v_mul_lo_u32 v5, v5, v2 ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v5, v3 ; CHECK-NEXT: v_sub_i32_e64 v0, s[4:5], v0, v7 ; CHECK-NEXT: v_subb_u32_e64 v5, vcc, v1, v3, s[4:5] @@ -1097,215 +1096,206 @@ ; GISEL-LABEL: v_urem_v2i64_oddk_denom: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: s_mov_b32 s4, 0x12d8fb ; GISEL-NEXT: v_mov_b32_e32 v4, 0x12d8fb -; GISEL-NEXT: v_cvt_f32_u32_e32 v6, 0x12d8fb -; GISEL-NEXT: v_cvt_f32_ubyte0_e32 v5, 0 -; GISEL-NEXT: s_sub_u32 s5, 0, 0x12d8fb -; GISEL-NEXT: v_madmk_f32 v7, v5, 0x4f800000, v6 -; GISEL-NEXT: s_subb_u32 s6, 0, 0 -; GISEL-NEXT: s_bfe_i32 s7, 1, 0x10000 -; GISEL-NEXT: v_mac_f32_e32 v6, 0x4f800000, v5 -; GISEL-NEXT: v_rcp_iflag_f32_e32 v7, v7 -; GISEL-NEXT: v_mov_b32_e32 v5, s7 -; GISEL-NEXT: v_rcp_iflag_f32_e32 v6, v6 -; GISEL-NEXT: s_sub_u32 s7, 0, 0x12d8fb -; GISEL-NEXT: v_mul_f32_e32 v7, 0x5f7ffffc, v7 +; GISEL-NEXT: v_cvt_f32_u32_e32 v5, 0x12d8fb +; GISEL-NEXT: v_cvt_f32_ubyte0_e32 v6, 0 +; GISEL-NEXT: s_sub_u32 s4, 0, 0x12d8fb +; GISEL-NEXT: v_mac_f32_e32 v5, 0x4f800000, v6 +; GISEL-NEXT: s_subb_u32 s5, 0, 0 +; GISEL-NEXT: 
s_bfe_i32 s6, 1, 0x10000 +; GISEL-NEXT: v_rcp_iflag_f32_e32 v6, v5 +; GISEL-NEXT: v_mov_b32_e32 v5, s6 +; GISEL-NEXT: s_sub_u32 s6, 0, 0x12d8fb ; GISEL-NEXT: v_mul_f32_e32 v6, 0x5f7ffffc, v6 -; GISEL-NEXT: s_subb_u32 s8, 0, 0 -; GISEL-NEXT: s_bfe_i32 s9, 1, 0x10000 -; GISEL-NEXT: v_mul_f32_e32 v8, 0x2f800000, v7 -; GISEL-NEXT: v_mul_f32_e32 v9, 0x2f800000, v6 -; GISEL-NEXT: v_mov_b32_e32 v10, s9 -; GISEL-NEXT: v_trunc_f32_e32 v8, v8 -; GISEL-NEXT: v_trunc_f32_e32 v9, v9 -; GISEL-NEXT: v_mac_f32_e32 v7, 0xcf800000, v8 -; GISEL-NEXT: v_cvt_u32_f32_e32 v8, v8 -; GISEL-NEXT: v_mac_f32_e32 v6, 0xcf800000, v9 -; GISEL-NEXT: v_cvt_u32_f32_e32 v9, v9 +; GISEL-NEXT: s_subb_u32 s7, 0, 0 +; GISEL-NEXT: s_bfe_i32 s8, 1, 0x10000 +; GISEL-NEXT: v_mul_f32_e32 v7, 0x2f800000, v6 +; GISEL-NEXT: v_mov_b32_e32 v8, s8 +; GISEL-NEXT: v_trunc_f32_e32 v7, v7 +; GISEL-NEXT: v_mac_f32_e32 v6, 0xcf800000, v7 ; GISEL-NEXT: v_cvt_u32_f32_e32 v7, v7 -; GISEL-NEXT: v_mul_lo_u32 v11, s5, v8 ; GISEL-NEXT: v_cvt_u32_f32_e32 v6, v6 -; GISEL-NEXT: v_mul_lo_u32 v12, s7, v9 -; GISEL-NEXT: v_mul_lo_u32 v13, s5, v7 -; GISEL-NEXT: v_mul_lo_u32 v14, s6, v7 -; GISEL-NEXT: v_mul_hi_u32 v15, s5, v7 -; GISEL-NEXT: v_mul_lo_u32 v16, s7, v6 -; GISEL-NEXT: v_mul_lo_u32 v17, s8, v6 -; GISEL-NEXT: v_mul_hi_u32 v18, s7, v6 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v14, v11 -; GISEL-NEXT: v_mul_lo_u32 v14, v8, v13 -; GISEL-NEXT: v_mul_hi_u32 v19, v7, v13 -; GISEL-NEXT: v_mul_hi_u32 v13, v8, v13 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v17, v12 -; GISEL-NEXT: v_mul_lo_u32 v17, v9, v16 -; GISEL-NEXT: v_mul_hi_u32 v20, v6, v16 -; GISEL-NEXT: v_mul_hi_u32 v16, v9, v16 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v15 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v18 -; GISEL-NEXT: v_mul_lo_u32 v15, v7, v11 -; GISEL-NEXT: v_mul_lo_u32 v18, v8, v11 -; GISEL-NEXT: v_mul_hi_u32 v21, v7, v11 -; GISEL-NEXT: v_mul_hi_u32 v11, v8, v11 -; GISEL-NEXT: v_mul_lo_u32 v22, v6, v12 -; GISEL-NEXT: v_mul_lo_u32 v23, v9, v12 -; GISEL-NEXT: v_mul_hi_u32 v24, v6, v12 -; GISEL-NEXT: v_mul_hi_u32 v12, v9, v12 -; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v15 -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v18, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v17, vcc, v17, v22 -; GISEL-NEXT: v_cndmask_b32_e64 v22, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v16, vcc, v23, v16 -; GISEL-NEXT: v_cndmask_b32_e64 v23, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v19 -; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v21 -; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v17, vcc, v17, v20 -; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v16, vcc, v16, v24 +; GISEL-NEXT: v_mul_lo_u32 v9, s4, v7 +; GISEL-NEXT: v_mul_lo_u32 v10, s6, v7 +; GISEL-NEXT: v_mul_lo_u32 v11, s4, v6 +; GISEL-NEXT: v_mul_lo_u32 v12, s5, v6 +; GISEL-NEXT: v_mul_hi_u32 v13, s4, v6 +; GISEL-NEXT: v_mul_lo_u32 v14, s6, v6 +; GISEL-NEXT: v_mul_lo_u32 v15, s7, v6 +; GISEL-NEXT: v_mul_hi_u32 v16, s6, v6 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v12, v9 +; GISEL-NEXT: v_mul_lo_u32 v12, v7, v11 +; GISEL-NEXT: v_mul_hi_u32 v17, v6, v11 +; GISEL-NEXT: v_mul_hi_u32 v11, v7, v11 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v15, v10 +; GISEL-NEXT: v_mul_lo_u32 v15, v7, v14 +; GISEL-NEXT: v_mul_hi_u32 v18, v6, v14 +; GISEL-NEXT: v_mul_hi_u32 v14, v7, v14 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v13 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v16 +; GISEL-NEXT: 
v_mul_lo_u32 v13, v6, v9 +; GISEL-NEXT: v_mul_lo_u32 v16, v7, v9 +; GISEL-NEXT: v_mul_hi_u32 v19, v6, v9 +; GISEL-NEXT: v_mul_hi_u32 v9, v7, v9 +; GISEL-NEXT: v_mul_lo_u32 v20, v6, v10 +; GISEL-NEXT: v_mul_lo_u32 v21, v7, v10 +; GISEL-NEXT: v_mul_hi_u32 v22, v6, v10 +; GISEL-NEXT: v_mul_hi_u32 v10, v7, v10 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v13 +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v16, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v20 ; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v14, vcc, v15, v14 -; GISEL-NEXT: v_add_i32_e32 v15, vcc, v18, v19 -; GISEL-NEXT: v_add_i32_e32 v17, vcc, v22, v17 -; GISEL-NEXT: v_add_i32_e32 v18, vcc, v23, v20 -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14 -; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v16, vcc, v16, v17 +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v21, v14 +; GISEL-NEXT: v_cndmask_b32_e64 v21, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v17 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v19 ; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v14, vcc, v15, v14 -; GISEL-NEXT: v_add_i32_e32 v15, vcc, v18, v17 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v14 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v15 -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v13 -; GISEL-NEXT: v_addc_u32_e32 v8, vcc, v8, v11, vcc -; GISEL-NEXT: v_mul_lo_u32 v11, s5, v7 -; GISEL-NEXT: v_mul_lo_u32 v13, s6, v7 -; GISEL-NEXT: v_mul_hi_u32 v14, s5, v7 -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v16 -; GISEL-NEXT: v_addc_u32_e32 v9, vcc, v9, v12, vcc -; GISEL-NEXT: v_mul_lo_u32 v12, s7, v6 -; GISEL-NEXT: v_mul_lo_u32 v15, s8, v6 -; GISEL-NEXT: v_mul_hi_u32 v16, s7, v6 -; GISEL-NEXT: v_mul_lo_u32 v17, s5, v8 -; GISEL-NEXT: v_mul_lo_u32 v18, v8, v11 -; GISEL-NEXT: v_mul_hi_u32 v19, v7, v11 -; GISEL-NEXT: v_mul_hi_u32 v11, v8, v11 -; GISEL-NEXT: v_mul_lo_u32 v20, s7, v9 -; GISEL-NEXT: v_mul_lo_u32 v21, v9, v12 -; GISEL-NEXT: v_mul_hi_u32 v22, v6, v12 +; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v18 +; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v22 +; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v13, v12 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v16, v17 +; GISEL-NEXT: v_add_i32_e32 v15, vcc, v20, v15 +; GISEL-NEXT: v_add_i32_e32 v16, vcc, v21, v18 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v15 +; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v13, v12 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v16, v15 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v12 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v13 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v6, v11 +; GISEL-NEXT: v_addc_u32_e32 v9, vcc, v7, v9, vcc +; GISEL-NEXT: v_mul_lo_u32 v12, s4, v11 +; GISEL-NEXT: v_mul_lo_u32 v13, s5, v11 +; GISEL-NEXT: v_mul_hi_u32 v15, s4, v11 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v14 +; GISEL-NEXT: v_addc_u32_e32 v7, vcc, v7, v10, vcc +; GISEL-NEXT: v_mul_lo_u32 v10, s6, v6 +; GISEL-NEXT: v_mul_lo_u32 v14, s7, v6 +; GISEL-NEXT: v_mul_hi_u32 v16, s6, v6 +; GISEL-NEXT: v_mul_lo_u32 v17, s4, v9 +; GISEL-NEXT: v_mul_lo_u32 v18, v9, v12 +; GISEL-NEXT: v_mul_hi_u32 v19, v11, v12 ; GISEL-NEXT: v_mul_hi_u32 v12, v9, v12 +; GISEL-NEXT: v_mul_lo_u32 v20, s6, v7 +; GISEL-NEXT: 
v_mul_lo_u32 v21, v7, v10 +; GISEL-NEXT: v_mul_hi_u32 v22, v6, v10 +; GISEL-NEXT: v_mul_hi_u32 v10, v7, v10 ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v17 -; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v20 -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14 -; GISEL-NEXT: v_add_i32_e32 v14, vcc, v15, v16 -; GISEL-NEXT: v_mul_lo_u32 v15, v7, v13 -; GISEL-NEXT: v_mul_lo_u32 v16, v8, v13 -; GISEL-NEXT: v_mul_hi_u32 v17, v7, v13 -; GISEL-NEXT: v_mul_hi_u32 v13, v8, v13 +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v20 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v15 +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v16 +; GISEL-NEXT: v_mul_lo_u32 v15, v11, v13 +; GISEL-NEXT: v_mul_lo_u32 v16, v9, v13 +; GISEL-NEXT: v_mul_hi_u32 v17, v11, v13 +; GISEL-NEXT: v_mul_hi_u32 v13, v9, v13 ; GISEL-NEXT: v_mul_lo_u32 v20, v6, v14 -; GISEL-NEXT: v_mul_lo_u32 v23, v9, v14 +; GISEL-NEXT: v_mul_lo_u32 v23, v7, v14 ; GISEL-NEXT: v_mul_hi_u32 v24, v6, v14 -; GISEL-NEXT: v_mul_hi_u32 v14, v9, v14 +; GISEL-NEXT: v_mul_hi_u32 v14, v7, v14 ; GISEL-NEXT: v_add_i32_e32 v15, vcc, v18, v15 ; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v16, v11 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v16, v12 ; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v20, vcc, v21, v20 ; GISEL-NEXT: v_cndmask_b32_e64 v21, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v23, v12 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v23, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v23, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v19 ; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v17 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v17 ; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v19, vcc, v20, v22 ; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v24 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v24 ; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v15, vcc, v18, v15 ; GISEL-NEXT: v_add_i32_e32 v16, vcc, v16, v17 ; GISEL-NEXT: v_add_i32_e32 v17, vcc, v21, v19 ; GISEL-NEXT: v_add_i32_e32 v18, vcc, v23, v20 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v15 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v15 ; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v17 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v17 ; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v15, vcc, v16, v15 ; GISEL-NEXT: v_add_i32_e32 v16, vcc, v18, v17 ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v15 ; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v16 -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v11 -; GISEL-NEXT: v_addc_u32_e32 v8, vcc, v8, v13, vcc -; GISEL-NEXT: v_mul_lo_u32 v11, v1, v7 -; GISEL-NEXT: v_mul_hi_u32 v13, v0, v7 -; GISEL-NEXT: v_mul_hi_u32 v7, v1, v7 -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v12 -; GISEL-NEXT: v_addc_u32_e32 v9, vcc, v9, v14, vcc -; GISEL-NEXT: v_mul_lo_u32 v12, v3, v6 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12 +; GISEL-NEXT: v_addc_u32_e32 v9, vcc, v9, v13, vcc +; GISEL-NEXT: v_mul_lo_u32 v12, v1, v11 +; GISEL-NEXT: v_mul_hi_u32 v13, v0, v11 +; GISEL-NEXT: v_mul_hi_u32 v11, v1, v11 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v10 +; GISEL-NEXT: v_addc_u32_e32 v7, vcc, v7, v14, vcc +; GISEL-NEXT: v_mul_lo_u32 v10, v3, v6 ; GISEL-NEXT: v_mul_hi_u32 v14, v2, v6 ; GISEL-NEXT: v_mul_hi_u32 v6, v3, v6 -; GISEL-NEXT: v_mul_lo_u32 v15, v0, v8 -; GISEL-NEXT: v_mul_lo_u32 v16, v1, v8 -; GISEL-NEXT: v_mul_hi_u32 v17, v0, v8 -; GISEL-NEXT: 
v_mul_hi_u32 v8, v1, v8 -; GISEL-NEXT: v_mul_lo_u32 v18, v2, v9 -; GISEL-NEXT: v_mul_lo_u32 v19, v3, v9 -; GISEL-NEXT: v_mul_hi_u32 v20, v2, v9 -; GISEL-NEXT: v_mul_hi_u32 v9, v3, v9 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v15 +; GISEL-NEXT: v_mul_lo_u32 v15, v0, v9 +; GISEL-NEXT: v_mul_lo_u32 v16, v1, v9 +; GISEL-NEXT: v_mul_hi_u32 v17, v0, v9 +; GISEL-NEXT: v_mul_hi_u32 v9, v1, v9 +; GISEL-NEXT: v_mul_lo_u32 v18, v2, v7 +; GISEL-NEXT: v_mul_lo_u32 v19, v3, v7 +; GISEL-NEXT: v_mul_hi_u32 v20, v2, v7 +; GISEL-NEXT: v_mul_hi_u32 v7, v3, v7 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v15 ; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v16, v7 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v16, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v18 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v18 ; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v6, vcc, v19, v6 ; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v17 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v14 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v13 ; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v17 +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v14 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v20 ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v15, v11 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v15, v12 ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v16, v13 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v18, v12 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v18, v10 ; GISEL-NEXT: v_add_i32_e32 v14, vcc, v19, v14 -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v12 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12 ; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v13, v11 -; GISEL-NEXT: v_mul_lo_u32 v13, v7, s4 -; GISEL-NEXT: v_mul_hi_u32 v7, s4, v7 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v14, v12 -; GISEL-NEXT: v_mul_lo_u32 v14, v6, s4 -; GISEL-NEXT: v_mul_hi_u32 v6, s4, v6 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v11 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v10 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v13, v12 +; GISEL-NEXT: v_mul_lo_u32 v13, v11, v4 +; GISEL-NEXT: v_mul_hi_u32 v11, v4, v11 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v14, v10 +; GISEL-NEXT: v_mul_lo_u32 v14, v6, v4 +; GISEL-NEXT: v_mul_hi_u32 v6, v4, v6 ; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v12 -; GISEL-NEXT: v_mul_lo_u32 v8, v8, s4 -; GISEL-NEXT: v_mul_lo_u32 v9, v9, s4 -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v8, v7 -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v9, v6 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v10 +; GISEL-NEXT: v_mul_lo_u32 v9, v9, v4 +; GISEL-NEXT: v_mul_lo_u32 v7, v7, v4 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v11 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v7, v6 ; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], v0, v13 -; GISEL-NEXT: v_subb_u32_e64 v8, vcc, v1, v7, s[4:5] -; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v1, v7 +; GISEL-NEXT: v_subb_u32_e64 v7, vcc, v1, v9, s[4:5] +; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v1, v9 ; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v0, v4 -; GISEL-NEXT: 
v_cndmask_b32_e64 v7, 0, -1, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, -1, vcc ; GISEL-NEXT: v_sub_i32_e64 v2, s[6:7], v2, v14 -; GISEL-NEXT: v_subb_u32_e64 v9, vcc, v3, v6, s[6:7] +; GISEL-NEXT: v_subb_u32_e64 v10, vcc, v3, v6, s[6:7] ; GISEL-NEXT: v_sub_i32_e32 v3, vcc, v3, v6 ; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v2, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc ; GISEL-NEXT: v_sub_i32_e32 v11, vcc, v2, v4 -; GISEL-NEXT: v_cmp_eq_u32_e64 s[8:9], 0, v8 -; GISEL-NEXT: v_cndmask_b32_e64 v7, v5, v7, s[8:9] +; GISEL-NEXT: v_cmp_eq_u32_e64 s[8:9], 0, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v9, v5, v9, s[8:9] ; GISEL-NEXT: v_subbrev_u32_e64 v1, s[4:5], 0, v1, s[4:5] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v9 +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v6, v5, v6, s[4:5] ; GISEL-NEXT: v_subbrev_u32_e64 v3, s[4:5], 0, v3, s[6:7] ; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v11, v4 @@ -1322,228 +1312,161 @@ ; GISEL-NEXT: v_sub_i32_e64 v4, s[4:5], v14, v4 ; GISEL-NEXT: v_subbrev_u32_e64 v15, s[4:5], 0, v1, s[4:5] ; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v3 -; GISEL-NEXT: v_cndmask_b32_e64 v10, v10, v12, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v8, v8, v12, s[4:5] ; GISEL-NEXT: v_subbrev_u32_e32 v12, vcc, 0, v3, vcc ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 ; GISEL-NEXT: v_cndmask_b32_e32 v4, v14, v4, vcc -; GISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v10 +; GISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v8 ; GISEL-NEXT: v_cndmask_b32_e64 v5, v11, v13, s[4:5] ; GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v15, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9 ; GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v3, v3, v12, s[4:5] ; GISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v6 ; GISEL-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[4:5] -; GISEL-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v3, v9, v3, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v3, v10, v3, s[4:5] ; GISEL-NEXT: s_setpc_b64 s[30:31] ; ; CGP-LABEL: v_urem_v2i64_oddk_denom: ; CGP: ; %bb.0: ; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CGP-NEXT: s_mov_b32 s4, 0x12d8fb ; CGP-NEXT: v_mov_b32_e32 v4, 0x12d8fb -; CGP-NEXT: v_cvt_f32_u32_e32 v6, 0x12d8fb -; CGP-NEXT: v_cvt_f32_ubyte0_e32 v5, 0 -; CGP-NEXT: s_mov_b32 s5, 0xffed2705 -; CGP-NEXT: s_bfe_i32 s6, 1, 0x10000 -; CGP-NEXT: v_cvt_f32_u32_e32 v7, 0x12d8fb -; CGP-NEXT: v_cvt_f32_ubyte0_e32 v8, 0 -; CGP-NEXT: v_mac_f32_e32 v6, 0x4f800000, v5 -; CGP-NEXT: v_mov_b32_e32 v5, s6 -; CGP-NEXT: v_mac_f32_e32 v7, 0x4f800000, v8 -; CGP-NEXT: v_rcp_iflag_f32_e32 v6, v6 -; CGP-NEXT: v_rcp_iflag_f32_e32 v7, v7 -; CGP-NEXT: v_mul_f32_e32 v6, 0x5f7ffffc, v6 -; CGP-NEXT: v_mul_f32_e32 v7, 0x5f7ffffc, v7 -; CGP-NEXT: v_mul_f32_e32 v8, 0x2f800000, v6 -; CGP-NEXT: v_mul_f32_e32 v9, 0x2f800000, v7 +; CGP-NEXT: v_cvt_f32_u32_e32 v5, 0x12d8fb +; CGP-NEXT: v_cvt_f32_ubyte0_e32 v6, 0 +; CGP-NEXT: v_mov_b32_e32 v7, 0xffed2705 +; CGP-NEXT: s_bfe_i32 s4, 1, 0x10000 +; CGP-NEXT: v_mac_f32_e32 v5, 0x4f800000, v6 +; CGP-NEXT: v_mov_b32_e32 v6, s4 +; CGP-NEXT: v_rcp_iflag_f32_e32 v5, v5 +; CGP-NEXT: v_mul_f32_e32 v5, 0x5f7ffffc, v5 +; CGP-NEXT: v_mul_f32_e32 v8, 0x2f800000, v5 ; CGP-NEXT: v_trunc_f32_e32 v8, v8 -; CGP-NEXT: v_trunc_f32_e32 v9, v9 -; CGP-NEXT: v_mac_f32_e32 v6, 0xcf800000, v8 +; CGP-NEXT: v_mac_f32_e32 v5, 0xcf800000, v8 ; CGP-NEXT: v_cvt_u32_f32_e32 v8, v8 -; CGP-NEXT: v_mac_f32_e32 v7, 0xcf800000, v9 -; CGP-NEXT: v_cvt_u32_f32_e32 v9, v9 -; CGP-NEXT: 
v_cvt_u32_f32_e32 v6, v6 -; CGP-NEXT: v_mul_lo_u32 v10, v8, s5 -; CGP-NEXT: v_cvt_u32_f32_e32 v7, v7 -; CGP-NEXT: v_mul_lo_u32 v11, v9, s5 -; CGP-NEXT: v_mul_lo_u32 v12, v6, s5 -; CGP-NEXT: v_mul_hi_u32 v13, s5, v6 -; CGP-NEXT: v_sub_i32_e32 v10, vcc, v10, v6 -; CGP-NEXT: v_mul_lo_u32 v14, v7, s5 -; CGP-NEXT: v_mul_hi_u32 v15, s5, v7 -; CGP-NEXT: v_sub_i32_e32 v11, vcc, v11, v7 -; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v13 -; CGP-NEXT: v_mul_lo_u32 v13, v8, v12 -; CGP-NEXT: v_mul_hi_u32 v16, v6, v12 -; CGP-NEXT: v_mul_hi_u32 v12, v8, v12 -; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v15 -; CGP-NEXT: v_mul_lo_u32 v15, v9, v14 -; CGP-NEXT: v_mul_hi_u32 v17, v7, v14 -; CGP-NEXT: v_mul_hi_u32 v14, v9, v14 -; CGP-NEXT: v_mul_lo_u32 v18, v6, v10 -; CGP-NEXT: v_mul_lo_u32 v19, v8, v10 -; CGP-NEXT: v_mul_hi_u32 v20, v6, v10 +; CGP-NEXT: v_cvt_u32_f32_e32 v5, v5 +; CGP-NEXT: v_mul_lo_u32 v9, v8, v7 +; CGP-NEXT: v_mul_lo_u32 v10, v5, v7 +; CGP-NEXT: v_mul_hi_u32 v11, v7, v5 +; CGP-NEXT: v_sub_i32_e32 v9, vcc, v9, v5 +; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v11 +; CGP-NEXT: v_mul_lo_u32 v11, v8, v10 +; CGP-NEXT: v_mul_hi_u32 v12, v5, v10 ; CGP-NEXT: v_mul_hi_u32 v10, v8, v10 -; CGP-NEXT: v_mul_lo_u32 v21, v7, v11 -; CGP-NEXT: v_mul_lo_u32 v22, v9, v11 -; CGP-NEXT: v_mul_hi_u32 v23, v7, v11 -; CGP-NEXT: v_mul_hi_u32 v11, v9, v11 -; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v18 -; CGP-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v12, vcc, v19, v12 -; CGP-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v15, vcc, v15, v21 -; CGP-NEXT: v_cndmask_b32_e64 v21, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v14, vcc, v22, v14 -; CGP-NEXT: v_cndmask_b32_e64 v22, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v16 -; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v20 -; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v15, vcc, v15, v17 -; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v23 -; CGP-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v13, vcc, v18, v13 -; CGP-NEXT: v_add_i32_e32 v16, vcc, v19, v16 -; CGP-NEXT: v_add_i32_e32 v15, vcc, v21, v15 -; CGP-NEXT: v_add_i32_e32 v17, vcc, v22, v17 -; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v13 +; CGP-NEXT: v_mul_lo_u32 v13, v5, v9 +; CGP-NEXT: v_mul_lo_u32 v14, v8, v9 +; CGP-NEXT: v_mul_hi_u32 v15, v5, v9 +; CGP-NEXT: v_mul_hi_u32 v9, v8, v9 +; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v13 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v15 -; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v13, vcc, v16, v13 -; CGP-NEXT: v_add_i32_e32 v15, vcc, v17, v15 -; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v13 -; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v15 -; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v12 -; CGP-NEXT: v_addc_u32_e32 v8, vcc, v8, v10, vcc -; CGP-NEXT: v_mul_lo_u32 v10, v6, s5 -; CGP-NEXT: v_mul_hi_u32 v12, s5, v6 -; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v14 -; CGP-NEXT: v_addc_u32_e32 v9, vcc, v9, v11, vcc -; CGP-NEXT: v_mul_lo_u32 v11, v7, s5 -; CGP-NEXT: v_mul_hi_u32 v13, s5, v7 -; CGP-NEXT: v_mul_lo_u32 v14, v8, s5 -; CGP-NEXT: v_mul_lo_u32 v15, v8, v10 -; CGP-NEXT: v_mul_hi_u32 v16, v6, v10 -; CGP-NEXT: v_mul_hi_u32 v10, v8, v10 -; CGP-NEXT: v_mul_lo_u32 v17, v9, s5 -; CGP-NEXT: v_mul_lo_u32 v18, v9, v11 -; CGP-NEXT: v_mul_hi_u32 v19, v7, v11 -; CGP-NEXT: v_mul_hi_u32 v11, v9, v11 -; CGP-NEXT: v_sub_i32_e32 v14, vcc, v14, v6 -; CGP-NEXT: v_sub_i32_e32 v17, vcc, v17, 
v7 -; CGP-NEXT: v_add_i32_e32 v12, vcc, v14, v12 -; CGP-NEXT: v_add_i32_e32 v13, vcc, v17, v13 -; CGP-NEXT: v_mul_lo_u32 v14, v6, v12 -; CGP-NEXT: v_mul_lo_u32 v17, v8, v12 -; CGP-NEXT: v_mul_hi_u32 v20, v6, v12 -; CGP-NEXT: v_mul_hi_u32 v12, v8, v12 -; CGP-NEXT: v_mul_lo_u32 v21, v7, v13 -; CGP-NEXT: v_mul_lo_u32 v22, v9, v13 -; CGP-NEXT: v_mul_hi_u32 v23, v7, v13 -; CGP-NEXT: v_mul_hi_u32 v13, v9, v13 -; CGP-NEXT: v_add_i32_e32 v14, vcc, v15, v14 -; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v10, vcc, v17, v10 -; CGP-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v18, vcc, v18, v21 -; CGP-NEXT: v_cndmask_b32_e64 v21, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v11, vcc, v22, v11 -; CGP-NEXT: v_cndmask_b32_e64 v22, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v16 -; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v20 -; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v18, vcc, v18, v19 -; CGP-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v23 -; CGP-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v14, vcc, v15, v14 -; CGP-NEXT: v_add_i32_e32 v15, vcc, v17, v16 -; CGP-NEXT: v_add_i32_e32 v16, vcc, v21, v18 -; CGP-NEXT: v_add_i32_e32 v17, vcc, v22, v19 -; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v14 +; CGP-NEXT: v_add_i32_e32 v10, vcc, v14, v10 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v16 -; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v14, vcc, v15, v14 -; CGP-NEXT: v_add_i32_e32 v15, vcc, v17, v16 -; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v14 -; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v15 -; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v10 -; CGP-NEXT: v_addc_u32_e32 v8, vcc, v8, v12, vcc -; CGP-NEXT: v_mul_lo_u32 v10, v1, v6 -; CGP-NEXT: v_mul_hi_u32 v12, v0, v6 -; CGP-NEXT: v_mul_hi_u32 v6, v1, v6 -; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v11 -; CGP-NEXT: v_addc_u32_e32 v9, vcc, v9, v13, vcc -; CGP-NEXT: v_mul_lo_u32 v11, v3, v7 -; CGP-NEXT: v_mul_hi_u32 v13, v2, v7 +; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v12 +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v15 +; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v11, vcc, v13, v11 +; CGP-NEXT: v_add_i32_e32 v12, vcc, v14, v12 +; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v11 +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v11, vcc, v12, v11 +; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v11 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v10 +; CGP-NEXT: v_addc_u32_e32 v8, vcc, v8, v9, vcc +; CGP-NEXT: v_mul_lo_u32 v9, v5, v7 +; CGP-NEXT: v_mul_hi_u32 v10, v7, v5 +; CGP-NEXT: v_mul_lo_u32 v7, v8, v7 +; CGP-NEXT: v_mul_lo_u32 v11, v8, v9 +; CGP-NEXT: v_mul_hi_u32 v12, v5, v9 +; CGP-NEXT: v_mul_hi_u32 v9, v8, v9 +; CGP-NEXT: v_sub_i32_e32 v7, vcc, v7, v5 +; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v10 +; CGP-NEXT: v_mul_lo_u32 v10, v5, v7 +; CGP-NEXT: v_mul_lo_u32 v13, v8, v7 +; CGP-NEXT: v_mul_hi_u32 v14, v5, v7 +; CGP-NEXT: v_mul_hi_u32 v7, v8, v7 +; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10 +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v9, vcc, v13, v9 +; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v12 +; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v14 +; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10 +; CGP-NEXT: 
v_add_i32_e32 v11, vcc, v13, v12 +; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v10 +; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10 +; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v10 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v9 +; CGP-NEXT: v_addc_u32_e32 v7, vcc, v8, v7, vcc +; CGP-NEXT: v_mul_lo_u32 v8, v1, v5 +; CGP-NEXT: v_mul_hi_u32 v9, v0, v5 +; CGP-NEXT: v_mul_hi_u32 v10, v1, v5 +; CGP-NEXT: v_mul_lo_u32 v11, v3, v5 +; CGP-NEXT: v_mul_hi_u32 v12, v2, v5 +; CGP-NEXT: v_mul_hi_u32 v5, v3, v5 +; CGP-NEXT: v_mul_lo_u32 v13, v0, v7 +; CGP-NEXT: v_mul_lo_u32 v14, v1, v7 +; CGP-NEXT: v_mul_hi_u32 v15, v0, v7 +; CGP-NEXT: v_mul_hi_u32 v16, v1, v7 +; CGP-NEXT: v_mul_lo_u32 v17, v2, v7 +; CGP-NEXT: v_mul_lo_u32 v18, v3, v7 +; CGP-NEXT: v_mul_hi_u32 v19, v2, v7 ; CGP-NEXT: v_mul_hi_u32 v7, v3, v7 -; CGP-NEXT: v_mul_lo_u32 v14, v0, v8 -; CGP-NEXT: v_mul_lo_u32 v15, v1, v8 -; CGP-NEXT: v_mul_hi_u32 v16, v0, v8 -; CGP-NEXT: v_mul_hi_u32 v8, v1, v8 -; CGP-NEXT: v_mul_lo_u32 v17, v2, v9 -; CGP-NEXT: v_mul_lo_u32 v18, v3, v9 -; CGP-NEXT: v_mul_hi_u32 v19, v2, v9 -; CGP-NEXT: v_mul_hi_u32 v9, v3, v9 -; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v14 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v13 +; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v10, vcc, v14, v10 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v6, vcc, v15, v6 -; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v17 ; CGP-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v7, vcc, v18, v7 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v18, v5 ; CGP-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v12 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v9 +; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v9, vcc, v10, v15 ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v16 -; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v13 +; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v12 ; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v19 -; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v19 +; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v8, vcc, v13, v8 ; CGP-NEXT: v_add_i32_e32 v10, vcc, v14, v10 -; CGP-NEXT: v_add_i32_e32 v12, vcc, v15, v12 ; CGP-NEXT: v_add_i32_e32 v11, vcc, v17, v11 -; CGP-NEXT: v_add_i32_e32 v13, vcc, v18, v13 -; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v10 -; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v11 +; CGP-NEXT: v_add_i32_e32 v12, vcc, v18, v12 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v9, v8 +; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v11 ; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v10, vcc, v12, v10 -; CGP-NEXT: v_mul_lo_u32 v12, v6, s4 -; CGP-NEXT: v_mul_hi_u32 v6, s4, v6 -; CGP-NEXT: v_add_i32_e32 v11, vcc, v13, v11 -; CGP-NEXT: v_mul_lo_u32 v13, v7, s4 -; CGP-NEXT: v_mul_hi_u32 v7, s4, v7 -; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v10 -; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v11 -; CGP-NEXT: v_mul_lo_u32 v8, v8, s4 -; CGP-NEXT: v_mul_lo_u32 v9, v9, s4 -; CGP-NEXT: v_add_i32_e32 v6, vcc, v8, v6 -; CGP-NEXT: v_add_i32_e32 v7, vcc, v9, v7 -; CGP-NEXT: v_sub_i32_e64 v0, s[4:5], v0, v12 -; CGP-NEXT: v_subb_u32_e64 v8, vcc, v1, v6, s[4:5] -; CGP-NEXT: v_sub_i32_e32 v1, vcc, v1, v6 +; CGP-NEXT: v_add_i32_e32 v9, 
vcc, v10, v9 +; CGP-NEXT: v_mul_lo_u32 v10, v8, v4 +; CGP-NEXT: v_mul_hi_u32 v8, v4, v8 +; CGP-NEXT: v_add_i32_e32 v11, vcc, v12, v11 +; CGP-NEXT: v_mul_lo_u32 v12, v5, v4 +; CGP-NEXT: v_mul_hi_u32 v5, v4, v5 +; CGP-NEXT: v_add_i32_e32 v9, vcc, v16, v9 +; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v11 +; CGP-NEXT: v_mul_lo_u32 v9, v9, v4 +; CGP-NEXT: v_mul_lo_u32 v7, v7, v4 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v9, v8 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v7, v5 +; CGP-NEXT: v_sub_i32_e64 v0, s[4:5], v0, v10 +; CGP-NEXT: v_subb_u32_e64 v7, vcc, v1, v8, s[4:5] +; CGP-NEXT: v_sub_i32_e32 v1, vcc, v1, v8 ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v4 -; CGP-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc -; CGP-NEXT: v_sub_i32_e64 v2, s[6:7], v2, v13 -; CGP-NEXT: v_subb_u32_e64 v9, vcc, v3, v7, s[6:7] -; CGP-NEXT: v_sub_i32_e32 v3, vcc, v3, v7 +; CGP-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc +; CGP-NEXT: v_sub_i32_e64 v2, s[6:7], v2, v12 +; CGP-NEXT: v_subb_u32_e64 v9, vcc, v3, v5, s[6:7] +; CGP-NEXT: v_sub_i32_e32 v3, vcc, v3, v5 ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v2, v4 -; CGP-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc +; CGP-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc ; CGP-NEXT: v_sub_i32_e32 v10, vcc, v2, v4 -; CGP-NEXT: v_cmp_eq_u32_e64 s[8:9], 0, v8 -; CGP-NEXT: v_cndmask_b32_e64 v6, v5, v6, s[8:9] +; CGP-NEXT: v_cmp_eq_u32_e64 s[8:9], 0, v7 +; CGP-NEXT: v_cndmask_b32_e64 v8, v6, v8, s[8:9] ; CGP-NEXT: v_subbrev_u32_e64 v1, s[4:5], 0, v1, s[4:5] ; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v9 -; CGP-NEXT: v_cndmask_b32_e64 v7, v5, v7, s[4:5] +; CGP-NEXT: v_cndmask_b32_e64 v5, v6, v5, s[4:5] ; CGP-NEXT: v_subbrev_u32_e64 v3, s[4:5], 0, v3, s[6:7] ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v10, v4 ; CGP-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5] @@ -1555,23 +1478,23 @@ ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, -1, s[6:7] ; CGP-NEXT: v_subbrev_u32_e64 v3, s[4:5], 0, v3, s[4:5] ; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v1 -; CGP-NEXT: v_cndmask_b32_e64 v14, v5, v14, s[4:5] +; CGP-NEXT: v_cndmask_b32_e64 v14, v6, v14, s[4:5] ; CGP-NEXT: v_sub_i32_e64 v4, s[4:5], v13, v4 ; CGP-NEXT: v_subbrev_u32_e64 v15, s[4:5], 0, v1, s[4:5] ; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v3 -; CGP-NEXT: v_cndmask_b32_e64 v5, v5, v11, s[4:5] +; CGP-NEXT: v_cndmask_b32_e64 v6, v6, v11, s[4:5] ; CGP-NEXT: v_subbrev_u32_e32 v11, vcc, 0, v3, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 ; CGP-NEXT: v_cndmask_b32_e32 v4, v13, v4, vcc -; CGP-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v5 -; CGP-NEXT: v_cndmask_b32_e64 v5, v10, v12, s[4:5] +; CGP-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v6 +; CGP-NEXT: v_cndmask_b32_e64 v6, v10, v12, s[4:5] ; CGP-NEXT: v_cndmask_b32_e32 v1, v1, v15, vcc -; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 ; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc ; CGP-NEXT: v_cndmask_b32_e64 v3, v3, v11, s[4:5] -; CGP-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v7 -; CGP-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[4:5] -; CGP-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc +; CGP-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v5 +; CGP-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[4:5] +; CGP-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc ; CGP-NEXT: v_cndmask_b32_e64 v3, v9, v3, s[4:5] ; CGP-NEXT: s_setpc_b64 s[30:31] %result = urem <2 x i64> %num, <i64 1235195, i64 1235195> @@ -1731,22 +1654,22 @@ ; CHECK-NEXT: s_cbranch_execz .LBB7_2 ; CHECK-NEXT: .LBB7_4: ; CHECK-NEXT: v_rcp_iflag_f32_e32 v0, v2 -; CHECK-NEXT: v_sub_i32_e32 v1, vcc, 0, v5 +; CHECK-NEXT: v_mov_b32_e32 v1, 0 +; CHECK-NEXT: v_sub_i32_e32 v2, vcc, 0, v5 ; CHECK-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; CHECK-NEXT: v_cvt_u32_f32_e32 v0, v0 -; CHECK-NEXT: 
v_mul_lo_u32 v1, v1, v0 -; CHECK-NEXT: v_mul_hi_u32 v1, v0, v1 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; CHECK-NEXT: v_mul_lo_u32 v2, v2, v0 +; CHECK-NEXT: v_mul_hi_u32 v2, v0, v2 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; CHECK-NEXT: v_mul_hi_u32 v0, v3, v0 ; CHECK-NEXT: v_mul_lo_u32 v0, v0, v5 ; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v3, v0 -; CHECK-NEXT: v_sub_i32_e32 v1, vcc, v0, v5 +; CHECK-NEXT: v_sub_i32_e32 v2, vcc, v0, v5 ; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v0, v5 -; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; CHECK-NEXT: v_sub_i32_e32 v1, vcc, v0, v5 +; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; CHECK-NEXT: v_sub_i32_e32 v2, vcc, v0, v5 ; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v0, v5 -; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; CHECK-NEXT: v_mov_b32_e32 v1, 0 +; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; CHECK-NEXT: s_or_b64 exec, exec, s[4:5] ; CHECK-NEXT: s_setpc_b64 s[30:31] %shl.y = shl i64 4096, %y @@ -2162,22 +2085,22 @@ ; CGP-NEXT: s_cbranch_execz .LBB8_4 ; CGP-NEXT: ; %bb.3: ; CGP-NEXT: v_rcp_iflag_f32_e32 v0, v4 -; CGP-NEXT: v_sub_i32_e32 v1, vcc, 0, v2 +; CGP-NEXT: v_mov_b32_e32 v1, 0 +; CGP-NEXT: v_sub_i32_e32 v3, vcc, 0, v2 ; CGP-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; CGP-NEXT: v_cvt_u32_f32_e32 v0, v0 -; CGP-NEXT: v_mul_lo_u32 v1, v1, v0 -; CGP-NEXT: v_mul_hi_u32 v1, v0, v1 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; CGP-NEXT: v_mul_lo_u32 v3, v3, v0 +; CGP-NEXT: v_mul_hi_u32 v3, v0, v3 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v3 ; CGP-NEXT: v_mul_hi_u32 v0, v8, v0 ; CGP-NEXT: v_mul_lo_u32 v0, v0, v2 ; CGP-NEXT: v_sub_i32_e32 v0, vcc, v8, v0 -; CGP-NEXT: v_sub_i32_e32 v1, vcc, v0, v2 +; CGP-NEXT: v_sub_i32_e32 v3, vcc, v0, v2 ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2 -; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; CGP-NEXT: v_sub_i32_e32 v1, vcc, v0, v2 +; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc +; CGP-NEXT: v_sub_i32_e32 v3, vcc, v0, v2 ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2 -; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; CGP-NEXT: v_mov_b32_e32 v1, 0 +; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; CGP-NEXT: .LBB8_4: ; CGP-NEXT: s_or_b64 exec, exec, s[4:5] ; CGP-NEXT: v_or_b32_e32 v3, v7, v10 @@ -2325,22 +2248,22 @@ ; CGP-NEXT: s_cbranch_execz .LBB8_6 ; CGP-NEXT: .LBB8_8: ; CGP-NEXT: v_rcp_iflag_f32_e32 v2, v4 -; CGP-NEXT: v_sub_i32_e32 v3, vcc, 0, v9 +; CGP-NEXT: v_mov_b32_e32 v3, 0 +; CGP-NEXT: v_sub_i32_e32 v4, vcc, 0, v9 ; CGP-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 ; CGP-NEXT: v_cvt_u32_f32_e32 v2, v2 -; CGP-NEXT: v_mul_lo_u32 v3, v3, v2 -; CGP-NEXT: v_mul_hi_u32 v3, v2, v3 -; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v3 +; CGP-NEXT: v_mul_lo_u32 v4, v4, v2 +; CGP-NEXT: v_mul_hi_u32 v4, v2, v4 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; CGP-NEXT: v_mul_hi_u32 v2, v5, v2 ; CGP-NEXT: v_mul_lo_u32 v2, v2, v9 ; CGP-NEXT: v_sub_i32_e32 v2, vcc, v5, v2 -; CGP-NEXT: v_sub_i32_e32 v3, vcc, v2, v9 +; CGP-NEXT: v_sub_i32_e32 v4, vcc, v2, v9 ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v2, v9 -; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc -; CGP-NEXT: v_sub_i32_e32 v3, vcc, v2, v9 +; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; CGP-NEXT: v_sub_i32_e32 v4, vcc, v2, v9 ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v2, v9 -; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc -; CGP-NEXT: v_mov_b32_e32 v3, 0 +; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; CGP-NEXT: s_or_b64 exec, exec, s[4:5] ; CGP-NEXT: s_setpc_b64 s[30:31] %shl.y = shl <2 x i64> <i64 4096, i64 4096>, %y Index: llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll 
=================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll @@ -221,8 +221,8 @@ ; GFX9-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1] ; GFX9-NEXT: v_pk_sub_u16 v0, v0, v1 clamp ; GFX9-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1] -; GFX9-NEXT: s_movk_i32 s4, 0xff -; GFX9-NEXT: v_and_b32_sdwa v1, v0, s4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v1, 0xff +; GFX9-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -234,14 +234,14 @@ ; GFX10-NEXT: v_lshrrev_b32_e32 v3, 8, v1 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX10-NEXT: s_movk_i32 s4, 0xff ; GFX10-NEXT: v_lshl_or_b32 v0, v2, 16, v0 ; GFX10-NEXT: v_lshl_or_b32 v1, v3, 16, v1 ; GFX10-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1] ; GFX10-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1] ; GFX10-NEXT: v_pk_sub_u16 v0, v0, v1 clamp +; GFX10-NEXT: v_mov_b32_e32 v1, 0xff ; GFX10-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1] -; GFX10-NEXT: v_and_b32_sdwa v1, v0, s4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -326,8 +326,8 @@ ; GFX9-NEXT: v_mov_b32_e32 v0, s1 ; GFX9-NEXT: v_pk_sub_u16 v0, s0, v0 clamp ; GFX9-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1] -; GFX9-NEXT: s_movk_i32 s0, 0xff -; GFX9-NEXT: v_and_b32_sdwa v1, v0, s0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v1, 0xff +; GFX9-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_readfirstlane_b32 s0, v0 ; GFX9-NEXT: ; return to shader part epilog @@ -346,10 +346,10 @@ ; GFX10-NEXT: s_lshl_b32 s3, s3, 8 ; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s2 ; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s3 +; GFX10-NEXT: v_mov_b32_e32 v1, 0xff ; GFX10-NEXT: v_pk_sub_u16 v0, s0, s1 clamp -; GFX10-NEXT: s_movk_i32 s0, 0xff ; GFX10-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1] -; GFX10-NEXT: v_and_b32_sdwa v1, v0, s0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-NEXT: ; return to shader part epilog @@ -471,11 +471,11 @@ ; GFX9-NEXT: v_pk_sub_u16 v2, v2, v3 clamp ; GFX9-NEXT: v_pk_sub_u16 v0, v0, v1 clamp ; GFX9-NEXT: v_pk_lshrrev_b16 v1, 8, v2 op_sel_hi:[0,1] -; GFX9-NEXT: v_mov_b32_e32 v2, 8 +; GFX9-NEXT: v_mov_b32_e32 v3, 8 ; GFX9-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1] -; GFX9-NEXT: s_movk_i32 s4, 0xff -; GFX9-NEXT: v_lshlrev_b32_sdwa v2, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_and_or_b32 v1, v1, s4, v2 +; GFX9-NEXT: v_mov_b32_e32 v2, 0xff +; GFX9-NEXT: 
v_lshlrev_b32_sdwa v3, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_and_or_b32 v1, v1, v2, v3 ; GFX9-NEXT: v_and_b32_e32 v2, 0xff, v0 ; GFX9-NEXT: v_mov_b32_e32 v3, 24 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 @@ -510,7 +510,7 @@ ; GFX10-NEXT: v_lshlrev_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_and_b32_e32 v3, 0xff, v0 ; GFX10-NEXT: v_lshlrev_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_and_or_b32 v1, v2, 0xff, v1 +; GFX10-NEXT: v_and_or_b32 v1, 0xff, v2, v1 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 ; GFX10-NEXT: v_or3_b32 v0, v1, v2, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -543,7 +543,7 @@ ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 24, v0 -; GFX11-NEXT: v_and_or_b32 v1, v1, 0xff, v2 +; GFX11-NEXT: v_and_or_b32 v1, 0xff, v1, v2 ; GFX11-NEXT: v_or3_b32 v0, v1, v3, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %lhs = bitcast i32 %lhs.arg to <4 x i8> @@ -628,46 +628,46 @@ ; ; GFX9-LABEL: s_usubsat_v4i8: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_lshr_b32 s3, s0, 8 +; GFX9-NEXT: s_lshr_b32 s2, s0, 8 +; GFX9-NEXT: s_lshr_b32 s3, s0, 16 +; GFX9-NEXT: s_lshr_b32 s4, s0, 24 +; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s2 +; GFX9-NEXT: s_pack_ll_b32_b16 s2, s3, s4 ; GFX9-NEXT: s_lshr_b32 s4, s0, 16 -; GFX9-NEXT: s_lshr_b32 s6, s0, 24 -; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s3 -; GFX9-NEXT: s_pack_ll_b32_b16 s3, s4, s6 -; GFX9-NEXT: s_lshr_b32 s6, s0, 16 ; GFX9-NEXT: s_lshl_b32 s0, s0, 0x80008 -; GFX9-NEXT: s_lshl_b32 s6, s6, 8 -; GFX9-NEXT: s_lshr_b32 s7, s1, 8 -; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s6 -; GFX9-NEXT: s_lshr_b32 s6, s3, 16 -; GFX9-NEXT: s_lshr_b32 s8, s1, 16 -; GFX9-NEXT: s_lshr_b32 s9, s1, 24 -; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s7 -; GFX9-NEXT: s_lshl_b32 s3, s3, 0x80008 -; GFX9-NEXT: s_lshl_b32 s6, s6, 8 -; GFX9-NEXT: s_pack_ll_b32_b16 s3, s3, s6 +; GFX9-NEXT: s_lshl_b32 s4, s4, 8 +; GFX9-NEXT: s_lshr_b32 s5, s1, 8 +; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s4 +; GFX9-NEXT: s_lshr_b32 s4, s2, 16 ; GFX9-NEXT: s_lshr_b32 s6, s1, 16 -; GFX9-NEXT: s_pack_ll_b32_b16 s4, s8, s9 +; GFX9-NEXT: s_lshr_b32 s7, s1, 24 +; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s5 +; GFX9-NEXT: s_lshl_b32 s2, s2, 0x80008 +; GFX9-NEXT: s_lshl_b32 s4, s4, 8 +; GFX9-NEXT: s_pack_ll_b32_b16 s2, s2, s4 +; GFX9-NEXT: s_lshr_b32 s4, s1, 16 +; GFX9-NEXT: s_pack_ll_b32_b16 s3, s6, s7 ; GFX9-NEXT: s_lshl_b32 s1, s1, 0x80008 -; GFX9-NEXT: s_lshl_b32 s6, s6, 8 -; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s6 -; GFX9-NEXT: s_lshr_b32 s6, s4, 16 -; GFX9-NEXT: s_lshl_b32 s4, s4, 0x80008 -; GFX9-NEXT: s_lshl_b32 s6, s6, 8 -; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s6 +; GFX9-NEXT: s_lshl_b32 s4, s4, 8 +; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s4 +; GFX9-NEXT: s_lshr_b32 s4, s3, 16 +; GFX9-NEXT: s_lshl_b32 s3, s3, 0x80008 +; GFX9-NEXT: s_lshl_b32 s4, s4, 8 +; GFX9-NEXT: s_pack_ll_b32_b16 s3, s3, s4 ; GFX9-NEXT: v_mov_b32_e32 v0, s1 ; GFX9-NEXT: v_pk_sub_u16 v0, s0, v0 clamp -; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: s_mov_b32 s2, 8 -; GFX9-NEXT: v_pk_sub_u16 v1, s3, v1 clamp +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_pk_sub_u16 v1, s2, v1 clamp ; GFX9-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1] +; GFX9-NEXT: v_mov_b32_e32 v3, 8 ; GFX9-NEXT: v_pk_lshrrev_b16 v1, 8, v1 op_sel_hi:[0,1] -; GFX9-NEXT: s_movk_i32 s0, 0xff -; GFX9-NEXT: v_lshlrev_b32_sdwa v2, s2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: s_mov_b32 s5, 24 -; GFX9-NEXT: v_and_or_b32 v0, v0, s0, v2 +; GFX9-NEXT: v_mov_b32_e32 v2, 0xff +; GFX9-NEXT: v_lshlrev_b32_sdwa v3, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_and_or_b32 v0, v0, v2, v3 ; GFX9-NEXT: v_and_b32_e32 v2, 0xff, v1 +; GFX9-NEXT: v_mov_b32_e32 v3, 24 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX9-NEXT: v_lshlrev_b32_sdwa v1, s5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_lshlrev_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX9-NEXT: v_or3_b32 v0, v0, v2, v1 ; GFX9-NEXT: v_readfirstlane_b32 s0, v0 ; GFX9-NEXT: ; return to shader part epilog @@ -702,14 +702,14 @@ ; GFX10-NEXT: s_pack_ll_b32_b16 s3, s3, s5 ; GFX10-NEXT: v_pk_sub_u16 v0, s0, s1 clamp ; GFX10-NEXT: v_pk_sub_u16 v1, s2, s3 clamp -; GFX10-NEXT: s_mov_b32 s0, 8 +; GFX10-NEXT: v_mov_b32_e32 v2, 8 +; GFX10-NEXT: v_mov_b32_e32 v4, 24 ; GFX10-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1] ; GFX10-NEXT: v_pk_lshrrev_b16 v1, 8, v1 op_sel_hi:[0,1] -; GFX10-NEXT: v_lshlrev_b32_sdwa v2, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_and_b32_e32 v3, 0xff, v1 -; GFX10-NEXT: s_mov_b32 s0, 24 -; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_and_or_b32 v0, v0, 0xff, v2 +; GFX10-NEXT: v_lshlrev_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX10-NEXT: v_and_or_b32 v0, 0xff, v0, v2 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 ; GFX10-NEXT: v_or3_b32 v0, v0, v2, v1 ; GFX10-NEXT: v_readfirstlane_b32 s0, v0 @@ -749,7 +749,7 @@ ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v1 ; GFX11-NEXT: v_bfe_u32 v1, v1, 16, 8 -; GFX11-NEXT: v_and_or_b32 v0, v0, 0xff, v2 +; GFX11-NEXT: v_and_or_b32 v0, 0xff, v0, v2 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 24, v1 ; GFX11-NEXT: v_or3_b32 v0, v0, v2, v1 Index: llvm/test/CodeGen/AMDGPU/constrained-shift.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/constrained-shift.ll +++ llvm/test/CodeGen/AMDGPU/constrained-shift.ll @@ -16,7 +16,6 @@ ; GISEL-LABEL: csh_16: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_and_b32_e32 v1, 15, v1 ; GISEL-NEXT: v_lshlrev_b16_e32 v2, v1, v0 ; GISEL-NEXT: v_lshrrev_b16_e32 v3, v1, v0 ; GISEL-NEXT: v_ashrrev_i16_e32 v0, v1, v0 @@ -339,7 +338,6 @@ ; GISEL-LABEL: cshl_or: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_and_b32_e32 v1, 31, v1 ; GISEL-NEXT: v_lshl_or_b32 v0, v0, v1, v0 ; GISEL-NEXT: s_setpc_b64 s[30:31] %and = and i32 %b, 31 @@ -358,7 +356,6 @@ ; GISEL-LABEL: cshl_add: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_and_b32_e32 v1, 31, v1 ; GISEL-NEXT: v_lshl_add_u32 v0, v0, v1, v2 ; GISEL-NEXT: s_setpc_b64 s[30:31] %and = and i32 %b, 31 @@ -377,8 +374,7 @@ ; GISEL-LABEL: add_cshl: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_and_b32_e32 v2, 31, v1 -; GISEL-NEXT: v_add_lshl_u32 v0, v0, v1, v2 +; GISEL-NEXT: v_add_lshl_u32 v0, v0, v1, v1 ; GISEL-NEXT: s_setpc_b64 s[30:31] %add = add i32 %a, %b %and = and i32 %b, 
31 Index: llvm/test/CodeGen/AMDGPU/ctlz.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/ctlz.ll +++ llvm/test/CodeGen/AMDGPU/ctlz.ll @@ -1115,6 +1115,7 @@ ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: global_load_dword v0, v0, s[2:3] ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) @@ -1122,8 +1123,7 @@ ; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX10-GISEL-NEXT: v_min_u32_e32 v1, 32, v1 ; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, v1, -1, vcc_lo -; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-GISEL-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-LABEL: v_ctlz_i32_sel_eq_neg1: @@ -1221,6 +1221,7 @@ ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: global_load_dword v0, v0, s[2:3] ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) @@ -1228,8 +1229,7 @@ ; GFX10-GISEL-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 ; GFX10-GISEL-NEXT: v_min_u32_e32 v1, 32, v1 ; GFX10-GISEL-NEXT: v_cndmask_b32_e32 v0, -1, v1, vcc_lo -; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-GISEL-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-LABEL: v_ctlz_i32_sel_ne_neg1: @@ -1577,6 +1577,7 @@ ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, s3 ; GFX10-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v1, v0 ; GFX10-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v2, v3, vcc_lo +; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: global_load_ubyte v0, v[0:1], off ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v1, v0 @@ -1584,8 +1585,7 @@ ; GFX10-GISEL-NEXT: v_min_u32_e32 v1, 32, v1 ; GFX10-GISEL-NEXT: v_subrev_nc_u32_e32 v1, 24, v1 ; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, v1, -1, vcc_lo -; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-GISEL-NEXT: global_store_byte v1, v0, s[0:1] +; GFX10-GISEL-NEXT: global_store_byte v2, v0, s[0:1] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-LABEL: v_ctlz_i8_sel_eq_neg1: Index: llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll +++ llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll @@ -763,13 +763,13 @@ ; GFX9-GISEL: ; %bb.0: ; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: global_load_dword v0, v0, s[2:3] ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v1, v0 +; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v2, v0 ; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v0, v1, -1, vcc -; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v0, v2, -1, vcc ; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[0:1] ; GFX9-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -842,13 +842,13 @@ ; GFX9-GISEL: ; %bb.0: ; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX9-GISEL-NEXT: global_load_dword v0, v0, s[2:3] ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v1, v0 +; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v2, v0 ; GFX9-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, -1, v1, vcc -; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, -1, v2, vcc ; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[0:1] ; GFX9-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -932,12 +932,12 @@ ; GFX9-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v1, v0 ; GFX9-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, v2, v3, vcc ; GFX9-GISEL-NEXT: global_load_ubyte v0, v[0:1], off +; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v1, v0 -; GFX9-GISEL-NEXT: v_subrev_u32_e32 v1, 24, v1 +; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v2, v0 +; GFX9-GISEL-NEXT: v_subrev_u32_e32 v2, 24, v2 ; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v0, v1, -1, vcc -; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v0, v2, -1, vcc ; GFX9-GISEL-NEXT: global_store_byte v1, v0, s[0:1] ; GFX9-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -1117,13 +1117,13 @@ ; GFX9-GISEL: ; %bb.0: ; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: global_load_dword v0, v0, s[2:3] ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v1, v0 +; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v2, v0 ; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v0, v1, 0, vcc -; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc ; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[0:1] ; GFX9-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -1201,13 +1201,13 @@ ; GFX9-GISEL: ; %bb.0: ; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: global_load_dword v0, v0, s[2:3] ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v1, v0 +; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v2, v0 ; GFX9-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc -; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc ; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[0:1] ; GFX9-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -1286,13 +1286,13 @@ ; GFX9-GISEL: ; %bb.0: ; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: global_load_dword v0, v0, s[2:3] ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v1, v0 +; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v2, v0 ; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v0, v1, 0, vcc -; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc ; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[0:1] ; GFX9-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -1371,13 +1371,13 @@ ; GFX9-GISEL: ; %bb.0: ; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-GISEL-NEXT: 
v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: global_load_dword v0, v0, s[2:3] ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v1, v0 +; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v2, v0 ; GFX9-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 1, v0 -; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc -; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc ; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[0:1] ; GFX9-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() Index: llvm/test/CodeGen/AMDGPU/cttz.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/cttz.ll +++ llvm/test/CodeGen/AMDGPU/cttz.ll @@ -957,6 +957,7 @@ ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: global_load_dword v0, v0, s[2:3] ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) @@ -964,8 +965,7 @@ ; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX10-GISEL-NEXT: v_min_u32_e32 v1, 32, v1 ; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, v1, -1, vcc_lo -; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-GISEL-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i32, ptr addrspace(1) %valptr, i32 %tid @@ -1051,6 +1051,7 @@ ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: global_load_dword v0, v0, s[2:3] ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) @@ -1058,8 +1059,7 @@ ; GFX10-GISEL-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 ; GFX10-GISEL-NEXT: v_min_u32_e32 v1, 32, v1 ; GFX10-GISEL-NEXT: v_cndmask_b32_e32 v0, -1, v1, vcc_lo -; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-GISEL-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i32, ptr addrspace(1) %valptr, i32 %tid Index: llvm/test/CodeGen/AMDGPU/ds-alignment.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/ds-alignment.ll +++ llvm/test/CodeGen/AMDGPU/ds-alignment.ll @@ -105,14 +105,14 @@ ; ALIGNED-GISEL-LABEL: ds4align1: ; ALIGNED-GISEL: ; %bb.0: ; ALIGNED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v4, 8 ; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; ALIGNED-GISEL-NEXT: ds_read_u8 v1, v0 ; ALIGNED-GISEL-NEXT: ds_read_u8 v2, v0 offset:1 ; ALIGNED-GISEL-NEXT: ds_read_u8 v3, v0 offset:3 ; ALIGNED-GISEL-NEXT: ds_read_u8 v0, v0 offset:2 -; ALIGNED-GISEL-NEXT: s_mov_b32 s0, 8 -; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v4, s1 +; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v5, s1 ; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(2) ; ALIGNED-GISEL-NEXT: v_lshl_or_b32 v1, v2, 8, v1 ; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(1) @@ -121,11 +121,11 @@ ; ALIGNED-GISEL-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; ALIGNED-GISEL-NEXT: v_or3_b32 v0, v2, v0, v1 ; ALIGNED-GISEL-NEXT: v_lshrrev_b16_e32 v1, 8, v0 -; ALIGNED-GISEL-NEXT: ds_write_b8 v4, v0 -; ALIGNED-GISEL-NEXT: ds_write_b8 v4, v1 offset:1 -; 
ALIGNED-GISEL-NEXT: v_lshrrev_b16_sdwa v1, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; ALIGNED-GISEL-NEXT: ds_write_b8_d16_hi v4, v0 offset:2 -; ALIGNED-GISEL-NEXT: ds_write_b8 v4, v1 offset:3 +; ALIGNED-GISEL-NEXT: ds_write_b8 v5, v0 +; ALIGNED-GISEL-NEXT: ds_write_b8 v5, v1 offset:1 +; ALIGNED-GISEL-NEXT: v_lshrrev_b16_sdwa v1, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; ALIGNED-GISEL-NEXT: ds_write_b8_d16_hi v5, v0 offset:2 +; ALIGNED-GISEL-NEXT: ds_write_b8 v5, v1 offset:3 ; ALIGNED-GISEL-NEXT: s_endpgm ; ; UNALIGNED-LABEL: ds4align1: @@ -235,7 +235,6 @@ ; ALIGNED-GISEL-LABEL: ds8align1: ; ALIGNED-GISEL: ; %bb.0: ; ALIGNED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; ALIGNED-GISEL-NEXT: s_mov_b32 s2, 8 ; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; ALIGNED-GISEL-NEXT: ds_read_u8 v1, v0 @@ -262,13 +261,14 @@ ; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; ALIGNED-GISEL-NEXT: ds_write_b8 v3, v1 ; ALIGNED-GISEL-NEXT: ds_write_b8 v3, v2 offset:1 -; ALIGNED-GISEL-NEXT: v_lshrrev_b16_sdwa v2, s2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v2, 8 +; ALIGNED-GISEL-NEXT: v_lshrrev_b16_sdwa v4, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; ALIGNED-GISEL-NEXT: ds_write_b8_d16_hi v3, v1 offset:2 -; ALIGNED-GISEL-NEXT: ds_write_b8 v3, v2 offset:3 +; ALIGNED-GISEL-NEXT: ds_write_b8 v3, v4 offset:3 ; ALIGNED-GISEL-NEXT: v_lshrrev_b16_e32 v1, 8, v0 ; ALIGNED-GISEL-NEXT: ds_write_b8 v3, v0 offset:4 ; ALIGNED-GISEL-NEXT: ds_write_b8 v3, v1 offset:5 -; ALIGNED-GISEL-NEXT: v_lshrrev_b16_sdwa v1, s2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; ALIGNED-GISEL-NEXT: v_lshrrev_b16_sdwa v1, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; ALIGNED-GISEL-NEXT: ds_write_b8_d16_hi v3, v0 offset:6 ; ALIGNED-GISEL-NEXT: ds_write_b8 v3, v1 offset:7 ; ALIGNED-GISEL-NEXT: s_endpgm @@ -416,7 +416,6 @@ ; ALIGNED-GISEL-LABEL: ds12align1: ; ALIGNED-GISEL: ; %bb.0: ; ALIGNED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; ALIGNED-GISEL-NEXT: s_mov_b32 s2, 8 ; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; ALIGNED-GISEL-NEXT: ds_read_u8 v1, v0 @@ -448,25 +447,26 @@ ; ALIGNED-GISEL-NEXT: v_lshlrev_b32_e32 v4, 16, v5 ; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; ALIGNED-GISEL-NEXT: v_lshlrev_b32_e32 v0, 24, v0 -; ALIGNED-GISEL-NEXT: v_or3_b32 v2, v6, v7, v2 ; ALIGNED-GISEL-NEXT: v_or3_b32 v0, v0, v4, v3 ; ALIGNED-GISEL-NEXT: v_lshrrev_b16_e32 v3, 8, v1 ; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v4, s1 +; ALIGNED-GISEL-NEXT: v_or3_b32 v2, v6, v7, v2 ; ALIGNED-GISEL-NEXT: ds_write_b8 v4, v1 ; ALIGNED-GISEL-NEXT: ds_write_b8 v4, v3 offset:1 -; ALIGNED-GISEL-NEXT: v_lshrrev_b16_sdwa v3, s2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v3, 8 +; ALIGNED-GISEL-NEXT: v_lshrrev_b16_sdwa v5, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; ALIGNED-GISEL-NEXT: ds_write_b8_d16_hi v4, v1 offset:2 -; ALIGNED-GISEL-NEXT: ds_write_b8 v4, v3 offset:3 +; ALIGNED-GISEL-NEXT: ds_write_b8 v4, v5 offset:3 ; ALIGNED-GISEL-NEXT: v_lshrrev_b16_e32 v1, 8, v2 ; ALIGNED-GISEL-NEXT: ds_write_b8 v4, v2 offset:4 ; ALIGNED-GISEL-NEXT: ds_write_b8 v4, v1 offset:5 -; ALIGNED-GISEL-NEXT: v_lshrrev_b16_sdwa v1, s2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; 
ALIGNED-GISEL-NEXT: v_lshrrev_b16_sdwa v1, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; ALIGNED-GISEL-NEXT: ds_write_b8_d16_hi v4, v2 offset:6 ; ALIGNED-GISEL-NEXT: ds_write_b8 v4, v1 offset:7 ; ALIGNED-GISEL-NEXT: v_lshrrev_b16_e32 v1, 8, v0 ; ALIGNED-GISEL-NEXT: ds_write_b8 v4, v0 offset:8 ; ALIGNED-GISEL-NEXT: ds_write_b8 v4, v1 offset:9 -; ALIGNED-GISEL-NEXT: v_lshrrev_b16_sdwa v1, s2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; ALIGNED-GISEL-NEXT: v_lshrrev_b16_sdwa v1, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; ALIGNED-GISEL-NEXT: ds_write_b8_d16_hi v4, v0 offset:10 ; ALIGNED-GISEL-NEXT: ds_write_b8 v4, v1 offset:11 ; ALIGNED-GISEL-NEXT: s_endpgm @@ -717,7 +717,6 @@ ; ALIGNED-GISEL-LABEL: ds16align1: ; ALIGNED-GISEL: ; %bb.0: ; ALIGNED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; ALIGNED-GISEL-NEXT: s_mov_b32 s2, 8 ; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; ALIGNED-GISEL-NEXT: ds_read_u8 v1, v0 @@ -764,26 +763,26 @@ ; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v5, s1 ; ALIGNED-GISEL-NEXT: ds_write_b8 v5, v1 ; ALIGNED-GISEL-NEXT: ds_write_b8 v5, v4 offset:1 -; ALIGNED-GISEL-NEXT: v_lshrrev_b16_sdwa v4, s2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v4, 8 +; ALIGNED-GISEL-NEXT: v_lshrrev_b16_sdwa v6, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; ALIGNED-GISEL-NEXT: ds_write_b8_d16_hi v5, v1 offset:2 -; ALIGNED-GISEL-NEXT: ds_write_b8 v5, v4 offset:3 +; ALIGNED-GISEL-NEXT: ds_write_b8 v5, v6 offset:3 ; ALIGNED-GISEL-NEXT: v_lshrrev_b16_e32 v1, 8, v2 ; ALIGNED-GISEL-NEXT: ds_write_b8 v5, v2 offset:4 ; ALIGNED-GISEL-NEXT: ds_write_b8 v5, v1 offset:5 -; ALIGNED-GISEL-NEXT: v_lshrrev_b16_sdwa v1, s2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; ALIGNED-GISEL-NEXT: v_lshrrev_b16_sdwa v1, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; ALIGNED-GISEL-NEXT: ds_write_b8_d16_hi v5, v2 offset:6 ; ALIGNED-GISEL-NEXT: ds_write_b8 v5, v1 offset:7 ; ALIGNED-GISEL-NEXT: v_lshrrev_b16_e32 v1, 8, v3 ; ALIGNED-GISEL-NEXT: ds_write_b8 v5, v3 offset:8 ; ALIGNED-GISEL-NEXT: ds_write_b8 v5, v1 offset:9 -; ALIGNED-GISEL-NEXT: v_lshrrev_b16_sdwa v1, s2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; ALIGNED-GISEL-NEXT: v_lshrrev_b16_sdwa v1, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; ALIGNED-GISEL-NEXT: ds_write_b8_d16_hi v5, v3 offset:10 ; ALIGNED-GISEL-NEXT: ds_write_b8 v5, v1 offset:11 ; ALIGNED-GISEL-NEXT: v_lshrrev_b16_e32 v1, 8, v0 ; ALIGNED-GISEL-NEXT: ds_write_b8 v5, v0 offset:12 ; ALIGNED-GISEL-NEXT: ds_write_b8 v5, v1 offset:13 -; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v1, 8 -; ALIGNED-GISEL-NEXT: v_lshrrev_b16_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; ALIGNED-GISEL-NEXT: v_lshrrev_b16_sdwa v1, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; ALIGNED-GISEL-NEXT: ds_write_b8_d16_hi v5, v0 offset:14 ; ALIGNED-GISEL-NEXT: ds_write_b8 v5, v1 offset:15 ; ALIGNED-GISEL-NEXT: s_endpgm Index: llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll +++ llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll @@ -36,17 +36,17 @@ ; GFX940-GISEL-NEXT: s_load_dword s0, s[0:1], 0x24 ; GFX940-GISEL-NEXT: v_mov_b32_e32 v1, 
4 ; GFX940-GISEL-NEXT: v_mov_b32_e32 v2, 1 +; GFX940-GISEL-NEXT: v_mov_b32_e32 v3, 2 ; GFX940-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-GISEL-NEXT: v_add3_u32 v0, v1, s0, v0 ; GFX940-GISEL-NEXT: v_add_u32_e32 v1, 1, v0 -; GFX940-GISEL-NEXT: v_add_u32_e32 v3, 2, v0 ; GFX940-GISEL-NEXT: scratch_store_byte v1, v2, off sc0 sc1 ; GFX940-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX940-GISEL-NEXT: v_mov_b32_e32 v1, 2 -; GFX940-GISEL-NEXT: scratch_store_byte v3, v1, off sc0 sc1 +; GFX940-GISEL-NEXT: v_add_u32_e32 v1, 2, v0 +; GFX940-GISEL-NEXT: scratch_store_byte v1, v3, off sc0 sc1 ; GFX940-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX940-GISEL-NEXT: v_add_u32_e32 v0, 4, v0 ; GFX940-GISEL-NEXT: v_mov_b32_e32 v1, 4 +; GFX940-GISEL-NEXT: v_add_u32_e32 v0, 4, v0 ; GFX940-GISEL-NEXT: scratch_store_byte v0, v1, off sc0 sc1 ; GFX940-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX940-GISEL-NEXT: s_endpgm @@ -135,12 +135,12 @@ ; GFX940-GISEL-NEXT: v_add_u32_e32 v1, 1, v0 ; GFX940-GISEL-NEXT: scratch_store_byte v1, v2, off sc0 sc1 ; GFX940-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX940-GISEL-NEXT: v_add_u32_e32 v1, 2, v0 -; GFX940-GISEL-NEXT: v_mov_b32_e32 v2, 2 -; GFX940-GISEL-NEXT: scratch_store_byte v1, v2, off sc0 sc1 +; GFX940-GISEL-NEXT: v_mov_b32_e32 v1, 2 +; GFX940-GISEL-NEXT: v_add_u32_e32 v2, 2, v0 +; GFX940-GISEL-NEXT: scratch_store_byte v2, v1, off sc0 sc1 ; GFX940-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX940-GISEL-NEXT: v_add_u32_e32 v0, 4, v0 ; GFX940-GISEL-NEXT: v_mov_b32_e32 v1, 4 +; GFX940-GISEL-NEXT: v_add_u32_e32 v0, 4, v0 ; GFX940-GISEL-NEXT: scratch_store_byte v0, v1, off sc0 sc1 ; GFX940-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX940-GISEL-NEXT: s_endpgm @@ -225,18 +225,18 @@ ; GFX940-GISEL-NEXT: s_load_dword s0, s[0:1], 0x24 ; GFX940-GISEL-NEXT: v_mov_b32_e32 v1, 4 ; GFX940-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX940-GISEL-NEXT: v_mov_b32_e32 v2, 1 +; GFX940-GISEL-NEXT: v_mov_b32_e32 v2, 2 ; GFX940-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-GISEL-NEXT: v_add3_u32 v0, v1, s0, v0 -; GFX940-GISEL-NEXT: v_add_u32_e32 v1, 1, v0 -; GFX940-GISEL-NEXT: scratch_store_byte v1, v2, off sc0 sc1 +; GFX940-GISEL-NEXT: v_mov_b32_e32 v1, 1 +; GFX940-GISEL-NEXT: v_add_u32_e32 v3, 1, v0 +; GFX940-GISEL-NEXT: scratch_store_byte v3, v1, off sc0 sc1 ; GFX940-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX940-GISEL-NEXT: v_add_u32_e32 v1, 2, v0 -; GFX940-GISEL-NEXT: v_mov_b32_e32 v2, 2 ; GFX940-GISEL-NEXT: scratch_store_byte v1, v2, off sc0 sc1 ; GFX940-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX940-GISEL-NEXT: v_add_u32_e32 v0, 4, v0 ; GFX940-GISEL-NEXT: v_mov_b32_e32 v1, 4 +; GFX940-GISEL-NEXT: v_add_u32_e32 v0, 4, v0 ; GFX940-GISEL-NEXT: scratch_store_byte v0, v1, off sc0 sc1 ; GFX940-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX940-GISEL-NEXT: s_endpgm @@ -263,17 +263,17 @@ ; GFX11-GISEL-LABEL: soff1_voff4: ; GFX11-GISEL: ; %bb.0: ; %bb ; GFX11-GISEL-NEXT: s_load_b32 s0, s[0:1], 0x24 -; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_lshlrev_b32 v0, 2, v0 -; GFX11-GISEL-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 +; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 2 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX11-GISEL-NEXT: v_dual_mov_b32 v2, 1 :: v_dual_mov_b32 v3, 4 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-GISEL-NEXT: v_add3_u32 v0, 4, s0, v0 ; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v4, 1, v0 ; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v5, 2, v0 ; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v0, 4, v0 -; GFX11-GISEL-NEXT: scratch_store_b8 v4, v1, off dlc +; GFX11-GISEL-NEXT: scratch_store_b8 
v4, v2, off dlc ; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-GISEL-NEXT: scratch_store_b8 v5, v2, off dlc +; GFX11-GISEL-NEXT: scratch_store_b8 v5, v1, off dlc ; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-GISEL-NEXT: scratch_store_b8 v0, v3, off dlc ; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 @@ -322,18 +322,18 @@ ; GFX940-GISEL-NEXT: s_load_dword s0, s[0:1], 0x24 ; GFX940-GISEL-NEXT: v_mov_b32_e32 v1, 4 ; GFX940-GISEL-NEXT: v_mov_b32_e32 v2, 1 +; GFX940-GISEL-NEXT: v_mov_b32_e32 v3, 2 ; GFX940-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-GISEL-NEXT: s_lshl_b32 s0, s0, 1 ; GFX940-GISEL-NEXT: v_add3_u32 v0, v1, s0, v0 ; GFX940-GISEL-NEXT: v_add_u32_e32 v1, 1, v0 -; GFX940-GISEL-NEXT: v_add_u32_e32 v3, 2, v0 ; GFX940-GISEL-NEXT: scratch_store_byte v1, v2, off sc0 sc1 ; GFX940-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX940-GISEL-NEXT: v_mov_b32_e32 v1, 2 -; GFX940-GISEL-NEXT: scratch_store_byte v3, v1, off sc0 sc1 +; GFX940-GISEL-NEXT: v_add_u32_e32 v1, 2, v0 +; GFX940-GISEL-NEXT: scratch_store_byte v1, v3, off sc0 sc1 ; GFX940-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX940-GISEL-NEXT: v_add_u32_e32 v0, 4, v0 ; GFX940-GISEL-NEXT: v_mov_b32_e32 v1, 4 +; GFX940-GISEL-NEXT: v_add_u32_e32 v0, 4, v0 ; GFX940-GISEL-NEXT: scratch_store_byte v0, v1, off sc0 sc1 ; GFX940-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX940-GISEL-NEXT: s_endpgm @@ -426,12 +426,12 @@ ; GFX940-GISEL-NEXT: v_add_u32_e32 v1, 1, v0 ; GFX940-GISEL-NEXT: scratch_store_byte v1, v2, off sc0 sc1 ; GFX940-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX940-GISEL-NEXT: v_add_u32_e32 v1, 2, v0 -; GFX940-GISEL-NEXT: v_mov_b32_e32 v2, 2 -; GFX940-GISEL-NEXT: scratch_store_byte v1, v2, off sc0 sc1 +; GFX940-GISEL-NEXT: v_mov_b32_e32 v1, 2 +; GFX940-GISEL-NEXT: v_add_u32_e32 v2, 2, v0 +; GFX940-GISEL-NEXT: scratch_store_byte v2, v1, off sc0 sc1 ; GFX940-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX940-GISEL-NEXT: v_add_u32_e32 v0, 4, v0 ; GFX940-GISEL-NEXT: v_mov_b32_e32 v1, 4 +; GFX940-GISEL-NEXT: v_add_u32_e32 v0, 4, v0 ; GFX940-GISEL-NEXT: scratch_store_byte v0, v1, off sc0 sc1 ; GFX940-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX940-GISEL-NEXT: s_endpgm @@ -521,19 +521,19 @@ ; GFX940-GISEL-NEXT: s_load_dword s0, s[0:1], 0x24 ; GFX940-GISEL-NEXT: v_mov_b32_e32 v1, 4 ; GFX940-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX940-GISEL-NEXT: v_mov_b32_e32 v2, 1 +; GFX940-GISEL-NEXT: v_mov_b32_e32 v2, 2 ; GFX940-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-GISEL-NEXT: s_lshl_b32 s0, s0, 1 ; GFX940-GISEL-NEXT: v_add3_u32 v0, v1, s0, v0 -; GFX940-GISEL-NEXT: v_add_u32_e32 v1, 1, v0 -; GFX940-GISEL-NEXT: scratch_store_byte v1, v2, off sc0 sc1 +; GFX940-GISEL-NEXT: v_mov_b32_e32 v1, 1 +; GFX940-GISEL-NEXT: v_add_u32_e32 v3, 1, v0 +; GFX940-GISEL-NEXT: scratch_store_byte v3, v1, off sc0 sc1 ; GFX940-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX940-GISEL-NEXT: v_add_u32_e32 v1, 2, v0 -; GFX940-GISEL-NEXT: v_mov_b32_e32 v2, 2 ; GFX940-GISEL-NEXT: scratch_store_byte v1, v2, off sc0 sc1 ; GFX940-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX940-GISEL-NEXT: v_add_u32_e32 v0, 4, v0 ; GFX940-GISEL-NEXT: v_mov_b32_e32 v1, 4 +; GFX940-GISEL-NEXT: v_add_u32_e32 v0, 4, v0 ; GFX940-GISEL-NEXT: scratch_store_byte v0, v1, off sc0 sc1 ; GFX940-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX940-GISEL-NEXT: s_endpgm @@ -562,8 +562,8 @@ ; GFX11-GISEL-LABEL: soff2_voff4: ; GFX11-GISEL: ; %bb.0: ; %bb ; GFX11-GISEL-NEXT: s_load_b32 s0, s[0:1], 0x24 -; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_lshlrev_b32 v0, 2, v0 -; GFX11-GISEL-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 +; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 2 
:: v_dual_lshlrev_b32 v0, 2, v0 +; GFX11-GISEL-NEXT: v_dual_mov_b32 v2, 1 :: v_dual_mov_b32 v3, 4 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_lshl_b32 s0, s0, 1 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) @@ -572,9 +572,9 @@ ; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v4, 1, v0 ; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v5, 2, v0 ; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v0, 4, v0 -; GFX11-GISEL-NEXT: scratch_store_b8 v4, v1, off dlc +; GFX11-GISEL-NEXT: scratch_store_b8 v4, v2, off dlc ; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-GISEL-NEXT: scratch_store_b8 v5, v2, off dlc +; GFX11-GISEL-NEXT: scratch_store_b8 v5, v1, off dlc ; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-GISEL-NEXT: scratch_store_b8 v0, v3, off dlc ; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 @@ -623,18 +623,18 @@ ; GFX940-GISEL-NEXT: s_load_dword s0, s[0:1], 0x24 ; GFX940-GISEL-NEXT: v_mov_b32_e32 v1, 4 ; GFX940-GISEL-NEXT: v_mov_b32_e32 v2, 1 +; GFX940-GISEL-NEXT: v_mov_b32_e32 v3, 2 ; GFX940-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-GISEL-NEXT: s_lshl_b32 s0, s0, 2 ; GFX940-GISEL-NEXT: v_add3_u32 v0, v1, s0, v0 ; GFX940-GISEL-NEXT: v_add_u32_e32 v1, 1, v0 -; GFX940-GISEL-NEXT: v_add_u32_e32 v3, 2, v0 ; GFX940-GISEL-NEXT: scratch_store_byte v1, v2, off sc0 sc1 ; GFX940-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX940-GISEL-NEXT: v_mov_b32_e32 v1, 2 -; GFX940-GISEL-NEXT: scratch_store_byte v3, v1, off sc0 sc1 +; GFX940-GISEL-NEXT: v_add_u32_e32 v1, 2, v0 +; GFX940-GISEL-NEXT: scratch_store_byte v1, v3, off sc0 sc1 ; GFX940-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX940-GISEL-NEXT: v_add_u32_e32 v0, 4, v0 ; GFX940-GISEL-NEXT: v_mov_b32_e32 v1, 4 +; GFX940-GISEL-NEXT: v_add_u32_e32 v0, 4, v0 ; GFX940-GISEL-NEXT: scratch_store_byte v0, v1, off sc0 sc1 ; GFX940-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX940-GISEL-NEXT: s_endpgm @@ -727,12 +727,12 @@ ; GFX940-GISEL-NEXT: v_add_u32_e32 v1, 1, v0 ; GFX940-GISEL-NEXT: scratch_store_byte v1, v2, off sc0 sc1 ; GFX940-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX940-GISEL-NEXT: v_add_u32_e32 v1, 2, v0 -; GFX940-GISEL-NEXT: v_mov_b32_e32 v2, 2 -; GFX940-GISEL-NEXT: scratch_store_byte v1, v2, off sc0 sc1 +; GFX940-GISEL-NEXT: v_mov_b32_e32 v1, 2 +; GFX940-GISEL-NEXT: v_add_u32_e32 v2, 2, v0 +; GFX940-GISEL-NEXT: scratch_store_byte v2, v1, off sc0 sc1 ; GFX940-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX940-GISEL-NEXT: v_add_u32_e32 v0, 4, v0 ; GFX940-GISEL-NEXT: v_mov_b32_e32 v1, 4 +; GFX940-GISEL-NEXT: v_add_u32_e32 v0, 4, v0 ; GFX940-GISEL-NEXT: scratch_store_byte v0, v1, off sc0 sc1 ; GFX940-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX940-GISEL-NEXT: s_endpgm @@ -822,19 +822,19 @@ ; GFX940-GISEL-NEXT: s_load_dword s0, s[0:1], 0x24 ; GFX940-GISEL-NEXT: v_mov_b32_e32 v1, 4 ; GFX940-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX940-GISEL-NEXT: v_mov_b32_e32 v2, 1 +; GFX940-GISEL-NEXT: v_mov_b32_e32 v2, 2 ; GFX940-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-GISEL-NEXT: s_lshl_b32 s0, s0, 2 ; GFX940-GISEL-NEXT: v_add3_u32 v0, v1, s0, v0 -; GFX940-GISEL-NEXT: v_add_u32_e32 v1, 1, v0 -; GFX940-GISEL-NEXT: scratch_store_byte v1, v2, off sc0 sc1 +; GFX940-GISEL-NEXT: v_mov_b32_e32 v1, 1 +; GFX940-GISEL-NEXT: v_add_u32_e32 v3, 1, v0 +; GFX940-GISEL-NEXT: scratch_store_byte v3, v1, off sc0 sc1 ; GFX940-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX940-GISEL-NEXT: v_add_u32_e32 v1, 2, v0 -; GFX940-GISEL-NEXT: v_mov_b32_e32 v2, 2 ; GFX940-GISEL-NEXT: scratch_store_byte v1, v2, off sc0 sc1 ; GFX940-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX940-GISEL-NEXT: v_add_u32_e32 v0, 4, v0 ; GFX940-GISEL-NEXT: 
v_mov_b32_e32 v1, 4 +; GFX940-GISEL-NEXT: v_add_u32_e32 v0, 4, v0 ; GFX940-GISEL-NEXT: scratch_store_byte v0, v1, off sc0 sc1 ; GFX940-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX940-GISEL-NEXT: s_endpgm @@ -863,8 +863,8 @@ ; GFX11-GISEL-LABEL: soff4_voff4: ; GFX11-GISEL: ; %bb.0: ; %bb ; GFX11-GISEL-NEXT: s_load_b32 s0, s[0:1], 0x24 -; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_lshlrev_b32 v0, 2, v0 -; GFX11-GISEL-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 +; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 2 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX11-GISEL-NEXT: v_dual_mov_b32 v2, 1 :: v_dual_mov_b32 v3, 4 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_lshl_b32 s0, s0, 2 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) @@ -873,9 +873,9 @@ ; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v4, 1, v0 ; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v5, 2, v0 ; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v0, 4, v0 -; GFX11-GISEL-NEXT: scratch_store_b8 v4, v1, off dlc +; GFX11-GISEL-NEXT: scratch_store_b8 v4, v2, off dlc ; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-GISEL-NEXT: scratch_store_b8 v5, v2, off dlc +; GFX11-GISEL-NEXT: scratch_store_b8 v5, v1, off dlc ; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-GISEL-NEXT: scratch_store_b8 v0, v3, off dlc ; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 Index: llvm/test/CodeGen/AMDGPU/fma.f16.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/fma.f16.ll +++ llvm/test/CodeGen/AMDGPU/fma.f16.ll @@ -96,18 +96,31 @@ ; Regression test for a crash caused by D139469. define i32 @test_D139469_f16(half %arg) { -; GFX9-LABEL: test_D139469_f16: -; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mul_f16_e32 v1, 0x291e, v0 -; GFX9-NEXT: s_movk_i32 s4, 0x291e -; GFX9-NEXT: v_cmp_gt_f16_e32 vcc, 0, v1 -; GFX9-NEXT: v_mov_b32_e32 v1, 0x211e -; GFX9-NEXT: v_fma_f16 v0, v0, s4, v1 -; GFX9-NEXT: v_cmp_gt_f16_e64 s[4:5], 0, v0 -; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-SDAG-LABEL: test_D139469_f16: +; GFX9-SDAG: ; %bb.0: ; %bb +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-SDAG-NEXT: v_mul_f16_e32 v1, 0x291e, v0 +; GFX9-SDAG-NEXT: s_movk_i32 s4, 0x291e +; GFX9-SDAG-NEXT: v_cmp_gt_f16_e32 vcc, 0, v1 +; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, 0x211e +; GFX9-SDAG-NEXT: v_fma_f16 v0, v0, s4, v1 +; GFX9-SDAG-NEXT: v_cmp_gt_f16_e64 s[4:5], 0, v0 +; GFX9-SDAG-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-GISEL-LABEL: test_D139469_f16: +; GFX9-GISEL: ; %bb.0: ; %bb +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-GISEL-NEXT: v_mul_f16_e32 v2, 0x291e, v0 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0x291e +; GFX9-GISEL-NEXT: v_cmp_gt_f16_e32 vcc, 0, v2 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0x211e +; GFX9-GISEL-NEXT: v_fma_f16 v0, v0, v1, v2 +; GFX9-GISEL-NEXT: v_cmp_gt_f16_e64 s[4:5], 0, v0 +; GFX9-GISEL-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-SDAG-LABEL: test_D139469_f16: ; GFX10-SDAG: ; %bb.0: ; %bb @@ -126,10 +139,10 @@ ; GFX10-GISEL: ; %bb.0: ; %bb ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-GISEL-NEXT: s_movk_i32 s4, 0x291e -; GFX10-GISEL-NEXT: v_mul_f16_e32 v1, 0x291e, v0 -; 
GFX10-GISEL-NEXT: v_fmaak_f16 v0, s4, v0, 0x211e -; GFX10-GISEL-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v1 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0x291e +; GFX10-GISEL-NEXT: v_mul_f16_e32 v2, 0x291e, v0 +; GFX10-GISEL-NEXT: v_fmaak_f16 v0, v0, v1, 0x211e +; GFX10-GISEL-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v2 ; GFX10-GISEL-NEXT: v_cmp_gt_f16_e64 s4, 0, v0 ; GFX10-GISEL-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4 @@ -167,14 +180,14 @@ ; GFX9-GISEL: ; %bb.0: ; %bb ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-GISEL-NEXT: s_mov_b32 s4, 0x291e291e -; GFX9-GISEL-NEXT: s_mov_b32 s8, 0 ; GFX9-GISEL-NEXT: v_pk_mul_f16 v1, v0, s4 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-GISEL-NEXT: v_cmp_gt_f16_e32 vcc, 0, v1 -; GFX9-GISEL-NEXT: v_cmp_lt_f16_sdwa s[6:7], v1, s8 src0_sel:WORD_1 src1_sel:DWORD +; GFX9-GISEL-NEXT: v_cmp_lt_f16_sdwa s[6:7], v1, v2 src0_sel:WORD_1 src1_sel:DWORD ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0x211e211e ; GFX9-GISEL-NEXT: v_pk_fma_f16 v0, v0, s4, v1 ; GFX9-GISEL-NEXT: v_cmp_gt_f16_e64 s[4:5], 0, v0 -; GFX9-GISEL-NEXT: v_cmp_lt_f16_sdwa s[8:9], v0, s8 src0_sel:WORD_1 src1_sel:DWORD +; GFX9-GISEL-NEXT: v_cmp_lt_f16_sdwa s[8:9], v0, v2 src0_sel:WORD_1 src1_sel:DWORD ; GFX9-GISEL-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] ; GFX9-GISEL-NEXT: s_or_b64 s[4:5], s[6:7], s[8:9] @@ -206,14 +219,14 @@ ; GFX10-GISEL-NEXT: s_mov_b32 s4, 0x291e291e ; GFX10-GISEL-NEXT: v_pk_mul_f16 v1, v0, 0x291e op_sel_hi:[1,0] ; GFX10-GISEL-NEXT: v_pk_fma_f16 v0, v0, s4, 0x211e op_sel_hi:[1,1,0] -; GFX10-GISEL-NEXT: s_mov_b32 s5, 0 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v1 ; GFX10-GISEL-NEXT: v_cmp_gt_f16_e64 s4, 0, v0 -; GFX10-GISEL-NEXT: v_cmp_lt_f16_sdwa s6, v1, s5 src0_sel:WORD_1 src1_sel:DWORD -; GFX10-GISEL-NEXT: v_cmp_lt_f16_sdwa s5, v0, s5 src0_sel:WORD_1 src1_sel:DWORD +; GFX10-GISEL-NEXT: v_cmp_lt_f16_sdwa s5, v1, v2 src0_sel:WORD_1 src1_sel:DWORD +; GFX10-GISEL-NEXT: v_cmp_lt_f16_sdwa s6, v0, v2 src0_sel:WORD_1 src1_sel:DWORD ; GFX10-GISEL-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4 -; GFX10-GISEL-NEXT: s_or_b32 s4, s6, s5 +; GFX10-GISEL-NEXT: s_or_b32 s4, s5, s6 ; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, s4 ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] bb: Index: llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll +++ llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll @@ -404,10 +404,9 @@ ; GFX8-GISEL-LABEL: clpeak_imad_pat_v2i16: ; GFX8-GISEL: ; %bb.0: ; %entry ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-GISEL-NEXT: v_mov_b32_e32 v4, 1 ; GFX8-GISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX8-GISEL-NEXT: v_add_u16_e32 v3, 1, v0 -; GFX8-GISEL-NEXT: v_add_u16_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-GISEL-NEXT: v_add_u16_e32 v4, 1, v2 ; GFX8-GISEL-NEXT: v_lshrrev_b32_e32 v5, 16, v1 ; GFX8-GISEL-NEXT: v_mad_u16 v0, v3, v1, v0 ; GFX8-GISEL-NEXT: v_mad_u16 v2, v4, v5, v2 @@ -983,19 +982,18 @@ ; GFX8-GISEL-LABEL: clpeak_imad_pat_v4i16: ; GFX8-GISEL: ; %bb.0: ; %entry ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-GISEL-NEXT: v_mov_b32_e32 v7, 1 ; GFX8-GISEL-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; GFX8-GISEL-NEXT: v_lshrrev_b32_e32 v5, 16, v1 ; GFX8-GISEL-NEXT: v_add_u16_e32 v6, 1, v0 -; GFX8-GISEL-NEXT: v_add_u16_sdwa v8, 
v0, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-GISEL-NEXT: v_add_u16_e32 v7, 1, v4 ; GFX8-GISEL-NEXT: v_lshrrev_b32_e32 v10, 16, v2 -; GFX8-GISEL-NEXT: v_lshrrev_b32_e32 v5, 16, v1 -; GFX8-GISEL-NEXT: v_add_u16_e32 v9, 1, v1 -; GFX8-GISEL-NEXT: v_add_u16_sdwa v7, v1, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-GISEL-NEXT: v_add_u16_e32 v8, 1, v1 +; GFX8-GISEL-NEXT: v_add_u16_e32 v9, 1, v5 ; GFX8-GISEL-NEXT: v_lshrrev_b32_e32 v11, 16, v3 ; GFX8-GISEL-NEXT: v_mad_u16 v0, v6, v2, v0 -; GFX8-GISEL-NEXT: v_mad_u16 v4, v8, v10, v4 -; GFX8-GISEL-NEXT: v_mad_u16 v1, v9, v3, v1 -; GFX8-GISEL-NEXT: v_mad_u16 v5, v7, v11, v5 +; GFX8-GISEL-NEXT: v_mad_u16 v4, v7, v10, v4 +; GFX8-GISEL-NEXT: v_mad_u16 v1, v8, v3, v1 +; GFX8-GISEL-NEXT: v_mad_u16 v5, v9, v11, v5 ; GFX8-GISEL-NEXT: v_add_u16_e32 v0, 1, v0 ; GFX8-GISEL-NEXT: v_add_u16_e32 v4, 1, v4 ; GFX8-GISEL-NEXT: v_add_u16_e32 v1, 1, v1 @@ -1003,19 +1001,19 @@ ; GFX8-GISEL-NEXT: v_mul_lo_u16_e32 v12, v0, v2 ; GFX8-GISEL-NEXT: v_mul_lo_u16_e32 v13, v4, v10 ; GFX8-GISEL-NEXT: v_mad_u16 v6, v6, v2, 1 -; GFX8-GISEL-NEXT: v_mad_u16 v8, v8, v10, 1 +; GFX8-GISEL-NEXT: v_mad_u16 v7, v7, v10, 1 ; GFX8-GISEL-NEXT: v_mul_lo_u16_e32 v14, v1, v3 ; GFX8-GISEL-NEXT: v_mul_lo_u16_e32 v15, v5, v11 -; GFX8-GISEL-NEXT: v_mad_u16 v9, v9, v3, 1 -; GFX8-GISEL-NEXT: v_mad_u16 v7, v7, v11, 1 +; GFX8-GISEL-NEXT: v_mad_u16 v8, v8, v3, 1 +; GFX8-GISEL-NEXT: v_mad_u16 v9, v9, v11, 1 ; GFX8-GISEL-NEXT: v_mad_u16 v0, v0, v2, 1 ; GFX8-GISEL-NEXT: v_mad_u16 v2, v4, v10, 1 ; GFX8-GISEL-NEXT: v_mad_u16 v1, v1, v3, 1 ; GFX8-GISEL-NEXT: v_mad_u16 v3, v5, v11, 1 ; GFX8-GISEL-NEXT: v_mul_lo_u16_e32 v4, v12, v6 -; GFX8-GISEL-NEXT: v_mul_lo_u16_e32 v5, v13, v8 -; GFX8-GISEL-NEXT: v_mul_lo_u16_e32 v6, v14, v9 -; GFX8-GISEL-NEXT: v_mul_lo_u16_e32 v7, v15, v7 +; GFX8-GISEL-NEXT: v_mul_lo_u16_e32 v5, v13, v7 +; GFX8-GISEL-NEXT: v_mul_lo_u16_e32 v6, v14, v8 +; GFX8-GISEL-NEXT: v_mul_lo_u16_e32 v7, v15, v9 ; GFX8-GISEL-NEXT: v_mul_lo_u16_e32 v0, v4, v0 ; GFX8-GISEL-NEXT: v_mul_lo_u16_sdwa v2, v5, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-GISEL-NEXT: v_or_b32_e32 v0, v0, v2 @@ -1364,10 +1362,9 @@ ; GFX8-GISEL-LABEL: clpeak_umad_pat_v2i16: ; GFX8-GISEL: ; %bb.0: ; %entry ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-GISEL-NEXT: v_mov_b32_e32 v4, 1 ; GFX8-GISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX8-GISEL-NEXT: v_add_u16_e32 v3, 1, v0 -; GFX8-GISEL-NEXT: v_add_u16_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-GISEL-NEXT: v_add_u16_e32 v4, 1, v2 ; GFX8-GISEL-NEXT: v_lshrrev_b32_e32 v5, 16, v1 ; GFX8-GISEL-NEXT: v_mad_u16 v0, v3, v1, v0 ; GFX8-GISEL-NEXT: v_mad_u16 v2, v4, v5, v2 @@ -1943,19 +1940,18 @@ ; GFX8-GISEL-LABEL: clpeak_umad_pat_v4i16: ; GFX8-GISEL: ; %bb.0: ; %entry ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-GISEL-NEXT: v_mov_b32_e32 v7, 1 ; GFX8-GISEL-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; GFX8-GISEL-NEXT: v_lshrrev_b32_e32 v5, 16, v1 ; GFX8-GISEL-NEXT: v_add_u16_e32 v6, 1, v0 -; GFX8-GISEL-NEXT: v_add_u16_sdwa v8, v0, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-GISEL-NEXT: v_add_u16_e32 v7, 1, v4 ; GFX8-GISEL-NEXT: v_lshrrev_b32_e32 v10, 16, v2 -; GFX8-GISEL-NEXT: v_lshrrev_b32_e32 v5, 16, v1 -; GFX8-GISEL-NEXT: v_add_u16_e32 v9, 1, v1 -; GFX8-GISEL-NEXT: v_add_u16_sdwa v7, v1, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; 
GFX8-GISEL-NEXT: v_add_u16_e32 v8, 1, v1 +; GFX8-GISEL-NEXT: v_add_u16_e32 v9, 1, v5 ; GFX8-GISEL-NEXT: v_lshrrev_b32_e32 v11, 16, v3 ; GFX8-GISEL-NEXT: v_mad_u16 v0, v6, v2, v0 -; GFX8-GISEL-NEXT: v_mad_u16 v4, v8, v10, v4 -; GFX8-GISEL-NEXT: v_mad_u16 v1, v9, v3, v1 -; GFX8-GISEL-NEXT: v_mad_u16 v5, v7, v11, v5 +; GFX8-GISEL-NEXT: v_mad_u16 v4, v7, v10, v4 +; GFX8-GISEL-NEXT: v_mad_u16 v1, v8, v3, v1 +; GFX8-GISEL-NEXT: v_mad_u16 v5, v9, v11, v5 ; GFX8-GISEL-NEXT: v_add_u16_e32 v0, 1, v0 ; GFX8-GISEL-NEXT: v_add_u16_e32 v4, 1, v4 ; GFX8-GISEL-NEXT: v_add_u16_e32 v1, 1, v1 @@ -1963,19 +1959,19 @@ ; GFX8-GISEL-NEXT: v_mul_lo_u16_e32 v12, v0, v2 ; GFX8-GISEL-NEXT: v_mul_lo_u16_e32 v13, v4, v10 ; GFX8-GISEL-NEXT: v_mad_u16 v6, v6, v2, 1 -; GFX8-GISEL-NEXT: v_mad_u16 v8, v8, v10, 1 +; GFX8-GISEL-NEXT: v_mad_u16 v7, v7, v10, 1 ; GFX8-GISEL-NEXT: v_mul_lo_u16_e32 v14, v1, v3 ; GFX8-GISEL-NEXT: v_mul_lo_u16_e32 v15, v5, v11 -; GFX8-GISEL-NEXT: v_mad_u16 v9, v9, v3, 1 -; GFX8-GISEL-NEXT: v_mad_u16 v7, v7, v11, 1 +; GFX8-GISEL-NEXT: v_mad_u16 v8, v8, v3, 1 +; GFX8-GISEL-NEXT: v_mad_u16 v9, v9, v11, 1 ; GFX8-GISEL-NEXT: v_mad_u16 v0, v0, v2, 1 ; GFX8-GISEL-NEXT: v_mad_u16 v2, v4, v10, 1 ; GFX8-GISEL-NEXT: v_mad_u16 v1, v1, v3, 1 ; GFX8-GISEL-NEXT: v_mad_u16 v3, v5, v11, 1 ; GFX8-GISEL-NEXT: v_mul_lo_u16_e32 v4, v12, v6 -; GFX8-GISEL-NEXT: v_mul_lo_u16_e32 v5, v13, v8 -; GFX8-GISEL-NEXT: v_mul_lo_u16_e32 v6, v14, v9 -; GFX8-GISEL-NEXT: v_mul_lo_u16_e32 v7, v15, v7 +; GFX8-GISEL-NEXT: v_mul_lo_u16_e32 v5, v13, v7 +; GFX8-GISEL-NEXT: v_mul_lo_u16_e32 v6, v14, v8 +; GFX8-GISEL-NEXT: v_mul_lo_u16_e32 v7, v15, v9 ; GFX8-GISEL-NEXT: v_mul_lo_u16_e32 v0, v4, v0 ; GFX8-GISEL-NEXT: v_mul_lo_u16_sdwa v2, v5, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-GISEL-NEXT: v_or_b32_e32 v0, v0, v2 @@ -6343,17 +6339,16 @@ ; GFX8-GISEL-LABEL: clpeak_imad_pat_v2i16_x2: ; GFX8-GISEL: ; %bb.0: ; %entry ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-GISEL-NEXT: v_mov_b32_e32 v4, 1 ; GFX8-GISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX8-GISEL-NEXT: v_add_u16_e32 v3, 1, v0 -; GFX8-GISEL-NEXT: v_add_u16_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-GISEL-NEXT: v_add_u16_e32 v4, 1, v2 ; GFX8-GISEL-NEXT: v_lshrrev_b32_e32 v5, 16, v1 ; GFX8-GISEL-NEXT: v_mad_u16 v0, v3, v1, v0 ; GFX8-GISEL-NEXT: v_mad_u16 v2, v4, v5, v2 ; GFX8-GISEL-NEXT: v_add_u16_e32 v0, 1, v0 ; GFX8-GISEL-NEXT: v_add_u16_e32 v2, 1, v2 ; GFX8-GISEL-NEXT: v_mul_lo_u16_e32 v0, v0, v1 -; GFX8-GISEL-NEXT: v_mul_lo_u16_sdwa v2, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-GISEL-NEXT: v_mul_lo_u16_e32 v2, v2, v5 ; GFX8-GISEL-NEXT: v_mad_u16 v1, v3, v1, 1 ; GFX8-GISEL-NEXT: v_mad_u16 v3, v4, v5, 1 ; GFX8-GISEL-NEXT: v_mad_u16 v4, v0, v1, v1 @@ -6679,17 +6674,16 @@ ; GFX8-GISEL-LABEL: clpeak_umad_pat_v2i16_x2: ; GFX8-GISEL: ; %bb.0: ; %entry ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-GISEL-NEXT: v_mov_b32_e32 v4, 1 ; GFX8-GISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX8-GISEL-NEXT: v_add_u16_e32 v3, 1, v0 -; GFX8-GISEL-NEXT: v_add_u16_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-GISEL-NEXT: v_add_u16_e32 v4, 1, v2 ; GFX8-GISEL-NEXT: v_lshrrev_b32_e32 v5, 16, v1 ; GFX8-GISEL-NEXT: v_mad_u16 v0, v3, v1, v0 ; GFX8-GISEL-NEXT: v_mad_u16 v2, v4, v5, v2 ; GFX8-GISEL-NEXT: v_add_u16_e32 v0, 1, v0 ; GFX8-GISEL-NEXT: v_add_u16_e32 v2, 1, v2 ; GFX8-GISEL-NEXT: v_mul_lo_u16_e32 v0, 
v0, v1 -; GFX8-GISEL-NEXT: v_mul_lo_u16_sdwa v2, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-GISEL-NEXT: v_mul_lo_u16_e32 v2, v2, v5 ; GFX8-GISEL-NEXT: v_mad_u16 v1, v3, v1, 1 ; GFX8-GISEL-NEXT: v_mad_u16 v3, v4, v5, 1 ; GFX8-GISEL-NEXT: v_mad_u16 v4, v0, v1, v1 Index: llvm/test/CodeGen/AMDGPU/lds-atomic-fmin-fmax.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/lds-atomic-fmin-fmax.ll +++ llvm/test/CodeGen/AMDGPU/lds-atomic-fmin-fmax.ll @@ -269,18 +269,18 @@ ; G_GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c ; G_GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; G_GFX9-NEXT: s_addc_u32 s9, s9, 0 -; G_GFX9-NEXT: v_mov_b32_e32 v1, 0x42280000 +; G_GFX9-NEXT: v_mov_b32_e32 v0, 0x42280000 ; G_GFX9-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX9-NEXT: s_add_i32 s4, s4, 4 ; G_GFX9-NEXT: s_lshl_b32 s0, s4, 3 -; G_GFX9-NEXT: v_mov_b32_e32 v0, s0 -; G_GFX9-NEXT: ds_min_rtn_f32 v0, v0, v1 +; G_GFX9-NEXT: v_mov_b32_e32 v1, s0 +; G_GFX9-NEXT: ds_min_rtn_f32 v1, v1, v0 ; G_GFX9-NEXT: s_lshl_b32 s0, s4, 4 ; G_GFX9-NEXT: v_mov_b32_e32 v2, s0 -; G_GFX9-NEXT: ds_min_f32 v2, v1 -; G_GFX9-NEXT: v_mov_b32_e32 v1, s3 +; G_GFX9-NEXT: ds_min_f32 v2, v0 +; G_GFX9-NEXT: v_mov_b32_e32 v0, s3 ; G_GFX9-NEXT: s_waitcnt lgkmcnt(1) -; G_GFX9-NEXT: ds_min_rtn_f32 v0, v1, v0 +; G_GFX9-NEXT: ds_min_rtn_f32 v0, v0, v1 ; G_GFX9-NEXT: v_mov_b32_e32 v1, s2 ; G_GFX9-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX9-NEXT: buffer_store_dword v0, v1, s[8:11], 0 offen @@ -296,18 +296,18 @@ ; G_GFX10-NEXT: s_add_u32 s4, s4, s3 ; G_GFX10-NEXT: s_addc_u32 s5, s5, 0 ; G_GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; G_GFX10-NEXT: v_mov_b32_e32 v1, 0x42280000 +; G_GFX10-NEXT: v_mov_b32_e32 v0, 0x42280000 ; G_GFX10-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX10-NEXT: s_add_i32 s2, s2, 4 ; G_GFX10-NEXT: s_lshl_b32 s3, s2, 3 ; G_GFX10-NEXT: s_lshl_b32 s2, s2, 4 -; G_GFX10-NEXT: v_mov_b32_e32 v0, s3 +; G_GFX10-NEXT: v_mov_b32_e32 v1, s3 ; G_GFX10-NEXT: v_mov_b32_e32 v2, s2 ; G_GFX10-NEXT: v_mov_b32_e32 v3, s1 -; G_GFX10-NEXT: ds_min_rtn_f32 v0, v0, v1 -; G_GFX10-NEXT: ds_min_f32 v2, v1 +; G_GFX10-NEXT: ds_min_rtn_f32 v1, v1, v0 +; G_GFX10-NEXT: ds_min_f32 v2, v0 ; G_GFX10-NEXT: s_waitcnt lgkmcnt(1) -; G_GFX10-NEXT: ds_min_rtn_f32 v0, v3, v0 +; G_GFX10-NEXT: ds_min_rtn_f32 v0, v3, v1 ; G_GFX10-NEXT: v_mov_b32_e32 v1, s0 ; G_GFX10-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX10-NEXT: buffer_store_dword v0, v1, s[4:7], 0 offen @@ -318,18 +318,18 @@ ; G_GFX11-NEXT: s_clause 0x1 ; G_GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c ; G_GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; G_GFX11-NEXT: v_mov_b32_e32 v1, 0x42280000 ; G_GFX11-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX11-NEXT: s_add_i32 s2, s2, 4 ; G_GFX11-NEXT: v_mov_b32_e32 v3, s1 ; G_GFX11-NEXT: s_lshl_b32 s3, s2, 3 +; G_GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; G_GFX11-NEXT: v_dual_mov_b32 v0, 0x42280000 :: v_dual_mov_b32 v1, s3 ; G_GFX11-NEXT: s_lshl_b32 s2, s2, 4 -; G_GFX11-NEXT: v_mov_b32_e32 v0, s3 ; G_GFX11-NEXT: v_mov_b32_e32 v2, s2 -; G_GFX11-NEXT: ds_min_rtn_f32 v0, v0, v1 -; G_GFX11-NEXT: ds_min_f32 v2, v1 +; G_GFX11-NEXT: ds_min_rtn_f32 v1, v1, v0 +; G_GFX11-NEXT: ds_min_f32 v2, v0 ; G_GFX11-NEXT: s_waitcnt lgkmcnt(1) -; G_GFX11-NEXT: ds_min_rtn_f32 v0, v3, v0 +; G_GFX11-NEXT: ds_min_rtn_f32 v0, v3, v1 ; G_GFX11-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX11-NEXT: scratch_store_b32 off, v0, s0 ; G_GFX11-NEXT: s_endpgm @@ -595,18 +595,18 @@ ; G_GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c ; G_GFX9-NEXT: s_load_dwordx2 
s[2:3], s[0:1], 0x24 ; G_GFX9-NEXT: s_addc_u32 s9, s9, 0 -; G_GFX9-NEXT: v_mov_b32_e32 v1, 0x42280000 +; G_GFX9-NEXT: v_mov_b32_e32 v0, 0x42280000 ; G_GFX9-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX9-NEXT: s_add_i32 s4, s4, 4 ; G_GFX9-NEXT: s_lshl_b32 s0, s4, 3 -; G_GFX9-NEXT: v_mov_b32_e32 v0, s0 -; G_GFX9-NEXT: ds_max_rtn_f32 v0, v0, v1 +; G_GFX9-NEXT: v_mov_b32_e32 v1, s0 +; G_GFX9-NEXT: ds_max_rtn_f32 v1, v1, v0 ; G_GFX9-NEXT: s_lshl_b32 s0, s4, 4 ; G_GFX9-NEXT: v_mov_b32_e32 v2, s0 -; G_GFX9-NEXT: ds_max_f32 v2, v1 -; G_GFX9-NEXT: v_mov_b32_e32 v1, s3 +; G_GFX9-NEXT: ds_max_f32 v2, v0 +; G_GFX9-NEXT: v_mov_b32_e32 v0, s3 ; G_GFX9-NEXT: s_waitcnt lgkmcnt(1) -; G_GFX9-NEXT: ds_max_rtn_f32 v0, v1, v0 +; G_GFX9-NEXT: ds_max_rtn_f32 v0, v0, v1 ; G_GFX9-NEXT: v_mov_b32_e32 v1, s2 ; G_GFX9-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX9-NEXT: buffer_store_dword v0, v1, s[8:11], 0 offen @@ -622,18 +622,18 @@ ; G_GFX10-NEXT: s_add_u32 s4, s4, s3 ; G_GFX10-NEXT: s_addc_u32 s5, s5, 0 ; G_GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; G_GFX10-NEXT: v_mov_b32_e32 v1, 0x42280000 +; G_GFX10-NEXT: v_mov_b32_e32 v0, 0x42280000 ; G_GFX10-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX10-NEXT: s_add_i32 s2, s2, 4 ; G_GFX10-NEXT: s_lshl_b32 s3, s2, 3 ; G_GFX10-NEXT: s_lshl_b32 s2, s2, 4 -; G_GFX10-NEXT: v_mov_b32_e32 v0, s3 +; G_GFX10-NEXT: v_mov_b32_e32 v1, s3 ; G_GFX10-NEXT: v_mov_b32_e32 v2, s2 ; G_GFX10-NEXT: v_mov_b32_e32 v3, s1 -; G_GFX10-NEXT: ds_max_rtn_f32 v0, v0, v1 -; G_GFX10-NEXT: ds_max_f32 v2, v1 +; G_GFX10-NEXT: ds_max_rtn_f32 v1, v1, v0 +; G_GFX10-NEXT: ds_max_f32 v2, v0 ; G_GFX10-NEXT: s_waitcnt lgkmcnt(1) -; G_GFX10-NEXT: ds_max_rtn_f32 v0, v3, v0 +; G_GFX10-NEXT: ds_max_rtn_f32 v0, v3, v1 ; G_GFX10-NEXT: v_mov_b32_e32 v1, s0 ; G_GFX10-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX10-NEXT: buffer_store_dword v0, v1, s[4:7], 0 offen @@ -644,18 +644,18 @@ ; G_GFX11-NEXT: s_clause 0x1 ; G_GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c ; G_GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; G_GFX11-NEXT: v_mov_b32_e32 v1, 0x42280000 ; G_GFX11-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX11-NEXT: s_add_i32 s2, s2, 4 ; G_GFX11-NEXT: v_mov_b32_e32 v3, s1 ; G_GFX11-NEXT: s_lshl_b32 s3, s2, 3 +; G_GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; G_GFX11-NEXT: v_dual_mov_b32 v0, 0x42280000 :: v_dual_mov_b32 v1, s3 ; G_GFX11-NEXT: s_lshl_b32 s2, s2, 4 -; G_GFX11-NEXT: v_mov_b32_e32 v0, s3 ; G_GFX11-NEXT: v_mov_b32_e32 v2, s2 -; G_GFX11-NEXT: ds_max_rtn_f32 v0, v0, v1 -; G_GFX11-NEXT: ds_max_f32 v2, v1 +; G_GFX11-NEXT: ds_max_rtn_f32 v1, v1, v0 +; G_GFX11-NEXT: ds_max_f32 v2, v0 ; G_GFX11-NEXT: s_waitcnt lgkmcnt(1) -; G_GFX11-NEXT: ds_max_rtn_f32 v0, v3, v0 +; G_GFX11-NEXT: ds_max_rtn_f32 v0, v3, v1 ; G_GFX11-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX11-NEXT: scratch_store_b32 off, v0, s0 ; G_GFX11-NEXT: s_endpgm Index: llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.f16.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.f16.ll +++ llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.f16.ll @@ -955,9 +955,9 @@ ; GFX7GLISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX7GLISEL-NEXT: s_movk_i32 s4, 0x7c00 -; GFX7GLISEL-NEXT: v_cmp_gt_u32_e32 vcc, s4, v0 -; GFX7GLISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], s4, v0 +; GFX7GLISEL-NEXT: v_mov_b32_e32 v1, 0x7c00 +; GFX7GLISEL-NEXT: v_cmp_lt_u32_e32 vcc, v0, v1 +; GFX7GLISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v0, v1 ; GFX7GLISEL-NEXT: s_or_b64 s[4:5], 
vcc, s[4:5] ; GFX7GLISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] ; GFX7GLISEL-NEXT: s_setpc_b64 s[30:31] @@ -1017,12 +1017,12 @@ ; GFX7GLISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX7GLISEL-NEXT: s_movk_i32 s4, 0x7c00 +; GFX7GLISEL-NEXT: v_mov_b32_e32 v2, 0x7c00 ; GFX7GLISEL-NEXT: v_and_b32_e32 v1, 0x7fff, v1 ; GFX7GLISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX7GLISEL-NEXT: v_cmp_lt_u32_e32 vcc, s4, v0 +; GFX7GLISEL-NEXT: v_cmp_gt_u32_e32 vcc, v0, v2 ; GFX7GLISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GFX7GLISEL-NEXT: v_cmp_lt_u32_e32 vcc, s4, v1 +; GFX7GLISEL-NEXT: v_cmp_gt_u32_e32 vcc, v1, v2 ; GFX7GLISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX7GLISEL-NEXT: s_setpc_b64 s[30:31] ; @@ -1097,16 +1097,16 @@ ; GFX7GLISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX7GLISEL-NEXT: s_movk_i32 s4, 0x7c00 +; GFX7GLISEL-NEXT: v_mov_b32_e32 v3, 0x7c00 ; GFX7GLISEL-NEXT: v_and_b32_e32 v1, 0x7fff, v1 ; GFX7GLISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX7GLISEL-NEXT: v_and_b32_e32 v2, 0x7fff, v2 -; GFX7GLISEL-NEXT: v_cmp_lt_u32_e32 vcc, s4, v0 +; GFX7GLISEL-NEXT: v_cmp_gt_u32_e32 vcc, v0, v3 ; GFX7GLISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX7GLISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GFX7GLISEL-NEXT: v_cmp_lt_u32_e32 vcc, s4, v1 +; GFX7GLISEL-NEXT: v_cmp_gt_u32_e32 vcc, v1, v3 ; GFX7GLISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GFX7GLISEL-NEXT: v_cmp_lt_u32_e32 vcc, s4, v2 +; GFX7GLISEL-NEXT: v_cmp_gt_u32_e32 vcc, v2, v3 ; GFX7GLISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX7GLISEL-NEXT: s_setpc_b64 s[30:31] ; @@ -1249,20 +1249,20 @@ ; GFX7GLISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX7GLISEL-NEXT: s_movk_i32 s4, 0x7c00 +; GFX7GLISEL-NEXT: v_mov_b32_e32 v4, 0x7c00 ; GFX7GLISEL-NEXT: v_and_b32_e32 v1, 0x7fff, v1 ; GFX7GLISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX7GLISEL-NEXT: v_and_b32_e32 v2, 0x7fff, v2 -; GFX7GLISEL-NEXT: v_cmp_lt_u32_e32 vcc, s4, v0 +; GFX7GLISEL-NEXT: v_cmp_gt_u32_e32 vcc, v0, v4 ; GFX7GLISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX7GLISEL-NEXT: v_and_b32_e32 v3, 0x7fff, v3 ; GFX7GLISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GFX7GLISEL-NEXT: v_cmp_lt_u32_e32 vcc, s4, v1 +; GFX7GLISEL-NEXT: v_cmp_gt_u32_e32 vcc, v1, v4 ; GFX7GLISEL-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX7GLISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GFX7GLISEL-NEXT: v_cmp_lt_u32_e32 vcc, s4, v2 +; GFX7GLISEL-NEXT: v_cmp_gt_u32_e32 vcc, v2, v4 ; GFX7GLISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX7GLISEL-NEXT: v_cmp_lt_u32_e32 vcc, s4, v3 +; GFX7GLISEL-NEXT: v_cmp_gt_u32_e32 vcc, v3, v4 ; GFX7GLISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX7GLISEL-NEXT: s_setpc_b64 s[30:31] ; @@ -1633,9 +1633,9 @@ ; GFX7GLISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX7GLISEL-NEXT: v_and_b32_e32 v1, 0xffff, v0 -; GFX7GLISEL-NEXT: s_movk_i32 s4, 0x7c00 -; GFX7GLISEL-NEXT: v_cmp_eq_u32_e32 vcc, s4, v1 -; GFX7GLISEL-NEXT: v_cmp_lt_u32_e64 s[4:5], s4, v1 +; GFX7GLISEL-NEXT: v_mov_b32_e32 v2, 0x7c00 +; GFX7GLISEL-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX7GLISEL-NEXT: v_cmp_gt_u32_e64 s[4:5], v1, v2 ; GFX7GLISEL-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7GLISEL-NEXT: v_subrev_i32_e32 v0, vcc, 0x400, v0 ; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 
0xffff, v0 @@ -1763,11 +1763,11 @@ ; GFX7GLISEL-NEXT: v_mov_b32_e32 v2, 0x3ff ; GFX7GLISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 ; GFX7GLISEL-NEXT: v_cmp_lt_u32_e64 s[4:5], v0, v2 -; GFX7GLISEL-NEXT: s_movk_i32 s6, 0x7c00 +; GFX7GLISEL-NEXT: v_mov_b32_e32 v0, 0x7c00 ; GFX7GLISEL-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7GLISEL-NEXT: v_cmp_eq_u32_e32 vcc, s6, v1 +; GFX7GLISEL-NEXT: v_cmp_eq_u32_e32 vcc, v1, v0 ; GFX7GLISEL-NEXT: s_or_b64 s[4:5], s[4:5], vcc -; GFX7GLISEL-NEXT: v_cmp_lt_u32_e32 vcc, s6, v1 +; GFX7GLISEL-NEXT: v_cmp_gt_u32_e32 vcc, v1, v0 ; GFX7GLISEL-NEXT: s_or_b64 s[4:5], s[4:5], vcc ; GFX7GLISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] ; GFX7GLISEL-NEXT: s_setpc_b64 s[30:31] @@ -1835,11 +1835,11 @@ ; GFX7GLISEL-NEXT: v_mov_b32_e32 v3, 0x3ff ; GFX7GLISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX7GLISEL-NEXT: v_cmp_lt_u32_e64 s[6:7], v0, v3 -; GFX7GLISEL-NEXT: s_movk_i32 s8, 0x7c00 +; GFX7GLISEL-NEXT: v_mov_b32_e32 v0, 0x7c00 ; GFX7GLISEL-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX7GLISEL-NEXT: v_cmp_eq_u32_e32 vcc, s8, v2 +; GFX7GLISEL-NEXT: v_cmp_eq_u32_e32 vcc, v2, v0 ; GFX7GLISEL-NEXT: s_or_b64 s[6:7], s[6:7], vcc -; GFX7GLISEL-NEXT: v_cmp_lt_u32_e32 vcc, s8, v2 +; GFX7GLISEL-NEXT: v_cmp_gt_u32_e32 vcc, v2, v0 ; GFX7GLISEL-NEXT: s_or_b64 s[6:7], s[6:7], vcc ; GFX7GLISEL-NEXT: v_subrev_i32_e32 v0, vcc, 0x400, v1 ; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 @@ -1913,11 +1913,11 @@ ; GFX7GLISEL-NEXT: v_mov_b32_e32 v3, 0x3ff ; GFX7GLISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX7GLISEL-NEXT: v_cmp_lt_u32_e64 s[6:7], v0, v3 -; GFX7GLISEL-NEXT: s_movk_i32 s8, 0x7c00 +; GFX7GLISEL-NEXT: v_mov_b32_e32 v0, 0x7c00 ; GFX7GLISEL-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX7GLISEL-NEXT: v_cmp_eq_u32_e32 vcc, s8, v2 +; GFX7GLISEL-NEXT: v_cmp_eq_u32_e32 vcc, v2, v0 ; GFX7GLISEL-NEXT: s_or_b64 s[6:7], s[6:7], vcc -; GFX7GLISEL-NEXT: v_cmp_lt_u32_e32 vcc, s8, v2 +; GFX7GLISEL-NEXT: v_cmp_gt_u32_e32 vcc, v2, v0 ; GFX7GLISEL-NEXT: s_or_b64 s[6:7], s[6:7], vcc ; GFX7GLISEL-NEXT: v_subrev_i32_e32 v0, vcc, 0x400, v1 ; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 @@ -2038,11 +2038,11 @@ ; GFX7GLISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX7GLISEL-NEXT: v_and_b32_e32 v1, 0xffff, v0 -; GFX7GLISEL-NEXT: s_movk_i32 s6, 0x7c00 +; GFX7GLISEL-NEXT: v_mov_b32_e32 v2, 0x7c00 ; GFX7GLISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX7GLISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], s6, v1 +; GFX7GLISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v1, v2 ; GFX7GLISEL-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7GLISEL-NEXT: v_cmp_lt_u32_e32 vcc, s6, v1 +; GFX7GLISEL-NEXT: v_cmp_gt_u32_e32 vcc, v1, v2 ; GFX7GLISEL-NEXT: s_or_b64 s[4:5], s[4:5], vcc ; GFX7GLISEL-NEXT: v_subrev_i32_e32 v0, vcc, 0x400, v0 ; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 @@ -2160,10 +2160,10 @@ ; GFX7GLISEL-NEXT: v_mov_b32_e32 v2, 0x3ff ; GFX7GLISEL-NEXT: v_cmp_lt_u32_e32 vcc, v1, v2 ; GFX7GLISEL-NEXT: v_and_b32_e32 v1, 0xffff, v0 -; GFX7GLISEL-NEXT: s_movk_i32 s6, 0x7c00 -; GFX7GLISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], s6, v1 +; GFX7GLISEL-NEXT: v_mov_b32_e32 v2, 0x7c00 +; GFX7GLISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v1, v2 ; GFX7GLISEL-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7GLISEL-NEXT: v_cmp_lt_u32_e32 vcc, s6, v1 +; GFX7GLISEL-NEXT: v_cmp_gt_u32_e32 vcc, v1, v2 ; GFX7GLISEL-NEXT: s_or_b64 s[4:5], s[4:5], vcc ; GFX7GLISEL-NEXT: v_subrev_i32_e32 v0, vcc, 0x400, v0 ; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 @@ -2222,9 +2222,9 @@ ; GFX7GLISEL: ; %bb.0: ; GFX7GLISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) ; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX7GLISEL-NEXT: s_movk_i32 s4, 0x7c00 -; GFX7GLISEL-NEXT: v_cmp_gt_u32_e32 vcc, s4, v0 -; GFX7GLISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], s4, v0 +; GFX7GLISEL-NEXT: v_mov_b32_e32 v1, 0x7c00 +; GFX7GLISEL-NEXT: v_cmp_lt_u32_e32 vcc, v0, v1 +; GFX7GLISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v0, v1 ; GFX7GLISEL-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7GLISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] ; GFX7GLISEL-NEXT: s_setpc_b64 s[30:31] @@ -2289,14 +2289,14 @@ ; GFX7GLISEL-NEXT: v_and_b32_e32 v1, 0x7fff, v0 ; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX7GLISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX7GLISEL-NEXT: s_movk_i32 s6, 0x7c00 +; GFX7GLISEL-NEXT: v_mov_b32_e32 v2, 0x7c00 ; GFX7GLISEL-NEXT: v_cmp_ne_u32_e32 vcc, v0, v1 -; GFX7GLISEL-NEXT: v_cmp_gt_u32_e64 s[4:5], s6, v1 -; GFX7GLISEL-NEXT: v_mov_b32_e32 v2, 0xfc00 +; GFX7GLISEL-NEXT: v_cmp_lt_u32_e64 s[4:5], v1, v2 +; GFX7GLISEL-NEXT: v_mov_b32_e32 v3, 0xfc00 ; GFX7GLISEL-NEXT: s_and_b64 s[4:5], s[4:5], vcc -; GFX7GLISEL-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 +; GFX7GLISEL-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 ; GFX7GLISEL-NEXT: s_or_b64 s[4:5], s[4:5], vcc -; GFX7GLISEL-NEXT: v_cmp_lt_u32_e32 vcc, s6, v1 +; GFX7GLISEL-NEXT: v_cmp_gt_u32_e32 vcc, v1, v2 ; GFX7GLISEL-NEXT: s_or_b64 s[4:5], s[4:5], vcc ; GFX7GLISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] ; GFX7GLISEL-NEXT: s_setpc_b64 s[30:31] @@ -2419,12 +2419,12 @@ ; GFX7GLISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7GLISEL-NEXT: v_and_b32_e32 v1, 0x7fff, v0 ; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX7GLISEL-NEXT: s_movk_i32 s6, 0x7c00 -; GFX7GLISEL-NEXT: v_cmp_gt_u32_e32 vcc, s6, v0 -; GFX7GLISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], s6, v0 +; GFX7GLISEL-NEXT: v_mov_b32_e32 v2, 0x7c00 +; GFX7GLISEL-NEXT: v_cmp_lt_u32_e32 vcc, v0, v2 +; GFX7GLISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v0, v2 ; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0xffff, v1 ; GFX7GLISEL-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7GLISEL-NEXT: v_cmp_lt_u32_e32 vcc, s6, v0 +; GFX7GLISEL-NEXT: v_cmp_gt_u32_e32 vcc, v0, v2 ; GFX7GLISEL-NEXT: s_or_b64 s[4:5], s[4:5], vcc ; GFX7GLISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] ; GFX7GLISEL-NEXT: s_setpc_b64 s[30:31] @@ -3037,11 +3037,11 @@ ; GFX7GLISEL-NEXT: v_mov_b32_e32 v2, 0x3ff ; GFX7GLISEL-NEXT: v_cmp_lt_u32_e32 vcc, v1, v2 ; GFX7GLISEL-NEXT: v_and_b32_e32 v1, 0xffff, v0 -; GFX7GLISEL-NEXT: s_movk_i32 s8, 0x7c00 -; GFX7GLISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], s8, v1 -; GFX7GLISEL-NEXT: v_mov_b32_e32 v2, 0x7e00 +; GFX7GLISEL-NEXT: v_mov_b32_e32 v2, 0x7c00 +; GFX7GLISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v1, v2 ; GFX7GLISEL-NEXT: s_or_b64 s[6:7], vcc, s[4:5] -; GFX7GLISEL-NEXT: v_cmp_lt_u32_e32 vcc, s8, v1 +; GFX7GLISEL-NEXT: v_cmp_gt_u32_e32 vcc, v1, v2 +; GFX7GLISEL-NEXT: v_mov_b32_e32 v2, 0x7e00 ; GFX7GLISEL-NEXT: v_cmp_lt_u32_e64 s[4:5], v1, v2 ; GFX7GLISEL-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX7GLISEL-NEXT: v_subrev_i32_e32 v0, vcc, 0x400, v0 @@ -3187,9 +3187,9 @@ ; GFX7GLISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX7GLISEL-NEXT: s_movk_i32 s4, 0x7c00 -; GFX7GLISEL-NEXT: v_cmp_eq_u32_e32 vcc, s4, v0 -; GFX7GLISEL-NEXT: v_cmp_lt_u32_e64 s[4:5], s4, v0 +; GFX7GLISEL-NEXT: v_mov_b32_e32 v1, 0x7c00 +; GFX7GLISEL-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX7GLISEL-NEXT: v_cmp_gt_u32_e64 s[4:5], v0, v1 ; GFX7GLISEL-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7GLISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] ; 
GFX7GLISEL-NEXT: s_setpc_b64 s[30:31] @@ -3303,9 +3303,9 @@ ; GFX7GLISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX7GLISEL-NEXT: s_movk_i32 s4, 0x7c00 -; GFX7GLISEL-NEXT: v_cmp_gt_u32_e32 vcc, s4, v0 -; GFX7GLISEL-NEXT: v_cmp_lt_u32_e64 s[4:5], s4, v0 +; GFX7GLISEL-NEXT: v_mov_b32_e32 v1, 0x7c00 +; GFX7GLISEL-NEXT: v_cmp_lt_u32_e32 vcc, v0, v1 +; GFX7GLISEL-NEXT: v_cmp_gt_u32_e64 s[4:5], v0, v1 ; GFX7GLISEL-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7GLISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] ; GFX7GLISEL-NEXT: s_setpc_b64 s[30:31] Index: llvm/test/CodeGen/AMDGPU/mad-mix.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/mad-mix.ll +++ llvm/test/CodeGen/AMDGPU/mad-mix.ll @@ -861,10 +861,9 @@ ; GISEL-CI-LABEL: v_mad_mix_f32_f16lo_f16lo_f32imminv2pi: ; GISEL-CI: ; %bb.0: ; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, v0 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GISEL-CI-NEXT: v_mov_b32_e32 v0, 0x3e22f983 -; GISEL-CI-NEXT: v_mac_f32_e32 v0, v2, v1 +; GISEL-CI-NEXT: v_madak_f32 v0, v0, v1, 0x3e22f983 ; GISEL-CI-NEXT: s_setpc_b64 s[30:31] %src0.ext = fpext half %src0 to float %src1.ext = fpext half %src1 to float @@ -902,13 +901,13 @@ ; SDAG-GFX906-NEXT: v_fma_mix_f32 v0, v0, v1, s4 op_sel_hi:[1,1,0] ; SDAG-GFX906-NEXT: s_setpc_b64 s[30:31] ; -; SDAG-VI-LABEL: v_mad_mix_f32_f16lo_f16lo_cvtf16imminv2pi: -; SDAG-VI: ; %bb.0: -; SDAG-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SDAG-VI-NEXT: v_madak_f32 v0, v0, v1, 0x3e230000 -; SDAG-VI-NEXT: s_setpc_b64 s[30:31] +; VI-LABEL: v_mad_mix_f32_f16lo_f16lo_cvtf16imminv2pi: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; VI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; VI-NEXT: v_madak_f32 v0, v0, v1, 0x3e230000 +; VI-NEXT: s_setpc_b64 s[30:31] ; ; SDAG-CI-LABEL: v_mad_mix_f32_f16lo_f16lo_cvtf16imminv2pi: ; SDAG-CI: ; %bb.0: @@ -939,22 +938,12 @@ ; GISEL-GFX906-NEXT: v_fma_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,0] ; GISEL-GFX906-NEXT: s_setpc_b64 s[30:31] ; -; GISEL-VI-LABEL: v_mad_mix_f32_f16lo_f16lo_cvtf16imminv2pi: -; GISEL-VI: ; %bb.0: -; GISEL-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v2, v0 -; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x3e230000 -; GISEL-VI-NEXT: v_mac_f32_e32 v0, v2, v1 -; GISEL-VI-NEXT: s_setpc_b64 s[30:31] -; ; GISEL-CI-LABEL: v_mad_mix_f32_f16lo_f16lo_cvtf16imminv2pi: ; GISEL-CI: ; %bb.0: ; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, v0 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GISEL-CI-NEXT: v_mov_b32_e32 v0, 0x3e230000 -; GISEL-CI-NEXT: v_mac_f32_e32 v0, v2, v1 +; GISEL-CI-NEXT: v_madak_f32 v0, v0, v1, 0x3e230000 ; GISEL-CI-NEXT: s_setpc_b64 s[30:31] %src0.ext = fpext half %src0 to float %src1.ext = fpext half %src1 to float @@ -988,13 +977,13 @@ ; SDAG-GFX906-NEXT: v_fma_mix_f32 v0, v0, v1, s4 op_sel_hi:[1,1,0] ; SDAG-GFX906-NEXT: s_setpc_b64 s[30:31] ; -; SDAG-VI-LABEL: v_mad_mix_f32_f16lo_f16lo_cvtf16imm63: -; SDAG-VI: ; %bb.0: -; SDAG-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v1, v1 
-; SDAG-VI-NEXT: v_madak_f32 v0, v0, v1, 0x367c0000 -; SDAG-VI-NEXT: s_setpc_b64 s[30:31] +; VI-LABEL: v_mad_mix_f32_f16lo_f16lo_cvtf16imm63: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; VI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; VI-NEXT: v_madak_f32 v0, v0, v1, 0x367c0000 +; VI-NEXT: s_setpc_b64 s[30:31] ; ; SDAG-CI-LABEL: v_mad_mix_f32_f16lo_f16lo_cvtf16imm63: ; SDAG-CI: ; %bb.0: @@ -1025,22 +1014,12 @@ ; GISEL-GFX906-NEXT: v_fma_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,0] ; GISEL-GFX906-NEXT: s_setpc_b64 s[30:31] ; -; GISEL-VI-LABEL: v_mad_mix_f32_f16lo_f16lo_cvtf16imm63: -; GISEL-VI: ; %bb.0: -; GISEL-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v2, v0 -; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x367c0000 -; GISEL-VI-NEXT: v_mac_f32_e32 v0, v2, v1 -; GISEL-VI-NEXT: s_setpc_b64 s[30:31] -; ; GISEL-CI-LABEL: v_mad_mix_f32_f16lo_f16lo_cvtf16imm63: ; GISEL-CI: ; %bb.0: ; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, v0 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GISEL-CI-NEXT: v_mov_b32_e32 v0, 0x367c0000 -; GISEL-CI-NEXT: v_mac_f32_e32 v0, v2, v1 +; GISEL-CI-NEXT: v_madak_f32 v0, v0, v1, 0x367c0000 ; GISEL-CI-NEXT: s_setpc_b64 s[30:31] %src0.ext = fpext half %src0 to float %src1.ext = fpext half %src1 to float @@ -1050,16 +1029,16 @@ } define <2 x float> @v_mad_mix_v2f32_f32imm1(<2 x half> %src0, <2 x half> %src1) #0 { -; GFX1100-LABEL: v_mad_mix_v2f32_f32imm1: -; GFX1100: ; %bb.0: -; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1100-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1100-NEXT: s_mov_b32 s0, 1.0 -; GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1100-NEXT: v_fma_mix_f32 v2, v0, v1, s0 op_sel_hi:[1,1,0] -; GFX1100-NEXT: v_fma_mix_f32 v1, v0, v1, s0 op_sel:[1,1,0] op_sel_hi:[1,1,0] -; GFX1100-NEXT: v_mov_b32_e32 v0, v2 -; GFX1100-NEXT: s_setpc_b64 s[30:31] +; SDAG-GFX1100-LABEL: v_mad_mix_v2f32_f32imm1: +; SDAG-GFX1100: ; %bb.0: +; SDAG-GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX1100-NEXT: s_waitcnt_vscnt null, 0x0 +; SDAG-GFX1100-NEXT: s_mov_b32 s0, 1.0 +; SDAG-GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; SDAG-GFX1100-NEXT: v_fma_mix_f32 v2, v0, v1, s0 op_sel_hi:[1,1,0] +; SDAG-GFX1100-NEXT: v_fma_mix_f32 v1, v0, v1, s0 op_sel:[1,1,0] op_sel_hi:[1,1,0] +; SDAG-GFX1100-NEXT: v_mov_b32_e32 v0, v2 +; SDAG-GFX1100-NEXT: s_setpc_b64 s[30:31] ; ; SDAG-GFX900-LABEL: v_mad_mix_v2f32_f32imm1: ; SDAG-GFX900: ; %bb.0: @@ -1105,21 +1084,32 @@ ; SDAG-CI-NEXT: v_mad_f32 v1, v1, v3, 1.0 ; SDAG-CI-NEXT: s_setpc_b64 s[30:31] ; +; GISEL-GFX1100-LABEL: v_mad_mix_v2f32_f32imm1: +; GISEL-GFX1100: ; %bb.0: +; GISEL-GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX1100-NEXT: s_waitcnt_vscnt null, 0x0 +; GISEL-GFX1100-NEXT: v_mov_b32_e32 v3, 1.0 +; GISEL-GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GISEL-GFX1100-NEXT: v_fma_mix_f32 v2, v0, v1, v3 op_sel_hi:[1,1,0] +; GISEL-GFX1100-NEXT: v_fma_mix_f32 v1, v0, v1, v3 op_sel:[1,1,0] op_sel_hi:[1,1,0] +; GISEL-GFX1100-NEXT: v_mov_b32_e32 v0, v2 +; GISEL-GFX1100-NEXT: s_setpc_b64 s[30:31] +; ; GISEL-GFX900-LABEL: v_mad_mix_v2f32_f32imm1: ; GISEL-GFX900: ; %bb.0: ; GISEL-GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-GFX900-NEXT: s_mov_b32 
s4, 1.0 -; GISEL-GFX900-NEXT: v_mad_mix_f32 v2, v0, v1, s4 op_sel_hi:[1,1,0] -; GISEL-GFX900-NEXT: v_mad_mix_f32 v1, v0, v1, s4 op_sel:[1,1,0] op_sel_hi:[1,1,0] +; GISEL-GFX900-NEXT: v_mov_b32_e32 v3, 1.0 +; GISEL-GFX900-NEXT: v_mad_mix_f32 v2, v0, v1, v3 op_sel_hi:[1,1,0] +; GISEL-GFX900-NEXT: v_mad_mix_f32 v1, v0, v1, v3 op_sel:[1,1,0] op_sel_hi:[1,1,0] ; GISEL-GFX900-NEXT: v_mov_b32_e32 v0, v2 ; GISEL-GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-GFX906-LABEL: v_mad_mix_v2f32_f32imm1: ; GISEL-GFX906: ; %bb.0: ; GISEL-GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-GFX906-NEXT: s_mov_b32 s4, 1.0 -; GISEL-GFX906-NEXT: v_fma_mix_f32 v2, v0, v1, s4 op_sel_hi:[1,1,0] -; GISEL-GFX906-NEXT: v_fma_mix_f32 v1, v0, v1, s4 op_sel:[1,1,0] op_sel_hi:[1,1,0] +; GISEL-GFX906-NEXT: v_mov_b32_e32 v3, 1.0 +; GISEL-GFX906-NEXT: v_fma_mix_f32 v2, v0, v1, v3 op_sel_hi:[1,1,0] +; GISEL-GFX906-NEXT: v_fma_mix_f32 v1, v0, v1, v3 op_sel:[1,1,0] op_sel_hi:[1,1,0] ; GISEL-GFX906-NEXT: v_mov_b32_e32 v0, v2 ; GISEL-GFX906-NEXT: s_setpc_b64 s[30:31] ; @@ -1151,16 +1141,16 @@ } define <2 x float> @v_mad_mix_v2f32_cvtf16imminv2pi(<2 x half> %src0, <2 x half> %src1) #0 { -; GFX1100-LABEL: v_mad_mix_v2f32_cvtf16imminv2pi: -; GFX1100: ; %bb.0: -; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1100-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1100-NEXT: s_mov_b32 s0, 0x3e230000 -; GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1100-NEXT: v_fma_mix_f32 v2, v0, v1, s0 op_sel_hi:[1,1,0] -; GFX1100-NEXT: v_fma_mix_f32 v1, v0, v1, s0 op_sel:[1,1,0] op_sel_hi:[1,1,0] -; GFX1100-NEXT: v_mov_b32_e32 v0, v2 -; GFX1100-NEXT: s_setpc_b64 s[30:31] +; SDAG-GFX1100-LABEL: v_mad_mix_v2f32_cvtf16imminv2pi: +; SDAG-GFX1100: ; %bb.0: +; SDAG-GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX1100-NEXT: s_waitcnt_vscnt null, 0x0 +; SDAG-GFX1100-NEXT: s_mov_b32 s0, 0x3e230000 +; SDAG-GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; SDAG-GFX1100-NEXT: v_fma_mix_f32 v2, v0, v1, s0 op_sel_hi:[1,1,0] +; SDAG-GFX1100-NEXT: v_fma_mix_f32 v1, v0, v1, s0 op_sel:[1,1,0] op_sel_hi:[1,1,0] +; SDAG-GFX1100-NEXT: v_mov_b32_e32 v0, v2 +; SDAG-GFX1100-NEXT: s_setpc_b64 s[30:31] ; ; SDAG-GFX900-LABEL: v_mad_mix_v2f32_cvtf16imminv2pi: ; SDAG-GFX900: ; %bb.0: @@ -1208,21 +1198,32 @@ ; SDAG-CI-NEXT: v_mac_f32_e32 v1, v4, v3 ; SDAG-CI-NEXT: s_setpc_b64 s[30:31] ; +; GISEL-GFX1100-LABEL: v_mad_mix_v2f32_cvtf16imminv2pi: +; GISEL-GFX1100: ; %bb.0: +; GISEL-GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX1100-NEXT: s_waitcnt_vscnt null, 0x0 +; GISEL-GFX1100-NEXT: v_mov_b32_e32 v3, 0x3e230000 +; GISEL-GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GISEL-GFX1100-NEXT: v_fma_mix_f32 v2, v0, v1, v3 op_sel_hi:[1,1,0] +; GISEL-GFX1100-NEXT: v_fma_mix_f32 v1, v0, v1, v3 op_sel:[1,1,0] op_sel_hi:[1,1,0] +; GISEL-GFX1100-NEXT: v_mov_b32_e32 v0, v2 +; GISEL-GFX1100-NEXT: s_setpc_b64 s[30:31] +; ; GISEL-GFX900-LABEL: v_mad_mix_v2f32_cvtf16imminv2pi: ; GISEL-GFX900: ; %bb.0: ; GISEL-GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-GFX900-NEXT: s_mov_b32 s4, 0x3e230000 -; GISEL-GFX900-NEXT: v_mad_mix_f32 v2, v0, v1, s4 op_sel_hi:[1,1,0] -; GISEL-GFX900-NEXT: v_mad_mix_f32 v1, v0, v1, s4 op_sel:[1,1,0] op_sel_hi:[1,1,0] +; GISEL-GFX900-NEXT: v_mov_b32_e32 v3, 0x3e230000 +; GISEL-GFX900-NEXT: v_mad_mix_f32 v2, v0, v1, v3 op_sel_hi:[1,1,0] +; GISEL-GFX900-NEXT: v_mad_mix_f32 v1, v0, v1, 
v3 op_sel:[1,1,0] op_sel_hi:[1,1,0] ; GISEL-GFX900-NEXT: v_mov_b32_e32 v0, v2 ; GISEL-GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-GFX906-LABEL: v_mad_mix_v2f32_cvtf16imminv2pi: ; GISEL-GFX906: ; %bb.0: ; GISEL-GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-GFX906-NEXT: s_mov_b32 s4, 0x3e230000 -; GISEL-GFX906-NEXT: v_fma_mix_f32 v2, v0, v1, s4 op_sel_hi:[1,1,0] -; GISEL-GFX906-NEXT: v_fma_mix_f32 v1, v0, v1, s4 op_sel:[1,1,0] op_sel_hi:[1,1,0] +; GISEL-GFX906-NEXT: v_mov_b32_e32 v3, 0x3e230000 +; GISEL-GFX906-NEXT: v_fma_mix_f32 v2, v0, v1, v3 op_sel_hi:[1,1,0] +; GISEL-GFX906-NEXT: v_fma_mix_f32 v1, v0, v1, v3 op_sel:[1,1,0] op_sel_hi:[1,1,0] ; GISEL-GFX906-NEXT: v_mov_b32_e32 v0, v2 ; GISEL-GFX906-NEXT: s_setpc_b64 s[30:31] ; @@ -1232,22 +1233,22 @@ ; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v2, v0 ; GISEL-VI-NEXT: v_cvt_f32_f16_sdwa v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v0, v1 -; GISEL-VI-NEXT: v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GISEL-VI-NEXT: s_mov_b32 s4, 0x3e230000 -; GISEL-VI-NEXT: v_mad_f32 v0, v2, v0, s4 -; GISEL-VI-NEXT: v_mad_f32 v1, v3, v1, s4 +; GISEL-VI-NEXT: v_cvt_f32_f16_sdwa v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GISEL-VI-NEXT: v_mov_b32_e32 v1, 0x3e230000 +; GISEL-VI-NEXT: v_madak_f32 v0, v2, v0, 0x3e230000 +; GISEL-VI-NEXT: v_mac_f32_e32 v1, v3, v4 ; GISEL-VI-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-CI-LABEL: v_mad_mix_v2f32_cvtf16imminv2pi: ; GISEL-CI: ; %bb.0: ; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v4, v1 ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GISEL-CI-NEXT: s_mov_b32 s4, 0x3e230000 -; GISEL-CI-NEXT: v_mad_f32 v0, v0, v2, s4 -; GISEL-CI-NEXT: v_mad_f32 v1, v1, v3, s4 +; GISEL-CI-NEXT: v_mov_b32_e32 v1, 0x3e230000 +; GISEL-CI-NEXT: v_madak_f32 v0, v0, v2, 0x3e230000 +; GISEL-CI-NEXT: v_mac_f32_e32 v1, v4, v3 ; GISEL-CI-NEXT: s_setpc_b64 s[30:31] %src0.ext = fpext <2 x half> %src0 to <2 x float> %src1.ext = fpext <2 x half> %src1 to <2 x float> @@ -1257,16 +1258,16 @@ } define <2 x float> @v_mad_mix_v2f32_f32imminv2pi(<2 x half> %src0, <2 x half> %src1) #0 { -; GFX1100-LABEL: v_mad_mix_v2f32_f32imminv2pi: -; GFX1100: ; %bb.0: -; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1100-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1100-NEXT: s_mov_b32 s0, 0.15915494 -; GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1100-NEXT: v_fma_mix_f32 v2, v0, v1, s0 op_sel_hi:[1,1,0] -; GFX1100-NEXT: v_fma_mix_f32 v1, v0, v1, s0 op_sel:[1,1,0] op_sel_hi:[1,1,0] -; GFX1100-NEXT: v_mov_b32_e32 v0, v2 -; GFX1100-NEXT: s_setpc_b64 s[30:31] +; SDAG-GFX1100-LABEL: v_mad_mix_v2f32_f32imminv2pi: +; SDAG-GFX1100: ; %bb.0: +; SDAG-GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX1100-NEXT: s_waitcnt_vscnt null, 0x0 +; SDAG-GFX1100-NEXT: s_mov_b32 s0, 0.15915494 +; SDAG-GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; SDAG-GFX1100-NEXT: v_fma_mix_f32 v2, v0, v1, s0 op_sel_hi:[1,1,0] +; SDAG-GFX1100-NEXT: v_fma_mix_f32 v1, v0, v1, s0 op_sel:[1,1,0] op_sel_hi:[1,1,0] +; SDAG-GFX1100-NEXT: v_mov_b32_e32 v0, v2 +; SDAG-GFX1100-NEXT: s_setpc_b64 s[30:31] ; ; SDAG-GFX900-LABEL: v_mad_mix_v2f32_f32imminv2pi: ; SDAG-GFX900: ; %bb.0: @@ -1313,21 +1314,32 @@ ; SDAG-CI-NEXT: v_mac_f32_e32 
v1, v4, v3 ; SDAG-CI-NEXT: s_setpc_b64 s[30:31] ; +; GISEL-GFX1100-LABEL: v_mad_mix_v2f32_f32imminv2pi: +; GISEL-GFX1100: ; %bb.0: +; GISEL-GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX1100-NEXT: s_waitcnt_vscnt null, 0x0 +; GISEL-GFX1100-NEXT: v_mov_b32_e32 v3, 0.15915494 +; GISEL-GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GISEL-GFX1100-NEXT: v_fma_mix_f32 v2, v0, v1, v3 op_sel_hi:[1,1,0] +; GISEL-GFX1100-NEXT: v_fma_mix_f32 v1, v0, v1, v3 op_sel:[1,1,0] op_sel_hi:[1,1,0] +; GISEL-GFX1100-NEXT: v_mov_b32_e32 v0, v2 +; GISEL-GFX1100-NEXT: s_setpc_b64 s[30:31] +; ; GISEL-GFX900-LABEL: v_mad_mix_v2f32_f32imminv2pi: ; GISEL-GFX900: ; %bb.0: ; GISEL-GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-GFX900-NEXT: s_mov_b32 s4, 0.15915494 -; GISEL-GFX900-NEXT: v_mad_mix_f32 v2, v0, v1, s4 op_sel_hi:[1,1,0] -; GISEL-GFX900-NEXT: v_mad_mix_f32 v1, v0, v1, s4 op_sel:[1,1,0] op_sel_hi:[1,1,0] +; GISEL-GFX900-NEXT: v_mov_b32_e32 v3, 0.15915494 +; GISEL-GFX900-NEXT: v_mad_mix_f32 v2, v0, v1, v3 op_sel_hi:[1,1,0] +; GISEL-GFX900-NEXT: v_mad_mix_f32 v1, v0, v1, v3 op_sel:[1,1,0] op_sel_hi:[1,1,0] ; GISEL-GFX900-NEXT: v_mov_b32_e32 v0, v2 ; GISEL-GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-GFX906-LABEL: v_mad_mix_v2f32_f32imminv2pi: ; GISEL-GFX906: ; %bb.0: ; GISEL-GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-GFX906-NEXT: s_mov_b32 s4, 0.15915494 -; GISEL-GFX906-NEXT: v_fma_mix_f32 v2, v0, v1, s4 op_sel_hi:[1,1,0] -; GISEL-GFX906-NEXT: v_fma_mix_f32 v1, v0, v1, s4 op_sel:[1,1,0] op_sel_hi:[1,1,0] +; GISEL-GFX906-NEXT: v_mov_b32_e32 v3, 0.15915494 +; GISEL-GFX906-NEXT: v_fma_mix_f32 v2, v0, v1, v3 op_sel_hi:[1,1,0] +; GISEL-GFX906-NEXT: v_fma_mix_f32 v1, v0, v1, v3 op_sel:[1,1,0] op_sel_hi:[1,1,0] ; GISEL-GFX906-NEXT: v_mov_b32_e32 v0, v2 ; GISEL-GFX906-NEXT: s_setpc_b64 s[30:31] ; @@ -1346,12 +1358,12 @@ ; GISEL-CI: ; %bb.0: ; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v4, v1 ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GISEL-CI-NEXT: s_mov_b32 s4, 0x3e22f983 -; GISEL-CI-NEXT: v_mad_f32 v0, v0, v2, s4 -; GISEL-CI-NEXT: v_mad_f32 v1, v1, v3, s4 +; GISEL-CI-NEXT: v_mov_b32_e32 v1, 0x3e22f983 +; GISEL-CI-NEXT: v_madak_f32 v0, v0, v2, 0x3e22f983 +; GISEL-CI-NEXT: v_mac_f32_e32 v1, v4, v3 ; GISEL-CI-NEXT: s_setpc_b64 s[30:31] %src0.ext = fpext <2 x half> %src0 to <2 x float> %src1.ext = fpext <2 x half> %src1 to <2 x float> Index: llvm/test/CodeGen/AMDGPU/remat-fp64-constants.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/remat-fp64-constants.ll +++ llvm/test/CodeGen/AMDGPU/remat-fp64-constants.ll @@ -1,10 +1,11 @@ -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs --stress-regalloc=10 < %s | FileCheck -check-prefix=GCN %s -; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs --stress-regalloc=10 < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs --stress-regalloc=10 < %s | FileCheck -check-prefixes=GCN,SDAG %s +; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs --stress-regalloc=10 < %s | FileCheck -check-prefixes=GCN,GISEL %s ; GCN-LABEL: {{^}}test_remat_sgpr: ; GCN-NOT: v_writelane_b32 ; GCN: {{^}}[[LOOP:.LBB[0-9_]+]]: -; GCN-COUNT-6: 
s_mov_b32 s{{[0-9]+}}, 0x +; SDAG-COUNT-6: s_mov_b32 s{{[0-9]+}}, 0x +; GISEL-COUNT-4: s_mov_b32 s{{[0-9]+}}, 0x ; GCN-NOT: v_writelane_b32 ; GCN: s_cbranch_{{[^ ]+}} [[LOOP]] ; GCN: .sgpr_spill_count: 0 Index: llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll +++ llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll @@ -51,8 +51,8 @@ ; GISEL-VI: ; %bb.0: ; GISEL-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-VI-NEXT: v_max_i16_e32 v0, 0, v0 -; GISEL-VI-NEXT: v_max_i16_e32 v1, 0, v1 ; GISEL-VI-NEXT: v_mov_b32_e32 v2, 0xff +; GISEL-VI-NEXT: v_max_i16_e32 v1, 0, v1 ; GISEL-VI-NEXT: v_min_i16_e32 v0, 0xff, v0 ; GISEL-VI-NEXT: v_min_i16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GISEL-VI-NEXT: v_or_b32_e32 v0, v0, v1 @@ -315,9 +315,9 @@ ; GISEL-VI-LABEL: basic_smin_smax_combined: ; GISEL-VI: ; %bb.0: ; GISEL-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-VI-NEXT: v_mov_b32_e32 v2, 0xff ; GISEL-VI-NEXT: v_min_i16_e32 v0, 0xff, v0 ; GISEL-VI-NEXT: v_max_i16_e32 v1, 0, v1 -; GISEL-VI-NEXT: v_mov_b32_e32 v2, 0xff ; GISEL-VI-NEXT: v_max_i16_e32 v0, 0, v0 ; GISEL-VI-NEXT: v_min_i16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GISEL-VI-NEXT: v_or_b32_e32 v0, v0, v1 @@ -385,13 +385,13 @@ ; GISEL-VI-LABEL: vec_smax_smin: ; GISEL-VI: ; %bb.0: ; GISEL-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-VI-NEXT: v_mov_b32_e32 v2, 0 -; GISEL-VI-NEXT: v_max_i16_e32 v1, 0, v0 -; GISEL-VI-NEXT: v_max_i16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GISEL-VI-NEXT: v_mov_b32_e32 v2, 0xff -; GISEL-VI-NEXT: v_min_i16_e32 v1, 0xff, v1 -; GISEL-VI-NEXT: v_min_i16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GISEL-VI-NEXT: v_or_b32_e32 v0, v1, v0 +; GISEL-VI-NEXT: v_mov_b32_e32 v3, 0 +; GISEL-VI-NEXT: v_mov_b32_e32 v1, 0xff +; GISEL-VI-NEXT: v_max_i16_e32 v2, 0, v0 +; GISEL-VI-NEXT: v_max_i16_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GISEL-VI-NEXT: v_min_i16_e32 v2, 0xff, v2 +; GISEL-VI-NEXT: v_min_i16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GISEL-VI-NEXT: v_or_b32_e32 v0, v2, v0 ; GISEL-VI-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-GFX9-LABEL: vec_smax_smin: @@ -563,11 +563,11 @@ ; GISEL-VI-LABEL: vec_smin_smax: ; GISEL-VI: ; %bb.0: ; GISEL-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-VI-NEXT: v_mov_b32_e32 v2, 0xff -; GISEL-VI-NEXT: v_min_i16_e32 v1, 0xff, v0 -; GISEL-VI-NEXT: v_min_i16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GISEL-VI-NEXT: v_mov_b32_e32 v1, 0xff +; GISEL-VI-NEXT: v_min_i16_e32 v2, 0xff, v0 +; GISEL-VI-NEXT: v_min_i16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GISEL-VI-NEXT: v_max_i16_e32 v1, 0, v2 ; GISEL-VI-NEXT: v_mov_b32_e32 v2, 0 -; GISEL-VI-NEXT: v_max_i16_e32 v1, 0, v1 ; GISEL-VI-NEXT: v_max_i16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GISEL-VI-NEXT: v_or_b32_e32 v0, v1, v0 ; GISEL-VI-NEXT: s_setpc_b64 s[30:31]