Index: llvm/include/llvm/CodeGen/GlobalISel/RegBankSelect.h
===================================================================
--- llvm/include/llvm/CodeGen/GlobalISel/RegBankSelect.h
+++ llvm/include/llvm/CodeGen/GlobalISel/RegBankSelect.h
@@ -321,6 +321,8 @@
     Insert,
     /// (Re)assign the register bank of the operand.
     Reassign,
+    /// (Re)assign the register bank and remove all previously inserted copies.
+    ReassignAndRemoveCopies,
     /// Mark this repairing placement as impossible.
     Impossible
   };
Index: llvm/include/llvm/CodeGen/GlobalISel/RegisterBankInfo.h
===================================================================
--- llvm/include/llvm/CodeGen/GlobalISel/RegisterBankInfo.h
+++ llvm/include/llvm/CodeGen/GlobalISel/RegisterBankInfo.h
@@ -19,6 +19,7 @@
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/iterator_range.h"
+#include "llvm/CodeGen/MachineOperand.h"
 #include "llvm/CodeGen/Register.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/LowLevelTypeImpl.h"
 #include <cassert>
@@ -620,6 +621,15 @@
     return &A != &B;
   }
 
+  /// Check if the target can reassign the register bank of a constant instead
+  /// of inserting copies.
+  virtual bool
+  canReassignRegBankForConst(const MachineOperand &MO,
+                             const RegisterBank &DstBank,
+                             const MachineRegisterInfo &MRI) const {
+    return false;
+  }
+
   /// \returns true if emitting a copy from \p Src to \p Dst is impossible.
   bool cannotCopy(const RegisterBank &Dst, const RegisterBank &Src,
                   unsigned Size) const {
Index: llvm/lib/CodeGen/GlobalISel/RegBankSelect.cpp
===================================================================
--- llvm/lib/CodeGen/GlobalISel/RegBankSelect.cpp
+++ llvm/lib/CodeGen/GlobalISel/RegBankSelect.cpp
@@ -488,6 +488,25 @@
       continue;
     }
 
+    if (RBI->canReassignRegBankForConst(MO, *ValMapping.BreakDown[0].RegBank,
+                                        *MRI)) {
+      if (MRI->hasOneNonDBGUse(Reg)) {
+        // A single use of a constant is updated by just changing the regbank.
+        LLVM_DEBUG(dbgs() << "=> is free (simple reassignment).\n");
+        RepairPts.emplace_back(RepairingPlacement(
+            MI, OpIdx, *TRI, *this, RepairingPlacement::Reassign));
+      } else {
+        // If there are multiple uses, we also need to update all previously
+        // inserted copies.
+        LLVM_DEBUG(
+            dbgs() << "=> is free (reassignment of all previous copies).\n");
+        RepairPts.emplace_back(
+            RepairingPlacement(MI, OpIdx, *TRI, *this,
+                               RepairingPlacement::ReassignAndRemoveCopies));
+      }
+      continue;
+    }
+
     // Find the insertion point for the repairing code.
     RepairPts.emplace_back(
         RepairingPlacement(MI, OpIdx, *TRI, *this, RepairingPlacement::Insert));
@@ -608,6 +627,34 @@
       if (!repairReg(MO, ValMapping, RepairPt, OpdMapper.getVRegs(OpIdx)))
         return false;
       break;
+    case RepairingPlacement::ReassignAndRemoveCopies: {
+      // Reassign the register bank of the constant's vreg.
+      assert(ValMapping.NumBreakDowns == 1 &&
+             "Reassignment should only be for simple mapping");
+      MRI->setRegBank(Reg, *ValMapping.BreakDown[0].RegBank);
+
+      // Remove all previously inserted copies: collect the use of each copy.
+      llvm::SmallVector<MachineOperand *> Ops;
+      for (MachineOperand &Use : MRI->use_nodbg_operands(Reg)) {
+        MachineInstr *UseMI = Use.getParent();
+        if (&Use == &MO)
+          continue; // Skip the use being repaired.
+
+        assert(UseMI->getOpcode() == TargetOpcode::COPY && "Should be a copy");
+        Register UseMIDef = UseMI->getOperand(0).getReg();
+        MachineOperand &CopyUse = *MRI->use_nodbg_begin(UseMIDef);
+        Ops.push_back(&CopyUse);
+      }
+
+      for (MachineOperand *CopyUse : Ops) {
+        MachineInstr *CopyDef = MRI->getVRegDef(CopyUse->getReg());
+        CopyUse->setReg(Reg);
+        assert(MRI->use_empty(CopyDef->getOperand(0).getReg()) &&
+               "There should be no more uses");
+        CopyDef->eraseFromParent(); // TODO: check if dead after setReg above.
+      }
+      break;
+    }
     default:
       llvm_unreachable("Other kind should not happen");
     }
Index: llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h
+++ llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h
@@ -166,6 +166,11 @@
   unsigned copyCost(const RegisterBank &A, const RegisterBank &B,
                     unsigned Size) const override;
 
+  bool
+  canReassignRegBankForConst(const MachineOperand &MO,
+                             const RegisterBank &DstBank,
+                             const MachineRegisterInfo &MRI) const override;
+
   unsigned getBreakDownCost(const ValueMapping &ValMapping,
                             const RegisterBank *CurBank = nullptr) const override;
 
Index: llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -80,6 +80,7 @@
 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
 #include "llvm/CodeGen/GlobalISel/RegisterBank.h"
+#include "llvm/CodeGen/MachineOperand.h"
 #include "llvm/IR/IntrinsicsAMDGPU.h"
 
 #define GET_TARGET_REGBANK_IMPL
@@ -247,6 +248,51 @@
   return RegisterBankInfo::copyCost(Dst, Src, Size);
 }
 
+bool AMDGPURegisterBankInfo::canReassignRegBankForConst(
+    const MachineOperand &MO, const RegisterBank &DstBank,
+    const MachineRegisterInfo &MRI) const {
+  Register Reg = MO.getReg();
+  MachineInstr *OpDef = MRI.getVRegDef(Reg);
+
+  if (OpDef->getOpcode() != TargetOpcode::G_CONSTANT)
+    return false;
+
+  // G_CONSTANTs are assigned the SGPR bank by default, so we only check uses
+  // that require a VGPR.
+  if (DstBank.getID() != AMDGPU::VGPRRegBankID)
+    return false;
+
+  LLT Ty = MRI.getType(Reg);
+  // v_mov_b32 exists, but there is no v_mov_b64 instruction.
+  if (Ty.getSizeInBits() > 32)
+    return false;
+
+  // If this is the only use of the constant, we can simply reassign the bank.
+  if (MRI.hasOneNonDBGUse(Reg))
+    return true;
+
+  // Check that all uses of the constant are copies to a VGPR.
+  for (MachineOperand &Use : MRI.use_nodbg_operands(Reg)) {
+    // Skip the use for which the mapping is currently being computed.
+ if (&MO == &Use) + continue; + + MachineInstr *UseMI = Use.getParent(); + if (UseMI->getOpcode() != TargetOpcode::COPY) + return false; + + Register Dst = UseMI->getOperand(0).getReg(); + if (Dst.isPhysical()) + return false; + + const RegisterBank *CopyBank = MRI.getRegBankOrNull(Dst); + if (CopyBank && CopyBank->getID() != AMDGPU::VGPRRegBankID) + return false; + } + + return true; +} + unsigned AMDGPURegisterBankInfo::getBreakDownCost( const ValueMapping &ValMapping, const RegisterBank *CurBank) const { Index: llvm/test/CodeGen/AMDGPU/GlobalISel/add.v2i16.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/add.v2i16.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/add.v2i16.ll @@ -127,10 +127,9 @@ ; GFX8-LABEL: v_add_v2i16_neg_inline_imm_splat: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_movk_i32 s4, 0xffc0 -; GFX8-NEXT: v_mov_b32_e32 v2, s4 -; GFX8-NEXT: v_add_u16_e32 v1, s4, v0 -; GFX8-NEXT: v_add_u16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_mov_b32_e32 v2, 64 +; GFX8-NEXT: v_subrev_u16_e32 v1, 64, v0 +; GFX8-NEXT: v_sub_u16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -156,7 +155,7 @@ ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v2, 4 -; GFX8-NEXT: v_add_u16_e32 v1, 0xffc0, v0 +; GFX8-NEXT: v_subrev_u16_e32 v1, 64, v0 ; GFX8-NEXT: v_add_u16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -182,10 +181,10 @@ ; GFX8-LABEL: v_add_v2i16_neg_inline_imm_hi: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, 0xffffffc0 -; GFX8-NEXT: v_add_u16_e32 v2, 4, v0 -; GFX8-NEXT: v_add_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v0, v2, v0 +; GFX8-NEXT: v_mov_b32_e32 v2, 64 +; GFX8-NEXT: v_add_u16_e32 v1, 4, v0 +; GFX8-NEXT: v_sub_u16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_add_v2i16_neg_inline_imm_hi: Index: llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll @@ -722,11 +722,11 @@ ; GFX6-LABEL: v_ashr_v2i16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s4, 0xffff -; GFX6-NEXT: v_and_b32_e32 v2, s4, v2 +; GFX6-NEXT: v_mov_b32_e32 v4, 0xffff +; GFX6-NEXT: v_and_b32_e32 v2, v2, v4 ; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 16 ; GFX6-NEXT: v_ashrrev_i32_e32 v0, v2, v0 -; GFX6-NEXT: v_and_b32_e32 v2, s4, v3 +; GFX6-NEXT: v_and_b32_e32 v2, v3, v4 ; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 16 ; GFX6-NEXT: v_ashrrev_i32_e32 v1, v2, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -849,15 +849,15 @@ define amdgpu_ps float @ashr_v2i16_sv(<2 x i16> inreg %value, <2 x i16> %amount) { ; GFX6-LABEL: ashr_v2i16_sv: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_mov_b32 s2, 0xffff -; GFX6-NEXT: v_and_b32_e32 v0, s2, v0 +; GFX6-NEXT: v_mov_b32_e32 v2, 0xffff +; GFX6-NEXT: v_and_b32_e32 v0, v0, v2 ; GFX6-NEXT: s_sext_i32_i16 s0, s0 ; GFX6-NEXT: v_ashr_i32_e32 v0, s0, v0 -; 
GFX6-NEXT: v_and_b32_e32 v1, s2, v1 +; GFX6-NEXT: v_and_b32_e32 v1, v1, v2 ; GFX6-NEXT: s_sext_i32_i16 s0, s1 ; GFX6-NEXT: v_ashr_i32_e32 v1, s0, v1 -; GFX6-NEXT: v_and_b32_e32 v1, s2, v1 -; GFX6-NEXT: v_and_b32_e32 v0, s2, v0 +; GFX6-NEXT: v_and_b32_e32 v1, v1, v2 +; GFX6-NEXT: v_and_b32_e32 v0, v0, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: ; return to shader part epilog @@ -939,25 +939,25 @@ ; GFX6-LABEL: v_ashr_v4i16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s4, 0xffff -; GFX6-NEXT: v_and_b32_e32 v4, s4, v4 +; GFX6-NEXT: v_mov_b32_e32 v8, 0xffff +; GFX6-NEXT: v_and_b32_e32 v4, v4, v8 ; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 16 ; GFX6-NEXT: v_ashrrev_i32_e32 v0, v4, v0 -; GFX6-NEXT: v_and_b32_e32 v4, s4, v5 +; GFX6-NEXT: v_and_b32_e32 v4, v5, v8 ; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 16 ; GFX6-NEXT: v_ashrrev_i32_e32 v1, v4, v1 -; GFX6-NEXT: v_and_b32_e32 v4, s4, v6 +; GFX6-NEXT: v_and_b32_e32 v4, v6, v8 ; GFX6-NEXT: v_bfe_i32 v2, v2, 0, 16 ; GFX6-NEXT: v_ashrrev_i32_e32 v2, v4, v2 -; GFX6-NEXT: v_and_b32_e32 v4, s4, v7 +; GFX6-NEXT: v_and_b32_e32 v4, v7, v8 ; GFX6-NEXT: v_bfe_i32 v3, v3, 0, 16 -; GFX6-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX6-NEXT: v_and_b32_e32 v1, v1, v8 ; GFX6-NEXT: v_ashrrev_i32_e32 v3, v4, v3 -; GFX6-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX6-NEXT: v_and_b32_e32 v0, v0, v8 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX6-NEXT: v_and_b32_e32 v1, s4, v2 -; GFX6-NEXT: v_and_b32_e32 v2, s4, v3 +; GFX6-NEXT: v_and_b32_e32 v1, v2, v8 +; GFX6-NEXT: v_and_b32_e32 v2, v3, v8 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -1109,28 +1109,27 @@ ; GFX6-LABEL: v_ashr_v8i16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s4, 0xffff -; GFX6-NEXT: v_and_b32_e32 v8, s4, v8 +; GFX6-NEXT: v_mov_b32_e32 v16, 0xffff +; GFX6-NEXT: v_and_b32_e32 v8, v8, v16 ; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 16 ; GFX6-NEXT: v_ashrrev_i32_e32 v0, v8, v0 -; GFX6-NEXT: v_and_b32_e32 v8, s4, v9 +; GFX6-NEXT: v_and_b32_e32 v8, v9, v16 ; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 16 ; GFX6-NEXT: v_ashrrev_i32_e32 v1, v8, v1 -; GFX6-NEXT: v_and_b32_e32 v8, s4, v10 +; GFX6-NEXT: v_and_b32_e32 v8, v10, v16 ; GFX6-NEXT: v_bfe_i32 v2, v2, 0, 16 ; GFX6-NEXT: v_ashrrev_i32_e32 v2, v8, v2 -; GFX6-NEXT: v_and_b32_e32 v8, s4, v11 +; GFX6-NEXT: v_and_b32_e32 v8, v11, v16 ; GFX6-NEXT: v_bfe_i32 v3, v3, 0, 16 -; GFX6-NEXT: v_mov_b32_e32 v16, 0xffff ; GFX6-NEXT: v_ashrrev_i32_e32 v3, v8, v3 -; GFX6-NEXT: v_and_b32_e32 v8, s4, v12 +; GFX6-NEXT: v_and_b32_e32 v8, v12, v16 ; GFX6-NEXT: v_bfe_i32 v4, v4, 0, 16 ; GFX6-NEXT: v_ashrrev_i32_e32 v4, v8, v4 -; GFX6-NEXT: v_and_b32_e32 v8, s4, v13 +; GFX6-NEXT: v_and_b32_e32 v8, v13, v16 ; GFX6-NEXT: v_bfe_i32 v5, v5, 0, 16 ; GFX6-NEXT: v_and_b32_e32 v1, v1, v16 ; GFX6-NEXT: v_ashrrev_i32_e32 v5, v8, v5 -; GFX6-NEXT: v_and_b32_e32 v8, s4, v14 +; GFX6-NEXT: v_and_b32_e32 v8, v14, v16 ; GFX6-NEXT: v_bfe_i32 v6, v6, 0, 16 ; GFX6-NEXT: v_and_b32_e32 v0, v0, v16 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 Index: llvm/test/CodeGen/AMDGPU/GlobalISel/combine-short-clamp.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/combine-short-clamp.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/combine-short-clamp.ll @@ -8,12 +8,12 @@ ; GFX10-LABEL: {{^}}v_clamp_i64_i16 ; GFX678: v_cvt_pk_i16_i32_e32 [[A:v[0-9]+]], [[A]], 
[[B:v[0-9]+]] ; GFX9: v_cvt_pk_i16_i32 [[A:v[0-9]+]], [[A]], [[B:v[0-9]+]] -; GFX6789: v_mov_b32_e32 [[B]], 0xffff8000 -; GFX6789: v_mov_b32_e32 [[C:v[0-9]+]], 0x7fff -; GFX6789: v_med3_i32 [[A]], [[B]], [[A]], [[C]] +; GFX6789: v_mov_b32_e32 [[B]], 0x7fff +; GFX6789: v_mov_b32_e32 [[C:v[0-9]+]], 0xffff8000 +; GFX6789: v_med3_i32 [[A]], [[C]], [[A]], [[B]] ; GFX10: v_cvt_pk_i16_i32 [[A:v[0-9]+]], [[A]], [[B:v[0-9]+]] -; GFX10: v_mov_b32_e32 [[B]], 0x7fff -; GFX10: v_med3_i32 [[A]], 0xffff8000, [[A]], [[B]] +; GFX10: v_mov_b32_e32 [[B]], 0xffff8000 +; GFX10: v_med3_i32 [[A]], [[B]], [[A]], 0x7fff define i16 @v_clamp_i64_i16(i64 %in) #0 { entry: %max = call i64 @llvm.smax.i64(i64 %in, i64 -32768) @@ -25,12 +25,12 @@ ; GFX10-LABEL: {{^}}v_clamp_i64_i16_reverse ; GFX678: v_cvt_pk_i16_i32_e32 [[A:v[0-9]+]], [[A]], [[B:v[0-9]+]] ; GFX9: v_cvt_pk_i16_i32 [[A:v[0-9]+]], [[A]], [[B:v[0-9]+]] -; GFX6789: v_mov_b32_e32 [[B]], 0xffff8000 -; GFX6789: v_mov_b32_e32 [[C:v[0-9]+]], 0x7fff -; GFX6789: v_med3_i32 [[A]], [[B]], [[A]], [[C]] +; GFX6789: v_mov_b32_e32 [[B]], 0x7fff +; GFX6789: v_mov_b32_e32 [[C:v[0-9]+]], 0xffff8000 +; GFX6789: v_med3_i32 [[A]], [[C]], [[A]], [[B]] ; GFX10: v_cvt_pk_i16_i32 [[A:v[0-9]+]], [[A]], [[B:v[0-9]+]] -; GFX10: v_mov_b32_e32 [[B]], 0x7fff -; GFX10: v_med3_i32 [[A]], 0xffff8000, [[A]], [[B]] +; GFX10: v_mov_b32_e32 [[B]], 0xffff8000 +; GFX10: v_med3_i32 [[A]], [[B]], [[A]], 0x7fff define i16 @v_clamp_i64_i16_reverse(i64 %in) #0 { entry: %min = call i64 @llvm.smin.i64(i64 %in, i64 32767) @@ -69,12 +69,12 @@ ; GFX10-LABEL: {{^}}v_clamp_i64_i16_lower_than_short ; GFX678: v_cvt_pk_i16_i32_e32 [[A:v[0-9]+]], [[A]], [[B:v[0-9]+]] ; GFX9: v_cvt_pk_i16_i32 [[A:v[0-9]+]], [[A]], [[B:v[0-9]+]] -; GFX6789: v_mov_b32_e32 [[B]], 0xffffff01 -; GFX6789: v_mov_b32_e32 [[C:v[0-9]+]], 0x100 -; GFX6789: v_med3_i32 [[A]], [[B]], [[A]], [[C]] +; GFX6789: v_mov_b32_e32 [[B]], 0x100 +; GFX6789: v_mov_b32_e32 [[C:v[0-9]+]], 0xffffff01 +; GFX6789: v_med3_i32 [[A]], [[C]], [[A]], [[B]] ; GFX10: v_cvt_pk_i16_i32 [[A:v[0-9]+]], [[A]], [[B:v[0-9]+]] -; GFX10: v_mov_b32_e32 [[B]], 0x100 -; GFX10: v_med3_i32 [[A]], 0xffffff01, [[A]], [[B]] +; GFX10: v_mov_b32_e32 [[B]], 0xffffff01 +; GFX10: v_med3_i32 [[A]], [[B]], [[A]], 0x100 define i16 @v_clamp_i64_i16_lower_than_short(i64 %in) #0 { entry: %min = call i64 @llvm.smin.i64(i64 %in, i64 256) @@ -86,12 +86,12 @@ ; GFX10-LABEL: {{^}}v_clamp_i64_i16_lower_than_short_reverse ; GFX678: v_cvt_pk_i16_i32_e32 [[A:v[0-9]+]], [[A]], [[B:v[0-9]+]] ; GFX9: v_cvt_pk_i16_i32 [[A:v[0-9]+]], [[A]], [[B:v[0-9]+]] -; GFX6789: v_mov_b32_e32 [[B]], 0xffffff01 -; GFX6789: v_mov_b32_e32 [[C:v[0-9]+]], 0x100 -; GFX6789: v_med3_i32 [[A]], [[B]], [[A]], [[C]] +; GFX6789: v_mov_b32_e32 [[B]], 0x100 +; GFX6789: v_mov_b32_e32 [[C:v[0-9]+]], 0xffffff01 +; GFX6789: v_med3_i32 [[A]], [[C]], [[A]], [[B]] ; GFX10: v_cvt_pk_i16_i32 [[A:v[0-9]+]], [[A]], [[B:v[0-9]+]] -; GFX10: v_mov_b32_e32 [[B]], 0x100 -; GFX10: v_med3_i32 [[A]], 0xffffff01, [[A]], [[B]] +; GFX10: v_mov_b32_e32 [[B]], 0xffffff01 +; GFX10: v_med3_i32 [[A]], [[B]], [[A]], 0x100 define i16 @v_clamp_i64_i16_lower_than_short_reverse(i64 %in) #0 { entry: %max = call i64 @llvm.smax.i64(i64 %in, i64 -255) Index: llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i8.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i8.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i8.ll @@ -533,16 +533,16 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dword v0, v[0:1], off -; GFX9-NEXT: s_mov_b32 s4, 8 +; GFX9-NEXT: v_mov_b32_e32 v2, 8 ; GFX9-NEXT: v_mov_b32_e32 v1, 0xff -; GFX9-NEXT: v_mov_b32_e32 v2, 16 +; GFX9-NEXT: v_mov_b32_e32 v3, 16 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v3, 24, v0 -; GFX9-NEXT: v_lshlrev_b32_sdwa v4, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_and_or_b32 v0, v0, v1, v4 -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 24, v3 -; GFX9-NEXT: v_or3_b32 v0, v0, v2, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v4, 24, v0 +; GFX9-NEXT: v_lshlrev_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v3, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_and_or_b32 v0, v0, v1, v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 24, v4 +; GFX9-NEXT: v_or3_b32 v0, v0, v3, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -589,15 +589,15 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: global_load_dword v0, v[0:1], off -; GFX10-NEXT: s_mov_b32 s4, 8 -; GFX10-NEXT: v_mov_b32_e32 v1, 16 +; GFX10-NEXT: v_mov_b32_e32 v1, 8 +; GFX10-NEXT: v_mov_b32_e32 v2, 16 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_sdwa v2, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX10-NEXT: v_lshlrev_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX10-NEXT: v_lshrrev_b32_e32 v3, 24, v0 -; GFX10-NEXT: v_lshlrev_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_and_or_b32 v0, 0xff, v0, v2 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 24, v3 -; GFX10-NEXT: v_or3_b32 v0, v0, v1, v2 +; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX10-NEXT: v_and_or_b32 v0, 0xff, v0, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 24, v3 +; GFX10-NEXT: v_or3_b32 v0, v0, v2, v1 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 8, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] %vector = load <4 x i8>, <4 x i8> addrspace(1)* %ptr @@ -611,15 +611,15 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dword v0, v[0:1], off ; GFX9-NEXT: v_mov_b32_e32 v2, 8 -; GFX9-NEXT: s_mov_b32 s4, 16 ; GFX9-NEXT: v_mov_b32_e32 v1, 0xff +; GFX9-NEXT: v_mov_b32_e32 v3, 16 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v3, 24, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v4, 24, v0 ; GFX9-NEXT: v_lshlrev_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v4, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_lshlrev_b32_sdwa v3, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX9-NEXT: v_and_or_b32 v0, v0, v1, v2 -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 24, v3 -; GFX9-NEXT: v_or3_b32 v0, v0, v4, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 24, v4 +; GFX9-NEXT: v_or3_b32 v0, v0, v3, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -667,14 +667,14 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: global_load_dword v0, v[0:1], off ; GFX10-NEXT: v_mov_b32_e32 v1, 8 -; GFX10-NEXT: s_mov_b32 s4, 16 +; GFX10-NEXT: v_mov_b32_e32 v2, 16 ; GFX10-NEXT: 
s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX10-NEXT: v_lshrrev_b32_e32 v2, 24, v0 -; GFX10-NEXT: v_lshlrev_b32_sdwa v3, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX10-NEXT: v_lshrrev_b32_e32 v3, 24, v0 +; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_and_or_b32 v0, 0xff, v0, v1 -; GFX10-NEXT: v_lshlrev_b32_e32 v1, 24, v2 -; GFX10-NEXT: v_or3_b32 v0, v0, v3, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 24, v3 +; GFX10-NEXT: v_or3_b32 v0, v0, v2, v1 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] %vector = load <4 x i8>, <4 x i8> addrspace(1)* %ptr @@ -838,27 +838,27 @@ ; GFX9-LABEL: extractelement_vgpr_v8i8_sgpr_idx: ; GFX9: ; %bb.0: ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off -; GFX9-NEXT: s_mov_b32 s0, 8 -; GFX9-NEXT: s_mov_b32 s1, 16 -; GFX9-NEXT: s_movk_i32 s3, 0xff -; GFX9-NEXT: s_lshr_b32 s4, s2, 2 -; GFX9-NEXT: s_and_b32 s2, s2, 3 -; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s4, 1 +; GFX9-NEXT: v_mov_b32_e32 v3, 8 +; GFX9-NEXT: v_mov_b32_e32 v2, 0xff +; GFX9-NEXT: v_mov_b32_e32 v4, 16 +; GFX9-NEXT: s_lshr_b32 s0, s2, 2 +; GFX9-NEXT: s_and_b32 s1, s2, 3 +; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s0, 1 +; GFX9-NEXT: s_lshl_b32 s0, s1, 3 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 24, v0 -; GFX9-NEXT: v_lshrrev_b32_e32 v3, 24, v1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v4, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v6, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v5, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_lshlrev_b32_sdwa v7, s1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_and_or_b32 v0, v0, s3, v4 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 24, v2 -; GFX9-NEXT: v_and_or_b32 v1, v1, s3, v6 -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GFX9-NEXT: v_or3_b32 v0, v0, v5, v2 -; GFX9-NEXT: v_or3_b32 v1, v1, v7, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 24, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 24, v1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v7, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v3, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v8, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_lshlrev_b32_sdwa v4, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_and_or_b32 v0, v0, v2, v7 +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 24, v5 +; GFX9-NEXT: v_and_or_b32 v1, v1, v2, v3 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 24, v6 +; GFX9-NEXT: v_or3_b32 v0, v0, v8, v5 +; GFX9-NEXT: v_or3_b32 v1, v1, v4, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; GFX9-NEXT: s_lshl_b32 s0, s2, 3 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, s0, v0 ; GFX9-NEXT: v_readfirstlane_b32 s0, v0 ; GFX9-NEXT: ; return to shader part epilog @@ -898,33 +898,33 @@ ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: s_movk_i32 s0, 0xff -; GFX7-NEXT: s_lshr_b32 s1, s2, 2 -; GFX7-NEXT: s_and_b32 s2, s2, 3 -; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s1, 1 +; GFX7-NEXT: v_mov_b32_e32 v2, 0xff +; GFX7-NEXT: s_lshr_b32 s0, s2, 2 +; 
GFX7-NEXT: s_and_b32 s1, s2, 3 +; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s0, 1 +; GFX7-NEXT: s_lshl_b32 s0, s1, 3 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_bfe_u32 v5, v0, 8, 8 +; GFX7-NEXT: v_bfe_u32 v6, v0, 8, 8 ; GFX7-NEXT: v_bfe_u32 v7, v1, 8, 8 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 24, v0 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v1 -; GFX7-NEXT: v_and_b32_e32 v4, s0, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 24, v1 +; GFX7-NEXT: v_and_b32_e32 v5, v0, v2 ; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 8 -; GFX7-NEXT: v_and_b32_e32 v6, s0, v1 +; GFX7-NEXT: v_and_b32_e32 v2, v1, v2 ; GFX7-NEXT: v_bfe_u32 v1, v1, 16, 8 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 8, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v6, 8, v6 ; GFX7-NEXT: v_lshlrev_b32_e32 v7, 8, v7 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_or_b32_e32 v4, v4, v5 -; GFX7-NEXT: v_or_b32_e32 v5, v6, v7 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 24, v2 +; GFX7-NEXT: v_or_b32_e32 v5, v5, v6 +; GFX7-NEXT: v_or_b32_e32 v2, v2, v7 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GFX7-NEXT: v_or_b32_e32 v0, v4, v0 -; GFX7-NEXT: v_or_b32_e32 v1, v5, v1 -; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX7-NEXT: v_or_b32_e32 v1, v1, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; GFX7-NEXT: v_or_b32_e32 v0, v5, v0 +; GFX7-NEXT: v_or_b32_e32 v1, v2, v1 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v3 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v4 ; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; GFX7-NEXT: s_lshl_b32 s0, s2, 3 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, s0, v0 ; GFX7-NEXT: v_readfirstlane_b32 s0, v0 ; GFX7-NEXT: ; return to shader part epilog @@ -932,26 +932,26 @@ ; GFX10-LABEL: extractelement_vgpr_v8i8_sgpr_idx: ; GFX10: ; %bb.0: ; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off -; GFX10-NEXT: s_mov_b32 s0, 8 -; GFX10-NEXT: s_mov_b32 s1, 16 -; GFX10-NEXT: s_movk_i32 s3, 0xff -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v2, 24, v0 -; GFX10-NEXT: v_lshlrev_b32_sdwa v3, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX10-NEXT: v_lshrrev_b32_e32 v4, 24, v1 -; GFX10-NEXT: v_lshlrev_b32_sdwa v5, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX10-NEXT: v_lshlrev_b32_sdwa v6, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_lshlrev_b32_sdwa v7, s1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_and_or_b32 v0, v0, s3, v3 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 24, v2 -; GFX10-NEXT: v_and_or_b32 v1, v1, s3, v5 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 24, v4 +; GFX10-NEXT: v_mov_b32_e32 v2, 8 +; GFX10-NEXT: v_mov_b32_e32 v3, 0xff +; GFX10-NEXT: v_mov_b32_e32 v4, 16 ; GFX10-NEXT: s_lshr_b32 s0, s2, 2 -; GFX10-NEXT: v_or3_b32 v0, v0, v6, v2 ; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s0, 1 -; GFX10-NEXT: v_or3_b32 v1, v1, v7, v3 ; GFX10-NEXT: s_and_b32 s0, s2, 3 ; GFX10-NEXT: s_lshl_b32 s0, s0, 3 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_lshrrev_b32_e32 v5, 24, v0 +; GFX10-NEXT: v_lshlrev_b32_sdwa v6, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX10-NEXT: v_lshrrev_b32_e32 v7, 24, v1 +; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX10-NEXT: v_lshlrev_b32_sdwa v8, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX10-NEXT: v_lshlrev_b32_sdwa v4, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; 
GFX10-NEXT: v_and_or_b32 v0, v0, v3, v6 +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 24, v5 +; GFX10-NEXT: v_and_or_b32 v1, v1, v3, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 24, v7 +; GFX10-NEXT: v_or3_b32 v0, v0, v8, v5 +; GFX10-NEXT: v_or3_b32 v1, v1, v4, v2 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo ; GFX10-NEXT: v_lshrrev_b32_e32 v0, s0, v0 ; GFX10-NEXT: v_readfirstlane_b32 s0, v0 @@ -966,25 +966,25 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off -; GFX9-NEXT: s_mov_b32 s4, 8 -; GFX9-NEXT: s_mov_b32 s5, 16 -; GFX9-NEXT: s_movk_i32 s6, 0xff -; GFX9-NEXT: v_lshrrev_b32_e32 v3, 2, v2 +; GFX9-NEXT: v_mov_b32_e32 v4, 8 +; GFX9-NEXT: v_mov_b32_e32 v3, 0xff +; GFX9-NEXT: v_mov_b32_e32 v5, 16 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 2, v2 ; GFX9-NEXT: v_and_b32_e32 v2, 3, v2 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v4, 24, v0 -; GFX9-NEXT: v_lshrrev_b32_e32 v5, 24, v1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v6, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v8, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v7, s5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_lshlrev_b32_sdwa v9, s5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_and_or_b32 v0, v0, s6, v6 -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; GFX9-NEXT: v_and_or_b32 v1, v1, s6, v8 -; GFX9-NEXT: v_lshlrev_b32_e32 v5, 24, v5 -; GFX9-NEXT: v_or3_b32 v0, v0, v7, v4 -; GFX9-NEXT: v_or3_b32 v1, v1, v9, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v8, 24, v1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v9, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v4, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v10, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_lshlrev_b32_sdwa v5, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_and_or_b32 v0, v0, v3, v9 +; GFX9-NEXT: v_lshlrev_b32_e32 v7, 24, v7 +; GFX9-NEXT: v_and_or_b32 v1, v1, v3, v4 +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 24, v8 +; GFX9-NEXT: v_or3_b32 v0, v0, v10, v7 +; GFX9-NEXT: v_or3_b32 v1, v1, v5, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 3, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, v1, v0 @@ -1026,31 +1026,31 @@ ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: s_movk_i32 s4, 0xff -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 2, v2 +; GFX7-NEXT: v_mov_b32_e32 v3, 0xff +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 2, v2 ; GFX7-NEXT: v_and_b32_e32 v2, 3, v2 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_bfe_u32 v7, v0, 8, 8 +; GFX7-NEXT: v_bfe_u32 v8, v0, 8, 8 ; GFX7-NEXT: v_bfe_u32 v9, v1, 8, 8 -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 24, v0 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 24, v1 -; GFX7-NEXT: v_and_b32_e32 v6, s4, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 24, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v6, 24, v1 +; GFX7-NEXT: v_and_b32_e32 v7, v0, v3 ; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 8 -; GFX7-NEXT: v_and_b32_e32 v8, s4, 
v1 +; GFX7-NEXT: v_and_b32_e32 v3, v1, v3 ; GFX7-NEXT: v_bfe_u32 v1, v1, 16, 8 -; GFX7-NEXT: v_lshlrev_b32_e32 v7, 8, v7 +; GFX7-NEXT: v_lshlrev_b32_e32 v8, 8, v8 ; GFX7-NEXT: v_lshlrev_b32_e32 v9, 8, v9 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_or_b32_e32 v6, v6, v7 -; GFX7-NEXT: v_or_b32_e32 v7, v8, v9 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; GFX7-NEXT: v_or_b32_e32 v7, v7, v8 +; GFX7-NEXT: v_or_b32_e32 v3, v3, v9 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 24, v5 -; GFX7-NEXT: v_or_b32_e32 v0, v6, v0 -; GFX7-NEXT: v_or_b32_e32 v1, v7, v1 -; GFX7-NEXT: v_or_b32_e32 v0, v0, v4 -; GFX7-NEXT: v_or_b32_e32 v1, v1, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v6, 24, v6 +; GFX7-NEXT: v_or_b32_e32 v0, v7, v0 +; GFX7-NEXT: v_or_b32_e32 v1, v3, v1 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v5 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v6 ; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 3, v2 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v1, v0 @@ -1061,25 +1061,25 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off -; GFX10-NEXT: s_mov_b32 s4, 8 -; GFX10-NEXT: s_mov_b32 s5, 16 -; GFX10-NEXT: s_movk_i32 s6, 0xff +; GFX10-NEXT: v_mov_b32_e32 v3, 8 +; GFX10-NEXT: v_mov_b32_e32 v4, 0xff +; GFX10-NEXT: v_mov_b32_e32 v5, 16 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v3, 24, v0 -; GFX10-NEXT: v_lshlrev_b32_sdwa v4, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX10-NEXT: v_lshrrev_b32_e32 v5, 24, v1 -; GFX10-NEXT: v_lshlrev_b32_sdwa v6, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX10-NEXT: v_lshlrev_b32_sdwa v7, s5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_lshlrev_b32_sdwa v8, s5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_and_or_b32 v0, v0, s6, v4 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GFX10-NEXT: v_and_or_b32 v1, v1, s6, v6 -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 24, v5 -; GFX10-NEXT: v_lshrrev_b32_e32 v5, 2, v2 +; GFX10-NEXT: v_lshrrev_b32_e32 v6, 24, v0 +; GFX10-NEXT: v_lshlrev_b32_sdwa v7, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX10-NEXT: v_lshrrev_b32_e32 v8, 24, v1 +; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX10-NEXT: v_lshlrev_b32_sdwa v9, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX10-NEXT: v_and_or_b32 v0, v0, v4, v7 +; GFX10-NEXT: v_lshlrev_b32_e32 v6, 24, v6 +; GFX10-NEXT: v_and_or_b32 v1, v1, v4, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 24, v8 +; GFX10-NEXT: v_lshrrev_b32_e32 v4, 2, v2 ; GFX10-NEXT: v_and_b32_e32 v2, 3, v2 -; GFX10-NEXT: v_or3_b32 v0, v0, v7, v3 -; GFX10-NEXT: v_or3_b32 v1, v1, v8, v4 -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v5 +; GFX10-NEXT: v_or3_b32 v0, v0, v9, v6 +; GFX10-NEXT: v_or3_b32 v1, v1, v5, v3 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v4 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 3, v2 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, v1, v0 @@ -1555,16 +1555,16 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off -; GFX9-NEXT: s_mov_b32 s4, 8 +; GFX9-NEXT: v_mov_b32_e32 v2, 
8 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, 0xff -; GFX9-NEXT: v_mov_b32_e32 v2, 16 -; GFX9-NEXT: v_lshrrev_b32_e32 v3, 24, v0 -; GFX9-NEXT: v_lshlrev_b32_sdwa v4, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_and_or_b32 v0, v0, v1, v4 -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 24, v3 -; GFX9-NEXT: v_or3_b32 v0, v0, v2, v1 +; GFX9-NEXT: v_mov_b32_e32 v3, 16 +; GFX9-NEXT: v_lshrrev_b32_e32 v4, 24, v0 +; GFX9-NEXT: v_lshlrev_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v3, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_and_or_b32 v0, v0, v1, v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 24, v4 +; GFX9-NEXT: v_or3_b32 v0, v0, v3, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -1611,15 +1611,15 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off -; GFX10-NEXT: s_mov_b32 s4, 8 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v1, 16 -; GFX10-NEXT: v_lshlrev_b32_sdwa v2, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX10-NEXT: v_mov_b32_e32 v1, 8 +; GFX10-NEXT: v_mov_b32_e32 v2, 16 +; GFX10-NEXT: v_lshlrev_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX10-NEXT: v_lshrrev_b32_e32 v3, 24, v0 -; GFX10-NEXT: v_lshlrev_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_and_or_b32 v0, 0xff, v0, v2 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 24, v3 -; GFX10-NEXT: v_or3_b32 v0, v0, v1, v2 +; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX10-NEXT: v_and_or_b32 v0, 0xff, v0, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 24, v3 +; GFX10-NEXT: v_or3_b32 v0, v0, v2, v1 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 8, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] %vector = load <8 x i8>, <8 x i8> addrspace(1)* %ptr @@ -1633,15 +1633,15 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off ; GFX9-NEXT: v_mov_b32_e32 v2, 8 -; GFX9-NEXT: s_mov_b32 s4, 16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, 0xff -; GFX9-NEXT: v_lshrrev_b32_e32 v3, 24, v0 +; GFX9-NEXT: v_mov_b32_e32 v3, 16 +; GFX9-NEXT: v_lshrrev_b32_e32 v4, 24, v0 ; GFX9-NEXT: v_lshlrev_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v4, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_lshlrev_b32_sdwa v3, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX9-NEXT: v_and_or_b32 v0, v0, v1, v2 -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 24, v3 -; GFX9-NEXT: v_or3_b32 v0, v0, v4, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 24, v4 +; GFX9-NEXT: v_or3_b32 v0, v0, v3, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -1690,13 +1690,13 @@ ; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v1, 8 -; GFX10-NEXT: s_mov_b32 s4, 16 +; GFX10-NEXT: v_mov_b32_e32 v2, 16 ; GFX10-NEXT: v_lshlrev_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX10-NEXT: 
v_lshrrev_b32_e32 v2, 24, v0 -; GFX10-NEXT: v_lshlrev_b32_sdwa v3, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX10-NEXT: v_lshrrev_b32_e32 v3, 24, v0 +; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_and_or_b32 v0, 0xff, v0, v1 -; GFX10-NEXT: v_lshlrev_b32_e32 v1, 24, v2 -; GFX10-NEXT: v_or3_b32 v0, v0, v3, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 24, v3 +; GFX10-NEXT: v_or3_b32 v0, v0, v2, v1 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] %vector = load <8 x i8>, <8 x i8> addrspace(1)* %ptr @@ -1859,16 +1859,16 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off -; GFX9-NEXT: s_mov_b32 s4, 8 +; GFX9-NEXT: v_mov_b32_e32 v2, 8 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, 0xff -; GFX9-NEXT: v_mov_b32_e32 v2, 16 -; GFX9-NEXT: v_lshrrev_b32_e32 v3, 24, v1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v4, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v2, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_and_or_b32 v0, v1, v0, v4 -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 24, v3 -; GFX9-NEXT: v_or3_b32 v0, v0, v2, v1 +; GFX9-NEXT: v_mov_b32_e32 v3, 16 +; GFX9-NEXT: v_lshrrev_b32_e32 v4, 24, v1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v2, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v3, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_and_or_b32 v0, v1, v0, v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 24, v4 +; GFX9-NEXT: v_or3_b32 v0, v0, v3, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -1915,15 +1915,15 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off -; GFX10-NEXT: s_mov_b32 s4, 8 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, 16 -; GFX10-NEXT: v_lshlrev_b32_sdwa v2, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX10-NEXT: v_mov_b32_e32 v0, 8 +; GFX10-NEXT: v_mov_b32_e32 v2, 16 +; GFX10-NEXT: v_lshlrev_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX10-NEXT: v_lshrrev_b32_e32 v3, 24, v1 -; GFX10-NEXT: v_lshlrev_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_and_or_b32 v1, 0xff, v1, v2 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 24, v3 -; GFX10-NEXT: v_or3_b32 v0, v1, v0, v2 +; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX10-NEXT: v_and_or_b32 v0, 0xff, v1, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 24, v3 +; GFX10-NEXT: v_or3_b32 v0, v0, v2, v1 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 8, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] %vector = load <8 x i8>, <8 x i8> addrspace(1)* %ptr @@ -1937,15 +1937,15 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off ; GFX9-NEXT: v_mov_b32_e32 v2, 8 -; GFX9-NEXT: s_mov_b32 s4, 16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, 0xff -; GFX9-NEXT: v_lshrrev_b32_e32 v3, 24, v1 +; GFX9-NEXT: v_mov_b32_e32 v3, 16 +; GFX9-NEXT: v_lshrrev_b32_e32 v4, 24, v1 ; GFX9-NEXT: v_lshlrev_b32_sdwa v2, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD 
src1_sel:BYTE_1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v4, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_lshlrev_b32_sdwa v3, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX9-NEXT: v_and_or_b32 v0, v1, v0, v2 -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 24, v3 -; GFX9-NEXT: v_or3_b32 v0, v0, v4, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 24, v4 +; GFX9-NEXT: v_or3_b32 v0, v0, v3, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -1994,13 +1994,13 @@ ; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, 8 -; GFX10-NEXT: s_mov_b32 s4, 16 +; GFX10-NEXT: v_mov_b32_e32 v2, 16 ; GFX10-NEXT: v_lshlrev_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX10-NEXT: v_lshrrev_b32_e32 v2, 24, v1 -; GFX10-NEXT: v_lshlrev_b32_sdwa v3, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX10-NEXT: v_lshrrev_b32_e32 v3, 24, v1 +; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_and_or_b32 v0, 0xff, v1, v0 -; GFX10-NEXT: v_lshlrev_b32_e32 v1, 24, v2 -; GFX10-NEXT: v_or3_b32 v0, v0, v3, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 24, v3 +; GFX10-NEXT: v_or3_b32 v0, v0, v2, v1 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] %vector = load <8 x i8>, <8 x i8> addrspace(1)* %ptr @@ -2212,27 +2212,24 @@ ; GFX9-LABEL: extractelement_vgpr_v16i8_sgpr_idx: ; GFX9: ; %bb.0: ; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off -; GFX9-NEXT: s_mov_b32 s0, 8 -; GFX9-NEXT: s_mov_b32 s1, 16 -; GFX9-NEXT: s_movk_i32 s3, 0xff ; GFX9-NEXT: v_mov_b32_e32 v5, 8 ; GFX9-NEXT: v_mov_b32_e32 v4, 0xff ; GFX9-NEXT: v_mov_b32_e32 v6, 16 -; GFX9-NEXT: s_lshr_b32 s4, s2, 2 -; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s4, 1 -; GFX9-NEXT: s_and_b32 s2, s2, 3 +; GFX9-NEXT: s_lshr_b32 s0, s2, 2 +; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s0, 1 +; GFX9-NEXT: s_and_b32 s1, s2, 3 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v8, 24, v1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v11, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v13, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v11, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v13, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX9-NEXT: v_lshrrev_b32_e32 v9, 24, v2 -; GFX9-NEXT: v_lshlrev_b32_sdwa v12, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_lshlrev_b32_sdwa v14, s1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_lshlrev_b32_sdwa v12, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_lshlrev_b32_sdwa v14, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX9-NEXT: v_lshlrev_b32_sdwa v15, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_and_or_b32 v0, v0, s3, v11 +; GFX9-NEXT: v_and_or_b32 v0, v0, v4, v11 ; GFX9-NEXT: v_lshlrev_b32_e32 v7, 24, v7 -; GFX9-NEXT: v_and_or_b32 v1, v1, s3, v13 +; GFX9-NEXT: v_and_or_b32 v1, v1, v4, v13 ; GFX9-NEXT: v_lshlrev_b32_e32 v8, 24, v8 ; GFX9-NEXT: v_lshrrev_b32_e32 v10, 24, v3 ; GFX9-NEXT: 
v_lshlrev_b32_sdwa v16, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 @@ -2246,12 +2243,12 @@ ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 24, v10 ; GFX9-NEXT: v_or3_b32 v2, v2, v16, v9 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s4, 2 +; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s0, 2 ; GFX9-NEXT: v_or3_b32 v3, v3, v6, v4 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s4, 3 +; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s0, 3 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX9-NEXT: s_lshl_b32 s0, s2, 3 +; GFX9-NEXT: s_lshl_b32 s0, s1, 3 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, s0, v0 ; GFX9-NEXT: v_readfirstlane_b32 s0, v0 ; GFX9-NEXT: ; return to shader part epilog @@ -2261,42 +2258,40 @@ ; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; GFX8-NEXT: v_mov_b32_e32 v4, 8 ; GFX8-NEXT: v_mov_b32_e32 v5, 16 -; GFX8-NEXT: v_mov_b32_e32 v6, 8 -; GFX8-NEXT: v_mov_b32_e32 v7, 16 ; GFX8-NEXT: s_lshr_b32 s0, s2, 2 ; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s0, 1 ; GFX8-NEXT: s_and_b32 s1, s2, 3 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_sdwa v12, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX8-NEXT: v_lshlrev_b32_sdwa v4, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX8-NEXT: v_lshrrev_b32_e32 v8, 24, v0 -; GFX8-NEXT: v_lshrrev_b32_e32 v9, 24, v1 -; GFX8-NEXT: v_lshlrev_b32_sdwa v13, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX8-NEXT: v_lshlrev_b32_sdwa v5, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX8-NEXT: v_lshlrev_b32_sdwa v14, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX8-NEXT: v_or_b32_sdwa v0, v0, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_lshrrev_b32_e32 v10, 24, v2 -; GFX8-NEXT: v_lshlrev_b32_sdwa v15, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX8-NEXT: v_lshlrev_b32_sdwa v6, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX8-NEXT: v_lshlrev_b32_e32 v8, 24, v8 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 24, v9 +; GFX8-NEXT: v_lshlrev_b32_sdwa v10, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX8-NEXT: v_lshlrev_b32_sdwa v12, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 24, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v7, 24, v1 +; GFX8-NEXT: v_lshlrev_b32_sdwa v11, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX8-NEXT: v_lshlrev_b32_sdwa v13, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX8-NEXT: v_lshlrev_b32_sdwa v14, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v1, v1, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_lshrrev_b32_e32 v8, 24, v2 +; GFX8-NEXT: v_lshlrev_b32_sdwa v15, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX8-NEXT: v_lshlrev_b32_sdwa v4, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX8-NEXT: v_lshlrev_b32_e32 v6, 24, v6 +; GFX8-NEXT: v_lshlrev_b32_e32 v7, 24, v7 ; GFX8-NEXT: v_or_b32_sdwa v2, v2, v14 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v0, v0, v13 -; GFX8-NEXT: v_or_b32_e32 v1, v1, v5 -; GFX8-NEXT: v_lshrrev_b32_e32 v11, 24, v3 -; GFX8-NEXT: v_lshlrev_b32_sdwa v7, v7, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX8-NEXT: v_lshlrev_b32_e32 v9, 24, v10 -; GFX8-NEXT: v_or_b32_sdwa v3, v3, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v0, v11 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v13 +; GFX8-NEXT: v_lshrrev_b32_e32 v9, 24, v3 +; GFX8-NEXT: v_lshlrev_b32_sdwa v5, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX8-NEXT: v_lshlrev_b32_e32 v8, 24, v8 +; GFX8-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_e32 v2, v2, v15 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v8 -; GFX8-NEXT: v_or_b32_e32 v1, v1, v4 -; GFX8-NEXT: v_lshlrev_b32_e32 v6, 24, v11 -; GFX8-NEXT: v_or_b32_e32 v3, v3, v7 -; GFX8-NEXT: v_or_b32_e32 v2, v2, v9 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v6 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v7 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 24, v9 +; GFX8-NEXT: v_or_b32_e32 v3, v3, v5 +; GFX8-NEXT: v_or_b32_e32 v2, v2, v8 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s0, 2 -; GFX8-NEXT: v_or_b32_e32 v3, v3, v6 +; GFX8-NEXT: v_or_b32_e32 v3, v3, v4 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s0, 3 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc @@ -2311,19 +2306,18 @@ ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: s_movk_i32 s0, 0xff ; GFX7-NEXT: v_mov_b32_e32 v4, 0xff -; GFX7-NEXT: s_lshr_b32 s1, s2, 2 -; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s1, 1 -; GFX7-NEXT: s_and_b32 s2, s2, 3 +; GFX7-NEXT: s_lshr_b32 s0, s2, 2 +; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s0, 1 +; GFX7-NEXT: s_and_b32 s1, s2, 3 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_bfe_u32 v10, v0, 8, 8 ; GFX7-NEXT: v_bfe_u32 v12, v1, 8, 8 ; GFX7-NEXT: v_lshrrev_b32_e32 v5, 24, v0 ; GFX7-NEXT: v_lshrrev_b32_e32 v6, 24, v1 -; GFX7-NEXT: v_and_b32_e32 v9, s0, v0 +; GFX7-NEXT: v_and_b32_e32 v9, v0, v4 ; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 8 -; GFX7-NEXT: v_and_b32_e32 v11, s0, v1 +; GFX7-NEXT: v_and_b32_e32 v11, v1, v4 ; GFX7-NEXT: v_bfe_u32 v1, v1, 16, 8 ; GFX7-NEXT: v_bfe_u32 v14, v2, 8, 8 ; GFX7-NEXT: v_lshlrev_b32_e32 v10, 8, v10 @@ -2357,12 +2351,12 @@ ; GFX7-NEXT: v_or_b32_e32 v3, v4, v3 ; GFX7-NEXT: v_or_b32_e32 v2, v2, v7 ; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s1, 2 +; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s0, 2 ; GFX7-NEXT: v_or_b32_e32 v3, v3, v8 ; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s1, 3 +; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s0, 3 ; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX7-NEXT: s_lshl_b32 s0, s2, 3 +; GFX7-NEXT: s_lshl_b32 s0, s1, 3 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, s0, v0 ; GFX7-NEXT: v_readfirstlane_b32 s0, v0 ; GFX7-NEXT: ; return to shader part epilog @@ -2370,26 +2364,24 @@ ; GFX10-LABEL: extractelement_vgpr_v16i8_sgpr_idx: ; GFX10: ; %bb.0: ; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off -; GFX10-NEXT: s_mov_b32 s0, 8 ; GFX10-NEXT: v_mov_b32_e32 v5, 8 -; GFX10-NEXT: s_mov_b32 s1, 16 -; GFX10-NEXT: s_movk_i32 s3, 0xff ; GFX10-NEXT: v_mov_b32_e32 v4, 0xff ; GFX10-NEXT: v_mov_b32_e32 v6, 16 +; GFX10-NEXT: s_lshr_b32 s0, s2, 2 +; GFX10-NEXT: 
v_cmp_eq_u32_e64 vcc_lo, s0, 1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v7, 24, v0 ; GFX10-NEXT: v_lshrrev_b32_e32 v8, 24, v1 -; GFX10-NEXT: v_lshlrev_b32_sdwa v10, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX10-NEXT: v_lshlrev_b32_sdwa v12, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX10-NEXT: v_lshlrev_b32_sdwa v10, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX10-NEXT: v_lshlrev_b32_sdwa v12, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX10-NEXT: v_lshrrev_b32_e32 v9, 24, v2 -; GFX10-NEXT: v_lshlrev_b32_sdwa v11, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_lshlrev_b32_sdwa v13, s1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX10-NEXT: v_lshlrev_b32_sdwa v11, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX10-NEXT: v_lshlrev_b32_sdwa v13, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_lshlrev_b32_sdwa v14, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX10-NEXT: v_and_or_b32 v0, v0, s3, v10 +; GFX10-NEXT: v_and_or_b32 v0, v0, v4, v10 ; GFX10-NEXT: v_lshlrev_b32_e32 v7, 24, v7 -; GFX10-NEXT: v_and_or_b32 v1, v1, s3, v12 +; GFX10-NEXT: v_and_or_b32 v1, v1, v4, v12 ; GFX10-NEXT: v_lshlrev_b32_e32 v8, 24, v8 -; GFX10-NEXT: s_lshr_b32 s0, s2, 2 ; GFX10-NEXT: v_lshlrev_b32_sdwa v15, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_and_or_b32 v2, v2, v4, v14 ; GFX10-NEXT: v_lshlrev_b32_e32 v9, 24, v9 @@ -2397,7 +2389,6 @@ ; GFX10-NEXT: v_lshrrev_b32_e32 v10, 24, v3 ; GFX10-NEXT: v_or3_b32 v0, v0, v11, v7 ; GFX10-NEXT: v_or3_b32 v1, v1, v13, v8 -; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s0, 1 ; GFX10-NEXT: v_or3_b32 v2, v2, v15, v9 ; GFX10-NEXT: v_and_or_b32 v4, v3, v4, v5 ; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 @@ -2423,9 +2414,6 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx4 v[3:6], v[0:1], off -; GFX9-NEXT: s_mov_b32 s4, 8 -; GFX9-NEXT: s_mov_b32 s5, 16 -; GFX9-NEXT: s_movk_i32 s6, 0xff ; GFX9-NEXT: v_mov_b32_e32 v1, 8 ; GFX9-NEXT: v_mov_b32_e32 v0, 0xff ; GFX9-NEXT: v_mov_b32_e32 v7, 16 @@ -2435,17 +2423,17 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshrrev_b32_e32 v9, 24, v3 ; GFX9-NEXT: v_lshrrev_b32_e32 v10, 24, v4 -; GFX9-NEXT: v_lshlrev_b32_sdwa v13, s4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v15, s4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v13, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v15, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX9-NEXT: v_lshrrev_b32_e32 v11, 24, v5 ; GFX9-NEXT: v_lshrrev_b32_e32 v12, 24, v6 -; GFX9-NEXT: v_lshlrev_b32_sdwa v14, s5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_lshlrev_b32_sdwa v16, s5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_lshlrev_b32_sdwa v14, v7, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_lshlrev_b32_sdwa v16, v7, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD 
src1_sel:BYTE_2 ; GFX9-NEXT: v_lshlrev_b32_sdwa v17, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX9-NEXT: v_lshlrev_b32_sdwa v1, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_and_or_b32 v3, v3, s6, v13 +; GFX9-NEXT: v_and_or_b32 v3, v3, v0, v13 ; GFX9-NEXT: v_lshlrev_b32_e32 v9, 24, v9 -; GFX9-NEXT: v_and_or_b32 v4, v4, s6, v15 +; GFX9-NEXT: v_and_or_b32 v4, v4, v0, v15 ; GFX9-NEXT: v_lshlrev_b32_e32 v10, 24, v10 ; GFX9-NEXT: v_lshlrev_b32_sdwa v18, v7, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX9-NEXT: v_lshlrev_b32_sdwa v7, v7, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 @@ -2472,45 +2460,43 @@ ; GFX8-NEXT: flat_load_dwordx4 v[3:6], v[0:1] ; GFX8-NEXT: v_mov_b32_e32 v0, 8 ; GFX8-NEXT: v_mov_b32_e32 v1, 16 -; GFX8-NEXT: v_mov_b32_e32 v7, 8 -; GFX8-NEXT: v_mov_b32_e32 v8, 16 -; GFX8-NEXT: v_lshrrev_b32_e32 v9, 2, v2 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v9 +; GFX8-NEXT: v_lshrrev_b32_e32 v7, 2, v2 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7 ; GFX8-NEXT: v_and_b32_e32 v2, 3, v2 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_sdwa v14, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX8-NEXT: v_lshrrev_b32_e32 v10, 24, v3 -; GFX8-NEXT: v_lshrrev_b32_e32 v11, 24, v4 -; GFX8-NEXT: v_lshlrev_b32_sdwa v15, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX8-NEXT: v_lshlrev_b32_sdwa v16, v7, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX8-NEXT: v_or_b32_sdwa v3, v3, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_lshrrev_b32_e32 v12, 24, v5 -; GFX8-NEXT: v_lshlrev_b32_sdwa v17, v8, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX8-NEXT: v_lshlrev_b32_sdwa v7, v7, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX8-NEXT: v_lshlrev_b32_e32 v10, 24, v10 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 24, v11 +; GFX8-NEXT: v_lshlrev_b32_sdwa v12, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX8-NEXT: v_lshlrev_b32_sdwa v14, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX8-NEXT: v_lshrrev_b32_e32 v8, 24, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v9, 24, v4 +; GFX8-NEXT: v_lshlrev_b32_sdwa v13, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX8-NEXT: v_lshlrev_b32_sdwa v15, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX8-NEXT: v_lshlrev_b32_sdwa v16, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX8-NEXT: v_or_b32_sdwa v3, v3, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v4, v4, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_lshrrev_b32_e32 v10, 24, v5 +; GFX8-NEXT: v_lshlrev_b32_sdwa v17, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v1, v6 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX8-NEXT: v_lshlrev_b32_e32 v8, 24, v8 +; GFX8-NEXT: v_lshlrev_b32_e32 v9, 24, v9 ; GFX8-NEXT: v_or_b32_sdwa v5, v5, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v3, v3, v15 +; GFX8-NEXT: v_or_b32_sdwa v0, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v3, v3, v13 +; GFX8-NEXT: v_or_b32_e32 v4, v4, v15 +; GFX8-NEXT: v_lshrrev_b32_e32 v11, 24, v6 +; GFX8-NEXT: v_lshlrev_b32_e32 v10, 24, v10 +; GFX8-NEXT: v_or_b32_e32 v5, v5, v17 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX8-NEXT: v_lshrrev_b32_e32 v13, 24, v6 -; GFX8-NEXT: v_lshlrev_b32_sdwa v8, v8, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX8-NEXT: v_lshlrev_b32_e32 v11, 24, v12 -; GFX8-NEXT: v_or_b32_sdwa v6, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v1, v5, v17 -; GFX8-NEXT: v_or_b32_e32 v3, v3, v10 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v4 -; GFX8-NEXT: v_lshlrev_b32_e32 v7, 24, v13 -; GFX8-NEXT: v_or_b32_e32 v5, v6, v8 -; GFX8-NEXT: v_or_b32_e32 v1, v1, v11 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 2, v9 -; GFX8-NEXT: v_or_b32_e32 v4, v5, v7 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 3, v9 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc +; GFX8-NEXT: v_or_b32_e32 v1, v3, v8 +; GFX8-NEXT: v_or_b32_e32 v3, v4, v9 +; GFX8-NEXT: v_lshlrev_b32_e32 v6, 24, v11 +; GFX8-NEXT: v_or_b32_e32 v4, v5, v10 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 2, v7 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v6 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 3, v7 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 3, v2 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, v1, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -2522,7 +2508,6 @@ ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: buffer_load_dwordx4 v[3:6], v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: s_movk_i32 s4, 0xff ; GFX7-NEXT: v_mov_b32_e32 v0, 0xff ; GFX7-NEXT: v_lshrrev_b32_e32 v17, 2, v2 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v17 @@ -2532,9 +2517,9 @@ ; GFX7-NEXT: v_bfe_u32 v13, v4, 8, 8 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 24, v3 ; GFX7-NEXT: v_lshrrev_b32_e32 v7, 24, v4 -; GFX7-NEXT: v_and_b32_e32 v10, s4, v3 +; GFX7-NEXT: v_and_b32_e32 v10, v3, v0 ; GFX7-NEXT: v_bfe_u32 v3, v3, 16, 8 -; GFX7-NEXT: v_and_b32_e32 v12, s4, v4 +; GFX7-NEXT: v_and_b32_e32 v12, v4, v0 ; GFX7-NEXT: v_bfe_u32 v4, v4, 16, 8 ; GFX7-NEXT: v_bfe_u32 v15, v5, 8, 8 ; GFX7-NEXT: v_lshlrev_b32_e32 v11, 8, v11 @@ -2582,10 +2567,7 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: global_load_dwordx4 v[3:6], v[0:1], off -; GFX10-NEXT: s_mov_b32 s4, 8 ; GFX10-NEXT: v_mov_b32_e32 v1, 8 -; GFX10-NEXT: s_mov_b32 s5, 16 -; GFX10-NEXT: s_movk_i32 s6, 0xff ; GFX10-NEXT: v_mov_b32_e32 v0, 0xff ; GFX10-NEXT: v_mov_b32_e32 v7, 16 ; GFX10-NEXT: v_lshrrev_b32_e32 v8, 2, v2 @@ -2594,15 +2576,15 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v9, 24, v3 ; GFX10-NEXT: v_lshrrev_b32_e32 v10, 24, v4 -; GFX10-NEXT: v_lshlrev_b32_sdwa v13, s4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX10-NEXT: v_lshlrev_b32_sdwa v15, s4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; 
GFX10-NEXT: v_lshlrev_b32_sdwa v13, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX10-NEXT: v_lshlrev_b32_sdwa v15, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX10-NEXT: v_lshrrev_b32_e32 v11, 24, v5 -; GFX10-NEXT: v_lshlrev_b32_sdwa v14, s5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_lshlrev_b32_sdwa v16, s5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX10-NEXT: v_lshlrev_b32_sdwa v14, v7, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX10-NEXT: v_lshlrev_b32_sdwa v16, v7, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_lshlrev_b32_sdwa v17, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX10-NEXT: v_and_or_b32 v3, v3, s6, v13 +; GFX10-NEXT: v_and_or_b32 v3, v3, v0, v13 ; GFX10-NEXT: v_lshlrev_b32_e32 v9, 24, v9 -; GFX10-NEXT: v_and_or_b32 v4, v4, s6, v15 +; GFX10-NEXT: v_and_or_b32 v4, v4, v0, v15 ; GFX10-NEXT: v_lshlrev_b32_e32 v10, 24, v10 ; GFX10-NEXT: v_lshrrev_b32_e32 v12, 24, v6 ; GFX10-NEXT: v_lshlrev_b32_sdwa v18, v7, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 @@ -2837,16 +2819,16 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off -; GFX9-NEXT: s_mov_b32 s4, 8 ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v2, 8 ; GFX9-NEXT: v_mov_b32_e32 v1, 0xff -; GFX9-NEXT: v_mov_b32_e32 v2, 16 -; GFX9-NEXT: v_lshrrev_b32_e32 v3, 24, v0 -; GFX9-NEXT: v_lshlrev_b32_sdwa v4, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_and_or_b32 v0, v0, v1, v4 -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 24, v3 -; GFX9-NEXT: v_or3_b32 v0, v0, v2, v1 +; GFX9-NEXT: v_mov_b32_e32 v3, 16 +; GFX9-NEXT: v_lshrrev_b32_e32 v4, 24, v0 +; GFX9-NEXT: v_lshlrev_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v3, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_and_or_b32 v0, v0, v1, v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 24, v4 +; GFX9-NEXT: v_or3_b32 v0, v0, v3, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -2893,15 +2875,15 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off -; GFX10-NEXT: s_mov_b32 s4, 8 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v1, 16 -; GFX10-NEXT: v_lshlrev_b32_sdwa v2, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX10-NEXT: v_mov_b32_e32 v1, 8 +; GFX10-NEXT: v_mov_b32_e32 v2, 16 +; GFX10-NEXT: v_lshlrev_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX10-NEXT: v_lshrrev_b32_e32 v3, 24, v0 -; GFX10-NEXT: v_lshlrev_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_and_or_b32 v0, 0xff, v0, v2 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 24, v3 -; GFX10-NEXT: v_or3_b32 v0, v0, v1, v2 +; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX10-NEXT: v_and_or_b32 v0, 0xff, v0, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 24, v3 +; GFX10-NEXT: v_or3_b32 v0, v0, v2, v1 ; 
GFX10-NEXT: v_lshrrev_b32_e32 v0, 8, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] %vector = load <16 x i8>, <16 x i8> addrspace(1)* %ptr @@ -2916,14 +2898,14 @@ ; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v2, 8 -; GFX9-NEXT: s_mov_b32 s4, 16 ; GFX9-NEXT: v_mov_b32_e32 v1, 0xff -; GFX9-NEXT: v_lshrrev_b32_e32 v3, 24, v0 +; GFX9-NEXT: v_mov_b32_e32 v3, 16 +; GFX9-NEXT: v_lshrrev_b32_e32 v4, 24, v0 ; GFX9-NEXT: v_lshlrev_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v4, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_lshlrev_b32_sdwa v3, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX9-NEXT: v_and_or_b32 v0, v0, v1, v2 -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 24, v3 -; GFX9-NEXT: v_or3_b32 v0, v0, v4, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 24, v4 +; GFX9-NEXT: v_or3_b32 v0, v0, v3, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -2972,13 +2954,13 @@ ; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v1, 8 -; GFX10-NEXT: s_mov_b32 s4, 16 +; GFX10-NEXT: v_mov_b32_e32 v2, 16 ; GFX10-NEXT: v_lshlrev_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX10-NEXT: v_lshrrev_b32_e32 v2, 24, v0 -; GFX10-NEXT: v_lshlrev_b32_sdwa v3, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX10-NEXT: v_lshrrev_b32_e32 v3, 24, v0 +; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_and_or_b32 v0, 0xff, v0, v1 -; GFX10-NEXT: v_lshlrev_b32_e32 v1, 24, v2 -; GFX10-NEXT: v_or3_b32 v0, v0, v3, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 24, v3 +; GFX10-NEXT: v_or3_b32 v0, v0, v2, v1 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] %vector = load <16 x i8>, <16 x i8> addrspace(1)* %ptr @@ -3141,16 +3123,16 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off -; GFX9-NEXT: s_mov_b32 s4, 8 ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v2, 8 ; GFX9-NEXT: v_mov_b32_e32 v0, 0xff -; GFX9-NEXT: v_mov_b32_e32 v2, 16 -; GFX9-NEXT: v_lshrrev_b32_e32 v3, 24, v1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v4, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v2, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_and_or_b32 v0, v1, v0, v4 -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 24, v3 -; GFX9-NEXT: v_or3_b32 v0, v0, v2, v1 +; GFX9-NEXT: v_mov_b32_e32 v3, 16 +; GFX9-NEXT: v_lshrrev_b32_e32 v4, 24, v1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v2, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v3, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_and_or_b32 v0, v1, v0, v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 24, v4 +; GFX9-NEXT: v_or3_b32 v0, v0, v3, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -3197,15 +3179,15 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off -; GFX10-NEXT: s_mov_b32 s4, 8 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, 16 -; GFX10-NEXT: 
v_lshlrev_b32_sdwa v2, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX10-NEXT: v_mov_b32_e32 v0, 8 +; GFX10-NEXT: v_mov_b32_e32 v2, 16 +; GFX10-NEXT: v_lshlrev_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX10-NEXT: v_lshrrev_b32_e32 v3, 24, v1 -; GFX10-NEXT: v_lshlrev_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_and_or_b32 v1, 0xff, v1, v2 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 24, v3 -; GFX10-NEXT: v_or3_b32 v0, v1, v0, v2 +; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX10-NEXT: v_and_or_b32 v0, 0xff, v1, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 24, v3 +; GFX10-NEXT: v_or3_b32 v0, v0, v2, v1 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 8, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] %vector = load <16 x i8>, <16 x i8> addrspace(1)* %ptr @@ -3220,14 +3202,14 @@ ; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v2, 8 -; GFX9-NEXT: s_mov_b32 s4, 16 ; GFX9-NEXT: v_mov_b32_e32 v0, 0xff -; GFX9-NEXT: v_lshrrev_b32_e32 v3, 24, v1 +; GFX9-NEXT: v_mov_b32_e32 v3, 16 +; GFX9-NEXT: v_lshrrev_b32_e32 v4, 24, v1 ; GFX9-NEXT: v_lshlrev_b32_sdwa v2, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v4, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_lshlrev_b32_sdwa v3, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX9-NEXT: v_and_or_b32 v0, v1, v0, v2 -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 24, v3 -; GFX9-NEXT: v_or3_b32 v0, v0, v4, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 24, v4 +; GFX9-NEXT: v_or3_b32 v0, v0, v3, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -3276,13 +3258,13 @@ ; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, 8 -; GFX10-NEXT: s_mov_b32 s4, 16 +; GFX10-NEXT: v_mov_b32_e32 v2, 16 ; GFX10-NEXT: v_lshlrev_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX10-NEXT: v_lshrrev_b32_e32 v2, 24, v1 -; GFX10-NEXT: v_lshlrev_b32_sdwa v3, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX10-NEXT: v_lshrrev_b32_e32 v3, 24, v1 +; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_and_or_b32 v0, 0xff, v1, v0 -; GFX10-NEXT: v_lshlrev_b32_e32 v1, 24, v2 -; GFX10-NEXT: v_or3_b32 v0, v0, v3, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 24, v3 +; GFX10-NEXT: v_or3_b32 v0, v0, v2, v1 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] %vector = load <16 x i8>, <16 x i8> addrspace(1)* %ptr @@ -3445,16 +3427,16 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off -; GFX9-NEXT: s_mov_b32 s4, 8 ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, 8 ; GFX9-NEXT: v_mov_b32_e32 v0, 0xff -; GFX9-NEXT: v_mov_b32_e32 v1, 16 -; GFX9-NEXT: v_lshrrev_b32_e32 v3, 24, v2 -; GFX9-NEXT: v_lshlrev_b32_sdwa v4, s4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_and_or_b32 v0, v2, v0, v4 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 24, v3 -; GFX9-NEXT: v_or3_b32 v0, 
v0, v1, v2 +; GFX9-NEXT: v_mov_b32_e32 v3, 16 +; GFX9-NEXT: v_lshrrev_b32_e32 v4, 24, v2 +; GFX9-NEXT: v_lshlrev_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_and_or_b32 v0, v2, v0, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 24, v4 +; GFX9-NEXT: v_or3_b32 v0, v0, v3, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -3501,15 +3483,15 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off -; GFX10-NEXT: s_mov_b32 s4, 8 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, 16 -; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX10-NEXT: v_mov_b32_e32 v0, 8 +; GFX10-NEXT: v_mov_b32_e32 v1, 16 +; GFX10-NEXT: v_lshlrev_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX10-NEXT: v_lshrrev_b32_e32 v3, 24, v2 -; GFX10-NEXT: v_lshlrev_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_and_or_b32 v1, 0xff, v2, v1 +; GFX10-NEXT: v_lshlrev_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX10-NEXT: v_and_or_b32 v0, 0xff, v2, v0 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 24, v3 -; GFX10-NEXT: v_or3_b32 v0, v1, v0, v2 +; GFX10-NEXT: v_or3_b32 v0, v0, v1, v2 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 8, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] %vector = load <16 x i8>, <16 x i8> addrspace(1)* %ptr @@ -3524,14 +3506,14 @@ ; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, 8 -; GFX9-NEXT: s_mov_b32 s4, 16 ; GFX9-NEXT: v_mov_b32_e32 v0, 0xff -; GFX9-NEXT: v_lshrrev_b32_e32 v3, 24, v2 +; GFX9-NEXT: v_mov_b32_e32 v3, 16 +; GFX9-NEXT: v_lshrrev_b32_e32 v4, 24, v2 ; GFX9-NEXT: v_lshlrev_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v4, s4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_lshlrev_b32_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX9-NEXT: v_and_or_b32 v0, v2, v0, v1 -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 24, v3 -; GFX9-NEXT: v_or3_b32 v0, v0, v4, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 24, v4 +; GFX9-NEXT: v_or3_b32 v0, v0, v3, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -3580,13 +3562,13 @@ ; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, 8 -; GFX10-NEXT: s_mov_b32 s4, 16 +; GFX10-NEXT: v_mov_b32_e32 v1, 16 ; GFX10-NEXT: v_lshlrev_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX10-NEXT: v_lshrrev_b32_e32 v1, 24, v2 -; GFX10-NEXT: v_lshlrev_b32_sdwa v3, s4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX10-NEXT: v_lshrrev_b32_e32 v3, 24, v2 +; GFX10-NEXT: v_lshlrev_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_and_or_b32 v0, 0xff, v2, v0 -; GFX10-NEXT: v_lshlrev_b32_e32 v1, 24, v1 -; GFX10-NEXT: v_or3_b32 v0, v0, v3, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 24, v3 +; GFX10-NEXT: v_or3_b32 v0, v0, v1, v2 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX10-NEXT: s_setpc_b64 
s[30:31] %vector = load <16 x i8>, <16 x i8> addrspace(1)* %ptr @@ -3749,16 +3731,16 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off -; GFX9-NEXT: s_mov_b32 s4, 8 ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, 8 ; GFX9-NEXT: v_mov_b32_e32 v0, 0xff -; GFX9-NEXT: v_mov_b32_e32 v1, 16 -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 24, v3 -; GFX9-NEXT: v_lshlrev_b32_sdwa v4, s4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_and_or_b32 v0, v3, v0, v4 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 24, v2 -; GFX9-NEXT: v_or3_b32 v0, v0, v1, v2 +; GFX9-NEXT: v_mov_b32_e32 v2, 16 +; GFX9-NEXT: v_lshrrev_b32_e32 v4, 24, v3 +; GFX9-NEXT: v_lshlrev_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_and_or_b32 v0, v3, v0, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 24, v4 +; GFX9-NEXT: v_or3_b32 v0, v0, v2, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -3805,15 +3787,15 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off -; GFX10-NEXT: s_mov_b32 s4, 8 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, 16 -; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX10-NEXT: v_mov_b32_e32 v0, 8 +; GFX10-NEXT: v_mov_b32_e32 v1, 16 +; GFX10-NEXT: v_lshlrev_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX10-NEXT: v_lshrrev_b32_e32 v2, 24, v3 -; GFX10-NEXT: v_lshlrev_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_and_or_b32 v1, 0xff, v3, v1 +; GFX10-NEXT: v_lshlrev_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX10-NEXT: v_and_or_b32 v0, 0xff, v3, v0 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 24, v2 -; GFX10-NEXT: v_or3_b32 v0, v1, v0, v2 +; GFX10-NEXT: v_or3_b32 v0, v0, v1, v2 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 8, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] %vector = load <16 x i8>, <16 x i8> addrspace(1)* %ptr @@ -3828,14 +3810,14 @@ ; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, 8 -; GFX9-NEXT: s_mov_b32 s4, 16 ; GFX9-NEXT: v_mov_b32_e32 v0, 0xff -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 24, v3 +; GFX9-NEXT: v_mov_b32_e32 v2, 16 +; GFX9-NEXT: v_lshrrev_b32_e32 v4, 24, v3 ; GFX9-NEXT: v_lshlrev_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v4, s4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_lshlrev_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX9-NEXT: v_and_or_b32 v0, v3, v0, v1 -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 24, v2 -; GFX9-NEXT: v_or3_b32 v0, v0, v4, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 24, v4 +; GFX9-NEXT: v_or3_b32 v0, v0, v2, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -3884,13 +3866,13 @@ ; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, 8 -; GFX10-NEXT: 
s_mov_b32 s4, 16 +; GFX10-NEXT: v_mov_b32_e32 v1, 16 ; GFX10-NEXT: v_lshlrev_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX10-NEXT: v_lshrrev_b32_e32 v1, 24, v3 -; GFX10-NEXT: v_lshlrev_b32_sdwa v2, s4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX10-NEXT: v_lshrrev_b32_e32 v2, 24, v3 +; GFX10-NEXT: v_lshlrev_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_and_or_b32 v0, 0xff, v3, v0 -; GFX10-NEXT: v_lshlrev_b32_e32 v1, 24, v1 -; GFX10-NEXT: v_or3_b32 v0, v0, v2, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 24, v2 +; GFX10-NEXT: v_or3_b32 v0, v0, v1, v2 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] %vector = load <16 x i8>, <16 x i8> addrspace(1)* %ptr Index: llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f32.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f32.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f32.ll @@ -824,19 +824,19 @@ ; GCN-FLUSH-LABEL: v_fdiv_v2f32_ulp25: ; GCN-FLUSH: ; %bb.0: ; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-FLUSH-NEXT: s_mov_b32 s4, 0x6f800000 -; GCN-FLUSH-NEXT: v_mov_b32_e32 v4, 0x2f800000 -; GCN-FLUSH-NEXT: v_cmp_gt_f32_e64 vcc, |v2|, s4 -; GCN-FLUSH-NEXT: v_cndmask_b32_e32 v5, 1.0, v4, vcc -; GCN-FLUSH-NEXT: v_cmp_gt_f32_e64 vcc, |v3|, s4 -; GCN-FLUSH-NEXT: v_cndmask_b32_e32 v4, 1.0, v4, vcc -; GCN-FLUSH-NEXT: v_mul_f32_e32 v2, v2, v5 +; GCN-FLUSH-NEXT: v_mov_b32_e32 v4, 0x6f800000 +; GCN-FLUSH-NEXT: v_mov_b32_e32 v5, 0x2f800000 +; GCN-FLUSH-NEXT: v_cmp_gt_f32_e64 vcc, |v2|, v4 +; GCN-FLUSH-NEXT: v_cndmask_b32_e32 v6, 1.0, v5, vcc +; GCN-FLUSH-NEXT: v_cmp_gt_f32_e64 vcc, |v3|, v4 +; GCN-FLUSH-NEXT: v_cndmask_b32_e32 v4, 1.0, v5, vcc +; GCN-FLUSH-NEXT: v_mul_f32_e32 v2, v2, v6 ; GCN-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v4 ; GCN-FLUSH-NEXT: v_rcp_f32_e32 v2, v2 ; GCN-FLUSH-NEXT: v_rcp_f32_e32 v3, v3 ; GCN-FLUSH-NEXT: v_mul_f32_e32 v0, v0, v2 ; GCN-FLUSH-NEXT: v_mul_f32_e32 v1, v1, v3 -; GCN-FLUSH-NEXT: v_mul_f32_e32 v0, v5, v0 +; GCN-FLUSH-NEXT: v_mul_f32_e32 v0, v6, v0 ; GCN-FLUSH-NEXT: v_mul_f32_e32 v1, v4, v1 ; GCN-FLUSH-NEXT: s_setpc_b64 s[30:31] ; @@ -901,20 +901,20 @@ ; GFX10-FLUSH: ; %bb.0: ; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-FLUSH-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-FLUSH-NEXT: s_mov_b32 s4, 0x6f800000 -; GFX10-FLUSH-NEXT: s_mov_b32 s5, 0x2f800000 -; GFX10-FLUSH-NEXT: v_cmp_gt_f32_e64 s6, |v2|, s4 -; GFX10-FLUSH-NEXT: v_cmp_gt_f32_e64 s4, |v3|, s4 -; GFX10-FLUSH-NEXT: v_cndmask_b32_e64 v4, 1.0, s5, s6 -; GFX10-FLUSH-NEXT: v_cndmask_b32_e64 v5, 1.0, s5, s4 -; GFX10-FLUSH-NEXT: v_mul_f32_e32 v2, v2, v4 -; GFX10-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v5 +; GFX10-FLUSH-NEXT: v_mov_b32_e32 v4, 0x6f800000 +; GFX10-FLUSH-NEXT: v_mov_b32_e32 v5, 0x2f800000 +; GFX10-FLUSH-NEXT: v_cmp_gt_f32_e64 vcc_lo, |v2|, v4 +; GFX10-FLUSH-NEXT: v_cndmask_b32_e32 v6, 1.0, v5, vcc_lo +; GFX10-FLUSH-NEXT: v_cmp_gt_f32_e64 vcc_lo, |v3|, v4 +; GFX10-FLUSH-NEXT: v_mul_f32_e32 v2, v2, v6 +; GFX10-FLUSH-NEXT: v_cndmask_b32_e32 v4, 1.0, v5, vcc_lo ; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v2, v2 +; GFX10-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v4 ; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v3, v3 ; GFX10-FLUSH-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX10-FLUSH-NEXT: v_mul_f32_e32 v0, v6, v0 ; GFX10-FLUSH-NEXT: v_mul_f32_e32 v1, v1, v3 -; GFX10-FLUSH-NEXT: v_mul_f32_e32 v0, v4, v0 -; GFX10-FLUSH-NEXT: v_mul_f32_e32 v1, v5, v1 +; 
GFX10-FLUSH-NEXT: v_mul_f32_e32 v1, v4, v1 ; GFX10-FLUSH-NEXT: s_setpc_b64 s[30:31] %fdiv = fdiv <2 x float> %a, %b, !fpmath !0 ret <2 x float> %fdiv @@ -1307,19 +1307,19 @@ ; GCN-IEEE-LABEL: v_rcp_v2f32_ulp25: ; GCN-IEEE: ; %bb.0: ; GCN-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-IEEE-NEXT: s_mov_b32 s4, 0x6f800000 -; GCN-IEEE-NEXT: v_mov_b32_e32 v2, 0x2f800000 -; GCN-IEEE-NEXT: v_cmp_gt_f32_e64 vcc, |v0|, s4 -; GCN-IEEE-NEXT: v_cndmask_b32_e32 v3, 1.0, v2, vcc -; GCN-IEEE-NEXT: v_cmp_gt_f32_e64 vcc, |v1|, s4 -; GCN-IEEE-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc -; GCN-IEEE-NEXT: v_mul_f32_e32 v0, v0, v3 +; GCN-IEEE-NEXT: v_mov_b32_e32 v2, 0x6f800000 +; GCN-IEEE-NEXT: v_mov_b32_e32 v3, 0x2f800000 +; GCN-IEEE-NEXT: v_cmp_gt_f32_e64 vcc, |v0|, v2 +; GCN-IEEE-NEXT: v_cndmask_b32_e32 v4, 1.0, v3, vcc +; GCN-IEEE-NEXT: v_cmp_gt_f32_e64 vcc, |v1|, v2 +; GCN-IEEE-NEXT: v_cndmask_b32_e32 v2, 1.0, v3, vcc +; GCN-IEEE-NEXT: v_mul_f32_e32 v0, v0, v4 ; GCN-IEEE-NEXT: v_mul_f32_e32 v1, v1, v2 ; GCN-IEEE-NEXT: v_rcp_f32_e32 v0, v0 ; GCN-IEEE-NEXT: v_rcp_f32_e32 v1, v1 ; GCN-IEEE-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GCN-IEEE-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-IEEE-NEXT: v_mul_f32_e32 v0, v3, v0 +; GCN-IEEE-NEXT: v_mul_f32_e32 v0, v4, v0 ; GCN-IEEE-NEXT: v_mul_f32_e32 v1, v2, v1 ; GCN-IEEE-NEXT: s_setpc_b64 s[30:31] ; @@ -1334,20 +1334,20 @@ ; GFX10-IEEE: ; %bb.0: ; GFX10-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-IEEE-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-IEEE-NEXT: s_mov_b32 s4, 0x6f800000 -; GFX10-IEEE-NEXT: s_mov_b32 s5, 0x2f800000 -; GFX10-IEEE-NEXT: v_cmp_gt_f32_e64 s6, |v0|, s4 -; GFX10-IEEE-NEXT: v_cmp_gt_f32_e64 s4, |v1|, s4 -; GFX10-IEEE-NEXT: v_cndmask_b32_e64 v2, 1.0, s5, s6 -; GFX10-IEEE-NEXT: v_cndmask_b32_e64 v3, 1.0, s5, s4 -; GFX10-IEEE-NEXT: v_mul_f32_e32 v0, v0, v2 -; GFX10-IEEE-NEXT: v_mul_f32_e32 v1, v1, v3 +; GFX10-IEEE-NEXT: v_mov_b32_e32 v2, 0x6f800000 +; GFX10-IEEE-NEXT: v_mov_b32_e32 v3, 0x2f800000 +; GFX10-IEEE-NEXT: v_cmp_gt_f32_e64 vcc_lo, |v0|, v2 +; GFX10-IEEE-NEXT: v_cndmask_b32_e32 v4, 1.0, v3, vcc_lo +; GFX10-IEEE-NEXT: v_cmp_gt_f32_e64 vcc_lo, |v1|, v2 +; GFX10-IEEE-NEXT: v_mul_f32_e32 v0, v0, v4 +; GFX10-IEEE-NEXT: v_cndmask_b32_e32 v2, 1.0, v3, vcc_lo ; GFX10-IEEE-NEXT: v_rcp_f32_e32 v0, v0 +; GFX10-IEEE-NEXT: v_mul_f32_e32 v1, v1, v2 ; GFX10-IEEE-NEXT: v_rcp_f32_e32 v1, v1 ; GFX10-IEEE-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX10-IEEE-NEXT: v_mul_f32_e32 v0, v4, v0 ; GFX10-IEEE-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX10-IEEE-NEXT: v_mul_f32_e32 v0, v2, v0 -; GFX10-IEEE-NEXT: v_mul_f32_e32 v1, v3, v1 +; GFX10-IEEE-NEXT: v_mul_f32_e32 v1, v2, v1 ; GFX10-IEEE-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-FLUSH-LABEL: v_rcp_v2f32_ulp25: @@ -1415,19 +1415,19 @@ ; GCN-FLUSH-LABEL: v_fdiv_v2f32_arcp_ulp25: ; GCN-FLUSH: ; %bb.0: ; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-FLUSH-NEXT: s_mov_b32 s4, 0x6f800000 -; GCN-FLUSH-NEXT: v_mov_b32_e32 v4, 0x2f800000 -; GCN-FLUSH-NEXT: v_cmp_gt_f32_e64 vcc, |v2|, s4 -; GCN-FLUSH-NEXT: v_cndmask_b32_e32 v5, 1.0, v4, vcc -; GCN-FLUSH-NEXT: v_cmp_gt_f32_e64 vcc, |v3|, s4 -; GCN-FLUSH-NEXT: v_cndmask_b32_e32 v4, 1.0, v4, vcc -; GCN-FLUSH-NEXT: v_mul_f32_e32 v2, v2, v5 +; GCN-FLUSH-NEXT: v_mov_b32_e32 v4, 0x6f800000 +; GCN-FLUSH-NEXT: v_mov_b32_e32 v5, 0x2f800000 +; GCN-FLUSH-NEXT: v_cmp_gt_f32_e64 vcc, |v2|, v4 +; GCN-FLUSH-NEXT: v_cndmask_b32_e32 v6, 1.0, v5, vcc +; GCN-FLUSH-NEXT: v_cmp_gt_f32_e64 vcc, |v3|, v4 +; GCN-FLUSH-NEXT: v_cndmask_b32_e32 v4, 1.0, v5, vcc +; GCN-FLUSH-NEXT: v_mul_f32_e32 v2, 
v2, v6 ; GCN-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v4 ; GCN-FLUSH-NEXT: v_rcp_f32_e32 v2, v2 ; GCN-FLUSH-NEXT: v_rcp_f32_e32 v3, v3 ; GCN-FLUSH-NEXT: v_mul_f32_e32 v0, v0, v2 ; GCN-FLUSH-NEXT: v_mul_f32_e32 v1, v1, v3 -; GCN-FLUSH-NEXT: v_mul_f32_e32 v0, v5, v0 +; GCN-FLUSH-NEXT: v_mul_f32_e32 v0, v6, v0 ; GCN-FLUSH-NEXT: v_mul_f32_e32 v1, v4, v1 ; GCN-FLUSH-NEXT: s_setpc_b64 s[30:31] ; @@ -1492,20 +1492,20 @@ ; GFX10-FLUSH: ; %bb.0: ; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-FLUSH-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-FLUSH-NEXT: s_mov_b32 s4, 0x6f800000 -; GFX10-FLUSH-NEXT: s_mov_b32 s5, 0x2f800000 -; GFX10-FLUSH-NEXT: v_cmp_gt_f32_e64 s6, |v2|, s4 -; GFX10-FLUSH-NEXT: v_cmp_gt_f32_e64 s4, |v3|, s4 -; GFX10-FLUSH-NEXT: v_cndmask_b32_e64 v4, 1.0, s5, s6 -; GFX10-FLUSH-NEXT: v_cndmask_b32_e64 v5, 1.0, s5, s4 -; GFX10-FLUSH-NEXT: v_mul_f32_e32 v2, v2, v4 -; GFX10-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v5 +; GFX10-FLUSH-NEXT: v_mov_b32_e32 v4, 0x6f800000 +; GFX10-FLUSH-NEXT: v_mov_b32_e32 v5, 0x2f800000 +; GFX10-FLUSH-NEXT: v_cmp_gt_f32_e64 vcc_lo, |v2|, v4 +; GFX10-FLUSH-NEXT: v_cndmask_b32_e32 v6, 1.0, v5, vcc_lo +; GFX10-FLUSH-NEXT: v_cmp_gt_f32_e64 vcc_lo, |v3|, v4 +; GFX10-FLUSH-NEXT: v_mul_f32_e32 v2, v2, v6 +; GFX10-FLUSH-NEXT: v_cndmask_b32_e32 v4, 1.0, v5, vcc_lo ; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v2, v2 +; GFX10-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v4 ; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v3, v3 ; GFX10-FLUSH-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX10-FLUSH-NEXT: v_mul_f32_e32 v0, v6, v0 ; GFX10-FLUSH-NEXT: v_mul_f32_e32 v1, v1, v3 -; GFX10-FLUSH-NEXT: v_mul_f32_e32 v0, v4, v0 -; GFX10-FLUSH-NEXT: v_mul_f32_e32 v1, v5, v1 +; GFX10-FLUSH-NEXT: v_mul_f32_e32 v1, v4, v1 ; GFX10-FLUSH-NEXT: s_setpc_b64 s[30:31] %fdiv = fdiv arcp <2 x float> %a, %b, !fpmath !0 ret <2 x float> %fdiv Index: llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll @@ -523,14 +523,14 @@ ; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, 13 ; GFX9-NEXT: s_mov_b32 vcc_hi, 0 -; GFX9-NEXT: s_movk_i32 s0, 0x3e80 ; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:4 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, 15 -; GFX9-NEXT: s_add_i32 s0, s0, 4 -; GFX9-NEXT: scratch_store_dword off, v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v0, 0x3e80 +; GFX9-NEXT: v_add_u32_e32 v0, 4, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, 15 +; GFX9-NEXT: scratch_store_dword v0, v1, off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: scratch_load_dword v0, off, s0 glc +; GFX9-NEXT: scratch_load_dword v0, v0, off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_endpgm ; @@ -540,15 +540,15 @@ ; GFX10-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 -; GFX10-NEXT: v_mov_b32_e32 v0, 13 -; GFX10-NEXT: v_mov_b32_e32 v1, 15 -; GFX10-NEXT: s_movk_i32 s0, 0x3e80 -; GFX10-NEXT: s_add_i32 s0, s0, 4 -; GFX10-NEXT: scratch_store_dword off, v0, off offset:4 +; GFX10-NEXT: v_mov_b32_e32 v0, 0x3e80 +; GFX10-NEXT: v_mov_b32_e32 v1, 13 +; GFX10-NEXT: v_mov_b32_e32 v2, 15 +; GFX10-NEXT: v_add_nc_u32_e32 v0, 4, v0 +; GFX10-NEXT: scratch_store_dword off, v1, off offset:4 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: scratch_store_dword off, v1, s0 +; GFX10-NEXT: scratch_store_dword v0, v2, off ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; 
GFX10-NEXT: scratch_load_dword v0, off, s0 glc dlc +; GFX10-NEXT: scratch_load_dword v0, v0, off glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_endpgm bb: @@ -567,14 +567,14 @@ ; GFX9: ; %bb.0: ; %bb ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, 13 -; GFX9-NEXT: s_movk_i32 s0, 0x3e80 ; GFX9-NEXT: scratch_store_dword off, v0, s32 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, 15 -; GFX9-NEXT: s_add_i32 s0, s0, s32 -; GFX9-NEXT: scratch_store_dword off, v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v0, 0x3e80 +; GFX9-NEXT: v_add_u32_e32 v0, s32, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, 15 +; GFX9-NEXT: scratch_store_dword v0, v1, off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: scratch_load_dword v0, off, s0 glc +; GFX9-NEXT: scratch_load_dword v0, v0, off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -582,15 +582,15 @@ ; GFX10: ; %bb.0: ; %bb ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_mov_b32_e32 v0, 13 -; GFX10-NEXT: v_mov_b32_e32 v1, 15 -; GFX10-NEXT: s_movk_i32 s0, 0x3e80 -; GFX10-NEXT: s_add_i32 s0, s0, s32 -; GFX10-NEXT: scratch_store_dword off, v0, s32 +; GFX10-NEXT: v_mov_b32_e32 v0, 0x3e80 +; GFX10-NEXT: v_mov_b32_e32 v1, 13 +; GFX10-NEXT: v_mov_b32_e32 v2, 15 +; GFX10-NEXT: v_add_nc_u32_e32 v0, s32, v0 +; GFX10-NEXT: scratch_store_dword off, v1, s32 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: scratch_store_dword off, v1, s0 +; GFX10-NEXT: scratch_store_dword v0, v2, off ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: scratch_load_dword v0, off, s0 glc dlc +; GFX10-NEXT: scratch_load_dword v0, v0, off glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] bb: Index: llvm/test/CodeGen/AMDGPU/GlobalISel/fmul.v2f16.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/fmul.v2f16.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/fmul.v2f16.ll @@ -303,12 +303,11 @@ ; GFX8-NEXT: v_mul_f16_e32 v4, v2, v5 ; GFX8-NEXT: v_mul_f16_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_mov_b32_e32 v5, 16 -; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX8-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_mov_b32_e32 v3, 16 ; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX8-NEXT: v_or_b32_sdwa v0, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -347,12 +346,11 @@ ; GFX8-NEXT: v_mul_f16_e32 v4, v2, v5 ; GFX8-NEXT: v_mul_f16_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_mov_b32_e32 v5, 16 -; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD 
src1_sel:WORD_0 -; GFX8-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_mov_b32_e32 v3, 16 ; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX8-NEXT: v_or_b32_sdwa v0, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -392,12 +390,11 @@ ; GFX8-NEXT: v_mul_f16_e32 v4, v2, v5 ; GFX8-NEXT: v_mul_f16_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_mov_b32_e32 v5, 16 -; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX8-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_mov_b32_e32 v3, 16 ; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX8-NEXT: v_or_b32_sdwa v0, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -433,12 +430,11 @@ ; GFX8-NEXT: v_mul_f16_e32 v4, v2, v5 ; GFX8-NEXT: v_mul_f16_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_mov_b32_e32 v5, 16 -; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX8-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_mov_b32_e32 v3, 16 ; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX8-NEXT: v_or_b32_sdwa v0, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -479,7 +475,6 @@ ; GFX8-NEXT: v_mul_f16_sdwa v3, v3, v7 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_mov_b32_e32 v7, 16 ; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX8-NEXT: v_mov_b32_e32 v7, 16 ; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v7, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 @@ -530,7 +525,6 @@ ; GFX8-NEXT: v_mul_f16_sdwa v3, v3, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_mov_b32_e32 v7, 16 ; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX8-NEXT: v_mov_b32_e32 v7, 16 ; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v7, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 @@ -582,7 +576,6 @@ ; GFX8-NEXT: v_mul_f16_sdwa v3, v3, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_mov_b32_e32 v7, 16 ; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX8-NEXT: v_mov_b32_e32 v7, 16 ; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v7, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 @@ -629,7 +622,6 @@ ; GFX8-NEXT: v_mul_f16_sdwa v3, v3, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_mov_b32_e32 v7, 16 ; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX8-NEXT: v_mov_b32_e32 v7, 16 ; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v7, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 Index: llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll @@ -140,7 +140,8 @@ ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v3, 7 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v3, v3 -; GFX6-NEXT: v_and_b32_e32 v2, 0x7f, v2 +; GFX6-NEXT: v_mov_b32_e32 v5, 0x7f +; GFX6-NEXT: v_and_b32_e32 v2, v2, v5 ; GFX6-NEXT: v_bfe_u32 v1, v1, 1, 6 ; GFX6-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 ; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v3 @@ -148,7 +149,6 @@ ; GFX6-NEXT: v_mul_hi_u32 v4, v3, v4 ; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v4 ; GFX6-NEXT: v_mul_hi_u32 v3, v2, v3 -; GFX6-NEXT: v_mov_b32_e32 v4, 0x7f ; GFX6-NEXT: v_mul_lo_u32 v3, v3, 7 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v3 ; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, 7, v2 @@ -158,9 +158,9 @@ ; GFX6-NEXT: v_cmp_le_u32_e32 vcc, 7, v2 ; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc ; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 6, v2 -; GFX6-NEXT: v_and_b32_e32 v2, v2, v4 +; GFX6-NEXT: 
v_and_b32_e32 v2, v2, v5 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, v2, v0 -; GFX6-NEXT: v_and_b32_e32 v2, v3, v4 +; GFX6-NEXT: v_and_b32_e32 v2, v3, v5 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, v2, v1 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -170,16 +170,16 @@ ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v3, 7 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v3, v3 -; GFX8-NEXT: v_and_b32_e32 v2, 0x7f, v2 +; GFX8-NEXT: v_mov_b32_e32 v5, 0x7f +; GFX8-NEXT: v_and_b32_e32 v2, v2, v5 +; GFX8-NEXT: v_and_b32_e32 v1, v1, v5 ; GFX8-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 ; GFX8-NEXT: v_cvt_u32_f32_e32 v3, v3 +; GFX8-NEXT: v_lshrrev_b16_e32 v1, 1, v1 ; GFX8-NEXT: v_mul_lo_u32 v4, -7, v3 ; GFX8-NEXT: v_mul_hi_u32 v4, v3, v4 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v4 ; GFX8-NEXT: v_mul_hi_u32 v3, v2, v3 -; GFX8-NEXT: v_mov_b32_e32 v4, 0x7f -; GFX8-NEXT: v_and_b32_e32 v1, v1, v4 -; GFX8-NEXT: v_lshrrev_b16_e32 v1, 1, v1 ; GFX8-NEXT: v_mul_lo_u32 v3, v3, 7 ; GFX8-NEXT: v_sub_u32_e32 v2, vcc, v2, v3 ; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, 7, v2 @@ -189,9 +189,9 @@ ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, 7, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc ; GFX8-NEXT: v_sub_u16_e32 v3, 6, v2 -; GFX8-NEXT: v_and_b32_e32 v2, v2, v4 +; GFX8-NEXT: v_and_b32_e32 v2, v2, v5 ; GFX8-NEXT: v_lshlrev_b16_e32 v0, v2, v0 -; GFX8-NEXT: v_and_b32_e32 v2, v3, v4 +; GFX8-NEXT: v_and_b32_e32 v2, v3, v5 ; GFX8-NEXT: v_lshrrev_b16_e32 v1, v2, v1 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -201,16 +201,16 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v3, 7 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v3 -; GFX9-NEXT: v_and_b32_e32 v2, 0x7f, v2 +; GFX9-NEXT: v_mov_b32_e32 v5, 0x7f +; GFX9-NEXT: v_and_b32_e32 v2, v2, v5 +; GFX9-NEXT: v_and_b32_e32 v1, v1, v5 ; GFX9-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 ; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v3 +; GFX9-NEXT: v_lshrrev_b16_e32 v1, 1, v1 ; GFX9-NEXT: v_mul_lo_u32 v4, -7, v3 ; GFX9-NEXT: v_mul_hi_u32 v4, v3, v4 ; GFX9-NEXT: v_add_u32_e32 v3, v3, v4 ; GFX9-NEXT: v_mul_hi_u32 v3, v2, v3 -; GFX9-NEXT: v_mov_b32_e32 v4, 0x7f -; GFX9-NEXT: v_and_b32_e32 v1, v1, v4 -; GFX9-NEXT: v_lshrrev_b16_e32 v1, 1, v1 ; GFX9-NEXT: v_mul_lo_u32 v3, v3, 7 ; GFX9-NEXT: v_sub_u32_e32 v2, v2, v3 ; GFX9-NEXT: v_subrev_u32_e32 v3, 7, v2 @@ -220,9 +220,9 @@ ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, 7, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc ; GFX9-NEXT: v_sub_u16_e32 v3, 6, v2 -; GFX9-NEXT: v_and_b32_e32 v2, v2, v4 +; GFX9-NEXT: v_and_b32_e32 v2, v2, v5 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, v2, v0 -; GFX9-NEXT: v_and_b32_e32 v2, v3, v4 +; GFX9-NEXT: v_and_b32_e32 v2, v3, v5 ; GFX9-NEXT: v_lshrrev_b16_e32 v1, v2, v1 ; GFX9-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -232,9 +232,10 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v3, 7 -; GFX10-NEXT: v_and_b32_e32 v2, 0x7f, v2 -; GFX10-NEXT: v_and_b32_e32 v1, 0x7f, v1 +; GFX10-NEXT: v_mov_b32_e32 v5, 0x7f ; GFX10-NEXT: v_rcp_iflag_f32_e32 v3, v3 +; GFX10-NEXT: v_and_b32_e32 v2, v2, v5 +; GFX10-NEXT: v_and_b32_e32 v1, v1, v5 ; GFX10-NEXT: v_lshrrev_b16 v1, 1, v1 ; GFX10-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 ; GFX10-NEXT: v_cvt_u32_f32_e32 v3, v3 @@ -250,10 +251,9 @@ ; GFX10-NEXT: v_subrev_nc_u32_e32 v3, 7, v2 ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 7, v2 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo -; GFX10-NEXT: v_mov_b32_e32 v3, 0x7f -; 
GFX10-NEXT: v_sub_nc_u16 v4, 6, v2 -; GFX10-NEXT: v_and_b32_e32 v2, v2, v3 -; GFX10-NEXT: v_and_b32_e32 v3, v4, v3 +; GFX10-NEXT: v_sub_nc_u16 v3, 6, v2 +; GFX10-NEXT: v_and_b32_e32 v2, v2, v5 +; GFX10-NEXT: v_and_b32_e32 v3, v3, v5 ; GFX10-NEXT: v_lshlrev_b16 v0, v2, v0 ; GFX10-NEXT: v_lshrrev_b16 v1, v3, v1 ; GFX10-NEXT: v_or_b32_e32 v0, v0, v1 @@ -663,10 +663,10 @@ ; GFX6-NEXT: v_lshrrev_b32_e32 v1, 1, v1 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, v2, v3 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, v4, v1 +; GFX6-NEXT: v_mov_b32_e32 v6, 0xff ; GFX6-NEXT: v_or_b32_e32 v1, v2, v1 -; GFX6-NEXT: v_mov_b32_e32 v2, 0xff -; GFX6-NEXT: v_and_b32_e32 v1, v1, v2 -; GFX6-NEXT: v_and_b32_e32 v0, v0, v2 +; GFX6-NEXT: v_and_b32_e32 v1, v1, v6 +; GFX6-NEXT: v_and_b32_e32 v0, v0, v6 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -727,27 +727,27 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_lshrrev_b32_e32 v3, 8, v2 -; GFX10-NEXT: v_lshrrev_b32_e32 v4, 8, v1 -; GFX10-NEXT: s_movk_i32 s4, 0xff -; GFX10-NEXT: v_lshrrev_b32_e32 v5, 8, v0 -; GFX10-NEXT: v_xor_b32_e32 v7, -1, v2 -; GFX10-NEXT: v_xor_b32_e32 v6, -1, v3 -; GFX10-NEXT: v_and_b32_e32 v4, s4, v4 -; GFX10-NEXT: v_and_b32_e32 v1, s4, v1 -; GFX10-NEXT: v_and_b32_e32 v3, 7, v3 +; GFX10-NEXT: v_mov_b32_e32 v3, 0xff +; GFX10-NEXT: v_lshrrev_b32_e32 v4, 8, v2 +; GFX10-NEXT: v_lshrrev_b32_e32 v5, 8, v1 +; GFX10-NEXT: v_lshrrev_b32_e32 v6, 8, v0 +; GFX10-NEXT: v_xor_b32_e32 v8, -1, v2 +; GFX10-NEXT: v_and_b32_e32 v1, v1, v3 +; GFX10-NEXT: v_xor_b32_e32 v7, -1, v4 +; GFX10-NEXT: v_and_b32_e32 v5, v5, v3 +; GFX10-NEXT: v_and_b32_e32 v4, 7, v4 ; GFX10-NEXT: v_and_b32_e32 v2, 7, v2 -; GFX10-NEXT: v_and_b32_e32 v6, 7, v6 -; GFX10-NEXT: v_lshrrev_b16 v4, 1, v4 +; GFX10-NEXT: v_and_b32_e32 v8, 7, v8 ; GFX10-NEXT: v_and_b32_e32 v7, 7, v7 +; GFX10-NEXT: v_lshrrev_b16 v5, 1, v5 ; GFX10-NEXT: v_lshrrev_b16 v1, 1, v1 -; GFX10-NEXT: v_lshlrev_b16 v3, v3, v5 +; GFX10-NEXT: v_lshlrev_b16 v4, v4, v6 ; GFX10-NEXT: v_lshlrev_b16 v0, v2, v0 -; GFX10-NEXT: v_lshrrev_b16 v4, v6, v4 -; GFX10-NEXT: v_lshrrev_b16 v1, v7, v1 -; GFX10-NEXT: v_or_b32_e32 v2, v3, v4 +; GFX10-NEXT: v_lshrrev_b16 v5, v7, v5 +; GFX10-NEXT: v_lshrrev_b16 v1, v8, v1 +; GFX10-NEXT: v_or_b32_e32 v2, v4, v5 ; GFX10-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX10-NEXT: v_and_b32_sdwa v1, v2, s4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_and_b32_sdwa v1, v2, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-NEXT: s_setpc_b64 s[30:31] %lhs = bitcast i16 %lhs.arg to <2 x i8> @@ -1006,23 +1006,23 @@ ; GFX6-NEXT: v_bfe_u32 v4, v1, 16, 8 ; GFX6-NEXT: v_and_b32_e32 v6, 7, v6 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 1, v4 -; GFX6-NEXT: s_movk_i32 s4, 0xff +; GFX6-NEXT: v_mov_b32_e32 v10, 0xff ; GFX6-NEXT: v_lshrrev_b32_e32 v4, v6, v4 ; GFX6-NEXT: v_xor_b32_e32 v6, -1, v8 ; GFX6-NEXT: v_or_b32_e32 v3, v3, v4 ; GFX6-NEXT: v_and_b32_e32 v4, 7, v8 ; GFX6-NEXT: v_and_b32_e32 v6, 7, v6 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, 25, v1 -; GFX6-NEXT: v_and_b32_e32 v2, s4, v2 +; GFX6-NEXT: v_and_b32_e32 v2, v2, v10 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, v4, v5 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, v6, v1 -; GFX6-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX6-NEXT: v_and_b32_e32 v0, v0, v10 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; GFX6-NEXT: v_or_b32_e32 
v1, v4, v1 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX6-NEXT: v_and_b32_e32 v2, s4, v3 +; GFX6-NEXT: v_and_b32_e32 v2, v3, v10 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX6-NEXT: v_and_b32_e32 v1, v1, v10 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 24, v1 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 @@ -1069,12 +1069,11 @@ ; GFX8-NEXT: v_lshrrev_b16_e32 v1, v6, v1 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX8-NEXT: v_mov_b32_e32 v1, 8 -; GFX8-NEXT: s_movk_i32 s4, 0xff ; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX8-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_e32 v2, s4, v4 +; GFX8-NEXT: v_and_b32_e32 v2, v4, v9 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX8-NEXT: v_and_b32_e32 v0, v0, v9 ; GFX8-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 24, v0 ; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 @@ -1088,9 +1087,9 @@ ; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v2 ; GFX9-NEXT: v_and_b32_e32 v8, 7, v2 ; GFX9-NEXT: v_xor_b32_e32 v2, -1, v2 -; GFX9-NEXT: s_mov_b32 s5, 1 +; GFX9-NEXT: s_mov_b32 s4, 1 ; GFX9-NEXT: v_and_b32_e32 v2, 7, v2 -; GFX9-NEXT: v_lshrrev_b16_sdwa v10, s5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_lshrrev_b16_sdwa v10, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_lshlrev_b16_e32 v8, v8, v0 ; GFX9-NEXT: v_lshrrev_b16_e32 v2, v2, v10 ; GFX9-NEXT: v_lshrrev_b32_e32 v4, 8, v1 @@ -1099,7 +1098,7 @@ ; GFX9-NEXT: v_xor_b32_e32 v5, -1, v5 ; GFX9-NEXT: v_lshrrev_b32_e32 v3, 8, v0 ; GFX9-NEXT: v_and_b32_e32 v5, 7, v5 -; GFX9-NEXT: v_lshrrev_b16_sdwa v4, s5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_lshrrev_b16_sdwa v4, s4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_mov_b32_e32 v9, 0xff ; GFX9-NEXT: v_lshlrev_b16_e32 v3, v8, v3 ; GFX9-NEXT: v_lshrrev_b16_e32 v4, v5, v4 @@ -1121,11 +1120,10 @@ ; GFX9-NEXT: v_lshrrev_b16_e32 v1, v6, v1 ; GFX9-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX9-NEXT: v_mov_b32_e32 v1, 8 -; GFX9-NEXT: s_movk_i32 s4, 0xff ; GFX9-NEXT: v_lshlrev_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_and_or_b32 v1, v2, s4, v1 -; GFX9-NEXT: v_and_b32_e32 v2, s4, v4 -; GFX9-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX9-NEXT: v_and_or_b32 v1, v2, v9, v1 +; GFX9-NEXT: v_and_b32_e32 v2, v4, v9 +; GFX9-NEXT: v_and_b32_e32 v0, v0, v9 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 24, v0 ; GFX9-NEXT: v_or3_b32 v0, v1, v2, v0 @@ -1136,53 +1134,52 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_lshrrev_b32_e32 v8, 8, v2 -; GFX10-NEXT: v_and_b32_e32 v11, 7, v2 +; GFX10-NEXT: v_mov_b32_e32 v9, 0xff ; GFX10-NEXT: v_lshrrev_b32_e32 v3, 8, v0 -; GFX10-NEXT: v_lshrrev_b32_e32 v4, 16, v0 -; GFX10-NEXT: v_lshrrev_b32_e32 v5, 24, v0 ; GFX10-NEXT: v_lshrrev_b32_e32 v6, 8, v1 -; GFX10-NEXT: v_lshrrev_b32_e32 v9, 16, v2 ; GFX10-NEXT: v_xor_b32_e32 v10, -1, v2 -; GFX10-NEXT: v_lshrrev_b32_e32 v2, 24, v2 -; GFX10-NEXT: v_lshlrev_b16 v0, v11, v0 -; GFX10-NEXT: v_xor_b32_e32 v11, -1, v8 +; GFX10-NEXT: v_lshrrev_b32_e32 v11, 16, v2 +; GFX10-NEXT: v_lshrrev_b32_e32 v12, 24, v2 +; GFX10-NEXT: v_and_b32_e32 v2, 7, v2 +; GFX10-NEXT: v_and_b32_e32 v13, 
v1, v9 +; GFX10-NEXT: v_xor_b32_e32 v14, -1, v8 ; GFX10-NEXT: v_and_b32_e32 v8, 7, v8 -; GFX10-NEXT: v_mov_b32_e32 v13, 0xff -; GFX10-NEXT: s_movk_i32 s4, 0xff +; GFX10-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; GFX10-NEXT: v_lshrrev_b32_e32 v5, 24, v0 ; GFX10-NEXT: v_lshrrev_b32_e32 v7, 24, v1 -; GFX10-NEXT: v_and_b32_e32 v12, s4, v1 -; GFX10-NEXT: v_and_b32_e32 v6, s4, v6 +; GFX10-NEXT: v_lshlrev_b16 v0, v2, v0 +; GFX10-NEXT: v_lshrrev_b16 v2, 1, v13 +; GFX10-NEXT: v_and_b32_e32 v6, v6, v9 +; GFX10-NEXT: v_and_b32_e32 v13, 7, v14 ; GFX10-NEXT: v_lshlrev_b16 v3, v8, v3 -; GFX10-NEXT: v_xor_b32_e32 v8, -1, v9 -; GFX10-NEXT: v_and_b32_sdwa v1, v1, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX10-NEXT: v_xor_b32_e32 v13, -1, v2 -; GFX10-NEXT: v_and_b32_e32 v11, 7, v11 +; GFX10-NEXT: v_xor_b32_e32 v8, -1, v11 +; GFX10-NEXT: v_and_b32_sdwa v1, v1, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_xor_b32_e32 v14, -1, v12 ; GFX10-NEXT: v_lshrrev_b16 v6, 1, v6 -; GFX10-NEXT: v_and_b32_e32 v9, 7, v9 +; GFX10-NEXT: v_and_b32_e32 v11, 7, v11 ; GFX10-NEXT: v_and_b32_e32 v8, 7, v8 ; GFX10-NEXT: v_lshrrev_b16 v1, 1, v1 -; GFX10-NEXT: v_and_b32_e32 v2, 7, v2 -; GFX10-NEXT: v_and_b32_e32 v13, 7, v13 +; GFX10-NEXT: v_and_b32_e32 v12, 7, v12 +; GFX10-NEXT: v_and_b32_e32 v14, 7, v14 ; GFX10-NEXT: v_lshrrev_b16 v7, 1, v7 ; GFX10-NEXT: v_and_b32_e32 v10, 7, v10 -; GFX10-NEXT: v_lshrrev_b16 v12, 1, v12 -; GFX10-NEXT: v_lshrrev_b16 v6, v11, v6 -; GFX10-NEXT: v_lshlrev_b16 v4, v9, v4 +; GFX10-NEXT: v_lshrrev_b16 v6, v13, v6 +; GFX10-NEXT: v_lshlrev_b16 v4, v11, v4 ; GFX10-NEXT: v_lshrrev_b16 v1, v8, v1 -; GFX10-NEXT: v_lshlrev_b16 v2, v2, v5 -; GFX10-NEXT: v_lshrrev_b16 v5, v13, v7 -; GFX10-NEXT: v_lshrrev_b16 v7, v10, v12 +; GFX10-NEXT: v_lshlrev_b16 v5, v12, v5 +; GFX10-NEXT: v_lshrrev_b16 v7, v14, v7 +; GFX10-NEXT: v_lshrrev_b16 v2, v10, v2 ; GFX10-NEXT: v_or_b32_e32 v3, v3, v6 ; GFX10-NEXT: v_mov_b32_e32 v6, 8 ; GFX10-NEXT: v_or_b32_e32 v1, v4, v1 -; GFX10-NEXT: v_or_b32_e32 v2, v2, v5 -; GFX10-NEXT: v_or_b32_e32 v0, v0, v7 -; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX10-NEXT: v_and_b32_e32 v1, s4, v1 -; GFX10-NEXT: v_and_b32_e32 v2, s4, v2 -; GFX10-NEXT: v_and_or_b32 v0, v0, s4, v3 +; GFX10-NEXT: v_or_b32_e32 v4, v5, v7 +; GFX10-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: v_and_b32_e32 v1, v1, v9 +; GFX10-NEXT: v_and_b32_e32 v3, v4, v9 +; GFX10-NEXT: v_and_or_b32 v0, v0, v9, v2 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 24, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 24, v3 ; GFX10-NEXT: v_or3_b32 v0, v0, v1, v2 ; GFX10-NEXT: s_setpc_b64 s[30:31] %lhs = bitcast i32 %lhs.arg to <4 x i8> @@ -1325,15 +1322,15 @@ ; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v3, 24 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v3, v3 ; GFX6-NEXT: v_mov_b32_e32 v4, 0xffffffe8 -; GFX6-NEXT: v_and_b32_e32 v2, 0xffffff, v2 -; GFX6-NEXT: v_bfe_u32 v1, v1, 1, 23 +; GFX6-NEXT: v_mov_b32_e32 v5, 0xffffff +; GFX6-NEXT: v_and_b32_e32 v2, v2, v5 ; GFX6-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 ; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v3 +; GFX6-NEXT: v_bfe_u32 v1, v1, 1, 23 ; GFX6-NEXT: v_mul_lo_u32 v4, v4, v3 ; GFX6-NEXT: v_mul_hi_u32 v4, v3, v4 ; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v4 ; GFX6-NEXT: v_mul_hi_u32 v3, v2, v3 -; GFX6-NEXT: v_mov_b32_e32 v4, 0xffffff ; GFX6-NEXT: v_mul_lo_u32 v3, 
v3, 24 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v3 ; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, 24, v2 @@ -1343,9 +1340,9 @@ ; GFX6-NEXT: v_cmp_le_u32_e32 vcc, 24, v2 ; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc ; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 23, v2 -; GFX6-NEXT: v_and_b32_e32 v2, v2, v4 +; GFX6-NEXT: v_and_b32_e32 v2, v2, v5 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, v2, v0 -; GFX6-NEXT: v_and_b32_e32 v2, v3, v4 +; GFX6-NEXT: v_and_b32_e32 v2, v3, v5 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, v2, v1 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -1356,15 +1353,15 @@ ; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v3, 24 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v3, v3 ; GFX8-NEXT: v_mov_b32_e32 v4, 0xffffffe8 -; GFX8-NEXT: v_and_b32_e32 v2, 0xffffff, v2 -; GFX8-NEXT: v_bfe_u32 v1, v1, 1, 23 +; GFX8-NEXT: v_mov_b32_e32 v5, 0xffffff +; GFX8-NEXT: v_and_b32_e32 v2, v2, v5 ; GFX8-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 ; GFX8-NEXT: v_cvt_u32_f32_e32 v3, v3 +; GFX8-NEXT: v_bfe_u32 v1, v1, 1, 23 ; GFX8-NEXT: v_mul_lo_u32 v4, v4, v3 ; GFX8-NEXT: v_mul_hi_u32 v4, v3, v4 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v4 ; GFX8-NEXT: v_mul_hi_u32 v3, v2, v3 -; GFX8-NEXT: v_mov_b32_e32 v4, 0xffffff ; GFX8-NEXT: v_mul_lo_u32 v3, v3, 24 ; GFX8-NEXT: v_sub_u32_e32 v2, vcc, v2, v3 ; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, 24, v2 @@ -1374,9 +1371,9 @@ ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, 24, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc ; GFX8-NEXT: v_sub_u32_e32 v3, vcc, 23, v2 -; GFX8-NEXT: v_and_b32_e32 v2, v2, v4 +; GFX8-NEXT: v_and_b32_e32 v2, v2, v5 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, v2, v0 -; GFX8-NEXT: v_and_b32_e32 v2, v3, v4 +; GFX8-NEXT: v_and_b32_e32 v2, v3, v5 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, v2, v1 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -1387,15 +1384,15 @@ ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v3, 24 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v3 ; GFX9-NEXT: v_mov_b32_e32 v4, 0xffffffe8 -; GFX9-NEXT: v_and_b32_e32 v2, 0xffffff, v2 -; GFX9-NEXT: v_bfe_u32 v1, v1, 1, 23 +; GFX9-NEXT: v_mov_b32_e32 v5, 0xffffff +; GFX9-NEXT: v_and_b32_e32 v2, v2, v5 ; GFX9-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 ; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v3 +; GFX9-NEXT: v_bfe_u32 v1, v1, 1, 23 ; GFX9-NEXT: v_mul_lo_u32 v4, v4, v3 ; GFX9-NEXT: v_mul_hi_u32 v4, v3, v4 ; GFX9-NEXT: v_add_u32_e32 v3, v3, v4 ; GFX9-NEXT: v_mul_hi_u32 v3, v2, v3 -; GFX9-NEXT: v_mov_b32_e32 v4, 0xffffff ; GFX9-NEXT: v_mul_lo_u32 v3, v3, 24 ; GFX9-NEXT: v_sub_u32_e32 v2, v2, v3 ; GFX9-NEXT: v_subrev_u32_e32 v3, 24, v2 @@ -1405,8 +1402,8 @@ ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, 24, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc ; GFX9-NEXT: v_sub_u32_e32 v3, 23, v2 -; GFX9-NEXT: v_and_b32_e32 v3, v3, v4 -; GFX9-NEXT: v_and_b32_e32 v2, v2, v4 +; GFX9-NEXT: v_and_b32_e32 v3, v3, v5 +; GFX9-NEXT: v_and_b32_e32 v2, v2, v5 ; GFX9-NEXT: v_lshrrev_b32_e32 v1, v3, v1 ; GFX9-NEXT: v_lshl_or_b32 v0, v0, v2, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -1416,9 +1413,10 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v3, 24 -; GFX10-NEXT: v_and_b32_e32 v2, 0xffffff, v2 +; GFX10-NEXT: v_mov_b32_e32 v5, 0xffffff ; GFX10-NEXT: v_bfe_u32 v1, v1, 1, 23 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v3, v3 +; GFX10-NEXT: v_and_b32_e32 v2, v2, v5 ; GFX10-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 ; GFX10-NEXT: v_cvt_u32_f32_e32 v3, v3 ; GFX10-NEXT: v_mul_lo_u32 v4, 0xffffffe8, v3 @@ -1433,11 +1431,10 @@ ; GFX10-NEXT: v_subrev_nc_u32_e32 v3, 24, v2 ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v2 ; 
GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo -; GFX10-NEXT: v_mov_b32_e32 v3, 0xffffff -; GFX10-NEXT: v_sub_nc_u32_e32 v4, 23, v2 -; GFX10-NEXT: v_and_b32_e32 v2, v2, v3 -; GFX10-NEXT: v_and_b32_e32 v4, v4, v3 -; GFX10-NEXT: v_lshrrev_b32_e32 v1, v4, v1 +; GFX10-NEXT: v_sub_nc_u32_e32 v3, 23, v2 +; GFX10-NEXT: v_and_b32_e32 v2, v2, v5 +; GFX10-NEXT: v_and_b32_e32 v3, v3, v5 +; GFX10-NEXT: v_lshrrev_b32_e32 v1, v3, v1 ; GFX10-NEXT: v_lshl_or_b32 v0, v0, v2, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] %result = call i24 @llvm.fshl.i24(i24 %lhs, i24 %rhs, i24 %amt) @@ -1519,28 +1516,28 @@ ; GFX6-NEXT: v_mul_lo_u32 v1, v1, v2 ; GFX6-NEXT: s_lshl_b32 s5, s5, 8 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 -; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, 24, v0 +; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, 24, v0 ; GFX6-NEXT: v_cmp_le_u32_e32 vcc, 24, v0 ; GFX6-NEXT: v_mul_hi_u32 v1, v2, v1 ; GFX6-NEXT: s_and_b32 s6, s8, s9 -; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc ; GFX6-NEXT: s_or_b32 s5, s7, s5 ; GFX6-NEXT: s_bfe_u32 s6, s6, 0x100000 -; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, 24, v0 +; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, 24, v0 ; GFX6-NEXT: s_bfe_u32 s5, s5, 0x100000 ; GFX6-NEXT: s_lshl_b32 s6, s6, 16 ; GFX6-NEXT: v_cmp_le_u32_e32 vcc, 24, v0 ; GFX6-NEXT: s_or_b32 s5, s5, s6 -; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v2, v1 ; GFX6-NEXT: v_mul_hi_u32 v1, s5, v1 -; GFX6-NEXT: s_mov_b32 s6, 0xffffff -; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 23, v0 +; GFX6-NEXT: v_mov_b32_e32 v3, 0xffffff +; GFX6-NEXT: v_sub_i32_e32 v4, vcc, 23, v0 ; GFX6-NEXT: v_mul_lo_u32 v1, v1, 24 -; GFX6-NEXT: v_and_b32_e32 v0, s6, v0 +; GFX6-NEXT: v_and_b32_e32 v0, v0, v3 ; GFX6-NEXT: v_lshl_b32_e32 v0, s0, v0 ; GFX6-NEXT: s_lshr_b32 s0, s2, 1 -; GFX6-NEXT: v_and_b32_e32 v2, s6, v3 +; GFX6-NEXT: v_and_b32_e32 v2, v4, v3 ; GFX6-NEXT: v_lshr_b32_e32 v2, s0, v2 ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s5, v1 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 @@ -1550,11 +1547,10 @@ ; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, 24, v1 ; GFX6-NEXT: v_cmp_le_u32_e32 vcc, 24, v1 ; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GFX6-NEXT: v_mov_b32_e32 v4, 0xffffff ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, 23, v1 -; GFX6-NEXT: v_and_b32_e32 v1, v1, v4 +; GFX6-NEXT: v_and_b32_e32 v1, v1, v3 ; GFX6-NEXT: s_lshr_b32 s0, s3, 1 -; GFX6-NEXT: v_and_b32_e32 v2, v2, v4 +; GFX6-NEXT: v_and_b32_e32 v2, v2, v3 ; GFX6-NEXT: v_lshl_b32_e32 v1, s1, v1 ; GFX6-NEXT: v_lshr_b32_e32 v2, s0, v2 ; GFX6-NEXT: v_bfe_u32 v3, v0, 8, 8 @@ -1653,28 +1649,28 @@ ; GFX8-NEXT: v_mul_lo_u32 v1, v1, v2 ; GFX8-NEXT: s_lshl_b32 s5, s5, s11 ; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s4, v0 -; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, 24, v0 +; GFX8-NEXT: v_subrev_u32_e32 v4, vcc, 24, v0 ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, 24, v0 ; GFX8-NEXT: v_mul_hi_u32 v1, v2, v1 ; GFX8-NEXT: s_and_b32 s6, s9, s10 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc ; GFX8-NEXT: s_or_b32 s5, s8, s5 ; GFX8-NEXT: s_bfe_u32 s6, s6, 0x100000 -; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, 24, v0 +; GFX8-NEXT: v_subrev_u32_e32 v4, vcc, 24, v0 ; GFX8-NEXT: s_bfe_u32 s5, s5, 0x100000 ; GFX8-NEXT: s_lshl_b32 s6, s6, 16 ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, 24, v0 ; GFX8-NEXT: s_or_b32 s5, s5, s6 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v2, v1 ; GFX8-NEXT: v_mul_hi_u32 v1, s5, v1 -; GFX8-NEXT: 
s_mov_b32 s6, 0xffffff -; GFX8-NEXT: v_sub_u32_e32 v3, vcc, 23, v0 +; GFX8-NEXT: v_mov_b32_e32 v3, 0xffffff +; GFX8-NEXT: v_sub_u32_e32 v4, vcc, 23, v0 ; GFX8-NEXT: v_mul_lo_u32 v1, v1, 24 -; GFX8-NEXT: v_and_b32_e32 v0, s6, v0 +; GFX8-NEXT: v_and_b32_e32 v0, v0, v3 ; GFX8-NEXT: v_lshlrev_b32_e64 v0, v0, s0 ; GFX8-NEXT: s_lshr_b32 s0, s2, 1 -; GFX8-NEXT: v_and_b32_e32 v2, s6, v3 +; GFX8-NEXT: v_and_b32_e32 v2, v4, v3 ; GFX8-NEXT: v_lshrrev_b32_e64 v2, v2, s0 ; GFX8-NEXT: v_sub_u32_e32 v1, vcc, s5, v1 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v2 @@ -1684,11 +1680,10 @@ ; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, 24, v1 ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, 24, v1 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GFX8-NEXT: v_mov_b32_e32 v4, 0xffffff ; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 23, v1 -; GFX8-NEXT: v_and_b32_e32 v1, v1, v4 +; GFX8-NEXT: v_and_b32_e32 v1, v1, v3 ; GFX8-NEXT: s_lshr_b32 s0, s3, 1 -; GFX8-NEXT: v_and_b32_e32 v2, v2, v4 +; GFX8-NEXT: v_and_b32_e32 v2, v2, v3 ; GFX8-NEXT: v_lshlrev_b32_e64 v1, v1, s1 ; GFX8-NEXT: v_lshrrev_b32_e64 v2, v2, s0 ; GFX8-NEXT: v_or_b32_e32 v1, v1, v2 @@ -1698,7 +1693,7 @@ ; GFX8-NEXT: v_or_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX8-NEXT: v_or_b32_e32 v0, v3, v0 -; GFX8-NEXT: v_and_b32_e32 v3, s10, v1 +; GFX8-NEXT: v_and_b32_e32 v3, 0xff, v1 ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 24, v3 ; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v3 @@ -1793,42 +1788,42 @@ ; GFX9-NEXT: v_add_u32_e32 v1, v2, v1 ; GFX9-NEXT: v_mul_hi_u32 v1, s5, v1 ; GFX9-NEXT: v_sub_u32_e32 v0, s4, v0 -; GFX9-NEXT: v_subrev_u32_e32 v3, 24, v0 +; GFX9-NEXT: v_subrev_u32_e32 v4, 24, v0 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, 24, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX9-NEXT: v_subrev_u32_e32 v3, 24, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc +; GFX9-NEXT: v_subrev_u32_e32 v4, 24, v0 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, 24, v0 ; GFX9-NEXT: v_mul_lo_u32 v1, v1, 24 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX9-NEXT: s_mov_b32 s7, 0xffffff -; GFX9-NEXT: v_sub_u32_e32 v3, 23, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc +; GFX9-NEXT: v_mov_b32_e32 v3, 0xffffff +; GFX9-NEXT: v_sub_u32_e32 v2, 23, v0 ; GFX9-NEXT: s_lshr_b32 s2, s2, 1 -; GFX9-NEXT: v_and_b32_e32 v3, s7, v3 -; GFX9-NEXT: v_and_b32_e32 v0, s7, v0 -; GFX9-NEXT: v_lshrrev_b32_e64 v3, v3, s2 +; GFX9-NEXT: v_and_b32_e32 v2, v2, v3 +; GFX9-NEXT: v_and_b32_e32 v0, v0, v3 +; GFX9-NEXT: v_lshrrev_b32_e64 v2, v2, s2 ; GFX9-NEXT: v_sub_u32_e32 v1, s5, v1 -; GFX9-NEXT: v_lshl_or_b32 v0, s0, v0, v3 -; GFX9-NEXT: v_subrev_u32_e32 v3, 24, v1 +; GFX9-NEXT: v_lshl_or_b32 v0, s0, v0, v2 +; GFX9-NEXT: v_subrev_u32_e32 v2, 24, v1 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, 24, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GFX9-NEXT: v_subrev_u32_e32 v3, 24, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GFX9-NEXT: v_subrev_u32_e32 v2, 24, v1 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, 24, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GFX9-NEXT: v_mov_b32_e32 v2, 0xffffff -; GFX9-NEXT: v_sub_u32_e32 v3, 23, v1 -; GFX9-NEXT: v_and_b32_e32 v1, v1, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GFX9-NEXT: v_sub_u32_e32 v2, 23, v1 ; GFX9-NEXT: s_lshr_b32 s0, s3, 1 -; GFX9-NEXT: v_and_b32_e32 v2, v3, v2 -; GFX9-NEXT: v_lshrrev_b32_e64 v2, v2, s0 +; GFX9-NEXT: 
v_and_b32_e32 v2, v2, v3 ; GFX9-NEXT: s_mov_b32 s6, 8 +; GFX9-NEXT: v_and_b32_e32 v1, v1, v3 +; GFX9-NEXT: v_lshrrev_b32_e64 v2, v2, s0 ; GFX9-NEXT: v_lshl_or_b32 v1, s1, v1, v2 +; GFX9-NEXT: v_mov_b32_e32 v2, 0xff +; GFX9-NEXT: v_lshlrev_b32_sdwa v3, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX9-NEXT: s_mov_b32 s8, 16 -; GFX9-NEXT: v_lshlrev_b32_sdwa v2, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_and_b32_e32 v3, s12, v1 -; GFX9-NEXT: v_and_or_b32 v2, v0, s12, v2 +; GFX9-NEXT: v_and_or_b32 v3, v0, v2, v3 +; GFX9-NEXT: v_and_b32_e32 v2, v1, v2 ; GFX9-NEXT: v_lshlrev_b32_sdwa v0, s8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GFX9-NEXT: v_or3_b32 v0, v2, v0, v3 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 24, v2 +; GFX9-NEXT: v_or3_b32 v0, v3, v0, v2 ; GFX9-NEXT: v_bfe_u32 v2, v1, 8, 8 ; GFX9-NEXT: v_bfe_u32 v1, v1, 16, 8 ; GFX9-NEXT: v_lshl_or_b32 v1, v1, 8, v2 @@ -1840,128 +1835,129 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, 24 ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v1, 24 +; GFX10-NEXT: v_mov_b32_e32 v2, 0xffffffe8 +; GFX10-NEXT: s_lshr_b32 s6, s0, 8 ; GFX10-NEXT: s_movk_i32 s9, 0xff -; GFX10-NEXT: s_lshr_b32 s10, s1, 8 -; GFX10-NEXT: s_bfe_u32 s11, 8, 0x100000 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v1, v1 -; GFX10-NEXT: s_and_b32 s1, s1, s9 -; GFX10-NEXT: s_lshr_b32 s6, s0, 8 -; GFX10-NEXT: s_lshr_b32 s8, s0, 24 -; GFX10-NEXT: s_lshl_b32 s1, s1, s11 -; GFX10-NEXT: s_and_b32 s6, s6, s9 -; GFX10-NEXT: s_or_b32 s1, s8, s1 -; GFX10-NEXT: s_lshr_b32 s8, s4, 8 ; GFX10-NEXT: s_lshr_b32 s7, s0, 16 -; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 -; GFX10-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 +; GFX10-NEXT: s_and_b32 s6, s6, s9 +; GFX10-NEXT: s_bfe_u32 s11, 8, 0x100000 +; GFX10-NEXT: s_lshr_b32 s8, s0, 24 ; GFX10-NEXT: s_and_b32 s0, s0, s9 ; GFX10-NEXT: s_lshl_b32 s6, s6, s11 -; GFX10-NEXT: s_and_b32 s8, s8, s9 +; GFX10-NEXT: s_and_b32 s7, s7, s9 +; GFX10-NEXT: s_or_b32 s0, s0, s6 +; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 +; GFX10-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 +; GFX10-NEXT: s_bfe_u32 s6, s7, 0x100000 +; GFX10-NEXT: s_lshr_b32 s7, s4, 8 +; GFX10-NEXT: s_lshr_b32 s12, s4, 16 ; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX10-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX10-NEXT: s_or_b32 s0, s0, s6 -; GFX10-NEXT: s_and_b32 s6, s7, s9 -; GFX10-NEXT: s_and_b32 s7, s10, s9 -; GFX10-NEXT: v_mul_lo_u32 v2, 0xffffffe8, v0 -; GFX10-NEXT: v_mul_lo_u32 v3, 0xffffffe8, v1 -; GFX10-NEXT: s_lshr_b32 s10, s4, 16 -; GFX10-NEXT: s_lshr_b32 s12, s4, 24 +; GFX10-NEXT: s_and_b32 s7, s7, s9 +; GFX10-NEXT: s_lshr_b32 s13, s4, 24 ; GFX10-NEXT: s_and_b32 s4, s4, s9 -; GFX10-NEXT: s_lshl_b32 s8, s8, s11 -; GFX10-NEXT: s_lshr_b32 s13, s5, 8 -; GFX10-NEXT: s_or_b32 s4, s4, s8 -; GFX10-NEXT: v_mul_hi_u32 v2, v0, v2 -; GFX10-NEXT: s_and_b32 s8, s10, s9 -; GFX10-NEXT: v_mul_hi_u32 v3, v1, v3 -; GFX10-NEXT: s_bfe_u32 s8, s8, 0x100000 +; GFX10-NEXT: v_mul_lo_u32 v3, v2, v0 +; GFX10-NEXT: v_mul_lo_u32 v2, v2, v1 +; GFX10-NEXT: s_lshl_b32 s7, s7, s11 +; GFX10-NEXT: s_lshr_b32 s14, s5, 8 +; GFX10-NEXT: s_or_b32 s4, s4, s7 +; GFX10-NEXT: s_and_b32 s7, s12, s9 ; GFX10-NEXT: s_bfe_u32 s4, s4, 0x100000 -; GFX10-NEXT: s_lshl_b32 s8, s8, 16 +; GFX10-NEXT: s_bfe_u32 s7, s7, 0x100000 +; GFX10-NEXT: v_mul_hi_u32 v3, v0, v3 +; GFX10-NEXT: v_mul_hi_u32 v2, v1, v2 +; GFX10-NEXT: s_lshl_b32 s7, s7, 16 ; GFX10-NEXT: s_and_b32 s5, s5, s9 -; 
GFX10-NEXT: s_or_b32 s4, s4, s8 -; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v2 +; GFX10-NEXT: s_or_b32 s4, s4, s7 ; GFX10-NEXT: s_lshl_b32 s5, s5, s11 -; GFX10-NEXT: s_and_b32 s8, s13, s9 -; GFX10-NEXT: s_or_b32 s5, s12, s5 -; GFX10-NEXT: s_bfe_u32 s8, s8, 0x100000 -; GFX10-NEXT: v_add_nc_u32_e32 v1, v1, v3 -; GFX10-NEXT: v_mul_hi_u32 v0, s4, v0 +; GFX10-NEXT: s_and_b32 s7, s14, s9 +; GFX10-NEXT: s_or_b32 s5, s13, s5 +; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v3 +; GFX10-NEXT: s_bfe_u32 s7, s7, 0x100000 +; GFX10-NEXT: v_add_nc_u32_e32 v1, v1, v2 ; GFX10-NEXT: s_bfe_u32 s5, s5, 0x100000 -; GFX10-NEXT: s_lshl_b32 s8, s8, 16 +; GFX10-NEXT: s_lshl_b32 s7, s7, 16 +; GFX10-NEXT: v_mul_hi_u32 v0, s4, v0 +; GFX10-NEXT: s_or_b32 s5, s5, s7 +; GFX10-NEXT: s_lshr_b32 s10, s1, 8 +; GFX10-NEXT: v_mul_hi_u32 v1, s5, v1 +; GFX10-NEXT: s_and_b32 s1, s1, s9 +; GFX10-NEXT: s_and_b32 s7, s10, s9 +; GFX10-NEXT: s_lshl_b32 s1, s1, s11 ; GFX10-NEXT: s_lshr_b32 s10, s2, 16 -; GFX10-NEXT: s_or_b32 s5, s5, s8 +; GFX10-NEXT: v_mul_lo_u32 v0, v0, 24 +; GFX10-NEXT: s_or_b32 s1, s8, s1 ; GFX10-NEXT: s_lshr_b32 s8, s2, 8 -; GFX10-NEXT: v_mul_hi_u32 v1, s5, v1 +; GFX10-NEXT: v_mul_lo_u32 v1, v1, 24 ; GFX10-NEXT: s_and_b32 s8, s8, s9 -; GFX10-NEXT: v_mul_lo_u32 v0, v0, 24 -; GFX10-NEXT: s_and_b32 s12, s2, s9 +; GFX10-NEXT: s_lshr_b32 s12, s2, 24 +; GFX10-NEXT: s_and_b32 s2, s2, s9 ; GFX10-NEXT: s_lshl_b32 s8, s8, s11 -; GFX10-NEXT: s_and_b32 s10, s10, s9 -; GFX10-NEXT: s_or_b32 s8, s12, s8 -; GFX10-NEXT: s_lshr_b32 s2, s2, 24 -; GFX10-NEXT: v_mul_lo_u32 v1, v1, 24 -; GFX10-NEXT: s_bfe_u32 s6, s6, 0x100000 ; GFX10-NEXT: v_sub_nc_u32_e32 v0, s4, v0 -; GFX10-NEXT: s_bfe_u32 s4, s8, 0x100000 -; GFX10-NEXT: s_bfe_u32 s8, s10, 0x100000 -; GFX10-NEXT: s_bfe_u32 s7, s7, 0x100000 -; GFX10-NEXT: s_bfe_u32 s0, s0, 0x100000 -; GFX10-NEXT: v_subrev_nc_u32_e32 v2, 24, v0 +; GFX10-NEXT: s_and_b32 s4, s10, s9 +; GFX10-NEXT: s_or_b32 s2, s2, s8 ; GFX10-NEXT: v_sub_nc_u32_e32 v1, s5, v1 +; GFX10-NEXT: s_bfe_u32 s4, s4, 0x100000 +; GFX10-NEXT: v_subrev_nc_u32_e32 v2, 24, v0 ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v0 -; GFX10-NEXT: s_lshl_b32 s5, s8, 16 -; GFX10-NEXT: s_lshr_b32 s8, s3, 8 -; GFX10-NEXT: s_and_b32 s3, s3, s9 +; GFX10-NEXT: s_lshr_b32 s13, s3, 8 ; GFX10-NEXT: v_subrev_nc_u32_e32 v3, 24, v1 +; GFX10-NEXT: s_bfe_u32 s2, s2, 0x100000 +; GFX10-NEXT: s_lshl_b32 s4, s4, 16 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v1 +; GFX10-NEXT: s_and_b32 s3, s3, s9 +; GFX10-NEXT: s_or_b32 s2, s2, s4 ; GFX10-NEXT: s_lshl_b32 s3, s3, s11 -; GFX10-NEXT: s_or_b32 s4, s4, s5 -; GFX10-NEXT: s_or_b32 s2, s2, s3 ; GFX10-NEXT: v_subrev_nc_u32_e32 v2, 24, v0 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v0 -; GFX10-NEXT: s_and_b32 s3, s8, s9 -; GFX10-NEXT: s_mov_b32 s5, 0xffffff -; GFX10-NEXT: s_bfe_u32 s3, s3, 0x100000 +; GFX10-NEXT: s_and_b32 s4, s13, s9 +; GFX10-NEXT: s_or_b32 s3, s12, s3 +; GFX10-NEXT: s_bfe_u32 s4, s4, 0x100000 ; GFX10-NEXT: v_subrev_nc_u32_e32 v3, 24, v1 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v1 -; GFX10-NEXT: s_bfe_u32 s2, s2, 0x100000 -; GFX10-NEXT: s_lshl_b32 s3, s3, 16 -; GFX10-NEXT: s_lshl_b32 s6, s6, 16 -; GFX10-NEXT: v_sub_nc_u32_e32 v2, 23, v0 +; GFX10-NEXT: v_mov_b32_e32 v2, 0xffffff +; GFX10-NEXT: s_bfe_u32 s3, s3, 0x100000 +; GFX10-NEXT: s_lshl_b32 s4, s4, 16 +; GFX10-NEXT: s_lshr_b32 s2, s2, 1 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo -; GFX10-NEXT: v_mov_b32_e32 
v3, 0xffffff -; GFX10-NEXT: s_or_b32 s2, s2, s3 -; GFX10-NEXT: s_lshr_b32 s3, s4, 1 -; GFX10-NEXT: v_and_b32_e32 v2, s5, v2 +; GFX10-NEXT: v_sub_nc_u32_e32 v3, 23, v0 +; GFX10-NEXT: s_or_b32 s3, s3, s4 +; GFX10-NEXT: s_bfe_u32 s7, s7, 0x100000 +; GFX10-NEXT: v_and_b32_e32 v0, v0, v2 ; GFX10-NEXT: v_sub_nc_u32_e32 v4, 23, v1 -; GFX10-NEXT: s_lshr_b32 s2, s2, 1 -; GFX10-NEXT: v_and_b32_e32 v0, s5, v0 -; GFX10-NEXT: v_and_b32_e32 v1, v1, v3 -; GFX10-NEXT: v_lshrrev_b32_e64 v2, v2, s3 -; GFX10-NEXT: v_and_b32_e32 v4, v4, v3 +; GFX10-NEXT: v_and_b32_e32 v3, v3, v2 +; GFX10-NEXT: v_and_b32_e32 v1, v1, v2 +; GFX10-NEXT: s_bfe_u32 s0, s0, 0x100000 +; GFX10-NEXT: s_lshl_b32 s6, s6, 16 +; GFX10-NEXT: v_and_b32_e32 v4, v4, v2 +; GFX10-NEXT: v_lshrrev_b32_e64 v3, v3, s2 +; GFX10-NEXT: s_lshr_b32 s2, s3, 1 ; GFX10-NEXT: s_bfe_u32 s1, s1, 0x100000 -; GFX10-NEXT: s_lshl_b32 s7, s7, 16 ; GFX10-NEXT: s_or_b32 s0, s0, s6 -; GFX10-NEXT: s_or_b32 s1, s1, s7 -; GFX10-NEXT: v_lshrrev_b32_e64 v3, v4, s2 -; GFX10-NEXT: v_lshl_or_b32 v0, s0, v0, v2 +; GFX10-NEXT: v_lshrrev_b32_e64 v2, v4, s2 +; GFX10-NEXT: s_lshl_b32 s2, s7, 16 +; GFX10-NEXT: v_lshl_or_b32 v0, s0, v0, v3 +; GFX10-NEXT: s_or_b32 s1, s1, s2 ; GFX10-NEXT: s_mov_b32 s0, 8 -; GFX10-NEXT: v_lshl_or_b32 v1, s1, v1, v3 -; GFX10-NEXT: v_lshlrev_b32_sdwa v2, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX10-NEXT: v_lshl_or_b32 v1, s1, v1, v2 +; GFX10-NEXT: v_mov_b32_e32 v2, 0xff +; GFX10-NEXT: v_lshlrev_b32_sdwa v3, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX10-NEXT: s_mov_b32 s0, 16 -; GFX10-NEXT: v_and_b32_e32 v3, s9, v1 -; GFX10-NEXT: v_and_or_b32 v2, v0, s9, v2 +; GFX10-NEXT: v_and_b32_e32 v4, v1, v2 +; GFX10-NEXT: v_and_or_b32 v2, v0, v2, v3 ; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 24, v4 ; GFX10-NEXT: v_bfe_u32 v4, v1, 8, 8 ; GFX10-NEXT: v_bfe_u32 v1, v1, 16, 8 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GFX10-NEXT: v_lshl_or_b32 v1, v1, 8, v4 ; GFX10-NEXT: v_or3_b32 v0, v2, v0, v3 -; GFX10-NEXT: v_readfirstlane_b32 s1, v1 +; GFX10-NEXT: v_lshl_or_b32 v1, v1, 8, v4 ; GFX10-NEXT: v_readfirstlane_b32 s0, v0 +; GFX10-NEXT: v_readfirstlane_b32 s1, v1 ; GFX10-NEXT: ; return to shader part epilog %lhs = bitcast i48 %lhs.arg to <2 x i24> %rhs = bitcast i48 %rhs.arg to <2 x i24> @@ -1977,38 +1973,36 @@ ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v6, 24 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v6, v6 -; GFX6-NEXT: v_mov_b32_e32 v7, 0xffffffe8 -; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v9, 24 -; GFX6-NEXT: v_and_b32_e32 v4, 0xffffff, v4 +; GFX6-NEXT: v_mov_b32_e32 v8, 0xffffffe8 +; GFX6-NEXT: v_mov_b32_e32 v10, 0xffffff +; GFX6-NEXT: v_and_b32_e32 v4, v4, v10 +; GFX6-NEXT: v_mul_f32_e32 v7, 0x4f7ffffe, v6 +; GFX6-NEXT: v_cvt_u32_f32_e32 v7, v7 ; GFX6-NEXT: v_mul_f32_e32 v6, 0x4f7ffffe, v6 ; GFX6-NEXT: v_cvt_u32_f32_e32 v6, v6 +; GFX6-NEXT: v_and_b32_e32 v5, v5, v10 +; GFX6-NEXT: v_mul_lo_u32 v9, v8, v7 ; GFX6-NEXT: v_bfe_u32 v2, v2, 1, 23 -; GFX6-NEXT: v_mul_lo_u32 v8, v7, v6 +; GFX6-NEXT: v_mul_lo_u32 v8, v8, v6 +; GFX6-NEXT: v_mul_hi_u32 v9, v7, v9 ; GFX6-NEXT: v_mul_hi_u32 v8, v6, v8 -; GFX6-NEXT: v_add_i32_e32 v6, vcc, v6, v8 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v8, v9 -; GFX6-NEXT: v_mul_hi_u32 v6, v4, v6 -; GFX6-NEXT: v_mov_b32_e32 v9, 0xffffff -; GFX6-NEXT: v_and_b32_e32 v5, v5, v9 -; GFX6-NEXT: v_mul_f32_e32 v8, 0x4f7ffffe, v8 -; GFX6-NEXT: 
v_cvt_u32_f32_e32 v8, v8 -; GFX6-NEXT: v_mul_lo_u32 v6, v6, 24 -; GFX6-NEXT: v_mul_lo_u32 v7, v7, v8 -; GFX6-NEXT: v_sub_i32_e32 v4, vcc, v4, v6 -; GFX6-NEXT: v_subrev_i32_e32 v6, vcc, 24, v4 +; GFX6-NEXT: v_add_i32_e32 v7, vcc, v7, v9 +; GFX6-NEXT: v_mul_hi_u32 v7, v4, v7 +; GFX6-NEXT: v_mul_lo_u32 v7, v7, 24 +; GFX6-NEXT: v_sub_i32_e32 v4, vcc, v4, v7 +; GFX6-NEXT: v_subrev_i32_e32 v7, vcc, 24, v4 ; GFX6-NEXT: v_cmp_le_u32_e32 vcc, 24, v4 -; GFX6-NEXT: v_mul_hi_u32 v7, v8, v7 -; GFX6-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc -; GFX6-NEXT: v_subrev_i32_e32 v6, vcc, 24, v4 +; GFX6-NEXT: v_cndmask_b32_e32 v4, v4, v7, vcc +; GFX6-NEXT: v_subrev_i32_e32 v7, vcc, 24, v4 ; GFX6-NEXT: v_cmp_le_u32_e32 vcc, 24, v4 -; GFX6-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc -; GFX6-NEXT: v_add_i32_e32 v7, vcc, v8, v7 -; GFX6-NEXT: v_mul_hi_u32 v7, v5, v7 -; GFX6-NEXT: v_sub_i32_e32 v6, vcc, 23, v4 -; GFX6-NEXT: v_and_b32_e32 v4, v4, v9 +; GFX6-NEXT: v_cndmask_b32_e32 v4, v4, v7, vcc +; GFX6-NEXT: v_add_i32_e32 v6, vcc, v6, v8 +; GFX6-NEXT: v_mul_hi_u32 v6, v5, v6 +; GFX6-NEXT: v_sub_i32_e32 v7, vcc, 23, v4 +; GFX6-NEXT: v_and_b32_e32 v4, v4, v10 +; GFX6-NEXT: v_mul_lo_u32 v6, v6, 24 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, v4, v0 -; GFX6-NEXT: v_and_b32_e32 v4, v6, v9 -; GFX6-NEXT: v_mul_lo_u32 v6, v7, 24 +; GFX6-NEXT: v_and_b32_e32 v4, v7, v10 ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v4, v2 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v5, v6 @@ -2019,10 +2013,10 @@ ; GFX6-NEXT: v_cmp_le_u32_e32 vcc, 24, v2 ; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GFX6-NEXT: v_sub_i32_e32 v4, vcc, 23, v2 -; GFX6-NEXT: v_and_b32_e32 v2, v2, v9 +; GFX6-NEXT: v_and_b32_e32 v2, v2, v10 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, v2, v1 ; GFX6-NEXT: v_bfe_u32 v2, v3, 1, 23 -; GFX6-NEXT: v_and_b32_e32 v3, v4, v9 +; GFX6-NEXT: v_and_b32_e32 v3, v4, v10 ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v3, v2 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -2032,38 +2026,36 @@ ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v6, 24 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v6, v6 -; GFX8-NEXT: v_mov_b32_e32 v7, 0xffffffe8 -; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v9, 24 -; GFX8-NEXT: v_and_b32_e32 v4, 0xffffff, v4 +; GFX8-NEXT: v_mov_b32_e32 v8, 0xffffffe8 +; GFX8-NEXT: v_mov_b32_e32 v10, 0xffffff +; GFX8-NEXT: v_and_b32_e32 v4, v4, v10 +; GFX8-NEXT: v_mul_f32_e32 v7, 0x4f7ffffe, v6 +; GFX8-NEXT: v_cvt_u32_f32_e32 v7, v7 ; GFX8-NEXT: v_mul_f32_e32 v6, 0x4f7ffffe, v6 ; GFX8-NEXT: v_cvt_u32_f32_e32 v6, v6 +; GFX8-NEXT: v_and_b32_e32 v5, v5, v10 +; GFX8-NEXT: v_mul_lo_u32 v9, v8, v7 ; GFX8-NEXT: v_bfe_u32 v2, v2, 1, 23 -; GFX8-NEXT: v_mul_lo_u32 v8, v7, v6 +; GFX8-NEXT: v_mul_lo_u32 v8, v8, v6 +; GFX8-NEXT: v_mul_hi_u32 v9, v7, v9 ; GFX8-NEXT: v_mul_hi_u32 v8, v6, v8 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v8 -; GFX8-NEXT: v_rcp_iflag_f32_e32 v8, v9 -; GFX8-NEXT: v_mul_hi_u32 v6, v4, v6 -; GFX8-NEXT: v_mov_b32_e32 v9, 0xffffff -; GFX8-NEXT: v_and_b32_e32 v5, v5, v9 -; GFX8-NEXT: v_mul_f32_e32 v8, 0x4f7ffffe, v8 -; GFX8-NEXT: v_cvt_u32_f32_e32 v8, v8 -; GFX8-NEXT: v_mul_lo_u32 v6, v6, 24 -; GFX8-NEXT: v_mul_lo_u32 v7, v7, v8 -; GFX8-NEXT: v_sub_u32_e32 v4, vcc, v4, v6 -; GFX8-NEXT: v_subrev_u32_e32 v6, vcc, 24, v4 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v9 +; GFX8-NEXT: v_mul_hi_u32 v7, v4, v7 +; GFX8-NEXT: v_mul_lo_u32 v7, v7, 24 +; GFX8-NEXT: v_sub_u32_e32 v4, vcc, v4, v7 +; GFX8-NEXT: v_subrev_u32_e32 v7, vcc, 24, v4 ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, 24, v4 -; GFX8-NEXT: v_mul_hi_u32 
v7, v8, v7 -; GFX8-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc -; GFX8-NEXT: v_subrev_u32_e32 v6, vcc, 24, v4 +; GFX8-NEXT: v_cndmask_b32_e32 v4, v4, v7, vcc +; GFX8-NEXT: v_subrev_u32_e32 v7, vcc, 24, v4 ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, 24, v4 -; GFX8-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v8, v7 -; GFX8-NEXT: v_mul_hi_u32 v7, v5, v7 -; GFX8-NEXT: v_sub_u32_e32 v6, vcc, 23, v4 -; GFX8-NEXT: v_and_b32_e32 v4, v4, v9 +; GFX8-NEXT: v_cndmask_b32_e32 v4, v4, v7, vcc +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v8 +; GFX8-NEXT: v_mul_hi_u32 v6, v5, v6 +; GFX8-NEXT: v_sub_u32_e32 v7, vcc, 23, v4 +; GFX8-NEXT: v_and_b32_e32 v4, v4, v10 +; GFX8-NEXT: v_mul_lo_u32 v6, v6, 24 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, v4, v0 -; GFX8-NEXT: v_and_b32_e32 v4, v6, v9 -; GFX8-NEXT: v_mul_lo_u32 v6, v7, 24 +; GFX8-NEXT: v_and_b32_e32 v4, v7, v10 ; GFX8-NEXT: v_lshrrev_b32_e32 v2, v4, v2 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX8-NEXT: v_sub_u32_e32 v2, vcc, v5, v6 @@ -2074,10 +2066,10 @@ ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, 24, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GFX8-NEXT: v_sub_u32_e32 v4, vcc, 23, v2 -; GFX8-NEXT: v_and_b32_e32 v2, v2, v9 +; GFX8-NEXT: v_and_b32_e32 v2, v2, v10 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, v2, v1 ; GFX8-NEXT: v_bfe_u32 v2, v3, 1, 23 -; GFX8-NEXT: v_and_b32_e32 v3, v4, v9 +; GFX8-NEXT: v_and_b32_e32 v3, v4, v10 ; GFX8-NEXT: v_lshrrev_b32_e32 v2, v3, v2 ; GFX8-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -2087,38 +2079,36 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v6, 24 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v6, v6 -; GFX9-NEXT: v_mov_b32_e32 v7, 0xffffffe8 -; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v9, 24 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v9, v9 +; GFX9-NEXT: v_mov_b32_e32 v8, 0xffffffe8 +; GFX9-NEXT: v_mov_b32_e32 v10, 0xffffff +; GFX9-NEXT: v_and_b32_e32 v4, v4, v10 +; GFX9-NEXT: v_mul_f32_e32 v7, 0x4f7ffffe, v6 +; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v7 ; GFX9-NEXT: v_mul_f32_e32 v6, 0x4f7ffffe, v6 ; GFX9-NEXT: v_cvt_u32_f32_e32 v6, v6 -; GFX9-NEXT: v_and_b32_e32 v4, 0xffffff, v4 -; GFX9-NEXT: v_mul_f32_e32 v9, 0x4f7ffffe, v9 +; GFX9-NEXT: v_and_b32_e32 v5, v5, v10 +; GFX9-NEXT: v_mul_lo_u32 v9, v8, v7 ; GFX9-NEXT: v_bfe_u32 v2, v2, 1, 23 -; GFX9-NEXT: v_mul_lo_u32 v8, v7, v6 +; GFX9-NEXT: v_mul_lo_u32 v8, v8, v6 ; GFX9-NEXT: v_bfe_u32 v3, v3, 1, 23 +; GFX9-NEXT: v_mul_hi_u32 v9, v7, v9 ; GFX9-NEXT: v_mul_hi_u32 v8, v6, v8 +; GFX9-NEXT: v_add_u32_e32 v7, v7, v9 +; GFX9-NEXT: v_mul_hi_u32 v7, v4, v7 ; GFX9-NEXT: v_add_u32_e32 v6, v6, v8 -; GFX9-NEXT: v_cvt_u32_f32_e32 v8, v9 -; GFX9-NEXT: v_mul_hi_u32 v6, v4, v6 -; GFX9-NEXT: v_mov_b32_e32 v9, 0xffffff -; GFX9-NEXT: v_and_b32_e32 v5, v5, v9 -; GFX9-NEXT: v_mul_lo_u32 v7, v7, v8 +; GFX9-NEXT: v_mul_hi_u32 v6, v5, v6 +; GFX9-NEXT: v_mul_lo_u32 v7, v7, 24 ; GFX9-NEXT: v_mul_lo_u32 v6, v6, 24 -; GFX9-NEXT: v_mul_hi_u32 v7, v8, v7 -; GFX9-NEXT: v_sub_u32_e32 v4, v4, v6 -; GFX9-NEXT: v_subrev_u32_e32 v6, 24, v4 +; GFX9-NEXT: v_sub_u32_e32 v4, v4, v7 +; GFX9-NEXT: v_subrev_u32_e32 v7, 24, v4 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, 24, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc -; GFX9-NEXT: v_subrev_u32_e32 v6, 24, v4 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v7, vcc +; GFX9-NEXT: v_subrev_u32_e32 v7, 24, v4 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, 24, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc -; GFX9-NEXT: v_add_u32_e32 v6, v8, v7 -; GFX9-NEXT: v_mul_hi_u32 v6, v5, v6 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v7, vcc ; GFX9-NEXT: 
v_sub_u32_e32 v7, 23, v4 -; GFX9-NEXT: v_and_b32_e32 v7, v7, v9 -; GFX9-NEXT: v_and_b32_e32 v4, v4, v9 -; GFX9-NEXT: v_mul_lo_u32 v6, v6, 24 +; GFX9-NEXT: v_and_b32_e32 v7, v7, v10 +; GFX9-NEXT: v_and_b32_e32 v4, v4, v10 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, v7, v2 ; GFX9-NEXT: v_lshl_or_b32 v0, v0, v4, v2 ; GFX9-NEXT: v_sub_u32_e32 v2, v5, v6 @@ -2129,8 +2119,8 @@ ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, 24, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GFX9-NEXT: v_sub_u32_e32 v4, 23, v2 -; GFX9-NEXT: v_and_b32_e32 v4, v4, v9 -; GFX9-NEXT: v_and_b32_e32 v2, v2, v9 +; GFX9-NEXT: v_and_b32_e32 v4, v4, v10 +; GFX9-NEXT: v_and_b32_e32 v2, v2, v10 ; GFX9-NEXT: v_lshrrev_b32_e32 v3, v4, v3 ; GFX9-NEXT: v_lshl_or_b32 v1, v1, v2, v3 ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -2140,30 +2130,29 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v6, 24 -; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v7, 24 +; GFX10-NEXT: v_mov_b32_e32 v8, 0xffffffe8 ; GFX10-NEXT: v_mov_b32_e32 v10, 0xffffff -; GFX10-NEXT: v_and_b32_e32 v4, 0xffffff, v4 ; GFX10-NEXT: v_bfe_u32 v2, v2, 1, 23 +; GFX10-NEXT: v_bfe_u32 v3, v3, 1, 23 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v6, v6 -; GFX10-NEXT: v_rcp_iflag_f32_e32 v7, v7 +; GFX10-NEXT: v_and_b32_e32 v4, v4, v10 ; GFX10-NEXT: v_and_b32_e32 v5, v5, v10 -; GFX10-NEXT: v_bfe_u32 v3, v3, 1, 23 +; GFX10-NEXT: v_mul_f32_e32 v7, 0x4f7ffffe, v6 ; GFX10-NEXT: v_mul_f32_e32 v6, 0x4f7ffffe, v6 -; GFX10-NEXT: v_mul_f32_e32 v7, 0x4f7ffffe, v7 -; GFX10-NEXT: v_cvt_u32_f32_e32 v6, v6 ; GFX10-NEXT: v_cvt_u32_f32_e32 v7, v7 -; GFX10-NEXT: v_mul_lo_u32 v8, 0xffffffe8, v6 -; GFX10-NEXT: v_mul_lo_u32 v9, 0xffffffe8, v7 -; GFX10-NEXT: v_mul_hi_u32 v8, v6, v8 +; GFX10-NEXT: v_cvt_u32_f32_e32 v6, v6 +; GFX10-NEXT: v_mul_lo_u32 v9, v8, v7 +; GFX10-NEXT: v_mul_lo_u32 v8, v8, v6 ; GFX10-NEXT: v_mul_hi_u32 v9, v7, v9 -; GFX10-NEXT: v_add_nc_u32_e32 v6, v6, v8 +; GFX10-NEXT: v_mul_hi_u32 v8, v6, v8 ; GFX10-NEXT: v_add_nc_u32_e32 v7, v7, v9 -; GFX10-NEXT: v_mul_hi_u32 v6, v4, v6 -; GFX10-NEXT: v_mul_hi_u32 v7, v5, v7 -; GFX10-NEXT: v_mul_lo_u32 v6, v6, 24 +; GFX10-NEXT: v_add_nc_u32_e32 v6, v6, v8 +; GFX10-NEXT: v_mul_hi_u32 v7, v4, v7 +; GFX10-NEXT: v_mul_hi_u32 v6, v5, v6 ; GFX10-NEXT: v_mul_lo_u32 v7, v7, 24 -; GFX10-NEXT: v_sub_nc_u32_e32 v4, v4, v6 -; GFX10-NEXT: v_sub_nc_u32_e32 v5, v5, v7 +; GFX10-NEXT: v_mul_lo_u32 v6, v6, 24 +; GFX10-NEXT: v_sub_nc_u32_e32 v4, v4, v7 +; GFX10-NEXT: v_sub_nc_u32_e32 v5, v5, v6 ; GFX10-NEXT: v_subrev_nc_u32_e32 v6, 24, v4 ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v4 ; GFX10-NEXT: v_subrev_nc_u32_e32 v7, 24, v5 @@ -4922,10 +4911,10 @@ define amdgpu_ps <4 x float> @v_fshl_i128_ssv(i128 inreg %lhs, i128 inreg %rhs, i128 %amt) { ; GFX6-LABEL: v_fshl_i128_ssv: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_movk_i32 s8, 0x7f -; GFX6-NEXT: v_and_b32_e32 v6, s8, v0 +; GFX6-NEXT: s_movk_i32 s9, 0x7f +; GFX6-NEXT: v_and_b32_e32 v6, s9, v0 ; GFX6-NEXT: v_xor_b32_e32 v0, -1, v0 -; GFX6-NEXT: v_and_b32_e32 v7, s8, v0 +; GFX6-NEXT: v_and_b32_e32 v7, s9, v0 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, 64, v6 ; GFX6-NEXT: v_lshr_b64 v[0:1], s[0:1], v0 ; GFX6-NEXT: v_lshl_b64 v[2:3], s[2:3], v6 @@ -4975,10 +4964,10 @@ ; ; GFX8-LABEL: v_fshl_i128_ssv: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_movk_i32 s8, 0x7f -; GFX8-NEXT: v_and_b32_e32 v6, s8, v0 +; GFX8-NEXT: s_movk_i32 s9, 0x7f +; GFX8-NEXT: v_and_b32_e32 v6, s9, v0 ; GFX8-NEXT: v_xor_b32_e32 v0, -1, v0 -; GFX8-NEXT: v_and_b32_e32 v7, s8, v0 +; GFX8-NEXT: v_and_b32_e32 v7, s9, v0 ; GFX8-NEXT: 
v_sub_u32_e32 v0, vcc, 64, v6 ; GFX8-NEXT: v_lshrrev_b64 v[0:1], v0, s[0:1] ; GFX8-NEXT: v_lshlrev_b64 v[2:3], v6, s[2:3] @@ -5028,10 +5017,10 @@ ; ; GFX9-LABEL: v_fshl_i128_ssv: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_movk_i32 s8, 0x7f -; GFX9-NEXT: v_and_b32_e32 v6, s8, v0 +; GFX9-NEXT: s_movk_i32 s9, 0x7f +; GFX9-NEXT: v_and_b32_e32 v6, s9, v0 ; GFX9-NEXT: v_xor_b32_e32 v0, -1, v0 -; GFX9-NEXT: v_and_b32_e32 v7, s8, v0 +; GFX9-NEXT: v_and_b32_e32 v7, s9, v0 ; GFX9-NEXT: v_sub_u32_e32 v0, 64, v6 ; GFX9-NEXT: v_lshrrev_b64 v[0:1], v0, s[0:1] ; GFX9-NEXT: v_lshlrev_b64 v[2:3], v6, s[2:3] Index: llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll @@ -138,16 +138,16 @@ ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v3, 7 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v3, v3 -; GFX6-NEXT: v_and_b32_e32 v2, 0x7f, v2 +; GFX6-NEXT: v_mov_b32_e32 v5, 0x7f +; GFX6-NEXT: v_and_b32_e32 v2, v2, v5 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX6-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 ; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v3 +; GFX6-NEXT: v_and_b32_e32 v1, v1, v5 ; GFX6-NEXT: v_mul_lo_u32 v4, -7, v3 ; GFX6-NEXT: v_mul_hi_u32 v4, v3, v4 ; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v4 ; GFX6-NEXT: v_mul_hi_u32 v3, v2, v3 -; GFX6-NEXT: v_mov_b32_e32 v4, 0x7f -; GFX6-NEXT: v_and_b32_e32 v1, v1, v4 ; GFX6-NEXT: v_mul_lo_u32 v3, v3, 7 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v3 ; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, 7, v2 @@ -157,8 +157,8 @@ ; GFX6-NEXT: v_cmp_le_u32_e32 vcc, 7, v2 ; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc ; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 6, v2 -; GFX6-NEXT: v_and_b32_e32 v2, v2, v4 -; GFX6-NEXT: v_and_b32_e32 v3, v3, v4 +; GFX6-NEXT: v_and_b32_e32 v2, v2, v5 +; GFX6-NEXT: v_and_b32_e32 v3, v3, v5 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, v3, v0 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, v2, v1 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 @@ -169,16 +169,16 @@ ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v3, 7 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v3, v3 -; GFX8-NEXT: v_and_b32_e32 v2, 0x7f, v2 +; GFX8-NEXT: v_mov_b32_e32 v5, 0x7f +; GFX8-NEXT: v_and_b32_e32 v2, v2, v5 ; GFX8-NEXT: v_lshlrev_b16_e32 v0, 1, v0 ; GFX8-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 ; GFX8-NEXT: v_cvt_u32_f32_e32 v3, v3 +; GFX8-NEXT: v_and_b32_e32 v1, v1, v5 ; GFX8-NEXT: v_mul_lo_u32 v4, -7, v3 ; GFX8-NEXT: v_mul_hi_u32 v4, v3, v4 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v4 ; GFX8-NEXT: v_mul_hi_u32 v3, v2, v3 -; GFX8-NEXT: v_mov_b32_e32 v4, 0x7f -; GFX8-NEXT: v_and_b32_e32 v1, v1, v4 ; GFX8-NEXT: v_mul_lo_u32 v3, v3, 7 ; GFX8-NEXT: v_sub_u32_e32 v2, vcc, v2, v3 ; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, 7, v2 @@ -188,8 +188,8 @@ ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, 7, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc ; GFX8-NEXT: v_sub_u16_e32 v3, 6, v2 -; GFX8-NEXT: v_and_b32_e32 v2, v2, v4 -; GFX8-NEXT: v_and_b32_e32 v3, v3, v4 +; GFX8-NEXT: v_and_b32_e32 v2, v2, v5 +; GFX8-NEXT: v_and_b32_e32 v3, v3, v5 ; GFX8-NEXT: v_lshlrev_b16_e32 v0, v3, v0 ; GFX8-NEXT: v_lshrrev_b16_e32 v1, v2, v1 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 @@ -200,16 +200,16 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v3, 7 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v3 -; GFX9-NEXT: v_and_b32_e32 v2, 0x7f, v2 +; GFX9-NEXT: v_mov_b32_e32 v5, 0x7f +; GFX9-NEXT: v_and_b32_e32 v2, v2, v5 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 1, v0 ; 
GFX9-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 ; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v3 +; GFX9-NEXT: v_and_b32_e32 v1, v1, v5 ; GFX9-NEXT: v_mul_lo_u32 v4, -7, v3 ; GFX9-NEXT: v_mul_hi_u32 v4, v3, v4 ; GFX9-NEXT: v_add_u32_e32 v3, v3, v4 ; GFX9-NEXT: v_mul_hi_u32 v3, v2, v3 -; GFX9-NEXT: v_mov_b32_e32 v4, 0x7f -; GFX9-NEXT: v_and_b32_e32 v1, v1, v4 ; GFX9-NEXT: v_mul_lo_u32 v3, v3, 7 ; GFX9-NEXT: v_sub_u32_e32 v2, v2, v3 ; GFX9-NEXT: v_subrev_u32_e32 v3, 7, v2 @@ -219,8 +219,8 @@ ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, 7, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc ; GFX9-NEXT: v_sub_u16_e32 v3, 6, v2 -; GFX9-NEXT: v_and_b32_e32 v2, v2, v4 -; GFX9-NEXT: v_and_b32_e32 v3, v3, v4 +; GFX9-NEXT: v_and_b32_e32 v2, v2, v5 +; GFX9-NEXT: v_and_b32_e32 v3, v3, v5 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, v3, v0 ; GFX9-NEXT: v_lshrrev_b16_e32 v1, v2, v1 ; GFX9-NEXT: v_or_b32_e32 v0, v0, v1 @@ -231,10 +231,11 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v3, 7 -; GFX10-NEXT: v_and_b32_e32 v2, 0x7f, v2 +; GFX10-NEXT: v_mov_b32_e32 v5, 0x7f ; GFX10-NEXT: v_lshlrev_b16 v0, 1, v0 -; GFX10-NEXT: v_and_b32_e32 v1, 0x7f, v1 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v3, v3 +; GFX10-NEXT: v_and_b32_e32 v2, v2, v5 +; GFX10-NEXT: v_and_b32_e32 v1, v1, v5 ; GFX10-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 ; GFX10-NEXT: v_cvt_u32_f32_e32 v3, v3 ; GFX10-NEXT: v_mul_lo_u32 v4, -7, v3 @@ -249,12 +250,11 @@ ; GFX10-NEXT: v_subrev_nc_u32_e32 v3, 7, v2 ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 7, v2 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo -; GFX10-NEXT: v_mov_b32_e32 v3, 0x7f -; GFX10-NEXT: v_sub_nc_u16 v4, 6, v2 -; GFX10-NEXT: v_and_b32_e32 v2, v2, v3 -; GFX10-NEXT: v_and_b32_e32 v4, v4, v3 +; GFX10-NEXT: v_sub_nc_u16 v3, 6, v2 +; GFX10-NEXT: v_and_b32_e32 v2, v2, v5 +; GFX10-NEXT: v_and_b32_e32 v3, v3, v5 ; GFX10-NEXT: v_lshrrev_b16 v1, v2, v1 -; GFX10-NEXT: v_lshlrev_b16 v0, v4, v0 +; GFX10-NEXT: v_lshlrev_b16 v0, v3, v0 ; GFX10-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] %result = call i7 @llvm.fshr.i7(i7 %lhs, i7 %rhs, i7 %amt) @@ -652,10 +652,10 @@ ; GFX6-NEXT: v_xor_b32_e32 v2, -1, v2 ; GFX6-NEXT: v_lshrrev_b32_e32 v3, 8, v0 ; GFX6-NEXT: v_and_b32_e32 v2, 7, v2 -; GFX6-NEXT: s_movk_i32 s4, 0xff +; GFX6-NEXT: v_mov_b32_e32 v6, 0xff ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, v2, v0 -; GFX6-NEXT: v_and_b32_e32 v2, s4, v1 +; GFX6-NEXT: v_and_b32_e32 v2, v1, v6 ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v5, v2 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX6-NEXT: v_and_b32_e32 v2, 7, v4 @@ -666,8 +666,8 @@ ; GFX6-NEXT: v_lshlrev_b32_e32 v3, v4, v3 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, v2, v1 ; GFX6-NEXT: v_or_b32_e32 v1, v3, v1 -; GFX6-NEXT: v_and_b32_e32 v1, s4, v1 -; GFX6-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX6-NEXT: v_and_b32_e32 v1, v1, v6 +; GFX6-NEXT: v_and_b32_e32 v0, v0, v6 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -727,26 +727,26 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_lshrrev_b32_e32 v3, 8, v2 -; GFX10-NEXT: v_lshrrev_b32_e32 v4, 8, v0 -; GFX10-NEXT: v_lshrrev_b32_e32 v5, 8, v1 -; GFX10-NEXT: s_movk_i32 s4, 0xff -; GFX10-NEXT: v_and_b32_e32 v7, 7, v2 -; GFX10-NEXT: v_xor_b32_e32 v6, -1, v3 +; GFX10-NEXT: v_mov_b32_e32 v4, 0xff +; GFX10-NEXT: v_lshrrev_b32_e32 v5, 8, v0 +; GFX10-NEXT: v_lshrrev_b32_e32 v6, 8, v1 +; GFX10-NEXT: v_and_b32_e32 v8, 7, v2 +; 
GFX10-NEXT: v_xor_b32_e32 v7, -1, v3 ; GFX10-NEXT: v_xor_b32_e32 v2, -1, v2 ; GFX10-NEXT: v_and_b32_e32 v3, 7, v3 -; GFX10-NEXT: v_lshlrev_b16 v4, 1, v4 -; GFX10-NEXT: v_and_b32_e32 v5, s4, v5 -; GFX10-NEXT: v_and_b32_e32 v6, 7, v6 +; GFX10-NEXT: v_lshlrev_b16 v5, 1, v5 +; GFX10-NEXT: v_and_b32_e32 v6, v6, v4 +; GFX10-NEXT: v_and_b32_e32 v7, 7, v7 ; GFX10-NEXT: v_lshlrev_b16 v0, 1, v0 -; GFX10-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX10-NEXT: v_and_b32_e32 v1, v1, v4 ; GFX10-NEXT: v_and_b32_e32 v2, 7, v2 -; GFX10-NEXT: v_lshrrev_b16 v3, v3, v5 -; GFX10-NEXT: v_lshlrev_b16 v4, v6, v4 -; GFX10-NEXT: v_lshrrev_b16 v1, v7, v1 +; GFX10-NEXT: v_lshrrev_b16 v3, v3, v6 +; GFX10-NEXT: v_lshlrev_b16 v5, v7, v5 +; GFX10-NEXT: v_lshrrev_b16 v1, v8, v1 ; GFX10-NEXT: v_lshlrev_b16 v0, v2, v0 -; GFX10-NEXT: v_or_b32_e32 v2, v4, v3 +; GFX10-NEXT: v_or_b32_e32 v2, v5, v3 ; GFX10-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX10-NEXT: v_and_b32_sdwa v1, v2, s4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_and_b32_sdwa v1, v2, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-NEXT: s_setpc_b64 s[30:31] %lhs = bitcast i16 %lhs.arg to <2 x i8> @@ -989,44 +989,44 @@ ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v0 ; GFX6-NEXT: v_lshrrev_b32_e32 v5, 24, v0 ; GFX6-NEXT: v_and_b32_e32 v2, 7, v2 +; GFX6-NEXT: v_mov_b32_e32 v11, 0xff ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 1, v0 -; GFX6-NEXT: v_and_b32_e32 v11, 0xff, v1 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, v2, v0 -; GFX6-NEXT: v_lshrrev_b32_e32 v10, v10, v11 -; GFX6-NEXT: v_or_b32_e32 v0, v0, v10 -; GFX6-NEXT: v_and_b32_e32 v10, 7, v7 +; GFX6-NEXT: v_and_b32_e32 v2, v1, v11 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, v10, v2 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX6-NEXT: v_and_b32_e32 v2, 7, v7 ; GFX6-NEXT: v_xor_b32_e32 v7, -1, v7 ; GFX6-NEXT: v_and_b32_e32 v7, 7, v7 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 1, v3 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, v7, v3 ; GFX6-NEXT: v_bfe_u32 v7, v1, 8, 8 -; GFX6-NEXT: v_lshrrev_b32_e32 v7, v10, v7 -; GFX6-NEXT: v_or_b32_e32 v3, v3, v7 -; GFX6-NEXT: v_and_b32_e32 v7, 7, v8 -; GFX6-NEXT: v_xor_b32_e32 v8, -1, v8 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, v2, v7 +; GFX6-NEXT: v_xor_b32_e32 v7, -1, v8 ; GFX6-NEXT: v_lshrrev_b32_e32 v6, 24, v1 -; GFX6-NEXT: v_and_b32_e32 v8, 7, v8 +; GFX6-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX6-NEXT: v_and_b32_e32 v3, 7, v8 +; GFX6-NEXT: v_and_b32_e32 v7, 7, v7 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 1, v4 ; GFX6-NEXT: v_bfe_u32 v1, v1, 16, 8 -; GFX6-NEXT: v_mov_b32_e32 v2, 0xff -; GFX6-NEXT: v_lshlrev_b32_e32 v4, v8, v4 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, v7, v1 -; GFX6-NEXT: v_xor_b32_e32 v7, -1, v9 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, v7, v4 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, v3, v1 ; GFX6-NEXT: v_or_b32_e32 v1, v4, v1 -; GFX6-NEXT: v_and_b32_e32 v4, 7, v9 -; GFX6-NEXT: v_and_b32_e32 v7, 7, v7 +; GFX6-NEXT: v_xor_b32_e32 v4, -1, v9 +; GFX6-NEXT: v_and_b32_e32 v3, 7, v9 +; GFX6-NEXT: v_and_b32_e32 v4, 7, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v5, 1, v5 -; GFX6-NEXT: v_and_b32_e32 v3, v3, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v5, v7, v5 -; GFX6-NEXT: v_lshrrev_b32_e32 v4, v4, v6 -; GFX6-NEXT: v_and_b32_e32 v0, v0, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GFX6-NEXT: v_and_b32_e32 v1, v1, v2 -; GFX6-NEXT: v_or_b32_e32 v4, v5, v4 -; GFX6-NEXT: v_or_b32_e32 v0, v0, v3 +; GFX6-NEXT: v_and_b32_e32 v2, v2, v11 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, v4, v5 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 
v3, v6 +; GFX6-NEXT: v_and_b32_e32 v0, v0, v11 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; GFX6-NEXT: v_and_b32_e32 v1, v1, v11 +; GFX6-NEXT: v_or_b32_e32 v3, v4, v3 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX6-NEXT: v_and_b32_e32 v1, v4, v2 +; GFX6-NEXT: v_and_b32_e32 v1, v3, v11 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 24, v1 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -1040,8 +1040,8 @@ ; GFX8-NEXT: v_and_b32_e32 v8, 7, v2 ; GFX8-NEXT: v_xor_b32_e32 v2, -1, v2 ; GFX8-NEXT: v_and_b32_e32 v2, 7, v2 -; GFX8-NEXT: v_lshlrev_b16_e32 v9, 1, v0 -; GFX8-NEXT: v_lshlrev_b16_e32 v2, v2, v9 +; GFX8-NEXT: v_lshlrev_b16_e32 v10, 1, v0 +; GFX8-NEXT: v_lshlrev_b16_e32 v2, v2, v10 ; GFX8-NEXT: v_lshrrev_b16_sdwa v8, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 8, v0 ; GFX8-NEXT: v_or_b32_e32 v2, v2, v8 @@ -1058,25 +1058,25 @@ ; GFX8-NEXT: v_mov_b32_e32 v6, 1 ; GFX8-NEXT: v_mov_b32_e32 v9, 0xff ; GFX8-NEXT: v_and_b32_e32 v5, 7, v5 -; GFX8-NEXT: v_lshlrev_b16_sdwa v8, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_lshlrev_b16_e32 v5, v5, v8 -; GFX8-NEXT: v_and_b32_sdwa v8, v1, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_lshrrev_b16_e32 v4, v4, v8 +; GFX8-NEXT: v_lshlrev_b16_sdwa v6, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_lshlrev_b16_e32 v5, v5, v6 +; GFX8-NEXT: v_and_b32_sdwa v6, v1, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_lshrrev_b16_e32 v4, v4, v6 ; GFX8-NEXT: v_or_b32_e32 v4, v5, v4 ; GFX8-NEXT: v_and_b32_e32 v5, 7, v7 -; GFX8-NEXT: v_xor_b32_e32 v7, -1, v7 -; GFX8-NEXT: v_and_b32_e32 v7, 7, v7 -; GFX8-NEXT: v_lshlrev_b16_sdwa v0, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 -; GFX8-NEXT: v_lshlrev_b16_e32 v0, v7, v0 +; GFX8-NEXT: v_xor_b32_e32 v6, -1, v7 +; GFX8-NEXT: v_mov_b32_e32 v7, 1 +; GFX8-NEXT: v_and_b32_e32 v6, 7, v6 +; GFX8-NEXT: v_lshlrev_b16_sdwa v0, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 +; GFX8-NEXT: v_lshlrev_b16_e32 v0, v6, v0 ; GFX8-NEXT: v_lshrrev_b16_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX8-NEXT: v_mov_b32_e32 v1, 8 -; GFX8-NEXT: s_movk_i32 s4, 0xff ; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX8-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_e32 v2, s4, v4 +; GFX8-NEXT: v_and_b32_e32 v2, v4, v9 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX8-NEXT: v_and_b32_e32 v0, v0, v9 ; GFX8-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 24, v0 ; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 @@ -1091,8 +1091,8 @@ ; GFX9-NEXT: v_and_b32_e32 v8, 7, v2 ; GFX9-NEXT: v_xor_b32_e32 v2, -1, v2 ; GFX9-NEXT: v_and_b32_e32 v2, 7, v2 -; GFX9-NEXT: v_lshlrev_b16_e32 v9, 1, v0 -; GFX9-NEXT: v_lshlrev_b16_e32 v2, v2, v9 +; GFX9-NEXT: v_lshlrev_b16_e32 v10, 1, v0 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, v2, v10 ; GFX9-NEXT: v_lshrrev_b16_sdwa v8, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_lshrrev_b32_e32 v3, 8, v0 ; GFX9-NEXT: v_or_b32_e32 v2, v2, v8 @@ -1101,32 +1101,32 @@ ; GFX9-NEXT: 
v_lshrrev_b32_e32 v4, 8, v1 ; GFX9-NEXT: v_and_b32_e32 v5, 7, v5 ; GFX9-NEXT: v_lshlrev_b16_e32 v3, 1, v3 +; GFX9-NEXT: s_mov_b32 s4, 1 ; GFX9-NEXT: v_lshlrev_b16_e32 v3, v5, v3 ; GFX9-NEXT: v_lshrrev_b16_sdwa v4, v8, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or_b32_e32 v3, v3, v4 -; GFX9-NEXT: v_and_b32_e32 v4, 7, v6 ; GFX9-NEXT: v_xor_b32_e32 v5, -1, v6 -; GFX9-NEXT: v_mov_b32_e32 v6, 1 ; GFX9-NEXT: v_mov_b32_e32 v9, 0xff +; GFX9-NEXT: v_or_b32_e32 v3, v3, v4 +; GFX9-NEXT: v_and_b32_e32 v4, 7, v6 ; GFX9-NEXT: v_and_b32_e32 v5, 7, v5 -; GFX9-NEXT: v_lshlrev_b16_sdwa v8, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshlrev_b16_e32 v5, v5, v8 -; GFX9-NEXT: v_and_b32_sdwa v8, v1, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_lshrrev_b16_e32 v4, v4, v8 +; GFX9-NEXT: v_lshlrev_b16_sdwa v6, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshlrev_b16_e32 v5, v5, v6 +; GFX9-NEXT: v_and_b32_sdwa v6, v1, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_lshrrev_b16_e32 v4, v4, v6 ; GFX9-NEXT: v_or_b32_e32 v4, v5, v4 ; GFX9-NEXT: v_and_b32_e32 v5, 7, v7 -; GFX9-NEXT: v_xor_b32_e32 v7, -1, v7 -; GFX9-NEXT: v_and_b32_e32 v7, 7, v7 -; GFX9-NEXT: v_lshlrev_b16_sdwa v0, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 -; GFX9-NEXT: v_lshlrev_b16_e32 v0, v7, v0 +; GFX9-NEXT: v_xor_b32_e32 v6, -1, v7 +; GFX9-NEXT: v_mov_b32_e32 v7, 1 +; GFX9-NEXT: v_and_b32_e32 v6, 7, v6 +; GFX9-NEXT: v_lshlrev_b16_sdwa v0, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 +; GFX9-NEXT: v_lshlrev_b16_e32 v0, v6, v0 ; GFX9-NEXT: v_lshrrev_b16_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 ; GFX9-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX9-NEXT: v_mov_b32_e32 v1, 8 -; GFX9-NEXT: s_movk_i32 s4, 0xff ; GFX9-NEXT: v_lshlrev_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_and_or_b32 v1, v2, s4, v1 -; GFX9-NEXT: v_and_b32_e32 v2, s4, v4 -; GFX9-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX9-NEXT: v_and_or_b32 v1, v2, v9, v1 +; GFX9-NEXT: v_and_b32_e32 v2, v4, v9 +; GFX9-NEXT: v_and_b32_e32 v0, v0, v9 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 24, v0 ; GFX9-NEXT: v_or3_b32 v0, v1, v2, v0 @@ -1136,42 +1136,41 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_lshrrev_b32_e32 v6, 8, v2 +; GFX10-NEXT: v_lshrrev_b32_e32 v7, 8, v2 ; GFX10-NEXT: v_lshrrev_b32_e32 v3, 8, v0 ; GFX10-NEXT: v_xor_b32_e32 v8, -1, v2 ; GFX10-NEXT: v_lshrrev_b32_e32 v10, 16, v2 -; GFX10-NEXT: v_lshrrev_b32_e32 v12, 24, v2 -; GFX10-NEXT: v_xor_b32_e32 v11, -1, v6 +; GFX10-NEXT: v_lshrrev_b32_e32 v11, 24, v2 +; GFX10-NEXT: v_xor_b32_e32 v12, -1, v7 ; GFX10-NEXT: v_lshlrev_b16 v3, 1, v3 ; GFX10-NEXT: v_lshrrev_b32_e32 v4, 16, v0 ; GFX10-NEXT: v_lshrrev_b32_e32 v5, 24, v0 -; GFX10-NEXT: v_lshrrev_b32_e32 v7, 8, v1 -; GFX10-NEXT: v_and_b32_e32 v11, 7, v11 +; GFX10-NEXT: v_lshrrev_b32_e32 v6, 8, v1 +; GFX10-NEXT: v_and_b32_e32 v12, 7, v12 ; GFX10-NEXT: v_and_b32_e32 v8, 7, v8 ; GFX10-NEXT: v_lshlrev_b16 v0, 1, v0 ; GFX10-NEXT: v_mov_b32_e32 v13, 0xff -; GFX10-NEXT: v_xor_b32_e32 v14, -1, v12 -; GFX10-NEXT: v_lshlrev_b16 v3, v11, v3 -; GFX10-NEXT: v_xor_b32_e32 v11, -1, v10 -; GFX10-NEXT: s_movk_i32 s4, 0xff +; GFX10-NEXT: 
v_xor_b32_e32 v14, -1, v10 +; GFX10-NEXT: v_lshlrev_b16 v3, v12, v3 +; GFX10-NEXT: v_xor_b32_e32 v12, -1, v11 ; GFX10-NEXT: v_lshrrev_b32_e32 v9, 24, v1 ; GFX10-NEXT: v_lshlrev_b16 v0, v8, v0 -; GFX10-NEXT: v_and_b32_e32 v8, s4, v1 -; GFX10-NEXT: v_and_b32_e32 v6, 7, v6 -; GFX10-NEXT: v_and_b32_e32 v7, s4, v7 +; GFX10-NEXT: v_and_b32_e32 v8, v1, v13 +; GFX10-NEXT: v_and_b32_e32 v7, 7, v7 +; GFX10-NEXT: v_and_b32_e32 v6, v6, v13 ; GFX10-NEXT: v_and_b32_e32 v10, 7, v10 -; GFX10-NEXT: v_and_b32_e32 v11, 7, v11 +; GFX10-NEXT: v_and_b32_e32 v14, 7, v14 ; GFX10-NEXT: v_lshlrev_b16 v4, 1, v4 ; GFX10-NEXT: v_and_b32_sdwa v1, v1, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX10-NEXT: v_and_b32_e32 v13, 7, v14 -; GFX10-NEXT: v_lshlrev_b16 v5, 1, v5 ; GFX10-NEXT: v_and_b32_e32 v12, 7, v12 +; GFX10-NEXT: v_lshlrev_b16 v5, 1, v5 +; GFX10-NEXT: v_and_b32_e32 v11, 7, v11 ; GFX10-NEXT: v_and_b32_e32 v2, 7, v2 -; GFX10-NEXT: v_lshrrev_b16 v6, v6, v7 -; GFX10-NEXT: v_lshlrev_b16 v4, v11, v4 +; GFX10-NEXT: v_lshrrev_b16 v6, v7, v6 +; GFX10-NEXT: v_lshlrev_b16 v4, v14, v4 ; GFX10-NEXT: v_lshrrev_b16 v1, v10, v1 -; GFX10-NEXT: v_lshlrev_b16 v5, v13, v5 -; GFX10-NEXT: v_lshrrev_b16 v7, v12, v9 +; GFX10-NEXT: v_lshlrev_b16 v5, v12, v5 +; GFX10-NEXT: v_lshrrev_b16 v7, v11, v9 ; GFX10-NEXT: v_lshrrev_b16 v2, v2, v8 ; GFX10-NEXT: v_or_b32_e32 v3, v3, v6 ; GFX10-NEXT: v_mov_b32_e32 v6, 8 @@ -1179,9 +1178,9 @@ ; GFX10-NEXT: v_or_b32_e32 v4, v5, v7 ; GFX10-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX10-NEXT: v_and_b32_e32 v1, s4, v1 -; GFX10-NEXT: v_and_b32_e32 v3, s4, v4 -; GFX10-NEXT: v_and_or_b32 v0, v0, s4, v2 +; GFX10-NEXT: v_and_b32_e32 v1, v1, v13 +; GFX10-NEXT: v_and_b32_e32 v3, v4, v13 +; GFX10-NEXT: v_and_or_b32 v0, v0, v13, v2 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 24, v3 ; GFX10-NEXT: v_or3_b32 v0, v0, v1, v2 @@ -1330,16 +1329,16 @@ ; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v3, 24 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v3, v3 ; GFX6-NEXT: v_mov_b32_e32 v4, 0xffffffe8 -; GFX6-NEXT: v_and_b32_e32 v2, 0xffffff, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX6-NEXT: v_mov_b32_e32 v5, 0xffffff +; GFX6-NEXT: v_and_b32_e32 v2, v2, v5 ; GFX6-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 ; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX6-NEXT: v_and_b32_e32 v1, v1, v5 ; GFX6-NEXT: v_mul_lo_u32 v4, v4, v3 ; GFX6-NEXT: v_mul_hi_u32 v4, v3, v4 ; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v4 ; GFX6-NEXT: v_mul_hi_u32 v3, v2, v3 -; GFX6-NEXT: v_mov_b32_e32 v4, 0xffffff -; GFX6-NEXT: v_and_b32_e32 v1, v1, v4 ; GFX6-NEXT: v_mul_lo_u32 v3, v3, 24 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v3 ; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, 24, v2 @@ -1349,8 +1348,8 @@ ; GFX6-NEXT: v_cmp_le_u32_e32 vcc, 24, v2 ; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc ; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 23, v2 -; GFX6-NEXT: v_and_b32_e32 v2, v2, v4 -; GFX6-NEXT: v_and_b32_e32 v3, v3, v4 +; GFX6-NEXT: v_and_b32_e32 v2, v2, v5 +; GFX6-NEXT: v_and_b32_e32 v3, v3, v5 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, v3, v0 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, v2, v1 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 @@ -1362,16 +1361,16 @@ ; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v3, 24 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v3, v3 ; GFX8-NEXT: v_mov_b32_e32 v4, 0xffffffe8 -; GFX8-NEXT: v_and_b32_e32 v2, 0xffffff, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX8-NEXT: v_mov_b32_e32 v5, 0xffffff +; 
GFX8-NEXT: v_and_b32_e32 v2, v2, v5 ; GFX8-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 ; GFX8-NEXT: v_cvt_u32_f32_e32 v3, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX8-NEXT: v_and_b32_e32 v1, v1, v5 ; GFX8-NEXT: v_mul_lo_u32 v4, v4, v3 ; GFX8-NEXT: v_mul_hi_u32 v4, v3, v4 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v4 ; GFX8-NEXT: v_mul_hi_u32 v3, v2, v3 -; GFX8-NEXT: v_mov_b32_e32 v4, 0xffffff -; GFX8-NEXT: v_and_b32_e32 v1, v1, v4 ; GFX8-NEXT: v_mul_lo_u32 v3, v3, 24 ; GFX8-NEXT: v_sub_u32_e32 v2, vcc, v2, v3 ; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, 24, v2 @@ -1381,8 +1380,8 @@ ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, 24, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc ; GFX8-NEXT: v_sub_u32_e32 v3, vcc, 23, v2 -; GFX8-NEXT: v_and_b32_e32 v2, v2, v4 -; GFX8-NEXT: v_and_b32_e32 v3, v3, v4 +; GFX8-NEXT: v_and_b32_e32 v2, v2, v5 +; GFX8-NEXT: v_and_b32_e32 v3, v3, v5 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, v3, v0 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, v2, v1 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 @@ -1394,16 +1393,16 @@ ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v3, 24 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v3 ; GFX9-NEXT: v_mov_b32_e32 v4, 0xffffffe8 -; GFX9-NEXT: v_and_b32_e32 v2, 0xffffff, v2 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX9-NEXT: v_mov_b32_e32 v5, 0xffffff +; GFX9-NEXT: v_and_b32_e32 v2, v2, v5 ; GFX9-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 ; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v3 +; GFX9-NEXT: v_and_b32_e32 v1, v1, v5 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-NEXT: v_mul_lo_u32 v4, v4, v3 ; GFX9-NEXT: v_mul_hi_u32 v4, v3, v4 ; GFX9-NEXT: v_add_u32_e32 v3, v3, v4 ; GFX9-NEXT: v_mul_hi_u32 v3, v2, v3 -; GFX9-NEXT: v_mov_b32_e32 v4, 0xffffff -; GFX9-NEXT: v_and_b32_e32 v1, v1, v4 ; GFX9-NEXT: v_mul_lo_u32 v3, v3, 24 ; GFX9-NEXT: v_sub_u32_e32 v2, v2, v3 ; GFX9-NEXT: v_subrev_u32_e32 v3, 24, v2 @@ -1413,8 +1412,8 @@ ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, 24, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc ; GFX9-NEXT: v_sub_u32_e32 v3, 23, v2 -; GFX9-NEXT: v_and_b32_e32 v2, v2, v4 -; GFX9-NEXT: v_and_b32_e32 v3, v3, v4 +; GFX9-NEXT: v_and_b32_e32 v2, v2, v5 +; GFX9-NEXT: v_and_b32_e32 v3, v3, v5 ; GFX9-NEXT: v_lshrrev_b32_e32 v1, v2, v1 ; GFX9-NEXT: v_lshl_or_b32 v0, v0, v3, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -1424,17 +1423,17 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v3, 24 -; GFX10-NEXT: v_and_b32_e32 v2, 0xffffff, v2 +; GFX10-NEXT: v_mov_b32_e32 v5, 0xffffff ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v3, v3 +; GFX10-NEXT: v_and_b32_e32 v2, v2, v5 +; GFX10-NEXT: v_and_b32_e32 v1, v1, v5 ; GFX10-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 ; GFX10-NEXT: v_cvt_u32_f32_e32 v3, v3 ; GFX10-NEXT: v_mul_lo_u32 v4, 0xffffffe8, v3 ; GFX10-NEXT: v_mul_hi_u32 v4, v3, v4 ; GFX10-NEXT: v_add_nc_u32_e32 v3, v3, v4 -; GFX10-NEXT: v_mov_b32_e32 v4, 0xffffff ; GFX10-NEXT: v_mul_hi_u32 v3, v2, v3 -; GFX10-NEXT: v_and_b32_e32 v1, v1, v4 ; GFX10-NEXT: v_mul_lo_u32 v3, v3, 24 ; GFX10-NEXT: v_sub_nc_u32_e32 v2, v2, v3 ; GFX10-NEXT: v_subrev_nc_u32_e32 v3, 24, v2 @@ -1444,8 +1443,8 @@ ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v2 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo ; GFX10-NEXT: v_sub_nc_u32_e32 v3, 23, v2 -; GFX10-NEXT: v_and_b32_e32 v2, v2, v4 -; GFX10-NEXT: v_and_b32_e32 v3, v3, v4 +; GFX10-NEXT: v_and_b32_e32 v2, v2, v5 +; GFX10-NEXT: v_and_b32_e32 v3, v3, v5 ; GFX10-NEXT: v_lshrrev_b32_e32 v1, v2, v1 ; GFX10-NEXT: v_lshl_or_b32 v0, v0, v3, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ 
-1461,16 +1460,15 @@ ; GFX6-NEXT: s_movk_i32 s9, 0xff ; GFX6-NEXT: s_mov_b32 s11, 0x80008 ; GFX6-NEXT: s_lshr_b32 s6, s0, 16 -; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 -; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX6-NEXT: s_lshr_b32 s7, s0, 24 ; GFX6-NEXT: s_lshr_b32 s8, s1, 8 ; GFX6-NEXT: s_and_b32 s10, s0, s9 ; GFX6-NEXT: s_bfe_u32 s0, s0, s11 ; GFX6-NEXT: s_and_b32 s1, s1, s9 +; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: s_lshl_b32 s0, s0, 8 ; GFX6-NEXT: s_lshl_b32 s1, s1, 8 -; GFX6-NEXT: v_mov_b32_e32 v1, 0xffffffe8 +; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX6-NEXT: s_or_b32 s0, s10, s0 ; GFX6-NEXT: s_or_b32 s1, s7, s1 ; GFX6-NEXT: s_and_b32 s7, s8, s9 @@ -1478,19 +1476,19 @@ ; GFX6-NEXT: s_lshr_b32 s10, s2, 24 ; GFX6-NEXT: s_and_b32 s13, s2, s9 ; GFX6-NEXT: s_bfe_u32 s2, s2, s11 -; GFX6-NEXT: v_mul_lo_u32 v2, v1, v0 ; GFX6-NEXT: s_lshl_b32 s2, s2, 8 ; GFX6-NEXT: s_and_b32 s8, s8, s9 ; GFX6-NEXT: s_or_b32 s2, s13, s2 ; GFX6-NEXT: s_bfe_u32 s8, s8, 0x100000 +; GFX6-NEXT: v_mov_b32_e32 v1, 0xffffffe8 ; GFX6-NEXT: s_lshr_b32 s12, s3, 8 ; GFX6-NEXT: s_bfe_u32 s2, s2, 0x100000 ; GFX6-NEXT: s_lshl_b32 s8, s8, 16 ; GFX6-NEXT: s_and_b32 s3, s3, s9 +; GFX6-NEXT: v_mul_lo_u32 v2, v1, v0 ; GFX6-NEXT: s_or_b32 s2, s2, s8 ; GFX6-NEXT: s_lshl_b32 s3, s3, 8 ; GFX6-NEXT: s_and_b32 s8, s12, s9 -; GFX6-NEXT: v_mul_hi_u32 v2, v0, v2 ; GFX6-NEXT: s_or_b32 s3, s10, s3 ; GFX6-NEXT: s_bfe_u32 s8, s8, 0x100000 ; GFX6-NEXT: s_bfe_u32 s3, s3, 0x100000 @@ -1500,74 +1498,74 @@ ; GFX6-NEXT: s_lshr_b32 s10, s4, 24 ; GFX6-NEXT: s_and_b32 s13, s4, s9 ; GFX6-NEXT: s_bfe_u32 s4, s4, s11 +; GFX6-NEXT: v_mul_hi_u32 v2, v0, v2 ; GFX6-NEXT: s_lshl_b32 s4, s4, 8 ; GFX6-NEXT: s_and_b32 s8, s8, s9 -; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v2, 24 +; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v3, 24 ; GFX6-NEXT: s_or_b32 s4, s13, s4 ; GFX6-NEXT: s_bfe_u32 s8, s8, 0x100000 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v2 +; GFX6-NEXT: v_rcp_iflag_f32_e32 v3, v3 ; GFX6-NEXT: s_bfe_u32 s4, s4, 0x100000 ; GFX6-NEXT: s_lshl_b32 s8, s8, 16 ; GFX6-NEXT: s_or_b32 s4, s4, s8 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GFX6-NEXT: v_mul_hi_u32 v0, s4, v0 -; GFX6-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 -; GFX6-NEXT: v_cvt_u32_f32_e32 v2, v2 +; GFX6-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 +; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v3 ; GFX6-NEXT: s_lshr_b32 s12, s5, 8 ; GFX6-NEXT: v_mul_lo_u32 v0, v0, 24 ; GFX6-NEXT: s_and_b32 s5, s5, s9 -; GFX6-NEXT: v_mul_lo_u32 v1, v1, v2 +; GFX6-NEXT: v_mul_lo_u32 v1, v1, v3 ; GFX6-NEXT: s_lshl_b32 s5, s5, 8 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 -; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, 24, v0 +; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, 24, v0 ; GFX6-NEXT: v_cmp_le_u32_e32 vcc, 24, v0 -; GFX6-NEXT: v_mul_hi_u32 v1, v2, v1 +; GFX6-NEXT: v_mul_hi_u32 v1, v3, v1 ; GFX6-NEXT: s_and_b32 s8, s12, s9 -; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc ; GFX6-NEXT: s_or_b32 s5, s10, s5 ; GFX6-NEXT: s_bfe_u32 s8, s8, 0x100000 -; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, 24, v0 +; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, 24, v0 ; GFX6-NEXT: s_bfe_u32 s5, s5, 0x100000 ; GFX6-NEXT: s_lshl_b32 s8, s8, 16 ; GFX6-NEXT: v_cmp_le_u32_e32 vcc, 24, v0 ; GFX6-NEXT: s_or_b32 s5, s5, s8 -; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX6-NEXT: v_add_i32_e32 v1, vcc, v2, v1 +; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc +; GFX6-NEXT: v_add_i32_e32 v1, vcc, v3, v1 ; GFX6-NEXT: v_mul_hi_u32 v1, s5, v1 ; GFX6-NEXT: s_and_b32 s6, s6, s9 ; GFX6-NEXT: s_bfe_u32 s0, 
s0, 0x100000
; GFX6-NEXT: s_bfe_u32 s6, s6, 0x100000
; GFX6-NEXT: v_mul_lo_u32 v1, v1, 24
-; GFX6-NEXT: s_mov_b32 s8, 0xffffff
-; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 23, v0
+; GFX6-NEXT: v_mov_b32_e32 v2, 0xffffff
+; GFX6-NEXT: v_sub_i32_e32 v4, vcc, 23, v0
; GFX6-NEXT: s_lshl_b32 s4, s6, 17
; GFX6-NEXT: s_lshl_b32 s0, s0, 1
; GFX6-NEXT: s_or_b32 s0, s4, s0
-; GFX6-NEXT: v_and_b32_e32 v2, s8, v3
-; GFX6-NEXT: v_and_b32_e32 v0, s8, v0
-; GFX6-NEXT: v_lshl_b32_e32 v2, s0, v2
+; GFX6-NEXT: v_and_b32_e32 v3, v4, v2
+; GFX6-NEXT: v_and_b32_e32 v0, v0, v2
+; GFX6-NEXT: v_lshl_b32_e32 v3, s0, v3
; GFX6-NEXT: v_lshr_b32_e32 v0, s2, v0
; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s5, v1
-; GFX6-NEXT: v_or_b32_e32 v0, v2, v0
-; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, 24, v1
+; GFX6-NEXT: v_or_b32_e32 v0, v3, v0
+; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, 24, v1
; GFX6-NEXT: v_cmp_le_u32_e32 vcc, 24, v1
-; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
-; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, 24, v1
+; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, 24, v1
; GFX6-NEXT: v_cmp_le_u32_e32 vcc, 24, v1
; GFX6-NEXT: s_bfe_u32 s1, s1, 0x100000
; GFX6-NEXT: s_bfe_u32 s7, s7, 0x100000
-; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
-; GFX6-NEXT: v_mov_b32_e32 v4, 0xffffff
-; GFX6-NEXT: v_sub_i32_e32 v2, vcc, 23, v1
+; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 23, v1
; GFX6-NEXT: s_lshl_b32 s0, s7, 17
; GFX6-NEXT: s_lshl_b32 s1, s1, 1
; GFX6-NEXT: s_or_b32 s0, s0, s1
-; GFX6-NEXT: v_and_b32_e32 v2, v2, v4
-; GFX6-NEXT: v_and_b32_e32 v1, v1, v4
-; GFX6-NEXT: v_lshl_b32_e32 v2, s0, v2
+; GFX6-NEXT: v_and_b32_e32 v3, v3, v2
+; GFX6-NEXT: v_and_b32_e32 v1, v1, v2
+; GFX6-NEXT: v_lshl_b32_e32 v3, s0, v3
; GFX6-NEXT: v_lshr_b32_e32 v1, s3, v1
+; GFX6-NEXT: v_or_b32_e32 v1, v3, v1
; GFX6-NEXT: v_bfe_u32 v3, v0, 8, 8
-; GFX6-NEXT: v_or_b32_e32 v1, v2, v1
; GFX6-NEXT: v_and_b32_e32 v2, s9, v0
; GFX6-NEXT: v_lshlrev_b32_e32 v3, 8, v3
; GFX6-NEXT: v_bfe_u32 v0, v0, 16, 8
@@ -1587,20 +1585,18 @@
;
; GFX8-LABEL: s_fshr_v2i24:
; GFX8: ; %bb.0:
-; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v0, 24
-; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0
; GFX8-NEXT: s_movk_i32 s10, 0xff
+; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v0, 24
; GFX8-NEXT: s_lshr_b32 s9, s1, 8
; GFX8-NEXT: s_bfe_u32 s11, 8, 0x100000
; GFX8-NEXT: s_and_b32 s1, s1, s10
+; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0
; GFX8-NEXT: s_lshr_b32 s6, s0, 8
; GFX8-NEXT: s_lshr_b32 s8, s0, 24
; GFX8-NEXT: s_lshl_b32 s1, s1, s11
-; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX8-NEXT: s_and_b32 s6, s6, s10
; GFX8-NEXT: s_or_b32 s1, s8, s1
; GFX8-NEXT: s_lshr_b32 s8, s2, 8
-; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0
; GFX8-NEXT: s_lshr_b32 s7, s0, 16
; GFX8-NEXT: s_and_b32 s0, s0, s10
; GFX8-NEXT: s_lshl_b32 s6, s6, s11
@@ -1612,11 +1608,11 @@
; GFX8-NEXT: s_lshr_b32 s12, s2, 24
; GFX8-NEXT: s_and_b32 s2, s2, s10
; GFX8-NEXT: s_lshl_b32 s8, s8, s11
+; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX8-NEXT: s_or_b32 s2, s2, s8
; GFX8-NEXT: s_and_b32 s8, s9, s10
-; GFX8-NEXT: v_mov_b32_e32 v1, 0xffffffe8
+; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0
; GFX8-NEXT: s_bfe_u32 s8, s8, 0x100000
-; GFX8-NEXT: v_mul_lo_u32 v2, v1, v0
; GFX8-NEXT: s_lshr_b32 s13, s3, 8
; GFX8-NEXT: s_bfe_u32 s2, s2, 0x100000
; GFX8-NEXT: s_lshl_b32 s8, s8, 16
@@ -1624,11 +1620,12 @@
; GFX8-NEXT: s_or_b32 s2, s2, s8
; GFX8-NEXT: s_lshl_b32 s3, s3, s11
; GFX8-NEXT: s_and_b32 s8, s13, s10
+; GFX8-NEXT: v_mov_b32_e32 v1, 0xffffffe8
; GFX8-NEXT: s_or_b32 s3, s12, s3
; GFX8-NEXT: s_bfe_u32 s8, s8, 0x100000
+; GFX8-NEXT: v_mul_lo_u32 v2, v1, v0
; GFX8-NEXT: s_bfe_u32 s3, s3, 0x100000
; GFX8-NEXT: s_lshl_b32 s8, s8, 16
-; GFX8-NEXT: v_mul_hi_u32 v2, v0, v2
; GFX8-NEXT: s_or_b32 s3, s3, s8
; GFX8-NEXT: s_lshr_b32 s8, s4, 8
; GFX8-NEXT: s_and_b32 s8, s8, s10
@@ -1636,78 +1633,78 @@
; GFX8-NEXT: s_lshr_b32 s12, s4, 24
; GFX8-NEXT: s_and_b32 s4, s4, s10
; GFX8-NEXT: s_lshl_b32 s8, s8, s11
+; GFX8-NEXT: v_mul_hi_u32 v2, v0, v2
; GFX8-NEXT: s_or_b32 s4, s4, s8
; GFX8-NEXT: s_and_b32 s8, s9, s10
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2
-; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v2, 24
+; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v3, 24
; GFX8-NEXT: s_bfe_u32 s8, s8, 0x100000
-; GFX8-NEXT: v_rcp_iflag_f32_e32 v2, v2
+; GFX8-NEXT: v_rcp_iflag_f32_e32 v3, v3
; GFX8-NEXT: s_bfe_u32 s4, s4, 0x100000
; GFX8-NEXT: s_lshl_b32 s8, s8, 16
; GFX8-NEXT: s_or_b32 s4, s4, s8
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; GFX8-NEXT: v_mul_hi_u32 v0, s4, v0
-; GFX8-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2
-; GFX8-NEXT: v_cvt_u32_f32_e32 v2, v2
+; GFX8-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3
+; GFX8-NEXT: v_cvt_u32_f32_e32 v3, v3
; GFX8-NEXT: s_lshr_b32 s13, s5, 8
; GFX8-NEXT: v_mul_lo_u32 v0, v0, 24
; GFX8-NEXT: s_and_b32 s5, s5, s10
-; GFX8-NEXT: v_mul_lo_u32 v1, v1, v2
+; GFX8-NEXT: v_mul_lo_u32 v1, v1, v3
; GFX8-NEXT: s_lshl_b32 s5, s5, s11
; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s4, v0
-; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, 24, v0
+; GFX8-NEXT: v_subrev_u32_e32 v4, vcc, 24, v0
; GFX8-NEXT: v_cmp_le_u32_e32 vcc, 24, v0
-; GFX8-NEXT: v_mul_hi_u32 v1, v2, v1
+; GFX8-NEXT: v_mul_hi_u32 v1, v3, v1
; GFX8-NEXT: s_and_b32 s8, s13, s10
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
; GFX8-NEXT: s_or_b32 s5, s12, s5
; GFX8-NEXT: s_bfe_u32 s8, s8, 0x100000
-; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, 24, v0
+; GFX8-NEXT: v_subrev_u32_e32 v4, vcc, 24, v0
; GFX8-NEXT: s_bfe_u32 s5, s5, 0x100000
; GFX8-NEXT: s_lshl_b32 s8, s8, 16
; GFX8-NEXT: v_cmp_le_u32_e32 vcc, 24, v0
; GFX8-NEXT: s_or_b32 s5, s5, s8
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
-; GFX8-NEXT: v_add_u32_e32 v1, vcc, v2, v1
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v1
; GFX8-NEXT: v_mul_hi_u32 v1, s5, v1
; GFX8-NEXT: s_bfe_u32 s0, s0, 0x100000
; GFX8-NEXT: s_bfe_u32 s6, s6, 0x100000
-; GFX8-NEXT: s_mov_b32 s8, 0xffffff
+; GFX8-NEXT: v_mov_b32_e32 v2, 0xffffff
; GFX8-NEXT: v_mul_lo_u32 v1, v1, 24
-; GFX8-NEXT: v_sub_u32_e32 v3, vcc, 23, v0
+; GFX8-NEXT: v_sub_u32_e32 v4, vcc, 23, v0
; GFX8-NEXT: s_lshl_b32 s4, s6, 17
; GFX8-NEXT: s_lshl_b32 s0, s0, 1
; GFX8-NEXT: s_or_b32 s0, s4, s0
-; GFX8-NEXT: v_and_b32_e32 v2, s8, v3
-; GFX8-NEXT: v_and_b32_e32 v0, s8, v0
-; GFX8-NEXT: v_lshlrev_b32_e64 v2, v2, s0
+; GFX8-NEXT: v_and_b32_e32 v3, v4, v2
+; GFX8-NEXT: v_and_b32_e32 v0, v0, v2
+; GFX8-NEXT: v_lshlrev_b32_e64 v3, v3, s0
; GFX8-NEXT: v_lshrrev_b32_e64 v0, v0, s2
; GFX8-NEXT: v_sub_u32_e32 v1, vcc, s5, v1
-; GFX8-NEXT: v_or_b32_e32 v0, v2, v0
-; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, 24, v1
+; GFX8-NEXT: v_or_b32_e32 v0, v3, v0
+; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, 24, v1
; GFX8-NEXT: v_cmp_le_u32_e32 vcc, 24, v1
-; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
-; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, 24, v1
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, 24, v1
; GFX8-NEXT: v_cmp_le_u32_e32 vcc, 24, v1
; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000
; GFX8-NEXT: s_bfe_u32 s7, s7, 0x100000
-; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
-; GFX8-NEXT: v_mov_b32_e32 v4, 0xffffff
-; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 23, v1
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX8-NEXT: v_sub_u32_e32 v3, vcc, 23, v1
; GFX8-NEXT: s_lshl_b32 s0, s7, 17
; GFX8-NEXT: s_lshl_b32 s1, s1, 1
; GFX8-NEXT: s_or_b32 s0, s0, s1
-; GFX8-NEXT: v_and_b32_e32 v2, v2, v4
-; GFX8-NEXT: v_and_b32_e32 v1, v1, v4
-; GFX8-NEXT: v_lshlrev_b32_e64 v2, v2, s0
+; GFX8-NEXT: v_and_b32_e32 v3, v3, v2
+; GFX8-NEXT: v_and_b32_e32 v1, v1, v2
+; GFX8-NEXT: v_lshlrev_b32_e64 v3, v3, s0
; GFX8-NEXT: v_lshrrev_b32_e64 v1, v1, s3
-; GFX8-NEXT: v_or_b32_e32 v1, v2, v1
; GFX8-NEXT: v_mov_b32_e32 v2, 8
+; GFX8-NEXT: v_or_b32_e32 v1, v3, v1
; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
; GFX8-NEXT: v_mov_b32_e32 v4, 16
; GFX8-NEXT: v_or_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
; GFX8-NEXT: v_or_b32_e32 v0, v3, v0
-; GFX8-NEXT: v_and_b32_e32 v3, s10, v1
+; GFX8-NEXT: v_and_b32_e32 v3, 0xff, v1
; GFX8-NEXT: v_lshlrev_b32_e32 v3, 24, v3
; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
; GFX8-NEXT: v_or_b32_e32 v0, v0, v3
@@ -1794,50 +1791,50 @@
; GFX9-NEXT: v_add_u32_e32 v1, v2, v1
; GFX9-NEXT: v_mul_hi_u32 v1, s5, v1
; GFX9-NEXT: v_sub_u32_e32 v0, s4, v0
-; GFX9-NEXT: v_subrev_u32_e32 v3, 24, v0
+; GFX9-NEXT: v_subrev_u32_e32 v4, 24, v0
; GFX9-NEXT: v_cmp_le_u32_e32 vcc, 24, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
; GFX9-NEXT: v_mul_lo_u32 v1, v1, 24
-; GFX9-NEXT: v_subrev_u32_e32 v3, 24, v0
+; GFX9-NEXT: v_subrev_u32_e32 v4, 24, v0
; GFX9-NEXT: v_cmp_le_u32_e32 vcc, 24, v0
; GFX9-NEXT: s_bfe_u32 s0, s0, 0x100000
; GFX9-NEXT: s_bfe_u32 s7, s7, 0x100000
-; GFX9-NEXT: s_mov_b32 s10, 0xffffff
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
-; GFX9-NEXT: v_sub_u32_e32 v3, 23, v0
+; GFX9-NEXT: v_mov_b32_e32 v3, 0xffffff
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
+; GFX9-NEXT: v_sub_u32_e32 v4, 23, v0
; GFX9-NEXT: s_lshl_b32 s4, s7, 17
; GFX9-NEXT: s_lshl_b32 s0, s0, 1
-; GFX9-NEXT: v_and_b32_e32 v0, s10, v0
+; GFX9-NEXT: v_and_b32_e32 v0, v0, v3
; GFX9-NEXT: s_or_b32 s0, s4, s0
-; GFX9-NEXT: v_and_b32_e32 v3, s10, v3
+; GFX9-NEXT: v_and_b32_e32 v2, v4, v3
; GFX9-NEXT: v_lshrrev_b32_e64 v0, v0, s2
; GFX9-NEXT: v_sub_u32_e32 v1, s5, v1
-; GFX9-NEXT: v_lshl_or_b32 v0, s0, v3, v0
-; GFX9-NEXT: v_subrev_u32_e32 v3, 24, v1
+; GFX9-NEXT: v_lshl_or_b32 v0, s0, v2, v0
+; GFX9-NEXT: v_subrev_u32_e32 v2, 24, v1
; GFX9-NEXT: v_cmp_le_u32_e32 vcc, 24, v1
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
-; GFX9-NEXT: v_subrev_u32_e32 v3, 24, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
+; GFX9-NEXT: v_subrev_u32_e32 v2, 24, v1
; GFX9-NEXT: v_cmp_le_u32_e32 vcc, 24, v1
; GFX9-NEXT: s_bfe_u32 s1, s1, 0x100000
; GFX9-NEXT: s_bfe_u32 s9, s9, 0x100000
-; GFX9-NEXT: v_mov_b32_e32 v2, 0xffffff
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
-; GFX9-NEXT: v_sub_u32_e32 v3, 23, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
+; GFX9-NEXT: v_sub_u32_e32 v2, 23, v1
; GFX9-NEXT: s_lshl_b32 s0, s9, 17
; GFX9-NEXT: s_lshl_b32 s1, s1, 1
-; GFX9-NEXT: v_and_b32_e32 v1, v1, v2
+; GFX9-NEXT: v_and_b32_e32 v1, v1, v3
+; GFX9-NEXT: s_mov_b32 s6, 8
; GFX9-NEXT: s_or_b32 s0, s0, s1
-; GFX9-NEXT: v_and_b32_e32 v3, v3, v2
+; GFX9-NEXT: v_and_b32_e32 v2, v2, v3
; GFX9-NEXT: v_lshrrev_b32_e64 v1, v1, s3
-; GFX9-NEXT: s_mov_b32 s6, 8
-; GFX9-NEXT: v_lshl_or_b32 v1, s0, v3, v1
+; GFX9-NEXT: v_lshl_or_b32 v1, s0, v2, v1
+; GFX9-NEXT: v_mov_b32_e32 v2, 0xff
+; GFX9-NEXT: v_lshlrev_b32_sdwa v3, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
; GFX9-NEXT: s_mov_b32 s8, 16
-; GFX9-NEXT: v_lshlrev_b32_sdwa v2, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX9-NEXT: v_and_b32_e32 v3, s12, v1
-; GFX9-NEXT: v_and_or_b32 v2, v0, s12, v2
+; GFX9-NEXT: v_and_or_b32 v3, v0, v2, v3
+; GFX9-NEXT: v_and_b32_e32 v2, v1, v2
; GFX9-NEXT: v_lshlrev_b32_sdwa v0, s8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX9-NEXT: v_lshlrev_b32_e32 v3, 24, v3
-; GFX9-NEXT: v_or3_b32 v0, v2, v0, v3
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 24, v2
+; GFX9-NEXT: v_or3_b32 v0, v3, v0, v2
; GFX9-NEXT: v_bfe_u32 v2, v1, 8, 8
; GFX9-NEXT: v_bfe_u32 v1, v1, 16, 8
; GFX9-NEXT: v_lshl_or_b32 v1, v1, 8, v2
@@ -1849,128 +1846,129 @@
; GFX10: ; %bb.0:
; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, 24
; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v1, 24
-; GFX10-NEXT: s_movk_i32 s9, 0xff
-; GFX10-NEXT: s_lshr_b32 s12, s4, 8
+; GFX10-NEXT: v_mov_b32_e32 v2, 0xffffffe8
+; GFX10-NEXT: s_movk_i32 s8, 0xff
; GFX10-NEXT: s_bfe_u32 s10, 8, 0x100000
; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0
; GFX10-NEXT: v_rcp_iflag_f32_e32 v1, v1
-; GFX10-NEXT: s_lshr_b32 s13, s4, 16
-; GFX10-NEXT: s_and_b32 s12, s12, s9
-; GFX10-NEXT: s_lshr_b32 s14, s4, 24
-; GFX10-NEXT: s_and_b32 s4, s4, s9
-; GFX10-NEXT: s_lshl_b32 s12, s12, s10
-; GFX10-NEXT: s_and_b32 s13, s13, s9
-; GFX10-NEXT: s_or_b32 s4, s4, s12
-; GFX10-NEXT: s_bfe_u32 s12, s13, 0x100000
-; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
-; GFX10-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1
-; GFX10-NEXT: s_lshr_b32 s15, s5, 8
-; GFX10-NEXT: s_bfe_u32 s4, s4, 0x100000
-; GFX10-NEXT: s_lshl_b32 s12, s12, 16
-; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0
-; GFX10-NEXT: v_cvt_u32_f32_e32 v1, v1
-; GFX10-NEXT: s_and_b32 s5, s5, s9
-; GFX10-NEXT: s_or_b32 s4, s4, s12
-; GFX10-NEXT: s_lshl_b32 s5, s5, s10
-; GFX10-NEXT: v_mul_lo_u32 v2, 0xffffffe8, v0
-; GFX10-NEXT: v_mul_lo_u32 v3, 0xffffffe8, v1
-; GFX10-NEXT: s_and_b32 s12, s15, s9
-; GFX10-NEXT: s_or_b32 s5, s14, s5
-; GFX10-NEXT: s_bfe_u32 s12, s12, 0x100000
-; GFX10-NEXT: s_bfe_u32 s5, s5, 0x100000
-; GFX10-NEXT: s_lshl_b32 s12, s12, 16
; GFX10-NEXT: s_lshr_b32 s11, s1, 8
-; GFX10-NEXT: v_mul_hi_u32 v2, v0, v2
-; GFX10-NEXT: s_or_b32 s5, s5, s12
-; GFX10-NEXT: s_and_b32 s1, s1, s9
+; GFX10-NEXT: s_and_b32 s1, s1, s8
; GFX10-NEXT: s_lshr_b32 s6, s0, 8
-; GFX10-NEXT: s_lshr_b32 s8, s0, 24
+; GFX10-NEXT: s_lshr_b32 s9, s0, 24
; GFX10-NEXT: s_lshl_b32 s1, s1, s10
-; GFX10-NEXT: s_and_b32 s6, s6, s9
-; GFX10-NEXT: s_or_b32 s1, s8, s1
-; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v2
-; GFX10-NEXT: v_mul_hi_u32 v2, v1, v3
-; GFX10-NEXT: s_lshr_b32 s8, s2, 8
+; GFX10-NEXT: s_and_b32 s6, s6, s8
+; GFX10-NEXT: s_or_b32 s1, s9, s1
+; GFX10-NEXT: s_lshr_b32 s9, s4, 8
+; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
+; GFX10-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1
; GFX10-NEXT: s_lshr_b32 s7, s0, 16
-; GFX10-NEXT: s_and_b32 s0, s0, s9
-; GFX10-NEXT: v_mul_hi_u32 v0, s4, v0
+; GFX10-NEXT: s_and_b32 s0, s0, s8
; GFX10-NEXT: s_lshl_b32 s6, s6, s10
-; GFX10-NEXT: s_and_b32 s8, s8, s9
+; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0
+; GFX10-NEXT: v_cvt_u32_f32_e32 v1, v1
+; GFX10-NEXT: s_and_b32 s9, s9, s8
; GFX10-NEXT: s_or_b32 s0, s0, s6
+; GFX10-NEXT: s_and_b32 s6, s7, s8
+; GFX10-NEXT: v_mul_lo_u32 v3, v2, v0
+; GFX10-NEXT: v_mul_lo_u32 v2, v2, v1
+; GFX10-NEXT: s_and_b32 s7, s11, s8
+; GFX10-NEXT: s_lshr_b32 s11, s4, 16
+; GFX10-NEXT: s_lshr_b32 s12, s4, 24
+; GFX10-NEXT: s_and_b32 s4, s4, s8
+; GFX10-NEXT: s_lshl_b32 s9, s9, s10
+; GFX10-NEXT: s_lshr_b32 s13, s5, 8
+; GFX10-NEXT: v_mul_hi_u32 v3, v0, v3
+; GFX10-NEXT: s_or_b32 s4, s4, s9
+; GFX10-NEXT: s_and_b32 s9, s11, s8
+; GFX10-NEXT: v_mul_hi_u32 v2, v1, v2
+; GFX10-NEXT: s_bfe_u32 s9, s9, 0x100000
+; GFX10-NEXT: s_bfe_u32 s4, s4, 0x100000
+; GFX10-NEXT: s_lshl_b32 s9, s9, 16
+; GFX10-NEXT: s_and_b32 s5, s5, s8
+; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v3
+; GFX10-NEXT: s_lshl_b32 s5, s5, s10
+; GFX10-NEXT: s_or_b32 s4, s4, s9
+; GFX10-NEXT: s_and_b32 s9, s13, s8
+; GFX10-NEXT: s_or_b32 s5, s12, s5
+; GFX10-NEXT: s_bfe_u32 s9, s9, 0x100000
; GFX10-NEXT: v_add_nc_u32_e32 v1, v1, v2
-; GFX10-NEXT: s_and_b32 s6, s7, s9
-; GFX10-NEXT: s_and_b32 s7, s11, s9
+; GFX10-NEXT: v_mul_hi_u32 v0, s4, v0
+; GFX10-NEXT: s_bfe_u32 s5, s5, 0x100000
+; GFX10-NEXT: s_lshl_b32 s9, s9, 16
; GFX10-NEXT: s_lshr_b32 s11, s2, 16
-; GFX10-NEXT: v_mul_lo_u32 v0, v0, 24
+; GFX10-NEXT: s_or_b32 s5, s5, s9
+; GFX10-NEXT: s_lshr_b32 s9, s2, 8
; GFX10-NEXT: v_mul_hi_u32 v1, s5, v1
-; GFX10-NEXT: s_lshr_b32 s13, s2, 24
-; GFX10-NEXT: s_and_b32 s2, s2, s9
-; GFX10-NEXT: s_lshl_b32 s8, s8, s10
-; GFX10-NEXT: s_lshr_b32 s12, s3, 8
-; GFX10-NEXT: s_or_b32 s2, s2, s8
-; GFX10-NEXT: s_and_b32 s8, s11, s9
-; GFX10-NEXT: v_sub_nc_u32_e32 v0, s4, v0
+; GFX10-NEXT: s_and_b32 s9, s9, s8
+; GFX10-NEXT: v_mul_lo_u32 v0, v0, 24
+; GFX10-NEXT: s_lshr_b32 s12, s2, 24
+; GFX10-NEXT: s_and_b32 s2, s2, s8
+; GFX10-NEXT: s_lshl_b32 s9, s9, s10
+; GFX10-NEXT: s_lshr_b32 s13, s3, 8
+; GFX10-NEXT: s_or_b32 s2, s2, s9
; GFX10-NEXT: v_mul_lo_u32 v1, v1, 24
-; GFX10-NEXT: s_bfe_u32 s4, s8, 0x100000
+; GFX10-NEXT: s_and_b32 s9, s11, s8
+; GFX10-NEXT: v_sub_nc_u32_e32 v0, s4, v0
+; GFX10-NEXT: s_bfe_u32 s9, s9, 0x100000
; GFX10-NEXT: s_bfe_u32 s2, s2, 0x100000
-; GFX10-NEXT: s_lshl_b32 s4, s4, 16
+; GFX10-NEXT: s_lshl_b32 s4, s9, 16
+; GFX10-NEXT: s_and_b32 s3, s3, s8
; GFX10-NEXT: v_subrev_nc_u32_e32 v2, 24, v0
+; GFX10-NEXT: v_sub_nc_u32_e32 v1, s5, v1
; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v0
-; GFX10-NEXT: s_and_b32 s3, s3, s9
+; GFX10-NEXT: s_lshl_b32 s3, s3, s10
; GFX10-NEXT: s_or_b32 s2, s2, s4
-; GFX10-NEXT: v_sub_nc_u32_e32 v1, s5, v1
-; GFX10-NEXT: s_mov_b32 s4, 0xffffff
+; GFX10-NEXT: s_and_b32 s4, s13, s8
+; GFX10-NEXT: v_subrev_nc_u32_e32 v3, 24, v1
; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo
-; GFX10-NEXT: s_lshl_b32 s3, s3, s10
-; GFX10-NEXT: s_and_b32 s5, s12, s9
-; GFX10-NEXT: v_subrev_nc_u32_e32 v2, 24, v1
; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v1
-; GFX10-NEXT: v_subrev_nc_u32_e32 v3, 24, v0
-; GFX10-NEXT: s_or_b32 s3, s13, s3
-; GFX10-NEXT: s_bfe_u32 s5, s5, 0x100000
+; GFX10-NEXT: s_or_b32 s3, s12, s3
+; GFX10-NEXT: s_bfe_u32 s4, s4, 0x100000
; GFX10-NEXT: s_bfe_u32 s3, s3, 0x100000
-; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo
+; GFX10-NEXT: v_subrev_nc_u32_e32 v2, 24, v0
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo
; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v0
-; GFX10-NEXT: s_lshl_b32 s5, s5, 16
+; GFX10-NEXT: s_lshl_b32 s4, s4, 16
; GFX10-NEXT: s_bfe_u32 s0, s0, 0x100000
-; GFX10-NEXT: s_bfe_u32 s6, s6, 0x100000
-; GFX10-NEXT: v_subrev_nc_u32_e32 v2, 24, v1
-; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo
+; GFX10-NEXT: s_or_b32 s3, s3, s4
+; GFX10-NEXT: v_subrev_nc_u32_e32 v3, 24, v1
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo
; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v1
-; GFX10-NEXT: s_or_b32 s3, s3, s5
+; GFX10-NEXT: v_mov_b32_e32 v2, 0xffffff
+; GFX10-NEXT: s_bfe_u32 s6, s6, 0x100000
; GFX10-NEXT: s_bfe_u32 s1, s1, 0x100000
; GFX10-NEXT: s_bfe_u32 s7, s7, 0x100000
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo
; GFX10-NEXT: v_sub_nc_u32_e32 v3, 23, v0
-; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo
-; GFX10-NEXT: v_mov_b32_e32 v2, 0xffffff
-; GFX10-NEXT: v_and_b32_e32 v0, s4, v0
-; GFX10-NEXT: s_lshl_b32 s5, s6, 17
-; GFX10-NEXT: v_and_b32_e32 v3, s4, v3
+; GFX10-NEXT: v_and_b32_e32 v0, v0, v2
+; GFX10-NEXT: s_lshl_b32 s4, s6, 17
+; GFX10-NEXT: s_lshl_b32 s0, s0, 1
; GFX10-NEXT: v_sub_nc_u32_e32 v4, 23, v1
; GFX10-NEXT: v_and_b32_e32 v1, v1, v2
+; GFX10-NEXT: v_and_b32_e32 v3, v3, v2
; GFX10-NEXT: v_lshrrev_b32_e64 v0, v0, s2
-; GFX10-NEXT: s_lshl_b32 s0, s0, 1
; GFX10-NEXT: s_lshl_b32 s2, s7, 17
; GFX10-NEXT: v_and_b32_e32 v2, v4, v2
; GFX10-NEXT: v_lshrrev_b32_e64 v1, v1, s3
-; GFX10-NEXT: s_or_b32 s0, s5, s0
; GFX10-NEXT: s_lshl_b32 s1, s1, 1
+; GFX10-NEXT: s_or_b32 s0, s4, s0
+; GFX10-NEXT: s_or_b32 s1, s2, s1
; GFX10-NEXT: v_lshl_or_b32 v0, s0, v3, v0
-; GFX10-NEXT: s_or_b32 s0, s2, s1
-; GFX10-NEXT: v_lshl_or_b32 v1, s0, v2, v1
+; GFX10-NEXT: v_lshl_or_b32 v1, s1, v2, v1
+; GFX10-NEXT: v_mov_b32_e32 v2, 0xff
; GFX10-NEXT: s_mov_b32 s0, 8
-; GFX10-NEXT: v_lshlrev_b32_sdwa v2, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX10-NEXT: v_lshlrev_b32_sdwa v3, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
; GFX10-NEXT: s_mov_b32 s0, 16
-; GFX10-NEXT: v_and_b32_e32 v3, s9, v1
+; GFX10-NEXT: v_and_b32_e32 v4, v1, v2
+; GFX10-NEXT: v_and_or_b32 v2, v0, v2, v3
+; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX10-NEXT: v_lshlrev_b32_e32 v3, 24, v4
; GFX10-NEXT: v_bfe_u32 v4, v1, 8, 8
; GFX10-NEXT: v_bfe_u32 v1, v1, 16, 8
-; GFX10-NEXT: v_and_or_b32 v2, v0, s9, v2
-; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX10-NEXT: v_lshlrev_b32_e32 v3, 24, v3
-; GFX10-NEXT: v_lshl_or_b32 v1, v1, 8, v4
; GFX10-NEXT: v_or3_b32 v0, v2, v0, v3
-; GFX10-NEXT: v_readfirstlane_b32 s1, v1
+; GFX10-NEXT: v_lshl_or_b32 v1, v1, 8, v4
; GFX10-NEXT: v_readfirstlane_b32 s0, v0
+; GFX10-NEXT: v_readfirstlane_b32 s1, v1
; GFX10-NEXT: ; return to shader part epilog
%lhs = bitcast i48 %lhs.arg to <2 x i24>
%rhs = bitcast i48 %rhs.arg to <2 x i24>
@@ -1986,42 +1984,40 @@
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v6, 24
; GFX6-NEXT: v_rcp_iflag_f32_e32 v6, v6
-; GFX6-NEXT: v_mov_b32_e32 v7, 0xffffffe8
-; GFX6-NEXT: v_and_b32_e32 v4, 0xffffff, v4
-; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v9, 24
+; GFX6-NEXT: v_mov_b32_e32 v8, 0xffffffe8
+; GFX6-NEXT: v_mov_b32_e32 v10, 0xffffff
+; GFX6-NEXT: v_and_b32_e32 v4, v4, v10
+; GFX6-NEXT: v_mul_f32_e32 v7, 0x4f7ffffe, v6
+; GFX6-NEXT: v_cvt_u32_f32_e32 v7, v7
; GFX6-NEXT: v_mul_f32_e32 v6, 0x4f7ffffe, v6
; GFX6-NEXT: v_cvt_u32_f32_e32 v6, v6
+; GFX6-NEXT: v_and_b32_e32 v5, v5, v10
+; GFX6-NEXT: v_mul_lo_u32 v9, v8, v7
; GFX6-NEXT: v_lshlrev_b32_e32 v0, 1, v0
+; GFX6-NEXT: v_and_b32_e32 v2, v2, v10
; GFX6-NEXT: v_lshlrev_b32_e32 v1, 1, v1
-; GFX6-NEXT: v_mul_lo_u32 v8, v7, v6
-; GFX6-NEXT: v_mul_hi_u32 v8, v6, v8
-; GFX6-NEXT: v_add_i32_e32 v6, vcc, v6, v8
-; GFX6-NEXT: v_mul_hi_u32 v6, v4, v6
-; GFX6-NEXT: v_rcp_iflag_f32_e32 v8, v9
-; GFX6-NEXT: v_mov_b32_e32 v9, 0xffffff
-; GFX6-NEXT: v_and_b32_e32 v5, v5, v9
-; GFX6-NEXT: v_mul_lo_u32 v6, v6, 24
-; GFX6-NEXT: v_mul_f32_e32 v8, 0x4f7ffffe, v8
-; GFX6-NEXT: v_cvt_u32_f32_e32 v8, v8
-; GFX6-NEXT: v_and_b32_e32 v2, v2, v9
-; GFX6-NEXT: v_sub_i32_e32 v4, vcc, v4, v6
-; GFX6-NEXT: v_subrev_i32_e32 v6, vcc, 24, v4
+; GFX6-NEXT: v_mul_hi_u32 v9, v7, v9
+; GFX6-NEXT: v_and_b32_e32 v3, v3, v10
+; GFX6-NEXT: v_add_i32_e32 v7, vcc, v7, v9
+; GFX6-NEXT: v_mul_hi_u32 v7, v4, v7
+; GFX6-NEXT: v_mul_lo_u32 v7, v7, 24
+; GFX6-NEXT: v_sub_i32_e32 v4, vcc, v4, v7
+; GFX6-NEXT: v_subrev_i32_e32 v7, vcc, 24, v4
; GFX6-NEXT: v_cmp_le_u32_e32 vcc, 24, v4
-; GFX6-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc
-; GFX6-NEXT: v_subrev_i32_e32 v6, vcc, 24, v4
+; GFX6-NEXT: v_cndmask_b32_e32 v4, v4, v7, vcc
+; GFX6-NEXT: v_subrev_i32_e32 v7, vcc, 24, v4
; GFX6-NEXT: v_cmp_le_u32_e32 vcc, 24, v4
-; GFX6-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc
-; GFX6-NEXT: v_mul_lo_u32 v6, v7, v8
-; GFX6-NEXT: v_sub_i32_e32 v7, vcc, 23, v4
-; GFX6-NEXT: v_and_b32_e32 v7, v7, v9
-; GFX6-NEXT: v_mul_hi_u32 v6, v8, v6
-; GFX6-NEXT: v_and_b32_e32 v4, v4, v9
-; GFX6-NEXT: v_lshlrev_b32_e32 v0, v7, v0
+; GFX6-NEXT: v_cndmask_b32_e32 v4, v4, v7, vcc
+; GFX6-NEXT: v_mul_lo_u32 v7, v8, v6
+; GFX6-NEXT: v_sub_i32_e32 v8, vcc, 23, v4
+; GFX6-NEXT: v_and_b32_e32 v8, v8, v10
+; GFX6-NEXT: v_mul_hi_u32 v7, v6, v7
+; GFX6-NEXT: v_and_b32_e32 v4, v4, v10
+; GFX6-NEXT: v_lshlrev_b32_e32 v0, v8, v0
; GFX6-NEXT: v_lshrrev_b32_e32 v2, v4, v2
-; GFX6-NEXT: v_add_i32_e32 v6, vcc, v8, v6
+; GFX6-NEXT: v_add_i32_e32 v6, vcc, v6, v7
; GFX6-NEXT: v_mul_hi_u32 v6, v5, v6
; GFX6-NEXT: v_or_b32_e32 v0, v0, v2
-; GFX6-NEXT: v_and_b32_e32 v3, v3, v9
; GFX6-NEXT: v_mul_lo_u32 v6, v6, 24
; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v5, v6
; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, 24, v2
@@ -2031,8 +2027,8 @@
; GFX6-NEXT: v_cmp_le_u32_e32 vcc, 24, v2
; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
; GFX6-NEXT: v_sub_i32_e32 v4, vcc, 23, v2
-; GFX6-NEXT: v_and_b32_e32 v4, v4, v9
-; GFX6-NEXT: v_and_b32_e32 v2, v2, v9
+; GFX6-NEXT: v_and_b32_e32 v4, v4, v10
+; GFX6-NEXT: v_and_b32_e32 v2, v2, v10
; GFX6-NEXT: v_lshlrev_b32_e32 v1, v4, v1
; GFX6-NEXT: v_lshrrev_b32_e32 v2, v2, v3
; GFX6-NEXT: v_or_b32_e32 v1, v1, v2
@@ -2043,42 +2039,40 @@
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v6, 24
; GFX8-NEXT: v_rcp_iflag_f32_e32 v6, v6
-; GFX8-NEXT: v_mov_b32_e32 v7, 0xffffffe8
-; GFX8-NEXT: v_and_b32_e32 v4, 0xffffff, v4
-; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v9, 24
+; GFX8-NEXT: v_mov_b32_e32 v8, 0xffffffe8
+; GFX8-NEXT: v_mov_b32_e32 v10, 0xffffff
+; GFX8-NEXT: v_and_b32_e32 v4, v4, v10
+; GFX8-NEXT: v_mul_f32_e32 v7, 0x4f7ffffe, v6
+; GFX8-NEXT: v_cvt_u32_f32_e32 v7, v7
; GFX8-NEXT: v_mul_f32_e32 v6, 0x4f7ffffe, v6
; GFX8-NEXT: v_cvt_u32_f32_e32 v6, v6
+; GFX8-NEXT: v_and_b32_e32 v5, v5, v10
+; GFX8-NEXT: v_mul_lo_u32 v9, v8, v7
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 1, v0
+; GFX8-NEXT: v_and_b32_e32 v2, v2, v10
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 1, v1
-; GFX8-NEXT: v_mul_lo_u32 v8, v7, v6
-; GFX8-NEXT: v_mul_hi_u32 v8, v6, v8
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v8
-; GFX8-NEXT: v_mul_hi_u32 v6, v4, v6
-; GFX8-NEXT: v_rcp_iflag_f32_e32 v8, v9
-; GFX8-NEXT: v_mov_b32_e32 v9, 0xffffff
-; GFX8-NEXT: v_and_b32_e32 v5, v5, v9
-; GFX8-NEXT: v_mul_lo_u32 v6, v6, 24
-; GFX8-NEXT: v_mul_f32_e32 v8, 0x4f7ffffe, v8
-; GFX8-NEXT: v_cvt_u32_f32_e32 v8, v8
-; GFX8-NEXT: v_and_b32_e32 v2, v2, v9
-; GFX8-NEXT: v_sub_u32_e32 v4, vcc, v4, v6
-; GFX8-NEXT: v_subrev_u32_e32 v6, vcc, 24, v4
+; GFX8-NEXT: v_mul_hi_u32 v9, v7, v9
+; GFX8-NEXT: v_and_b32_e32 v3, v3, v10
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v9
+; GFX8-NEXT: v_mul_hi_u32 v7, v4, v7
+; GFX8-NEXT: v_mul_lo_u32 v7, v7, 24
+; GFX8-NEXT: v_sub_u32_e32 v4, vcc, v4, v7
+; GFX8-NEXT: v_subrev_u32_e32 v7, vcc, 24, v4
; GFX8-NEXT: v_cmp_le_u32_e32 vcc, 24, v4
-; GFX8-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc
-; GFX8-NEXT: v_subrev_u32_e32 v6, vcc, 24, v4
+; GFX8-NEXT: v_cndmask_b32_e32 v4, v4, v7, vcc
+; GFX8-NEXT: v_subrev_u32_e32 v7, vcc, 24, v4
; GFX8-NEXT: v_cmp_le_u32_e32 vcc, 24, v4
-; GFX8-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc
-; GFX8-NEXT: v_mul_lo_u32 v6, v7, v8
-; GFX8-NEXT: v_sub_u32_e32 v7, vcc, 23, v4
-; GFX8-NEXT: v_and_b32_e32 v7, v7, v9
-; GFX8-NEXT: v_mul_hi_u32 v6, v8, v6
-; GFX8-NEXT: v_and_b32_e32 v4, v4, v9
-; GFX8-NEXT: v_lshlrev_b32_e32 v0, v7, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v4, v4, v7, vcc
+; GFX8-NEXT: v_mul_lo_u32 v7, v8, v6
+; GFX8-NEXT: v_sub_u32_e32 v8, vcc, 23, v4
+; GFX8-NEXT: v_and_b32_e32 v8, v8, v10
+; GFX8-NEXT: v_mul_hi_u32 v7, v6, v7
+; GFX8-NEXT: v_and_b32_e32 v4, v4, v10
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, v8, v0
; GFX8-NEXT: v_lshrrev_b32_e32 v2, v4, v2
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, v8, v6
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v7
; GFX8-NEXT: v_mul_hi_u32 v6, v5, v6
; GFX8-NEXT: v_or_b32_e32 v0, v0, v2
-; GFX8-NEXT: v_and_b32_e32 v3, v3, v9
; GFX8-NEXT: v_mul_lo_u32 v6, v6, 24
; GFX8-NEXT: v_sub_u32_e32 v2, vcc, v5, v6
; GFX8-NEXT: v_subrev_u32_e32 v4, vcc, 24, v2
@@ -2088,8 +2082,8 @@
; GFX8-NEXT: v_cmp_le_u32_e32 vcc, 24, v2
; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
; GFX8-NEXT: v_sub_u32_e32 v4, vcc, 23, v2
-; GFX8-NEXT: v_and_b32_e32 v4, v4, v9
-; GFX8-NEXT: v_and_b32_e32 v2, v2, v9
+; GFX8-NEXT: v_and_b32_e32 v4, v4, v10
+; GFX8-NEXT: v_and_b32_e32 v2, v2, v10
; GFX8-NEXT: v_lshlrev_b32_e32 v1, v4, v1
; GFX8-NEXT: v_lshrrev_b32_e32 v2, v2, v3
; GFX8-NEXT: v_or_b32_e32 v1, v1, v2
@@ -2100,43 +2094,41 @@
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v6, 24
; GFX9-NEXT: v_rcp_iflag_f32_e32 v6, v6
-; GFX9-NEXT: v_mov_b32_e32 v7, 0xffffffe8
-; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v9, 24
-; GFX9-NEXT: v_rcp_iflag_f32_e32 v9, v9
+; GFX9-NEXT: v_mov_b32_e32 v8, 0xffffffe8
+; GFX9-NEXT: v_mov_b32_e32 v10, 0xffffff
+; GFX9-NEXT: v_and_b32_e32 v4, v4, v10
+; GFX9-NEXT: v_mul_f32_e32 v7, 0x4f7ffffe, v6
+; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v7
; GFX9-NEXT: v_mul_f32_e32 v6, 0x4f7ffffe, v6
; GFX9-NEXT: v_cvt_u32_f32_e32 v6, v6
-; GFX9-NEXT: v_and_b32_e32 v4, 0xffffff, v4
-; GFX9-NEXT: v_mul_f32_e32 v9, 0x4f7ffffe, v9
+; GFX9-NEXT: v_and_b32_e32 v5, v5, v10
+; GFX9-NEXT: v_mul_lo_u32 v9, v8, v7
+; GFX9-NEXT: v_and_b32_e32 v2, v2, v10
+; GFX9-NEXT: v_mul_lo_u32 v8, v8, v6
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0
-; GFX9-NEXT: v_mul_lo_u32 v8, v7, v6
-; GFX9-NEXT: v_lshlrev_b32_e32 v1, 1, v1
+; GFX9-NEXT: v_mul_hi_u32 v9, v7, v9
+; GFX9-NEXT: v_and_b32_e32 v3, v3, v10
; GFX9-NEXT: v_mul_hi_u32 v8, v6, v8
+; GFX9-NEXT: v_lshlrev_b32_e32 v1, 1, v1
+; GFX9-NEXT: v_add_u32_e32 v7, v7, v9
+; GFX9-NEXT: v_mul_hi_u32 v7, v4, v7
; GFX9-NEXT: v_add_u32_e32 v6, v6, v8
-; GFX9-NEXT: v_cvt_u32_f32_e32 v8, v9
-; GFX9-NEXT: v_mul_hi_u32 v6, v4, v6
-; GFX9-NEXT: v_mov_b32_e32 v9, 0xffffff
-; GFX9-NEXT: v_and_b32_e32 v5, v5, v9
-; GFX9-NEXT: v_mul_lo_u32 v7, v7, v8
+; GFX9-NEXT: v_mul_hi_u32 v6, v5, v6
+; GFX9-NEXT: v_mul_lo_u32 v7, v7, 24
; GFX9-NEXT: v_mul_lo_u32 v6, v6, 24
-; GFX9-NEXT: v_and_b32_e32 v2, v2, v9
-; GFX9-NEXT: v_and_b32_e32 v3, v3, v9
-; GFX9-NEXT: v_mul_hi_u32 v7, v8, v7
-; GFX9-NEXT: v_sub_u32_e32 v4, v4, v6
-; GFX9-NEXT: v_subrev_u32_e32 v6, 24, v4
+; GFX9-NEXT: v_sub_u32_e32 v4, v4, v7
+; GFX9-NEXT: v_subrev_u32_e32 v7, 24, v4
; GFX9-NEXT: v_cmp_le_u32_e32 vcc, 24, v4
-; GFX9-NEXT: v_add_u32_e32 v7, v8, v7
-; GFX9-NEXT: v_mul_hi_u32 v7, v5, v7
-; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc
-; GFX9-NEXT: v_subrev_u32_e32 v6, 24, v4
+; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v7, vcc
+; GFX9-NEXT: v_subrev_u32_e32 v7, 24, v4
; GFX9-NEXT: v_cmp_le_u32_e32 vcc, 24, v4
-; GFX9-NEXT: v_mul_lo_u32 v7, v7, 24
-; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc
-; GFX9-NEXT: v_sub_u32_e32 v6, 23, v4
-; GFX9-NEXT: v_and_b32_e32 v4, v4, v9
-; GFX9-NEXT: v_and_b32_e32 v6, v6, v9
+; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v7, vcc
+; GFX9-NEXT: v_sub_u32_e32 v7, 23, v4
+; GFX9-NEXT: v_and_b32_e32 v4, v4, v10
+; GFX9-NEXT: v_and_b32_e32 v7, v7, v10
; GFX9-NEXT: v_lshrrev_b32_e32 v2, v4, v2
-; GFX9-NEXT: v_lshl_or_b32 v0, v0, v6, v2
-; GFX9-NEXT: v_sub_u32_e32 v2, v5, v7
+; GFX9-NEXT: v_lshl_or_b32 v0, v0, v7, v2
+; GFX9-NEXT: v_sub_u32_e32 v2, v5, v6
; GFX9-NEXT: v_subrev_u32_e32 v4, 24, v2
; GFX9-NEXT: v_cmp_le_u32_e32 vcc, 24, v2
; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
@@ -2144,8 +2136,8 @@
; GFX9-NEXT: v_cmp_le_u32_e32 vcc, 24, v2
; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
; GFX9-NEXT: v_sub_u32_e32 v4, 23, v2
-; GFX9-NEXT: v_and_b32_e32 v2, v2, v9
-; GFX9-NEXT: v_and_b32_e32 v4, v4, v9
+; GFX9-NEXT: v_and_b32_e32 v2, v2, v10
+; GFX9-NEXT: v_and_b32_e32 v4, v4, v10
; GFX9-NEXT: v_lshrrev_b32_e32 v2, v2, v3
; GFX9-NEXT: v_lshl_or_b32 v1, v1, v4, v2
; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -2155,32 +2147,31 @@
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v6, 24
-; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v7, 24
+; GFX10-NEXT: v_mov_b32_e32 v8, 0xffffffe8
; GFX10-NEXT: v_mov_b32_e32 v10, 0xffffff
-; GFX10-NEXT: v_and_b32_e32 v4, 0xffffff, v4
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0
+; GFX10-NEXT: v_lshlrev_b32_e32 v1, 1, v1
; GFX10-NEXT: v_rcp_iflag_f32_e32 v6, v6
-; GFX10-NEXT: v_rcp_iflag_f32_e32 v7, v7
+; GFX10-NEXT: v_and_b32_e32 v4, v4, v10
; GFX10-NEXT: v_and_b32_e32 v5, v5, v10
; GFX10-NEXT: v_and_b32_e32 v2, v2, v10
; GFX10-NEXT: v_and_b32_e32 v3, v3, v10
-; GFX10-NEXT: v_lshlrev_b32_e32 v1, 1, v1
+; GFX10-NEXT: v_mul_f32_e32 v7, 0x4f7ffffe, v6
; GFX10-NEXT: v_mul_f32_e32 v6, 0x4f7ffffe, v6
-; GFX10-NEXT: v_mul_f32_e32 v7, 0x4f7ffffe, v7
-; GFX10-NEXT: v_cvt_u32_f32_e32 v6, v6
; GFX10-NEXT: v_cvt_u32_f32_e32 v7, v7
-; GFX10-NEXT: v_mul_lo_u32 v8, 0xffffffe8, v6
-; GFX10-NEXT: v_mul_lo_u32 v9, 0xffffffe8, v7
-; GFX10-NEXT: v_mul_hi_u32 v8, v6, v8
+; GFX10-NEXT: v_cvt_u32_f32_e32 v6, v6
+; GFX10-NEXT: v_mul_lo_u32 v9, v8, v7
+; GFX10-NEXT: v_mul_lo_u32 v8, v8, v6
; GFX10-NEXT: v_mul_hi_u32 v9, v7, v9
-; GFX10-NEXT: v_add_nc_u32_e32 v6, v6, v8
+; GFX10-NEXT: v_mul_hi_u32 v8, v6, v8
; GFX10-NEXT: v_add_nc_u32_e32 v7, v7, v9
-; GFX10-NEXT: v_mul_hi_u32 v6, v4, v6
-; GFX10-NEXT: v_mul_hi_u32 v7, v5, v7
-; GFX10-NEXT: v_mul_lo_u32 v6, v6, 24
+; GFX10-NEXT: v_add_nc_u32_e32 v6, v6, v8
+; GFX10-NEXT: v_mul_hi_u32 v7, v4, v7
+; GFX10-NEXT: v_mul_hi_u32 v6, v5, v6
; GFX10-NEXT: v_mul_lo_u32 v7, v7, 24
-; GFX10-NEXT: v_sub_nc_u32_e32 v4, v4, v6
-; GFX10-NEXT: v_sub_nc_u32_e32 v5, v5, v7
+; GFX10-NEXT: v_mul_lo_u32 v6, v6, 24
+; GFX10-NEXT: v_sub_nc_u32_e32 v4, v4, v7
+; GFX10-NEXT: v_sub_nc_u32_e32 v5, v5, v6
; GFX10-NEXT: v_subrev_nc_u32_e32 v6, 24, v4
; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v4
; GFX10-NEXT: v_subrev_nc_u32_e32 v7, 24, v5
@@ -4008,16 +3999,16 @@
; GFX8-NEXT: v_or_b32_e32 v0, v0, v9
; GFX8-NEXT: v_lshlrev_b16_e32 v9, 1, v2
; GFX8-NEXT: v_xor_b32_e32 v4, -1, v4
-; GFX8-NEXT: v_lshlrev_b16_sdwa v2, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v4
-; GFX8-NEXT: v_xor_b32_e32 v10, -1, v4
+; GFX8-NEXT: v_lshrrev_b32_e32 v10, 16, v4
+; GFX8-NEXT: v_xor_b32_e32 v11, -1, v4
; GFX8-NEXT: v_lshlrev_b16_e32 v4, v4, v6
; GFX8-NEXT: v_lshrrev_b16_e32 v6, 1, v9
-; GFX8-NEXT: v_lshrrev_b16_e32 v6, v10, v6
+; GFX8-NEXT: v_lshlrev_b16_sdwa v2, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT: v_lshrrev_b16_e32 v6, v11, v6
; GFX8-NEXT: v_or_b32_e32 v4, v4, v6
-; GFX8-NEXT: v_xor_b32_e32 v6, -1, v7
+; GFX8-NEXT: v_xor_b32_e32 v6, -1, v10
; GFX8-NEXT: v_lshrrev_b16_e32 v2, 1, v2
-; GFX8-NEXT: v_lshlrev_b16_e32 v0, v7, v0
+; GFX8-NEXT: v_lshlrev_b16_e32 v0, v10, v0
; GFX8-NEXT: v_lshrrev_b16_e32 v2, v6, v2
; GFX8-NEXT: v_or_b32_e32 v0, v0, v2
; GFX8-NEXT: v_mov_b32_e32 v2, 16
@@ -4026,22 +4017,21 @@
; GFX8-NEXT: v_lshlrev_b16_e32 v4, 1, v1
; GFX8-NEXT: v_lshrrev_b16_e32 v6, 15, v3
; GFX8-NEXT: v_or_b32_e32 v4, v4, v6
-; GFX8-NEXT: v_mov_b32_e32 v6, 1
-; GFX8-NEXT: v_lshlrev_b16_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-NEXT: v_lshrrev_b16_sdwa v7, v8, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-NEXT: v_or_b32_e32 v1, v1, v7
-; GFX8-NEXT: v_lshlrev_b16_e32 v7, 1, v3
+; GFX8-NEXT: v_lshlrev_b16_sdwa v1, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT: v_lshrrev_b16_sdwa v6, v8, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT: v_or_b32_e32 v1, v1, v6
+; GFX8-NEXT: v_lshlrev_b16_e32 v6, 1, v3
; GFX8-NEXT: v_xor_b32_e32 v5, -1, v5
-; GFX8-NEXT: v_lshlrev_b16_sdwa v3, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v5
+; GFX8-NEXT: v_lshlrev_b16_sdwa v3, v7, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v5
; GFX8-NEXT: v_xor_b32_e32 v8, -1, v5
; GFX8-NEXT: v_lshlrev_b16_e32 v4, v5, v4
-; GFX8-NEXT: v_lshrrev_b16_e32 v5, 1, v7
+; GFX8-NEXT: v_lshrrev_b16_e32 v5, 1, v6
; GFX8-NEXT: v_lshrrev_b16_e32 v5, v8, v5
; GFX8-NEXT: v_or_b32_e32 v4, v4, v5
-; GFX8-NEXT: v_xor_b32_e32 v5, -1, v6
+; GFX8-NEXT: v_xor_b32_e32 v5, -1, v7
; GFX8-NEXT: v_lshrrev_b16_e32 v3, 1, v3
-; GFX8-NEXT: v_lshlrev_b16_e32 v1, v6, v1
+; GFX8-NEXT: v_lshlrev_b16_e32 v1, v7, v1
; GFX8-NEXT: v_lshrrev_b16_e32 v3, v5, v3
; GFX8-NEXT: v_or_b32_e32 v1, v1, v3
; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll
@@ -314,14 +314,14 @@
; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX9-NEXT: v_and_b32_e32 v1, 1, v1
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 4, v1
-; GFX9-NEXT: s_mov_b32 s1, 0xffff
-; GFX9-NEXT: v_lshlrev_b32_sdwa v2, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; GFX9-NEXT: v_lshlrev_b32_e64 v0, v1, s1
-; GFX9-NEXT: v_xor_b32_e32 v3, -1, v0
+; GFX9-NEXT: v_mov_b32_e32 v2, 0xffff
+; GFX9-NEXT: v_lshlrev_b32_sdwa v3, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX9-NEXT: v_lshlrev_b32_e32 v0, v1, v2
+; GFX9-NEXT: v_xor_b32_e32 v2, -1, v0
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_and_or_b32 v2, s0, v3, v2
+; GFX9-NEXT: v_and_or_b32 v2, s0, v2, v3
; GFX9-NEXT: global_store_dword v[0:1], v2, off
; GFX9-NEXT: s_endpgm
;
@@ -330,15 +330,15 @@
; GFX8-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX8-NEXT: v_and_b32_e32 v1, 1, v1
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 4, v1
-; GFX8-NEXT: s_mov_b32 s1, 0xffff
-; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; GFX8-NEXT: v_lshlrev_b32_e64 v0, v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v2, 0xffff
+; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, v1, v2
; GFX8-NEXT: v_xor_b32_e32 v0, -1, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_and_b32_e32 v3, s0, v0
+; GFX8-NEXT: v_and_b32_e32 v2, s0, v0
; GFX8-NEXT: v_mov_b32_e32 v0, 0
; GFX8-NEXT: v_mov_b32_e32 v1, 0
-; GFX8-NEXT: v_or_b32_e32 v2, v3, v2
+; GFX8-NEXT: v_or_b32_e32 v2, v2, v3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -346,27 +346,27 @@
; GFX7: ; %bb.0:
; GFX7-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX7-NEXT: v_and_b32_e32 v1, 1, v1
-; GFX7-NEXT: s_mov_b32 s1, 0xffff
+; GFX7-NEXT: v_mov_b32_e32 v2, 0xffff
; GFX7-NEXT: v_lshlrev_b32_e32 v1, 4, v1
-; GFX7-NEXT: v_and_b32_e32 v0, s1, v0
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, v1, v0
-; GFX7-NEXT: v_lshl_b32_e32 v0, s1, v1
+; GFX7-NEXT: v_and_b32_e32 v0, v0, v2
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, v1, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, v1, v2
; GFX7-NEXT: v_xor_b32_e32 v0, -1, v0
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_and_b32_e32 v3, s0, v0
+; GFX7-NEXT: v_and_b32_e32 v2, s0, v0
; GFX7-NEXT: v_mov_b32_e32 v0, 0
; GFX7-NEXT: v_mov_b32_e32 v1, 0
-; GFX7-NEXT: v_or_b32_e32 v2, v3, v2
+; GFX7-NEXT: v_or_b32_e32 v2, v2, v3
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_endpgm
;
; GFX10-LABEL: insertelement_s_v2i16_v_v:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX10-NEXT: v_and_b32_e32 v1, 1, v1
-; GFX10-NEXT: s_mov_b32 s1, 0xffff
+; GFX10-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX10-NEXT: v_mov_b32_e32 v2, 0xffff
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 4, v1
-; GFX10-NEXT: v_lshlrev_b32_e64 v2, v1, s1
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, v1, v2
; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: v_mov_b32_e32 v1, 0
@@ -537,9 +537,9 @@
; GFX9-NEXT: global_load_dword v4, v[0:1], off
; GFX9-NEXT: v_and_b32_e32 v0, 1, v3
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 4, v0
-; GFX9-NEXT: s_mov_b32 s0, 0xffff
+; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff
; GFX9-NEXT: v_lshlrev_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; GFX9-NEXT: v_lshlrev_b32_e64 v0, v0, s0
+; GFX9-NEXT: v_lshlrev_b32_e32 v0, v0, v1
; GFX9-NEXT: v_xor_b32_e32 v3, -1, v0
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0
@@ -551,12 +551,12 @@
; GFX8-LABEL: insertelement_v_v2i16_v_v:
; GFX8: ; %bb.0:
; GFX8-NEXT: flat_load_dword v0, v[0:1]
-; GFX8-NEXT: v_and_b32_e32 v1, 1, v3
-; GFX8-NEXT: s_mov_b32 s0, 0xffff
-; GFX8-NEXT: v_lshlrev_b32_e32 v1, 4, v1
-; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; GFX8-NEXT: v_lshlrev_b32_e64 v1, v1, s0
+; GFX8-NEXT: v_and_b32_e32 v3, 1, v3
+; GFX8-NEXT: v_mov_b32_e32 v1, 0xffff
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 4, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, v3, v1
; GFX8-NEXT: v_xor_b32_e32 v1, -1, v1
+; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_and_b32_e32 v3, v0, v1
; GFX8-NEXT: v_mov_b32_e32 v0, 0
@@ -568,13 +568,13 @@
; GFX7-LABEL: insertelement_v_v2i16_v_v:
; GFX7: ; %bb.0:
; GFX7-NEXT: flat_load_dword v0, v[0:1]
-; GFX7-NEXT: s_mov_b32 s0, 0xffff
-; GFX7-NEXT: v_and_b32_e32 v1, 1, v3
-; GFX7-NEXT: v_and_b32_e32 v2, s0, v2
-; GFX7-NEXT: v_lshlrev_b32_e32 v1, 4, v1
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, v1, v2
-; GFX7-NEXT: v_lshl_b32_e32 v1, s0, v1
+; GFX7-NEXT: v_and_b32_e32 v3, 1, v3
+; GFX7-NEXT: v_mov_b32_e32 v1, 0xffff
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 4, v3
+; GFX7-NEXT: v_and_b32_e32 v2, v2, v1
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, v3, v1
; GFX7-NEXT: v_xor_b32_e32 v1, -1, v1
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, v3, v2
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_and_b32_e32 v3, v0, v1
; GFX7-NEXT: v_mov_b32_e32 v0, 0
@@ -587,9 +587,9 @@
; GFX10: ; %bb.0:
; GFX10-NEXT: global_load_dword v4, v[0:1], off
; GFX10-NEXT: v_and_b32_e32 v0, 1, v3
-; GFX10-NEXT: s_mov_b32 s0, 0xffff
+; GFX10-NEXT: v_mov_b32_e32 v1, 0xffff
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 4, v0
-; GFX10-NEXT: v_lshlrev_b32_e64 v1, v0, s0
+; GFX10-NEXT: v_lshlrev_b32_e32 v1, v0, v1
; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; GFX10-NEXT: v_xor_b32_e32 v3, -1, v1
; GFX10-NEXT: v_mov_b32_e32 v0, 0
@@ -991,16 +991,16 @@
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; GFX9-NEXT: v_lshrrev_b32_e32 v2, 1, v1
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2
; GFX9-NEXT: v_and_b32_e32 v1, 1, v1
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 4, v1
-; GFX9-NEXT: s_mov_b32 s2, 0xffff
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v3, s0
; GFX9-NEXT: v_mov_b32_e32 v4, s1
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2
-; GFX9-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; GFX9-NEXT: v_lshlrev_b32_e64 v1, v1, s2
; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc
+; GFX9-NEXT: v_mov_b32_e32 v4, 0xffff
+; GFX9-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX9-NEXT: v_lshlrev_b32_e32 v1, v1, v4
; GFX9-NEXT: v_xor_b32_e32 v1, -1, v1
; GFX9-NEXT: v_and_or_b32 v4, v3, v1, v0
; GFX9-NEXT: v_mov_b32_e32 v0, s0
@@ -1017,16 +1017,16 @@
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; GFX8-NEXT: v_lshrrev_b32_e32 v2, 1, v1
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2
; GFX8-NEXT: v_and_b32_e32 v1, 1, v1
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 4, v1
-; GFX8-NEXT: s_mov_b32 s2, 0xffff
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v3, s0
; GFX8-NEXT: v_mov_b32_e32 v4, s1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2
-; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; GFX8-NEXT: v_lshlrev_b32_e64 v1, v1, s2
; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc
+; GFX8-NEXT: v_mov_b32_e32 v4, 0xffff
+; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, v1, v4
; GFX8-NEXT: v_xor_b32_e32 v1, -1, v1
; GFX8-NEXT: v_and_b32_e32 v1, v3, v1
; GFX8-NEXT: v_or_b32_e32 v4, v1, v0
@@ -1044,17 +1044,17 @@
; GFX7: ; %bb.0:
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; GFX7-NEXT: v_lshrrev_b32_e32 v2, 1, v1
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2
; GFX7-NEXT: v_and_b32_e32 v1, 1, v1
-; GFX7-NEXT: s_mov_b32 s2, 0xffff
; GFX7-NEXT: v_lshlrev_b32_e32 v1, 4, v1
-; GFX7-NEXT: v_and_b32_e32 v0, s2, v0
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v3, s0
; GFX7-NEXT: v_mov_b32_e32 v4, s1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, v1, v0
-; GFX7-NEXT: v_lshl_b32_e32 v1, s2, v1
; GFX7-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc
+; GFX7-NEXT: v_mov_b32_e32 v4, 0xffff
+; GFX7-NEXT: v_and_b32_e32 v0, v0, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, v1, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, v1, v4
; GFX7-NEXT: v_xor_b32_e32 v1, -1, v1
; GFX7-NEXT: v_and_b32_e32 v1, v3, v1
; GFX7-NEXT: v_or_b32_e32 v4, v1, v0
@@ -1072,11 +1072,11 @@
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; GFX10-NEXT: v_and_b32_e32 v2, 1, v1
+; GFX10-NEXT: v_mov_b32_e32 v3, 0xffff
; GFX10-NEXT: v_lshrrev_b32_e32 v4, 1, v1
-; GFX10-NEXT: s_mov_b32 s2, 0xffff
; GFX10-NEXT: v_lshlrev_b32_e32 v2, 4, v2
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v4
-; GFX10-NEXT: v_lshlrev_b32_e64 v3, v2, s2
+; GFX10-NEXT: v_lshlrev_b32_e32 v3, v2, v3
; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; GFX10-NEXT: v_xor_b32_e32 v3, -1, v3
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
@@ -1300,20 +1300,20 @@
; GFX9-LABEL: insertelement_v_v4i16_v_v:
; GFX9: ; %bb.0:
; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
-; GFX9-NEXT: v_lshrrev_b32_e32 v6, 1, v3
+; GFX9-NEXT: v_lshrrev_b32_e32 v7, 1, v3
; GFX9-NEXT: v_and_b32_e32 v3, 1, v3
-; GFX9-NEXT: s_mov_b32 s0, 0xffff
+; GFX9-NEXT: v_mov_b32_e32 v6, 0xffff
; GFX9-NEXT: v_lshlrev_b32_e32 v3, 4, v3
; GFX9-NEXT: v_lshlrev_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; GFX9-NEXT: v_lshlrev_b32_e64 v3, v3, s0
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6
+; GFX9-NEXT: v_lshlrev_b32_e32 v3, v3, v6
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7
; GFX9-NEXT: v_xor_b32_e32 v3, -1, v3
; GFX9-NEXT: v_mov_b32_e32 v4, 0
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v6
+; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v7
; GFX9-NEXT: v_mov_b32_e32 v5, 0
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_cndmask_b32_e32 v7, v0, v1, vcc
-; GFX9-NEXT: v_and_or_b32 v2, v7, v3, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v6, v0, v1, vcc
+; GFX9-NEXT: v_and_or_b32 v2, v6, v3, v2
; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1]
; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
; GFX9-NEXT: global_store_dwordx2 v[4:5], v[0:1], off
@@ -1322,20 +1322,20 @@
; GFX8-LABEL: insertelement_v_v4i16_v_v:
; GFX8: ; %bb.0:
; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
-; GFX8-NEXT: v_lshrrev_b32_e32 v6, 1, v3
+; GFX8-NEXT: v_lshrrev_b32_e32 v7, 1, v3
; GFX8-NEXT: v_and_b32_e32 v3, 1, v3
-; GFX8-NEXT: s_mov_b32 s0, 0xffff
+; GFX8-NEXT: v_mov_b32_e32 v6, 0xffff
; GFX8-NEXT: v_lshlrev_b32_e32 v3, 4, v3
; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; GFX8-NEXT: v_lshlrev_b32_e64 v3, v3, s0
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, v3, v6
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7
; GFX8-NEXT: v_xor_b32_e32 v3, -1, v3
; GFX8-NEXT: v_mov_b32_e32 v4, 0
-; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v6
+; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v7
; GFX8-NEXT: v_mov_b32_e32 v5, 0
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_cndmask_b32_e32 v7, v0, v1, vcc
-; GFX8-NEXT: v_and_b32_e32 v3, v7, v3
+; GFX8-NEXT: v_cndmask_b32_e32 v6, v0, v1, vcc
+; GFX8-NEXT: v_and_b32_e32 v3, v6, v3
; GFX8-NEXT: v_or_b32_e32 v2, v3, v2
; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1]
; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
@@ -1345,21 +1345,21 @@
; GFX7-LABEL: insertelement_v_v4i16_v_v:
; GFX7: ; %bb.0:
; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
-; GFX7-NEXT: s_mov_b32 s0, 0xffff
-; GFX7-NEXT: v_lshrrev_b32_e32 v6, 1, v3
+; GFX7-NEXT: v_mov_b32_e32 v6, 0xffff
+; GFX7-NEXT: v_lshrrev_b32_e32 v7, 1, v3
; GFX7-NEXT: v_and_b32_e32 v3, 1, v3
-; GFX7-NEXT: v_and_b32_e32 v2, s0, v2
+; GFX7-NEXT: v_and_b32_e32 v2, v2, v6
; GFX7-NEXT: v_lshlrev_b32_e32 v3, 4, v3
; GFX7-NEXT: v_lshlrev_b32_e32 v2, v3, v2
-; GFX7-NEXT: v_lshl_b32_e32 v3, s0, v3
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, v3, v6
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7
; GFX7-NEXT: v_xor_b32_e32 v3, -1, v3
; GFX7-NEXT: v_mov_b32_e32 v4, 0
-; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v6
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v7
; GFX7-NEXT: v_mov_b32_e32 v5, 0
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_cndmask_b32_e32 v7, v0, v1, vcc
-; GFX7-NEXT: v_and_b32_e32 v3, v7, v3
+; GFX7-NEXT: v_cndmask_b32_e32 v6, v0, v1, vcc
+; GFX7-NEXT: v_and_b32_e32 v3, v6, v3
; GFX7-NEXT: v_or_b32_e32 v2, v3, v2
; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1]
; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
@@ -1370,13 +1370,13 @@
; GFX10: ; %bb.0:
; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
; GFX10-NEXT: v_and_b32_e32 v4, 1, v3
+; GFX10-NEXT: v_mov_b32_e32 v5, 0xffff
; GFX10-NEXT: v_lshrrev_b32_e32 v6, 1, v3
-; GFX10-NEXT: s_mov_b32 s0, 0xffff
; GFX10-NEXT: v_lshlrev_b32_e32 v4, 4, v4
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v6
-; GFX10-NEXT: v_lshlrev_b32_e64 v5, v4, s0
-; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 0, v6
+; GFX10-NEXT: v_lshlrev_b32_e32 v5, v4, v5
+; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; GFX10-NEXT: v_xor_b32_e32 v3, -1, v5
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_cndmask_b32_e32 v4, v0, v1, vcc_lo
@@ -1989,12 +1989,12 @@
; GFX9-NEXT: v_mov_b32_e32 v5, s6
; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 4, v1
-; GFX9-NEXT: s_mov_b32 s8, 0xffff
+; GFX9-NEXT: v_mov_b32_e32 v3, 0xffff
; GFX9-NEXT: v_mov_b32_e32 v6, s7
; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[0:1]
; GFX9-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v4
; GFX9-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; GFX9-NEXT: v_lshlrev_b32_e64 v1, v1, s8
+; GFX9-NEXT: v_lshlrev_b32_e32 v1, v1, v3
; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[2:3]
; GFX9-NEXT: v_xor_b32_e32 v1, -1, v1
; GFX9-NEXT: v_and_or_b32 v6, v2, v1, v0
@@ -2025,12 +2025,12 @@
; GFX8-NEXT: v_mov_b32_e32 v5, s6
; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 4, v1
-; GFX8-NEXT: s_mov_b32 s8, 0xffff
+; GFX8-NEXT: v_mov_b32_e32 v3, 0xffff
; GFX8-NEXT: v_mov_b32_e32 v6, s7
; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[0:1]
; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v4
; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; GFX8-NEXT: v_lshlrev_b32_e64 v1, v1, s8
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, v1, v3
; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[2:3]
; GFX8-NEXT: v_xor_b32_e32 v1, -1, v1
; GFX8-NEXT: v_and_b32_e32 v1, v2, v1
@@ -2055,20 +2055,20 @@
; GFX7-NEXT: v_lshrrev_b32_e32 v4, 1, v1
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4
; GFX7-NEXT: v_and_b32_e32 v1, 1, v1
-; GFX7-NEXT: s_mov_b32 s8, 0xffff
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v4
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v2, s4
; GFX7-NEXT: v_mov_b32_e32 v3, s5
-; GFX7-NEXT: v_mov_b32_e32 v5, s6
; GFX7-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc
-; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v4
+; GFX7-NEXT: v_mov_b32_e32 v3, 0xffff
+; GFX7-NEXT: v_mov_b32_e32 v5, s6
; GFX7-NEXT: v_lshlrev_b32_e32 v1, 4, v1
-; GFX7-NEXT: v_and_b32_e32 v0, s8, v0
+; GFX7-NEXT: v_and_b32_e32 v0, v0, v3
; GFX7-NEXT: v_mov_b32_e32 v6, s7
; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[0:1]
; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v4
; GFX7-NEXT: v_lshlrev_b32_e32 v0, v1, v0
-; GFX7-NEXT: v_lshl_b32_e32 v1, s8, v1
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, v1, v3
; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[2:3]
; GFX7-NEXT: v_xor_b32_e32 v1, -1, v1
; GFX7-NEXT: v_and_b32_e32 v1, v2, v1
@@ -2091,15 +2091,16 @@
; GFX10-LABEL: insertelement_s_v8i16_v_v:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
-; GFX10-NEXT: v_and_b32_e32 v2, 1, v1
; GFX10-NEXT: v_lshrrev_b32_e32 v6, 1, v1
-; GFX10-NEXT: s_mov_b32 s0, 0xffff
-; GFX10-NEXT: v_lshlrev_b32_e32 v2, 4, v2
+; GFX10-NEXT: v_and_b32_e32 v2, 1, v1
+; GFX10-NEXT: v_mov_b32_e32 v3, 0xffff
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v6
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 4, v2
+; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 2, v6
; GFX10-NEXT: v_cmp_eq_u32_e64 s1, 3, v6
+; GFX10-NEXT: s_mov_b32 null, 0
; GFX10-NEXT: v_cmp_eq_u32_e64 s2, 0, v6
-; GFX10-NEXT: v_lshlrev_b32_e64 v3, v2, s0
-; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 2, v6
+; GFX10-NEXT: v_lshlrev_b32_e32 v3, v2, v3
; GFX10-NEXT: v_lshlrev_b32_sdwa v4, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; GFX10-NEXT: v_xor_b32_e32 v5, -1, v3
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
@@ -2380,24 +2381,24 @@
; GFX9-LABEL: insertelement_v_v8i16_v_v:
; GFX9: ; %bb.0:
; GFX9-NEXT: global_load_dwordx4 v[4:7], v[0:1], off
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 1, v3
-; GFX9-NEXT: v_and_b32_e32 v1, 1, v3
-; GFX9-NEXT: s_mov_b32 s0, 0xffff
-; GFX9-NEXT: v_lshlrev_b32_e32 v1, 4, v1
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX9-NEXT: v_lshlrev_b32_sdwa v2, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; GFX9-NEXT: v_lshlrev_b32_e64 v1, v1, s0
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v0
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v0
-; GFX9-NEXT: v_xor_b32_e32 v1, -1, v1
+; GFX9-NEXT: v_lshrrev_b32_e32 v1, 1, v3
+; GFX9-NEXT: v_and_b32_e32 v3, 1, v3
+; GFX9-NEXT: v_mov_b32_e32 v0, 0xffff
+; GFX9-NEXT: v_lshlrev_b32_e32 v3, 4, v3
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
+; GFX9-NEXT: v_lshlrev_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX9-NEXT: v_lshlrev_b32_e32 v0, v3, v0
+; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v1
+; GFX9-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v1
+; GFX9-NEXT: v_xor_b32_e32 v0, -1, v0
; GFX9-NEXT: v_mov_b32_e32 v8, 0
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0
+; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v1
; GFX9-NEXT: v_mov_b32_e32 v9, 0
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v6, s[0:1]
; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v7, s[2:3]
-; GFX9-NEXT: v_and_or_b32 v3, v3, v1, v2
+; GFX9-NEXT: v_and_or_b32 v3, v3, v0, v2
; GFX9-NEXT: v_cndmask_b32_e64 v0, v4, v3, s[4:5]
; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc
; GFX9-NEXT: v_cndmask_b32_e64 v2, v6, v3, s[0:1]
@@ -2408,25 +2409,25 @@
; GFX8-LABEL: insertelement_v_v8i16_v_v:
; GFX8: ; %bb.0:
; GFX8-NEXT: flat_load_dwordx4 v[4:7], v[0:1]
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 1, v3
-; GFX8-NEXT: v_and_b32_e32 v1, 1, v3
-; GFX8-NEXT: s_mov_b32 s0, 0xffff
-; GFX8-NEXT: v_lshlrev_b32_e32 v1, 4, v1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; GFX8-NEXT: v_lshlrev_b32_e64 v1, v1, s0
-; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v0
-; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v0
-; GFX8-NEXT: v_xor_b32_e32 v1, -1, v1
+; GFX8-NEXT: v_lshrrev_b32_e32 v1, 1, v3
+; GFX8-NEXT: v_and_b32_e32 v3, 1, v3
+; GFX8-NEXT: v_mov_b32_e32 v0, 0xffff
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 4, v3
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
+; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, v3, v0
+; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v1
+; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v1
+; GFX8-NEXT: v_xor_b32_e32 v0, -1, v0
; GFX8-NEXT: v_mov_b32_e32 v8, 0
-; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0
+; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v1
; GFX8-NEXT: v_mov_b32_e32 v9, 0
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v6, s[0:1]
; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v7, s[2:3]
-; GFX8-NEXT: v_and_b32_e32 v1, v3, v1
-; GFX8-NEXT: v_or_b32_e32 v3, v1, v2
+; GFX8-NEXT: v_and_b32_e32 v0, v3, v0
+; GFX8-NEXT: v_or_b32_e32 v3, v0, v2
; GFX8-NEXT: v_cndmask_b32_e64 v0, v4, v3, s[4:5]
; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc
; GFX8-NEXT: v_cndmask_b32_e64 v2, v6, v3, s[0:1]
@@ -2440,25 +2441,25 @@
; GFX7-NEXT: s_mov_b32 s11, 0xf000
; GFX7-NEXT: s_mov_b64 s[8:9], 0
; GFX7-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[8:11], 0 addr64
-; GFX7-NEXT: s_mov_b32 s0, 0xffff
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, 1, v3
-; GFX7-NEXT: v_and_b32_e32 v1, 1, v3
-; GFX7-NEXT: v_and_b32_e32 v2, s0, v2
-; GFX7-NEXT: v_lshlrev_b32_e32 v1, 4, v1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, v1, v2
-; GFX7-NEXT: v_lshl_b32_e32 v1, s0, v1
-; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v0
-; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v0
-; GFX7-NEXT: v_xor_b32_e32 v1, -1, v1
-; GFX7-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0
+; GFX7-NEXT: v_mov_b32_e32 v0, 0xffff
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 1, v3
+; GFX7-NEXT: v_and_b32_e32 v3, 1, v3
+; GFX7-NEXT: v_and_b32_e32 v2, v2, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 4, v3
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, v3, v2
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, v3, v0
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v1
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v1
+; GFX7-NEXT: v_xor_b32_e32 v0, -1, v0
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v1
; GFX7-NEXT: s_mov_b32 s10, -1
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
; GFX7-NEXT: v_cndmask_b32_e64 v3, v3, v6, s[0:1]
; GFX7-NEXT: v_cndmask_b32_e64 v3, v3, v7, s[2:3]
-; GFX7-NEXT: v_and_b32_e32 v1, v3, v1
-; GFX7-NEXT: v_or_b32_e32 v3, v1, v2
+; GFX7-NEXT: v_and_b32_e32 v0, v3, v0
+; GFX7-NEXT: v_or_b32_e32 v3, v0, v2
; GFX7-NEXT: v_cndmask_b32_e64 v0, v4, v3, s[4:5]
; GFX7-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc
; GFX7-NEXT: v_cndmask_b32_e64 v2, v6, v3, s[0:1]
@@ -2469,23 +2470,23 @@
; GFX10-LABEL: insertelement_v_v8i16_v_v:
; GFX10: ; %bb.0:
; GFX10-NEXT: global_load_dwordx4 v[4:7], v[0:1], off
-; GFX10-NEXT: v_and_b32_e32 v0, 1, v3
; GFX10-NEXT: v_lshrrev_b32_e32 v1, 1, v3
-; GFX10-NEXT: s_mov_b32 s0, 0xffff
-; GFX10-NEXT: v_lshlrev_b32_e32 v0, 4, v0
+; GFX10-NEXT: v_and_b32_e32 v0, 1, v3
+; GFX10-NEXT: v_mov_b32_e32 v3, 0xffff
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1
+; GFX10-NEXT: v_lshlrev_b32_e32 v0, 4, v0
+; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 2, v1
; GFX10-NEXT: v_cmp_eq_u32_e64 s1, 3, v1
; GFX10-NEXT: v_cmp_eq_u32_e64 s2, 0, v1
-; GFX10-NEXT: v_lshlrev_b32_e64 v8, v0, s0
-; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 2, v1
+; GFX10-NEXT: v_lshlrev_b32_e32 v3, v0, v3
; GFX10-NEXT: v_lshlrev_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; GFX10-NEXT: v_xor_b32_e32 v2, -1, v8
+; GFX10-NEXT: v_xor_b32_e32 v2, -1, v3
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_cndmask_b32_e32 v8, v4, v5, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v8, v8, v6, s0
+; GFX10-NEXT: v_cndmask_b32_e64 v3, v8, v7, s1
; GFX10-NEXT: v_mov_b32_e32 v8, 0
; GFX10-NEXT: v_mov_b32_e32 v9, 0
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v6, s0
-; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v7, s1
; GFX10-NEXT: v_and_or_b32 v3, v3, v2, v0
; GFX10-NEXT: v_cndmask_b32_e64 v0, v4, v3, s2
; GFX10-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc_lo
@@ -3446,12 +3447,12 @@
; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v7, s[6:7]
; GFX9-NEXT: v_cmp_eq_u32_e64 s[8:9], 6, v8
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 4, v1
-; GFX9-NEXT: s_mov_b32 s20, 0xffff
+; GFX9-NEXT: v_mov_b32_e32 v3, 0xffff
; GFX9-NEXT: v_mov_b32_e32 v10, s19
; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v9, s[8:9]
; GFX9-NEXT: v_cmp_eq_u32_e64 s[10:11], 7, v8
; GFX9-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; GFX9-NEXT: v_lshlrev_b32_e64 v1, v1, s20
+; GFX9-NEXT: v_lshlrev_b32_e32 v1, v1, v3
; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v10, s[10:11]
; GFX9-NEXT: v_xor_b32_e32 v1, -1, v1
; GFX9-NEXT: v_and_or_b32 v9, v2, v1, v0
@@ -3505,12 +3506,12 @@
; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v7, s[6:7]
; GFX8-NEXT: v_cmp_eq_u32_e64 s[8:9], 6, v8
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 4, v1
-; GFX8-NEXT: s_mov_b32 s20, 0xffff
+; GFX8-NEXT: v_mov_b32_e32 v3, 0xffff
; GFX8-NEXT: v_mov_b32_e32 v10, s19
; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v9, s[8:9]
; GFX8-NEXT: v_cmp_eq_u32_e64 s[10:11], 7, v8
; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; GFX8-NEXT: v_lshlrev_b32_e64 v1, v1, s20
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, v1, v3
; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v10, s[10:11]
; GFX8-NEXT: v_xor_b32_e32 v1, -1, v1
; GFX8-NEXT: v_and_b32_e32 v1, v2, v1
@@ -3561,17 +3562,17 @@
; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[4:5]
; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], 5, v8
; GFX7-NEXT: v_and_b32_e32 v1, 1, v1
-; GFX7-NEXT: s_mov_b32 s20, 0xffff
+; GFX7-NEXT: v_mov_b32_e32 v3, 0xffff
; GFX7-NEXT: v_mov_b32_e32 v9, s18
; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v7, s[6:7]
; GFX7-NEXT: v_cmp_eq_u32_e64 s[8:9], 6, v8
; GFX7-NEXT: v_lshlrev_b32_e32 v1, 4, v1
-; GFX7-NEXT: v_and_b32_e32 v0, s20, v0
+; GFX7-NEXT: v_and_b32_e32 v0, v0, v3
; GFX7-NEXT: v_mov_b32_e32 v10, s19
; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v9, s[8:9]
; GFX7-NEXT: v_cmp_eq_u32_e64 s[10:11], 7, v8
; GFX7-NEXT: v_lshlrev_b32_e32 v0, v1, v0
-; GFX7-NEXT: v_lshl_b32_e32 v1, s20, v1
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, v1, v3
; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v10, s[10:11]
; GFX7-NEXT: v_xor_b32_e32 v1, -1, v1
; GFX7-NEXT: v_and_b32_e32 v1, v2, v1
@@ -3606,22 +3607,23 @@
; GFX10-NEXT: s_load_dwordx8 s[8:15], s[2:3], 0x0
; GFX10-NEXT: v_lshrrev_b32_e32 v10, 1, v1
; GFX10-NEXT: v_and_b32_e32 v1, 1, v1
-; GFX10-NEXT: s_mov_b32 s4, 0xffff
+; GFX10-NEXT: v_mov_b32_e32 v3, 0xffff
; GFX10-NEXT: v_mov_b32_e32 v12, 0
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v10
; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 2, v10
; GFX10-NEXT: v_cmp_eq_u32_e64 s1, 3, v10
+; GFX10-NEXT: s_mov_b32 null, 0
; GFX10-NEXT: v_cmp_eq_u32_e64 s2, 4, v10
-; GFX10-NEXT: v_lshlrev_b32_e32 v1, 4, v1
; GFX10-NEXT: v_cmp_eq_u32_e64 s3, 5, v10
+; GFX10-NEXT: v_lshlrev_b32_e32 v1, 4, v1
+; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 6, v10
; GFX10-NEXT: v_cmp_eq_u32_e64 s5, 7, v10
; GFX10-NEXT: v_cmp_eq_u32_e64 s6, 0, v10
-; GFX10-NEXT: v_lshlrev_b32_e64 v3, v1, s4
-; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 6, v10
+; GFX10-NEXT: v_lshlrev_b32_e32 v3, v1, v3
; GFX10-NEXT: v_lshlrev_b32_sdwa v8, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; GFX10-NEXT: v_xor_b32_e32 v9, -1, v3
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v2, s9
+; GFX10-NEXT: v_xor_b32_e32 v9, -1, v3
; GFX10-NEXT: v_cndmask_b32_e32 v2, s8, v2, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s10, s0
; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s11, s1
@@ -4053,21 +4055,21 @@
; GFX9: ; %bb.0:
; GFX9-NEXT: global_load_dwordx4 v[4:7], v[0:1], off
; GFX9-NEXT: global_load_dwordx4 v[8:11], v[0:1], off offset:16
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 1, v3
-; GFX9-NEXT: v_and_b32_e32 v1, 1, v3
-; GFX9-NEXT: s_mov_b32 s0, 0xffff
-; GFX9-NEXT: v_lshlrev_b32_e32 v1, 4, v1
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX9-NEXT: v_lshlrev_b32_sdwa v2, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; GFX9-NEXT: v_lshlrev_b32_e64 v1, v1, s0
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v0
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v0
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 4, v0
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[6:7], 5, v0
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[8:9], 6, v0
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[10:11], 7, v0
-; GFX9-NEXT: v_xor_b32_e32 v1, -1, v1
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[12:13], 0, v0
+; GFX9-NEXT: v_lshrrev_b32_e32 v1, 1, v3
+; GFX9-NEXT: v_and_b32_e32 v3, 1, v3
+; GFX9-NEXT: v_mov_b32_e32 v0, 0xffff
+; GFX9-NEXT: v_lshlrev_b32_e32 v3, 4, v3
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
+; GFX9-NEXT: v_lshlrev_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX9-NEXT: v_lshlrev_b32_e32 v0, v3, v0
+; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v1
+; GFX9-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v1
+; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 4, v1
+; GFX9-NEXT: v_cmp_eq_u32_e64 s[6:7], 5, v1
+; GFX9-NEXT: v_cmp_eq_u32_e64 s[8:9], 6, v1
+; GFX9-NEXT: v_cmp_eq_u32_e64 s[10:11], 7, v1
+; GFX9-NEXT: v_xor_b32_e32 v0, -1, v0
+; GFX9-NEXT: v_cmp_eq_u32_e64 s[12:13], 0, v1
; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v6, s[0:1]
@@ -4077,7 +4079,7 @@
; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v9, s[6:7]
; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v10, s[8:9]
; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v11, s[10:11]
-; GFX9-NEXT: v_and_or_b32 v12, v3, v1, v2
+; GFX9-NEXT: v_and_or_b32 v12, v3, v0, v2
; GFX9-NEXT: v_cndmask_b32_e64 v0, v4, v12, s[12:13]
; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v12, vcc
; GFX9-NEXT: v_cndmask_b32_e64 v4, v8, v12, s[4:5]
@@ -4100,21 +4102,21 @@
; GFX8-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dwordx4 v[4:7], v[0:1]
; GFX8-NEXT: flat_load_dwordx4 v[8:11], v[8:9]
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 1, v3
-; GFX8-NEXT: v_and_b32_e32 v1, 1, v3
-; GFX8-NEXT: s_mov_b32 s0, 0xffff
-; GFX8-NEXT: v_lshlrev_b32_e32 v1, 4, v1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; GFX8-NEXT: v_lshlrev_b32_e64 v1, v1, s0
-; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v0
-; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v0
-; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 4, v0
-; GFX8-NEXT: v_cmp_eq_u32_e64 s[6:7], 5, v0
-; GFX8-NEXT: v_cmp_eq_u32_e64 s[8:9], 6, v0
-; GFX8-NEXT: v_cmp_eq_u32_e64 s[10:11], 7, v0
-; GFX8-NEXT: v_xor_b32_e32 v1, -1, v1
-; GFX8-NEXT: v_cmp_eq_u32_e64 s[12:13], 0, v0
+; GFX8-NEXT: v_lshrrev_b32_e32 v1, 1, v3
+; GFX8-NEXT: v_and_b32_e32 v3, 1, v3
+; GFX8-NEXT: v_mov_b32_e32 v0, 0xffff
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 4, v3
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
+; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, v3, v0
+; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v1
+; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v1
+; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 4, v1
+; GFX8-NEXT: v_cmp_eq_u32_e64 s[6:7], 5, v1
+; GFX8-NEXT: v_cmp_eq_u32_e64 s[8:9], 6, v1
+; GFX8-NEXT: v_cmp_eq_u32_e64 s[10:11], 7, v1
+; GFX8-NEXT: v_xor_b32_e32 v0, -1, v0
+; GFX8-NEXT: v_cmp_eq_u32_e64 s[12:13], 0, v1
; GFX8-NEXT: s_waitcnt vmcnt(1)
; GFX8-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v6, s[0:1]
@@ -4124,8 +4126,8 @@
; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v9, s[6:7]
; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v10, s[8:9]
; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v11, s[10:11]
-; GFX8-NEXT: v_and_b32_e32 v1, v3, v1
-; GFX8-NEXT: v_or_b32_e32 v12, v1, v2
+; GFX8-NEXT: v_and_b32_e32 v0, v3, v0
+; GFX8-NEXT: v_or_b32_e32 v12, v0, v2
; GFX8-NEXT: v_cndmask_b32_e64 v0, v4, v12, s[12:13]
; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v12, vcc
; GFX8-NEXT: v_cndmask_b32_e64 v4, v8, v12, s[4:5]
@@ -4149,22 +4151,22 @@
; GFX7-NEXT: s_mov_b64 s[16:17], 0
; GFX7-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[16:19], 0 addr64
; GFX7-NEXT: buffer_load_dwordx4 v[8:11], v[0:1], s[16:19], 0 addr64 offset:16
-; GFX7-NEXT: s_mov_b32 s0, 0xffff
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, 1, v3
-; GFX7-NEXT: v_and_b32_e32 v1, 1, v3
-; GFX7-NEXT: v_and_b32_e32 v2, s0, v2
-; GFX7-NEXT: v_lshlrev_b32_e32 v1, 4, v1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, v1, v2
-; GFX7-NEXT: v_lshl_b32_e32 v1, s0, v1
-; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v0
-; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v0
-; GFX7-NEXT: v_cmp_eq_u32_e64 s[4:5], 4, v0
-; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], 5, v0
-; GFX7-NEXT: v_cmp_eq_u32_e64 s[8:9], 6, v0
-; GFX7-NEXT: v_cmp_eq_u32_e64 s[10:11], 7, v0
-; GFX7-NEXT: v_xor_b32_e32 v1, -1, v1
-; GFX7-NEXT: v_cmp_eq_u32_e64 s[12:13], 0, v0
+; GFX7-NEXT: v_mov_b32_e32 v0, 0xffff
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 1, v3
+; GFX7-NEXT: v_and_b32_e32 v3, 1, v3
+; GFX7-NEXT: v_and_b32_e32 v2, v2, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 4, v3
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, v3, v2
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, v3, v0
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v1
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v1
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[4:5], 4, v1
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], 5, v1
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[8:9], 6, v1
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[10:11], 7, v1
+; GFX7-NEXT: v_xor_b32_e32 v0, -1, v0
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[12:13], 0, v1
; GFX7-NEXT: s_mov_b32 s18, -1
; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
@@ -4175,8 +4177,8 @@
; GFX7-NEXT: v_cndmask_b32_e64 v3, v3, v9, s[6:7]
; GFX7-NEXT: v_cndmask_b32_e64 v3, v3, v10, s[8:9]
; GFX7-NEXT: v_cndmask_b32_e64 v3, v3, v11, s[10:11]
-; GFX7-NEXT: v_and_b32_e32 v1, v3, v1
-; GFX7-NEXT: v_or_b32_e32 v12, v1, v2
+; GFX7-NEXT: v_and_b32_e32 v0, v3, v0
+; GFX7-NEXT: v_or_b32_e32 v12, v0, v2
; GFX7-NEXT: v_cndmask_b32_e64 v0, v4, v12, s[12:13]
; GFX7-NEXT: v_cndmask_b32_e32 v1, v5, v12, vcc
; GFX7-NEXT: v_cndmask_b32_e64 v2, v6, v12, s[0:1]
@@ -4197,18 +4199,18 @@
; GFX10-NEXT: global_load_dwordx4 v[8:11], v[0:1], off offset:16
; GFX10-NEXT: v_lshrrev_b32_e32 v0, 1, v3
; GFX10-NEXT: v_and_b32_e32 v3, 1, v3
-; GFX10-NEXT: s_mov_b32 s4, 0xffff
+; GFX10-NEXT: v_mov_b32_e32 v12, 0xffff
; GFX10-NEXT: v_mov_b32_e32 v15, 0
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 2, v0
; GFX10-NEXT: v_cmp_eq_u32_e64 s1, 3, v0
; GFX10-NEXT: v_cmp_eq_u32_e64 s2, 4, v0
-; GFX10-NEXT: v_lshlrev_b32_e32 v3, 4, v3
; GFX10-NEXT: v_cmp_eq_u32_e64 s3, 5, v0
+; GFX10-NEXT: v_lshlrev_b32_e32 v3, 4, v3
+; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 6, v0
; GFX10-NEXT: v_cmp_eq_u32_e64 s5, 7, v0
; GFX10-NEXT: v_cmp_eq_u32_e64 s6, 0, v0
-; GFX10-NEXT: v_lshlrev_b32_e64 v12, v3, s4
-; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 6, v0
+; GFX10-NEXT: v_lshlrev_b32_e32 v12, v3, v12
; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; GFX10-NEXT: v_xor_b32_e32 v3, -1, v12
; GFX10-NEXT: v_mov_b32_e32 v12, 0
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll
@@ -71,17 +71,17 @@
; GFX10: ; %bb.0:
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s5, 1
-; GFX10-NEXT: v_cmp_eq_u32_e64 s1, s5, 0
+; GFX10-NEXT: v_mov_b32_e32 v2, 0xff
; GFX10-NEXT: global_load_ushort v0, v0, s[2:3]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_lshrrev_b32_e32 v1, 8, v0
-; GFX10-NEXT: v_cndmask_b32_e64 v2, v0, s4, s1
; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s4, s0
-; GFX10-NEXT: s_movk_i32 s0, 0xff
-; GFX10-NEXT: v_and_b32_sdwa v3, v1, s0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s5, 0
+; GFX10-NEXT: v_and_b32_sdwa v2, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX10-NEXT: v_cndmask_b32_e64 v3, v0, s4, s0
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: v_mov_b32_e32 v1, 0
-; GFX10-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX10-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX10-NEXT: global_store_short v[0:1], v2, off
; GFX10-NEXT: s_endpgm
%vec = load <2 x i8>, <2 x i8> addrspace(4)* %ptr
@@ -153,16 +153,16 @@
; GFX10: ; %bb.0:
; GFX10-NEXT: global_load_ushort v0, v[0:1], off
;
GFX10-NEXT: v_cmp_eq_u32_e64 s0, s3, 1 -; GFX10-NEXT: v_cmp_eq_u32_e64 s1, s3, 0 +; GFX10-NEXT: v_mov_b32_e32 v2, 0xff ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v1, 8, v0 -; GFX10-NEXT: v_cndmask_b32_e64 v2, v0, s2, s1 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s2, s0 -; GFX10-NEXT: s_movk_i32 s0, 0xff -; GFX10-NEXT: v_and_b32_sdwa v3, v1, s0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s3, 0 +; GFX10-NEXT: v_and_b32_sdwa v2, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_cndmask_b32_e64 v3, v0, s2, s0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-NEXT: global_store_short v[0:1], v2, off ; GFX10-NEXT: s_endpgm %vec = load <2 x i8>, <2 x i8> addrspace(1 )* %ptr @@ -235,17 +235,17 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s4, 1 -; GFX10-NEXT: s_movk_i32 s0, 0xff +; GFX10-NEXT: v_mov_b32_e32 v3, 0xff ; GFX10-NEXT: global_load_ushort v1, v1, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v2, 8, v1 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s4, 0 -; GFX10-NEXT: v_and_b32_sdwa v2, v2, s0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: v_cndmask_b32_e32 v3, v1, v0, vcc_lo +; GFX10-NEXT: v_and_b32_sdwa v2, v2, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_cndmask_b32_e32 v4, v1, v0, vcc_lo ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-NEXT: global_store_short v[0:1], v2, off ; GFX10-NEXT: s_endpgm %vec = load <2 x i8>, <2 x i8> addrspace(4)* %ptr @@ -321,17 +321,17 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX10-NEXT: s_movk_i32 s0, 0xff +; GFX10-NEXT: v_mov_b32_e32 v3, 0xff ; GFX10-NEXT: global_load_ushort v1, v1, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v2, 8, v1 ; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s4, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10-NEXT: v_and_b32_sdwa v2, v2, s0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: v_cndmask_b32_e64 v3, v1, s4, vcc_lo +; GFX10-NEXT: v_and_b32_sdwa v2, v2, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_cndmask_b32_e64 v4, v1, s4, vcc_lo ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-NEXT: global_store_short v[0:1], v2, off ; GFX10-NEXT: s_endpgm %vec = load <2 x i8>, <2 x i8> addrspace(4)* %ptr @@ -404,13 +404,13 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1 -; GFX10-NEXT: s_movk_i32 s0, 0xff +; GFX10-NEXT: v_mov_b32_e32 v4, 0xff ; GFX10-NEXT: global_load_ushort v2, v2, s[2:3] ; 
GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v3, 8, v2 ; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v0, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX10-NEXT: v_and_b32_sdwa v3, v3, s0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_and_b32_sdwa v3, v3, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc_lo ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 @@ -486,12 +486,12 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: global_load_ushort v0, v[0:1], off ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2 -; GFX10-NEXT: s_movk_i32 s0, 0xff +; GFX10-NEXT: v_mov_b32_e32 v3, 0xff ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v1, 8, v0 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s2, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 -; GFX10-NEXT: v_and_b32_sdwa v3, v1, s0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_and_b32_sdwa v3, v1, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX10-NEXT: v_cndmask_b32_e64 v2, v0, s2, vcc_lo ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 @@ -564,12 +564,12 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: global_load_ushort v0, v[0:1], off ; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 1 -; GFX10-NEXT: s_movk_i32 s0, 0xff +; GFX10-NEXT: v_mov_b32_e32 v3, 0xff ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v1, 8, v0 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 0 -; GFX10-NEXT: v_and_b32_sdwa v3, v1, s0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_and_b32_sdwa v3, v1, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX10-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc_lo ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 @@ -642,12 +642,12 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: global_load_ushort v0, v[0:1], off ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3 -; GFX10-NEXT: s_movk_i32 s0, 0xff +; GFX10-NEXT: v_mov_b32_e32 v4, 0xff ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v1, 8, v0 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 -; GFX10-NEXT: v_and_b32_sdwa v3, v1, s0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_and_b32_sdwa v3, v1, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX10-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc_lo ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 @@ -721,30 +721,28 @@ ; GFX9-LABEL: insertelement_v_v4i8_s_s: ; GFX9: ; %bb.0: ; GFX9-NEXT: global_load_dword v0, v[0:1], off -; GFX9-NEXT: s_mov_b32 s0, 8 -; GFX9-NEXT: s_movk_i32 s4, 0xff -; GFX9-NEXT: s_and_b32 s3, s3, 3 -; GFX9-NEXT: s_mov_b32 s1, 16 -; GFX9-NEXT: s_and_b32 s2, s2, s4 -; GFX9-NEXT: s_lshl_b32 s3, s3, 3 -; GFX9-NEXT: s_lshl_b32 s2, s2, s3 -; GFX9-NEXT: s_lshl_b32 s3, s4, s3 -; GFX9-NEXT: s_not_b32 s3, s3 -; GFX9-NEXT: v_mov_b32_e32 v3, s2 +; GFX9-NEXT: s_movk_i32 s0, 0xff ; GFX9-NEXT: v_mov_b32_e32 v1, 8 +; GFX9-NEXT: s_and_b32 s1, s3, 3 ; GFX9-NEXT: v_mov_b32_e32 v2, 16 +; GFX9-NEXT: s_and_b32 s2, s2, s0 +; GFX9-NEXT: s_lshl_b32 s1, s1, 3 +; GFX9-NEXT: s_lshl_b32 s2, s2, s1 +; GFX9-NEXT: s_lshl_b32 s1, s0, s1 +; GFX9-NEXT: s_not_b32 s1, s1 +; GFX9-NEXT: v_mov_b32_e32 v3, s2 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshrrev_b32_e32 v4, 24, v0 -; GFX9-NEXT: 
v_lshlrev_b32_sdwa v5, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v6, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_and_or_b32 v0, v0, s4, v5 +; GFX9-NEXT: v_lshlrev_b32_sdwa v5, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v6, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_and_or_b32 v0, v0, s0, v5 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 24, v4 ; GFX9-NEXT: v_or3_b32 v0, v0, v6, v4 -; GFX9-NEXT: v_and_or_b32 v0, v0, s3, v3 +; GFX9-NEXT: v_and_or_b32 v0, v0, s1, v3 ; GFX9-NEXT: v_lshrrev_b32_e32 v3, 24, v0 ; GFX9-NEXT: v_lshlrev_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX9-NEXT: v_lshlrev_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_and_or_b32 v4, v0, s4, v1 +; GFX9-NEXT: v_and_or_b32 v4, v0, s0, v1 ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 24, v3 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 @@ -764,27 +762,25 @@ ; GFX8-NEXT: s_lshl_b32 s0, s0, s1 ; GFX8-NEXT: s_not_b32 s0, s0 ; GFX8-NEXT: s_lshl_b32 s2, s2, s1 -; GFX8-NEXT: v_mov_b32_e32 v3, 8 -; GFX8-NEXT: v_mov_b32_e32 v4, 16 ; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_lshlrev_b32_sdwa v4, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 24, v0 +; GFX8-NEXT: v_lshlrev_b32_sdwa v5, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v5 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v3 +; GFX8-NEXT: v_and_b32_e32 v0, s0, v0 +; GFX8-NEXT: v_or_b32_e32 v0, s2, v0 ; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 24, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 24, v0 ; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 24, v5 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX8-NEXT: v_and_b32_e32 v0, s0, v0 -; GFX8-NEXT: v_or_b32_e32 v0, s2, v0 -; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX8-NEXT: v_lshrrev_b32_e32 v1, 24, v0 -; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX8-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 24, v1 -; GFX8-NEXT: v_or_b32_e32 v3, v0, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GFX8-NEXT: v_or_b32_e32 v2, v0, v2 ; GFX8-NEXT: v_mov_b32_e32 v0, 0 ; GFX8-NEXT: v_mov_b32_e32 v1, 0 -; GFX8-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX8-NEXT: v_or_b32_e32 v2, v2, v3 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; @@ -831,29 +827,27 @@ ; GFX10-LABEL: insertelement_v_v4i8_s_s: ; GFX10: ; %bb.0: ; GFX10-NEXT: global_load_dword v0, v[0:1], off -; GFX10-NEXT: s_mov_b32 s0, 8 -; GFX10-NEXT: s_movk_i32 s1, 0xff -; GFX10-NEXT: s_and_b32 s2, s2, s1 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: 
v_lshlrev_b32_sdwa v1, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX10-NEXT: v_lshrrev_b32_e32 v2, 24, v0 -; GFX10-NEXT: s_mov_b32 s0, 16 -; GFX10-NEXT: v_lshlrev_b32_sdwa v3, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_and_or_b32 v0, v0, s1, v1 -; GFX10-NEXT: v_lshlrev_b32_e32 v1, 24, v2 -; GFX10-NEXT: s_and_b32 s0, s3, 3 +; GFX10-NEXT: v_mov_b32_e32 v1, 8 ; GFX10-NEXT: v_mov_b32_e32 v2, 16 -; GFX10-NEXT: s_lshl_b32 s0, s0, 3 -; GFX10-NEXT: v_or3_b32 v0, v0, v3, v1 -; GFX10-NEXT: s_lshl_b32 s3, s1, s0 -; GFX10-NEXT: s_lshl_b32 s0, s2, s0 +; GFX10-NEXT: s_movk_i32 s0, 0xff +; GFX10-NEXT: s_and_b32 s1, s3, 3 +; GFX10-NEXT: s_and_b32 s2, s2, s0 +; GFX10-NEXT: s_lshl_b32 s1, s1, 3 +; GFX10-NEXT: s_lshl_b32 s3, s0, s1 +; GFX10-NEXT: s_lshl_b32 s1, s2, s1 ; GFX10-NEXT: s_not_b32 s2, s3 -; GFX10-NEXT: v_mov_b32_e32 v1, 8 -; GFX10-NEXT: v_and_or_b32 v0, v0, s2, s0 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX10-NEXT: v_lshrrev_b32_e32 v4, 24, v0 +; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX10-NEXT: v_and_or_b32 v0, v0, s0, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 24, v4 +; GFX10-NEXT: v_or3_b32 v0, v0, v5, v3 +; GFX10-NEXT: v_and_or_b32 v0, v0, s2, s1 ; GFX10-NEXT: v_lshlrev_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX10-NEXT: v_lshrrev_b32_e32 v3, 24, v0 ; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_and_or_b32 v4, v0, s1, v1 +; GFX10-NEXT: v_and_or_b32 v4, v0, s0, v1 ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 24, v3 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 @@ -1320,29 +1314,27 @@ ; GFX9-LABEL: insertelement_v_v4i8_s_v: ; GFX9: ; %bb.0: ; GFX9-NEXT: global_load_dword v0, v[0:1], off -; GFX9-NEXT: s_mov_b32 s0, 8 -; GFX9-NEXT: s_movk_i32 s3, 0xff +; GFX9-NEXT: s_movk_i32 s0, 0xff +; GFX9-NEXT: v_mov_b32_e32 v1, 8 ; GFX9-NEXT: v_and_b32_e32 v2, 3, v2 -; GFX9-NEXT: s_mov_b32 s1, 16 -; GFX9-NEXT: s_and_b32 s2, s2, s3 +; GFX9-NEXT: v_mov_b32_e32 v3, 16 +; GFX9-NEXT: s_and_b32 s1, s2, s0 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v2 -; GFX9-NEXT: v_lshlrev_b32_e64 v4, v2, s2 -; GFX9-NEXT: v_lshlrev_b32_e64 v2, v2, s3 +; GFX9-NEXT: v_lshlrev_b32_e64 v4, v2, s1 +; GFX9-NEXT: v_lshlrev_b32_e64 v2, v2, s0 ; GFX9-NEXT: v_xor_b32_e32 v2, -1, v2 -; GFX9-NEXT: v_mov_b32_e32 v1, 8 -; GFX9-NEXT: v_mov_b32_e32 v3, 16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshrrev_b32_e32 v5, 24, v0 -; GFX9-NEXT: v_lshlrev_b32_sdwa v6, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v7, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_and_or_b32 v0, v0, s3, v6 +; GFX9-NEXT: v_lshlrev_b32_sdwa v6, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v7, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_and_or_b32 v0, v0, s0, v6 ; GFX9-NEXT: v_lshlrev_b32_e32 v5, 24, v5 ; GFX9-NEXT: v_or3_b32 v0, v0, v7, v5 ; GFX9-NEXT: v_and_or_b32 v0, v0, v2, v4 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 24, v0 ; GFX9-NEXT: v_lshlrev_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX9-NEXT: v_lshlrev_b32_sdwa v3, v3, 
v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_and_or_b32 v4, v0, s3, v1 +; GFX9-NEXT: v_and_or_b32 v4, v0, s0, v1 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 24, v2 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 @@ -1359,26 +1351,24 @@ ; GFX8-NEXT: v_and_b32_e32 v2, 3, v2 ; GFX8-NEXT: s_and_b32 s1, s2, s0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v2 -; GFX8-NEXT: v_lshlrev_b32_e64 v6, v2, s1 +; GFX8-NEXT: v_lshlrev_b32_e64 v4, v2, s1 ; GFX8-NEXT: v_lshlrev_b32_e64 v2, v2, s0 ; GFX8-NEXT: v_xor_b32_e32 v2, -1, v2 -; GFX8-NEXT: v_mov_b32_e32 v4, 8 -; GFX8-NEXT: v_mov_b32_e32 v5, 16 ; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_lshlrev_b32_sdwa v6, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 24, v0 +; GFX8-NEXT: v_lshlrev_b32_sdwa v7, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 24, v5 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v7 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v5 +; GFX8-NEXT: v_and_b32_e32 v0, v0, v2 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v4 ; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX8-NEXT: v_lshrrev_b32_e32 v7, 24, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v2, 24, v0 ; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 24, v7 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v3 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX8-NEXT: v_and_b32_e32 v0, v0, v2 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v6 -; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX8-NEXT: v_lshrrev_b32_e32 v1, 24, v0 -; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX8-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 24, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 24, v2 ; GFX8-NEXT: v_or_b32_e32 v3, v0, v3 ; GFX8-NEXT: v_mov_b32_e32 v0, 0 ; GFX8-NEXT: v_mov_b32_e32 v1, 0 @@ -1430,32 +1420,30 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: global_load_dword v0, v[0:1], off ; GFX10-NEXT: v_and_b32_e32 v1, 3, v2 -; GFX10-NEXT: s_mov_b32 s0, 8 -; GFX10-NEXT: s_movk_i32 s1, 0xff +; GFX10-NEXT: v_mov_b32_e32 v2, 8 +; GFX10-NEXT: v_mov_b32_e32 v3, 16 +; GFX10-NEXT: s_movk_i32 s0, 0xff ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 3, v1 -; GFX10-NEXT: v_lshlrev_b32_e64 v4, v1, s1 +; GFX10-NEXT: s_and_b32 s1, s2, s0 +; GFX10-NEXT: v_lshlrev_b32_e64 v6, v1, s0 +; GFX10-NEXT: v_lshlrev_b32_e64 v1, v1, s1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_sdwa v2, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX10-NEXT: v_lshrrev_b32_e32 v3, 24, v0 -; GFX10-NEXT: s_mov_b32 s0, 16 -; GFX10-NEXT: v_lshlrev_b32_sdwa v5, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_and_or_b32 v0, v0, s1, v2 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 24, v3 -; GFX10-NEXT: s_and_b32 s0, s2, s1 -; GFX10-NEXT: v_xor_b32_e32 v3, -1, v4 -; GFX10-NEXT: v_lshlrev_b32_e64 v1, v1, s0 -; GFX10-NEXT: v_or3_b32 v0, v0, v5, v2 -; GFX10-NEXT: v_mov_b32_e32 v2, 8 -; GFX10-NEXT: v_and_or_b32 
v0, v0, v3, v1 -; GFX10-NEXT: v_mov_b32_e32 v1, 16 -; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX10-NEXT: v_lshrrev_b32_e32 v3, 24, v0 -; GFX10-NEXT: v_lshlrev_b32_sdwa v4, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_and_or_b32 v2, v0, s1, v2 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GFX10-NEXT: v_lshlrev_b32_sdwa v4, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX10-NEXT: v_lshrrev_b32_e32 v5, 24, v0 +; GFX10-NEXT: v_lshlrev_b32_sdwa v7, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX10-NEXT: v_and_or_b32 v0, v0, s0, v4 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 24, v5 +; GFX10-NEXT: v_xor_b32_e32 v5, -1, v6 +; GFX10-NEXT: v_or3_b32 v0, v0, v7, v4 +; GFX10-NEXT: v_and_or_b32 v0, v0, v5, v1 +; GFX10-NEXT: v_lshlrev_b32_sdwa v1, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX10-NEXT: v_lshrrev_b32_e32 v2, 24, v0 +; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX10-NEXT: v_and_or_b32 v4, v0, s0, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 24, v2 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: v_or3_b32 v2, v2, v4, v3 +; GFX10-NEXT: v_or3_b32 v2, v4, v3, v2 ; GFX10-NEXT: global_store_dword v[0:1], v2, off ; GFX10-NEXT: s_endpgm %vec = load <4 x i8>, <4 x i8> addrspace(1)* %ptr @@ -1468,28 +1456,26 @@ ; GFX9-LABEL: insertelement_v_v4i8_v_s: ; GFX9: ; %bb.0: ; GFX9-NEXT: global_load_dword v0, v[0:1], off -; GFX9-NEXT: s_mov_b32 s0, 8 -; GFX9-NEXT: s_and_b32 s2, s2, 3 -; GFX9-NEXT: s_mov_b32 s1, 16 -; GFX9-NEXT: s_movk_i32 s3, 0xff -; GFX9-NEXT: s_lshl_b32 s2, s2, 3 -; GFX9-NEXT: v_lshlrev_b32_sdwa v2, s2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: s_lshl_b32 s2, s3, s2 -; GFX9-NEXT: s_not_b32 s2, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, 8 +; GFX9-NEXT: s_and_b32 s1, s2, 3 +; GFX9-NEXT: s_movk_i32 s0, 0xff ; GFX9-NEXT: v_mov_b32_e32 v3, 16 +; GFX9-NEXT: s_lshl_b32 s1, s1, 3 +; GFX9-NEXT: v_lshlrev_b32_sdwa v2, s1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_lshl_b32 s1, s0, s1 +; GFX9-NEXT: s_not_b32 s1, s1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshrrev_b32_e32 v4, 24, v0 -; GFX9-NEXT: v_lshlrev_b32_sdwa v5, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v6, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_and_or_b32 v0, v0, s3, v5 +; GFX9-NEXT: v_lshlrev_b32_sdwa v5, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v6, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_and_or_b32 v0, v0, s0, v5 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 24, v4 ; GFX9-NEXT: v_or3_b32 v0, v0, v6, v4 -; GFX9-NEXT: v_and_or_b32 v0, v0, s2, v2 +; GFX9-NEXT: v_and_or_b32 v0, v0, s1, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 24, v0 ; GFX9-NEXT: v_lshlrev_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX9-NEXT: v_lshlrev_b32_sdwa v3, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_and_or_b32 v4, v0, s3, v1 +; GFX9-NEXT: v_and_or_b32 v4, v0, s0, v1 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 24, v2 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 @@ 
-1504,28 +1490,26 @@ ; GFX8-NEXT: v_mov_b32_e32 v1, 8 ; GFX8-NEXT: s_lshl_b32 s1, s1, 3 ; GFX8-NEXT: v_mov_b32_e32 v3, 16 -; GFX8-NEXT: v_mov_b32_e32 v6, s1 +; GFX8-NEXT: v_mov_b32_e32 v4, s1 ; GFX8-NEXT: s_movk_i32 s0, 0xff -; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX8-NEXT: s_lshl_b32 s0, s0, s1 ; GFX8-NEXT: s_not_b32 s0, s0 -; GFX8-NEXT: v_mov_b32_e32 v4, 8 -; GFX8-NEXT: v_mov_b32_e32 v5, 16 ; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_lshlrev_b32_sdwa v5, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX8-NEXT: v_lshrrev_b32_e32 v4, 24, v0 +; GFX8-NEXT: v_lshlrev_b32_sdwa v6, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v6 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v4 +; GFX8-NEXT: v_and_b32_e32 v0, s0, v0 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 24, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v2, 24, v0 ; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 24, v6 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v3 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX8-NEXT: v_and_b32_e32 v0, s0, v0 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX8-NEXT: v_lshrrev_b32_e32 v1, 24, v0 -; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX8-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 24, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 24, v2 ; GFX8-NEXT: v_or_b32_e32 v3, v0, v3 ; GFX8-NEXT: v_mov_b32_e32 v0, 0 ; GFX8-NEXT: v_mov_b32_e32 v1, 0 @@ -1576,32 +1560,30 @@ ; GFX10-LABEL: insertelement_v_v4i8_v_s: ; GFX10: ; %bb.0: ; GFX10-NEXT: global_load_dword v0, v[0:1], off -; GFX10-NEXT: s_mov_b32 s0, 8 -; GFX10-NEXT: s_mov_b32 s1, 16 -; GFX10-NEXT: s_and_b32 s2, s2, 3 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX10-NEXT: v_lshrrev_b32_e32 v3, 24, v0 +; GFX10-NEXT: v_mov_b32_e32 v1, 8 +; GFX10-NEXT: v_mov_b32_e32 v3, 16 ; GFX10-NEXT: s_movk_i32 s0, 0xff -; GFX10-NEXT: v_lshlrev_b32_sdwa v4, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: s_lshl_b32 s1, s2, 3 -; GFX10-NEXT: v_and_or_b32 v0, v0, s0, v1 -; GFX10-NEXT: v_lshlrev_b32_e32 v1, 24, v3 +; GFX10-NEXT: s_and_b32 s1, s2, 3 +; GFX10-NEXT: s_lshl_b32 s1, s1, 3 ; GFX10-NEXT: v_lshlrev_b32_sdwa v2, s1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX10-NEXT: s_lshl_b32 s1, s0, s1 ; GFX10-NEXT: s_not_b32 s1, s1 -; GFX10-NEXT: v_or3_b32 v0, v0, v4, v1 -; GFX10-NEXT: v_mov_b32_e32 v1, 8 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_lshlrev_b32_sdwa v4, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:DWORD src1_sel:BYTE_1 +; GFX10-NEXT: v_lshrrev_b32_e32 v5, 24, v0 +; GFX10-NEXT: v_lshlrev_b32_sdwa v6, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX10-NEXT: v_and_or_b32 v0, v0, s0, v4 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 24, v5 +; GFX10-NEXT: v_or3_b32 v0, v0, v6, v4 ; GFX10-NEXT: v_and_or_b32 v0, v0, s1, v2 -; GFX10-NEXT: v_mov_b32_e32 v2, 16 ; GFX10-NEXT: v_lshlrev_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX10-NEXT: v_lshrrev_b32_e32 v3, 24, v0 -; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX10-NEXT: v_lshrrev_b32_e32 v2, 24, v0 +; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_and_or_b32 v4, v0, s0, v1 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 24, v2 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: v_or3_b32 v2, v4, v2, v3 +; GFX10-NEXT: v_or3_b32 v2, v4, v3, v2 ; GFX10-NEXT: global_store_dword v[0:1], v2, off ; GFX10-NEXT: s_endpgm %vec = load <4 x i8>, <4 x i8> addrspace(1)* %ptr @@ -1614,22 +1596,19 @@ ; GFX9-LABEL: insertelement_v_v4i8_v_v: ; GFX9: ; %bb.0: ; GFX9-NEXT: global_load_dword v0, v[0:1], off -; GFX9-NEXT: s_mov_b32 s0, 8 +; GFX9-NEXT: v_mov_b32_e32 v4, 8 ; GFX9-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX9-NEXT: s_mov_b32 s1, 16 -; GFX9-NEXT: s_movk_i32 s2, 0xff +; GFX9-NEXT: v_mov_b32_e32 v1, 0xff +; GFX9-NEXT: v_mov_b32_e32 v5, 16 ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX9-NEXT: v_lshlrev_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_lshlrev_b32_e64 v3, v3, s2 +; GFX9-NEXT: v_lshlrev_b32_e32 v3, v3, v1 ; GFX9-NEXT: v_xor_b32_e32 v3, -1, v3 -; GFX9-NEXT: v_mov_b32_e32 v4, 8 -; GFX9-NEXT: v_mov_b32_e32 v1, 0xff -; GFX9-NEXT: v_mov_b32_e32 v5, 16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshrrev_b32_e32 v6, 24, v0 -; GFX9-NEXT: v_lshlrev_b32_sdwa v7, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v8, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_and_or_b32 v0, v0, s2, v7 +; GFX9-NEXT: v_lshlrev_b32_sdwa v7, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v8, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_and_or_b32 v0, v0, v1, v7 ; GFX9-NEXT: v_lshlrev_b32_e32 v6, 24, v6 ; GFX9-NEXT: v_or3_b32 v0, v0, v8, v6 ; GFX9-NEXT: v_and_or_b32 v0, v0, v3, v2 @@ -1655,21 +1634,19 @@ ; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, v3, v1 ; GFX8-NEXT: v_xor_b32_e32 v1, -1, v1 -; GFX8-NEXT: v_mov_b32_e32 v6, 8 -; GFX8-NEXT: v_mov_b32_e32 v7, 16 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_sdwa v4, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX8-NEXT: v_lshlrev_b32_sdwa v6, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 24, v0 -; GFX8-NEXT: v_lshlrev_b32_sdwa v5, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX8-NEXT: v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_sdwa v7, v5, v0 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v5 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v7 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v3 ; GFX8-NEXT: v_and_b32_e32 v0, v0, v1 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 24, v0 -; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 24, v1 ; GFX8-NEXT: v_or_b32_e32 v3, v0, v3 @@ -1685,18 +1662,18 @@ ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b64 s[0:1], 0 ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 -; GFX7-NEXT: s_movk_i32 s2, 0xff -; GFX7-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX7-NEXT: v_mov_b32_e32 v1, 0xff -; GFX7-NEXT: v_and_b32_e32 v2, s2, v2 +; GFX7-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX7-NEXT: v_and_b32_e32 v2, v2, v1 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, v3, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, v3, v1 ; GFX7-NEXT: v_xor_b32_e32 v3, -1, v3 +; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_bfe_u32 v6, v0, 8, 8 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 24, v0 -; GFX7-NEXT: v_and_b32_e32 v5, s2, v0 +; GFX7-NEXT: v_and_b32_e32 v5, v0, v1 ; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 8 ; GFX7-NEXT: v_lshlrev_b32_e32 v6, 8, v6 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 @@ -1716,7 +1693,6 @@ ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 24, v2 ; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm ; @@ -1724,31 +1700,29 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: global_load_dword v0, v[0:1], off ; GFX10-NEXT: v_and_b32_e32 v1, 3, v3 -; GFX10-NEXT: s_mov_b32 s0, 8 -; GFX10-NEXT: s_movk_i32 s1, 0xff +; GFX10-NEXT: v_mov_b32_e32 v3, 8 +; GFX10-NEXT: v_mov_b32_e32 v4, 0xff +; GFX10-NEXT: v_mov_b32_e32 v5, 16 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 3, v1 -; GFX10-NEXT: v_lshlrev_b32_e64 v5, v1, s1 +; GFX10-NEXT: v_lshlrev_b32_e32 v8, v1, v4 ; GFX10-NEXT: v_lshlrev_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX10-NEXT: v_xor_b32_e32 v2, -1, v5 +; GFX10-NEXT: v_xor_b32_e32 v2, -1, v8 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_sdwa v3, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX10-NEXT: v_lshrrev_b32_e32 v4, 24, v0 -; GFX10-NEXT: s_mov_b32 s0, 16 -; GFX10-NEXT: v_lshlrev_b32_sdwa v6, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_and_or_b32 v0, v0, s1, v3 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 24, v4 -; GFX10-NEXT: v_or3_b32 v0, v0, v6, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, 8 +; GFX10-NEXT: v_lshlrev_b32_sdwa v6, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX10-NEXT: v_lshrrev_b32_e32 v7, 24, v0 +; GFX10-NEXT: v_lshlrev_b32_sdwa v9, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD 
src1_sel:BYTE_2 +; GFX10-NEXT: v_and_or_b32 v0, v0, v4, v6 +; GFX10-NEXT: v_lshlrev_b32_e32 v6, 24, v7 +; GFX10-NEXT: v_or3_b32 v0, v0, v9, v6 ; GFX10-NEXT: v_and_or_b32 v0, v0, v2, v1 -; GFX10-NEXT: v_mov_b32_e32 v1, 16 -; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX10-NEXT: v_lshrrev_b32_e32 v3, 24, v0 -; GFX10-NEXT: v_lshlrev_b32_sdwa v4, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_and_or_b32 v2, 0xff, v0, v2 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GFX10-NEXT: v_lshlrev_b32_sdwa v1, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX10-NEXT: v_lshrrev_b32_e32 v2, 24, v0 +; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX10-NEXT: v_and_or_b32 v4, v0, v4, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 24, v2 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: v_or3_b32 v2, v2, v4, v3 +; GFX10-NEXT: v_or3_b32 v2, v4, v3, v2 ; GFX10-NEXT: global_store_dword v[0:1], v2, off ; GFX10-NEXT: s_endpgm %vec = load <4 x i8>, <4 x i8> addrspace(1)* %ptr @@ -2040,29 +2014,27 @@ ; GFX9-LABEL: insertelement_v_v8i8_s_s: ; GFX9: ; %bb.0: ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off -; GFX9-NEXT: s_mov_b32 s0, 8 -; GFX9-NEXT: s_mov_b32 s1, 16 +; GFX9-NEXT: v_mov_b32_e32 v4, 8 ; GFX9-NEXT: s_movk_i32 s4, 0xff -; GFX9-NEXT: s_lshr_b32 s5, s3, 2 -; GFX9-NEXT: s_and_b32 s3, s3, 3 +; GFX9-NEXT: v_mov_b32_e32 v5, 16 +; GFX9-NEXT: s_and_b32 s1, s3, 3 +; GFX9-NEXT: s_lshr_b32 s0, s3, 2 ; GFX9-NEXT: s_and_b32 s2, s2, s4 -; GFX9-NEXT: s_lshl_b32 s3, s3, 3 -; GFX9-NEXT: s_lshl_b32 s2, s2, s3 -; GFX9-NEXT: s_lshl_b32 s3, s4, s3 -; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s5, 1 -; GFX9-NEXT: s_not_b32 s3, s3 +; GFX9-NEXT: s_lshl_b32 s1, s1, 3 +; GFX9-NEXT: s_lshl_b32 s2, s2, s1 +; GFX9-NEXT: s_lshl_b32 s1, s4, s1 +; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s0, 1 +; GFX9-NEXT: s_not_b32 s1, s1 ; GFX9-NEXT: v_mov_b32_e32 v6, s2 -; GFX9-NEXT: v_mov_b32_e32 v4, 8 -; GFX9-NEXT: v_mov_b32_e32 v5, 16 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v8, 24, v1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v9, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v11, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v10, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_lshlrev_b32_sdwa v12, s1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_lshlrev_b32_sdwa v9, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v11, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v10, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_lshlrev_b32_sdwa v12, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX9-NEXT: v_and_or_b32 v0, v0, s4, v9 ; GFX9-NEXT: v_lshlrev_b32_e32 v7, 24, v7 ; GFX9-NEXT: v_and_or_b32 v1, v1, s4, v11 @@ -2070,8 +2042,8 @@ ; GFX9-NEXT: v_or3_b32 v0, v0, v10, v7 ; GFX9-NEXT: v_or3_b32 v1, v1, v12, v8 ; GFX9-NEXT: v_cndmask_b32_e32 v7, v0, v1, vcc -; GFX9-NEXT: v_and_or_b32 v6, v7, s3, v6 -; GFX9-NEXT: 
v_cmp_eq_u32_e64 s[0:1], s5, 0 +; GFX9-NEXT: v_and_or_b32 v6, v7, s1, v6 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s0, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v6, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v6, 24, v0 @@ -2103,45 +2075,43 @@ ; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s1, 1 ; GFX8-NEXT: s_not_b32 s0, s0 ; GFX8-NEXT: s_lshl_b32 s2, s2, s3 -; GFX8-NEXT: v_mov_b32_e32 v6, 8 -; GFX8-NEXT: v_mov_b32_e32 v7, 16 ; GFX8-NEXT: v_mov_b32_e32 v2, 0 ; GFX8-NEXT: v_mov_b32_e32 v3, 0 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_sdwa v10, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX8-NEXT: v_lshlrev_b32_sdwa v8, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX8-NEXT: v_lshlrev_b32_sdwa v10, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 24, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v7, 24, v1 +; GFX8-NEXT: v_lshlrev_b32_sdwa v9, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX8-NEXT: v_lshlrev_b32_sdwa v11, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v1, v1, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_e32 v6, 24, v6 +; GFX8-NEXT: v_lshlrev_b32_e32 v7, 24, v7 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v9 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v11 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v6 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v7 +; GFX8-NEXT: v_cndmask_b32_e32 v6, v0, v1, vcc +; GFX8-NEXT: v_and_b32_e32 v6, s0, v6 +; GFX8-NEXT: v_or_b32_e32 v6, s2, v6 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s1, 0 +; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v6, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc +; GFX8-NEXT: v_lshlrev_b32_sdwa v8, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX8-NEXT: v_lshlrev_b32_sdwa v4, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX8-NEXT: v_lshrrev_b32_e32 v8, 24, v0 -; GFX8-NEXT: v_lshrrev_b32_e32 v9, 24, v1 -; GFX8-NEXT: v_lshlrev_b32_sdwa v11, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 24, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v7, 24, v1 +; GFX8-NEXT: v_lshlrev_b32_sdwa v9, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX8-NEXT: v_lshlrev_b32_sdwa v5, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX8-NEXT: v_or_b32_sdwa v0, v0, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_e32 v8, 24, v8 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 24, v9 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v11 -; GFX8-NEXT: v_or_b32_e32 v1, v1, v5 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v8 -; GFX8-NEXT: v_or_b32_e32 v1, v1, v4 -; GFX8-NEXT: v_cndmask_b32_e32 v4, v0, v1, vcc -; GFX8-NEXT: v_and_b32_e32 v4, s0, v4 -; GFX8-NEXT: v_or_b32_e32 v4, s2, v4 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s1, 0 -; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[0:1] -; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc -; GFX8-NEXT: v_lshlrev_b32_sdwa v8, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX8-NEXT: v_lshlrev_b32_sdwa v6, v6, v1 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX8-NEXT: v_lshrrev_b32_e32 v4, 24, v0 -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 24, v1 -; GFX8-NEXT: v_lshlrev_b32_sdwa v9, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX8-NEXT: v_lshlrev_b32_sdwa v7, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v1, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 24, v5 +; GFX8-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_e32 v6, 24, v6 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 24, v7 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v9 -; GFX8-NEXT: v_or_b32_e32 v1, v1, v7 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v4 ; GFX8-NEXT: v_or_b32_e32 v1, v1, v5 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v6 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v4 ; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX8-NEXT: s_endpgm ; @@ -2214,51 +2184,49 @@ ; GFX10-LABEL: insertelement_v_v8i8_s_s: ; GFX10: ; %bb.0: ; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off -; GFX10-NEXT: s_mov_b32 s0, 8 -; GFX10-NEXT: s_mov_b32 s1, 16 -; GFX10-NEXT: s_movk_i32 s4, 0xff -; GFX10-NEXT: s_and_b32 s2, s2, s4 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v2, 24, v0 -; GFX10-NEXT: v_lshlrev_b32_sdwa v3, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX10-NEXT: v_lshrrev_b32_e32 v4, 24, v1 -; GFX10-NEXT: v_lshlrev_b32_sdwa v5, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX10-NEXT: v_lshlrev_b32_sdwa v6, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_lshlrev_b32_sdwa v7, s1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_and_or_b32 v0, v0, s4, v3 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 24, v2 -; GFX10-NEXT: v_and_or_b32 v1, v1, s4, v5 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 24, v4 +; GFX10-NEXT: v_mov_b32_e32 v2, 8 +; GFX10-NEXT: v_mov_b32_e32 v3, 16 +; GFX10-NEXT: s_movk_i32 s1, 0xff ; GFX10-NEXT: s_lshr_b32 s0, s3, 2 -; GFX10-NEXT: s_and_b32 s1, s3, 3 -; GFX10-NEXT: v_or3_b32 v0, v0, v6, v2 +; GFX10-NEXT: s_and_b32 s3, s3, 3 ; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s0, 1 -; GFX10-NEXT: v_or3_b32 v1, v1, v7, v3 -; GFX10-NEXT: s_lshl_b32 s1, s1, 3 +; GFX10-NEXT: s_lshl_b32 s3, s3, 3 +; GFX10-NEXT: s_and_b32 s2, s2, s1 +; GFX10-NEXT: s_lshl_b32 s4, s1, s3 +; GFX10-NEXT: s_lshl_b32 s2, s2, s3 +; GFX10-NEXT: s_not_b32 s3, s4 ; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s0, 0 -; GFX10-NEXT: s_lshl_b32 s3, s4, s1 -; GFX10-NEXT: s_lshl_b32 s1, s2, s1 -; GFX10-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc_lo -; GFX10-NEXT: s_not_b32 s2, s3 -; GFX10-NEXT: v_mov_b32_e32 v3, 8 -; GFX10-NEXT: v_and_or_b32 v2, v2, s2, s1 -; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v2, s0 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo -; GFX10-NEXT: v_mov_b32_e32 v2, 16 +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v4, 24, v0 -; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX10-NEXT: v_lshrrev_b32_e32 v6, 24, v1 -; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:DWORD src1_sel:BYTE_1 -; GFX10-NEXT: v_lshlrev_b32_sdwa v7, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_lshlrev_b32_sdwa v8, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_and_or_b32 v2, v0, s4, v5 +; GFX10-NEXT: v_lshlrev_b32_sdwa v7, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX10-NEXT: v_lshlrev_b32_sdwa v8, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX10-NEXT: v_lshlrev_b32_sdwa v9, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX10-NEXT: v_and_or_b32 v0, v0, s1, v5 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; GFX10-NEXT: v_and_or_b32 v3, v1, s4, v3 +; GFX10-NEXT: v_and_or_b32 v1, v1, s1, v7 ; GFX10-NEXT: v_lshlrev_b32_e32 v5, 24, v6 +; GFX10-NEXT: v_or3_b32 v0, v0, v8, v4 +; GFX10-NEXT: v_or3_b32 v1, v1, v9, v5 +; GFX10-NEXT: v_cndmask_b32_e32 v4, v0, v1, vcc_lo +; GFX10-NEXT: v_and_or_b32 v4, v4, s3, s2 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v4, s0 +; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo +; GFX10-NEXT: v_lshrrev_b32_e32 v4, 24, v0 +; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX10-NEXT: v_lshrrev_b32_e32 v6, 24, v1 +; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX10-NEXT: v_lshlrev_b32_sdwa v7, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX10-NEXT: v_and_or_b32 v5, v0, s1, v5 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; GFX10-NEXT: v_and_or_b32 v8, v1, s1, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v6, 24, v6 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: v_or3_b32 v2, v2, v7, v4 -; GFX10-NEXT: v_or3_b32 v3, v3, v8, v5 +; GFX10-NEXT: v_or3_b32 v2, v5, v7, v4 +; GFX10-NEXT: v_or3_b32 v3, v8, v3, v6 ; GFX10-NEXT: global_store_dwordx2 v[0:1], v[2:3], off ; GFX10-NEXT: s_endpgm %vec = load <8 x i8>, <8 x i8> addrspace(1 )* %ptr @@ -3066,28 +3034,27 @@ ; GFX9-LABEL: insertelement_v_v8i8_s_v: ; GFX9: ; %bb.0: ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off -; GFX9-NEXT: s_mov_b32 s0, 8 -; GFX9-NEXT: s_mov_b32 s1, 16 +; GFX9-NEXT: v_mov_b32_e32 v5, 8 ; GFX9-NEXT: s_movk_i32 s3, 0xff +; GFX9-NEXT: v_mov_b32_e32 v6, 16 ; GFX9-NEXT: v_lshrrev_b32_e32 v7, 2, v2 ; GFX9-NEXT: v_and_b32_e32 v2, 3, v2 -; GFX9-NEXT: s_and_b32 s2, s2, s3 +; GFX9-NEXT: s_and_b32 s0, s2, s3 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v2 -; GFX9-NEXT: v_lshlrev_b32_e64 v8, v2, s2 +; GFX9-NEXT: v_lshlrev_b32_e64 v8, v2, s0 ; GFX9-NEXT: v_lshlrev_b32_e64 v2, v2, s3 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7 ; GFX9-NEXT: v_xor_b32_e32 v2, -1, v2 -; GFX9-NEXT: v_mov_b32_e32 v5, 8 -; GFX9-NEXT: v_mov_b32_e32 v6, 16 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v7 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshrrev_b32_e32 v9, 24, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v10, 24, v1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v11, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v13, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v12, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_lshlrev_b32_sdwa v14, 
s1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_lshlrev_b32_sdwa v11, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v13, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v12, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_lshlrev_b32_sdwa v14, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX9-NEXT: v_and_or_b32 v0, v0, s3, v11 ; GFX9-NEXT: v_lshlrev_b32_e32 v9, 24, v9 ; GFX9-NEXT: v_and_or_b32 v1, v1, s3, v13 @@ -3096,7 +3063,6 @@ ; GFX9-NEXT: v_or3_b32 v1, v1, v14, v10 ; GFX9-NEXT: v_cndmask_b32_e32 v9, v0, v1, vcc ; GFX9-NEXT: v_and_or_b32 v2, v9, v2, v8 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v7 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 24, v0 @@ -3120,51 +3086,49 @@ ; GFX8-NEXT: v_mov_b32_e32 v5, 8 ; GFX8-NEXT: v_mov_b32_e32 v6, 16 ; GFX8-NEXT: s_movk_i32 s0, 0xff -; GFX8-NEXT: v_lshrrev_b32_e32 v9, 2, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v7, 2, v2 ; GFX8-NEXT: v_and_b32_e32 v2, 3, v2 ; GFX8-NEXT: s_and_b32 s1, s2, s0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v2 -; GFX8-NEXT: v_lshlrev_b32_e64 v10, v2, s1 +; GFX8-NEXT: v_lshlrev_b32_e64 v8, v2, s1 ; GFX8-NEXT: v_lshlrev_b32_e64 v2, v2, s0 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v9 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7 ; GFX8-NEXT: v_xor_b32_e32 v2, -1, v2 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v9 -; GFX8-NEXT: v_mov_b32_e32 v7, 8 -; GFX8-NEXT: v_mov_b32_e32 v8, 16 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v7 ; GFX8-NEXT: v_mov_b32_e32 v3, 0 ; GFX8-NEXT: v_mov_b32_e32 v4, 0 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_sdwa v13, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX8-NEXT: v_lshlrev_b32_sdwa v5, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX8-NEXT: v_lshrrev_b32_e32 v11, 24, v0 -; GFX8-NEXT: v_lshrrev_b32_e32 v12, 24, v1 -; GFX8-NEXT: v_lshlrev_b32_sdwa v14, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX8-NEXT: v_lshlrev_b32_sdwa v6, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX8-NEXT: v_or_b32_sdwa v0, v0, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_e32 v11, 24, v11 -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 24, v12 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v14 -; GFX8-NEXT: v_or_b32_e32 v1, v1, v6 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v11 -; GFX8-NEXT: v_or_b32_e32 v1, v1, v5 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v0, v1, vcc -; GFX8-NEXT: v_and_b32_e32 v2, v5, v2 -; GFX8-NEXT: v_or_b32_e32 v2, v2, v10 +; GFX8-NEXT: v_lshlrev_b32_sdwa v11, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX8-NEXT: v_lshlrev_b32_sdwa v13, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX8-NEXT: v_lshrrev_b32_e32 v9, 24, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v10, 24, v1 +; GFX8-NEXT: v_lshlrev_b32_sdwa v12, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX8-NEXT: v_lshlrev_b32_sdwa v14, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 
src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v1, v1, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_e32 v9, 24, v9 +; GFX8-NEXT: v_lshlrev_b32_e32 v10, 24, v10 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v12 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v14 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v9 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v10 +; GFX8-NEXT: v_cndmask_b32_e32 v9, v0, v1, vcc +; GFX8-NEXT: v_and_b32_e32 v2, v9, v2 +; GFX8-NEXT: v_or_b32_e32 v2, v2, v8 ; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GFX8-NEXT: v_lshlrev_b32_sdwa v6, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX8-NEXT: v_lshlrev_b32_sdwa v7, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX8-NEXT: v_lshlrev_b32_sdwa v8, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX8-NEXT: v_lshlrev_b32_sdwa v5, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 24, v0 -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 24, v1 -; GFX8-NEXT: v_lshlrev_b32_sdwa v9, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX8-NEXT: v_lshlrev_b32_sdwa v8, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX8-NEXT: v_or_b32_sdwa v0, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v1, v1, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_lshrrev_b32_e32 v7, 24, v1 +; GFX8-NEXT: v_lshlrev_b32_sdwa v9, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX8-NEXT: v_lshlrev_b32_sdwa v6, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 24, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 24, v5 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 24, v7 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v9 -; GFX8-NEXT: v_or_b32_e32 v1, v1, v8 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v6 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX8-NEXT: v_or_b32_e32 v1, v1, v5 ; GFX8-NEXT: flat_store_dwordx2 v[3:4], v[0:1] @@ -3239,51 +3203,49 @@ ; GFX10-LABEL: insertelement_v_v8i8_s_v: ; GFX10: ; %bb.0: ; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off -; GFX10-NEXT: s_mov_b32 s0, 8 +; GFX10-NEXT: v_mov_b32_e32 v4, 8 ; GFX10-NEXT: v_and_b32_e32 v3, 3, v2 -; GFX10-NEXT: s_mov_b32 s1, 16 -; GFX10-NEXT: s_movk_i32 s3, 0xff +; GFX10-NEXT: v_mov_b32_e32 v5, 16 +; GFX10-NEXT: s_movk_i32 s1, 0xff ; GFX10-NEXT: v_lshrrev_b32_e32 v2, 2, v2 +; GFX10-NEXT: s_and_b32 s0, s2, s1 ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v4, 24, v0 -; GFX10-NEXT: v_lshlrev_b32_sdwa v5, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX10-NEXT: v_lshrrev_b32_e32 v6, 24, v1 -; GFX10-NEXT: v_lshlrev_b32_sdwa v7, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX10-NEXT: v_lshlrev_b32_sdwa v8, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_lshlrev_b32_sdwa v9, s1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_and_or_b32 v0, v0, s3, v5 -; 
GFX10-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; GFX10-NEXT: v_and_or_b32 v1, v1, s3, v7 -; GFX10-NEXT: v_lshlrev_b32_e32 v5, 24, v6 -; GFX10-NEXT: v_lshlrev_b32_e64 v6, v3, s3 -; GFX10-NEXT: s_and_b32 s0, s2, s3 -; GFX10-NEXT: v_or3_b32 v0, v0, v8, v4 +; GFX10-NEXT: v_lshrrev_b32_e32 v6, 24, v0 +; GFX10-NEXT: v_lshlrev_b32_sdwa v7, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX10-NEXT: v_lshrrev_b32_e32 v8, 24, v1 +; GFX10-NEXT: v_lshlrev_b32_sdwa v9, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX10-NEXT: v_lshlrev_b32_sdwa v10, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX10-NEXT: v_lshlrev_b32_sdwa v11, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX10-NEXT: v_and_or_b32 v0, v0, s1, v7 +; GFX10-NEXT: v_lshlrev_b32_e32 v6, 24, v6 +; GFX10-NEXT: v_and_or_b32 v1, v1, s1, v9 +; GFX10-NEXT: v_lshlrev_b32_e32 v7, 24, v8 +; GFX10-NEXT: v_lshlrev_b32_e64 v8, v3, s1 ; GFX10-NEXT: v_lshlrev_b32_e64 v3, v3, s0 -; GFX10-NEXT: v_or3_b32 v1, v1, v9, v5 -; GFX10-NEXT: v_xor_b32_e32 v4, -1, v6 +; GFX10-NEXT: v_or3_b32 v0, v0, v10, v6 ; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 0, v2 -; GFX10-NEXT: v_mov_b32_e32 v2, 8 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v0, v1, vcc_lo -; GFX10-NEXT: v_and_or_b32 v3, v5, v4, v3 +; GFX10-NEXT: v_or3_b32 v1, v1, v11, v7 +; GFX10-NEXT: v_xor_b32_e32 v6, -1, v8 +; GFX10-NEXT: v_cndmask_b32_e32 v7, v0, v1, vcc_lo +; GFX10-NEXT: v_and_or_b32 v3, v7, v6, v3 ; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v3, s0 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo -; GFX10-NEXT: v_mov_b32_e32 v3, 16 -; GFX10-NEXT: v_lshrrev_b32_e32 v4, 24, v0 -; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX10-NEXT: v_lshrrev_b32_e32 v2, 24, v0 +; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX10-NEXT: v_lshrrev_b32_e32 v6, 24, v1 -; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX10-NEXT: v_lshlrev_b32_sdwa v7, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_and_or_b32 v5, v0, s3, v5 -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; GFX10-NEXT: v_and_or_b32 v8, v1, s3, v2 +; GFX10-NEXT: v_lshlrev_b32_sdwa v4, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX10-NEXT: v_lshlrev_b32_sdwa v7, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX10-NEXT: v_and_or_b32 v3, v0, s1, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 24, v2 +; GFX10-NEXT: v_and_or_b32 v4, v1, s1, v4 ; GFX10-NEXT: v_lshlrev_b32_e32 v6, 24, v6 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: v_or3_b32 v2, v5, v7, v4 -; GFX10-NEXT: v_or3_b32 v3, v8, v3, v6 +; GFX10-NEXT: v_or3_b32 v2, v3, v7, v2 +; GFX10-NEXT: v_or3_b32 v3, v4, v5, v6 ; GFX10-NEXT: global_store_dwordx2 v[0:1], v[2:3], off ; GFX10-NEXT: s_endpgm %vec = load <8 x i8>, <8 x i8> addrspace(1)* %ptr @@ -3296,27 +3258,25 @@ ; GFX9-LABEL: insertelement_v_v8i8_v_s: ; GFX9: ; %bb.0: ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off -; GFX9-NEXT: s_mov_b32 s0, 8 -; GFX9-NEXT: s_mov_b32 s1, 16 -; GFX9-NEXT: s_movk_i32 
s3, 0xff -; GFX9-NEXT: s_lshr_b32 s4, s2, 2 -; GFX9-NEXT: s_and_b32 s2, s2, 3 -; GFX9-NEXT: s_lshl_b32 s2, s2, 3 -; GFX9-NEXT: v_lshlrev_b32_sdwa v2, s2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: s_lshl_b32 s2, s3, s2 -; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s4, 1 -; GFX9-NEXT: s_not_b32 s2, s2 ; GFX9-NEXT: v_mov_b32_e32 v5, 8 +; GFX9-NEXT: s_movk_i32 s3, 0xff ; GFX9-NEXT: v_mov_b32_e32 v6, 16 +; GFX9-NEXT: s_and_b32 s1, s2, 3 +; GFX9-NEXT: s_lshr_b32 s0, s2, 2 +; GFX9-NEXT: s_lshl_b32 s1, s1, 3 +; GFX9-NEXT: v_lshlrev_b32_sdwa v2, s1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_lshl_b32 s1, s3, s1 +; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s0, 1 +; GFX9-NEXT: s_not_b32 s1, s1 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v8, 24, v1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v9, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v11, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v10, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_lshlrev_b32_sdwa v12, s1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_lshlrev_b32_sdwa v9, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v11, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v10, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_lshlrev_b32_sdwa v12, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX9-NEXT: v_and_or_b32 v0, v0, s3, v9 ; GFX9-NEXT: v_lshlrev_b32_e32 v7, 24, v7 ; GFX9-NEXT: v_and_or_b32 v1, v1, s3, v11 @@ -3324,8 +3284,8 @@ ; GFX9-NEXT: v_or3_b32 v0, v0, v10, v7 ; GFX9-NEXT: v_or3_b32 v1, v1, v12, v8 ; GFX9-NEXT: v_cndmask_b32_e32 v7, v0, v1, vcc -; GFX9-NEXT: v_and_or_b32 v2, v7, s2, v2 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s4, 0 +; GFX9-NEXT: v_and_or_b32 v2, v7, s1, v2 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s0, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 24, v0 @@ -3351,49 +3311,47 @@ ; GFX8-NEXT: v_mov_b32_e32 v5, 8 ; GFX8-NEXT: s_lshl_b32 s2, s2, 3 ; GFX8-NEXT: v_mov_b32_e32 v6, 16 -; GFX8-NEXT: v_mov_b32_e32 v9, s2 -; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_mov_b32_e32 v7, s2 +; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX8-NEXT: s_movk_i32 s0, 0xff ; GFX8-NEXT: s_lshl_b32 s0, s0, s2 ; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s1, 1 ; GFX8-NEXT: s_not_b32 s0, s0 -; GFX8-NEXT: v_mov_b32_e32 v7, 8 -; GFX8-NEXT: v_mov_b32_e32 v8, 16 ; GFX8-NEXT: v_mov_b32_e32 v3, 0 ; GFX8-NEXT: v_mov_b32_e32 v4, 0 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_sdwa v11, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX8-NEXT: v_lshlrev_b32_sdwa v5, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX8-NEXT: v_lshrrev_b32_e32 v9, 24, v0 -; GFX8-NEXT: v_lshrrev_b32_e32 v10, 24, v1 -; GFX8-NEXT: v_lshlrev_b32_sdwa v12, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD 
src1_sel:BYTE_2 -; GFX8-NEXT: v_lshlrev_b32_sdwa v6, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX8-NEXT: v_or_b32_sdwa v0, v0, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_e32 v9, 24, v9 -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 24, v10 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v12 -; GFX8-NEXT: v_or_b32_e32 v1, v1, v6 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v9 -; GFX8-NEXT: v_or_b32_e32 v1, v1, v5 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v0, v1, vcc -; GFX8-NEXT: v_and_b32_e32 v5, s0, v5 -; GFX8-NEXT: v_or_b32_e32 v2, v5, v2 +; GFX8-NEXT: v_lshlrev_b32_sdwa v9, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX8-NEXT: v_lshlrev_b32_sdwa v11, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX8-NEXT: v_lshrrev_b32_e32 v7, 24, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v8, 24, v1 +; GFX8-NEXT: v_lshlrev_b32_sdwa v10, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX8-NEXT: v_lshlrev_b32_sdwa v12, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v1, v1, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_e32 v7, 24, v7 +; GFX8-NEXT: v_lshlrev_b32_e32 v8, 24, v8 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v10 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v12 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v7 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v8 +; GFX8-NEXT: v_cndmask_b32_e32 v7, v0, v1, vcc +; GFX8-NEXT: v_and_b32_e32 v7, s0, v7 +; GFX8-NEXT: v_or_b32_e32 v2, v7, v2 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s1, 0 ; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GFX8-NEXT: v_lshlrev_b32_sdwa v6, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX8-NEXT: v_lshlrev_b32_sdwa v7, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX8-NEXT: v_lshlrev_b32_sdwa v8, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX8-NEXT: v_lshlrev_b32_sdwa v5, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 24, v0 -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 24, v1 -; GFX8-NEXT: v_lshlrev_b32_sdwa v9, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX8-NEXT: v_lshlrev_b32_sdwa v8, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX8-NEXT: v_or_b32_sdwa v0, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v1, v1, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_lshrrev_b32_e32 v7, 24, v1 +; GFX8-NEXT: v_lshlrev_b32_sdwa v9, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX8-NEXT: v_lshlrev_b32_sdwa v6, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 24, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 24, v5 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 24, v7 ; GFX8-NEXT: v_or_b32_e32 v0, 
v0, v9 -; GFX8-NEXT: v_or_b32_e32 v1, v1, v8 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v6 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX8-NEXT: v_or_b32_e32 v1, v1, v5 ; GFX8-NEXT: flat_store_dwordx2 v[3:4], v[0:1] @@ -3468,50 +3426,48 @@ ; GFX10-LABEL: insertelement_v_v8i8_v_s: ; GFX10: ; %bb.0: ; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off -; GFX10-NEXT: s_mov_b32 s0, 8 -; GFX10-NEXT: s_mov_b32 s1, 16 -; GFX10-NEXT: s_movk_i32 s3, 0xff -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v3, 24, v0 -; GFX10-NEXT: v_lshlrev_b32_sdwa v4, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX10-NEXT: v_lshrrev_b32_e32 v5, 24, v1 -; GFX10-NEXT: v_lshlrev_b32_sdwa v6, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX10-NEXT: v_lshlrev_b32_sdwa v7, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_lshlrev_b32_sdwa v8, s1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_and_or_b32 v0, v0, s3, v4 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GFX10-NEXT: v_and_or_b32 v1, v1, s3, v6 -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 24, v5 -; GFX10-NEXT: s_lshr_b32 s1, s2, 2 +; GFX10-NEXT: v_mov_b32_e32 v3, 8 +; GFX10-NEXT: v_mov_b32_e32 v4, 16 +; GFX10-NEXT: s_movk_i32 s1, 0xff ; GFX10-NEXT: s_and_b32 s0, s2, 3 -; GFX10-NEXT: v_or3_b32 v0, v0, v7, v3 -; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s1, 1 -; GFX10-NEXT: v_or3_b32 v1, v1, v8, v4 +; GFX10-NEXT: s_lshr_b32 s2, s2, 2 ; GFX10-NEXT: s_lshl_b32 s0, s0, 3 +; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 1 ; GFX10-NEXT: v_lshlrev_b32_sdwa v2, s0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX10-NEXT: s_lshl_b32 s0, s3, s0 -; GFX10-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc_lo +; GFX10-NEXT: s_lshl_b32 s0, s1, s0 ; GFX10-NEXT: s_not_b32 s0, s0 -; GFX10-NEXT: v_and_or_b32 v2, v3, s0, v2 -; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s1, 0 -; GFX10-NEXT: v_mov_b32_e32 v3, 8 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_lshrrev_b32_e32 v5, 24, v0 +; GFX10-NEXT: v_lshlrev_b32_sdwa v6, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX10-NEXT: v_lshrrev_b32_e32 v7, 24, v1 +; GFX10-NEXT: v_lshlrev_b32_sdwa v8, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX10-NEXT: v_lshlrev_b32_sdwa v9, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX10-NEXT: v_lshlrev_b32_sdwa v10, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX10-NEXT: v_and_or_b32 v0, v0, s1, v6 +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 24, v5 +; GFX10-NEXT: v_and_or_b32 v1, v1, s1, v8 +; GFX10-NEXT: v_lshlrev_b32_e32 v6, 24, v7 +; GFX10-NEXT: v_or3_b32 v0, v0, v9, v5 +; GFX10-NEXT: v_or3_b32 v1, v1, v10, v6 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v0, v1, vcc_lo +; GFX10-NEXT: v_and_or_b32 v2, v5, s0, v2 +; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s2, 0 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v2, s0 -; GFX10-NEXT: v_mov_b32_e32 v2, 16 ; GFX10-NEXT: v_lshrrev_b32_e32 v6, 24, v1 -; GFX10-NEXT: v_lshrrev_b32_e32 v4, 24, v0 +; GFX10-NEXT: v_lshrrev_b32_e32 v2, 24, v0 ; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX10-NEXT: v_lshlrev_b32_sdwa v7, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; 
GFX10-NEXT: v_lshlrev_b32_sdwa v8, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; GFX10-NEXT: v_and_or_b32 v2, v0, s3, v5 -; GFX10-NEXT: v_and_or_b32 v3, v1, s3, v3 -; GFX10-NEXT: v_lshlrev_b32_e32 v5, 24, v6 +; GFX10-NEXT: v_lshlrev_b32_sdwa v7, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX10-NEXT: v_lshlrev_b32_sdwa v4, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 24, v2 +; GFX10-NEXT: v_and_or_b32 v5, v0, s1, v5 +; GFX10-NEXT: v_and_or_b32 v3, v1, s1, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v6, 24, v6 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: v_or3_b32 v2, v2, v7, v4 -; GFX10-NEXT: v_or3_b32 v3, v3, v8, v5 +; GFX10-NEXT: v_or3_b32 v2, v5, v7, v2 +; GFX10-NEXT: v_or3_b32 v3, v3, v4, v6 ; GFX10-NEXT: global_store_dwordx2 v[0:1], v[2:3], off ; GFX10-NEXT: s_endpgm %vec = load <8 x i8>, <8 x i8> addrspace(1)* %ptr @@ -3524,37 +3480,34 @@ ; GFX9-LABEL: insertelement_v_v8i8_v_v: ; GFX9: ; %bb.0: ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off -; GFX9-NEXT: s_mov_b32 s0, 8 -; GFX9-NEXT: s_mov_b32 s1, 16 -; GFX9-NEXT: s_movk_i32 s2, 0xff +; GFX9-NEXT: v_mov_b32_e32 v7, 8 +; GFX9-NEXT: v_mov_b32_e32 v6, 0xff +; GFX9-NEXT: v_mov_b32_e32 v8, 16 ; GFX9-NEXT: v_lshrrev_b32_e32 v9, 2, v3 ; GFX9-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX9-NEXT: v_mov_b32_e32 v6, 0xff ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX9-NEXT: v_lshlrev_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_lshlrev_b32_e32 v3, v3, v6 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v9 ; GFX9-NEXT: v_xor_b32_e32 v3, -1, v3 -; GFX9-NEXT: v_mov_b32_e32 v7, 8 -; GFX9-NEXT: v_mov_b32_e32 v8, 16 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v9 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: v_mov_b32_e32 v5, 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshrrev_b32_e32 v10, 24, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v11, 24, v1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v12, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v14, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v13, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_lshlrev_b32_sdwa v15, s1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_and_or_b32 v0, v0, s2, v12 +; GFX9-NEXT: v_lshlrev_b32_sdwa v12, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v14, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v13, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_lshlrev_b32_sdwa v15, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_and_or_b32 v0, v0, v6, v12 ; GFX9-NEXT: v_lshlrev_b32_e32 v10, 24, v10 -; GFX9-NEXT: v_and_or_b32 v1, v1, s2, v14 +; GFX9-NEXT: v_and_or_b32 v1, v1, v6, v14 ; GFX9-NEXT: v_lshlrev_b32_e32 v11, 24, v11 ; GFX9-NEXT: v_or3_b32 v0, v0, v13, v10 ; GFX9-NEXT: v_or3_b32 v1, v1, v15, v11 ; GFX9-NEXT: v_cndmask_b32_e32 v10, v0, v1, vcc ; GFX9-NEXT: v_and_or_b32 v2, v10, v3, v2 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v9 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc ; GFX9-NEXT: 
v_lshrrev_b32_e32 v2, 24, v0 @@ -3576,52 +3529,50 @@ ; GFX8: ; %bb.0: ; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GFX8-NEXT: v_mov_b32_e32 v7, 8 -; GFX8-NEXT: v_lshrrev_b32_e32 v11, 2, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v9, 2, v3 ; GFX8-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX8-NEXT: v_mov_b32_e32 v6, 0xff ; GFX8-NEXT: v_mov_b32_e32 v8, 16 ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX8-NEXT: v_lshlrev_b32_e32 v3, v3, v6 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v11 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v9 ; GFX8-NEXT: v_xor_b32_e32 v3, -1, v3 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v11 -; GFX8-NEXT: v_mov_b32_e32 v9, 8 -; GFX8-NEXT: v_mov_b32_e32 v10, 16 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v9 ; GFX8-NEXT: v_mov_b32_e32 v4, 0 ; GFX8-NEXT: v_mov_b32_e32 v5, 0 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_sdwa v13, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX8-NEXT: v_lshlrev_b32_sdwa v7, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX8-NEXT: v_lshlrev_b32_sdwa v11, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX8-NEXT: v_lshlrev_b32_sdwa v13, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX8-NEXT: v_lshrrev_b32_e32 v6, 24, v0 -; GFX8-NEXT: v_lshrrev_b32_e32 v12, 24, v1 -; GFX8-NEXT: v_lshlrev_b32_sdwa v14, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX8-NEXT: v_lshlrev_b32_sdwa v8, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX8-NEXT: v_or_b32_sdwa v0, v0, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v1, v1, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_lshrrev_b32_e32 v10, 24, v1 +; GFX8-NEXT: v_lshlrev_b32_sdwa v12, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX8-NEXT: v_lshlrev_b32_sdwa v14, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v1, v1, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX8-NEXT: v_lshlrev_b32_e32 v6, 24, v6 -; GFX8-NEXT: v_lshlrev_b32_e32 v7, 24, v12 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v14 -; GFX8-NEXT: v_or_b32_e32 v1, v1, v8 +; GFX8-NEXT: v_lshlrev_b32_e32 v10, 24, v10 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v12 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v14 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v6 -; GFX8-NEXT: v_or_b32_e32 v1, v1, v7 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v10 ; GFX8-NEXT: v_cndmask_b32_e32 v6, v0, v1, vcc ; GFX8-NEXT: v_and_b32_e32 v3, v6, v3 ; GFX8-NEXT: v_or_b32_e32 v2, v3, v2 ; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GFX8-NEXT: v_lshlrev_b32_sdwa v6, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX8-NEXT: v_lshlrev_b32_sdwa v8, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX8-NEXT: v_lshlrev_b32_sdwa v6, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX8-NEXT: v_lshlrev_b32_sdwa v7, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 24, v0 ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 24, v1 -; GFX8-NEXT: v_lshlrev_b32_sdwa v7, 
v10, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX8-NEXT: v_lshlrev_b32_sdwa v9, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX8-NEXT: v_lshlrev_b32_sdwa v9, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX8-NEXT: v_lshlrev_b32_sdwa v8, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v1, v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v1, v1, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 24, v2 ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v7 -; GFX8-NEXT: v_or_b32_e32 v1, v1, v9 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v9 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v8 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX8-NEXT: v_or_b32_e32 v1, v1, v3 ; GFX8-NEXT: flat_store_dwordx2 v[4:5], v[0:1] @@ -3633,7 +3584,6 @@ ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: s_movk_i32 s0, 0xff ; GFX7-NEXT: v_mov_b32_e32 v4, 0xff ; GFX7-NEXT: v_lshrrev_b32_e32 v5, 2, v3 ; GFX7-NEXT: v_and_b32_e32 v3, 3, v3 @@ -3643,15 +3593,16 @@ ; GFX7-NEXT: v_lshlrev_b32_e32 v3, v3, v4 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v5 ; GFX7-NEXT: v_xor_b32_e32 v3, -1, v3 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v5 ; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_bfe_u32 v9, v0, 8, 8 ; GFX7-NEXT: v_bfe_u32 v11, v1, 8, 8 ; GFX7-NEXT: v_lshrrev_b32_e32 v6, 24, v0 ; GFX7-NEXT: v_lshrrev_b32_e32 v7, 24, v1 -; GFX7-NEXT: v_and_b32_e32 v8, s0, v0 +; GFX7-NEXT: v_and_b32_e32 v8, v0, v4 ; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 8 -; GFX7-NEXT: v_and_b32_e32 v10, s0, v1 +; GFX7-NEXT: v_and_b32_e32 v10, v1, v4 ; GFX7-NEXT: v_bfe_u32 v1, v1, 16, 8 ; GFX7-NEXT: v_lshlrev_b32_e32 v9, 8, v9 ; GFX7-NEXT: v_lshlrev_b32_e32 v11, 8, v11 @@ -3668,7 +3619,6 @@ ; GFX7-NEXT: v_cndmask_b32_e32 v6, v0, v1, vcc ; GFX7-NEXT: v_and_b32_e32 v3, v6, v3 ; GFX7-NEXT: v_or_b32_e32 v2, v3, v2 -; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v5 ; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] ; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc ; GFX7-NEXT: v_bfe_u32 v6, v0, 8, 8 @@ -3697,51 +3647,48 @@ ; GFX10-LABEL: insertelement_v_v8i8_v_v: ; GFX10: ; %bb.0: ; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off -; GFX10-NEXT: s_mov_b32 s0, 8 +; GFX10-NEXT: v_mov_b32_e32 v5, 8 ; GFX10-NEXT: v_and_b32_e32 v4, 3, v3 -; GFX10-NEXT: s_mov_b32 s1, 16 -; GFX10-NEXT: s_movk_i32 s2, 0xff -; GFX10-NEXT: v_mov_b32_e32 v5, 0xff +; GFX10-NEXT: v_mov_b32_e32 v6, 0xff +; GFX10-NEXT: v_mov_b32_e32 v7, 16 ; GFX10-NEXT: v_lshrrev_b32_e32 v3, 2, v3 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3 +; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 0, v3 ; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v6, 24, v0 -; GFX10-NEXT: v_lshlrev_b32_sdwa v7, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX10-NEXT: v_lshrrev_b32_e32 v8, 24, v1 -; GFX10-NEXT: v_lshlrev_b32_sdwa v9, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX10-NEXT: v_lshlrev_b32_sdwa v10, s1, v0 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_lshlrev_b32_sdwa v11, s1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_and_or_b32 v0, v0, s2, v7 -; GFX10-NEXT: v_lshlrev_b32_e32 v6, 24, v6 -; GFX10-NEXT: v_and_or_b32 v1, v1, s2, v9 -; GFX10-NEXT: v_lshlrev_b32_e32 v7, 24, v8 -; GFX10-NEXT: v_lshlrev_b32_e32 v8, v4, v5 -; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 0, v3 -; GFX10-NEXT: v_or3_b32 v0, v0, v10, v6 -; GFX10-NEXT: v_mov_b32_e32 v3, 8 -; GFX10-NEXT: v_or3_b32 v1, v1, v11, v7 -; GFX10-NEXT: v_xor_b32_e32 v4, -1, v8 -; GFX10-NEXT: v_cndmask_b32_e32 v6, v0, v1, vcc_lo -; GFX10-NEXT: v_and_or_b32 v2, v6, v4, v2 +; GFX10-NEXT: v_lshrrev_b32_e32 v8, 24, v0 +; GFX10-NEXT: v_lshlrev_b32_sdwa v9, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX10-NEXT: v_lshrrev_b32_e32 v10, 24, v1 +; GFX10-NEXT: v_lshlrev_b32_sdwa v11, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX10-NEXT: v_lshlrev_b32_sdwa v12, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX10-NEXT: v_lshlrev_b32_sdwa v13, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX10-NEXT: v_and_or_b32 v0, v0, v6, v9 +; GFX10-NEXT: v_lshlrev_b32_e32 v8, 24, v8 +; GFX10-NEXT: v_and_or_b32 v1, v1, v6, v11 +; GFX10-NEXT: v_lshlrev_b32_e32 v9, 24, v10 +; GFX10-NEXT: v_lshlrev_b32_e32 v10, v4, v6 +; GFX10-NEXT: v_or3_b32 v0, v0, v12, v8 +; GFX10-NEXT: v_or3_b32 v1, v1, v13, v9 +; GFX10-NEXT: v_xor_b32_e32 v4, -1, v10 +; GFX10-NEXT: v_cndmask_b32_e32 v8, v0, v1, vcc_lo +; GFX10-NEXT: v_and_or_b32 v2, v8, v4, v2 ; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v2, s0 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo -; GFX10-NEXT: v_mov_b32_e32 v2, 16 -; GFX10-NEXT: v_lshrrev_b32_e32 v4, 24, v0 -; GFX10-NEXT: v_lshlrev_b32_sdwa v6, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX10-NEXT: v_lshrrev_b32_e32 v7, 24, v1 -; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX10-NEXT: v_lshlrev_b32_sdwa v8, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_lshlrev_b32_sdwa v9, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_and_or_b32 v2, v0, v5, v6 +; GFX10-NEXT: v_lshrrev_b32_e32 v2, 24, v0 +; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX10-NEXT: v_lshrrev_b32_e32 v4, 24, v1 +; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX10-NEXT: v_lshlrev_b32_sdwa v8, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX10-NEXT: v_lshlrev_b32_sdwa v7, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX10-NEXT: v_and_or_b32 v3, v0, v6, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 24, v2 +; GFX10-NEXT: v_and_or_b32 v5, v1, v6, v5 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; GFX10-NEXT: v_and_or_b32 v3, v1, v5, v3 -; GFX10-NEXT: v_lshlrev_b32_e32 v5, 24, v7 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: v_or3_b32 v2, v2, v8, v4 -; GFX10-NEXT: v_or3_b32 v3, v3, v9, v5 +; GFX10-NEXT: v_or3_b32 v2, v3, v8, v2 +; GFX10-NEXT: v_or3_b32 v3, v5, v7, v4 ; GFX10-NEXT: global_store_dwordx2 v[0:1], v[2:3], off ; GFX10-NEXT: s_endpgm %vec = load <8 x i8>, <8 x i8> addrspace(1)* %ptr @@ -4233,31 +4180,30 @@ ; 
GFX9-LABEL: insertelement_v_v16i8_s_s: ; GFX9: ; %bb.0: ; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off -; GFX9-NEXT: s_mov_b32 s0, 8 -; GFX9-NEXT: s_mov_b32 s1, 16 -; GFX9-NEXT: s_movk_i32 s6, 0xff ; GFX9-NEXT: v_mov_b32_e32 v6, 8 +; GFX9-NEXT: s_movk_i32 s6, 0xff ; GFX9-NEXT: v_mov_b32_e32 v7, 16 +; GFX9-NEXT: s_and_b32 s0, s3, 3 ; GFX9-NEXT: s_lshr_b32 s4, s3, 2 -; GFX9-NEXT: s_and_b32 s3, s3, 3 -; GFX9-NEXT: s_and_b32 s2, s2, s6 -; GFX9-NEXT: s_lshl_b32 s3, s3, 3 +; GFX9-NEXT: s_and_b32 s1, s2, s6 +; GFX9-NEXT: s_lshl_b32 s0, s0, 3 +; GFX9-NEXT: s_lshl_b32 s1, s1, s0 +; GFX9-NEXT: s_lshl_b32 s0, s6, s0 ; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s4, 1 -; GFX9-NEXT: s_lshl_b32 s2, s2, s3 -; GFX9-NEXT: s_lshl_b32 s3, s6, s3 -; GFX9-NEXT: s_not_b32 s5, s3 -; GFX9-NEXT: v_mov_b32_e32 v8, s2 +; GFX9-NEXT: s_not_b32 s5, s0 +; GFX9-NEXT: v_mov_b32_e32 v8, s1 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s4, 2 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[2:3], s4, 3 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: v_mov_b32_e32 v5, 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshrrev_b32_e32 v9, 24, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v10, 24, v1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v13, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v15, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v13, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v15, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX9-NEXT: v_lshrrev_b32_e32 v11, 24, v2 -; GFX9-NEXT: v_lshlrev_b32_sdwa v14, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_lshlrev_b32_sdwa v16, s1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_lshlrev_b32_sdwa v14, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_lshlrev_b32_sdwa v16, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX9-NEXT: v_lshlrev_b32_sdwa v17, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX9-NEXT: v_and_or_b32 v0, v0, s6, v13 ; GFX9-NEXT: v_and_or_b32 v1, v1, s6, v15 @@ -4275,7 +4221,6 @@ ; GFX9-NEXT: v_lshlrev_b32_e32 v12, 24, v12 ; GFX9-NEXT: v_or3_b32 v2, v2, v18, v11 ; GFX9-NEXT: v_cndmask_b32_e32 v9, v0, v1, vcc -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s4, 2 ; GFX9-NEXT: v_or3_b32 v3, v13, v3, v12 ; GFX9-NEXT: v_cndmask_b32_e64 v9, v9, v2, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e64 v9, v9, v3, s[2:3] @@ -4317,8 +4262,6 @@ ; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; GFX8-NEXT: v_mov_b32_e32 v6, 8 ; GFX8-NEXT: v_mov_b32_e32 v7, 16 -; GFX8-NEXT: v_mov_b32_e32 v8, 8 -; GFX8-NEXT: v_mov_b32_e32 v9, 16 ; GFX8-NEXT: s_and_b32 s1, s3, 3 ; GFX8-NEXT: s_movk_i32 s0, 0xff ; GFX8-NEXT: s_lshr_b32 s4, s3, 2 @@ -4333,72 +4276,72 @@ ; GFX8-NEXT: v_mov_b32_e32 v4, 0 ; GFX8-NEXT: v_mov_b32_e32 v5, 0 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_sdwa v14, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX8-NEXT: v_lshlrev_b32_sdwa v6, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX8-NEXT: v_lshrrev_b32_e32 v10, 24, v0 -; GFX8-NEXT: v_lshrrev_b32_e32 v11, 24, v1 -; GFX8-NEXT: v_lshlrev_b32_sdwa v15, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX8-NEXT: v_lshlrev_b32_sdwa v7, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD 
src1_sel:BYTE_2 -; GFX8-NEXT: v_lshlrev_b32_sdwa v16, v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX8-NEXT: v_or_b32_sdwa v0, v0, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v1, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_lshrrev_b32_e32 v12, 24, v2 -; GFX8-NEXT: v_lshlrev_b32_sdwa v17, v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX8-NEXT: v_lshlrev_b32_sdwa v18, v8, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX8-NEXT: v_lshlrev_b32_e32 v10, 24, v10 -; GFX8-NEXT: v_lshlrev_b32_e32 v6, 24, v11 +; GFX8-NEXT: v_lshlrev_b32_sdwa v12, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX8-NEXT: v_lshlrev_b32_sdwa v14, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX8-NEXT: v_lshrrev_b32_e32 v8, 24, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v9, 24, v1 +; GFX8-NEXT: v_lshlrev_b32_sdwa v13, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX8-NEXT: v_lshlrev_b32_sdwa v15, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX8-NEXT: v_lshlrev_b32_sdwa v16, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v1, v1, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_lshrrev_b32_e32 v10, 24, v2 +; GFX8-NEXT: v_lshlrev_b32_sdwa v17, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX8-NEXT: v_lshlrev_b32_sdwa v18, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX8-NEXT: v_lshlrev_b32_e32 v8, 24, v8 +; GFX8-NEXT: v_lshlrev_b32_e32 v9, 24, v9 ; GFX8-NEXT: v_or_b32_sdwa v2, v2, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v0, v0, v15 -; GFX8-NEXT: v_or_b32_e32 v1, v1, v7 -; GFX8-NEXT: v_lshrrev_b32_e32 v13, 24, v3 -; GFX8-NEXT: v_lshlrev_b32_sdwa v19, v9, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX8-NEXT: v_lshlrev_b32_e32 v11, 24, v12 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v13 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v15 +; GFX8-NEXT: v_lshrrev_b32_e32 v11, 24, v3 +; GFX8-NEXT: v_lshlrev_b32_sdwa v19, v7, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX8-NEXT: v_lshlrev_b32_e32 v10, 24, v10 ; GFX8-NEXT: v_or_b32_sdwa v3, v3, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_e32 v2, v2, v17 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v10 -; GFX8-NEXT: v_or_b32_e32 v1, v1, v6 -; GFX8-NEXT: v_lshlrev_b32_e32 v12, 24, v13 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v8 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v9 +; GFX8-NEXT: v_lshlrev_b32_e32 v11, 24, v11 ; GFX8-NEXT: v_or_b32_e32 v3, v3, v19 -; GFX8-NEXT: v_or_b32_e32 v2, v2, v11 -; GFX8-NEXT: v_cndmask_b32_e32 v6, v0, v1, vcc -; GFX8-NEXT: v_or_b32_e32 v3, v3, v12 -; GFX8-NEXT: v_cndmask_b32_e64 v6, v6, v2, s[0:1] -; GFX8-NEXT: v_cndmask_b32_e64 v6, v6, v3, s[2:3] -; GFX8-NEXT: v_and_b32_e32 v6, s6, v6 -; GFX8-NEXT: v_or_b32_e32 v6, s5, v6 +; GFX8-NEXT: v_or_b32_e32 v2, v2, v10 +; GFX8-NEXT: v_cndmask_b32_e32 v8, v0, v1, vcc +; GFX8-NEXT: v_or_b32_e32 v3, v3, v11 +; GFX8-NEXT: v_cndmask_b32_e64 v8, v8, v2, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v8, v8, v3, s[2:3] +; GFX8-NEXT: 
v_and_b32_e32 v8, s6, v8 +; GFX8-NEXT: v_or_b32_e32 v8, s5, v8 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], s4, 0 -; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v6, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[0:1] -; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v6, s[2:3] -; GFX8-NEXT: v_lshlrev_b32_sdwa v12, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX8-NEXT: v_lshlrev_b32_sdwa v14, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX8-NEXT: v_lshlrev_b32_sdwa v16, v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX8-NEXT: v_lshlrev_b32_sdwa v8, v8, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 24, v0 -; GFX8-NEXT: v_lshrrev_b32_e32 v7, 24, v1 +; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v8, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v8, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v8, s[2:3] +; GFX8-NEXT: v_lshlrev_b32_sdwa v12, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX8-NEXT: v_lshlrev_b32_sdwa v14, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX8-NEXT: v_lshlrev_b32_sdwa v16, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX8-NEXT: v_lshlrev_b32_sdwa v6, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX8-NEXT: v_lshrrev_b32_e32 v8, 24, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v9, 24, v1 ; GFX8-NEXT: v_lshrrev_b32_e32 v10, 24, v2 ; GFX8-NEXT: v_lshrrev_b32_e32 v11, 24, v3 -; GFX8-NEXT: v_lshlrev_b32_sdwa v13, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX8-NEXT: v_lshlrev_b32_sdwa v15, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX8-NEXT: v_lshlrev_b32_sdwa v17, v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX8-NEXT: v_lshlrev_b32_sdwa v9, v9, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX8-NEXT: v_lshlrev_b32_sdwa v13, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX8-NEXT: v_lshlrev_b32_sdwa v15, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX8-NEXT: v_lshlrev_b32_sdwa v17, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX8-NEXT: v_lshlrev_b32_sdwa v7, v7, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_sdwa v1, v1, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_sdwa v2, v2, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v3, v3, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_e32 v6, 24, v6 -; GFX8-NEXT: v_lshlrev_b32_e32 v7, 24, v7 +; GFX8-NEXT: v_or_b32_sdwa v3, v3, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_e32 v8, 24, v8 +; GFX8-NEXT: v_lshlrev_b32_e32 v9, 24, v9 ; GFX8-NEXT: v_lshlrev_b32_e32 v10, 24, v10 -; GFX8-NEXT: v_lshlrev_b32_e32 v8, 24, v11 +; GFX8-NEXT: v_lshlrev_b32_e32 v6, 24, v11 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v13 ; GFX8-NEXT: v_or_b32_e32 v1, v1, v15 ; GFX8-NEXT: v_or_b32_e32 v2, v2, v17 -; GFX8-NEXT: v_or_b32_e32 v3, v3, v9 -; GFX8-NEXT: 
v_or_b32_e32 v0, v0, v6 -; GFX8-NEXT: v_or_b32_e32 v1, v1, v7 +; GFX8-NEXT: v_or_b32_e32 v3, v3, v7 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v8 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v9 ; GFX8-NEXT: v_or_b32_e32 v2, v2, v10 -; GFX8-NEXT: v_or_b32_e32 v3, v3, v8 +; GFX8-NEXT: v_or_b32_e32 v3, v3, v6 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_endpgm ; @@ -4517,22 +4460,27 @@ ; GFX10-LABEL: insertelement_v_v16i8_s_s: ; GFX10: ; %bb.0: ; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off -; GFX10-NEXT: s_mov_b32 s0, 8 ; GFX10-NEXT: v_mov_b32_e32 v4, 8 -; GFX10-NEXT: s_mov_b32 s1, 16 -; GFX10-NEXT: s_movk_i32 s4, 0xff ; GFX10-NEXT: v_mov_b32_e32 v5, 16 +; GFX10-NEXT: s_movk_i32 s4, 0xff ; GFX10-NEXT: s_lshr_b32 s5, s3, 2 -; GFX10-NEXT: s_and_b32 s2, s2, s4 +; GFX10-NEXT: s_and_b32 s1, s3, 3 ; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s5, 1 +; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s5, 2 +; GFX10-NEXT: s_lshl_b32 s3, s1, 3 +; GFX10-NEXT: v_cmp_eq_u32_e64 s1, s5, 3 +; GFX10-NEXT: s_and_b32 s2, s2, s4 +; GFX10-NEXT: s_lshl_b32 s6, s4, s3 +; GFX10-NEXT: s_lshl_b32 s2, s2, s3 +; GFX10-NEXT: s_not_b32 s3, s6 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v6, 24, v0 ; GFX10-NEXT: v_lshrrev_b32_e32 v7, 24, v1 -; GFX10-NEXT: v_lshlrev_b32_sdwa v10, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX10-NEXT: v_lshlrev_b32_sdwa v12, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX10-NEXT: v_lshlrev_b32_sdwa v10, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX10-NEXT: v_lshlrev_b32_sdwa v12, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX10-NEXT: v_lshrrev_b32_e32 v8, 24, v2 -; GFX10-NEXT: v_lshlrev_b32_sdwa v11, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_lshlrev_b32_sdwa v13, s1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX10-NEXT: v_lshlrev_b32_sdwa v11, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX10-NEXT: v_lshlrev_b32_sdwa v13, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_lshlrev_b32_sdwa v14, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX10-NEXT: v_and_or_b32 v0, v0, s4, v10 ; GFX10-NEXT: v_lshlrev_b32_e32 v6, 24, v6 @@ -4550,15 +4498,8 @@ ; GFX10-NEXT: v_lshlrev_b32_e32 v6, 24, v9 ; GFX10-NEXT: v_or3_b32 v2, v2, v15, v8 ; GFX10-NEXT: v_cndmask_b32_e32 v7, v0, v1, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s5, 2 -; GFX10-NEXT: s_and_b32 s1, s3, 3 ; GFX10-NEXT: v_or3_b32 v3, v3, v10, v6 -; GFX10-NEXT: s_lshl_b32 s3, s1, 3 -; GFX10-NEXT: v_cmp_eq_u32_e64 s1, s5, 3 ; GFX10-NEXT: v_cndmask_b32_e64 v6, v7, v2, s0 -; GFX10-NEXT: s_lshl_b32 s6, s4, s3 -; GFX10-NEXT: s_lshl_b32 s2, s2, s3 -; GFX10-NEXT: s_not_b32 s3, s6 ; GFX10-NEXT: v_cndmask_b32_e64 v6, v6, v3, s1 ; GFX10-NEXT: v_and_or_b32 v6, v6, s3, s2 ; GFX10-NEXT: v_cmp_eq_u32_e64 s2, s5, 0 @@ -4687,19 +4628,19 @@ ; GFX9-NEXT: v_and_or_b32 v4, v1, s11, v4 ; GFX9-NEXT: v_lshlrev_b32_sdwa v1, s6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX9-NEXT: v_lshlrev_b32_e32 v5, 24, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 24, v2 ; GFX9-NEXT: v_or3_b32 v1, v4, v1, v5 +; GFX9-NEXT: v_lshlrev_b32_sdwa v4, s5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_and_or_b32 v4, v2, s11, v4 +; GFX9-NEXT: v_lshlrev_b32_sdwa v2, s6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 24, v6 +; GFX9-NEXT: v_or3_b32 v2, v4, v2, v5 ; GFX9-NEXT: v_mov_b32_e32 v4, 8 -; GFX9-NEXT: v_lshrrev_b32_e32 v6, 24, v2 -; GFX9-NEXT: v_lshlrev_b32_sdwa v5, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_mov_b32_e32 v8, 16 -; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v3 -; GFX9-NEXT: v_and_or_b32 v5, v2, s11, v5 -; GFX9-NEXT: v_lshlrev_b32_sdwa v2, v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_lshlrev_b32_e32 v6, 24, v6 ; GFX9-NEXT: v_lshlrev_b32_sdwa v4, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_or3_b32 v2, v5, v2, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v3 ; GFX9-NEXT: v_and_or_b32 v6, v3, s11, v4 -; GFX9-NEXT: v_lshlrev_b32_sdwa v3, v8, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_mov_b32_e32 v4, 16 +; GFX9-NEXT: v_lshlrev_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX9-NEXT: v_lshlrev_b32_e32 v7, 24, v7 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: v_mov_b32_e32 v5, 0 @@ -4796,19 +4737,19 @@ ; GFX8-NEXT: v_or_b32_e32 v1, v4, v1 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 24, v5 ; GFX8-NEXT: v_or_b32_e32 v1, v1, v4 -; GFX8-NEXT: v_mov_b32_e32 v4, 8 -; GFX8-NEXT: v_lshlrev_b32_sdwa v5, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX8-NEXT: v_mov_b32_e32 v8, 16 +; GFX8-NEXT: v_lshlrev_b32_sdwa v4, v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX8-NEXT: v_lshrrev_b32_e32 v6, 24, v2 -; GFX8-NEXT: v_or_b32_sdwa v5, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX8-NEXT: v_or_b32_sdwa v4, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v10, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX8-NEXT: v_or_b32_e32 v2, v4, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 24, v6 +; GFX8-NEXT: v_or_b32_e32 v2, v2, v4 +; GFX8-NEXT: v_mov_b32_e32 v4, 8 ; GFX8-NEXT: v_lshlrev_b32_sdwa v4, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX8-NEXT: v_mov_b32_e32 v5, 16 ; GFX8-NEXT: v_lshrrev_b32_e32 v7, 24, v3 -; GFX8-NEXT: v_or_b32_e32 v2, v5, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 24, v6 ; GFX8-NEXT: v_or_b32_sdwa v4, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v8, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX8-NEXT: v_or_b32_e32 v2, v2, v5 +; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX8-NEXT: v_or_b32_e32 v3, v4, v3 ; GFX8-NEXT: v_lshlrev_b32_e32 v6, 24, v7 ; GFX8-NEXT: v_mov_b32_e32 v4, 0 @@ -5007,19 +4948,19 @@ ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s7, 1 ; GFX10-NEXT: s_mov_b32 s1, 16 -; GFX10-NEXT: v_lshlrev_b32_sdwa v6, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX10-NEXT: v_lshlrev_b32_sdwa v5, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s7, 2 -; GFX10-NEXT: v_and_or_b32 v6, v0, s5, v6 -; GFX10-NEXT: v_lshrrev_b32_e32 v5, 24, v1 +; GFX10-NEXT: 
v_and_or_b32 v5, v0, s5, v5 +; GFX10-NEXT: v_lshrrev_b32_e32 v6, 24, v1 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s7, 3 ; GFX10-NEXT: v_lshlrev_b32_sdwa v9, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX10-NEXT: v_lshlrev_b32_e32 v5, 24, v5 +; GFX10-NEXT: v_lshlrev_b32_e32 v6, 24, v6 ; GFX10-NEXT: v_lshrrev_b32_e32 v7, 24, v2 ; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo ; GFX10-NEXT: v_lshrrev_b32_e32 v4, 24, v0 -; GFX10-NEXT: v_lshlrev_b32_sdwa v11, v10, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX10-NEXT: v_lshlrev_b32_sdwa v11, s0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_and_or_b32 v9, v1, s5, v9 ; GFX10-NEXT: v_lshrrev_b32_e32 v8, 24, v3 @@ -5027,14 +4968,14 @@ ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 24, v4 ; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_and_or_b32 v11, v2, s5, v11 -; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v12, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX10-NEXT: v_lshlrev_b32_sdwa v2, s1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_lshlrev_b32_e32 v7, 24, v7 ; GFX10-NEXT: v_and_or_b32 v10, v3, s5, v10 ; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v12, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_lshlrev_b32_e32 v8, 24, v8 -; GFX10-NEXT: v_or3_b32 v0, v6, v0, v4 -; GFX10-NEXT: v_or3_b32 v1, v9, v1, v5 +; GFX10-NEXT: v_or3_b32 v0, v5, v0, v4 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 +; GFX10-NEXT: v_or3_b32 v1, v9, v1, v6 ; GFX10-NEXT: v_or3_b32 v2, v11, v2, v7 ; GFX10-NEXT: v_mov_b32_e32 v5, 0 ; GFX10-NEXT: v_or3_b32 v3, v10, v3, v8 @@ -5134,19 +5075,19 @@ ; GFX9-NEXT: v_and_or_b32 v4, v1, s12, v4 ; GFX9-NEXT: v_lshlrev_b32_sdwa v1, s7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX9-NEXT: v_lshlrev_b32_e32 v5, 24, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 24, v2 ; GFX9-NEXT: v_or3_b32 v1, v4, v1, v5 +; GFX9-NEXT: v_lshlrev_b32_sdwa v4, s6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_and_or_b32 v4, v2, s12, v4 +; GFX9-NEXT: v_lshlrev_b32_sdwa v2, s7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 24, v6 +; GFX9-NEXT: v_or3_b32 v2, v4, v2, v5 ; GFX9-NEXT: v_mov_b32_e32 v4, 8 -; GFX9-NEXT: v_lshrrev_b32_e32 v6, 24, v2 -; GFX9-NEXT: v_lshlrev_b32_sdwa v5, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_mov_b32_e32 v8, 16 -; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v3 -; GFX9-NEXT: v_and_or_b32 v5, v2, s12, v5 -; GFX9-NEXT: v_lshlrev_b32_sdwa v2, v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_lshlrev_b32_e32 v6, 24, v6 ; GFX9-NEXT: v_lshlrev_b32_sdwa v4, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_or3_b32 v2, v5, v2, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v3 ; GFX9-NEXT: v_and_or_b32 v6, v3, s12, v4 -; GFX9-NEXT: v_lshlrev_b32_sdwa v3, v8, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_mov_b32_e32 v4, 16 +; GFX9-NEXT: v_lshlrev_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX9-NEXT: v_lshlrev_b32_e32 
v7, 24, v7 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: v_mov_b32_e32 v5, 0 @@ -5245,19 +5186,19 @@ ; GFX8-NEXT: v_or_b32_e32 v1, v4, v1 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 24, v5 ; GFX8-NEXT: v_or_b32_e32 v1, v1, v4 -; GFX8-NEXT: v_mov_b32_e32 v4, 8 -; GFX8-NEXT: v_lshlrev_b32_sdwa v5, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX8-NEXT: v_mov_b32_e32 v8, 16 +; GFX8-NEXT: v_lshlrev_b32_sdwa v4, v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX8-NEXT: v_lshrrev_b32_e32 v6, 24, v2 -; GFX8-NEXT: v_or_b32_sdwa v5, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX8-NEXT: v_or_b32_sdwa v4, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v10, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX8-NEXT: v_or_b32_e32 v2, v4, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 24, v6 +; GFX8-NEXT: v_or_b32_e32 v2, v2, v4 +; GFX8-NEXT: v_mov_b32_e32 v4, 8 ; GFX8-NEXT: v_lshlrev_b32_sdwa v4, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX8-NEXT: v_mov_b32_e32 v5, 16 ; GFX8-NEXT: v_lshrrev_b32_e32 v7, 24, v3 -; GFX8-NEXT: v_or_b32_e32 v2, v5, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 24, v6 ; GFX8-NEXT: v_or_b32_sdwa v4, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v8, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX8-NEXT: v_or_b32_e32 v2, v2, v5 +; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX8-NEXT: v_or_b32_e32 v3, v4, v3 ; GFX8-NEXT: v_lshlrev_b32_e32 v6, 24, v7 ; GFX8-NEXT: v_mov_b32_e32 v4, 0 @@ -5463,28 +5404,28 @@ ; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v5, s1 ; GFX10-NEXT: s_mov_b32 s2, 16 ; GFX10-NEXT: v_lshrrev_b32_e32 v4, 24, v0 -; GFX10-NEXT: v_lshrrev_b32_e32 v5, 24, v1 -; GFX10-NEXT: v_lshlrev_b32_sdwa v6, s3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX10-NEXT: v_lshlrev_b32_sdwa v9, s3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX10-NEXT: v_lshlrev_b32_sdwa v5, s3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX10-NEXT: v_lshrrev_b32_e32 v6, 24, v1 ; GFX10-NEXT: v_lshrrev_b32_e32 v7, 24, v2 ; GFX10-NEXT: v_lshrrev_b32_e32 v8, 24, v3 -; GFX10-NEXT: v_lshlrev_b32_sdwa v11, v10, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX10-NEXT: v_lshlrev_b32_sdwa v9, s3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX10-NEXT: v_lshlrev_b32_sdwa v11, s3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX10-NEXT: v_lshlrev_b32_sdwa v10, v10, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX10-NEXT: v_and_or_b32 v6, v0, s5, v6 +; GFX10-NEXT: v_and_or_b32 v5, v0, s5, v5 ; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 24, v4 ; GFX10-NEXT: v_and_or_b32 v9, v1, s5, v9 ; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_lshlrev_b32_e32 v5, 24, v5 +; GFX10-NEXT: v_lshlrev_b32_e32 v6, 24, v6 ; GFX10-NEXT: v_and_or_b32 v11, v2, s5, v11 -; GFX10-NEXT: v_lshlrev_b32_sdwa v2, 
v12, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX10-NEXT: v_lshlrev_b32_sdwa v2, s2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_lshlrev_b32_e32 v7, 24, v7 ; GFX10-NEXT: v_and_or_b32 v10, v3, s5, v10 ; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v12, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_lshlrev_b32_e32 v8, 24, v8 -; GFX10-NEXT: v_or3_b32 v0, v6, v0, v4 -; GFX10-NEXT: v_or3_b32 v1, v9, v1, v5 +; GFX10-NEXT: v_or3_b32 v0, v5, v0, v4 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 +; GFX10-NEXT: v_or3_b32 v1, v9, v1, v6 ; GFX10-NEXT: v_or3_b32 v2, v11, v2, v7 ; GFX10-NEXT: v_mov_b32_e32 v5, 0 ; GFX10-NEXT: v_or3_b32 v3, v10, v3, v8 @@ -5583,19 +5524,19 @@ ; GFX9-NEXT: v_and_or_b32 v4, v1, s10, v4 ; GFX9-NEXT: v_lshlrev_b32_sdwa v1, s9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX9-NEXT: v_lshlrev_b32_e32 v5, 24, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 24, v2 ; GFX9-NEXT: v_or3_b32 v1, v4, v1, v5 +; GFX9-NEXT: v_lshlrev_b32_sdwa v4, s8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_and_or_b32 v4, v2, s10, v4 +; GFX9-NEXT: v_lshlrev_b32_sdwa v2, s9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 24, v6 +; GFX9-NEXT: v_or3_b32 v2, v4, v2, v5 ; GFX9-NEXT: v_mov_b32_e32 v4, 8 -; GFX9-NEXT: v_lshrrev_b32_e32 v6, 24, v2 -; GFX9-NEXT: v_lshlrev_b32_sdwa v5, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_mov_b32_e32 v8, 16 -; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v3 -; GFX9-NEXT: v_and_or_b32 v5, v2, s10, v5 -; GFX9-NEXT: v_lshlrev_b32_sdwa v2, v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_lshlrev_b32_e32 v6, 24, v6 ; GFX9-NEXT: v_lshlrev_b32_sdwa v4, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_or3_b32 v2, v5, v2, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v3 ; GFX9-NEXT: v_and_or_b32 v6, v3, s10, v4 -; GFX9-NEXT: v_lshlrev_b32_sdwa v3, v8, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_mov_b32_e32 v4, 16 +; GFX9-NEXT: v_lshlrev_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX9-NEXT: v_lshlrev_b32_e32 v7, 24, v7 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: v_mov_b32_e32 v5, 0 @@ -5693,19 +5634,19 @@ ; GFX8-NEXT: v_or_b32_e32 v1, v4, v1 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 24, v5 ; GFX8-NEXT: v_or_b32_e32 v1, v1, v4 -; GFX8-NEXT: v_mov_b32_e32 v4, 8 -; GFX8-NEXT: v_lshlrev_b32_sdwa v5, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX8-NEXT: v_mov_b32_e32 v8, 16 +; GFX8-NEXT: v_lshlrev_b32_sdwa v4, v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX8-NEXT: v_lshrrev_b32_e32 v6, 24, v2 -; GFX8-NEXT: v_or_b32_sdwa v5, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX8-NEXT: v_or_b32_sdwa v4, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v10, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX8-NEXT: v_or_b32_e32 v2, v4, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 24, v6 +; GFX8-NEXT: v_or_b32_e32 v2, v2, v4 +; GFX8-NEXT: v_mov_b32_e32 v4, 8 ; GFX8-NEXT: v_lshlrev_b32_sdwa v4, v4, v3 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX8-NEXT: v_mov_b32_e32 v5, 16
 ; GFX8-NEXT: v_lshrrev_b32_e32 v7, 24, v3
-; GFX8-NEXT: v_or_b32_e32 v2, v5, v2
-; GFX8-NEXT: v_lshlrev_b32_e32 v5, 24, v6
 ; GFX8-NEXT: v_or_b32_sdwa v4, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v8, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX8-NEXT: v_or_b32_e32 v2, v2, v5
+; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX8-NEXT: v_or_b32_e32 v3, v4, v3
 ; GFX8-NEXT: v_lshlrev_b32_e32 v6, 24, v7
 ; GFX8-NEXT: v_mov_b32_e32 v4, 0
@@ -5910,28 +5851,28 @@
 ; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v5, s1
 ; GFX10-NEXT: s_mov_b32 s2, 16
 ; GFX10-NEXT: v_lshrrev_b32_e32 v4, 24, v0
-; GFX10-NEXT: v_lshrrev_b32_e32 v5, 24, v1
-; GFX10-NEXT: v_lshlrev_b32_sdwa v6, s3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX10-NEXT: v_lshlrev_b32_sdwa v9, s3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX10-NEXT: v_lshlrev_b32_sdwa v5, s3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX10-NEXT: v_lshrrev_b32_e32 v6, 24, v1
 ; GFX10-NEXT: v_lshrrev_b32_e32 v7, 24, v2
 ; GFX10-NEXT: v_lshrrev_b32_e32 v8, 24, v3
-; GFX10-NEXT: v_lshlrev_b32_sdwa v11, v10, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX10-NEXT: v_lshlrev_b32_sdwa v9, s3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX10-NEXT: v_lshlrev_b32_sdwa v11, s3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX10-NEXT: v_lshlrev_b32_sdwa v10, v10, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX10-NEXT: v_and_or_b32 v6, v0, s8, v6
+; GFX10-NEXT: v_and_or_b32 v5, v0, s8, v5
 ; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 24, v4
 ; GFX10-NEXT: v_and_or_b32 v9, v1, s8, v9
 ; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX10-NEXT: v_lshlrev_b32_e32 v5, 24, v5
+; GFX10-NEXT: v_lshlrev_b32_e32 v6, 24, v6
 ; GFX10-NEXT: v_and_or_b32 v11, v2, s8, v11
-; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v12, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX10-NEXT: v_lshlrev_b32_sdwa v2, s2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX10-NEXT: v_lshlrev_b32_e32 v7, 24, v7
 ; GFX10-NEXT: v_and_or_b32 v10, v3, s8, v10
 ; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v12, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX10-NEXT: v_lshlrev_b32_e32 v8, 24, v8
-; GFX10-NEXT: v_or3_b32 v0, v6, v0, v4
-; GFX10-NEXT: v_or3_b32 v1, v9, v1, v5
+; GFX10-NEXT: v_or3_b32 v0, v5, v0, v4
 ; GFX10-NEXT: v_mov_b32_e32 v4, 0
+; GFX10-NEXT: v_or3_b32 v1, v9, v1, v6
 ; GFX10-NEXT: v_or3_b32 v2, v11, v2, v7
 ; GFX10-NEXT: v_mov_b32_e32 v5, 0
 ; GFX10-NEXT: v_or3_b32 v3, v10, v3, v8
@@ -5947,21 +5888,20 @@
 ; GFX9-LABEL: insertelement_v_v16i8_s_v:
 ; GFX9: ; %bb.0:
 ; GFX9-NEXT: global_load_dwordx4 v[3:6], v[0:1], off
-; GFX9-NEXT: s_mov_b32 s0, 8
-; GFX9-NEXT: s_mov_b32 s1, 16
-; GFX9-NEXT: s_movk_i32 s6, 0xff
 ; GFX9-NEXT: v_mov_b32_e32 v0, 8
+; GFX9-NEXT: s_movk_i32 s6, 0xff
 ; GFX9-NEXT: v_mov_b32_e32 v1, 16
+; GFX9-NEXT: s_and_b32 s0, s2, s6
 ; GFX9-NEXT: v_mov_b32_e32 v7, 0
 ; GFX9-NEXT: v_mov_b32_e32 v8, 0
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: v_lshrrev_b32_e32 v9, 24, v3
 ; GFX9-NEXT: v_lshrrev_b32_e32 v10, 24, v4
-; GFX9-NEXT: v_lshlrev_b32_sdwa v13, s0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX9-NEXT: v_lshlrev_b32_sdwa v15, s0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX9-NEXT: v_lshlrev_b32_sdwa v13, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX9-NEXT: v_lshlrev_b32_sdwa v15, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX9-NEXT: v_lshrrev_b32_e32 v11, 24, v5
-; GFX9-NEXT: v_lshlrev_b32_sdwa v14, s1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX9-NEXT: v_lshlrev_b32_sdwa v16, s1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX9-NEXT: v_lshlrev_b32_sdwa v14, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX9-NEXT: v_lshlrev_b32_sdwa v16, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX9-NEXT: v_lshlrev_b32_sdwa v17, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX9-NEXT: v_and_or_b32 v3, v3, s6, v13
 ; GFX9-NEXT: v_and_or_b32 v4, v4, s6, v15
@@ -5973,7 +5913,6 @@
 ; GFX9-NEXT: v_lshlrev_b32_sdwa v18, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX9-NEXT: v_lshlrev_b32_sdwa v19, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX9-NEXT: v_and_or_b32 v5, v5, s6, v17
-; GFX9-NEXT: s_and_b32 s0, s2, s6
 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v2
 ; GFX9-NEXT: v_lshlrev_b32_e32 v11, 24, v11
 ; GFX9-NEXT: v_or3_b32 v3, v3, v14, v9
@@ -6030,88 +5969,86 @@
 ; GFX8-NEXT: flat_load_dwordx4 v[3:6], v[0:1]
 ; GFX8-NEXT: v_mov_b32_e32 v0, 8
 ; GFX8-NEXT: v_mov_b32_e32 v1, 16
-; GFX8-NEXT: v_mov_b32_e32 v9, 8
-; GFX8-NEXT: v_mov_b32_e32 v10, 16
 ; GFX8-NEXT: s_movk_i32 s0, 0xff
 ; GFX8-NEXT: s_and_b32 s1, s2, s0
 ; GFX8-NEXT: v_mov_b32_e32 v7, 0
 ; GFX8-NEXT: v_mov_b32_e32 v8, 0
 ; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshlrev_b32_sdwa v15, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX8-NEXT: v_lshrrev_b32_e32 v11, 24, v3
-; GFX8-NEXT: v_lshrrev_b32_e32 v12, 24, v4
-; GFX8-NEXT: v_lshlrev_b32_sdwa v16, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX8-NEXT: v_lshlrev_b32_sdwa v17, v9, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX8-NEXT: v_or_b32_sdwa v3, v3, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT: v_lshrrev_b32_e32 v13, 24, v5
-; GFX8-NEXT: v_lshlrev_b32_sdwa v18, v10, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX8-NEXT: v_lshlrev_b32_sdwa v19, v9, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX8-NEXT: v_or_b32_sdwa v4, v5, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT: v_lshlrev_b32_sdwa v13, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX8-NEXT: v_lshlrev_b32_sdwa v15, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX8-NEXT: v_lshrrev_b32_e32 v9, 24, v3
+; GFX8-NEXT: v_lshrrev_b32_e32 v10, 24, v4
+; GFX8-NEXT: v_lshlrev_b32_sdwa v14, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX8-NEXT: v_lshlrev_b32_sdwa v16, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX8-NEXT: v_lshlrev_b32_sdwa v17, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX8-NEXT: v_or_b32_sdwa v3, v3, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT: v_or_b32_sdwa v4, v4, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT: v_lshrrev_b32_e32 v11, 24, v5
+; GFX8-NEXT: v_lshlrev_b32_sdwa v18, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX8-NEXT: v_lshlrev_b32_sdwa v19, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX8-NEXT: v_or_b32_sdwa v5, v5, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX8-NEXT: v_lshrrev_b32_e32 v15, 2, v2
 ; GFX8-NEXT: v_and_b32_e32 v2, 3, v2
-; GFX8-NEXT: v_lshlrev_b32_e32 v11, 24, v11
-; GFX8-NEXT: v_lshlrev_b32_e32 v12, 24, v12
-; GFX8-NEXT: v_or_b32_e32 v3, v3, v16
-; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX8-NEXT: v_lshrrev_b32_e32 v14, 24, v6
-; GFX8-NEXT: v_or_b32_sdwa v5, v6, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT: v_lshlrev_b32_sdwa v6, v10, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX8-NEXT: v_lshlrev_b32_e32 v9, 24, v9
+; GFX8-NEXT: v_lshlrev_b32_e32 v10, 24, v10
+; GFX8-NEXT: v_or_b32_e32 v3, v3, v14
+; GFX8-NEXT: v_or_b32_e32 v4, v4, v16
+; GFX8-NEXT: v_lshrrev_b32_e32 v12, 24, v6
+; GFX8-NEXT: v_or_b32_sdwa v13, v6, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT: v_lshlrev_b32_sdwa v6, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v2
-; GFX8-NEXT: v_lshlrev_b32_e32 v13, 24, v13
-; GFX8-NEXT: v_or_b32_e32 v1, v4, v18
-; GFX8-NEXT: v_or_b32_e32 v3, v3, v11
-; GFX8-NEXT: v_or_b32_e32 v0, v0, v12
+; GFX8-NEXT: v_lshlrev_b32_e32 v11, 24, v11
+; GFX8-NEXT: v_or_b32_e32 v5, v5, v18
+; GFX8-NEXT: v_or_b32_e32 v3, v3, v9
+; GFX8-NEXT: v_or_b32_e32 v4, v4, v10
 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v15
 ; GFX8-NEXT: v_lshlrev_b32_e64 v17, v2, s1
 ; GFX8-NEXT: v_lshlrev_b32_e64 v2, v2, s0
-; GFX8-NEXT: v_lshlrev_b32_e32 v14, 24, v14
-; GFX8-NEXT: v_or_b32_e32 v4, v5, v6
-; GFX8-NEXT: v_or_b32_e32 v1, v1, v13
-; GFX8-NEXT: v_cndmask_b32_e32 v5, v3, v0, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v12, 24, v12
+; GFX8-NEXT: v_or_b32_e32 v6, v13, v6
+; GFX8-NEXT: v_or_b32_e32 v5, v5, v11
+; GFX8-NEXT: v_cndmask_b32_e32 v9, v3, v4, vcc
 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v15
-; GFX8-NEXT: v_or_b32_e32 v4, v4, v14
-; GFX8-NEXT: v_cndmask_b32_e64 v5, v5, v1, s[0:1]
+; GFX8-NEXT: v_or_b32_e32 v6, v6, v12
+; GFX8-NEXT: v_cndmask_b32_e64 v9, v9, v5, s[0:1]
 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v15
 ; GFX8-NEXT: v_xor_b32_e32 v2, -1, v2
-; GFX8-NEXT: v_cndmask_b32_e64 v5, v5, v4, s[2:3]
-; GFX8-NEXT: v_and_b32_e32 v2, v5, v2
+; GFX8-NEXT: v_cndmask_b32_e64 v9, v9, v6, s[2:3]
+; GFX8-NEXT: v_and_b32_e32 v2, v9, v2
 ; GFX8-NEXT: v_or_b32_e32 v2, v2, v17
 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v15
 ; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v2, s[4:5]
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[0:1]
-; GFX8-NEXT: v_cndmask_b32_e64 v2, v4, v2, s[2:3]
-; GFX8-NEXT: v_lshlrev_b32_sdwa v12, v9, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX8-NEXT: v_lshlrev_b32_sdwa v14, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX8-NEXT: v_lshlrev_b32_sdwa v16, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX8-NEXT: v_lshlrev_b32_sdwa v9, v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX8-NEXT: v_lshrrev_b32_e32 v4, 24, v3
-; GFX8-NEXT: v_lshrrev_b32_e32 v5, 24, v0
-; GFX8-NEXT: v_lshrrev_b32_e32 v6, 24, v1
+; GFX8-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v5, v5, v2, s[0:1]
+; GFX8-NEXT: v_cndmask_b32_e64 v2, v6, v2, s[2:3]
+; GFX8-NEXT: v_lshlrev_b32_sdwa v12, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX8-NEXT: v_lshlrev_b32_sdwa v14, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX8-NEXT: v_lshlrev_b32_sdwa v16, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX8-NEXT: v_lshrrev_b32_e32 v6, 24, v3
+; GFX8-NEXT: v_lshrrev_b32_e32 v9, 24, v4
+; GFX8-NEXT: v_lshrrev_b32_e32 v10, 24, v5
 ; GFX8-NEXT: v_lshrrev_b32_e32 v11, 24, v2
-; GFX8-NEXT: v_lshlrev_b32_sdwa v13, v10, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX8-NEXT: v_lshlrev_b32_sdwa v15, v10, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX8-NEXT: v_lshlrev_b32_sdwa v17, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX8-NEXT: v_lshlrev_b32_sdwa v10, v10, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX8-NEXT: v_lshlrev_b32_sdwa v13, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX8-NEXT: v_lshlrev_b32_sdwa v15, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX8-NEXT: v_lshlrev_b32_sdwa v17, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX8-NEXT: v_or_b32_sdwa v3, v3, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT: v_or_b32_sdwa v0, v0, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT: v_or_b32_sdwa v1, v1, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT: v_or_b32_sdwa v2, v2, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT: v_lshlrev_b32_e32 v4, 24, v4
-; GFX8-NEXT: v_lshlrev_b32_e32 v5, 24, v5
+; GFX8-NEXT: v_or_b32_sdwa v4, v4, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT: v_or_b32_sdwa v5, v5, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX8-NEXT: v_lshlrev_b32_e32 v6, 24, v6
-; GFX8-NEXT: v_lshlrev_b32_e32 v9, 24, v11
-; GFX8-NEXT: v_or_b32_e32 v3, v3, v13
-; GFX8-NEXT: v_or_b32_e32 v11, v0, v15
-; GFX8-NEXT: v_or_b32_e32 v12, v1, v17
-; GFX8-NEXT: v_or_b32_e32 v10, v2, v10
-; GFX8-NEXT: v_or_b32_e32 v0, v3, v4
-; GFX8-NEXT: v_or_b32_e32 v1, v11, v5
-; GFX8-NEXT: v_or_b32_e32 v2, v12, v6
-; GFX8-NEXT: v_or_b32_e32 v3, v10, v9
+; GFX8-NEXT: v_lshlrev_b32_e32 v9, 24, v9
+; GFX8-NEXT: v_lshlrev_b32_e32 v10, 24, v10
+; GFX8-NEXT: v_lshlrev_b32_e32 v11, 24, v11
+; GFX8-NEXT: v_or_b32_e32 v2, v3, v13
+; GFX8-NEXT: v_or_b32_e32 v3, v4, v15
+; GFX8-NEXT: v_or_b32_e32 v4, v5, v17
+; GFX8-NEXT: v_or_b32_e32 v5, v0, v1
+; GFX8-NEXT: v_or_b32_e32 v0, v2, v6
+; GFX8-NEXT: v_or_b32_e32 v1, v3, v9
+; GFX8-NEXT: v_or_b32_e32 v2, v4, v10
+; GFX8-NEXT: v_or_b32_e32 v3, v5, v11
 ; GFX8-NEXT: flat_store_dwordx4 v[7:8], v[0:3]
 ; GFX8-NEXT: s_endpgm
 ;
@@ -6230,23 +6167,24 @@
 ; GFX10-LABEL: insertelement_v_v16i8_s_v:
 ; GFX10: ; %bb.0:
 ; GFX10-NEXT: global_load_dwordx4 v[3:6], v[0:1], off
-; GFX10-NEXT: s_mov_b32 s0, 8
 ; GFX10-NEXT: v_mov_b32_e32 v1, 8
-; GFX10-NEXT: s_mov_b32 s1, 16
+; GFX10-NEXT: v_mov_b32_e32 v7, 16
 ; GFX10-NEXT: s_movk_i32 s3, 0xff
 ; GFX10-NEXT: v_and_b32_e32 v0, 3, v2
-; GFX10-NEXT: v_mov_b32_e32 v7, 16
 ; GFX10-NEXT: v_lshrrev_b32_e32 v2, 2, v2
+; GFX10-NEXT: s_and_b32 s1, s2, s3
 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0
 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2
+; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 2, v2
+; GFX10-NEXT: v_cmp_eq_u32_e64 s2, 0, v2
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: v_lshrrev_b32_e32 v8, 24, v3
 ; GFX10-NEXT: v_lshrrev_b32_e32 v9, 24, v4
-; GFX10-NEXT: v_lshlrev_b32_sdwa v12, s0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX10-NEXT: v_lshlrev_b32_sdwa v14, s0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX10-NEXT: v_lshlrev_b32_sdwa v12, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX10-NEXT: v_lshlrev_b32_sdwa v14, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX10-NEXT: v_lshrrev_b32_e32 v10, 24, v5
-; GFX10-NEXT: v_lshlrev_b32_sdwa v13, s1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX10-NEXT: v_lshlrev_b32_sdwa v15, s1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX10-NEXT: v_lshlrev_b32_sdwa v13, v7, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX10-NEXT: v_lshlrev_b32_sdwa v15, v7, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX10-NEXT: v_lshlrev_b32_sdwa v16, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX10-NEXT: v_and_or_b32 v3, v3, s3, v12
 ; GFX10-NEXT: v_lshlrev_b32_e32 v8, 24, v8
@@ -6264,15 +6202,12 @@
 ; GFX10-NEXT: v_lshlrev_b32_e32 v8, 24, v11
 ; GFX10-NEXT: v_or3_b32 v5, v5, v17, v10
 ; GFX10-NEXT: v_cndmask_b32_e32 v9, v3, v4, vcc_lo
-; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 2, v2
-; GFX10-NEXT: s_and_b32 s1, s2, s3
 ; GFX10-NEXT: v_lshlrev_b32_e64 v10, v0, s3
-; GFX10-NEXT: v_or3_b32 v6, v6, v12, v8
 ; GFX10-NEXT: v_lshlrev_b32_e64 v0, v0, s1
-; GFX10-NEXT: v_cndmask_b32_e64 v8, v9, v5, s0
+; GFX10-NEXT: v_or3_b32 v6, v6, v12, v8
 ; GFX10-NEXT: v_cmp_eq_u32_e64 s1, 3, v2
+; GFX10-NEXT: v_cndmask_b32_e64 v8, v9, v5, s0
 ; GFX10-NEXT: v_xor_b32_e32 v9, -1, v10
-; GFX10-NEXT: v_cmp_eq_u32_e64 s2, 0, v2
 ; GFX10-NEXT: v_cndmask_b32_e64 v8, v8, v6, s1
 ; GFX10-NEXT: v_and_or_b32 v0, v8, v9, v0
 ; GFX10-NEXT: v_cndmask_b32_e64 v2, v3, v0, s2
@@ -6317,29 +6252,28 @@
 ; GFX9-LABEL: insertelement_v_v16i8_v_s:
 ; GFX9: ; %bb.0:
 ; GFX9-NEXT: global_load_dwordx4 v[3:6], v[0:1], off
-; GFX9-NEXT: s_mov_b32 s0, 8
-; GFX9-NEXT: s_mov_b32 s1, 16
-; GFX9-NEXT: s_movk_i32 s6, 0xff
 ; GFX9-NEXT: v_mov_b32_e32 v0, 8
+; GFX9-NEXT: s_movk_i32 s6, 0xff
 ; GFX9-NEXT: v_mov_b32_e32 v1, 16
+; GFX9-NEXT: s_and_b32 s0, s2, 3
 ; GFX9-NEXT: s_lshr_b32 s4, s2, 2
-; GFX9-NEXT: s_and_b32 s2, s2, 3
-; GFX9-NEXT: s_lshl_b32 s2, s2, 3
+; GFX9-NEXT: s_lshl_b32 s0, s0, 3
+; GFX9-NEXT: v_lshlrev_b32_sdwa v2, s0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: s_lshl_b32 s0, s6, s0
 ; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s4, 1
-; GFX9-NEXT: v_lshlrev_b32_sdwa v2, s2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-NEXT: s_lshl_b32 s2, s6, s2
-; GFX9-NEXT: s_not_b32 s5, s2
+; GFX9-NEXT: s_not_b32 s5, s0
+; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s4, 2
 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[2:3], s4, 3
 ; GFX9-NEXT: v_mov_b32_e32 v7, 0
 ; GFX9-NEXT: v_mov_b32_e32 v8, 0
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: v_lshrrev_b32_e32 v9, 24, v3
 ; GFX9-NEXT: v_lshrrev_b32_e32 v10, 24, v4
-; GFX9-NEXT: v_lshlrev_b32_sdwa v13, s0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX9-NEXT: v_lshlrev_b32_sdwa v15, s0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX9-NEXT: v_lshlrev_b32_sdwa v13, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX9-NEXT: v_lshlrev_b32_sdwa v15, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX9-NEXT: v_lshrrev_b32_e32 v11, 24, v5
-; GFX9-NEXT: v_lshlrev_b32_sdwa v14, s1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX9-NEXT: v_lshlrev_b32_sdwa v16, s1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX9-NEXT: v_lshlrev_b32_sdwa v14, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX9-NEXT: v_lshlrev_b32_sdwa v16, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX9-NEXT: v_lshlrev_b32_sdwa v17, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX9-NEXT: v_and_or_b32 v3, v3, s6, v13
 ; GFX9-NEXT: v_and_or_b32 v4, v4, s6, v15
@@ -6357,7 +6291,6 @@
 ; GFX9-NEXT: v_lshlrev_b32_e32 v12, 24, v12
 ; GFX9-NEXT: v_or3_b32 v5, v5, v18, v11
 ; GFX9-NEXT: v_cndmask_b32_e32 v9, v3, v4, vcc
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s4, 2
 ; GFX9-NEXT: v_or3_b32 v6, v13, v6, v12
 ; GFX9-NEXT: v_cndmask_b32_e64 v9, v9, v5, s[0:1]
 ; GFX9-NEXT: v_cndmask_b32_e64 v9, v9, v6, s[2:3]
@@ -6401,10 +6334,8 @@
 ; GFX8-NEXT: v_mov_b32_e32 v0, 8
 ; GFX8-NEXT: s_lshl_b32 s1, s1, 3
 ; GFX8-NEXT: v_mov_b32_e32 v1, 16
-; GFX8-NEXT: v_mov_b32_e32 v9, 8
-; GFX8-NEXT: v_mov_b32_e32 v11, s1
-; GFX8-NEXT: v_mov_b32_e32 v10, 16
-; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v11, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT: v_mov_b32_e32 v9, s1
+; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX8-NEXT: s_movk_i32 s0, 0xff
 ; GFX8-NEXT: s_lshr_b32 s4, s2, 2
 ; GFX8-NEXT: s_lshl_b32 s0, s0, s1
@@ -6415,72 +6346,72 @@
 ; GFX8-NEXT: v_mov_b32_e32 v7, 0
 ; GFX8-NEXT: v_mov_b32_e32 v8, 0
 ; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshlrev_b32_sdwa v15, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX8-NEXT: v_lshrrev_b32_e32 v11, 24, v3
-; GFX8-NEXT: v_lshrrev_b32_e32 v12, 24, v4
-; GFX8-NEXT: v_lshlrev_b32_sdwa v16, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX8-NEXT: v_lshlrev_b32_sdwa v17, v9, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX8-NEXT: v_or_b32_sdwa v3, v3, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT: v_lshrrev_b32_e32 v13, 24, v5
-; GFX8-NEXT: v_lshlrev_b32_sdwa v18, v10, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX8-NEXT: v_lshlrev_b32_sdwa v19, v9, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX8-NEXT: v_or_b32_sdwa v4, v5, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT: v_lshlrev_b32_sdwa v13, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX8-NEXT: v_lshlrev_b32_sdwa v15, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX8-NEXT: v_lshrrev_b32_e32 v9, 24, v3
+; GFX8-NEXT: v_lshrrev_b32_e32 v10, 24, v4
+; GFX8-NEXT: v_lshlrev_b32_sdwa v14, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX8-NEXT: v_lshlrev_b32_sdwa v16, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX8-NEXT: v_lshlrev_b32_sdwa v17, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX8-NEXT: v_or_b32_sdwa v3, v3, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT: v_or_b32_sdwa v4, v4, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT: v_lshrrev_b32_e32 v11, 24, v5
+; GFX8-NEXT: v_lshlrev_b32_sdwa v18, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX8-NEXT: v_lshlrev_b32_sdwa v19, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX8-NEXT: v_or_b32_sdwa v5, v5, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT: v_lshlrev_b32_e32 v9, 24, v9
+; GFX8-NEXT: v_lshlrev_b32_e32 v10, 24, v10
+; GFX8-NEXT: v_or_b32_e32 v3, v3, v14
+; GFX8-NEXT: v_or_b32_e32 v4, v4, v16
+; GFX8-NEXT: v_lshrrev_b32_e32 v12, 24, v6
+; GFX8-NEXT: v_or_b32_sdwa v13, v6, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT: v_lshlrev_b32_sdwa v6, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX8-NEXT: v_lshlrev_b32_e32 v11, 24, v11
+; GFX8-NEXT: v_or_b32_e32 v5, v5, v18
+; GFX8-NEXT: v_or_b32_e32 v3, v3, v9
+; GFX8-NEXT: v_or_b32_e32 v4, v4, v10
 ; GFX8-NEXT: v_lshlrev_b32_e32 v12, 24, v12
-; GFX8-NEXT: v_or_b32_e32 v3, v3, v16
-; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX8-NEXT: v_lshrrev_b32_e32 v14, 24, v6
-; GFX8-NEXT: v_or_b32_sdwa v5, v6, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT: v_lshlrev_b32_sdwa v6, v10, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX8-NEXT: v_lshlrev_b32_e32 v13, 24, v13
-; GFX8-NEXT: v_or_b32_e32 v1, v4, v18
-; GFX8-NEXT: v_or_b32_e32 v3, v3, v11
-; GFX8-NEXT: v_or_b32_e32 v0, v0, v12
-; GFX8-NEXT: v_lshlrev_b32_e32 v14, 24, v14
-; GFX8-NEXT: v_or_b32_e32 v4, v5, v6
-; GFX8-NEXT: v_or_b32_e32 v1, v1, v13
-; GFX8-NEXT: v_cndmask_b32_e32 v5, v3, v0, vcc
-; GFX8-NEXT: v_or_b32_e32 v4, v4, v14
-; GFX8-NEXT: v_cndmask_b32_e64 v5, v5, v1, s[0:1]
-; GFX8-NEXT: v_cndmask_b32_e64 v5, v5, v4, s[2:3]
-; GFX8-NEXT: v_and_b32_e32 v5, s5, v5
-; GFX8-NEXT: v_or_b32_e32 v2, v5, v2
+; GFX8-NEXT: v_or_b32_e32 v6, v13, v6
+; GFX8-NEXT: v_or_b32_e32 v5, v5, v11
+; GFX8-NEXT: v_cndmask_b32_e32 v9, v3, v4, vcc
+; GFX8-NEXT: v_or_b32_e32 v6, v6, v12
+; GFX8-NEXT: v_cndmask_b32_e64 v9, v9, v5, s[0:1]
+; GFX8-NEXT: v_cndmask_b32_e64 v9, v9, v6, s[2:3]
+; GFX8-NEXT: v_and_b32_e32 v9, s5, v9
+; GFX8-NEXT: v_or_b32_e32 v2, v9, v2
 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], s4, 0
 ; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v2, s[4:5]
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[0:1]
-; GFX8-NEXT: v_cndmask_b32_e64 v2, v4, v2, s[2:3]
-; GFX8-NEXT: v_lshlrev_b32_sdwa v12, v9, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX8-NEXT: v_lshlrev_b32_sdwa v14, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX8-NEXT: v_lshlrev_b32_sdwa v16, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX8-NEXT: v_lshlrev_b32_sdwa v9, v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX8-NEXT: v_lshrrev_b32_e32 v4, 24, v3
-; GFX8-NEXT: v_lshrrev_b32_e32 v5, 24, v0
-; GFX8-NEXT: v_lshrrev_b32_e32 v6, 24, v1
+; GFX8-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v5, v5, v2, s[0:1]
+; GFX8-NEXT: v_cndmask_b32_e64 v2, v6, v2, s[2:3]
+; GFX8-NEXT: v_lshlrev_b32_sdwa v12, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX8-NEXT: v_lshlrev_b32_sdwa v14, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX8-NEXT: v_lshlrev_b32_sdwa v16, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX8-NEXT: v_lshrrev_b32_e32 v6, 24, v3
+; GFX8-NEXT: v_lshrrev_b32_e32 v9, 24, v4
+; GFX8-NEXT: v_lshrrev_b32_e32 v10, 24, v5
 ; GFX8-NEXT: v_lshrrev_b32_e32 v11, 24, v2
-; GFX8-NEXT: v_lshlrev_b32_sdwa v13, v10, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX8-NEXT: v_lshlrev_b32_sdwa v15, v10, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX8-NEXT: v_lshlrev_b32_sdwa v17, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX8-NEXT: v_lshlrev_b32_sdwa v10, v10, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX8-NEXT: v_lshlrev_b32_sdwa v13, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX8-NEXT: v_lshlrev_b32_sdwa v15, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX8-NEXT: v_lshlrev_b32_sdwa v17, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX8-NEXT: v_or_b32_sdwa v3, v3, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT: v_or_b32_sdwa v0, v0, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT: v_or_b32_sdwa v1, v1, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT: v_or_b32_sdwa v2, v2, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT: v_lshlrev_b32_e32 v4, 24, v4
-; GFX8-NEXT: v_lshlrev_b32_e32 v5, 24, v5
+; GFX8-NEXT: v_or_b32_sdwa v4, v4, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT: v_or_b32_sdwa v5, v5, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX8-NEXT: v_lshlrev_b32_e32 v6, 24, v6
-; GFX8-NEXT: v_lshlrev_b32_e32 v9, 24, v11
-; GFX8-NEXT: v_or_b32_e32 v3, v3, v13
-; GFX8-NEXT: v_or_b32_e32 v11, v0, v15
-; GFX8-NEXT: v_or_b32_e32 v12, v1, v17
-; GFX8-NEXT: v_or_b32_e32 v10, v2, v10
-; GFX8-NEXT: v_or_b32_e32 v0, v3, v4
-; GFX8-NEXT: v_or_b32_e32 v1, v11, v5
-; GFX8-NEXT: v_or_b32_e32 v2, v12, v6
-; GFX8-NEXT: v_or_b32_e32 v3, v10, v9
+; GFX8-NEXT: v_lshlrev_b32_e32 v9, 24, v9
+; GFX8-NEXT: v_lshlrev_b32_e32 v10, 24, v10
+; GFX8-NEXT: v_lshlrev_b32_e32 v11, 24, v11
+; GFX8-NEXT: v_or_b32_e32 v2, v3, v13
+; GFX8-NEXT: v_or_b32_e32 v3, v4, v15
+; GFX8-NEXT: v_or_b32_e32 v4, v5, v17
+; GFX8-NEXT: v_or_b32_e32 v5, v0, v1
+; GFX8-NEXT: v_or_b32_e32 v0, v2, v6
+; GFX8-NEXT: v_or_b32_e32 v1, v3, v9
+; GFX8-NEXT: v_or_b32_e32 v2, v4, v10
+; GFX8-NEXT: v_or_b32_e32 v3, v5, v11
 ; GFX8-NEXT: flat_store_dwordx4 v[7:8], v[0:3]
 ; GFX8-NEXT: s_endpgm
 ;
@@ -6599,21 +6530,26 @@
 ; GFX10-LABEL: insertelement_v_v16i8_v_s:
 ; GFX10: ; %bb.0:
 ; GFX10-NEXT: global_load_dwordx4 v[3:6], v[0:1], off
-; GFX10-NEXT: s_mov_b32 s0, 8
 ; GFX10-NEXT: v_mov_b32_e32 v0, 8
-; GFX10-NEXT: s_mov_b32 s1, 16
-; GFX10-NEXT: s_movk_i32 s3, 0xff
 ; GFX10-NEXT: v_mov_b32_e32 v1, 16
+; GFX10-NEXT: s_movk_i32 s3, 0xff
 ; GFX10-NEXT: s_lshr_b32 s4, s2, 2
+; GFX10-NEXT: s_and_b32 s1, s2, 3
 ; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s4, 1
+; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s4, 2
+; GFX10-NEXT: s_lshl_b32 s2, s1, 3
+; GFX10-NEXT: v_cmp_eq_u32_e64 s1, s4, 3
+; GFX10-NEXT: v_lshlrev_b32_sdwa v2, s2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX10-NEXT: s_lshl_b32 s2, s3, s2
+; GFX10-NEXT: s_not_b32 s2, s2
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: v_lshrrev_b32_e32 v7, 24, v3
 ; GFX10-NEXT: v_lshrrev_b32_e32 v8, 24, v4
-; GFX10-NEXT: v_lshlrev_b32_sdwa v11, s0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX10-NEXT: v_lshlrev_b32_sdwa v13, s0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX10-NEXT: v_lshlrev_b32_sdwa v11, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX10-NEXT: v_lshlrev_b32_sdwa v13, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX10-NEXT: v_lshrrev_b32_e32 v9, 24, v5
-; GFX10-NEXT: v_lshlrev_b32_sdwa v12, s1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX10-NEXT: v_lshlrev_b32_sdwa v14, s1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX10-NEXT: v_lshlrev_b32_sdwa v12, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX10-NEXT: v_lshlrev_b32_sdwa v14, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX10-NEXT: v_lshlrev_b32_sdwa v15, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX10-NEXT: v_and_or_b32 v3, v3, s3, v11
 ; GFX10-NEXT: v_lshlrev_b32_e32 v7, 24, v7
@@ -6631,15 +6567,8 @@
 ; GFX10-NEXT: v_lshlrev_b32_e32 v7, 24, v10
 ; GFX10-NEXT: v_or3_b32 v5, v5, v16, v9
 ; GFX10-NEXT: v_cndmask_b32_e32 v8, v3, v4, vcc_lo
-; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s4, 2
-; GFX10-NEXT: s_and_b32 s1, s2, 3
 ; GFX10-NEXT: v_or3_b32 v6, v6, v11, v7
-; GFX10-NEXT: s_lshl_b32 s2, s1, 3
-; GFX10-NEXT: v_cmp_eq_u32_e64 s1, s4, 3
 ; GFX10-NEXT: v_cndmask_b32_e64 v7, v8, v5, s0
-; GFX10-NEXT: v_lshlrev_b32_sdwa v2, s2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX10-NEXT: s_lshl_b32 s2, s3, s2
-; GFX10-NEXT: s_not_b32 s2, s2
 ; GFX10-NEXT: v_cndmask_b32_e64 v7, v7, v6, s1
 ; GFX10-NEXT: v_and_or_b32 v2, v7, s2, v2
 ; GFX10-NEXT: v_cmp_eq_u32_e64 s2, s4, 0
@@ -6685,25 +6614,22 @@
 ; GFX9-LABEL: insertelement_v_v16i8_v_v:
 ; GFX9: ; %bb.0:
 ; GFX9-NEXT: global_load_dwordx4 v[4:7], v[0:1], off
-; GFX9-NEXT: s_mov_b32 s0, 8
 ; GFX9-NEXT: v_mov_b32_e32 v1, 8
-; GFX9-NEXT: s_mov_b32 s1, 16
-; GFX9-NEXT: s_movk_i32 s2, 0xff
 ; GFX9-NEXT: v_mov_b32_e32 v0, 0xff
 ; GFX9-NEXT: v_mov_b32_e32 v8, 16
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: v_lshrrev_b32_e32 v9, 24, v4
 ; GFX9-NEXT: v_lshrrev_b32_e32 v10, 24, v5
-; GFX9-NEXT: v_lshlrev_b32_sdwa v13, s0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX9-NEXT: v_lshlrev_b32_sdwa v15, s0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX9-NEXT: v_lshlrev_b32_sdwa v13, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX9-NEXT: v_lshlrev_b32_sdwa v15, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX9-NEXT: v_lshlrev_b32_sdwa v17, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX9-NEXT: v_lshlrev_b32_sdwa v19, v1, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX9-NEXT: v_lshrrev_b32_e32 v11, 24, v6
-; GFX9-NEXT: v_lshlrev_b32_sdwa v14, s1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX9-NEXT: v_lshlrev_b32_sdwa v16, s1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX9-NEXT: v_lshlrev_b32_sdwa v14, v8, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX9-NEXT: v_lshlrev_b32_sdwa v16, v8, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX9-NEXT: v_lshlrev_b32_sdwa v18, v8, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX9-NEXT: v_and_or_b32 v13, v4, s2, v13
-; GFX9-NEXT: v_and_or_b32 v15, v5, s2, v15
+; GFX9-NEXT: v_and_or_b32 v13, v4, v0, v13
+; GFX9-NEXT: v_and_or_b32 v15, v5, v0, v15
 ; GFX9-NEXT: v_and_or_b32 v6, v6, v0, v17
 ; GFX9-NEXT: v_and_or_b32 v17, v7, v0, v19
 ; GFX9-NEXT: v_lshrrev_b32_e32 v19, 2, v3
@@ -6766,89 +6692,87 @@
 ; GFX8-LABEL: insertelement_v_v16i8_v_v:
 ; GFX8: ; %bb.0:
 ; GFX8-NEXT: flat_load_dwordx4 v[4:7], v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v9, 8
 ; GFX8-NEXT: v_mov_b32_e32 v1, 8
-; GFX8-NEXT: v_mov_b32_e32 v10, 16
 ; GFX8-NEXT: v_mov_b32_e32 v8, 16
 ; GFX8-NEXT: v_mov_b32_e32 v0, 0xff
 ; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshlrev_b32_sdwa v17, v9, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX8-NEXT: v_lshlrev_b32_sdwa v19, v9, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX8-NEXT: v_lshrrev_b32_e32 v13, 24, v6
-; GFX8-NEXT: v_lshlrev_b32_sdwa v15, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX8-NEXT: v_lshlrev_b32_sdwa v18, v10, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX8-NEXT: v_lshlrev_b32_sdwa v17, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX8-NEXT: v_lshlrev_b32_sdwa v19, v1, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX8-NEXT: v_lshrrev_b32_e32 v11, 24, v6
+; GFX8-NEXT: v_lshlrev_b32_sdwa v13, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX8-NEXT: v_lshlrev_b32_sdwa v15, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX8-NEXT: v_lshlrev_b32_sdwa v18, v8, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX8-NEXT: v_or_b32_sdwa v6, v6, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX8-NEXT: v_or_b32_sdwa v17, v7, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX8-NEXT: v_lshrrev_b32_e32 v19, 2, v3
 ; GFX8-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX8-NEXT: v_lshrrev_b32_e32 v11, 24, v4
-; GFX8-NEXT: v_lshrrev_b32_e32 v12, 24, v5
-; GFX8-NEXT: v_lshrrev_b32_e32 v14, 24, v7
-; GFX8-NEXT: v_lshlrev_b32_sdwa v16, v8, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX8-NEXT: v_lshlrev_b32_sdwa v8, v8, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX8-NEXT: v_or_b32_sdwa v15, v4, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT: v_or_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT: v_lshrrev_b32_e32 v9, 24, v4
+; GFX8-NEXT: v_lshrrev_b32_e32 v10, 24, v5
+; GFX8-NEXT: v_lshrrev_b32_e32 v12, 24, v7
+; GFX8-NEXT: v_lshlrev_b32_sdwa v14, v8, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX8-NEXT: v_lshlrev_b32_sdwa v16, v8, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX8-NEXT: v_or_b32_sdwa v13, v4, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT: v_or_b32_sdwa v15, v5, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 3, v3
 ; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, v3, v0
-; GFX8-NEXT: v_lshlrev_b32_e32 v3, 24, v11
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 24, v9
+; GFX8-NEXT: v_lshlrev_b32_e32 v9, 24, v10
+; GFX8-NEXT: v_lshlrev_b32_e32 v10, 24, v11
 ; GFX8-NEXT: v_lshlrev_b32_e32 v11, 24, v12
-; GFX8-NEXT: v_lshlrev_b32_e32 v12, 24, v13
-; GFX8-NEXT: v_lshlrev_b32_e32 v13, 24, v14
-; GFX8-NEXT: v_or_b32_e32 v14, v15, v16
-; GFX8-NEXT: v_or_b32_e32 v1, v1, v8
-; GFX8-NEXT: v_lshlrev_b32_sdwa v7, v10, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX8-NEXT: v_or_b32_e32 v12, v13, v14
+; GFX8-NEXT: v_or_b32_e32 v13, v15, v16
+; GFX8-NEXT: v_lshlrev_b32_sdwa v7, v8, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX8-NEXT: v_or_b32_e32 v6, v6, v18
-; GFX8-NEXT: v_or_b32_e32 v3, v14, v3
-; GFX8-NEXT: v_or_b32_e32 v1, v1, v11
+; GFX8-NEXT: v_or_b32_e32 v3, v12, v3
+; GFX8-NEXT: v_or_b32_e32 v9, v13, v9
 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v19
 ; GFX8-NEXT: v_or_b32_e32 v7, v17, v7
-; GFX8-NEXT: v_or_b32_e32 v6, v6, v12
-; GFX8-NEXT: v_cndmask_b32_e32 v8, v3, v1, vcc
+; GFX8-NEXT: v_or_b32_e32 v6, v6, v10
+; GFX8-NEXT: v_cndmask_b32_e32 v10, v3, v9, vcc
 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v19
-; GFX8-NEXT: v_or_b32_e32 v7, v7, v13
-; GFX8-NEXT: v_cndmask_b32_e64 v8, v8, v6, s[0:1]
+; GFX8-NEXT: v_or_b32_e32 v7, v7, v11
+; GFX8-NEXT: v_cndmask_b32_e64 v10, v10, v6, s[0:1]
 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v19
 ; GFX8-NEXT: v_xor_b32_e32 v0, -1, v0
-; GFX8-NEXT: v_cndmask_b32_e64 v8, v8, v7, s[2:3]
-; GFX8-NEXT: v_and_b32_e32 v0, v8, v0
+; GFX8-NEXT: v_cndmask_b32_e64 v10, v10, v7, s[2:3]
+; GFX8-NEXT: v_and_b32_e32 v0, v10, v0
 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v2
 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v19
 ; GFX8-NEXT: v_cndmask_b32_e64 v2, v3, v0, s[4:5]
-; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v0, s[0:1]
+; GFX8-NEXT: v_cndmask_b32_e32 v3, v9, v0, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v6, v6, v0, s[0:1]
 ; GFX8-NEXT: v_cndmask_b32_e64 v0, v7, v0, s[2:3]
-; GFX8-NEXT: v_lshlrev_b32_sdwa v12, v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX8-NEXT: v_lshlrev_b32_sdwa v14, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX8-NEXT: v_lshlrev_b32_sdwa v16, v9, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX8-NEXT: v_lshlrev_b32_sdwa v9, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX8-NEXT: v_lshrrev_b32_e32 v6, 24, v2
-; GFX8-NEXT: v_lshrrev_b32_e32 v7, 24, v1
-; GFX8-NEXT: v_lshrrev_b32_e32 v8, 24, v3
+; GFX8-NEXT: v_lshlrev_b32_sdwa v12, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX8-NEXT: v_lshlrev_b32_sdwa v14, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX8-NEXT: v_lshlrev_b32_sdwa v16, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX8-NEXT: v_lshrrev_b32_e32 v7, 24, v2
+; GFX8-NEXT: v_lshrrev_b32_e32 v9, 24, v3
+; GFX8-NEXT: v_lshrrev_b32_e32 v10, 24, v6
 ; GFX8-NEXT: v_lshrrev_b32_e32 v11, 24, v0
-; GFX8-NEXT: v_lshlrev_b32_sdwa v13, v10, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX8-NEXT: v_lshlrev_b32_sdwa v15, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX8-NEXT: v_lshlrev_b32_sdwa v17, v10, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX8-NEXT: v_lshlrev_b32_sdwa v10, v10, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX8-NEXT: v_lshlrev_b32_sdwa v13, v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX8-NEXT: v_lshlrev_b32_sdwa v15, v8, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX8-NEXT: v_lshlrev_b32_sdwa v17, v8, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX8-NEXT: v_lshlrev_b32_sdwa v8, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX8-NEXT: v_or_b32_sdwa v2, v2, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT: v_or_b32_sdwa v1, v1, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT: v_or_b32_sdwa v3, v3, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT: v_or_b32_sdwa v0, v0, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT: v_or_b32_sdwa v3, v3, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT: v_or_b32_sdwa v6, v6, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX8-NEXT: v_mov_b32_e32 v4, 0
-; GFX8-NEXT: v_lshlrev_b32_e32 v6, 24, v6
 ; GFX8-NEXT: v_lshlrev_b32_e32 v7, 24, v7
-; GFX8-NEXT: v_lshlrev_b32_e32 v8, 24, v8
-; GFX8-NEXT: v_lshlrev_b32_e32 v9, 24, v11
-; GFX8-NEXT: v_or_b32_e32 v2, v2, v13
-; GFX8-NEXT: v_or_b32_e32 v1, v1, v15
-; GFX8-NEXT: v_or_b32_e32 v3, v3, v17
-; GFX8-NEXT: v_or_b32_e32 v10, v0, v10
+; GFX8-NEXT: v_lshlrev_b32_e32 v9, 24, v9
+; GFX8-NEXT: v_lshlrev_b32_e32 v10, 24, v10
+; GFX8-NEXT: v_lshlrev_b32_e32 v11, 24, v11
+; GFX8-NEXT: v_or_b32_e32 v1, v2, v13
+; GFX8-NEXT: v_or_b32_e32 v2, v3, v15
+; GFX8-NEXT: v_or_b32_e32 v3, v6, v17
+; GFX8-NEXT: v_or_b32_e32 v6, v0, v8
 ; GFX8-NEXT: v_mov_b32_e32 v5, 0
-; GFX8-NEXT: v_or_b32_e32 v0, v2, v6
-; GFX8-NEXT: v_or_b32_e32 v1, v1, v7
-; GFX8-NEXT: v_or_b32_e32 v2, v3, v8
-; GFX8-NEXT: v_or_b32_e32 v3, v10, v9
+; GFX8-NEXT: v_or_b32_e32 v0, v1, v7
+; GFX8-NEXT: v_or_b32_e32 v1, v2, v9
+; GFX8-NEXT: v_or_b32_e32 v2, v3, v10
+; GFX8-NEXT: v_or_b32_e32 v3, v6, v11
 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
 ; GFX8-NEXT: s_endpgm
 ;
@@ -6858,13 +6782,13 @@
 ; GFX7-NEXT: s_mov_b32 s11, 0xf000
 ; GFX7-NEXT: s_mov_b64 s[8:9], 0
 ; GFX7-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[8:11], 0 addr64
-; GFX7-NEXT: s_movk_i32 s0, 0xff
 ; GFX7-NEXT: v_mov_b32_e32 v8, 0xff
 ; GFX7-NEXT: v_lshrrev_b32_e32 v19, 2, v3
 ; GFX7-NEXT: v_and_b32_e32 v3, 3, v3
 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v19
 ; GFX7-NEXT: v_and_b32_e32 v2, v2, v8
 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v19
 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, v3, v2
 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, v3, v8
 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v19
@@ -6876,9 +6800,9 @@
 ; GFX7-NEXT: v_bfe_u32 v14, v5, 8, 8
 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 24, v4
 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 24, v5
-; GFX7-NEXT: v_and_b32_e32 v11, s0, v4
+; GFX7-NEXT: v_and_b32_e32 v11, v4, v8
 ; GFX7-NEXT: v_bfe_u32 v4, v4, 16, 8
-; GFX7-NEXT: v_and_b32_e32 v13, s0, v5
+; GFX7-NEXT: v_and_b32_e32 v13, v5, v8
 ; GFX7-NEXT: v_bfe_u32 v5, v5, 16, 8
 ; GFX7-NEXT: v_bfe_u32 v16, v6, 8, 8
 ; GFX7-NEXT: v_lshlrev_b32_e32 v12, 8, v12
@@ -6912,7 +6836,6 @@
 ; GFX7-NEXT: v_or_b32_e32 v7, v14, v7
 ; GFX7-NEXT: v_or_b32_e32 v4, v6, v9
 ; GFX7-NEXT: v_cndmask_b32_e32 v6, v0, v1, vcc
-; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v19
 ; GFX7-NEXT: v_or_b32_e32 v5, v7, v10
 ; GFX7-NEXT: v_cndmask_b32_e64 v6, v6, v4, s[0:1]
 ; GFX7-NEXT: v_cndmask_b32_e64 v6, v6, v5, s[2:3]
@@ -6920,36 +6843,36 @@
 ; GFX7-NEXT: v_or_b32_e32 v2, v3, v2
 ; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[4:5]
 ; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
-; GFX7-NEXT: v_bfe_u32 v10, v0, 8, 8
 ; GFX7-NEXT: v_cndmask_b32_e64 v3, v4, v2, s[0:1]
+; GFX7-NEXT: v_bfe_u32 v10, v0, 8, 8
+; GFX7-NEXT: v_bfe_u32 v12, v1, 8, 8
 ; GFX7-NEXT: v_cndmask_b32_e64 v4, v5, v2, s[2:3]
 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 24, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v5, 24, v1
 ; GFX7-NEXT: v_and_b32_e32 v9, v0, v8
 ; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 8
-; GFX7-NEXT: v_bfe_u32 v12, v1, 8, 8
-; GFX7-NEXT: v_lshlrev_b32_e32 v10, 8, v10
-; GFX7-NEXT: v_lshrrev_b32_e32 v5, 24, v1
 ; GFX7-NEXT: v_and_b32_e32 v11, v1, v8
 ; GFX7-NEXT: v_bfe_u32 v1, v1, 16, 8
 ; GFX7-NEXT: v_bfe_u32 v14, v3, 8, 8
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v10, 8, v10
 ; GFX7-NEXT: v_lshlrev_b32_e32 v12, 8, v12
-; GFX7-NEXT: v_or_b32_e32 v9, v9, v10
 ; GFX7-NEXT: v_lshrrev_b32_e32 v6, 24, v3
 ; GFX7-NEXT: v_and_b32_e32 v13, v3, v8
 ; GFX7-NEXT: v_bfe_u32 v3, v3, 16, 8
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, 24, v2
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
 ; GFX7-NEXT: v_lshlrev_b32_e32 v14, 8, v14
+; GFX7-NEXT: v_or_b32_e32 v9, v9, v10
 ; GFX7-NEXT: v_or_b32_e32 v10, v11, v12
-; GFX7-NEXT: v_or_b32_e32 v0, v9, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 24, v2
 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 24, v5
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
 ; GFX7-NEXT: v_or_b32_e32 v11, v13, v14
+; GFX7-NEXT: v_or_b32_e32 v0, v9, v0
 ; GFX7-NEXT: v_or_b32_e32 v1, v10, v1
 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v2
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3
 ; GFX7-NEXT: v_or_b32_e32 v1, v1, v5
-; GFX7-NEXT: v_or_b32_e32 v2, v11, v2
+; GFX7-NEXT: v_or_b32_e32 v2, v11, v3
 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 24, v6
 ; GFX7-NEXT: v_bfe_u32 v5, v4, 8, 8
 ; GFX7-NEXT: v_lshrrev_b32_e32 v7, 24, v4
@@ -6968,28 +6891,28 @@
 ; GFX10-LABEL: insertelement_v_v16i8_v_v:
 ; GFX10: ; %bb.0:
 ; GFX10-NEXT: global_load_dwordx4 v[4:7], v[0:1], off
-; GFX10-NEXT: s_mov_b32 s0, 8
 ; GFX10-NEXT: v_mov_b32_e32 v8, 8
-; GFX10-NEXT: s_mov_b32 s1, 16
-; GFX10-NEXT: s_movk_i32 s2, 0xff
-; GFX10-NEXT: v_and_b32_e32 v0, 3, v3
 ; GFX10-NEXT: v_mov_b32_e32 v1, 0xff
 ; GFX10-NEXT: v_mov_b32_e32 v9, 16
+; GFX10-NEXT: v_and_b32_e32 v0, 3, v3
 ; GFX10-NEXT: v_lshrrev_b32_e32 v3, 2, v3
 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0
 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3
+; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 2, v3
+; GFX10-NEXT: v_cmp_eq_u32_e64 s1, 3, v3
+; GFX10-NEXT: v_cmp_eq_u32_e64 s2, 0, v3
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: v_lshrrev_b32_e32 v10, 24, v4
 ; GFX10-NEXT: v_lshrrev_b32_e32 v11, 24, v5
-; GFX10-NEXT: v_lshlrev_b32_sdwa v14, s0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX10-NEXT: v_lshlrev_b32_sdwa v16, s0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX10-NEXT: v_lshlrev_b32_sdwa v14, v8, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX10-NEXT: v_lshlrev_b32_sdwa v16, v8, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX10-NEXT: v_lshrrev_b32_e32 v12, 24, v6
-; GFX10-NEXT: v_lshlrev_b32_sdwa v15, s1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX10-NEXT: v_lshlrev_b32_sdwa v17, s1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX10-NEXT: v_lshlrev_b32_sdwa v15, v9, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX10-NEXT: v_lshlrev_b32_sdwa v17, v9, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX10-NEXT: v_lshlrev_b32_sdwa v18, v8, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX10-NEXT: v_and_or_b32 v4, v4, s2, v14
+; GFX10-NEXT: v_and_or_b32 v4, v4, v1, v14
 ; GFX10-NEXT: v_lshlrev_b32_e32 v10, 24, v10
-; GFX10-NEXT: v_and_or_b32 v5, v5, s2, v16
+; GFX10-NEXT: v_and_or_b32 v5, v5, v1, v16
 ; GFX10-NEXT: v_lshlrev_b32_e32 v11, 24, v11
 ; GFX10-NEXT: v_lshrrev_b32_e32 v13, 24, v7
 ; GFX10-NEXT: v_lshlrev_b32_sdwa v19, v9, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
@@ -7003,14 +6926,11 @@
 ; GFX10-NEXT: v_lshlrev_b32_e32 v10, 24, v13
 ; GFX10-NEXT: v_or3_b32 v6, v6, v19, v12
 ; GFX10-NEXT: v_cndmask_b32_e32 v11, v4, v5, vcc_lo
-; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 2, v3
 ; GFX10-NEXT: v_lshlrev_b32_e32 v12, v0, v1
-; GFX10-NEXT: v_or3_b32 v7, v7, v14, v10
-; GFX10-NEXT: v_cmp_eq_u32_e64 s1, 3, v3
 ; GFX10-NEXT: v_lshlrev_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX10-NEXT: v_or3_b32 v7, v7, v14, v10
 ; GFX10-NEXT: v_cndmask_b32_e64 v10, v11, v6, s0
 ; GFX10-NEXT: v_xor_b32_e32 v2, -1, v12
-; GFX10-NEXT: v_cmp_eq_u32_e64 s2, 0, v3
 ; GFX10-NEXT: v_cndmask_b32_e64 v10, v10, v7, s1
 ; GFX10-NEXT: v_and_or_b32 v0, v10, v2, v0
 ; GFX10-NEXT: v_cndmask_b32_e64 v2, v4, v0, s2
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.inc.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.inc.ll
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.inc.ll
@@ -53,10 +53,10 @@
 ; GFX9: ; %bb.0:
 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8
-; GFX9-NEXT: v_mov_b32_e32 v1, 42
+; GFX9-NEXT: v_mov_b32_e32 v0, 42
 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: ds_inc_rtn_u32 v0, v0, v1
+; GFX9-NEXT: v_mov_b32_e32 v1, s2
+; GFX9-NEXT: ds_inc_rtn_u32 v0, v1, v0
 ; GFX9-NEXT: v_mov_b32_e32 v1, 0
 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT: global_store_dword v1, v0, s[0:1]
@@ -65,11 +65,11 @@
 ; GFX10-LABEL: lds_atomic_inc_ret_i32:
 ; GFX10: ; %bb.0:
 ; GFX10-NEXT: s_load_dword s0, s[4:5], 0x8
-; GFX10-NEXT: v_mov_b32_e32 v1, 42
+; GFX10-NEXT: v_mov_b32_e32 v0, 42
 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v0, s0
+; GFX10-NEXT: v_mov_b32_e32 v1, s0
 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX10-NEXT: ds_inc_rtn_u32 v0, v0, v1
+; GFX10-NEXT: ds_inc_rtn_u32 v0, v1, v0
 ; GFX10-NEXT: v_mov_b32_e32 v1, 0
 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
@@ -169,19 +169,19 @@
 ; GFX9-LABEL: lds_atomic_inc_noret_i32:
 ; GFX9: ; %bb.0:
 ; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0
-; GFX9-NEXT: v_mov_b32_e32 v1, 42
+; GFX9-NEXT: v_mov_b32_e32 v0, 42
 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NEXT: ds_inc_rtn_u32 v0, v0, v1
+; GFX9-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-NEXT: ds_inc_rtn_u32 v0, v1, v0
 ; GFX9-NEXT: s_endpgm
 ;
 ; GFX10-LABEL: lds_atomic_inc_noret_i32:
 ; GFX10: ; %bb.0:
 ; GFX10-NEXT: s_load_dword s0, s[4:5], 0x0
-; GFX10-NEXT: v_mov_b32_e32 v1, 42
+; GFX10-NEXT: v_mov_b32_e32 v0, 42
 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v0, s0
-; GFX10-NEXT: ds_inc_rtn_u32 v0, v0, v1
+; GFX10-NEXT: v_mov_b32_e32 v1, s0
+; GFX10-NEXT: ds_inc_rtn_u32 v0, v1, v0
 ; GFX10-NEXT: s_endpgm
 %result = call i32 @llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* %ptr, i32 42, i32 0, i32 0, i1 false)
 ret void
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.1d.d16.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.1d.d16.ll
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.1d.d16.ll
@@ -587,10 +587,10 @@
 ; GFX8-UNPACKED-NEXT: s_mov_b32 s6, s8
 ; GFX8-UNPACKED-NEXT: s_mov_b32 s7, s9
 ; GFX8-UNPACKED-NEXT: image_load v[0:3], v0, s[0:7] dmask:0xf unorm d16
-; GFX8-UNPACKED-NEXT: s_mov_b32 s0, 0xffff
+; GFX8-UNPACKED-NEXT: v_mov_b32_e32 v4, 0xffff
 ; GFX8-UNPACKED-NEXT: s_waitcnt vmcnt(0)
-; GFX8-UNPACKED-NEXT: v_and_b32_e32 v1, s0, v1
-; GFX8-UNPACKED-NEXT: v_and_b32_e32 v3, s0, v3
+; GFX8-UNPACKED-NEXT: v_and_b32_e32 v1, v1, v4
+; GFX8-UNPACKED-NEXT: v_and_b32_e32 v3, v3, v4
 ; GFX8-UNPACKED-NEXT: v_lshlrev_b32_e32 v1, 16, v1
 ; GFX8-UNPACKED-NEXT: v_lshlrev_b32_e32 v3, 16, v3
 ; GFX8-UNPACKED-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll
@@ -59,21 +59,37 @@
 }
 define amdgpu_ps <4 x float> @image_bvh_intersect_ray_a16(i32 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x half> %ray_dir, <4 x half> %ray_inv_dir, <4 x i32> inreg %tdescr) {
-; GCN-LABEL: image_bvh_intersect_ray_a16:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_mov_b32 s4, 0xffff
-; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v6
-; GCN-NEXT: v_and_b32_e32 v10, s4, v8
-; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8
-; GCN-NEXT: v_and_b32_e32 v9, s4, v9
-; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10
-; GCN-NEXT: v_and_or_b32 v5, v6, s4, v5
-; GCN-NEXT: v_and_or_b32 v6, v7, s4, v10
-; GCN-NEXT: v_lshl_or_b32 v7, v9, 16, v8
-; GCN-NEXT: image_bvh_intersect_ray v[0:3], v[0:7], s[0:3] a16
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: ; return to shader part epilog
+; GFX1030-LABEL: image_bvh_intersect_ray_a16:
+; GFX1030: ; %bb.0:
+; GFX1030-NEXT: v_mov_b32_e32 v5, 0xffff
+; GFX1030-NEXT: v_lshrrev_b32_e32 v10, 16, v6
+; GFX1030-NEXT: v_and_b32_e32 v11, v8, v5
+; GFX1030-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; GFX1030-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GFX1030-NEXT: v_and_b32_e32 v9, v9, v5
+; GFX1030-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GFX1030-NEXT: v_and_or_b32 v6, v6, v5, v10
+; GFX1030-NEXT: v_and_or_b32 v5, v7, v5, v11
+; GFX1030-NEXT: v_lshl_or_b32 v7, v9, 16, v8
+; GFX1030-NEXT: image_bvh_intersect_ray v[0:3], [v0, v1, v2, v3, v4, v6, v5, v7], s[0:3] a16
+; GFX1030-NEXT: s_waitcnt vmcnt(0)
+; GFX1030-NEXT: ; return to shader part epilog
+;
+; GFX1013-LABEL: image_bvh_intersect_ray_a16:
+; GFX1013: ; %bb.0:
+; GFX1013-NEXT: v_mov_b32_e32 v10, 0xffff
+; GFX1013-NEXT: v_lshrrev_b32_e32 v5, 16, v6
+; GFX1013-NEXT: v_and_b32_e32 v11, v8, v10
+; GFX1013-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; GFX1013-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX1013-NEXT: v_and_b32_e32 v9, v9, v10
+; GFX1013-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GFX1013-NEXT: v_and_or_b32 v5, v6, v10, v5
+; GFX1013-NEXT: v_and_or_b32 v6, v7, v10, v11
+; GFX1013-NEXT: v_lshl_or_b32 v7, v9, 16, v8
+; GFX1013-NEXT: image_bvh_intersect_ray v[0:3], v[0:7], s[0:3] a16
+; GFX1013-NEXT: s_waitcnt vmcnt(0)
+; GFX1013-NEXT: ; return to shader part epilog
 %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f16(i32 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x half> %ray_dir, <4 x half> %ray_inv_dir, <4 x i32> %tdescr)
 %r = bitcast <4 x i32> %v to <4 x float>
 ret <4 x float> %r
@@ -124,21 +140,37 @@
 }
 define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_a16(i64 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x half> %ray_dir, <4 x half> %ray_inv_dir, <4 x i32> inreg %tdescr) {
-; GCN-LABEL: image_bvh64_intersect_ray_a16:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_mov_b32 s4, 0xffff
-; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v7
-; GCN-NEXT: v_and_b32_e32 v11, s4, v9
-; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9
-; GCN-NEXT: v_and_b32_e32 v10, s4, v10
-; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6
-; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11
-; GCN-NEXT: v_and_or_b32 v6, v7, s4, v6
-; GCN-NEXT: v_and_or_b32 v7, v8, s4, v11
-; GCN-NEXT: v_lshl_or_b32 v8, v10, 16, v9
-; GCN-NEXT: image_bvh64_intersect_ray v[0:3], v[0:15], s[0:3] a16
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: ; return to shader part epilog
+; GFX1030-LABEL: image_bvh64_intersect_ray_a16:
+; GFX1030: ; %bb.0:
+; GFX1030-NEXT: v_mov_b32_e32 v6, 0xffff
+; GFX1030-NEXT: v_lshrrev_b32_e32 v11, 16, v7
+; GFX1030-NEXT: v_and_b32_e32 v12, v9, v6
+; GFX1030-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GFX1030-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GFX1030-NEXT: v_and_b32_e32 v10, v10, v6
+; GFX1030-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; GFX1030-NEXT: v_and_or_b32 v7, v7, v6, v11
+; GFX1030-NEXT: v_and_or_b32 v6, v8, v6, v12
+; GFX1030-NEXT: v_lshl_or_b32 v8, v10, 16, v9
+; GFX1030-NEXT: image_bvh64_intersect_ray v[0:3], [v0, v1, v2, v3, v4, v5, v7, v6, v8], s[0:3] a16
+; GFX1030-NEXT: s_waitcnt vmcnt(0)
+; GFX1030-NEXT: ; return to shader part epilog
+;
+; GFX1013-LABEL: image_bvh64_intersect_ray_a16:
+; GFX1013: ; %bb.0:
+; GFX1013-NEXT: v_mov_b32_e32 v11, 0xffff
+; GFX1013-NEXT: v_lshrrev_b32_e32 v6, 16, v7
+; GFX1013-NEXT: v_and_b32_e32 v12, v9, v11
+; GFX1013-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GFX1013-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX1013-NEXT: v_and_b32_e32 v10, v10, v11
+; GFX1013-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; GFX1013-NEXT: v_and_or_b32 v6, v7, v11, v6
+; GFX1013-NEXT: v_and_or_b32 v7, v8, v11, v12
+; GFX1013-NEXT: v_lshl_or_b32 v8, v10, 16, v9
+; GFX1013-NEXT: image_bvh64_intersect_ray v[0:3], v[0:15], s[0:3] a16
+; GFX1013-NEXT: s_waitcnt vmcnt(0)
+; GFX1013-NEXT: ; return to shader part epilog
 %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f16(i64 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x half> %ray_dir, <4 x half> %ray_inv_dir, <4 x i32> %tdescr)
 %r = bitcast <4 x i32> %v to <4 x float>
 ret <4 x float> %r
@@ -232,22 +264,22 @@
 define amdgpu_ps <4 x float> @image_bvh_intersect_ray_a16_vgpr_descr(i32 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x half> %ray_dir, <4 x half> %ray_inv_dir, <4 x i32> %tdescr) {
 ; GFX1030-LABEL: image_bvh_intersect_ray_a16_vgpr_descr:
 ; GFX1030: ; %bb.0:
-; GFX1030-NEXT: s_mov_b32 s0, 0xffff
 ; GFX1030-NEXT: v_mov_b32_e32 v14, v0
+; GFX1030-NEXT: v_mov_b32_e32 v0, 0xffff
 ; GFX1030-NEXT: v_mov_b32_e32 v15, v1
-; GFX1030-NEXT: v_lshrrev_b32_e32 v0, 16, v6
-; GFX1030-NEXT: v_and_b32_e32 v1, s0, v8
 ; GFX1030-NEXT: v_mov_b32_e32 v16, v2
+; GFX1030-NEXT: v_lshrrev_b32_e32 v1, 16, v6
 ; GFX1030-NEXT: v_mov_b32_e32 v17, v3
-; GFX1030-NEXT: v_lshrrev_b32_e32 v2, 16, v8
-; GFX1030-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX1030-NEXT: v_and_b32_e32 v2, v8, v0
+; GFX1030-NEXT: v_lshrrev_b32_e32 v3, 16, v8
+; GFX1030-NEXT: v_and_b32_e32 v8, v9, v0
 ; GFX1030-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX1030-NEXT: v_and_b32_e32 v3, s0, v9
 ; GFX1030-NEXT: v_mov_b32_e32 v18, v4
+; GFX1030-NEXT: v_lshlrev_b32_e32 v2, 16, v2
 ; GFX1030-NEXT: s_mov_b32 s1, exec_lo
-; GFX1030-NEXT: v_and_or_b32 v19, v6, s0, v0
-; GFX1030-NEXT: v_and_or_b32 v20, v7, s0, v1
-; GFX1030-NEXT: v_lshl_or_b32 v21, v3, 16, v2
+; GFX1030-NEXT: v_lshl_or_b32 v21, v8, 16, v3
+; GFX1030-NEXT: v_and_or_b32 v19, v6, v0, v1
+; GFX1030-NEXT: v_and_or_b32 v20, v7, v0, v2
 ; GFX1030-NEXT: BB7_1: ; =>This Inner Loop Header: Depth=1
 ; GFX1030-NEXT: v_readfirstlane_b32 s4, v10
 ; GFX1030-NEXT: v_readfirstlane_b32 s5, v11
@@ -277,16 +309,16 @@
 ;
 ; GFX1013-LABEL: image_bvh_intersect_ray_a16_vgpr_descr:
 ; GFX1013: ; %bb.0:
-; GFX1013-NEXT: s_mov_b32 s0, 0xffff
+; GFX1013-NEXT: v_mov_b32_e32 v14, 0xffff
 ; GFX1013-NEXT: v_lshrrev_b32_e32 v5, 16, v6
-; GFX1013-NEXT: v_and_b32_e32 v14, s0, v8
-; GFX1013-NEXT: v_lshrrev_b32_e32 v8, 16, v8
-; GFX1013-NEXT: v_and_b32_e32 v9, s0, v9
 ; GFX1013-NEXT: s_mov_b32 s1, exec_lo
+; GFX1013-NEXT: v_and_b32_e32 v15, v8, v14
+; GFX1013-NEXT: v_lshrrev_b32_e32 v8, 16, v8
 ; GFX1013-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX1013-NEXT: v_lshlrev_b32_e32 v14, 16, v14
-; GFX1013-NEXT: v_and_or_b32 v5, v6, s0, v5
-; GFX1013-NEXT: v_and_or_b32 v6, v7, s0, v14
+; GFX1013-NEXT: v_and_b32_e32 v9, v9, v14
+; GFX1013-NEXT: v_lshlrev_b32_e32 v15, 16, v15
+; GFX1013-NEXT: v_and_or_b32 v5, v6, v14, v5
+; GFX1013-NEXT: v_and_or_b32 v6, v7, v14, v15
 ; GFX1013-NEXT: v_lshl_or_b32 v7, v9, 16, v8
 ; GFX1013-NEXT: BB7_1: ; =>This Inner Loop Header: Depth=1
 ; GFX1013-NEXT: v_readfirstlane_b32 s4, v10
@@ -407,23 +439,23 @@
 define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_a16_vgpr_descr(i64 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x half> %ray_dir, <4 x half> %ray_inv_dir, <4 x i32> %tdescr) {
 ; GFX1030-LABEL: image_bvh64_intersect_ray_a16_vgpr_descr:
 ; GFX1030: ; %bb.0:
-; GFX1030-NEXT: s_mov_b32 s0, 0xffff
 ; GFX1030-NEXT: v_mov_b32_e32 v15, v0
+; GFX1030-NEXT: v_mov_b32_e32 v0, 0xffff
 ; GFX1030-NEXT: v_mov_b32_e32 v16, v1
-; GFX1030-NEXT: v_lshrrev_b32_e32 v0, 16, v7
-; GFX1030-NEXT: v_and_b32_e32 v1, s0, v9
 ; GFX1030-NEXT: v_mov_b32_e32 v17, v2
+; GFX1030-NEXT: v_lshrrev_b32_e32 v1, 16, v7
 ; GFX1030-NEXT: v_mov_b32_e32 v18, v3
-; GFX1030-NEXT: v_lshrrev_b32_e32 v2, 16, v9
-; GFX1030-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX1030-NEXT: v_and_b32_e32 v2, v9, v0
+; GFX1030-NEXT: v_lshrrev_b32_e32 v3, 16, v9
+; GFX1030-NEXT: v_and_b32_e32 v9, v10, v0
 ; GFX1030-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX1030-NEXT: v_and_b32_e32 v3, s0, v10
 ; GFX1030-NEXT: v_mov_b32_e32 v19, v4
+; GFX1030-NEXT: v_lshlrev_b32_e32 v2, 16, v2
 ; GFX1030-NEXT: v_mov_b32_e32 v20, v5
-; GFX1030-NEXT: v_and_or_b32 v21, v7, s0, v0
-; GFX1030-NEXT: v_and_or_b32 v22, v8, s0, v1
-; GFX1030-NEXT: v_lshl_or_b32 v23, v3, 16, v2
+; GFX1030-NEXT: v_lshl_or_b32 v23, v9, 16, v3
+; GFX1030-NEXT: v_and_or_b32 v21, v7, v0, v1
 ; GFX1030-NEXT: s_mov_b32 s1, exec_lo
+; GFX1030-NEXT: v_and_or_b32 v22, v8, v0, v2
 ; GFX1030-NEXT: BB9_1: ; =>This Inner Loop Header: Depth=1
 ; GFX1030-NEXT: v_readfirstlane_b32 s4, v11
 ; GFX1030-NEXT: v_readfirstlane_b32 s5, v12
@@ -454,20 +486,20 @@
 ;
 ; GFX1013-LABEL: image_bvh64_intersect_ray_a16_vgpr_descr:
 ; GFX1013: ; %bb.0:
-; GFX1013-NEXT: s_mov_b32 s0, 0xffff
 ; GFX1013-NEXT: v_mov_b32_e32 v16, v11
-; GFX1013-NEXT: v_lshrrev_b32_e32 v6, 16, v7
-; GFX1013-NEXT: v_and_b32_e32 v11, s0, v9
-; GFX1013-NEXT: v_lshrrev_b32_e32 v9, 16, v9
-; GFX1013-NEXT: v_and_b32_e32 v10, s0, v10
+; GFX1013-NEXT: v_mov_b32_e32 v11, 0xffff
 ; GFX1013-NEXT: v_mov_b32_e32 v17, v12
-; GFX1013-NEXT: v_lshlrev_b32_e32 v6, 16, v6
-; GFX1013-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GFX1013-NEXT: v_lshrrev_b32_e32 v6, 16, v7
 ; GFX1013-NEXT: v_mov_b32_e32 v18, v13
 ; GFX1013-NEXT: v_mov_b32_e32 v19, v14
+; GFX1013-NEXT: v_and_b32_e32 v12, v9, v11
+; GFX1013-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GFX1013-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX1013-NEXT: v_and_b32_e32 v10, v10, v11
 ; GFX1013-NEXT: s_mov_b32 s1, exec_lo
-; GFX1013-NEXT: v_and_or_b32 v6, v7, s0, v6
-; GFX1013-NEXT: v_and_or_b32 v7, v8, s0, v11
+; GFX1013-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; GFX1013-NEXT: v_and_or_b32 v6, v7, v11, v6
+; GFX1013-NEXT: v_and_or_b32 v7, v8, v11, v12
 ; GFX1013-NEXT: v_lshl_or_b32 v8, v10, 16, v9
 ; GFX1013-NEXT: BB9_1: ; =>This Inner Loop Header: Depth=1
 ; GFX1013-NEXT: v_readfirstlane_b32 s4, v16
@@ -778,8 +810,8 @@
 ; GFX1030-NEXT: v_lshlrev_b32_e32 v2, 2, v0
 ; GFX1030-NEXT: s_movk_i32 s6, 0x4200
 ; GFX1030-NEXT: s_movk_i32 s7, 0x4800
-; GFX1030-NEXT: s_bfe_u32 s6, s6, 0x100000
 ; GFX1030-NEXT: s_movk_i32 s9, 0x4600
+; GFX1030-NEXT: s_bfe_u32 s6, s6, 0x100000
 ; GFX1030-NEXT: s_movk_i32 s8, 0x4700
 ; GFX1030-NEXT: s_bfe_u32 s7, s7, 0x100000
 ; GFX1030-NEXT: s_bfe_u32 s8, s8, 0x100000
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.load.format.f16.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.load.format.f16.ll
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.load.format.f16.ll
@@ -61,14 +61,11 @@
 ; UNPACKED: [[BUFFER_LOAD_FORMAT_D16_XY_gfx80_OFFEN:%[0-9]+]]:vreg_64 = BUFFER_LOAD_FORMAT_D16_XY_gfx80_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (<2 x s16>), align 1, addrspace 4)
 ; UNPACKED: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_D16_XY_gfx80_OFFEN]].sub0
 ; UNPACKED: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_D16_XY_gfx80_OFFEN]].sub1
- ; UNPACKED: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 65535
- ; UNPACKED: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; UNPACKED: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY6]], [[COPY8]], implicit $exec
- ; UNPACKED: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; UNPACKED: [[V_AND_B32_e64_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY7]], [[COPY9]], implicit $exec
- ; UNPACKED: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 16
- ; UNPACKED: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
- ; UNPACKED: [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[COPY10]], [[V_AND_B32_e64_1]], implicit $exec
+ ; UNPACKED: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 65535, implicit $exec
+ ; UNPACKED: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY6]], [[V_MOV_B32_e32_]], implicit $exec
+ ; UNPACKED: [[V_AND_B32_e64_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY7]], [[V_MOV_B32_e32_]], implicit $exec
+ ; UNPACKED: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 16, implicit $exec
+ ; UNPACKED: [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[V_MOV_B32_e32_1]], [[V_AND_B32_e64_1]], implicit $exec
 ; UNPACKED: [[V_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_AND_B32_e64_]], [[V_LSHLREV_B32_e64_]], implicit $exec
 ; UNPACKED: $vgpr0 = COPY [[V_OR_B32_e64_]]
 ; UNPACKED: SI_RETURN_TO_EPILOG implicit $vgpr0
@@ -114,21 +111,15 @@
 ; UNPACKED: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_D16_XYZW_gfx80_OFFEN]].sub1
 ; UNPACKED: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_D16_XYZW_gfx80_OFFEN]].sub2
 ; UNPACKED: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_D16_XYZW_gfx80_OFFEN]].sub3
- ; UNPACKED: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 65535
- ; UNPACKED: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; UNPACKED: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY6]], [[COPY10]], implicit $exec
- ; UNPACKED: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; UNPACKED: [[V_AND_B32_e64_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY7]], [[COPY11]], implicit $exec
- ; UNPACKED: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 16
- ; UNPACKED: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
- ; UNPACKED: [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[COPY12]], [[V_AND_B32_e64_1]], implicit $exec
+ ; UNPACKED: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 65535, implicit $exec
+ ; UNPACKED: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY6]], [[V_MOV_B32_e32_]], implicit $exec
+ ; UNPACKED: [[V_AND_B32_e64_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY7]], [[V_MOV_B32_e32_]], implicit $exec
+ ; UNPACKED: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 16, implicit $exec
+ ; UNPACKED: [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[V_MOV_B32_e32_1]], [[V_AND_B32_e64_1]], implicit $exec
 ; UNPACKED: [[V_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_AND_B32_e64_]], [[V_LSHLREV_B32_e64_]], implicit $exec
- ; UNPACKED: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; UNPACKED: [[V_AND_B32_e64_2:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY8]], [[COPY13]], implicit $exec
- ; UNPACKED: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; UNPACKED: [[V_AND_B32_e64_3:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY9]], [[COPY14]], implicit $exec
- ; UNPACKED: [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
- ; UNPACKED: [[V_LSHLREV_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[COPY15]], [[V_AND_B32_e64_3]], implicit $exec
+ ; UNPACKED: [[V_AND_B32_e64_2:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY8]], [[V_MOV_B32_e32_]], implicit $exec
+ ; UNPACKED: [[V_AND_B32_e64_3:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY9]], [[V_MOV_B32_e32_]], implicit $exec
+ ; UNPACKED: [[V_LSHLREV_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[V_MOV_B32_e32_1]], [[V_AND_B32_e64_3]], implicit $exec
 ; UNPACKED: [[V_OR_B32_e64_1:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_AND_B32_e64_2]], [[V_LSHLREV_B32_e64_1]], implicit $exec
 ; UNPACKED: $vgpr0 = COPY [[V_OR_B32_e64_]]
 ; UNPACKED: $vgpr1 = COPY [[V_OR_B32_e64_1]]
@@ -255,21 +246,15 @@
 ; UNPACKED: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_D16_XYZW_gfx80_OFFEN]].sub1
 ; UNPACKED: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_D16_XYZW_gfx80_OFFEN]].sub2
 ; UNPACKED: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_D16_XYZW_gfx80_OFFEN]].sub3
- ; UNPACKED: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 65535
- ; UNPACKED: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; UNPACKED: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY6]], [[COPY10]], implicit $exec
- ; UNPACKED: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; UNPACKED: [[V_AND_B32_e64_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY7]], [[COPY11]], implicit $exec
- ; UNPACKED: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 16
- ; UNPACKED: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
- ; UNPACKED: [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[COPY12]], [[V_AND_B32_e64_1]], implicit $exec
+ ; UNPACKED: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 65535, implicit $exec
+ ; UNPACKED: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY6]], [[V_MOV_B32_e32_]], implicit $exec
+ ; UNPACKED: [[V_AND_B32_e64_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY7]], [[V_MOV_B32_e32_]], implicit $exec
+ ; UNPACKED: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 16, implicit $exec
+ ; UNPACKED: [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[V_MOV_B32_e32_1]], [[V_AND_B32_e64_1]], implicit $exec
 ; UNPACKED: [[V_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_AND_B32_e64_]], [[V_LSHLREV_B32_e64_]], implicit $exec
- ; UNPACKED: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; UNPACKED: [[V_AND_B32_e64_2:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY8]], [[COPY13]], implicit $exec
- ; UNPACKED: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; UNPACKED: [[V_AND_B32_e64_3:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY9]], [[COPY14]], implicit $exec
- ; UNPACKED: [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
- ; UNPACKED: [[V_LSHLREV_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[COPY15]], [[V_AND_B32_e64_3]], implicit $exec
+ ; UNPACKED: [[V_AND_B32_e64_2:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY8]], [[V_MOV_B32_e32_]], implicit $exec
+ ; UNPACKED: [[V_AND_B32_e64_3:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY9]], [[V_MOV_B32_e32_]], implicit $exec
+ ; UNPACKED: [[V_LSHLREV_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[V_MOV_B32_e32_1]], [[V_AND_B32_e64_3]], implicit $exec
 ; UNPACKED: [[V_OR_B32_e64_1:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_AND_B32_e64_2]], [[V_LSHLREV_B32_e64_1]], implicit $exec
 ; UNPACKED: $vgpr0 = COPY [[V_OR_B32_e64_]]
 ;
UNPACKED: $vgpr1 = COPY [[V_OR_B32_e64_1]] Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.load.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.load.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.load.ll @@ -546,9 +546,8 @@ ; CHECK: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096 - ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; CHECK: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY5]], [[REG_SEQUENCE]], [[COPY4]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + ; CHECK: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec + ; CHECK: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[V_MOV_B32_e32_]], [[REG_SEQUENCE]], [[COPY4]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) ; CHECK: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; CHECK: SI_RETURN_TO_EPILOG implicit $vgpr0 %val = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 4096, i32 %soffset, i32 0) @@ -604,9 +603,8 @@ ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; CHECK: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096 - ; CHECK: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; CHECK: %10:vgpr_32, dead %15:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY4]], [[COPY6]], 0, implicit $exec + ; CHECK: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec + ; CHECK: %10:vgpr_32, dead %13:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY4]], [[V_MOV_B32_e32_]], 0, implicit $exec ; CHECK: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %10, [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) ; CHECK: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; CHECK: SI_RETURN_TO_EPILOG implicit $vgpr0 @@ -772,22 +770,21 @@ ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4 ; CHECK: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr2 - ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096 - ; CHECK: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; CHECK: %14:vgpr_32, dead %33:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY4]], [[COPY6]], 0, implicit $exec - ; CHECK: [[COPY7:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 - ; CHECK: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 + ; CHECK: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec + ; CHECK: %14:vgpr_32, dead %31:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY4]], [[V_MOV_B32_e32_]], 0, implicit $exec + ; CHECK: [[COPY6:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 + ; CHECK: [[COPY7:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 ; CHECK: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec ; CHECK: bb.2: ; CHECK: successors: %bb.3(0x40000000), %bb.2(0x40000000) - ; CHECK: 
[[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]].sub0, implicit $exec - ; CHECK: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]].sub1, implicit $exec + ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]].sub0, implicit $exec + ; CHECK: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]].sub1, implicit $exec ; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 - ; CHECK: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[COPY7]], implicit $exec - ; CHECK: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY8]].sub0, implicit $exec - ; CHECK: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY8]].sub1, implicit $exec + ; CHECK: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[COPY6]], implicit $exec + ; CHECK: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]].sub0, implicit $exec + ; CHECK: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]].sub1, implicit $exec ; CHECK: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 - ; CHECK: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY8]], implicit $exec + ; CHECK: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY7]], implicit $exec ; CHECK: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; CHECK: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.format.f16.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.format.f16.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.format.f16.ll @@ -74,9 +74,8 @@ ; UNPACKED: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; UNPACKED: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; UNPACKED: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; UNPACKED: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 16 - ; UNPACKED: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; UNPACKED: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[COPY7]], [[COPY4]], implicit $exec + ; UNPACKED: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 16, implicit $exec + ; UNPACKED: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[V_MOV_B32_e32_]], [[COPY4]], implicit $exec ; UNPACKED: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[V_LSHRREV_B32_e64_]], %subreg.sub1 ; UNPACKED: BUFFER_STORE_FORMAT_D16_XY_gfx80_OFFEN_exact [[REG_SEQUENCE1]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, implicit $exec :: (dereferenceable store (<2 x s16>), align 1, addrspace 4) ; UNPACKED: S_ENDPGM 0 @@ -110,11 +109,9 @@ ; UNPACKED: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; UNPACKED: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; UNPACKED: 
[[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; UNPACKED: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 16 - ; UNPACKED: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; UNPACKED: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[COPY8]], [[COPY4]], implicit $exec - ; UNPACKED: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; UNPACKED: [[V_LSHRREV_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[COPY9]], [[COPY5]], implicit $exec + ; UNPACKED: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 16, implicit $exec + ; UNPACKED: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[V_MOV_B32_e32_]], [[COPY4]], implicit $exec + ; UNPACKED: [[V_LSHRREV_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[V_MOV_B32_e32_]], [[COPY5]], implicit $exec ; UNPACKED: [[REG_SEQUENCE1:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[V_LSHRREV_B32_e64_]], %subreg.sub1, [[COPY5]], %subreg.sub2, [[V_LSHRREV_B32_e64_1]], %subreg.sub3 ; UNPACKED: BUFFER_STORE_FORMAT_D16_XYZW_gfx80_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, 0, implicit $exec :: (dereferenceable store (<4 x s16>), align 1, addrspace 4) ; UNPACKED: S_ENDPGM 0 @@ -152,25 +149,23 @@ ; UNPACKED: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr5 ; UNPACKED: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr6 ; UNPACKED: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr2 - ; UNPACKED: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 16 - ; UNPACKED: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; UNPACKED: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[COPY8]], [[COPY4]], implicit $exec - ; UNPACKED: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; UNPACKED: [[V_LSHRREV_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[COPY9]], [[COPY5]], implicit $exec + ; UNPACKED: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 16, implicit $exec + ; UNPACKED: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[V_MOV_B32_e32_]], [[COPY4]], implicit $exec + ; UNPACKED: [[V_LSHRREV_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[V_MOV_B32_e32_]], [[COPY5]], implicit $exec ; UNPACKED: [[REG_SEQUENCE1:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[V_LSHRREV_B32_e64_]], %subreg.sub1, [[COPY5]], %subreg.sub2, [[V_LSHRREV_B32_e64_1]], %subreg.sub3 - ; UNPACKED: [[COPY10:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 - ; UNPACKED: [[COPY11:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 + ; UNPACKED: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 + ; UNPACKED: [[COPY9:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 ; UNPACKED: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec ; UNPACKED: bb.2: ; UNPACKED: successors: %bb.3(0x40000000), %bb.2(0x40000000) - ; UNPACKED: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY10]].sub0, implicit $exec - ; UNPACKED: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY10]].sub1, implicit $exec + ; UNPACKED: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY8]].sub0, implicit $exec + ; UNPACKED: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY8]].sub1, implicit $exec ; UNPACKED: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 - ; UNPACKED: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY10]], implicit $exec - ; UNPACKED: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY11]].sub0, 
implicit $exec - ; UNPACKED: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY11]].sub1, implicit $exec + ; UNPACKED: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY8]], implicit $exec + ; UNPACKED: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY9]].sub0, implicit $exec + ; UNPACKED: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY9]].sub1, implicit $exec ; UNPACKED: [[REG_SEQUENCE3:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 - ; UNPACKED: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[COPY11]], implicit $exec + ; UNPACKED: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[COPY9]], implicit $exec ; UNPACKED: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; UNPACKED: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; UNPACKED: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -236,9 +231,8 @@ ; UNPACKED: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; UNPACKED: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; UNPACKED: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4095 - ; UNPACKED: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 16 - ; UNPACKED: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] - ; UNPACKED: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[COPY6]], [[COPY4]], implicit $exec + ; UNPACKED: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 16, implicit $exec + ; UNPACKED: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[V_MOV_B32_e32_]], [[COPY4]], implicit $exec ; UNPACKED: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[V_LSHRREV_B32_e64_]], %subreg.sub1 ; UNPACKED: BUFFER_STORE_FORMAT_D16_XY_gfx80_OFFEN_exact [[REG_SEQUENCE1]], [[COPY5]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, implicit $exec :: (dereferenceable store (<2 x s16>), align 1, addrspace 4) ; UNPACKED: S_ENDPGM 0 @@ -271,9 +265,8 @@ ; UNPACKED: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; UNPACKED: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; UNPACKED: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096 - ; UNPACKED: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 16 - ; UNPACKED: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] - ; UNPACKED: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[COPY6]], [[COPY4]], implicit $exec + ; UNPACKED: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 16, implicit $exec + ; UNPACKED: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[V_MOV_B32_e32_]], [[COPY4]], implicit $exec ; UNPACKED: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[V_LSHRREV_B32_e64_]], %subreg.sub1 ; UNPACKED: BUFFER_STORE_FORMAT_D16_XY_gfx80_OFFEN_exact [[REG_SEQUENCE1]], [[COPY5]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, implicit $exec :: (dereferenceable store (<2 x s16>), align 1, addrspace 4) ; UNPACKED: S_ENDPGM 0 @@ -306,9 +299,8 @@ ; UNPACKED: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; UNPACKED: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; UNPACKED: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; UNPACKED: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = 
S_MOV_B32 16 - ; UNPACKED: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; UNPACKED: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[COPY7]], [[COPY4]], implicit $exec + ; UNPACKED: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 16, implicit $exec + ; UNPACKED: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[V_MOV_B32_e32_]], [[COPY4]], implicit $exec ; UNPACKED: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[V_LSHRREV_B32_e64_]], %subreg.sub1 ; UNPACKED: BUFFER_STORE_FORMAT_D16_XY_gfx80_OFFEN_exact [[REG_SEQUENCE1]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 16, 0, 0, 0, implicit $exec :: (dereferenceable store (<2 x s16>), align 1, addrspace 4) ; UNPACKED: S_ENDPGM 0 @@ -342,9 +334,8 @@ ; UNPACKED: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; UNPACKED: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; UNPACKED: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; UNPACKED: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 16 - ; UNPACKED: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; UNPACKED: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[COPY7]], [[COPY4]], implicit $exec + ; UNPACKED: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 16, implicit $exec + ; UNPACKED: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[V_MOV_B32_e32_]], [[COPY4]], implicit $exec ; UNPACKED: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[V_LSHRREV_B32_e64_]], %subreg.sub1 ; UNPACKED: BUFFER_STORE_FORMAT_D16_XY_gfx80_OFFEN_exact [[REG_SEQUENCE1]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 4095, 0, 0, 0, implicit $exec :: (dereferenceable store (<2 x s16>), align 1, addrspace 4) ; UNPACKED: S_ENDPGM 0 @@ -378,12 +369,10 @@ ; UNPACKED: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; UNPACKED: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; UNPACKED: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; UNPACKED: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096 - ; UNPACKED: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; UNPACKED: %11:vgpr_32, dead %24:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY5]], [[COPY7]], 0, implicit $exec - ; UNPACKED: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 16 - ; UNPACKED: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] - ; UNPACKED: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[COPY8]], [[COPY4]], implicit $exec + ; UNPACKED: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec + ; UNPACKED: %11:vgpr_32, dead %21:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY5]], [[V_MOV_B32_e32_]], 0, implicit $exec + ; UNPACKED: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 16, implicit $exec + ; UNPACKED: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[V_MOV_B32_e32_1]], [[COPY4]], implicit $exec ; UNPACKED: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[V_LSHRREV_B32_e64_]], %subreg.sub1 ; UNPACKED: BUFFER_STORE_FORMAT_D16_XY_gfx80_OFFEN_exact [[REG_SEQUENCE1]], %11, [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, implicit $exec :: (dereferenceable store (<2 x s16>), align 1, addrspace 4) ; UNPACKED: S_ENDPGM 0 @@ -398,9 +387,8 @@ ; PACKED: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; PACKED: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; PACKED: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; PACKED: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096 - ; PACKED: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; PACKED: %11:vgpr_32, dead %15:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY5]], [[COPY7]], 0, implicit $exec + ; PACKED: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = 
V_MOV_B32_e32 4096, implicit $exec + ; PACKED: %11:vgpr_32, dead %13:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY5]], [[V_MOV_B32_e32_]], 0, implicit $exec ; PACKED: BUFFER_STORE_FORMAT_D16_XY_OFFEN_exact [[COPY4]], %11, [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, implicit $exec :: (dereferenceable store (<2 x s16>), align 1, addrspace 4) ; PACKED: S_ENDPGM 0 %voffset.add = add i32 %voffset, 4096 @@ -424,28 +412,25 @@ ; UNPACKED: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr5 ; UNPACKED: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr6 ; UNPACKED: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr2 - ; UNPACKED: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096 - ; UNPACKED: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; UNPACKED: %13:vgpr_32, dead %49:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY6]], [[COPY8]], 0, implicit $exec - ; UNPACKED: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 16 - ; UNPACKED: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] - ; UNPACKED: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[COPY9]], [[COPY4]], implicit $exec - ; UNPACKED: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] - ; UNPACKED: [[V_LSHRREV_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[COPY10]], [[COPY5]], implicit $exec + ; UNPACKED: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec + ; UNPACKED: %13:vgpr_32, dead %46:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY6]], [[V_MOV_B32_e32_]], 0, implicit $exec + ; UNPACKED: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 16, implicit $exec + ; UNPACKED: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[V_MOV_B32_e32_1]], [[COPY4]], implicit $exec + ; UNPACKED: [[V_LSHRREV_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[V_MOV_B32_e32_1]], [[COPY5]], implicit $exec ; UNPACKED: [[REG_SEQUENCE1:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[V_LSHRREV_B32_e64_]], %subreg.sub1, [[COPY5]], %subreg.sub2, [[V_LSHRREV_B32_e64_1]], %subreg.sub3 - ; UNPACKED: [[COPY11:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 - ; UNPACKED: [[COPY12:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 + ; UNPACKED: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 + ; UNPACKED: [[COPY9:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 ; UNPACKED: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec ; UNPACKED: bb.2: ; UNPACKED: successors: %bb.3(0x40000000), %bb.2(0x40000000) - ; UNPACKED: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY11]].sub0, implicit $exec - ; UNPACKED: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY11]].sub1, implicit $exec + ; UNPACKED: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY8]].sub0, implicit $exec + ; UNPACKED: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY8]].sub1, implicit $exec ; UNPACKED: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 - ; UNPACKED: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY11]], implicit $exec - ; UNPACKED: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY12]].sub0, implicit $exec - ; UNPACKED: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY12]].sub1, implicit $exec + ; UNPACKED: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY8]], implicit $exec + ; UNPACKED: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY9]].sub0, 
implicit $exec + ; UNPACKED: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY9]].sub1, implicit $exec ; UNPACKED: [[REG_SEQUENCE3:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 - ; UNPACKED: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[COPY12]], implicit $exec + ; UNPACKED: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[COPY9]], implicit $exec ; UNPACKED: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; UNPACKED: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; UNPACKED: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -471,22 +456,21 @@ ; PACKED: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 ; PACKED: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr6 ; PACKED: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr2 - ; PACKED: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096 - ; PACKED: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; PACKED: %13:vgpr_32, dead %33:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY6]], [[COPY8]], 0, implicit $exec - ; PACKED: [[COPY9:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 - ; PACKED: [[COPY10:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 + ; PACKED: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec + ; PACKED: %13:vgpr_32, dead %31:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY6]], [[V_MOV_B32_e32_]], 0, implicit $exec + ; PACKED: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 + ; PACKED: [[COPY9:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 ; PACKED: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec ; PACKED: bb.2: ; PACKED: successors: %bb.3(0x40000000), %bb.2(0x40000000) - ; PACKED: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY9]].sub0, implicit $exec - ; PACKED: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY9]].sub1, implicit $exec + ; PACKED: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY8]].sub0, implicit $exec + ; PACKED: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY8]].sub1, implicit $exec ; PACKED: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 - ; PACKED: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY9]], implicit $exec - ; PACKED: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY10]].sub0, implicit $exec - ; PACKED: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY10]].sub1, implicit $exec + ; PACKED: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY8]], implicit $exec + ; PACKED: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY9]].sub0, implicit $exec + ; PACKED: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY9]].sub1, implicit $exec ; PACKED: [[REG_SEQUENCE3:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 - ; PACKED: 
[[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[COPY10]], implicit $exec + ; PACKED: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[COPY9]], implicit $exec ; PACKED: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; PACKED: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; PACKED: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.format.f32.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.format.f32.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.format.f32.ll @@ -241,9 +241,8 @@ ; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 ; CHECK: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; CHECK: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096 - ; CHECK: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; CHECK: %13:vgpr_32, dead %17:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY6]], [[COPY8]], 0, implicit $exec + ; CHECK: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec + ; CHECK: %13:vgpr_32, dead %15:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY6]], [[V_MOV_B32_e32_]], 0, implicit $exec ; CHECK: BUFFER_STORE_FORMAT_XY_OFFEN_exact [[REG_SEQUENCE1]], %13, [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, 0, implicit $exec :: (dereferenceable store (<2 x s32>), align 1, addrspace 4) ; CHECK: S_ENDPGM 0 %voffset.add = add i32 %voffset, 4096 @@ -270,22 +269,21 @@ ; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2, [[COPY7]], %subreg.sub3 ; CHECK: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr8 ; CHECK: [[COPY9:%[0-9]+]]:sreg_32 = COPY $sgpr2 - ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096 - ; CHECK: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; CHECK: %15:vgpr_32, dead %35:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY8]], [[COPY10]], 0, implicit $exec - ; CHECK: [[COPY11:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 - ; CHECK: [[COPY12:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 + ; CHECK: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec + ; CHECK: %15:vgpr_32, dead %33:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY8]], [[V_MOV_B32_e32_]], 0, implicit $exec + ; CHECK: [[COPY10:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 + ; CHECK: [[COPY11:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 ; CHECK: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec ; CHECK: bb.2: ; CHECK: successors: %bb.3(0x40000000), %bb.2(0x40000000) - ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY11]].sub0, implicit $exec - ; CHECK: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY11]].sub1, implicit $exec + ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY10]].sub0, implicit $exec + ; CHECK: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY10]].sub1, implicit $exec ; CHECK: 
[[REG_SEQUENCE2:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 - ; CHECK: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY11]], implicit $exec - ; CHECK: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY12]].sub0, implicit $exec - ; CHECK: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY12]].sub1, implicit $exec + ; CHECK: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY10]], implicit $exec + ; CHECK: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY11]].sub0, implicit $exec + ; CHECK: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY11]].sub1, implicit $exec ; CHECK: [[REG_SEQUENCE3:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 - ; CHECK: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[COPY12]], implicit $exec + ; CHECK: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[COPY11]], implicit $exec ; CHECK: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; CHECK: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.ll @@ -515,9 +515,8 @@ ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; CHECK: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096 - ; CHECK: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY6]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) + ; CHECK: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[V_MOV_B32_e32_]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 4096, i32 %soffset, i32 0) ret void @@ -573,9 +572,8 @@ ; CHECK: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096 - ; CHECK: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; CHECK: %11:vgpr_32, dead %15:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY5]], [[COPY7]], 0, implicit $exec + ; CHECK: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec + ; CHECK: %11:vgpr_32, dead %13:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY5]], [[V_MOV_B32_e32_]], 0, implicit $exec ; CHECK: 
BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], %11, [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) ; CHECK: S_ENDPGM 0 %voffset.add = add i32 %voffset, 4096 @@ -669,9 +667,8 @@ ; CHECK: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096 - ; CHECK: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; CHECK: %11:vgpr_32, dead %15:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY5]], [[COPY7]], 0, implicit $exec + ; CHECK: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec + ; CHECK: %11:vgpr_32, dead %13:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY5]], [[V_MOV_B32_e32_]], 0, implicit $exec ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], %11, [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, implicit $exec :: (dereferenceable store (<2 x s16>), align 1, addrspace 4) ; CHECK: S_ENDPGM 0 %voffset.add = add i32 %voffset, 4096 @@ -693,22 +690,21 @@ ; CHECK: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4 ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr5 ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr2 - ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096 - ; CHECK: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; CHECK: %14:vgpr_32, dead %33:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY5]], [[COPY7]], 0, implicit $exec - ; CHECK: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 - ; CHECK: [[COPY9:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 + ; CHECK: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec + ; CHECK: %14:vgpr_32, dead %31:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY5]], [[V_MOV_B32_e32_]], 0, implicit $exec + ; CHECK: [[COPY7:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 + ; CHECK: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 ; CHECK: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec ; CHECK: bb.2: ; CHECK: successors: %bb.3(0x40000000), %bb.2(0x40000000) - ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY8]].sub0, implicit $exec - ; CHECK: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY8]].sub1, implicit $exec + ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]].sub0, implicit $exec + ; CHECK: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]].sub1, implicit $exec ; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 - ; CHECK: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[COPY8]], implicit $exec - ; CHECK: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY9]].sub0, implicit $exec - ; CHECK: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY9]].sub1, implicit $exec + ; CHECK: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[COPY7]], implicit $exec + ; CHECK: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY8]].sub0, implicit $exec + ; CHECK: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY8]].sub1, implicit $exec ; CHECK: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 - ; CHECK: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 
[[REG_SEQUENCE2]], [[COPY9]], implicit $exec + ; CHECK: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY8]], implicit $exec ; CHECK: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; CHECK: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -738,25 +734,24 @@ ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4 ; CHECK: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr2 - ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096 - ; CHECK: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; CHECK: [[COPY7:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 - ; CHECK: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 + ; CHECK: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec + ; CHECK: [[COPY6:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 + ; CHECK: [[COPY7:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 ; CHECK: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec ; CHECK: bb.2: ; CHECK: successors: %bb.3(0x40000000), %bb.2(0x40000000) - ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]].sub0, implicit $exec - ; CHECK: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]].sub1, implicit $exec + ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]].sub0, implicit $exec + ; CHECK: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]].sub1, implicit $exec ; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 - ; CHECK: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[COPY7]], implicit $exec - ; CHECK: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY8]].sub0, implicit $exec - ; CHECK: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY8]].sub1, implicit $exec + ; CHECK: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[COPY6]], implicit $exec + ; CHECK: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]].sub0, implicit $exec + ; CHECK: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]].sub1, implicit $exec ; CHECK: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 - ; CHECK: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY8]], implicit $exec + ; CHECK: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY7]], implicit $exec ; CHECK: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; CHECK: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], 
%subreg.sub3 ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec - ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY6]], [[REG_SEQUENCE3]], [[COPY5]], 904, 0, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[V_MOV_B32_e32_]], [[REG_SEQUENCE3]], [[COPY5]], 904, 0, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK: SI_WATERFALL_LOOP %bb.2, implicit $exec ; CHECK: bb.3: Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.load.f16.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.load.f16.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.load.f16.ll @@ -47,14 +47,11 @@ ; UNPACKED: [[TBUFFER_LOAD_FORMAT_D16_XY_gfx80_OFFEN:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_D16_XY_gfx80_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 78, 0, 0, 0, implicit $exec :: (dereferenceable load (<2 x s16>), align 1, addrspace 4) ; UNPACKED: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_D16_XY_gfx80_OFFEN]].sub0 ; UNPACKED: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_D16_XY_gfx80_OFFEN]].sub1 - ; UNPACKED: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 65535 - ; UNPACKED: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; UNPACKED: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY6]], [[COPY8]], implicit $exec - ; UNPACKED: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; UNPACKED: [[V_AND_B32_e64_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY7]], [[COPY9]], implicit $exec - ; UNPACKED: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 16 - ; UNPACKED: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] - ; UNPACKED: [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[COPY10]], [[V_AND_B32_e64_1]], implicit $exec + ; UNPACKED: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 65535, implicit $exec + ; UNPACKED: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY6]], [[V_MOV_B32_e32_]], implicit $exec + ; UNPACKED: [[V_AND_B32_e64_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY7]], [[V_MOV_B32_e32_]], implicit $exec + ; UNPACKED: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 16, implicit $exec + ; UNPACKED: [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[V_MOV_B32_e32_1]], [[V_AND_B32_e64_1]], implicit $exec ; UNPACKED: [[V_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_AND_B32_e64_]], [[V_LSHLREV_B32_e64_]], implicit $exec ; UNPACKED: $vgpr0 = COPY [[V_OR_B32_e64_]] ; UNPACKED: SI_RETURN_TO_EPILOG implicit $vgpr0 @@ -97,21 +94,15 @@ ; UNPACKED: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_D16_XYZW_gfx80_OFFEN]].sub1 ; UNPACKED: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_D16_XYZW_gfx80_OFFEN]].sub2 ; UNPACKED: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_D16_XYZW_gfx80_OFFEN]].sub3 - ; UNPACKED: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 65535 - ; UNPACKED: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; UNPACKED: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY6]], [[COPY10]], implicit $exec - ; UNPACKED: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; UNPACKED: [[V_AND_B32_e64_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY7]], [[COPY11]], implicit $exec - ; UNPACKED: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = 
S_MOV_B32 16 - ; UNPACKED: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] - ; UNPACKED: [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[COPY12]], [[V_AND_B32_e64_1]], implicit $exec + ; UNPACKED: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 65535, implicit $exec + ; UNPACKED: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY6]], [[V_MOV_B32_e32_]], implicit $exec + ; UNPACKED: [[V_AND_B32_e64_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY7]], [[V_MOV_B32_e32_]], implicit $exec + ; UNPACKED: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 16, implicit $exec + ; UNPACKED: [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[V_MOV_B32_e32_1]], [[V_AND_B32_e64_1]], implicit $exec ; UNPACKED: [[V_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_AND_B32_e64_]], [[V_LSHLREV_B32_e64_]], implicit $exec - ; UNPACKED: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; UNPACKED: [[V_AND_B32_e64_2:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY8]], [[COPY13]], implicit $exec - ; UNPACKED: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; UNPACKED: [[V_AND_B32_e64_3:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY9]], [[COPY14]], implicit $exec - ; UNPACKED: [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] - ; UNPACKED: [[V_LSHLREV_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[COPY15]], [[V_AND_B32_e64_3]], implicit $exec + ; UNPACKED: [[V_AND_B32_e64_2:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY8]], [[V_MOV_B32_e32_]], implicit $exec + ; UNPACKED: [[V_AND_B32_e64_3:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY9]], [[V_MOV_B32_e32_]], implicit $exec + ; UNPACKED: [[V_LSHLREV_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[V_MOV_B32_e32_1]], [[V_AND_B32_e64_3]], implicit $exec ; UNPACKED: [[V_OR_B32_e64_1:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_AND_B32_e64_2]], [[V_LSHLREV_B32_e64_1]], implicit $exec ; UNPACKED: $vgpr0 = COPY [[V_OR_B32_e64_]] ; UNPACKED: $vgpr1 = COPY [[V_OR_B32_e64_1]] Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.store.f16.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.store.f16.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.store.f16.ll @@ -45,9 +45,8 @@ ; UNPACKED: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; UNPACKED: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; UNPACKED: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; UNPACKED: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 16 - ; UNPACKED: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; UNPACKED: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[COPY7]], [[COPY]], implicit $exec + ; UNPACKED: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 16, implicit $exec + ; UNPACKED: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[V_MOV_B32_e32_]], [[COPY]], implicit $exec ; UNPACKED: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[V_LSHRREV_B32_e64_]], %subreg.sub1 ; UNPACKED: TBUFFER_STORE_FORMAT_D16_XY_gfx80_OFFEN_exact [[REG_SEQUENCE1]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 0, 0, 0, implicit $exec :: (dereferenceable store (<2 x s16>), align 1, addrspace 4) ; UNPACKED: S_ENDPGM 0 @@ -87,11 +86,9 @@ ; UNPACKED: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3 ; UNPACKED: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; UNPACKED: 
[[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6
- ; UNPACKED: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 16
- ; UNPACKED: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; UNPACKED: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[COPY8]], [[COPY]], implicit $exec
- ; UNPACKED: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; UNPACKED: [[V_LSHRREV_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[COPY9]], [[COPY1]], implicit $exec
+ ; UNPACKED: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 16, implicit $exec
+ ; UNPACKED: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[V_MOV_B32_e32_]], [[COPY]], implicit $exec
+ ; UNPACKED: [[V_LSHRREV_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[V_MOV_B32_e32_]], [[COPY1]], implicit $exec
  ; UNPACKED: [[REG_SEQUENCE1:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[V_LSHRREV_B32_e64_]], %subreg.sub1, [[COPY1]], %subreg.sub2, [[V_LSHRREV_B32_e64_1]], %subreg.sub3
  ; UNPACKED: TBUFFER_STORE_FORMAT_D16_XYZW_gfx80_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[COPY7]], 0, 78, 0, 0, 0, implicit $exec :: (dereferenceable store (<4 x s16>), align 1, addrspace 4)
  ; UNPACKED: S_ENDPGM 0
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.store.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.store.ll
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.store.ll
@@ -362,9 +362,8 @@
  ; CHECK: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5
  ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
  ; CHECK: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
- ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096
- ; CHECK: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; CHECK: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY6]], [[REG_SEQUENCE]], [[COPY5]], 0, 94, 0, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
+ ; CHECK: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec
+ ; CHECK: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[V_MOV_B32_e32_]], [[REG_SEQUENCE]], [[COPY5]], 0, 94, 0, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
  ; CHECK: S_ENDPGM 0
  call void @llvm.amdgcn.raw.tbuffer.store.f32(float %val, <4 x i32> %rsrc, i32 4096, i32 %soffset, i32 94, i32 0)
  ret void
@@ -420,9 +419,8 @@
  ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
  ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
  ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
- ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096
- ; CHECK: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; CHECK: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY5]], [[COPY7]], 0, implicit $exec
+ ; CHECK: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec
+ ; CHECK: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY5]], [[V_MOV_B32_e32_]], 0, implicit $exec
  ; CHECK: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[V_ADD_U32_e64_]], [[REG_SEQUENCE]], [[COPY6]], 0, 94, 0, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
  ; CHECK: S_ENDPGM 0
  %voffset = add i32 %voffset.base, 4096
@@ -588,22 +586,21 @@
  ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
  ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr5
  ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr2
- ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096
- ; CHECK: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; CHECK: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY5]], [[COPY7]], 0, implicit $exec
- ; CHECK: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
- ; CHECK: [[COPY9:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3
+ ; CHECK: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec
+ ; CHECK: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY5]], [[V_MOV_B32_e32_]], 0, implicit $exec
+ ; CHECK: [[COPY7:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
+ ; CHECK: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3
  ; CHECK: [[S_MOV_B32_term:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32_term $exec_lo
  ; CHECK: bb.2:
  ; CHECK: successors: %bb.3(0x40000000), %bb.2(0x40000000)
- ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY8]].sub0, implicit $exec
- ; CHECK: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY8]].sub1, implicit $exec
+ ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]].sub0, implicit $exec
+ ; CHECK: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]].sub1, implicit $exec
  ; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1
- ; CHECK: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[COPY8]], implicit $exec
- ; CHECK: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY9]].sub0, implicit $exec
- ; CHECK: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY9]].sub1, implicit $exec
+ ; CHECK: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[COPY7]], implicit $exec
+ ; CHECK: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY8]].sub0, implicit $exec
+ ; CHECK: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY8]].sub1, implicit $exec
  ; CHECK: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1
- ; CHECK: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY9]], implicit $exec
+ ; CHECK: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY8]], implicit $exec
  ; CHECK: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc
  ; CHECK: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
  ; CHECK: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_]], implicit-def $exec, implicit-def $scc, implicit $exec
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.s.buffer.load.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.s.buffer.load.ll
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.s.buffer.load.ll
@@ -2060,8 +2060,10 @@
  ; GFX6: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
  ; GFX6: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
  ; GFX6: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX6: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096
- ; GFX6: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32))
+ ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec
+ ; GFX6: %9:vgpr_32, dead %13:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY4]], [[V_MOV_B32_e32_]], 0, implicit $exec
+ ; GFX6: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+ ; GFX6: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %9, [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32))
  ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
  ; GFX6: SI_RETURN_TO_EPILOG implicit $vgpr0
  ; GFX7-LABEL: name: s_buffer_load_f32_vgpr_offset_add_4096
@@ -2073,8 +2075,10 @@
  ; GFX7: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
  ; GFX7: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
  ; GFX7: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX7: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096
- ; GFX7: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32))
+ ; GFX7: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec
+ ; GFX7: %9:vgpr_32, dead %13:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY4]], [[V_MOV_B32_e32_]], 0, implicit $exec
+ ; GFX7: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+ ; GFX7: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %9, [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32))
  ; GFX7: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
  ; GFX7: SI_RETURN_TO_EPILOG implicit $vgpr0
  ; GFX8-LABEL: name: s_buffer_load_f32_vgpr_offset_add_4096
@@ -2203,9 +2207,11 @@
  ; GFX6: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
  ; GFX6: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
  ; GFX6: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX6: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4068
- ; GFX6: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4)
- ; GFX6: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 16, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4)
+ ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4068, implicit $exec
+ ; GFX6: %9:vgpr_32, dead %23:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY4]], [[V_MOV_B32_e32_]], 0, implicit $exec
+ ; GFX6: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+ ; GFX6: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN %9, [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4)
+ ; GFX6: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN %9, [[REG_SEQUENCE]], [[S_MOV_B32_]], 16, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4)
  ; GFX6: [[REG_SEQUENCE1:%[0-9]+]]:vreg_256 = REG_SEQUENCE [[BUFFER_LOAD_DWORDX4_OFFEN]], %subreg.sub0_sub1_sub2_sub3, [[BUFFER_LOAD_DWORDX4_OFFEN1]], %subreg.sub4_sub5_sub6_sub7
  ; GFX6: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0
  ; GFX6: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1
@@ -2233,9 +2239,11 @@
  ; GFX7: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
  ; GFX7: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
  ; GFX7: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX7: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4068
- ; GFX7: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4)
- ; GFX7: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 16, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4)
+ ; GFX7: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4068, implicit $exec
+ ; GFX7: %9:vgpr_32, dead %23:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY4]], [[V_MOV_B32_e32_]], 0, implicit $exec
+ ; GFX7: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+ ; GFX7: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN %9, [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4)
+ ; GFX7: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN %9, [[REG_SEQUENCE]], [[S_MOV_B32_]], 16, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4)
  ; GFX7: [[REG_SEQUENCE1:%[0-9]+]]:vreg_256 = REG_SEQUENCE [[BUFFER_LOAD_DWORDX4_OFFEN]], %subreg.sub0_sub1_sub2_sub3, [[BUFFER_LOAD_DWORDX4_OFFEN1]], %subreg.sub4_sub5_sub6_sub7
  ; GFX7: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0
  ; GFX7: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1
@@ -2449,11 +2457,13 @@
  ; GFX6: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
  ; GFX6: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
  ; GFX6: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX6: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4036
- ; GFX6: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4)
- ; GFX6: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 16, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4)
- ; GFX6: [[BUFFER_LOAD_DWORDX4_OFFEN2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 32, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128) from unknown-address + 16, align 4)
- ; GFX6: [[BUFFER_LOAD_DWORDX4_OFFEN3:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 48, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128) from unknown-address + 48, align 4)
+ ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4036, implicit $exec
+ ; GFX6: %9:vgpr_32, dead %33:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY4]], [[V_MOV_B32_e32_]], 0, implicit $exec
+ ; GFX6: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+ ; GFX6: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN %9, [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4)
+ ; GFX6: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN %9, [[REG_SEQUENCE]], [[S_MOV_B32_]], 16, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4)
+ ; GFX6: [[BUFFER_LOAD_DWORDX4_OFFEN2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN %9, [[REG_SEQUENCE]], [[S_MOV_B32_]], 32, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128) from unknown-address + 16, align 4)
+ ; GFX6: [[BUFFER_LOAD_DWORDX4_OFFEN3:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN %9, [[REG_SEQUENCE]], [[S_MOV_B32_]], 48, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128) from unknown-address + 48, align 4)
  ; GFX6: [[REG_SEQUENCE1:%[0-9]+]]:vreg_512 = REG_SEQUENCE [[BUFFER_LOAD_DWORDX4_OFFEN]], %subreg.sub0_sub1_sub2_sub3, [[BUFFER_LOAD_DWORDX4_OFFEN1]], %subreg.sub4_sub5_sub6_sub7, [[BUFFER_LOAD_DWORDX4_OFFEN2]], %subreg.sub8_sub9_sub10_sub11, [[BUFFER_LOAD_DWORDX4_OFFEN3]], %subreg.sub12_sub13_sub14_sub15
  ; GFX6: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0
  ; GFX6: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1
@@ -2497,11 +2507,13 @@
  ; GFX7: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
  ; GFX7: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
  ; GFX7: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX7: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4036
- ; GFX7: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4)
- ; GFX7: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 16, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4)
- ; GFX7: [[BUFFER_LOAD_DWORDX4_OFFEN2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 32, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128) from unknown-address + 16, align 4)
- ; GFX7: [[BUFFER_LOAD_DWORDX4_OFFEN3:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 48, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128) from unknown-address + 48, align 4)
+ ; GFX7: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4036, implicit $exec
+ ; GFX7: %9:vgpr_32, dead %33:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY4]], [[V_MOV_B32_e32_]], 0, implicit $exec
+ ; GFX7: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+ ; GFX7: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN %9, [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4)
+ ; GFX7: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN %9, [[REG_SEQUENCE]], [[S_MOV_B32_]], 16, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4)
+ ; GFX7: [[BUFFER_LOAD_DWORDX4_OFFEN2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN %9, [[REG_SEQUENCE]], [[S_MOV_B32_]], 32, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128) from unknown-address + 16, align 4)
+ ; GFX7: [[BUFFER_LOAD_DWORDX4_OFFEN3:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN %9, [[REG_SEQUENCE]], [[S_MOV_B32_]], 48, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128) from unknown-address + 48, align 4)
  ; GFX7: [[REG_SEQUENCE1:%[0-9]+]]:vreg_512 = REG_SEQUENCE [[BUFFER_LOAD_DWORDX4_OFFEN]], %subreg.sub0_sub1_sub2_sub3, [[BUFFER_LOAD_DWORDX4_OFFEN1]], %subreg.sub4_sub5_sub6_sub7, [[BUFFER_LOAD_DWORDX4_OFFEN2]], %subreg.sub8_sub9_sub10_sub11, [[BUFFER_LOAD_DWORDX4_OFFEN3]], %subreg.sub12_sub13_sub14_sub15
  ; GFX7: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0
  ; GFX7: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1
@@ -3644,7 +3656,9 @@
  ; GFX6: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
  ; GFX6: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
  ; GFX6: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4
- ; GFX6: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 5000
+ ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 5000, implicit $exec
+ ; GFX6: %9:vgpr_32, dead %39:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY4]], [[V_MOV_B32_e32_]], 0, implicit $exec
+ ; GFX6: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
  ; GFX6: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
  ; GFX6: [[COPY6:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3
  ; GFX6: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec
@@ -3661,8 +3675,8 @@
  ; GFX6: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc
  ; GFX6: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
  ; GFX6: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec
- ; GFX6: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE3]], [[S_MOV_B32_]], 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4)
- ; GFX6: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE3]], [[S_MOV_B32_]], 16, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4)
+ ; GFX6: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN %9, [[REG_SEQUENCE3]], [[S_MOV_B32_]], 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4)
+ ; GFX6: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN %9, [[REG_SEQUENCE3]], [[S_MOV_B32_]], 16, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4)
  ; GFX6: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
  ; GFX6: SI_WATERFALL_LOOP %bb.2, implicit $exec
  ; GFX6: bb.3:
@@ -3695,7 +3709,9 @@
  ; GFX7: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
  ; GFX7: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
  ; GFX7: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4
- ; GFX7: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 5000
+ ; GFX7: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 5000, implicit $exec
+ ; GFX7: %9:vgpr_32, dead %39:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY4]], [[V_MOV_B32_e32_]], 0, implicit $exec
+ ; GFX7: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
  ; GFX7: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
  ; GFX7: [[COPY6:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3
  ; GFX7: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec
@@ -3712,8 +3728,8 @@
  ; GFX7: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc
  ; GFX7: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
  ; GFX7: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec
- ; GFX7: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE3]], [[S_MOV_B32_]], 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4)
- ; GFX7: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE3]], [[S_MOV_B32_]], 16, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4)
+ ; GFX7: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN %9, [[REG_SEQUENCE3]], [[S_MOV_B32_]], 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4)
+ ; GFX7: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN %9, [[REG_SEQUENCE3]], [[S_MOV_B32_]], 16, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4)
  ; GFX7: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
  ; GFX7: SI_WATERFALL_LOOP %bb.2, implicit $exec
  ; GFX7: bb.3:
@@ -3803,7 +3819,9 @@
  ; GFX6: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
  ; GFX6: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
  ; GFX6: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4
- ; GFX6: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4076
+ ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4076, implicit $exec
+ ; GFX6: %9:vgpr_32, dead %39:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY4]], [[V_MOV_B32_e32_]], 0, implicit $exec
+ ; GFX6: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
  ; GFX6: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
  ; GFX6: [[COPY6:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3
  ; GFX6: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec
@@ -3820,8 +3838,8 @@
  ; GFX6: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc
  ; GFX6: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
  ; GFX6: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec
- ; GFX6: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE3]], [[S_MOV_B32_]], 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4)
- ; GFX6: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE3]], [[S_MOV_B32_]], 16, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4)
+ ; GFX6: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN %9, [[REG_SEQUENCE3]], [[S_MOV_B32_]], 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4)
+ ; GFX6: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN %9, [[REG_SEQUENCE3]], [[S_MOV_B32_]], 16, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4)
  ; GFX6: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
  ; GFX6: SI_WATERFALL_LOOP %bb.2, implicit $exec
  ; GFX6: bb.3:
@@ -3854,7 +3872,9 @@
  ; GFX7: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
  ; GFX7: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
  ; GFX7: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4
- ; GFX7: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4076
+ ; GFX7: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4076, implicit $exec
+ ; GFX7: %9:vgpr_32, dead %39:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY4]], [[V_MOV_B32_e32_]], 0, implicit $exec
+ ; GFX7: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
  ; GFX7: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
  ; GFX7: [[COPY6:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3
  ; GFX7: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec
@@ -3871,8 +3891,8 @@
  ; GFX7: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc
  ; GFX7: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
  ; GFX7: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec
- ; GFX7: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE3]], [[S_MOV_B32_]], 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4)
- ; GFX7: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE3]], [[S_MOV_B32_]], 16, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4)
+ ; GFX7: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN %9, [[REG_SEQUENCE3]], [[S_MOV_B32_]], 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4)
+ ; GFX7: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN %9, [[REG_SEQUENCE3]], [[S_MOV_B32_]], 16, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4)
  ; GFX7: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
  ; GFX7: SI_WATERFALL_LOOP %bb.2, implicit $exec
  ; GFX7: bb.3:
@@ -3962,7 +3982,9 @@
  ; GFX6: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
  ; GFX6: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
  ; GFX6: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4
- ; GFX6: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4080
+ ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4080, implicit $exec
+ ; GFX6: %9:vgpr_32, dead %39:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY4]], [[V_MOV_B32_e32_]], 0, implicit $exec
+ ; GFX6: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
  ; GFX6: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
  ; GFX6: [[COPY6:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3
  ; GFX6: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec
@@ -3979,8 +4001,8 @@
  ; GFX6: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc
  ; GFX6: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
  ; GFX6: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec
- ; GFX6: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE3]], [[S_MOV_B32_]], 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4)
- ; GFX6: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE3]], [[S_MOV_B32_]], 16, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4)
+ ; GFX6: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN %9, [[REG_SEQUENCE3]], [[S_MOV_B32_]], 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4)
+ ; GFX6: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN %9, [[REG_SEQUENCE3]], [[S_MOV_B32_]], 16, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4)
  ; GFX6: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
  ; GFX6: SI_WATERFALL_LOOP %bb.2, implicit $exec
  ; GFX6: bb.3:
@@ -4013,7 +4035,9 @@
  ; GFX7: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
  ; GFX7: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
  ; GFX7: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4
- ; GFX7: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4080
+ ; GFX7: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4080, implicit $exec
+ ; GFX7: %9:vgpr_32, dead %39:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY4]], [[V_MOV_B32_e32_]], 0, implicit $exec
+ ; GFX7: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
  ; GFX7: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
  ; GFX7: [[COPY6:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3
  ; GFX7: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec
@@ -4030,8 +4054,8 @@
  ; GFX7: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc
  ; GFX7: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
  ; GFX7: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec
- ; GFX7: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE3]], [[S_MOV_B32_]], 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4)
- ; GFX7: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE3]], [[S_MOV_B32_]], 16, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4)
+ ; GFX7: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN %9, [[REG_SEQUENCE3]], [[S_MOV_B32_]], 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4)
+ ; GFX7: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN %9, [[REG_SEQUENCE3]], [[S_MOV_B32_]], 16, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4)
  ; GFX7: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
  ; GFX7: SI_WATERFALL_LOOP %bb.2, implicit $exec
  ; GFX7: bb.3:
@@ -4368,7 +4392,7 @@
  ; GFX6: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
  ; GFX6: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
  ; GFX6: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY5]]
- ; GFX6: %9:vgpr_32, dead %17:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY4]], [[COPY6]], 0, implicit $exec
+ ; GFX6: %9:vgpr_32, dead %16:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY4]], [[COPY6]], 0, implicit $exec
  ; GFX6: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
  ; GFX6: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %9, [[REG_SEQUENCE]], [[S_MOV_B32_]], 1024, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32))
  ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
@@ -4384,7 +4408,7 @@
  ; GFX7: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
  ; GFX7: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
  ; GFX7: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY5]]
- ; GFX7: %9:vgpr_32, dead %17:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY4]], [[COPY6]], 0, implicit $exec
+ ; GFX7: %9:vgpr_32, dead %16:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY4]], [[COPY6]], 0, implicit $exec
  ; GFX7: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
  ; GFX7: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %9, [[REG_SEQUENCE]], [[S_MOV_B32_]], 1024, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32))
  ; GFX7: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
@@ -4400,7 +4424,7 @@
  ; GFX8: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
  ; GFX8: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
  ; GFX8: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY5]]
- ; GFX8: %9:vgpr_32, dead %17:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY4]], [[COPY6]], 0, implicit $exec
+ ; GFX8: %9:vgpr_32, dead %16:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY4]], [[COPY6]], 0, implicit $exec
  ; GFX8: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
  ; GFX8: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %9, [[REG_SEQUENCE]], [[S_MOV_B32_]], 1024, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32))
  ; GFX8: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
@@ -4423,7 +4447,7 @@
  ; GFX6: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
  ; GFX6: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
  ; GFX6: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY5]]
- ; GFX6: %9:vgpr_32, dead %17:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY6]], [[COPY4]], 0, implicit $exec
+ ; GFX6: %9:vgpr_32, dead %16:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY6]], [[COPY4]], 0, implicit $exec
  ; GFX6: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
  ; GFX6: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %9, [[REG_SEQUENCE]], [[S_MOV_B32_]], 1024, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32))
  ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
@@ -4439,7 +4463,7 @@
  ; GFX7: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
  ; GFX7: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
  ; GFX7: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY5]]
- ; GFX7: %9:vgpr_32, dead %17:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY6]], [[COPY4]], 0, implicit $exec
+ ; GFX7: %9:vgpr_32, dead %16:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY6]], [[COPY4]], 0, implicit $exec
  ; GFX7: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
  ; GFX7: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %9, [[REG_SEQUENCE]], [[S_MOV_B32_]], 1024, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32))
  ; GFX7: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
@@ -4455,7 +4479,7 @@
  ; GFX8: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
  ; GFX8: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
  ; GFX8: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY5]]
- ; GFX8: %9:vgpr_32, dead %17:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY6]], [[COPY4]], 0, implicit $exec
+ ; GFX8: %9:vgpr_32, dead %16:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY6]], [[COPY4]], 0, implicit $exec
  ; GFX8: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
  ; GFX8: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %9, [[REG_SEQUENCE]], [[S_MOV_B32_]], 1024, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32))
  ; GFX8: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
@@ -4530,9 +4554,8 @@
  ; GFX6: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
  ; GFX6: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
  ; GFX6: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
- ; GFX6: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1024
- ; GFX6: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; GFX6: %10:vgpr_32, dead %16:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY4]], [[COPY6]], 0, implicit $exec
+ ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1024, implicit $exec
+ ; GFX6: %10:vgpr_32, dead %15:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY4]], [[V_MOV_B32_e32_]], 0, implicit $exec
  ; GFX6: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %10, [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32))
  ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
  ; GFX6: SI_RETURN_TO_EPILOG implicit $vgpr0
@@ -4546,9 +4569,8 @@
  ; GFX7: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
  ; GFX7: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
  ; GFX7: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
- ; GFX7: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1024
- ; GFX7: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; GFX7: %10:vgpr_32, dead %16:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY4]], [[COPY6]], 0, implicit $exec
+ ; GFX7: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1024, implicit $exec
+ ; GFX7: %10:vgpr_32, dead %15:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY4]], [[V_MOV_B32_e32_]], 0, implicit $exec
  ; GFX7: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %10, [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32))
  ; GFX7: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
  ; GFX7: SI_RETURN_TO_EPILOG implicit $vgpr0
@@ -4562,9 +4584,8 @@
  ; GFX8: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
  ; GFX8: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
  ; GFX8: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
- ; GFX8: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1024
- ; GFX8: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; GFX8: %10:vgpr_32, dead %16:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY4]], [[COPY6]], 0, implicit $exec
+ ; GFX8: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1024, implicit $exec
+ ; GFX8: %10:vgpr_32, dead %15:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY4]], [[V_MOV_B32_e32_]], 0, implicit $exec
  ; GFX8: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %10, [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32))
  ; GFX8: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
  ; GFX8: SI_RETURN_TO_EPILOG implicit $vgpr0
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot4.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot4.ll
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot4.ll
@@ -42,19 +42,19 @@
 ; GFX906-LABEL: v_sdot4_cast_v4i8:
 ; GFX906: ; %bb.0:
 ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX906-NEXT: s_mov_b32 s5, 8
-; GFX906-NEXT: s_movk_i32 s4, 0xff
-; GFX906-NEXT: v_lshlrev_b32_sdwa v1, s5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX906-NEXT: v_and_or_b32 v0, v0, s4, v1
-; GFX906-NEXT: v_and_b32_e32 v1, s4, v2
-; GFX906-NEXT: v_and_b32_e32 v2, s4, v3
+; GFX906-NEXT: v_mov_b32_e32 v10, 8
+; GFX906-NEXT: v_mov_b32_e32 v9, 0xff
+; GFX906-NEXT: v_lshlrev_b32_sdwa v1, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT: v_and_or_b32 v0, v0, v9, v1
+; GFX906-NEXT: v_and_b32_e32 v1, v2, v9
+; GFX906-NEXT: v_and_b32_e32 v2, v3, v9
 ; GFX906-NEXT: v_lshlrev_b32_e32 v1, 16, v1
 ; GFX906-NEXT: v_lshlrev_b32_e32 v2, 24, v2
 ; GFX906-NEXT: v_or3_b32 v0, v0, v1, v2
-; GFX906-NEXT: v_lshlrev_b32_sdwa v1, s5, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX906-NEXT: v_and_b32_e32 v2, s4, v6
-; GFX906-NEXT: v_and_b32_e32 v3, s4, v7
-; GFX906-NEXT: v_and_or_b32 v1, v4, s4, v1
+; GFX906-NEXT: v_lshlrev_b32_sdwa v1, v10, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT: v_and_b32_e32 v2, v6, v9
+; GFX906-NEXT: v_and_b32_e32 v3, v7, v9
+; GFX906-NEXT: v_and_or_b32 v1, v4, v9, v1
 ; GFX906-NEXT: v_lshlrev_b32_e32 v2, 16, v2
 ; GFX906-NEXT: v_lshlrev_b32_e32 v3, 24, v3
 ; GFX906-NEXT: v_or3_b32 v1, v1, v2, v3
@@ -65,18 +65,18 @@
 ; GFX10: ; %bb.0:
 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: s_mov_b32 s4, 8
-; GFX10-NEXT: s_movk_i32 s5, 0xff
-; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX10-NEXT: v_and_or_b32 v0, v0, s5, v1
-; GFX10-NEXT: v_and_b32_e32 v1, s5, v2
-; GFX10-NEXT: v_and_b32_e32 v2, s5, v3
-; GFX10-NEXT: v_lshlrev_b32_sdwa v3, s4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX10-NEXT: v_and_b32_e32 v5, s5, v6
-; GFX10-NEXT: v_and_b32_e32 v6, s5, v7
+; GFX10-NEXT: v_mov_b32_e32 v9, 8
+; GFX10-NEXT: v_mov_b32_e32 v10, 0xff
+; GFX10-NEXT: v_lshlrev_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX10-NEXT: v_and_or_b32 v0, v0, v10, v1
+; GFX10-NEXT: v_and_b32_e32 v1, v2, v10
+; GFX10-NEXT: v_and_b32_e32 v2, v3, v10
+; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v9, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX10-NEXT: v_and_b32_e32 v5, v6, v10
+; GFX10-NEXT: v_and_b32_e32 v6, v7, v10
 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 24, v2
-; GFX10-NEXT: v_and_or_b32 v3, v4, s5, v3
+; GFX10-NEXT: v_and_or_b32 v3, v4, v10, v3
 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v5
 ; GFX10-NEXT: v_lshlrev_b32_e32 v5, 24, v6
 ; GFX10-NEXT: v_or3_b32 v0, v0, v1, v2
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.format.f16.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.format.f16.ll
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.format.f16.ll
@@ -53,14 +53,11 @@
  ; UNPACKED: [[BUFFER_LOAD_FORMAT_D16_XY_gfx80_BOTHEN:%[0-9]+]]:vreg_64 = BUFFER_LOAD_FORMAT_D16_XY_gfx80_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (<2 x s16>), align 1, addrspace 4)
  ; UNPACKED: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_D16_XY_gfx80_BOTHEN]].sub0
  ; UNPACKED: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_D16_XY_gfx80_BOTHEN]].sub1
- ; UNPACKED: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 65535
- ; UNPACKED: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; UNPACKED: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY7]], [[COPY9]], implicit $exec
- ; UNPACKED: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; UNPACKED: [[V_AND_B32_e64_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY8]], [[COPY10]], implicit $exec
- ; UNPACKED: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 16
- ; UNPACKED: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
- ; UNPACKED: [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[COPY11]], [[V_AND_B32_e64_1]], implicit $exec
+ ; UNPACKED: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 65535, implicit $exec
+ ; UNPACKED: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY7]], [[V_MOV_B32_e32_]], implicit $exec
+ ; UNPACKED: [[V_AND_B32_e64_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY8]], [[V_MOV_B32_e32_]], implicit $exec
+ ; UNPACKED: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 16, implicit $exec
+ ; UNPACKED: [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[V_MOV_B32_e32_1]], [[V_AND_B32_e64_1]], implicit $exec
  ; UNPACKED: [[V_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_AND_B32_e64_]], [[V_LSHLREV_B32_e64_]], implicit $exec
  ; UNPACKED: $vgpr0 = COPY [[V_OR_B32_e64_]]
  ; UNPACKED: SI_RETURN_TO_EPILOG implicit $vgpr0
@@ -107,21 +104,15 @@
  ; UNPACKED: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_D16_XYZW_gfx80_BOTHEN]].sub1
  ; UNPACKED: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_D16_XYZW_gfx80_BOTHEN]].sub2
  ; UNPACKED: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_D16_XYZW_gfx80_BOTHEN]].sub3
- ; UNPACKED: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 65535
- ; UNPACKED: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; UNPACKED: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY7]], [[COPY11]], implicit $exec
- ; UNPACKED: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; UNPACKED: [[V_AND_B32_e64_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY8]], [[COPY12]], implicit $exec
- ; UNPACKED: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 16
- ; UNPACKED: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
- ; UNPACKED: [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[COPY13]], [[V_AND_B32_e64_1]], implicit $exec
+ ; UNPACKED: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 65535, implicit $exec
+ ; UNPACKED: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY7]], [[V_MOV_B32_e32_]], implicit $exec
+ ; UNPACKED: [[V_AND_B32_e64_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY8]], [[V_MOV_B32_e32_]], implicit $exec
+ ; UNPACKED: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 16, implicit $exec
+ ; UNPACKED: [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[V_MOV_B32_e32_1]], [[V_AND_B32_e64_1]], implicit $exec
  ; UNPACKED: [[V_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_AND_B32_e64_]], [[V_LSHLREV_B32_e64_]], implicit $exec
- ; UNPACKED: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; UNPACKED: [[V_AND_B32_e64_2:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY9]], [[COPY14]], implicit $exec
- ; UNPACKED: [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; UNPACKED: [[V_AND_B32_e64_3:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY10]], [[COPY15]], implicit $exec
- ; UNPACKED: [[COPY16:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
- ; UNPACKED: [[V_LSHLREV_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[COPY16]], [[V_AND_B32_e64_3]], implicit $exec
+ ; UNPACKED: [[V_AND_B32_e64_2:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY9]], [[V_MOV_B32_e32_]], implicit $exec
+ ; UNPACKED: [[V_AND_B32_e64_3:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY10]], [[V_MOV_B32_e32_]], implicit $exec
+ ; UNPACKED: [[V_LSHLREV_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[V_MOV_B32_e32_1]], [[V_AND_B32_e64_3]], implicit $exec
  ; UNPACKED: [[V_OR_B32_e64_1:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_AND_B32_e64_2]], [[V_LSHLREV_B32_e64_1]], implicit $exec
  ; UNPACKED: $vgpr0 = COPY [[V_OR_B32_e64_]]
  ; UNPACKED: $vgpr1 = COPY [[V_OR_B32_e64_1]]
@@ -195,21 +186,15 @@
  ; UNPACKED: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_D16_XYZW_gfx80_BOTHEN]].sub1
  ; UNPACKED: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_D16_XYZW_gfx80_BOTHEN]].sub2
  ; UNPACKED: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_D16_XYZW_gfx80_BOTHEN]].sub3
- ; UNPACKED: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 65535
- ; UNPACKED: [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; UNPACKED: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY11]], [[COPY15]], implicit $exec
- ; UNPACKED: [[COPY16:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; UNPACKED: [[V_AND_B32_e64_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY12]], [[COPY16]], implicit $exec
- ; UNPACKED: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 16
- ; UNPACKED: [[COPY17:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
- ; UNPACKED: [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[COPY17]], [[V_AND_B32_e64_1]], implicit $exec
+ ; UNPACKED: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 65535, implicit $exec
+ ; UNPACKED: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY11]], [[V_MOV_B32_e32_]], implicit $exec
+ ; UNPACKED: [[V_AND_B32_e64_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY12]], [[V_MOV_B32_e32_]], implicit $exec
+ ; UNPACKED: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 16, implicit $exec
+ ; UNPACKED: [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[V_MOV_B32_e32_1]], [[V_AND_B32_e64_1]], implicit $exec
  ; UNPACKED: [[V_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_AND_B32_e64_]], [[V_LSHLREV_B32_e64_]], implicit $exec
- ; UNPACKED: [[COPY18:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; UNPACKED: [[V_AND_B32_e64_2:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY13]], [[COPY18]], implicit $exec
- ; UNPACKED: [[COPY19:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; UNPACKED: [[V_AND_B32_e64_3:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY14]], [[COPY19]], implicit $exec
- ; UNPACKED: [[COPY20:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
- ; UNPACKED: [[V_LSHLREV_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[COPY20]], [[V_AND_B32_e64_3]], implicit $exec
+ ; UNPACKED: [[V_AND_B32_e64_2:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY13]], [[V_MOV_B32_e32_]], implicit $exec
+ ; UNPACKED: [[V_AND_B32_e64_3:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY14]], [[V_MOV_B32_e32_]], implicit $exec
+ ; UNPACKED: [[V_LSHLREV_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[V_MOV_B32_e32_1]], [[V_AND_B32_e64_3]], implicit $exec
  ; UNPACKED: [[V_OR_B32_e64_1:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_AND_B32_e64_2]], [[V_LSHLREV_B32_e64_1]], implicit $exec
  ; UNPACKED: $vgpr0 = COPY [[V_OR_B32_e64_]]
  ; UNPACKED: $vgpr1 = COPY [[V_OR_B32_e64_1]]
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.ll
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.ll
@@ -113,9 +113,8 @@
  ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
  ; CHECK: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
  ; CHECK: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
- ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
- ; CHECK: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY4]], %subreg.sub1
+ ; CHECK: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ ; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[COPY4]], %subreg.sub1
  ; CHECK: [[BUFFER_LOAD_DWORD_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
  ; CHECK: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_BOTHEN]]
  ; CHECK: SI_RETURN_TO_EPILOG implicit $vgpr0
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.store.format.f16.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.store.format.f16.ll
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.store.format.f16.ll
@@ -50,9 +50,8 @@
  ; UNPACKED: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
  ; UNPACKED: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
  ; UNPACKED: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6
- ; UNPACKED: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 16
- ; UNPACKED: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; UNPACKED: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[COPY8]], [[COPY]], implicit $exec
+ ; UNPACKED: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 16, implicit $exec
+ ; UNPACKED: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[V_MOV_B32_e32_]], [[COPY]], implicit $exec
  ; UNPACKED: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[V_LSHRREV_B32_e64_]], %subreg.sub1
  ; UNPACKED: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY6]], %subreg.sub1
  ; UNPACKED: BUFFER_STORE_FORMAT_D16_XY_gfx80_BOTHEN_exact [[REG_SEQUENCE1]], [[REG_SEQUENCE2]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, 0, implicit $exec :: (dereferenceable store (<2 x s16>), align 1, addrspace 4)
@@ -96,11 +95,9 @@
  ; UNPACKED: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
  ; UNPACKED: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
  ; UNPACKED: [[COPY8:%[0-9]+]]:sreg_32 = COPY $sgpr6
- ; UNPACKED: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 16
- ; UNPACKED: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; UNPACKED: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[COPY9]], [[COPY]], implicit $exec
- ; UNPACKED: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; UNPACKED: [[V_LSHRREV_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[COPY10]], [[COPY1]], implicit $exec
+ ; UNPACKED: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 16, implicit $exec
+ ; UNPACKED: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[V_MOV_B32_e32_]], [[COPY]], implicit $exec
+ ; UNPACKED: [[V_LSHRREV_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[V_MOV_B32_e32_]], [[COPY1]], implicit $exec
  ; UNPACKED: [[REG_SEQUENCE1:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[V_LSHRREV_B32_e64_]], %subreg.sub1, [[COPY1]], %subreg.sub2, [[V_LSHRREV_B32_e64_1]], %subreg.sub3
  ; UNPACKED: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY7]], %subreg.sub1
  ; UNPACKED: BUFFER_STORE_FORMAT_D16_XYZW_gfx80_BOTHEN_exact [[REG_SEQUENCE1]], [[REG_SEQUENCE2]], [[REG_SEQUENCE]], [[COPY8]], 0, 0, 0, 0, implicit $exec :: (dereferenceable store (<4 x s16>), align 1, addrspace 4)
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.tbuffer.load.f16.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.tbuffer.load.f16.ll
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.tbuffer.load.f16.ll
@@ -69,14 +69,11 @@
  ; UNPACKED: [[TBUFFER_LOAD_FORMAT_D16_XY_gfx80_BOTHEN:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_D16_XY_gfx80_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 0, 0, 0, implicit $exec :: (dereferenceable load (<2 x s16>), align 1, addrspace 4)
  ; UNPACKED: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_D16_XY_gfx80_BOTHEN]].sub0
  ; UNPACKED: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_D16_XY_gfx80_BOTHEN]].sub1
- ; UNPACKED: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 65535
- ; UNPACKED: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; UNPACKED: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY7]], [[COPY9]], implicit $exec
- ; UNPACKED: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; UNPACKED: [[V_AND_B32_e64_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY8]], [[COPY10]], implicit $exec
- ; UNPACKED: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 16
- ; UNPACKED: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
- ; UNPACKED: [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[COPY11]], [[V_AND_B32_e64_1]], implicit $exec
+ ; UNPACKED: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 65535, implicit $exec
+ ; UNPACKED: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY7]], [[V_MOV_B32_e32_]], implicit $exec
+ ; UNPACKED: [[V_AND_B32_e64_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY8]], [[V_MOV_B32_e32_]], implicit $exec
+ ; UNPACKED: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 16, implicit $exec
+ ; UNPACKED: [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[V_MOV_B32_e32_1]], [[V_AND_B32_e64_1]], implicit $exec
  ; UNPACKED: [[V_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_AND_B32_e64_]], [[V_LSHLREV_B32_e64_]], implicit $exec
  ; UNPACKED: $vgpr0 = COPY [[V_OR_B32_e64_]]
  ; UNPACKED: SI_RETURN_TO_EPILOG implicit $vgpr0
@@ -126,21 +123,15 @@
  ; UNPACKED: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_D16_XYZW_gfx80_BOTHEN]].sub1
  ; UNPACKED: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_D16_XYZW_gfx80_BOTHEN]].sub2
  ; UNPACKED: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_D16_XYZW_gfx80_BOTHEN]].sub3
- ; UNPACKED: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 65535
- ; UNPACKED: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; UNPACKED: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY7]], [[COPY11]], implicit $exec
- ; UNPACKED: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; UNPACKED: [[V_AND_B32_e64_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY8]], [[COPY12]], implicit $exec
- ; UNPACKED: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 16
- ; UNPACKED: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
- ; UNPACKED: [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[COPY13]], [[V_AND_B32_e64_1]], implicit $exec
+ ; UNPACKED: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 65535, implicit $exec
+ ; UNPACKED: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY7]], [[V_MOV_B32_e32_]], implicit $exec
+ ; UNPACKED: [[V_AND_B32_e64_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY8]], [[V_MOV_B32_e32_]], implicit $exec
+ ; UNPACKED: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 16, implicit $exec
+ ; UNPACKED: [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[V_MOV_B32_e32_1]], [[V_AND_B32_e64_1]], implicit $exec
  ; UNPACKED: [[V_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_AND_B32_e64_]], [[V_LSHLREV_B32_e64_]], implicit $exec
- ; UNPACKED: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; UNPACKED: [[V_AND_B32_e64_2:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY9]], [[COPY14]], implicit $exec
- ; UNPACKED: [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; UNPACKED: [[V_AND_B32_e64_3:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY10]], [[COPY15]], implicit $exec
- ; UNPACKED: [[COPY16:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
- ; UNPACKED: [[V_LSHLREV_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[COPY16]], [[V_AND_B32_e64_3]], implicit $exec
+ ; UNPACKED: [[V_AND_B32_e64_2:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY9]], [[V_MOV_B32_e32_]], implicit $exec
+ ; UNPACKED: [[V_AND_B32_e64_3:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY10]], [[V_MOV_B32_e32_]], implicit $exec
+ ; UNPACKED: [[V_LSHLREV_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[V_MOV_B32_e32_1]], [[V_AND_B32_e64_3]], implicit $exec
  ; UNPACKED: [[V_OR_B32_e64_1:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_AND_B32_e64_2]], [[V_LSHLREV_B32_e64_1]], implicit $exec
  ; UNPACKED: $vgpr0 = COPY [[V_OR_B32_e64_]]
  ; UNPACKED: $vgpr1 = COPY [[V_OR_B32_e64_1]]
@@ -160,9 +151,8 @@
  ; PACKED: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
  ; PACKED: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
  ; PACKED: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
- ; PACKED: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
- ; PACKED: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; PACKED: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY4]], %subreg.sub1
+ ; PACKED: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ ; PACKED: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[COPY4]], %subreg.sub1
  ; PACKED: [[TBUFFER_LOAD_FORMAT_D16_X_BOTHEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_D16_X_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY5]], 0, 78, 0, 0, 0, implicit $exec :: (dereferenceable load (s16), align 1, addrspace 4)
  ; PACKED: $vgpr0 = COPY [[TBUFFER_LOAD_FORMAT_D16_X_BOTHEN]]
  ; PACKED: SI_RETURN_TO_EPILOG implicit $vgpr0
@@ -176,9 +166,8 @@
  ; UNPACKED: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
  ; UNPACKED: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
  ; UNPACKED: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
- ; UNPACKED: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
- ; UNPACKED: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; UNPACKED: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY4]], %subreg.sub1
+ ; UNPACKED: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ ; UNPACKED: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[COPY4]], %subreg.sub1
  ; UNPACKED: [[TBUFFER_LOAD_FORMAT_D16_X_gfx80_BOTHEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_D16_X_gfx80_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY5]], 0, 78, 0, 0, 0, implicit $exec :: (dereferenceable load (s16), align 1, addrspace 4)
  ; UNPACKED: $vgpr0 = COPY [[TBUFFER_LOAD_FORMAT_D16_X_gfx80_BOTHEN]]
  ; UNPACKED: SI_RETURN_TO_EPILOG implicit $vgpr0
@@ -278,21 +267,15 @@
  ; UNPACKED: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_D16_XYZW_gfx80_BOTHEN]].sub1
  ; UNPACKED: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_D16_XYZW_gfx80_BOTHEN]].sub2
  ; UNPACKED: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_D16_XYZW_gfx80_BOTHEN]].sub3
- ; UNPACKED: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 65535
- ; UNPACKED: [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; UNPACKED: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY11]], [[COPY15]], implicit $exec
- ; UNPACKED: [[COPY16:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; UNPACKED: [[V_AND_B32_e64_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY12]], [[COPY16]], implicit $exec
- ; UNPACKED: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 16
- ; UNPACKED: [[COPY17:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
- ; UNPACKED: [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[COPY17]], [[V_AND_B32_e64_1]], implicit $exec
+ ; UNPACKED: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 65535, implicit $exec
+ ; UNPACKED: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY11]], [[V_MOV_B32_e32_]], implicit $exec
+ ; UNPACKED: [[V_AND_B32_e64_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY12]], [[V_MOV_B32_e32_]], implicit $exec
+ ; UNPACKED: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 16, implicit $exec
+ ; UNPACKED: [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[V_MOV_B32_e32_1]], [[V_AND_B32_e64_1]], implicit $exec
  ; UNPACKED: [[V_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_AND_B32_e64_]], [[V_LSHLREV_B32_e64_]], implicit $exec
- ; UNPACKED: [[COPY18:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; UNPACKED: [[V_AND_B32_e64_2:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY13]], [[COPY18]], implicit $exec
- ; UNPACKED: [[COPY19:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; UNPACKED: [[V_AND_B32_e64_3:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY14]], [[COPY19]], implicit $exec
- ; UNPACKED: [[COPY20:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
- ; UNPACKED: [[V_LSHLREV_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[COPY20]], [[V_AND_B32_e64_3]], implicit $exec
+ ; UNPACKED: [[V_AND_B32_e64_2:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY13]], [[V_MOV_B32_e32_]], implicit $exec
+ ; UNPACKED: [[V_AND_B32_e64_3:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY14]], [[V_MOV_B32_e32_]], implicit $exec
+ ; UNPACKED: [[V_LSHLREV_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[V_MOV_B32_e32_1]], [[V_AND_B32_e64_3]], implicit $exec
  ; UNPACKED: [[V_OR_B32_e64_1:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_AND_B32_e64_2]], [[V_LSHLREV_B32_e64_1]], implicit $exec
  ; UNPACKED: $vgpr0 = COPY [[V_OR_B32_e64_]]
  ; UNPACKED: $vgpr1 = COPY [[V_OR_B32_e64_1]]
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.tbuffer.load.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.tbuffer.load.ll
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.tbuffer.load.ll
@@ -108,9 +108,8 @@
  ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
  ; CHECK: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
  ; CHECK: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
- ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
- ; CHECK: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY4]], %subreg.sub1
+ ; CHECK: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ ; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[COPY4]], %subreg.sub1
  ; CHECK: [[TBUFFER_LOAD_FORMAT_X_BOTHEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY5]], 0, 78, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
  ; CHECK: $vgpr0 = COPY [[TBUFFER_LOAD_FORMAT_X_BOTHEN]]
  ; CHECK: SI_RETURN_TO_EPILOG implicit $vgpr0
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot4.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot4.ll
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot4.ll
@@ -42,19 +42,19 @@
 ; GFX906-LABEL: v_udot4_cast_v4i8:
 ; GFX906: ; %bb.0:
 ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX906-NEXT: s_mov_b32 s5, 8
-; GFX906-NEXT: s_movk_i32 s4, 0xff
-; GFX906-NEXT: v_lshlrev_b32_sdwa v1, s5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX906-NEXT: v_and_or_b32 v0, v0, s4, v1
-; GFX906-NEXT: v_and_b32_e32 v1, s4, v2
-; GFX906-NEXT: v_and_b32_e32 v2, s4, v3
+; GFX906-NEXT: v_mov_b32_e32 v10, 8
+; GFX906-NEXT: v_mov_b32_e32 v9, 0xff
+; GFX906-NEXT: v_lshlrev_b32_sdwa v1, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT: v_and_or_b32 v0, v0, v9, v1
+; GFX906-NEXT: v_and_b32_e32 v1, v2, v9
+; GFX906-NEXT: v_and_b32_e32 v2, v3, v9
 ; GFX906-NEXT: v_lshlrev_b32_e32 v1, 16, v1
 ; GFX906-NEXT: v_lshlrev_b32_e32 v2, 24, v2
 ; GFX906-NEXT: v_or3_b32 v0, v0, v1, v2
-; GFX906-NEXT: v_lshlrev_b32_sdwa v1, s5, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX906-NEXT: v_and_b32_e32 v2, s4, v6
-; GFX906-NEXT: v_and_b32_e32 v3, s4, v7
-; GFX906-NEXT: v_and_or_b32 v1, v4, s4, v1
+; GFX906-NEXT: v_lshlrev_b32_sdwa v1, v10, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT: v_and_b32_e32 v2, v6, v9
+; GFX906-NEXT: v_and_b32_e32 v3, v7, v9
+; GFX906-NEXT: v_and_or_b32 v1, v4, v9, v1
 ; GFX906-NEXT: v_lshlrev_b32_e32 v2, 16, v2
 ; GFX906-NEXT: v_lshlrev_b32_e32 v3, 24, v3
 ; GFX906-NEXT: v_or3_b32 v1, v1, v2, v3
@@ -65,18 +65,18 @@
 ; GFX10: ; %bb.0:
 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: s_mov_b32 s4, 8
-; GFX10-NEXT: s_movk_i32 s5, 0xff
-; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX10-NEXT: v_and_or_b32 v0, v0, s5, v1
-; GFX10-NEXT: v_and_b32_e32 v1, s5, v2
-; GFX10-NEXT: v_and_b32_e32 v2, s5, v3
-; GFX10-NEXT: v_lshlrev_b32_sdwa v3, s4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX10-NEXT: v_and_b32_e32 v5, s5, v6
-; GFX10-NEXT: v_and_b32_e32 v6, s5, v7
+; GFX10-NEXT: v_mov_b32_e32 v9, 8
+; GFX10-NEXT: v_mov_b32_e32 v10, 0xff
+; GFX10-NEXT: v_lshlrev_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX10-NEXT: v_and_or_b32 v0, v0, v10, v1
+; GFX10-NEXT: v_and_b32_e32 v1, v2, v10
+; GFX10-NEXT: v_and_b32_e32 v2, v3, v10
+; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v9, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX10-NEXT: v_and_b32_e32 v5, v6, v10
+; GFX10-NEXT: v_and_b32_e32 v6, v7, v10
 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 24, v2
-; GFX10-NEXT: v_and_or_b32 v3, v4, s5, v3
+; GFX10-NEXT: v_and_or_b32 v3, v4, v10, v3
 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v5
 ; GFX10-NEXT: v_lshlrev_b32_e32 v5, 24, v6
 ; GFX10-NEXT: v_or3_b32 v0, v0, v1, v2
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wqm.demote.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wqm.demote.ll
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wqm.demote.ll
@@ -895,11 +895,11 @@
 ; SI-NEXT: s_movk_i32 s2, 0x3c00
 ; SI-NEXT: s_bfe_u32 s3, 0, 0x100000
 ; SI-NEXT: s_bfe_u32 s2, s2, 0x100000
-; SI-NEXT: s_lshl_b32 s4, s3, 16
-; SI-NEXT: s_or_b32 s6, s2, s4
+; SI-NEXT: s_lshl_b32 s5, s3, 16
+; SI-NEXT: s_or_b32 s6, s2, s5
 ; SI-NEXT: s_lshl_b32 s2, s2, 16
-; SI-NEXT: s_or_b32 s7, s3, s2
 ; SI-NEXT: s_mov_b32 s4, 0
+; SI-NEXT: s_or_b32 s7, s3, s2
 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
 ; SI-NEXT: s_and_saveexec_b64 s[2:3], vcc
 ; SI-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
@@ -965,7 +965,6 @@
 ; GFX9-NEXT: s_mov_b64 s[0:1], exec
 ; GFX9-NEXT: s_wqm_b64 exec, exec
 ; GFX9-NEXT: v_cvt_i32_f32_e32 v0, v0
-; GFX9-NEXT: s_mov_b32 s4, 0
 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
 ; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc
 ; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
@@ -974,10 +973,11 @@
 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], exec
 ; GFX9-NEXT: s_cbranch_scc0 BB7_9
 ; GFX9-NEXT: ; %bb.2: ; %.demote0
-; GFX9-NEXT: s_wqm_b64 s[6:7], s[0:1]
-; GFX9-NEXT: s_and_b64 exec, exec, s[6:7]
+; GFX9-NEXT: s_wqm_b64 s[4:5], s[0:1]
+; GFX9-NEXT: s_and_b64 exec, exec, s[4:5]
 ; GFX9-NEXT: BB7_3: ; %.continue0.preheader
 ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX9-NEXT: s_mov_b32 s4, 0
 ; GFX9-NEXT: s_mov_b64 s[2:3], 0
 ; GFX9-NEXT: v_mov_b32_e32 v0, s4
 ; GFX9-NEXT: s_branch BB7_5
@@ -1031,19 +1031,19 @@
 ; GFX10-32-NEXT: s_mov_b32 s0, exec_lo
 ; GFX10-32-NEXT: s_wqm_b32 exec_lo, exec_lo
 ; GFX10-32-NEXT: v_cvt_i32_f32_e32 v0, v0
-; GFX10-32-NEXT: s_mov_b32 s1, 0
 ; GFX10-32-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX10-32-NEXT: s_and_saveexec_b32 s2, vcc_lo
-; GFX10-32-NEXT: s_xor_b32 s2, exec_lo, s2
+; GFX10-32-NEXT: s_and_saveexec_b32 s1, vcc_lo
+; GFX10-32-NEXT: s_xor_b32 s1, exec_lo, s1
 ; GFX10-32-NEXT: s_cbranch_execz BB7_3
 ; GFX10-32-NEXT: ; %bb.1: ; %.demote0
 ; GFX10-32-NEXT: s_andn2_b32 s0, s0, exec_lo
 ; GFX10-32-NEXT: s_cbranch_scc0 BB7_9
 ; GFX10-32-NEXT: ; %bb.2: ; %.demote0
-; GFX10-32-NEXT: s_wqm_b32 s3, s0
-; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s3
+; GFX10-32-NEXT: s_wqm_b32 s2, s0
+; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s2
 ; GFX10-32-NEXT: BB7_3: ; %.continue0.preheader
-; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s2
+; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX10-32-NEXT: s_mov_b32 s1, 0
 ; GFX10-32-NEXT: v_mov_b32_e32 v0, s1
 ; GFX10-32-NEXT: s_branch BB7_5
 ; GFX10-32-NEXT: BB7_4: ; %.continue1
@@ -1094,7 +1094,6 @@
 ; GFX10-64-NEXT: s_mov_b64 s[0:1], exec
 ; GFX10-64-NEXT: s_wqm_b64 exec, exec
 ; GFX10-64-NEXT: v_cvt_i32_f32_e32 v0, v0
-; GFX10-64-NEXT: s_mov_b32 s4, 0
 ; GFX10-64-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
 ; GFX10-64-NEXT: s_and_saveexec_b64 s[2:3], vcc
 ; GFX10-64-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
@@ -1103,11 +1102,12 @@
 ; GFX10-64-NEXT: s_andn2_b64 s[0:1], s[0:1], exec
 ; GFX10-64-NEXT: s_cbranch_scc0 BB7_9
 ; GFX10-64-NEXT: ; %bb.2: ; %.demote0
-; GFX10-64-NEXT: s_wqm_b64 s[6:7], s[0:1]
-; GFX10-64-NEXT: s_and_b64 exec, exec, s[6:7]
+; GFX10-64-NEXT: s_wqm_b64 s[4:5], s[0:1]
+; GFX10-64-NEXT: s_and_b64 exec, exec, s[4:5]
 ; GFX10-64-NEXT: BB7_3: ; %.continue0.preheader
 ; GFX10-64-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX10-64-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-64-NEXT: s_mov_b32 s2, 0
+; GFX10-64-NEXT: v_mov_b32_e32 v0, s2
 ; GFX10-64-NEXT: s_mov_b64 s[2:3], 0
 ; GFX10-64-NEXT: s_branch BB7_5
 ; GFX10-64-NEXT: BB7_4: ; %.continue1
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll
@@ -8,9 +8,9 @@
 ; GFX6-LABEL: v_lshr_i8:
 ; GFX6: ; %bb.0:
 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT: s_movk_i32 s4, 0xff
-; GFX6-NEXT: v_and_b32_e32 v1, s4, v1
-; GFX6-NEXT: v_and_b32_e32 v0, s4, v0
+; GFX6-NEXT: v_mov_b32_e32 v2, 0xff
+; GFX6-NEXT: v_and_b32_e32 v1, v1, v2
+; GFX6-NEXT: v_and_b32_e32 v0, v0, v2
 ; GFX6-NEXT: v_lshrrev_b32_e32 v0, v1, v0
 ; GFX6-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -30,8 +30,9 @@
 ; GFX10: ; %bb.0:
 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX10-NEXT: v_and_b32_e32 v0, 0xff, v0
+; GFX10-NEXT: v_mov_b32_e32 v2, 0xff
+; GFX10-NEXT: v_and_b32_e32 v1, v1, v2
+; GFX10-NEXT: v_and_b32_e32 v0, v0, v2
 ; GFX10-NEXT: v_lshrrev_b16 v0, v1, v0
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
 %result = lshr i8 %value, %amount
@@ -125,9 +126,9 @@
 ; GCN-LABEL: v_lshr_i24:
 ; GCN: ; %bb.0:
 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: s_mov_b32 s4, 0xffffff
-; GCN-NEXT: v_and_b32_e32 v1, s4, v1
-; GCN-NEXT: v_and_b32_e32 v0, s4, v0
+; GCN-NEXT: v_mov_b32_e32 v2, 0xffffff
+; GCN-NEXT: v_and_b32_e32 v1, v1, v2
+; GCN-NEXT: v_and_b32_e32 v0, v0, v2
 ; GCN-NEXT: v_lshrrev_b32_e32 v0, v1, v0
 ; GCN-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -135,9 +136,9 @@
 ; GFX10: ; %bb.0:
 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: s_mov_b32 s4, 0xffffff
-; GFX10-NEXT: v_and_b32_e32 v1, s4, v1
-; GFX10-NEXT: v_and_b32_e32 v0, s4, v0
+; GFX10-NEXT: v_mov_b32_e32 v2, 0xffffff
+; GFX10-NEXT: v_and_b32_e32 v1, v1, v2
+; GFX10-NEXT: v_and_b32_e32 v0, v0, v2
 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, v1, v0
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
 %result = lshr i24 %value, %amount
@@ -575,9 +576,9 @@
 ; GFX6-LABEL: v_lshr_i16:
 ; GFX6: ; %bb.0:
 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT: s_mov_b32 s4, 0xffff
-; GFX6-NEXT: v_and_b32_e32 v1, s4, v1
-; GFX6-NEXT: v_and_b32_e32 v0, s4, v0
+; GFX6-NEXT: v_mov_b32_e32 v2, 0xffff
+; GFX6-NEXT: v_and_b32_e32 v1, v1, v2
+; GFX6-NEXT: v_and_b32_e32 v0, v0, v2
 ; GFX6-NEXT: v_lshrrev_b32_e32 v0, v1, v0
 ; GFX6-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -728,12 +729,12 @@
 ; GFX6-LABEL: v_lshr_v2i16:
 ; GFX6: ; %bb.0:
 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT: s_mov_b32 s4, 0xffff
-; GFX6-NEXT: v_and_b32_e32 v2, s4, v2
-; GFX6-NEXT: v_and_b32_e32 v0, s4, v0
+; GFX6-NEXT: v_mov_b32_e32 v4, 0xffff
+; GFX6-NEXT: v_and_b32_e32 v2, v2, v4
+; GFX6-NEXT: v_and_b32_e32 v0, v0, v4
 ; GFX6-NEXT: v_lshrrev_b32_e32 v0, v2, v0
-; GFX6-NEXT: v_and_b32_e32 v2, s4, v3
-; GFX6-NEXT: v_and_b32_e32 v1, s4, v1
+; GFX6-NEXT: v_and_b32_e32 v2, v3, v4
+; GFX6-NEXT: v_and_b32_e32 v1, v1, v4
 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, v2, v1
 ; GFX6-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -939,18 +940,18 @@
 ; GFX6-LABEL: v_lshr_v4i16:
 ; GFX6: ; %bb.0:
 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-;
GFX6-NEXT: s_mov_b32 s4, 0xffff -; GFX6-NEXT: v_and_b32_e32 v4, s4, v4 -; GFX6-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX6-NEXT: v_mov_b32_e32 v8, 0xffff +; GFX6-NEXT: v_and_b32_e32 v4, v4, v8 +; GFX6-NEXT: v_and_b32_e32 v0, v0, v8 ; GFX6-NEXT: v_lshrrev_b32_e32 v0, v4, v0 -; GFX6-NEXT: v_and_b32_e32 v4, s4, v5 -; GFX6-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX6-NEXT: v_and_b32_e32 v4, v5, v8 +; GFX6-NEXT: v_and_b32_e32 v1, v1, v8 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, v4, v1 -; GFX6-NEXT: v_and_b32_e32 v4, s4, v6 -; GFX6-NEXT: v_and_b32_e32 v2, s4, v2 +; GFX6-NEXT: v_and_b32_e32 v4, v6, v8 +; GFX6-NEXT: v_and_b32_e32 v2, v2, v8 ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v4, v2 -; GFX6-NEXT: v_and_b32_e32 v4, s4, v7 -; GFX6-NEXT: v_and_b32_e32 v3, s4, v3 +; GFX6-NEXT: v_and_b32_e32 v4, v7, v8 +; GFX6-NEXT: v_and_b32_e32 v3, v3, v8 ; GFX6-NEXT: v_lshrrev_b32_e32 v3, v4, v3 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 @@ -1102,28 +1103,27 @@ ; GFX6-LABEL: v_lshr_v8i16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s4, 0xffff -; GFX6-NEXT: v_and_b32_e32 v8, s4, v8 -; GFX6-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX6-NEXT: v_mov_b32_e32 v16, 0xffff +; GFX6-NEXT: v_and_b32_e32 v8, v8, v16 +; GFX6-NEXT: v_and_b32_e32 v0, v0, v16 ; GFX6-NEXT: v_lshrrev_b32_e32 v0, v8, v0 -; GFX6-NEXT: v_and_b32_e32 v8, s4, v9 -; GFX6-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX6-NEXT: v_and_b32_e32 v8, v9, v16 +; GFX6-NEXT: v_and_b32_e32 v1, v1, v16 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, v8, v1 -; GFX6-NEXT: v_and_b32_e32 v8, s4, v10 -; GFX6-NEXT: v_and_b32_e32 v2, s4, v2 +; GFX6-NEXT: v_and_b32_e32 v8, v10, v16 +; GFX6-NEXT: v_and_b32_e32 v2, v2, v16 ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v8, v2 -; GFX6-NEXT: v_and_b32_e32 v8, s4, v11 -; GFX6-NEXT: v_and_b32_e32 v3, s4, v3 +; GFX6-NEXT: v_and_b32_e32 v8, v11, v16 +; GFX6-NEXT: v_and_b32_e32 v3, v3, v16 ; GFX6-NEXT: v_lshrrev_b32_e32 v3, v8, v3 -; GFX6-NEXT: v_and_b32_e32 v8, s4, v12 -; GFX6-NEXT: v_and_b32_e32 v4, s4, v4 +; GFX6-NEXT: v_and_b32_e32 v8, v12, v16 +; GFX6-NEXT: v_and_b32_e32 v4, v4, v16 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, v8, v4 -; GFX6-NEXT: v_and_b32_e32 v8, s4, v13 -; GFX6-NEXT: v_and_b32_e32 v5, s4, v5 -; GFX6-NEXT: v_mov_b32_e32 v16, 0xffff +; GFX6-NEXT: v_and_b32_e32 v8, v13, v16 +; GFX6-NEXT: v_and_b32_e32 v5, v5, v16 ; GFX6-NEXT: v_lshrrev_b32_e32 v5, v8, v5 -; GFX6-NEXT: v_and_b32_e32 v8, s4, v14 -; GFX6-NEXT: v_and_b32_e32 v6, s4, v6 +; GFX6-NEXT: v_and_b32_e32 v8, v14, v16 +; GFX6-NEXT: v_and_b32_e32 v6, v6, v16 ; GFX6-NEXT: v_lshrrev_b32_e32 v6, v8, v6 ; GFX6-NEXT: v_and_b32_e32 v8, v15, v16 ; GFX6-NEXT: v_and_b32_e32 v7, v7, v16 Index: llvm/test/CodeGen/AMDGPU/GlobalISel/mubuf-global.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/mubuf-global.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/mubuf-global.ll @@ -97,8 +97,8 @@ ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_mov_b32 s0, s2 ; GFX6-NEXT: s_mov_b32 s1, s3 -; GFX6-NEXT: s_mov_b32 s2, 0 ; GFX6-NEXT: v_mov_b32_e32 v2, 0 +; GFX6-NEXT: s_mov_b32 s2, 0 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 @@ -111,8 +111,8 @@ ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_mov_b32 s0, s2 ; GFX7-NEXT: s_mov_b32 s1, s3 -; GFX7-NEXT: s_mov_b32 s2, 0 ; GFX7-NEXT: v_mov_b32_e32 v2, 0 +; GFX7-NEXT: s_mov_b32 s2, 0 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: buffer_store_dword v2, 
v[0:1], s[0:3], 0 addr64 @@ -152,8 +152,8 @@ define amdgpu_ps void @mubuf_store_vgpr_ptr_offset4095(i32 addrspace(1)* %ptr) { ; GFX6-LABEL: mubuf_store_vgpr_ptr_offset4095: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_mov_b32 s2, 0 ; GFX6-NEXT: v_mov_b32_e32 v2, 0 +; GFX6-NEXT: s_mov_b32 s2, 0 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b64 s[0:1], 0 ; GFX6-NEXT: s_movk_i32 s4, 0x3ffc @@ -162,8 +162,8 @@ ; ; GFX7-LABEL: mubuf_store_vgpr_ptr_offset4095: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_mov_b32 s2, 0 ; GFX7-NEXT: v_mov_b32_e32 v2, 0 +; GFX7-NEXT: s_mov_b32 s2, 0 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b64 s[0:1], 0 ; GFX7-NEXT: s_movk_i32 s4, 0x3ffc @@ -204,8 +204,8 @@ ; GFX6: ; %bb.0: ; GFX6-NEXT: s_mov_b32 s0, 4 ; GFX6-NEXT: s_mov_b32 s1, s0 -; GFX6-NEXT: s_mov_b32 s2, 0 ; GFX6-NEXT: v_mov_b32_e32 v2, 0 +; GFX6-NEXT: s_mov_b32 s2, 0 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 ; GFX6-NEXT: s_endpgm @@ -214,8 +214,8 @@ ; GFX7: ; %bb.0: ; GFX7-NEXT: s_mov_b32 s0, 4 ; GFX7-NEXT: s_mov_b32 s1, s0 -; GFX7-NEXT: s_mov_b32 s2, 0 ; GFX7-NEXT: v_mov_b32_e32 v2, 0 +; GFX7-NEXT: s_mov_b32 s2, 0 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 ; GFX7-NEXT: s_endpgm @@ -227,8 +227,8 @@ define amdgpu_ps void @mubuf_store_vgpr_ptr_offset4096(i32 addrspace(1)* %ptr) { ; GFX6-LABEL: mubuf_store_vgpr_ptr_offset4096: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_mov_b32 s2, 0 ; GFX6-NEXT: v_mov_b32_e32 v2, 0 +; GFX6-NEXT: s_mov_b32 s2, 0 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b64 s[0:1], 0 ; GFX6-NEXT: s_movk_i32 s4, 0x4000 @@ -237,8 +237,8 @@ ; ; GFX7-LABEL: mubuf_store_vgpr_ptr_offset4096: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_mov_b32 s2, 0 ; GFX7-NEXT: v_mov_b32_e32 v2, 0 +; GFX7-NEXT: s_mov_b32 s2, 0 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b64 s[0:1], 0 ; GFX7-NEXT: s_movk_i32 s4, 0x4000 @@ -257,8 +257,8 @@ ; GFX6-NEXT: s_bfe_i64 s[2:3], s[4:5], 0x200000 ; GFX6-NEXT: s_lshl_b64 s[4:5], s[2:3], 2 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: s_mov_b32 s2, 0 ; GFX6-NEXT: v_mov_b32_e32 v2, 0 +; GFX6-NEXT: s_mov_b32 s2, 0 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 @@ -271,8 +271,8 @@ ; GFX7-NEXT: s_bfe_i64 s[2:3], s[4:5], 0x200000 ; GFX7-NEXT: s_lshl_b64 s[4:5], s[2:3], 2 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: s_mov_b32 s2, 0 ; GFX7-NEXT: v_mov_b32_e32 v2, 0 +; GFX7-NEXT: s_mov_b32 s2, 0 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 @@ -287,8 +287,8 @@ ; GFX6: ; %bb.0: ; GFX6-NEXT: s_bfe_i64 s[0:1], s[2:3], 0x200000 ; GFX6-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GFX6-NEXT: s_mov_b32 s2, 0 ; GFX6-NEXT: v_mov_b32_e32 v2, 0 +; GFX6-NEXT: s_mov_b32 s2, 0 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 ; GFX6-NEXT: s_endpgm @@ -297,8 +297,8 @@ ; GFX7: ; %bb.0: ; GFX7-NEXT: s_bfe_i64 s[0:1], s[2:3], 0x200000 ; GFX7-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GFX7-NEXT: s_mov_b32 s2, 0 ; GFX7-NEXT: v_mov_b32_e32 v2, 0 +; GFX7-NEXT: s_mov_b32 s2, 0 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 ; GFX7-NEXT: s_endpgm @@ -312,8 +312,8 @@ ; GFX6: ; %bb.0: ; GFX6-NEXT: s_bfe_i64 s[0:1], s[2:3], 0x200000 ; GFX6-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GFX6-NEXT: s_mov_b32 s2, 0 ; GFX6-NEXT: v_mov_b32_e32 v2, 0 +; GFX6-NEXT: s_mov_b32 s2, 0 ; 
GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 offset:1024 ; GFX6-NEXT: s_endpgm @@ -322,8 +322,8 @@ ; GFX7: ; %bb.0: ; GFX7-NEXT: s_bfe_i64 s[0:1], s[2:3], 0x200000 ; GFX7-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GFX7-NEXT: s_mov_b32 s2, 0 ; GFX7-NEXT: v_mov_b32_e32 v2, 0 +; GFX7-NEXT: s_mov_b32 s2, 0 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 offset:1024 ; GFX7-NEXT: s_endpgm @@ -338,8 +338,8 @@ ; GFX6: ; %bb.0: ; GFX6-NEXT: s_bfe_i64 s[0:1], s[2:3], 0x200000 ; GFX6-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GFX6-NEXT: s_mov_b32 s2, 0 ; GFX6-NEXT: v_mov_b32_e32 v2, 0 +; GFX6-NEXT: s_mov_b32 s2, 0 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 offset:1024 ; GFX6-NEXT: s_endpgm @@ -348,8 +348,8 @@ ; GFX7: ; %bb.0: ; GFX7-NEXT: s_bfe_i64 s[0:1], s[2:3], 0x200000 ; GFX7-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GFX7-NEXT: s_mov_b32 s2, 0 ; GFX7-NEXT: v_mov_b32_e32 v2, 0 +; GFX7-NEXT: s_mov_b32 s2, 0 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 offset:1024 ; GFX7-NEXT: s_endpgm @@ -366,8 +366,8 @@ ; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], 2 ; GFX6-NEXT: s_mov_b32 s0, s2 ; GFX6-NEXT: s_mov_b32 s1, s3 -; GFX6-NEXT: s_mov_b32 s2, 0 ; GFX6-NEXT: v_mov_b32_e32 v2, 0 +; GFX6-NEXT: s_mov_b32 s2, 0 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 ; GFX6-NEXT: s_endpgm @@ -378,8 +378,8 @@ ; GFX7-NEXT: v_lshl_b64 v[0:1], v[0:1], 2 ; GFX7-NEXT: s_mov_b32 s0, s2 ; GFX7-NEXT: s_mov_b32 s1, s3 -; GFX7-NEXT: s_mov_b32 s2, 0 ; GFX7-NEXT: v_mov_b32_e32 v2, 0 +; GFX7-NEXT: s_mov_b32 s2, 0 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 ; GFX7-NEXT: s_endpgm @@ -395,8 +395,8 @@ ; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], 2 ; GFX6-NEXT: s_mov_b32 s0, s2 ; GFX6-NEXT: s_mov_b32 s1, s3 -; GFX6-NEXT: s_mov_b32 s2, 0 ; GFX6-NEXT: v_mov_b32_e32 v2, 0 +; GFX6-NEXT: s_mov_b32 s2, 0 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_movk_i32 s4, 0x3ffc ; GFX6-NEXT: buffer_store_dword v2, v[0:1], s[0:3], s4 addr64 @@ -408,8 +408,8 @@ ; GFX7-NEXT: v_lshl_b64 v[0:1], v[0:1], 2 ; GFX7-NEXT: s_mov_b32 s0, s2 ; GFX7-NEXT: s_mov_b32 s1, s3 -; GFX7-NEXT: s_mov_b32 s2, 0 ; GFX7-NEXT: v_mov_b32_e32 v2, 0 +; GFX7-NEXT: s_mov_b32 s2, 0 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_movk_i32 s4, 0x3ffc ; GFX7-NEXT: buffer_store_dword v2, v[0:1], s[0:3], s4 addr64 @@ -426,8 +426,8 @@ ; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], 2 ; GFX6-NEXT: s_mov_b32 s0, s2 ; GFX6-NEXT: s_mov_b32 s1, s3 -; GFX6-NEXT: s_mov_b32 s2, 0 ; GFX6-NEXT: v_mov_b32_e32 v2, 0 +; GFX6-NEXT: s_mov_b32 s2, 0 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_movk_i32 s4, 0x3ffc ; GFX6-NEXT: buffer_store_dword v2, v[0:1], s[0:3], s4 addr64 @@ -439,8 +439,8 @@ ; GFX7-NEXT: v_lshl_b64 v[0:1], v[0:1], 2 ; GFX7-NEXT: s_mov_b32 s0, s2 ; GFX7-NEXT: s_mov_b32 s1, s3 -; GFX7-NEXT: s_mov_b32 s2, 0 ; GFX7-NEXT: v_mov_b32_e32 v2, 0 +; GFX7-NEXT: s_mov_b32 s2, 0 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_movk_i32 s4, 0x3ffc ; GFX7-NEXT: buffer_store_dword v2, v[0:1], s[0:3], s4 addr64 Index: llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll @@ -41,9 +41,9 @@ ; GFX7-LABEL: v_mul_i16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; 
GFX7-NEXT: s_mov_b32 s4, 0xffff -; GFX7-NEXT: v_and_b32_e32 v0, s4, v0 -; GFX7-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX7-NEXT: v_mov_b32_e32 v2, 0xffff +; GFX7-NEXT: v_and_b32_e32 v0, v0, v2 +; GFX7-NEXT: v_and_b32_e32 v1, v1, v2 ; GFX7-NEXT: v_mul_u32_u24_e32 v0, v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -178,9 +178,9 @@ ; GFX7-LABEL: v_mul_i16_signext: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s4, 0xffff -; GFX7-NEXT: v_and_b32_e32 v0, s4, v0 -; GFX7-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX7-NEXT: v_mov_b32_e32 v2, 0xffff +; GFX7-NEXT: v_and_b32_e32 v0, v0, v2 +; GFX7-NEXT: v_and_b32_e32 v1, v1, v2 ; GFX7-NEXT: v_mul_u32_u24_e32 v0, v0, v1 ; GFX7-NEXT: v_bfe_i32 v0, v0, 0, 16 ; GFX7-NEXT: s_setpc_b64 s[30:31] Index: llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.div.fmas.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.div.fmas.mir +++ llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.div.fmas.mir @@ -43,13 +43,12 @@ ; CHECK: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr1 ; CHECK: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr2 ; CHECK: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 - ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 - ; CHECK: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) - ; CHECK: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY3]](s32), [[COPY4]] - ; CHECK: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[COPY]](s32) - ; CHECK: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[COPY1]](s32) - ; CHECK: [[COPY7:%[0-9]+]]:vgpr(s32) = COPY [[COPY2]](s32) - ; CHECK: [[INT:%[0-9]+]]:vgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fmas), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32), [[ICMP]](s1) + ; CHECK: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; CHECK: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY3]](s32), [[C]] + ; CHECK: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY [[COPY]](s32) + ; CHECK: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[COPY1]](s32) + ; CHECK: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[COPY2]](s32) + ; CHECK: [[INT:%[0-9]+]]:vgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fmas), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[ICMP]](s1) %0:_(s32) = COPY $sgpr0 %1:_(s32) = COPY $sgpr1 %2:_(s32) = COPY $sgpr2 @@ -71,12 +70,11 @@ ; CHECK: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 ; CHECK: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr1 ; CHECK: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 - ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 - ; CHECK: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) - ; CHECK: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY3]](s32), [[COPY4]] - ; CHECK: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[COPY1]](s32) - ; CHECK: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[COPY2]](s32) - ; CHECK: [[INT:%[0-9]+]]:vgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fmas), [[COPY]](s32), [[COPY5]](s32), [[COPY6]](s32), [[ICMP]](s1) + ; CHECK: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; CHECK: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY3]](s32), [[C]] + ; CHECK: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY [[COPY1]](s32) + ; CHECK: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[COPY2]](s32) + ; CHECK: [[INT:%[0-9]+]]:vgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fmas), [[COPY]](s32), [[COPY4]](s32), [[COPY5]](s32), [[ICMP]](s1) %0:_(s32) = COPY $vgpr0 %1:_(s32) = COPY $sgpr0 %2:_(s32) = COPY $sgpr1 @@ -98,9 +96,8 @@ ; CHECK: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 ; CHECK: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr2 ; CHECK: 
[[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr3 - ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 - ; CHECK: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) - ; CHECK: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY3]](s32), [[COPY4]] + ; CHECK: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; CHECK: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY3]](s32), [[C]] ; CHECK: [[INT:%[0-9]+]]:vgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fmas), [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[ICMP]](s1) %0:_(s32) = COPY $vgpr0 %1:_(s32) = COPY $vgpr1 Index: llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.raw.buffer.load.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.raw.buffer.load.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.raw.buffer.load.ll @@ -14,9 +14,8 @@ ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; CHECK: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 ; CHECK: [[COPY5:%[0-9]+]]:sgpr(s32) = COPY $sgpr6 - ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 - ; CHECK: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) - ; CHECK: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[COPY6]](s32), [[COPY4]], [[COPY5]], 0, 0, 0 :: (dereferenceable load (s32), align 1, addrspace 4) + ; CHECK: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; CHECK: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C]](s32), [[COPY4]], [[COPY5]], 0, 0, 0 :: (dereferenceable load (s32), align 1, addrspace 4) ; CHECK: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32) ; CHECK: SI_RETURN_TO_EPILOG implicit $vgpr0 %val = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) @@ -35,10 +34,9 @@ ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; CHECK: [[COPY4:%[0-9]+]]:sgpr(s32) = COPY $sgpr6 ; CHECK: [[COPY5:%[0-9]+]]:sgpr(s32) = COPY $sgpr7 - ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 - ; CHECK: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) - ; CHECK: [[COPY7:%[0-9]+]]:vgpr(s32) = COPY [[COPY4]](s32) - ; CHECK: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[COPY6]](s32), [[COPY7]], [[COPY5]], 0, 0, 0 :: (dereferenceable load (s32), align 1, addrspace 4) + ; CHECK: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; CHECK: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[COPY4]](s32) + ; CHECK: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C]](s32), [[COPY6]], [[COPY5]], 0, 0, 0 :: (dereferenceable load (s32), align 1, addrspace 4) ; CHECK: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32) ; CHECK: SI_RETURN_TO_EPILOG implicit $vgpr0 %val = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) @@ -58,14 +56,13 @@ ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; CHECK: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr4 ; CHECK: [[COPY5:%[0-9]+]]:sgpr(s32) = COPY $sgpr2 - ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 - ; CHECK: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) + ; CHECK: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 ; CHECK: [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF ; CHECK: [[UV:%[0-9]+]]:vreg_64(s64), 
[[UV1:%[0-9]+]]:vreg_64(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>) ; CHECK: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec ; CHECK: bb.2: ; CHECK: successors: %bb.3(0x40000000), %bb.2(0x40000000) - ; CHECK: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %15, %bb.2 + ; CHECK: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %14, %bb.2 ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub0(s64), implicit $exec ; CHECK: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub1(s64), implicit $exec ; CHECK: [[MV:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32) @@ -77,7 +74,7 @@ ; CHECK: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; CHECK: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32) ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec - ; CHECK: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[COPY6]](s32), [[COPY4]], [[COPY5]], 0, 0, 0 :: (dereferenceable load (s32), align 1, addrspace 4) + ; CHECK: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C]](s32), [[COPY4]], [[COPY5]], 0, 0, 0 :: (dereferenceable load (s32), align 1, addrspace 4) ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK: SI_WATERFALL_LOOP %bb.2, implicit $exec ; CHECK: bb.3: @@ -103,17 +100,16 @@ ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; CHECK: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 - ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 - ; CHECK: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) + ; CHECK: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 ; CHECK: [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF ; CHECK: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec ; CHECK: bb.2: ; CHECK: successors: %bb.3(0x40000000), %bb.2(0x40000000) - ; CHECK: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %15, %bb.2 + ; CHECK: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %14, %bb.2 ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[COPY5]](s32), implicit $exec ; CHECK: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_]](s32), [[COPY5]](s32), implicit $exec ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[V_CMP_EQ_U32_e64_]], implicit-def $exec, implicit-def $scc, implicit $exec - ; CHECK: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[COPY6]](s32), [[COPY4]], [[V_READFIRSTLANE_B32_]], 0, 0, 0 :: (dereferenceable load (s32), align 1, addrspace 4) + ; CHECK: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C]](s32), [[COPY4]], [[V_READFIRSTLANE_B32_]], 0, 0, 0 :: (dereferenceable load (s32), align 1, addrspace 4) ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK: SI_WATERFALL_LOOP %bb.2, implicit $exec ; CHECK: bb.3: @@ -139,14 +135,13 @@ ; CHECK: 
[[BUILD_VECTOR:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; CHECK: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr4 ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr5 - ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 - ; CHECK: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) + ; CHECK: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 ; CHECK: [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF ; CHECK: [[UV:%[0-9]+]]:vreg_64(s64), [[UV1:%[0-9]+]]:vreg_64(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>) ; CHECK: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec ; CHECK: bb.2: ; CHECK: successors: %bb.3(0x40000000), %bb.2(0x40000000) - ; CHECK: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %15, %bb.2 + ; CHECK: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %14, %bb.2 ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub0(s64), implicit $exec ; CHECK: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub1(s64), implicit $exec ; CHECK: [[MV:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32) @@ -161,7 +156,7 @@ ; CHECK: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]](s32), [[COPY5]](s32), implicit $exec ; CHECK: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U32_e64_]], [[S_AND_B64_]], implicit-def $scc ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec - ; CHECK: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[COPY6]](s32), [[COPY4]], [[V_READFIRSTLANE_B32_4]], 0, 0, 0 :: (dereferenceable load (s32), align 1, addrspace 4) + ; CHECK: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C]](s32), [[COPY4]], [[V_READFIRSTLANE_B32_4]], 0, 0, 0 :: (dereferenceable load (s32), align 1, addrspace 4) ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK: SI_WATERFALL_LOOP %bb.2, implicit $exec ; CHECK: bb.3: Index: llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.s.buffer.load.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.s.buffer.load.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.s.buffer.load.ll @@ -1131,9 +1131,8 @@ ; CHECK: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5 ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; CHECK: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 - ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4092 - ; CHECK: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) - ; CHECK: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY5]] + ; CHECK: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 4092 + ; CHECK: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[C]] ; CHECK: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 ; CHECK: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 ; CHECK: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[COPY4]], [[C1]], 4092, 0, 0 :: (dereferenceable invariant load (s32)) @@ -1148,9 +1147,8 @@ ; GREEDY: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5 ; GREEDY: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), 
[[COPY3]](s32) ; GREEDY: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 - ; GREEDY: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4092 - ; GREEDY: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) - ; GREEDY: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY5]] + ; GREEDY: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 4092 + ; GREEDY: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[C]] ; GREEDY: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 ; GREEDY: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 ; GREEDY: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[COPY4]], [[C1]], 4092, 0, 0 :: (dereferenceable invariant load (s32)) @@ -1171,9 +1169,8 @@ ; CHECK: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5 ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; CHECK: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 - ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4095 - ; CHECK: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) - ; CHECK: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY5]] + ; CHECK: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 4095 + ; CHECK: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[C]] ; CHECK: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 ; CHECK: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 ; CHECK: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[COPY4]], [[C1]], 4095, 0, 0 :: (dereferenceable invariant load (s32)) @@ -1188,9 +1185,8 @@ ; GREEDY: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5 ; GREEDY: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; GREEDY: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 - ; GREEDY: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4095 - ; GREEDY: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) - ; GREEDY: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY5]] + ; GREEDY: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 4095 + ; GREEDY: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[C]] ; GREEDY: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 ; GREEDY: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 ; GREEDY: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[COPY4]], [[C1]], 4095, 0, 0 :: (dereferenceable invariant load (s32)) @@ -1211,11 +1207,11 @@ ; CHECK: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5 ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; CHECK: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 - ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4096 - ; CHECK: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) - ; CHECK: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY5]] - ; CHECK: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 - ; CHECK: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s32)) + ; CHECK: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 4096 + ; CHECK: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[C]] + ; CHECK: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 + ; CHECK: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; CHECK: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[ADD]], [[C1]], 0, 0, 0 :: (dereferenceable invariant load (s32)) ; CHECK: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32) ; CHECK: SI_RETURN_TO_EPILOG implicit $vgpr0 ; 
GREEDY-LABEL: name: s_buffer_load_f32_vgpr_offset_add_4096 @@ -1227,11 +1223,11 @@ ; GREEDY: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5 ; GREEDY: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; GREEDY: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 - ; GREEDY: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4096 - ; GREEDY: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) - ; GREEDY: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY5]] - ; GREEDY: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 - ; GREEDY: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s32)) + ; GREEDY: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 4096 + ; GREEDY: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[C]] + ; GREEDY: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 + ; GREEDY: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; GREEDY: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[ADD]], [[C1]], 0, 0, 0 :: (dereferenceable invariant load (s32)) ; GREEDY: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32) ; GREEDY: SI_RETURN_TO_EPILOG implicit $vgpr0 %soffset = add i32 %soffset.base, 4096 @@ -1250,9 +1246,8 @@ ; CHECK: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5 ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; CHECK: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 - ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4064 - ; CHECK: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) - ; CHECK: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY5]] + ; CHECK: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 4064 + ; CHECK: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[C]] ; CHECK: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 ; CHECK: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 ; CHECK: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[COPY4]], [[C1]], 4064, 0, 0 :: (dereferenceable invariant load (s128), align 4) @@ -1277,9 +1272,8 @@ ; GREEDY: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5 ; GREEDY: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; GREEDY: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 - ; GREEDY: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4064 - ; GREEDY: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) - ; GREEDY: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY5]] + ; GREEDY: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 4064 + ; GREEDY: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[C]] ; GREEDY: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 ; GREEDY: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 ; GREEDY: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[COPY4]], [[C1]], 4064, 0, 0 :: (dereferenceable invariant load (s128), align 4) @@ -1311,12 +1305,12 @@ ; CHECK: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5 ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; CHECK: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 - ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4068 - ; CHECK: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) - ; CHECK: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY5]] - ; CHECK: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 - ; CHECK: 
[[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s128), align 4) - ; CHECK: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 16, 0, 0 :: (dereferenceable invariant load (s128), align 4) + ; CHECK: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 4068 + ; CHECK: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[C]] + ; CHECK: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 + ; CHECK: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; CHECK: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[ADD]], [[C1]], 0, 0, 0 :: (dereferenceable invariant load (s128), align 4) + ; CHECK: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[ADD]], [[C1]], 16, 0, 0 :: (dereferenceable invariant load (s128), align 4) ; CHECK: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<8 x s32>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<4 x s32>), [[AMDGPU_BUFFER_LOAD1]](<4 x s32>) ; CHECK: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32), [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32), [[UV4:%[0-9]+]]:vgpr(s32), [[UV5:%[0-9]+]]:vgpr(s32), [[UV6:%[0-9]+]]:vgpr(s32), [[UV7:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<8 x s32>) ; CHECK: $vgpr0 = COPY [[UV]](s32) @@ -1337,12 +1331,12 @@ ; GREEDY: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5 ; GREEDY: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; GREEDY: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 - ; GREEDY: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4068 - ; GREEDY: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) - ; GREEDY: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY5]] - ; GREEDY: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 - ; GREEDY: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s128), align 4) - ; GREEDY: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 16, 0, 0 :: (dereferenceable invariant load (s128), align 4) + ; GREEDY: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 4068 + ; GREEDY: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[C]] + ; GREEDY: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 + ; GREEDY: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; GREEDY: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[ADD]], [[C1]], 0, 0, 0 :: (dereferenceable invariant load (s128), align 4) + ; GREEDY: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[ADD]], [[C1]], 16, 0, 0 :: (dereferenceable invariant load (s128), align 4) ; GREEDY: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<8 x s32>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<4 x s32>), [[AMDGPU_BUFFER_LOAD1]](<4 x s32>) ; GREEDY: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32), [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32), [[UV4:%[0-9]+]]:vgpr(s32), [[UV5:%[0-9]+]]:vgpr(s32), [[UV6:%[0-9]+]]:vgpr(s32), [[UV7:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<8 x s32>) ; GREEDY: $vgpr0 = COPY [[UV]](s32) @@ -1369,9 +1363,8 @@ ; CHECK: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5 ; CHECK: 
[[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; CHECK: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 - ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4032 - ; CHECK: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) - ; CHECK: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY5]] + ; CHECK: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 4032 + ; CHECK: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[C]] ; CHECK: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 ; CHECK: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 ; CHECK: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[COPY4]], [[C1]], 4032, 0, 0 :: (dereferenceable invariant load (s128), align 4) @@ -1406,9 +1399,8 @@ ; GREEDY: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5 ; GREEDY: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; GREEDY: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 - ; GREEDY: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4032 - ; GREEDY: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) - ; GREEDY: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY5]] + ; GREEDY: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 4032 + ; GREEDY: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[C]] ; GREEDY: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 ; GREEDY: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 ; GREEDY: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[COPY4]], [[C1]], 4032, 0, 0 :: (dereferenceable invariant load (s128), align 4) @@ -1449,14 +1441,14 @@ ; CHECK: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5 ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; CHECK: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 - ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4036 - ; CHECK: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) - ; CHECK: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY5]] - ; CHECK: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 - ; CHECK: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s128), align 4) - ; CHECK: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 16, 0, 0 :: (dereferenceable invariant load (s128), align 4) - ; CHECK: [[AMDGPU_BUFFER_LOAD2:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 32, 0, 0 :: (dereferenceable invariant load (s128) from unknown-address + 16, align 4) - ; CHECK: [[AMDGPU_BUFFER_LOAD3:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 48, 0, 0 :: (dereferenceable invariant load (s128) from unknown-address + 48, align 4) + ; CHECK: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 4036 + ; CHECK: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[C]] + ; CHECK: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 + ; CHECK: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; CHECK: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[ADD]], [[C1]], 0, 0, 0 :: (dereferenceable invariant load (s128), align 4) + ; CHECK: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), 
[[C2]](s32), [[ADD]], [[C1]], 16, 0, 0 :: (dereferenceable invariant load (s128), align 4) + ; CHECK: [[AMDGPU_BUFFER_LOAD2:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[ADD]], [[C1]], 32, 0, 0 :: (dereferenceable invariant load (s128) from unknown-address + 16, align 4) + ; CHECK: [[AMDGPU_BUFFER_LOAD3:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[ADD]], [[C1]], 48, 0, 0 :: (dereferenceable invariant load (s128) from unknown-address + 48, align 4) ; CHECK: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<16 x s32>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<4 x s32>), [[AMDGPU_BUFFER_LOAD1]](<4 x s32>), [[AMDGPU_BUFFER_LOAD2]](<4 x s32>), [[AMDGPU_BUFFER_LOAD3]](<4 x s32>) ; CHECK: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32), [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32), [[UV4:%[0-9]+]]:vgpr(s32), [[UV5:%[0-9]+]]:vgpr(s32), [[UV6:%[0-9]+]]:vgpr(s32), [[UV7:%[0-9]+]]:vgpr(s32), [[UV8:%[0-9]+]]:vgpr(s32), [[UV9:%[0-9]+]]:vgpr(s32), [[UV10:%[0-9]+]]:vgpr(s32), [[UV11:%[0-9]+]]:vgpr(s32), [[UV12:%[0-9]+]]:vgpr(s32), [[UV13:%[0-9]+]]:vgpr(s32), [[UV14:%[0-9]+]]:vgpr(s32), [[UV15:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<16 x s32>) ; CHECK: $vgpr0 = COPY [[UV]](s32) @@ -1485,14 +1477,14 @@ ; GREEDY: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5 ; GREEDY: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; GREEDY: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 - ; GREEDY: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4036 - ; GREEDY: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) - ; GREEDY: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY5]] - ; GREEDY: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 - ; GREEDY: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s128), align 4) - ; GREEDY: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 16, 0, 0 :: (dereferenceable invariant load (s128), align 4) - ; GREEDY: [[AMDGPU_BUFFER_LOAD2:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 32, 0, 0 :: (dereferenceable invariant load (s128) from unknown-address + 16, align 4) - ; GREEDY: [[AMDGPU_BUFFER_LOAD3:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 48, 0, 0 :: (dereferenceable invariant load (s128) from unknown-address + 48, align 4) + ; GREEDY: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 4036 + ; GREEDY: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[C]] + ; GREEDY: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 + ; GREEDY: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; GREEDY: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[ADD]], [[C1]], 0, 0, 0 :: (dereferenceable invariant load (s128), align 4) + ; GREEDY: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[ADD]], [[C1]], 16, 0, 0 :: (dereferenceable invariant load (s128), align 4) + ; GREEDY: [[AMDGPU_BUFFER_LOAD2:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[ADD]], [[C1]], 32, 0, 0 :: (dereferenceable invariant load (s128) from unknown-address + 16, align 4) + ; GREEDY: 
[[AMDGPU_BUFFER_LOAD3:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[ADD]], [[C1]], 48, 0, 0 :: (dereferenceable invariant load (s128) from unknown-address + 48, align 4) ; GREEDY: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<16 x s32>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<4 x s32>), [[AMDGPU_BUFFER_LOAD1]](<4 x s32>), [[AMDGPU_BUFFER_LOAD2]](<4 x s32>), [[AMDGPU_BUFFER_LOAD3]](<4 x s32>) ; GREEDY: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32), [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32), [[UV4:%[0-9]+]]:vgpr(s32), [[UV5:%[0-9]+]]:vgpr(s32), [[UV6:%[0-9]+]]:vgpr(s32), [[UV7:%[0-9]+]]:vgpr(s32), [[UV8:%[0-9]+]]:vgpr(s32), [[UV9:%[0-9]+]]:vgpr(s32), [[UV10:%[0-9]+]]:vgpr(s32), [[UV11:%[0-9]+]]:vgpr(s32), [[UV12:%[0-9]+]]:vgpr(s32), [[UV13:%[0-9]+]]:vgpr(s32), [[UV14:%[0-9]+]]:vgpr(s32), [[UV15:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<16 x s32>) ; GREEDY: $vgpr0 = COPY [[UV]](s32) @@ -2249,10 +2241,10 @@ ; CHECK: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr3 ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; CHECK: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr4 - ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 5000 - ; CHECK: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) - ; CHECK: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY5]] - ; CHECK: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; CHECK: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 5000 + ; CHECK: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[C]] + ; CHECK: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 + ; CHECK: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 ; CHECK: [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF ; CHECK: [[UV:%[0-9]+]]:vreg_64(s64), [[UV1:%[0-9]+]]:vreg_64(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>) ; CHECK: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec @@ -2270,8 +2262,8 @@ ; CHECK: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; CHECK: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32) ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec - ; CHECK: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s128), align 4) - ; CHECK: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 16, 0, 0 :: (dereferenceable invariant load (s128), align 4) + ; CHECK: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C2]](s32), [[ADD]], [[C1]], 0, 0, 0 :: (dereferenceable invariant load (s128), align 4) + ; CHECK: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C2]](s32), [[ADD]], [[C1]], 16, 0, 0 :: (dereferenceable invariant load (s128), align 4) ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK: SI_WATERFALL_LOOP %bb.2, implicit $exec ; CHECK: bb.3: @@ -2297,10 +2289,10 @@ ; GREEDY: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr3 ; GREEDY: [[BUILD_VECTOR:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), 
[[COPY2]](s32), [[COPY3]](s32)
 ; GREEDY: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr4
- ; GREEDY: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 5000
- ; GREEDY: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
- ; GREEDY: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY5]]
- ; GREEDY: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+ ; GREEDY: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 5000
+ ; GREEDY: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[C]]
+ ; GREEDY: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
+ ; GREEDY: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
 ; GREEDY: [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
 ; GREEDY: [[UV:%[0-9]+]]:vreg_64(s64), [[UV1:%[0-9]+]]:vreg_64(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>)
 ; GREEDY: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec
@@ -2318,8 +2310,8 @@
 ; GREEDY: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc
 ; GREEDY: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32)
 ; GREEDY: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec
- ; GREEDY: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s128), align 4)
- ; GREEDY: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 16, 0, 0 :: (dereferenceable invariant load (s128), align 4)
+ ; GREEDY: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C2]](s32), [[ADD]], [[C1]], 0, 0, 0 :: (dereferenceable invariant load (s128), align 4)
+ ; GREEDY: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C2]](s32), [[ADD]], [[C1]], 16, 0, 0 :: (dereferenceable invariant load (s128), align 4)
 ; GREEDY: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
 ; GREEDY: SI_WATERFALL_LOOP %bb.2, implicit $exec
 ; GREEDY: bb.3:
@@ -2351,10 +2343,10 @@
 ; CHECK: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr3
 ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
 ; CHECK: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr4
- ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4076
- ; CHECK: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
- ; CHECK: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY5]]
- ; CHECK: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+ ; CHECK: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 4076
+ ; CHECK: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[C]]
+ ; CHECK: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
+ ; CHECK: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
 ; CHECK: [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
 ; CHECK: [[UV:%[0-9]+]]:vreg_64(s64), [[UV1:%[0-9]+]]:vreg_64(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>)
 ; CHECK: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec
@@ -2372,8 +2364,8 @@
 ; CHECK: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc
 ; CHECK: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32)
 ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec
- ; CHECK: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s128), align 4)
- ; CHECK: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 16, 0, 0 :: (dereferenceable invariant load (s128), align 4)
+ ; CHECK: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C2]](s32), [[ADD]], [[C1]], 0, 0, 0 :: (dereferenceable invariant load (s128), align 4)
+ ; CHECK: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C2]](s32), [[ADD]], [[C1]], 16, 0, 0 :: (dereferenceable invariant load (s128), align 4)
 ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
 ; CHECK: SI_WATERFALL_LOOP %bb.2, implicit $exec
 ; CHECK: bb.3:
@@ -2399,10 +2391,10 @@
 ; GREEDY: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr3
 ; GREEDY: [[BUILD_VECTOR:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
 ; GREEDY: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr4
- ; GREEDY: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4076
- ; GREEDY: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
- ; GREEDY: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY5]]
- ; GREEDY: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+ ; GREEDY: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 4076
+ ; GREEDY: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[C]]
+ ; GREEDY: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
+ ; GREEDY: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
 ; GREEDY: [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
 ; GREEDY: [[UV:%[0-9]+]]:vreg_64(s64), [[UV1:%[0-9]+]]:vreg_64(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>)
 ; GREEDY: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec
@@ -2420,8 +2412,8 @@
 ; GREEDY: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc
 ; GREEDY: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32)
 ; GREEDY: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec
- ; GREEDY: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s128), align 4)
- ; GREEDY: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 16, 0, 0 :: (dereferenceable invariant load (s128), align 4)
+ ; GREEDY: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C2]](s32), [[ADD]], [[C1]], 0, 0, 0 :: (dereferenceable invariant load (s128), align 4)
+ ; GREEDY: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C2]](s32), [[ADD]], [[C1]], 16, 0, 0 :: (dereferenceable invariant load (s128), align 4)
 ; GREEDY: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
 ; GREEDY: SI_WATERFALL_LOOP %bb.2, implicit $exec
 ; GREEDY: bb.3:
@@ -2453,10 +2445,10 @@
 ; CHECK: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr3
 ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
 ; CHECK: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr4
- ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4080
- ; CHECK: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
- ; CHECK: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY5]]
- ; CHECK: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+ ; CHECK: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 4080
+ ; CHECK: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[C]]
+ ; CHECK: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
+ ; CHECK: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
 ; CHECK: [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
 ; CHECK: [[UV:%[0-9]+]]:vreg_64(s64), [[UV1:%[0-9]+]]:vreg_64(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>)
 ; CHECK: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec
@@ -2474,8 +2466,8 @@
 ; CHECK: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc
 ; CHECK: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32)
 ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec
- ; CHECK: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s128), align 4)
- ; CHECK: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 16, 0, 0 :: (dereferenceable invariant load (s128), align 4)
+ ; CHECK: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C2]](s32), [[ADD]], [[C1]], 0, 0, 0 :: (dereferenceable invariant load (s128), align 4)
+ ; CHECK: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C2]](s32), [[ADD]], [[C1]], 16, 0, 0 :: (dereferenceable invariant load (s128), align 4)
 ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
 ; CHECK: SI_WATERFALL_LOOP %bb.2, implicit $exec
 ; CHECK: bb.3:
@@ -2501,10 +2493,10 @@
 ; GREEDY: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr3
 ; GREEDY: [[BUILD_VECTOR:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
 ; GREEDY: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr4
- ; GREEDY: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4080
- ; GREEDY: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
- ; GREEDY: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY5]]
- ; GREEDY: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+ ; GREEDY: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 4080
+ ; GREEDY: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[C]]
+ ; GREEDY: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
+ ; GREEDY: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
 ; GREEDY: [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
 ; GREEDY: [[UV:%[0-9]+]]:vreg_64(s64), [[UV1:%[0-9]+]]:vreg_64(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>)
 ; GREEDY: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec
@@ -2522,8 +2514,8 @@
 ; GREEDY: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc
 ; GREEDY: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32)
 ; GREEDY: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec
- ; GREEDY: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s128), align 4)
- ; GREEDY: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 16, 0, 0 :: (dereferenceable invariant load (s128), align 4)
+ ; GREEDY: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C2]](s32), [[ADD]], [[C1]], 0, 0, 0 :: (dereferenceable invariant load (s128), align 4)
+ ; GREEDY: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C2]](s32), [[ADD]], [[C1]], 16, 0, 0 :: (dereferenceable invariant load (s128), align 4)
 ; GREEDY: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
 ; GREEDY: SI_WATERFALL_LOOP %bb.2, implicit $exec
 ; GREEDY: bb.3:
@@ -2733,9 +2725,8 @@
 ; CHECK: [[COPY5:%[0-9]+]]:sgpr(s32) = COPY $sgpr6
 ; CHECK: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[COPY5]](s32)
 ; CHECK: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY6]]
- ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1024
- ; CHECK: [[COPY7:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
- ; CHECK: [[ADD1:%[0-9]+]]:vgpr(s32) = G_ADD [[ADD]], [[COPY7]]
+ ; CHECK: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 1024
+ ; CHECK: [[ADD1:%[0-9]+]]:vgpr(s32) = G_ADD [[ADD]], [[C]]
 ; CHECK: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
 ; CHECK: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
 ; CHECK: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[ADD]], [[C1]], 1024, 0, 0 :: (dereferenceable invariant load (s32))
@@ -2753,9 +2744,8 @@
 ; GREEDY: [[COPY5:%[0-9]+]]:sgpr(s32) = COPY $sgpr6
 ; GREEDY: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[COPY5]](s32)
 ; GREEDY: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY6]]
- ; GREEDY: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1024
- ; GREEDY: [[COPY7:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
- ; GREEDY: [[ADD1:%[0-9]+]]:vgpr(s32) = G_ADD [[ADD]], [[COPY7]]
+ ; GREEDY: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 1024
+ ; GREEDY: [[ADD1:%[0-9]+]]:vgpr(s32) = G_ADD [[ADD]], [[C]]
 ; GREEDY: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
 ; GREEDY: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
 ; GREEDY: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[ADD]], [[C1]], 1024, 0, 0 :: (dereferenceable invariant load (s32))
@@ -2780,9 +2770,8 @@
 ; CHECK: [[COPY5:%[0-9]+]]:sgpr(s32) = COPY $sgpr6
 ; CHECK: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[COPY5]](s32)
 ; CHECK: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY6]], [[COPY4]]
- ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1024
- ; CHECK: [[COPY7:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
- ; CHECK: [[ADD1:%[0-9]+]]:vgpr(s32) = G_ADD [[ADD]], [[COPY7]]
+ ; CHECK: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 1024
+ ; CHECK: [[ADD1:%[0-9]+]]:vgpr(s32) = G_ADD [[ADD]], [[C]]
 ; CHECK: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
 ; CHECK: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
 ; CHECK: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[ADD]], [[C1]], 1024, 0, 0 :: (dereferenceable invariant load (s32))
@@ -2800,9 +2789,8 @@
 ; GREEDY: [[COPY5:%[0-9]+]]:sgpr(s32) = COPY $sgpr6
 ; GREEDY: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[COPY5]](s32)
 ; GREEDY: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY6]], [[COPY4]]
- ; GREEDY: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1024
- ; GREEDY: [[COPY7:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
- ; GREEDY: [[ADD1:%[0-9]+]]:vgpr(s32) = G_ADD [[ADD]], [[COPY7]]
+ ; GREEDY: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 1024
+ ; GREEDY: [[ADD1:%[0-9]+]]:vgpr(s32) = G_ADD [[ADD]], [[C]]
 ; GREEDY: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
 ; GREEDY: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
 ; GREEDY: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[ADD]], [[C1]], 1024, 0, 0 :: (dereferenceable invariant load (s32))
@@ -2869,11 +2857,10 @@
 ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
 ; CHECK: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
 ; CHECK: [[COPY5:%[0-9]+]]:sgpr(s32) = COPY $sgpr6
- ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1024
- ; CHECK: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
- ; CHECK: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY6]]
- ; CHECK: [[COPY7:%[0-9]+]]:vgpr(s32) = COPY [[COPY5]](s32)
- ; CHECK: [[ADD1:%[0-9]+]]:vgpr(s32) = G_ADD [[ADD]], [[COPY7]]
+ ; CHECK: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 1024
+ ; CHECK: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[C]]
+ ; CHECK: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[COPY5]](s32)
+ ; CHECK: [[ADD1:%[0-9]+]]:vgpr(s32) = G_ADD [[ADD]], [[COPY6]]
 ; CHECK: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
 ; CHECK: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[ADD]], [[COPY5]], 0, 0, 0 :: (dereferenceable invariant load (s32))
 ; CHECK: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32)
@@ -2888,11 +2875,10 @@
 ; GREEDY: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
 ; GREEDY: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
 ; GREEDY: [[COPY5:%[0-9]+]]:sgpr(s32) = COPY $sgpr6
- ; GREEDY: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1024
- ; GREEDY: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
- ; GREEDY: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY6]]
- ; GREEDY: [[COPY7:%[0-9]+]]:vgpr(s32) = COPY [[COPY5]](s32)
- ; GREEDY: [[ADD1:%[0-9]+]]:vgpr(s32) = G_ADD [[ADD]], [[COPY7]]
+ ; GREEDY: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 1024
+ ; GREEDY: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[C]]
+ ; GREEDY: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[COPY5]](s32)
+ ; GREEDY: [[ADD1:%[0-9]+]]:vgpr(s32) = G_ADD [[ADD]], [[COPY6]]
 ; GREEDY: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
 ; GREEDY: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[ADD]], [[COPY5]], 0, 0, 0 :: (dereferenceable invariant load (s32))
 ; GREEDY: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32)
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.s.buffer.load.mir
===================================================================
--- llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.s.buffer.load.mir
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.s.buffer.load.mir
@@ -17,27 +17,29 @@
 ; FAST-LABEL: name: s_buffer_load_f32_vgpr_offset_cross_bank_copy_add_offset
 ; FAST: liveins: $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr0
- ; FAST: [[COPY:%[0-9]+]]:sgpr(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
- ; FAST: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr0
- ; FAST: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 256
- ; FAST: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
- ; FAST: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[COPY1]](s32)
- ; FAST: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY3]], [[COPY2]]
- ; FAST: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
- ; FAST: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
- ; FAST: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[COPY]](<4 x s32>), [[C2]](s32), [[COPY3]], [[C1]], 256, 0, 0 :: (dereferenceable invariant load (s32))
- ; FAST: S_ENDPGM 0, implicit [[AMDGPU_BUFFER_LOAD]](s32)
+ ; FAST-NEXT: {{ $}}
+ ; FAST-NEXT: [[COPY:%[0-9]+]]:sgpr(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+ ; FAST-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr0
+ ; FAST-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 256
+ ; FAST-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
+ ; FAST-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[COPY1]](s32)
+ ; FAST-NEXT: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY3]], [[COPY2]]
+ ; FAST-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
+ ; FAST-NEXT: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+ ; FAST-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[COPY]](<4 x s32>), [[C2]](s32), [[COPY3]], [[C1]], 256, 0, 0 :: (dereferenceable invariant load (s32))
+ ; FAST-NEXT: S_ENDPGM 0, implicit [[AMDGPU_BUFFER_LOAD]](s32)
 ; GREEDY-LABEL: name: s_buffer_load_f32_vgpr_offset_cross_bank_copy_add_offset
 ; GREEDY: liveins: $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr0
- ; GREEDY: [[COPY:%[0-9]+]]:sgpr(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
- ; GREEDY: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr0
- ; GREEDY: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 256
- ; GREEDY: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[COPY1]](s32)
- ; GREEDY: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY2]], [[C]]
- ; GREEDY: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
- ; GREEDY: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
- ; GREEDY: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[COPY]](<4 x s32>), [[C2]](s32), [[COPY2]], [[C1]], 256, 0, 0 :: (dereferenceable invariant load (s32))
- ; GREEDY: S_ENDPGM 0, implicit [[AMDGPU_BUFFER_LOAD]](s32)
+ ; GREEDY-NEXT: {{ $}}
+ ; GREEDY-NEXT: [[COPY:%[0-9]+]]:sgpr(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+ ; GREEDY-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr0
+ ; GREEDY-NEXT: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 256
+ ; GREEDY-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[COPY1]](s32)
+ ; GREEDY-NEXT: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY2]], [[C]]
+ ; GREEDY-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
+ ; GREEDY-NEXT: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+ ; GREEDY-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[COPY]](<4 x s32>), [[C2]](s32), [[COPY2]], [[C1]], 256, 0, 0 :: (dereferenceable invariant load (s32))
+ ; GREEDY-NEXT: S_ENDPGM 0, implicit [[AMDGPU_BUFFER_LOAD]](s32)
 %0:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
 %1:_(s32) = COPY $sgpr0
 %2:vgpr(s32) = G_CONSTANT i32 256
@@ -57,26 +59,26 @@
 ; FAST-LABEL: name: s_buffer_load_negative_offset
 ; FAST: liveins: $sgpr0_sgpr1_sgpr2_sgpr3, $vgpr0
- ; FAST: [[COPY:%[0-9]+]]:sgpr(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
- ; FAST: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
- ; FAST: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 -60
- ; FAST: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
- ; FAST: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY1]], [[COPY2]]
- ; FAST: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
- ; FAST: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
- ; FAST: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[COPY]](<4 x s32>), [[C2]](s32), [[ADD]], [[C1]], 0, 0, 0 :: (dereferenceable invariant load (s32))
- ; FAST: S_ENDPGM 0, implicit [[AMDGPU_BUFFER_LOAD]](s32)
+ ; FAST-NEXT: {{ $}}
+ ; FAST-NEXT: [[COPY:%[0-9]+]]:sgpr(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+ ; FAST-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
+ ; FAST-NEXT: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 -60
+ ; FAST-NEXT: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY1]], [[C]]
+ ; FAST-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
+ ; FAST-NEXT: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+ ; FAST-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[COPY]](<4 x s32>), [[C2]](s32), [[ADD]], [[C1]], 0, 0, 0 :: (dereferenceable invariant load (s32))
+ ; FAST-NEXT: S_ENDPGM 0, implicit [[AMDGPU_BUFFER_LOAD]](s32)
 ; GREEDY-LABEL: name: s_buffer_load_negative_offset
 ; GREEDY: liveins: $sgpr0_sgpr1_sgpr2_sgpr3, $vgpr0
- ; GREEDY: [[COPY:%[0-9]+]]:sgpr(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
- ; GREEDY: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
- ; GREEDY: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 -60
- ; GREEDY: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
- ; GREEDY: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY1]], [[COPY2]]
- ; GREEDY: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
- ; GREEDY: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
- ; GREEDY: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[COPY]](<4 x s32>), [[C2]](s32), [[ADD]], [[C1]], 0, 0, 0 :: (dereferenceable invariant load (s32))
- ; GREEDY: S_ENDPGM 0, implicit [[AMDGPU_BUFFER_LOAD]](s32)
+ ; GREEDY-NEXT: {{ $}}
+ ; GREEDY-NEXT: [[COPY:%[0-9]+]]:sgpr(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+ ; GREEDY-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
+ ; GREEDY-NEXT: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 -60
+ ; GREEDY-NEXT: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY1]], [[C]]
+ ; GREEDY-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
+ ; GREEDY-NEXT: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+ ; GREEDY-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[COPY]](<4 x s32>), [[C2]](s32), [[ADD]], [[C1]], 0, 0, 0 :: (dereferenceable invariant load (s32))
+ ; GREEDY-NEXT: S_ENDPGM 0, implicit [[AMDGPU_BUFFER_LOAD]](s32)
 %0:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
 %1:_(s32) = COPY $vgpr0
 %2:_(s32) = G_CONSTANT i32 -60
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-and-s1.mir
===================================================================
--- llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-and-s1.mir
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-and-s1.mir
@@ -85,11 +85,9 @@
 ; CHECK-LABEL: name: and_s1_vcc_vcc
 ; CHECK: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
 ; CHECK: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1
- ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
- ; CHECK: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
- ; CHECK: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY]](s32), [[COPY2]]
- ; CHECK: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
- ; CHECK: [[ICMP1:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY1]](s32), [[COPY3]]
+ ; CHECK: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+ ; CHECK: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY]](s32), [[C]]
+ ; CHECK: [[ICMP1:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY1]](s32), [[C]]
 ; CHECK: [[AND:%[0-9]+]]:vcc(s1) = G_AND [[ICMP]], [[ICMP1]]
 %0:_(s32) = COPY $vgpr0
 %1:_(s32) = COPY $vgpr1
@@ -181,11 +179,9 @@
 ; CHECK-LABEL: name: and_s1_vcc_scc
 ; CHECK: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
 ; CHECK: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1
- ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
- ; CHECK: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
- ; CHECK: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY]](s32), [[COPY2]]
- ; CHECK: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
- ; CHECK: [[ICMP1:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY1]](s32), [[COPY3]]
+ ; CHECK: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+ ; CHECK: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY]](s32), [[C]]
+ ; CHECK: [[ICMP1:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY1]](s32), [[C]]
 ; CHECK: [[AND:%[0-9]+]]:vcc(s1) = G_AND [[ICMP]], [[ICMP1]]
 %0:_(s32) = COPY $vgpr0
 %1:_(s32) = COPY $vgpr1
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-constant.mir
===================================================================
--- llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-constant.mir
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-constant.mir
@@ -10,9 +10,8 @@
 liveins: $vgpr0_vgpr1
 ; CHECK-LABEL: name: test_constant_s32_vgpr_use
 ; CHECK: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
- ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1
- ; CHECK: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
- ; CHECK: G_STORE [[COPY1]](s32), [[COPY]](p1) :: (store (s32))
+ ; CHECK: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 1
+ ; CHECK: G_STORE [[C]](s32), [[COPY]](p1) :: (store (s32))
 %0:_(p1) = COPY $vgpr0_vgpr1
 %1:_(s32) = G_CONSTANT i32 1
 G_STORE %1, %0 :: (store (s32))
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-extract-vector-elt.mir
===================================================================
--- llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-extract-vector-elt.mir
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-extract-vector-elt.mir
@@ -554,9 +554,8 @@
 ; WAVE64: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16
 ; WAVE64: [[COPY:%[0-9]+]]:vgpr(<16 x s32>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
 ; WAVE64: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr16
- ; WAVE64: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1
- ; WAVE64: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
- ; WAVE64: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY1]], [[COPY2]]
+ ; WAVE64: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 1
+ ; WAVE64: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY1]], [[C]]
 ; WAVE64: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32), [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32), [[UV4:%[0-9]+]]:vgpr(s32), [[UV5:%[0-9]+]]:vgpr(s32), [[UV6:%[0-9]+]]:vgpr(s32), [[UV7:%[0-9]+]]:vgpr(s32), [[UV8:%[0-9]+]]:vgpr(s32), [[UV9:%[0-9]+]]:vgpr(s32), [[UV10:%[0-9]+]]:vgpr(s32), [[UV11:%[0-9]+]]:vgpr(s32), [[UV12:%[0-9]+]]:vgpr(s32), [[UV13:%[0-9]+]]:vgpr(s32), [[UV14:%[0-9]+]]:vgpr(s32), [[UV15:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[COPY]](<16 x s32>)
 ; WAVE64: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1
 ; WAVE64: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[C1]]
@@ -603,15 +602,14 @@
 ; WAVE64: [[C15:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 15
 ; WAVE64: [[ICMP14:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[C15]]
 ; WAVE64: [[SELECT14:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP14]](s1), [[UV15]], [[SELECT13]]
- ; WAVE64: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[SELECT14]](s32)
- ; WAVE64: $vgpr0 = COPY [[COPY3]](s32)
+ ; WAVE64: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[SELECT14]](s32)
+ ; WAVE64: $vgpr0 = COPY [[COPY2]](s32)
 ; WAVE32-LABEL: name: extract_vector_elt_v16s32_vv_idx_add1
 ; WAVE32: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16
 ; WAVE32: [[COPY:%[0-9]+]]:vgpr(<16 x s32>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
 ; WAVE32: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr16
- ; WAVE32: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1
- ; WAVE32: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
- ; WAVE32: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY1]], [[COPY2]]
+ ; WAVE32: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 1
+ ; WAVE32: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY1]], [[C]]
 ; WAVE32: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32), [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32), [[UV4:%[0-9]+]]:vgpr(s32), [[UV5:%[0-9]+]]:vgpr(s32), [[UV6:%[0-9]+]]:vgpr(s32), [[UV7:%[0-9]+]]:vgpr(s32), [[UV8:%[0-9]+]]:vgpr(s32), [[UV9:%[0-9]+]]:vgpr(s32), [[UV10:%[0-9]+]]:vgpr(s32), [[UV11:%[0-9]+]]:vgpr(s32), [[UV12:%[0-9]+]]:vgpr(s32), [[UV13:%[0-9]+]]:vgpr(s32), [[UV14:%[0-9]+]]:vgpr(s32), [[UV15:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[COPY]](<16 x s32>)
 ; WAVE32: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1
 ; WAVE32: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[C1]]
@@ -658,8 +656,8 @@
 ; WAVE32: [[C15:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 15
 ; WAVE32: [[ICMP14:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[C15]]
 ; WAVE32: [[SELECT14:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP14]](s1), [[UV15]], [[SELECT13]]
- ; WAVE32: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[SELECT14]](s32)
- ; WAVE32: $vgpr0 = COPY [[COPY3]](s32)
+ ; WAVE32: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[SELECT14]](s32)
+ ; WAVE32: $vgpr0 = COPY [[COPY2]](s32)
 %0:_(<16 x s32>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
 %1:_(s32) = COPY $vgpr16
 %2:_(s32) = G_CONSTANT i32 1
@@ -681,9 +679,8 @@
 ; WAVE64: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16
 ; WAVE64: [[COPY:%[0-9]+]]:vgpr(<16 x s32>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
 ; WAVE64: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr16
- ; WAVE64: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 -1
- ; WAVE64: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
- ; WAVE64: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY1]], [[COPY2]]
+ ; WAVE64: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 -1
+ ; WAVE64: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY1]], [[C]]
 ; WAVE64: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32), [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32), [[UV4:%[0-9]+]]:vgpr(s32), [[UV5:%[0-9]+]]:vgpr(s32), [[UV6:%[0-9]+]]:vgpr(s32), [[UV7:%[0-9]+]]:vgpr(s32), [[UV8:%[0-9]+]]:vgpr(s32), [[UV9:%[0-9]+]]:vgpr(s32), [[UV10:%[0-9]+]]:vgpr(s32), [[UV11:%[0-9]+]]:vgpr(s32), [[UV12:%[0-9]+]]:vgpr(s32), [[UV13:%[0-9]+]]:vgpr(s32), [[UV14:%[0-9]+]]:vgpr(s32), [[UV15:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[COPY]](<16 x s32>)
 ; WAVE64: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1
 ; WAVE64: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[C1]]
@@ -730,15 +727,14 @@
 ; WAVE64: [[C15:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 15
 ; WAVE64: [[ICMP14:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[C15]]
 ; WAVE64: [[SELECT14:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP14]](s1), [[UV15]], [[SELECT13]]
- ; WAVE64: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[SELECT14]](s32)
- ; WAVE64: $vgpr0 = COPY [[COPY3]](s32)
+ ; WAVE64: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[SELECT14]](s32)
+ ; WAVE64: $vgpr0 = COPY [[COPY2]](s32)
 ; WAVE32-LABEL: name: extract_vector_elt_v16s32_vv_idx_addm1
 ; WAVE32: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16
 ; WAVE32: [[COPY:%[0-9]+]]:vgpr(<16 x s32>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
 ; WAVE32: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr16
- ; WAVE32: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 -1
- ; WAVE32: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
- ; WAVE32: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY1]], [[COPY2]]
+ ; WAVE32: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 -1
+ ; WAVE32: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY1]], [[C]]
 ; WAVE32: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32), [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32), [[UV4:%[0-9]+]]:vgpr(s32), [[UV5:%[0-9]+]]:vgpr(s32), [[UV6:%[0-9]+]]:vgpr(s32), [[UV7:%[0-9]+]]:vgpr(s32), [[UV8:%[0-9]+]]:vgpr(s32), [[UV9:%[0-9]+]]:vgpr(s32), [[UV10:%[0-9]+]]:vgpr(s32), [[UV11:%[0-9]+]]:vgpr(s32), [[UV12:%[0-9]+]]:vgpr(s32), [[UV13:%[0-9]+]]:vgpr(s32), [[UV14:%[0-9]+]]:vgpr(s32), [[UV15:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[COPY]](<16 x s32>)
 ; WAVE32: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1
 ; WAVE32: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[C1]]
@@ -785,8 +781,8 @@
 ; WAVE32: [[C15:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 15
 ; WAVE32: [[ICMP14:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[C15]]
 ; WAVE32: [[SELECT14:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP14]](s1), [[UV15]], [[SELECT13]]
- ; WAVE32: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[SELECT14]](s32)
- ; WAVE32: $vgpr0 = COPY [[COPY3]](s32)
+ ; WAVE32: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[SELECT14]](s32)
+ ; WAVE32: $vgpr0 = COPY [[COPY2]](s32)
 %0:_(<16 x s32>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
 %1:_(s32) = COPY $vgpr16
 %2:_(s32) = G_CONSTANT i32 -1
@@ -808,9 +804,8 @@
 ; WAVE64: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16
 ; WAVE64: [[COPY:%[0-9]+]]:vgpr(<16 x s32>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
 ; WAVE64: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr16
- ; WAVE64: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 16
- ; WAVE64: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
- ; WAVE64: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY1]], [[COPY2]]
+ ; WAVE64: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 16
+ ; WAVE64: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY1]], [[C]]
 ; WAVE64: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32), [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32), [[UV4:%[0-9]+]]:vgpr(s32), [[UV5:%[0-9]+]]:vgpr(s32), [[UV6:%[0-9]+]]:vgpr(s32), [[UV7:%[0-9]+]]:vgpr(s32), [[UV8:%[0-9]+]]:vgpr(s32), [[UV9:%[0-9]+]]:vgpr(s32), [[UV10:%[0-9]+]]:vgpr(s32), [[UV11:%[0-9]+]]:vgpr(s32), [[UV12:%[0-9]+]]:vgpr(s32), [[UV13:%[0-9]+]]:vgpr(s32), [[UV14:%[0-9]+]]:vgpr(s32), [[UV15:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[COPY]](<16 x s32>)
 ; WAVE64: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1
 ; WAVE64: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[C1]]
@@ -857,15 +852,14 @@
 ; WAVE64: [[C15:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 15
 ; WAVE64: [[ICMP14:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[C15]]
 ; WAVE64: [[SELECT14:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP14]](s1), [[UV15]], [[SELECT13]]
- ; WAVE64: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[SELECT14]](s32)
- ; WAVE64: $vgpr0 = COPY [[COPY3]](s32)
+ ; WAVE64: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[SELECT14]](s32)
+ ; WAVE64: $vgpr0 = COPY [[COPY2]](s32)
 ; WAVE32-LABEL: name: extract_vector_elt_v16s32_vv_idx_add16
 ; WAVE32: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16
 ; WAVE32: [[COPY:%[0-9]+]]:vgpr(<16 x s32>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
 ; WAVE32: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr16
- ; WAVE32: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 16
- ; WAVE32: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
- ; WAVE32: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY1]], [[COPY2]]
+ ; WAVE32: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 16
+ ; WAVE32: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY1]], [[C]]
 ; WAVE32: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32), [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32), [[UV4:%[0-9]+]]:vgpr(s32), [[UV5:%[0-9]+]]:vgpr(s32), [[UV6:%[0-9]+]]:vgpr(s32), [[UV7:%[0-9]+]]:vgpr(s32), [[UV8:%[0-9]+]]:vgpr(s32), [[UV9:%[0-9]+]]:vgpr(s32), [[UV10:%[0-9]+]]:vgpr(s32), [[UV11:%[0-9]+]]:vgpr(s32), [[UV12:%[0-9]+]]:vgpr(s32), [[UV13:%[0-9]+]]:vgpr(s32), [[UV14:%[0-9]+]]:vgpr(s32), [[UV15:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[COPY]](<16 x s32>)
 ; WAVE32: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1
 ; WAVE32: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[C1]]
@@ -912,8 +906,8 @@
 ; WAVE32: [[C15:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 15
 ; WAVE32: [[ICMP14:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[C15]]
 ; WAVE32: [[SELECT14:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP14]](s1), [[UV15]], [[SELECT13]]
- ; WAVE32: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[SELECT14]](s32)
- ; WAVE32: $vgpr0 = COPY [[COPY3]](s32)
+ ; WAVE32: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[SELECT14]](s32)
+ ; WAVE32: $vgpr0 = COPY [[COPY2]](s32)
 %0:_(<16 x s32>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
 %1:_(s32) = COPY $vgpr16
 %2:_(s32) = G_CONSTANT i32 16
@@ -935,9 +929,8 @@
 ; WAVE64: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16
 ; WAVE64: [[COPY:%[0-9]+]]:vgpr(<8 x s64>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
 ; WAVE64: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr16
- ; WAVE64: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1
- ; WAVE64: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
- ; WAVE64: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY1]], [[COPY2]]
+ ; WAVE64: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 1
+ ; WAVE64: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY1]], [[C]]
 ; WAVE64: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32), [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32), [[UV4:%[0-9]+]]:vgpr(s32), [[UV5:%[0-9]+]]:vgpr(s32), [[UV6:%[0-9]+]]:vgpr(s32), [[UV7:%[0-9]+]]:vgpr(s32), [[UV8:%[0-9]+]]:vgpr(s32), [[UV9:%[0-9]+]]:vgpr(s32), [[UV10:%[0-9]+]]:vgpr(s32), [[UV11:%[0-9]+]]:vgpr(s32), [[UV12:%[0-9]+]]:vgpr(s32), [[UV13:%[0-9]+]]:vgpr(s32), [[UV14:%[0-9]+]]:vgpr(s32), [[UV15:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[COPY]](<8 x s64>)
 ; WAVE64: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1
 ; WAVE64: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[C1]]
@@ -967,17 +960,16 @@
 ; WAVE64: [[ICMP6:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[C7]]
 ; WAVE64: [[SELECT12:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP6]](s1), [[UV14]], [[SELECT10]]
 ; WAVE64: [[SELECT13:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP6]](s1), [[UV15]], [[SELECT11]]
- ; WAVE64: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[SELECT12]](s32)
- ; WAVE64: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY [[SELECT13]](s32)
- ; WAVE64: [[MV:%[0-9]+]]:vgpr(s64) = G_MERGE_VALUES [[COPY3]](s32), [[COPY4]](s32)
+ ; WAVE64: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[SELECT12]](s32)
+ ; WAVE64: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[SELECT13]](s32)
+ ; WAVE64: [[MV:%[0-9]+]]:vgpr(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
 ; WAVE64: $vgpr0_vgpr1 = COPY [[MV]](s64)
 ; WAVE32-LABEL: name: extract_vector_elt_v8s64_vv_idx_add1
 ; WAVE32: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16
 ; WAVE32: [[COPY:%[0-9]+]]:vgpr(<8 x s64>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
 ; WAVE32: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr16
- ; WAVE32: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1
- ; WAVE32: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
- ; WAVE32: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY1]], [[COPY2]]
+ ; WAVE32: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 1
+ ; WAVE32: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY1]], [[C]]
 ; WAVE32: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32), [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32), [[UV4:%[0-9]+]]:vgpr(s32), [[UV5:%[0-9]+]]:vgpr(s32), [[UV6:%[0-9]+]]:vgpr(s32), [[UV7:%[0-9]+]]:vgpr(s32), [[UV8:%[0-9]+]]:vgpr(s32), [[UV9:%[0-9]+]]:vgpr(s32), [[UV10:%[0-9]+]]:vgpr(s32), [[UV11:%[0-9]+]]:vgpr(s32), [[UV12:%[0-9]+]]:vgpr(s32), [[UV13:%[0-9]+]]:vgpr(s32), [[UV14:%[0-9]+]]:vgpr(s32), [[UV15:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[COPY]](<8 x s64>)
 ; WAVE32: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1
 ; WAVE32: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[C1]]
@@ -1007,9 +999,9 @@
 ; WAVE32: [[ICMP6:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[C7]]
 ; WAVE32: [[SELECT12:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP6]](s1), [[UV14]], [[SELECT10]]
 ; WAVE32: [[SELECT13:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP6]](s1), [[UV15]], [[SELECT11]]
- ; WAVE32: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[SELECT12]](s32)
- ; WAVE32: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY [[SELECT13]](s32)
- ; WAVE32: [[MV:%[0-9]+]]:vgpr(s64) = G_MERGE_VALUES [[COPY3]](s32), [[COPY4]](s32)
+ ; WAVE32: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[SELECT12]](s32)
+ ; WAVE32: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[SELECT13]](s32)
+ ; WAVE32: [[MV:%[0-9]+]]:vgpr(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
 ; WAVE32: $vgpr0_vgpr1 = COPY [[MV]](s64)
 %0:_(<8 x s64>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
 %1:_(s32) = COPY $vgpr16
@@ -1032,9 +1024,8 @@
 ; WAVE64: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15, $vgpr0
 ; WAVE64: [[COPY:%[0-9]+]]:sgpr(<16 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
 ; WAVE64: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
- ; WAVE64: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1
- ; WAVE64: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
- ; WAVE64: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY1]], [[COPY2]]
+ ; WAVE64: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 1
+ ; WAVE64: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY1]], [[C]]
 ; WAVE64: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32), [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32), [[UV4:%[0-9]+]]:vgpr(s32), [[UV5:%[0-9]+]]:vgpr(s32), [[UV6:%[0-9]+]]:vgpr(s32), [[UV7:%[0-9]+]]:vgpr(s32), [[UV8:%[0-9]+]]:vgpr(s32), [[UV9:%[0-9]+]]:vgpr(s32), [[UV10:%[0-9]+]]:vgpr(s32), [[UV11:%[0-9]+]]:vgpr(s32), [[UV12:%[0-9]+]]:vgpr(s32), [[UV13:%[0-9]+]]:vgpr(s32), [[UV14:%[0-9]+]]:vgpr(s32), [[UV15:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[COPY]](<16 x s32>)
 ; WAVE64: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1
 ; WAVE64: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[C1]]
@@ -1081,15 +1072,14 @@
 ; WAVE64: [[C15:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 15
 ; WAVE64: [[ICMP14:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[C15]]
 ; WAVE64: [[SELECT14:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP14]](s1), [[UV15]], [[SELECT13]]
- ; WAVE64: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[SELECT14]](s32)
- ; WAVE64: $vgpr0 = COPY [[COPY3]](s32)
+ ; WAVE64: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[SELECT14]](s32)
+ ; WAVE64: $vgpr0 = COPY [[COPY2]](s32)
 ; WAVE32-LABEL: name: extract_vector_elt_v16s32_sv_idx_add1
 ; WAVE32: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15, $vgpr0
 ; WAVE32: [[COPY:%[0-9]+]]:sgpr(<16 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
 ; WAVE32: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
- ; WAVE32: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1
- ; WAVE32: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
- ; WAVE32: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY1]], [[COPY2]]
+ ; WAVE32: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 1
+ ; WAVE32: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY1]], [[C]]
 ; WAVE32: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32), [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32), [[UV4:%[0-9]+]]:vgpr(s32), [[UV5:%[0-9]+]]:vgpr(s32), [[UV6:%[0-9]+]]:vgpr(s32), [[UV7:%[0-9]+]]:vgpr(s32), [[UV8:%[0-9]+]]:vgpr(s32), [[UV9:%[0-9]+]]:vgpr(s32), [[UV10:%[0-9]+]]:vgpr(s32), [[UV11:%[0-9]+]]:vgpr(s32), [[UV12:%[0-9]+]]:vgpr(s32), [[UV13:%[0-9]+]]:vgpr(s32), [[UV14:%[0-9]+]]:vgpr(s32), [[UV15:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[COPY]](<16 x s32>)
 ; WAVE32: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1
 ; WAVE32: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[C1]]
@@ -1136,8 +1126,8 @@
 ; WAVE32: [[C15:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 15
 ; WAVE32: [[ICMP14:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[C15]]
 ; WAVE32: [[SELECT14:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP14]](s1), [[UV15]], [[SELECT13]]
- ; WAVE32: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[SELECT14]](s32)
- ; WAVE32: $vgpr0 = COPY [[COPY3]](s32)
+ ; WAVE32: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[SELECT14]](s32)
+ ; WAVE32: $vgpr0 = COPY [[COPY2]](s32)
 %0:_(<16 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
 %1:_(s32) = COPY $vgpr0
 %2:_(s32) = G_CONSTANT i32 1
@@ -1159,9 +1149,8 @@
 ; WAVE64: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15, $vgpr0
 ; WAVE64: [[COPY:%[0-9]+]]:sgpr(<8 x s64>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
 ; WAVE64: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
- ; WAVE64: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1
- ; WAVE64: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
- ; WAVE64: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY1]], [[COPY2]]
+ ; WAVE64: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 1
+ ; WAVE64: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY1]], [[C]]
 ; WAVE64: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32), [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32), [[UV4:%[0-9]+]]:vgpr(s32), [[UV5:%[0-9]+]]:vgpr(s32), [[UV6:%[0-9]+]]:vgpr(s32), [[UV7:%[0-9]+]]:vgpr(s32), [[UV8:%[0-9]+]]:vgpr(s32), [[UV9:%[0-9]+]]:vgpr(s32), [[UV10:%[0-9]+]]:vgpr(s32), [[UV11:%[0-9]+]]:vgpr(s32), [[UV12:%[0-9]+]]:vgpr(s32), [[UV13:%[0-9]+]]:vgpr(s32), [[UV14:%[0-9]+]]:vgpr(s32), [[UV15:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[COPY]](<8 x s64>)
 ; WAVE64: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1
 ; WAVE64: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[C1]]
@@ -1191,17 +1180,16 @@
 ; WAVE64: [[ICMP6:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[C7]]
 ; WAVE64: [[SELECT12:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP6]](s1), [[UV14]], [[SELECT10]]
 ; WAVE64: [[SELECT13:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP6]](s1), [[UV15]], [[SELECT11]]
- ; WAVE64: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[SELECT12]](s32)
- ; WAVE64: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY [[SELECT13]](s32)
- ; WAVE64: [[MV:%[0-9]+]]:vgpr(s64) = G_MERGE_VALUES [[COPY3]](s32), [[COPY4]](s32)
+ ; WAVE64: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[SELECT12]](s32)
+ ; WAVE64: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[SELECT13]](s32)
+ ; WAVE64: [[MV:%[0-9]+]]:vgpr(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
 ; WAVE64: $vgpr0_vgpr1 = COPY [[MV]](s64)
 ; WAVE32-LABEL: name: extract_vector_elt_v8s64_sv_add1
 ; WAVE32: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15, $vgpr0
 ; WAVE32: [[COPY:%[0-9]+]]:sgpr(<8 x s64>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
 ; WAVE32: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
- ; WAVE32: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1
- ; WAVE32: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
- ; WAVE32: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY1]], [[COPY2]]
+ ; WAVE32: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 1
+ ; WAVE32: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY1]], [[C]]
 ; WAVE32: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32), [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32), [[UV4:%[0-9]+]]:vgpr(s32), [[UV5:%[0-9]+]]:vgpr(s32), [[UV6:%[0-9]+]]:vgpr(s32), [[UV7:%[0-9]+]]:vgpr(s32), [[UV8:%[0-9]+]]:vgpr(s32), [[UV9:%[0-9]+]]:vgpr(s32), [[UV10:%[0-9]+]]:vgpr(s32), [[UV11:%[0-9]+]]:vgpr(s32), [[UV12:%[0-9]+]]:vgpr(s32), [[UV13:%[0-9]+]]:vgpr(s32), [[UV14:%[0-9]+]]:vgpr(s32), [[UV15:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[COPY]](<8 x s64>)
 ; WAVE32: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1
 ; WAVE32: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[C1]]
@@ -1231,9 +1219,9 @@
 ; WAVE32: [[ICMP6:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[C7]]
 ; WAVE32: [[SELECT12:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP6]](s1), [[UV14]], [[SELECT10]]
 ; WAVE32: [[SELECT13:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP6]](s1), [[UV15]], [[SELECT11]]
- ; WAVE32: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[SELECT12]](s32)
- ; WAVE32: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY [[SELECT13]](s32)
- ; WAVE32: [[MV:%[0-9]+]]:vgpr(s64) = G_MERGE_VALUES [[COPY3]](s32), [[COPY4]](s32)
+ ; WAVE32: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[SELECT12]](s32)
+ ; WAVE32: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[SELECT13]](s32)
+ ; WAVE32: [[MV:%[0-9]+]]:vgpr(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
 ; WAVE32: $vgpr0_vgpr1 = COPY [[MV]](s64)
 %0:_(<8 x s64>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
 %1:_(s32) = COPY $vgpr0
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-or.mir
===================================================================
--- llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-or.mir
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-or.mir
@@ -107,11 +107,9 @@
 ; CHECK-LABEL: name: or_i1_vcc_vcc
 ; CHECK: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
 ; CHECK: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1
- ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
- ; CHECK: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
- ; CHECK: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(ne), [[COPY]](s32), [[COPY2]]
- ; CHECK: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
- ; CHECK: [[ICMP1:%[0-9]+]]:vcc(s1) = G_ICMP intpred(ne), [[COPY1]](s32), [[COPY3]]
+ ; CHECK: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+ ; CHECK: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(ne), [[COPY]](s32), [[C]]
+ ; CHECK: [[ICMP1:%[0-9]+]]:vcc(s1) = G_ICMP intpred(ne), [[COPY1]](s32), [[C]]
 ; CHECK: [[OR:%[0-9]+]]:vcc(s1) = G_OR [[ICMP]], [[ICMP1]]
 ; CHECK: S_NOP 0, implicit [[OR]](s1)
 %0:_(s32) = COPY $vgpr0
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-phi.mir
===================================================================
--- llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-phi.mir
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-phi.mir
@@ -306,17 +306,16 @@
 ; FAST: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0
 ; FAST: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr1
 ; FAST: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
- ; FAST: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
- ; FAST: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
- ; FAST: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY2]](s32), [[COPY3]]
+ ; FAST: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+ ; FAST: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY2]](s32), [[C]]
 ; FAST: G_BRCOND [[ICMP]](s1), %bb.1
 ; FAST: G_BR %bb.2
 ; FAST: bb.1:
 ; FAST: successors: %bb.2(0x80000000)
- ; FAST: [[COPY4:%[0-9]+]]:sgpr(s32) = COPY [[COPY1]](s32)
+ ; FAST: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY [[COPY1]](s32)
 ; FAST: G_BR %bb.2
 ; FAST: bb.2:
- ; FAST: [[PHI:%[0-9]+]]:sgpr(s32) = G_PHI [[COPY]](s32), %bb.0, [[COPY4]](s32), %bb.1
+ ; FAST: [[PHI:%[0-9]+]]:sgpr(s32) = G_PHI [[COPY]](s32), %bb.0, [[COPY3]](s32), %bb.1
 ; FAST: $sgpr0 = COPY [[PHI]](s32)
 ; FAST: S_SETPC_B64 undef $sgpr30_sgpr31
 ; GREEDY-LABEL: name: phi_s32_ss_vcc_sbranch
@@ -326,17 +325,16 @@
 ; GREEDY: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0
 ; GREEDY: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr1
 ; GREEDY: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
- ; GREEDY: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
- ; GREEDY: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
- ; GREEDY: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY2]](s32), [[COPY3]]
+ ; GREEDY: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+ ; GREEDY: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY2]](s32), [[C]]
 ; GREEDY: G_BRCOND [[ICMP]](s1), %bb.1
 ; GREEDY: G_BR %bb.2
 ; GREEDY: bb.1:
 ; GREEDY: successors: %bb.2(0x80000000)
- ; GREEDY: [[COPY4:%[0-9]+]]:sgpr(s32) = COPY [[COPY1]](s32)
+ ; GREEDY: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY [[COPY1]](s32)
 ; GREEDY: G_BR %bb.2
 ; GREEDY: bb.2:
- ; GREEDY: [[PHI:%[0-9]+]]:sgpr(s32) = G_PHI [[COPY]](s32), %bb.0, [[COPY4]](s32), %bb.1
+ ; GREEDY: [[PHI:%[0-9]+]]:sgpr(s32) = G_PHI [[COPY]](s32), %bb.0, [[COPY3]](s32), %bb.1
 ; GREEDY: $sgpr0 = COPY [[PHI]](s32)
 ; GREEDY: S_SETPC_B64 undef $sgpr30_sgpr31
 bb.0:
@@ -377,17 +375,16 @@
 ; FAST: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0
 ; FAST: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
 ; FAST: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr1
- ; FAST: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
- ; FAST: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
- ; FAST: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY2]](s32), [[COPY3]]
+ ; FAST: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+ ; FAST: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY2]](s32), [[C]]
 ; FAST: G_BRCOND [[ICMP]](s1), %bb.1
 ; FAST: G_BR %bb.2
 ; FAST: bb.1:
 ; FAST: successors: %bb.2(0x80000000)
- ; FAST: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY [[COPY1]](s32)
+ ; FAST: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[COPY1]](s32)
 ; FAST: G_BR %bb.2
 ; FAST: bb.2:
- ; FAST: [[PHI:%[0-9]+]]:vgpr(s32) = G_PHI [[COPY]](s32), %bb.0, [[COPY4]](s32), %bb.1
+ ; FAST: [[PHI:%[0-9]+]]:vgpr(s32) = G_PHI [[COPY]](s32), %bb.0, [[COPY3]](s32), %bb.1
 ; FAST: $vgpr0 = COPY [[PHI]](s32)
 ; FAST: S_SETPC_B64 undef $sgpr30_sgpr31
 ; GREEDY-LABEL: name: phi_s32_sv_vcc_sbranch
@@ -397,17 +394,16 @@
 ; GREEDY: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0
 ; GREEDY: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
 ; GREEDY: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr1
- ; GREEDY: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
- ; GREEDY: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
- ; GREEDY: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY2]](s32), [[COPY3]]
+ ; GREEDY: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+ ; GREEDY: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY2]](s32), [[C]]
 ; GREEDY: G_BRCOND [[ICMP]](s1), %bb.1
 ; GREEDY: G_BR %bb.2
 ; GREEDY: bb.1:
 ; GREEDY: successors: %bb.2(0x80000000)
- ; GREEDY: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY [[COPY1]](s32)
+ ; GREEDY: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[COPY1]](s32)
 ; GREEDY: G_BR %bb.2
 ; GREEDY: bb.2:
- ; GREEDY: [[PHI:%[0-9]+]]:vgpr(s32) = G_PHI [[COPY]](s32), %bb.0, [[COPY4]](s32), %bb.1
+ ; GREEDY: [[PHI:%[0-9]+]]:vgpr(s32) = G_PHI [[COPY]](s32), %bb.0, [[COPY3]](s32), %bb.1
 ; GREEDY: $vgpr0 = COPY [[PHI]](s32)
 ; GREEDY: S_SETPC_B64 undef $sgpr30_sgpr31
 bb.0:
@@ -448,17 +444,16 @@
 ; FAST: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
 ; FAST: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr0
 ; FAST: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr1
- ; FAST: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
- ; FAST: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
- ; FAST: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY2]](s32), [[COPY3]]
+ ; FAST: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+ ; FAST: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY2]](s32), [[C]]
 ; FAST: G_BRCOND [[ICMP]](s1), %bb.1
 ; FAST: G_BR %bb.2
 ; FAST: bb.1:
 ; FAST: successors: %bb.2(0x80000000)
- ; FAST: [[COPY4:%[0-9]+]]:sgpr(s32) = COPY [[COPY1]](s32)
+ ; FAST: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY [[COPY1]](s32)
 ; FAST: G_BR %bb.2
 ; FAST: bb.2:
- ; FAST: [[PHI:%[0-9]+]]:vgpr(s32) = G_PHI [[COPY]](s32), %bb.0, [[COPY4]](s32), %bb.1
+ ; FAST: [[PHI:%[0-9]+]]:vgpr(s32) = G_PHI [[COPY]](s32), %bb.0, [[COPY3]](s32), %bb.1
 ; FAST: $vgpr0 = COPY [[PHI]](s32)
 ; FAST: S_SETPC_B64 undef $sgpr30_sgpr31
 ; GREEDY-LABEL: name: phi_s32_vs_vcc_sbranch
@@ -468,17 +463,16 @@
 ; GREEDY: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
 ; GREEDY: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr0
 ; GREEDY: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr1
- ; GREEDY: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
- ; GREEDY: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
- ; GREEDY: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY2]](s32), [[COPY3]]
+ ; GREEDY: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+ ; GREEDY: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY2]](s32), [[C]]
 ; GREEDY: G_BRCOND [[ICMP]](s1), %bb.1
 ; GREEDY: G_BR %bb.2
 ; GREEDY: bb.1:
 ; GREEDY: successors: %bb.2(0x80000000)
- ; GREEDY: [[COPY4:%[0-9]+]]:sgpr(s32) = COPY [[COPY1]](s32)
+ ; GREEDY: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY [[COPY1]](s32)
 ; GREEDY: G_BR %bb.2
 ; GREEDY: bb.2:
- ; GREEDY: [[PHI:%[0-9]+]]:vgpr(s32) = G_PHI [[COPY]](s32), %bb.0, [[COPY4]](s32), %bb.1
+ ; GREEDY: [[PHI:%[0-9]+]]:vgpr(s32) = G_PHI [[COPY]](s32), %bb.0, [[COPY3]](s32), %bb.1
 ; GREEDY: $vgpr0 = COPY [[PHI]](s32)
 ; GREEDY: S_SETPC_B64 undef $sgpr30_sgpr31
 bb.0:
@@ -519,17 +513,16 @@
 ; FAST: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
 ; FAST: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1
 ; FAST: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr2
- ; FAST: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
- ; FAST: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
- ; FAST: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY2]](s32), [[COPY3]]
+ ; FAST: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+ ; FAST: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY2]](s32), [[C]]
 ; FAST: G_BRCOND [[ICMP]](s1), %bb.1
 ; FAST: G_BR %bb.2
 ; FAST: bb.1:
 ; FAST: successors: %bb.2(0x80000000)
- ; FAST: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY [[COPY1]](s32)
+ ; FAST: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[COPY1]](s32)
 ; FAST: G_BR %bb.2
 ; FAST: bb.2:
- ; FAST: [[PHI:%[0-9]+]]:vgpr(s32) = G_PHI [[COPY]](s32), %bb.0, [[COPY4]](s32), %bb.1
+ ; FAST: [[PHI:%[0-9]+]]:vgpr(s32) = G_PHI [[COPY]](s32), %bb.0, [[COPY3]](s32), %bb.1
 ; FAST: $vgpr0 = COPY [[PHI]](s32)
 ; FAST: S_SETPC_B64 undef $sgpr30_sgpr31
 ; GREEDY-LABEL: name: phi_s32_vv_vcc_sbranch
@@ -539,17 +532,16 @@
 ; GREEDY: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
 ; GREEDY: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1
 ; GREEDY: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr2
- ; GREEDY: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
- ; GREEDY: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
- ; GREEDY: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY2]](s32), [[COPY3]]
+ ; GREEDY: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+ ; GREEDY: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY2]](s32), [[C]]
 ; GREEDY: G_BRCOND [[ICMP]](s1), %bb.1
 ; GREEDY: G_BR %bb.2
 ; GREEDY: bb.1:
 ; GREEDY: successors: %bb.2(0x80000000)
- ; GREEDY: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY [[COPY1]](s32)
+ ; GREEDY: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[COPY1]](s32)
 ; GREEDY: G_BR %bb.2
 ; GREEDY: bb.2:
- ; GREEDY: [[PHI:%[0-9]+]]:vgpr(s32) = G_PHI [[COPY]](s32), %bb.0, [[COPY4]](s32), %bb.1
+ ; GREEDY: [[PHI:%[0-9]+]]:vgpr(s32) = G_PHI [[COPY]](s32), %bb.0, [[COPY3]](s32), %bb.1
 ; GREEDY: $vgpr0 = COPY [[PHI]](s32)
 ; GREEDY: S_SETPC_B64 undef $sgpr30_sgpr31
 bb.0:
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-ptr-add.mir
===================================================================
--- llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-ptr-add.mir
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-ptr-add.mir
@@ -13,7 +13,7 @@
 ; CHECK-LABEL: name: gep_p1_s_k
 ; CHECK: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
 ; CHECK: [[C:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 1
- ; CHECK: [[GEP:%[0-9]+]]:sgpr(p1) = G_PTR_ADD [[COPY]], [[C]](s64)
+ ; CHECK: [[PTR_ADD:%[0-9]+]]:sgpr(p1) = G_PTR_ADD [[COPY]], [[C]](s64)
 %0:_(p1) = COPY $sgpr0_sgpr1
 %1:_(s64) = G_CONSTANT i64 1
 %2:_(p1) = G_PTR_ADD %0, %1
@@ -30,7 +30,7 @@
 ; CHECK-LABEL: name: gep_p1_s_s
 ; CHECK: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
 ; CHECK: [[COPY1:%[0-9]+]]:sgpr(s64) = COPY $sgpr2_sgpr3
- ; CHECK: [[GEP:%[0-9]+]]:sgpr(p1) = G_PTR_ADD [[COPY]], [[COPY1]](s64)
+ ; CHECK: [[PTR_ADD:%[0-9]+]]:sgpr(p1) = G_PTR_ADD [[COPY]], [[COPY1]](s64)
 %0:_(p1) = COPY $sgpr0_sgpr1
 %1:_(s64) = COPY $sgpr2_sgpr3
 %2:_(p1) = G_PTR_ADD %0, %1
@@ -48,7 +48,7 @@
 ; CHECK: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
 ; CHECK: [[C:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 1
 ; CHECK: [[COPY1:%[0-9]+]]:vgpr(s64) = COPY [[C]](s64)
- ; CHECK: [[GEP:%[0-9]+]]:vgpr(p1) = G_PTR_ADD [[COPY]], [[COPY1]](s64)
+ ; CHECK: [[PTR_ADD:%[0-9]+]]:vgpr(p1) = G_PTR_ADD [[COPY]], [[COPY1]](s64)
 %0:_(p1) = COPY $vgpr0_vgpr1
 %1:_(s64) = G_CONSTANT i64 1
 %2:_(p1) = G_PTR_ADD %0, %1
@@ -66,7 +66,7 @@
 ; CHECK: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
 ; CHECK: [[COPY1:%[0-9]+]]:sgpr(s64) = COPY $sgpr0_sgpr1
 ; CHECK: [[COPY2:%[0-9]+]]:vgpr(s64) = COPY [[COPY1]](s64)
- ; CHECK: [[GEP:%[0-9]+]]:vgpr(p1) = G_PTR_ADD [[COPY]], [[COPY2]](s64)
+ ; CHECK: [[PTR_ADD:%[0-9]+]]:vgpr(p1) = G_PTR_ADD [[COPY]], [[COPY2]](s64)
 %0:_(p1) = COPY $vgpr0_vgpr1
 %1:_(s64) = COPY $sgpr0_sgpr1
 %2:_(p1) = G_PTR_ADD %0, %1
@@ -83,7 +83,7 @@
 ; CHECK-LABEL: name: gep_p1_v_v
 ; CHECK: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
 ; CHECK: [[COPY1:%[0-9]+]]:vgpr(s64) = COPY $vgpr2_vgpr3
- ; CHECK: [[GEP:%[0-9]+]]:vgpr(p1) = G_PTR_ADD [[COPY]], [[COPY1]](s64)
+ ; CHECK: [[PTR_ADD:%[0-9]+]]:vgpr(p1) = G_PTR_ADD [[COPY]], [[COPY1]](s64)
 %0:_(p1) = COPY $vgpr0_vgpr1
 %1:_(s64) = COPY $vgpr2_vgpr3
 %2:_(p1) = G_PTR_ADD %0, %1
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-sbfx.mir
===================================================================
--- llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-sbfx.mir
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-sbfx.mir
@@ -37,11 +37,9 @@
 ; CHECK-LABEL: name: test_sbfx_s32_vii
 ; CHECK: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
- ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 10
- ; CHECK: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4
- ; CHECK: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
- ; CHECK: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[C1]](s32)
- ; CHECK: [[SBFX:%[0-9]+]]:vgpr(s32) = G_SBFX [[COPY]], [[COPY1]](s32), [[COPY2]]
+ ; CHECK: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 10
+ ; CHECK: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 4
+ ; CHECK: [[SBFX:%[0-9]+]]:vgpr(s32) = G_SBFX [[COPY]], [[C]](s32), [[C1]]
 ; CHECK: $vgpr0 = COPY [[SBFX]](s32)
 %0:_(s32) = COPY $vgpr0
 %1:_(s32) = G_CONSTANT i32 10
@@ -139,14 +137,12 @@
 ; CHECK-LABEL: name: test_sbfx_s64_vii_small
 ; CHECK: [[COPY:%[0-9]+]]:vgpr(s64) = COPY $vgpr0_vgpr1
- ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 31
- ; CHECK: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4
- ; CHECK: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
- ; CHECK: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[C1]](s32)
- ; CHECK: [[ASHR:%[0-9]+]]:vgpr(s64) = G_ASHR [[COPY]], [[COPY1]](s32)
+ ; CHECK: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 31
+ ; CHECK: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 4
+ ; CHECK: [[ASHR:%[0-9]+]]:vgpr(s64) = G_ASHR [[COPY]], [[C]](s32)
 ; CHECK: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[ASHR]](s64)
 ; CHECK: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
- ; CHECK: [[SBFX:%[0-9]+]]:vgpr(s32) = G_SBFX [[UV]], [[C2]](s32), [[COPY2]]
+ ; CHECK: [[SBFX:%[0-9]+]]:vgpr(s32) = G_SBFX [[UV]], [[C2]](s32), [[C1]]
 ; CHECK: [[C3:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 31
 ; CHECK: [[ASHR1:%[0-9]+]]:vgpr(s32) = G_ASHR [[SBFX]], [[C3]](s32)
 ; CHECK: [[MV:%[0-9]+]]:vgpr(s64) = G_MERGE_VALUES [[SBFX]](s32), [[ASHR1]](s32)
@@ -168,11 +164,9 @@
 ; CHECK-LABEL: name: test_sbfx_s64_vii_big
 ; CHECK: [[COPY:%[0-9]+]]:vgpr(s64) = COPY $vgpr0_vgpr1
- ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 8
- ; CHECK: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 40
- ; CHECK: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
- ; CHECK: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[C1]](s32)
- ; CHECK: [[ASHR:%[0-9]+]]:vgpr(s64) = G_ASHR [[COPY]], [[COPY1]](s32)
+ ; CHECK: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 8
+ ; CHECK: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 40
+ ; CHECK: [[ASHR:%[0-9]+]]:vgpr(s64) = G_ASHR [[COPY]], [[C]](s32)
 ; CHECK: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[ASHR]](s64)
 ; CHECK: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
 ; CHECK: [[C3:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 8
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-ubfx.mir
===================================================================
--- llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-ubfx.mir
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-ubfx.mir
@@ -37,11 +37,9 @@
 ; CHECK-LABEL: name: test_ubfx_s32_vii
 ; CHECK: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
- ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 10
- ; CHECK: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4
- ; CHECK: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
- ; CHECK: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[C1]](s32)
- ; CHECK: [[UBFX:%[0-9]+]]:vgpr(s32) = G_UBFX [[COPY]], [[COPY1]](s32), [[COPY2]]
+ ; CHECK: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 10
+ ; CHECK: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 4
+ ; CHECK: [[UBFX:%[0-9]+]]:vgpr(s32) = G_UBFX [[COPY]], [[C]](s32), [[C1]]
 ; CHECK: $vgpr0 = COPY [[UBFX]](s32)
 %0:_(s32) = COPY $vgpr0
 %1:_(s32) = G_CONSTANT i32 10
@@ -139,14 +137,12 @@
 ; CHECK-LABEL: name: test_ubfx_s64_vii_small
 ; CHECK: [[COPY:%[0-9]+]]:vgpr(s64) = COPY $vgpr0_vgpr1
- ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 31
- ; CHECK: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4
- ; CHECK: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
- ; CHECK: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[C1]](s32)
- ; CHECK: [[LSHR:%[0-9]+]]:vgpr(s64) = G_LSHR [[COPY]], [[COPY1]](s32)
+ ; CHECK: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 31
+ ; CHECK: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 4
+ ; CHECK: [[LSHR:%[0-9]+]]:vgpr(s64) = G_LSHR [[COPY]], [[C]](s32)
 ; CHECK: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[LSHR]](s64)
 ; CHECK: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
- ; CHECK: [[UBFX:%[0-9]+]]:vgpr(s32) = G_UBFX [[UV]], [[C2]](s32), [[COPY2]]
+ ; CHECK: [[UBFX:%[0-9]+]]:vgpr(s32) = G_UBFX [[UV]], [[C2]](s32), [[C1]]
 ; CHECK: [[MV:%[0-9]+]]:vgpr(s64) = G_MERGE_VALUES [[UBFX]](s32), [[C2]](s32)
 ; CHECK: $vgpr0_vgpr1 = COPY [[MV]](s64)
 %0:_(s64) = COPY $vgpr0_vgpr1
@@ -166,11 +162,9 @@
 ; CHECK-LABEL: name: test_ubfx_s64_vii_big
 ; CHECK: [[COPY:%[0-9]+]]:vgpr(s64) = COPY $vgpr0_vgpr1
- ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 8
- ; CHECK: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 40
- ; CHECK: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
- ; CHECK: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[C1]](s32)
- ; CHECK: [[LSHR:%[0-9]+]]:vgpr(s64) = G_LSHR [[COPY]], [[COPY1]](s32)
+ ; CHECK: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 8
+ ; CHECK: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 40
+ ; CHECK: [[LSHR:%[0-9]+]]:vgpr(s64) = G_LSHR [[COPY]], [[C]](s32)
 ; CHECK: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[LSHR]](s64)
 ; CHECK: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
 ; CHECK: [[C3:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 8
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-waterfall-agpr.mir
===================================================================
--- llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-waterfall-agpr.mir
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-waterfall-agpr.mir
@@ -20,18 +20,17 @@
 ; CHECK: %rsrc:sgpr(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
 ; CHECK: %agpr:agpr(s32) = COPY $agpr0
 ; CHECK: %voffset:vgpr(s32) = COPY $vgpr1
- ; CHECK: %zero:sgpr(s32) = G_CONSTANT i32 0
- ; CHECK: [[COPY:%[0-9]+]]:vgpr(s32) = COPY %zero(s32)
+ ; CHECK: %zero:vgpr(s32) = G_CONSTANT i32 0
 ; CHECK: [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
- ; CHECK: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY %agpr(s32)
+ ; CHECK: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY %agpr(s32)
 ; CHECK: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec
 ; CHECK: .1:
 ; CHECK: successors: %bb.2(0x40000000), %bb.1(0x40000000)
- ; CHECK: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.0, %9, %bb.1
- ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[COPY1]](s32), implicit $exec
- ; CHECK: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_]](s32), [[COPY1]](s32), implicit $exec
+ ; CHECK: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.0, %8, %bb.1
+ ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[COPY]](s32), implicit $exec
+ ; CHECK: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_]](s32), [[COPY]](s32), implicit $exec
 ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[V_CMP_EQ_U32_e64_]], implicit-def $exec, implicit-def $scc, implicit $exec
- ; CHECK: G_AMDGPU_BUFFER_STORE %val(s32), %rsrc(<4 x s32>), [[COPY]](s32), %voffset, [[V_READFIRSTLANE_B32_]], 0, 0, 0 :: (dereferenceable store (s32), addrspace 4)
+ ; CHECK: G_AMDGPU_BUFFER_STORE %val(s32), %rsrc(<4 x s32>), %zero(s32), %voffset, [[V_READFIRSTLANE_B32_]], 0, 0, 0 :: (dereferenceable store (s32), addrspace 4)
 ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
 ; CHECK: SI_WATERFALL_LOOP %bb.1, implicit $exec
 ; CHECK: .2:
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-xor.mir
===================================================================
--- llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-xor.mir
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-xor.mir
@@ -107,11 +107,9 @@
 ; CHECK-LABEL: name: xor_i1_vcc_vcc
 ; CHECK: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
 ; CHECK: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1
- ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
- ; CHECK: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
- ; CHECK: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(ne), [[COPY]](s32), [[COPY2]]
- ; CHECK: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
- ; CHECK: [[ICMP1:%[0-9]+]]:vcc(s1) = G_ICMP intpred(ne), [[COPY1]](s32), [[COPY3]]
+ ; CHECK: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+ ; CHECK: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(ne), [[COPY]](s32), [[C]]
+ ; CHECK:
[[ICMP1:%[0-9]+]]:vcc(s1) = G_ICMP intpred(ne), [[COPY1]](s32), [[C]] ; CHECK: [[XOR:%[0-9]+]]:vcc(s1) = G_XOR [[ICMP]], [[ICMP1]] ; CHECK: S_NOP 0, implicit [[XOR]](s1) %0:_(s32) = COPY $vgpr0 @@ -760,12 +758,11 @@ liveins: $vgpr0, $vgpr1 ; CHECK-LABEL: name: xor_i1_vcc_constant ; CHECK: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 - ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 - ; CHECK: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) - ; CHECK: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(ne), [[COPY]](s32), [[COPY1]] + ; CHECK: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; CHECK: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(ne), [[COPY]](s32), [[C]] ; CHECK: [[C1:%[0-9]+]]:sgpr(s1) = G_CONSTANT i1 true - ; CHECK: [[COPY2:%[0-9]+]]:vcc(s1) = COPY [[C1]](s1) - ; CHECK: [[XOR:%[0-9]+]]:vcc(s1) = G_XOR [[ICMP]], [[COPY2]] + ; CHECK: [[COPY1:%[0-9]+]]:vcc(s1) = COPY [[C1]](s1) + ; CHECK: [[XOR:%[0-9]+]]:vcc(s1) = G_XOR [[ICMP]], [[COPY1]] ; CHECK: S_NOP 0, implicit [[XOR]](s1) %0:_(s32) = COPY $vgpr0 %1:_(s32) = G_CONSTANT i32 0 Index: llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll @@ -238,23 +238,23 @@ ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v2, 8, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 24, v0 -; GFX6-NEXT: s_brev_b32 s5, 1 -; GFX6-NEXT: v_min_i32_e32 v5, 0, v0 +; GFX6-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX6-NEXT: v_min_i32_e32 v7, 0, v0 ; GFX6-NEXT: v_lshrrev_b32_e32 v3, 8, v1 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 24, v1 -; GFX6-NEXT: s_brev_b32 s4, -2 -; GFX6-NEXT: v_max_i32_e32 v4, 0, v0 -; GFX6-NEXT: v_sub_i32_e32 v5, vcc, s5, v5 -; GFX6-NEXT: v_sub_i32_e32 v4, vcc, s4, v4 -; GFX6-NEXT: v_max_i32_e32 v1, v5, v1 -; GFX6-NEXT: v_min_i32_e32 v1, v1, v4 +; GFX6-NEXT: v_bfrev_b32_e32 v4, -2 +; GFX6-NEXT: v_max_i32_e32 v6, 0, v0 +; GFX6-NEXT: v_sub_i32_e32 v7, vcc, v5, v7 +; GFX6-NEXT: v_sub_i32_e32 v6, vcc, v4, v6 +; GFX6-NEXT: v_max_i32_e32 v1, v7, v1 +; GFX6-NEXT: v_min_i32_e32 v1, v1, v6 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 24, v2 -; GFX6-NEXT: v_min_i32_e32 v4, 0, v1 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 24, v3 ; GFX6-NEXT: v_max_i32_e32 v3, 0, v1 -; GFX6-NEXT: v_sub_i32_e32 v4, vcc, s5, v4 -; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s4, v3 +; GFX6-NEXT: v_sub_i32_e32 v3, vcc, v4, v3 +; GFX6-NEXT: v_min_i32_e32 v4, 0, v1 +; GFX6-NEXT: v_sub_i32_e32 v4, vcc, v5, v4 ; GFX6-NEXT: v_max_i32_e32 v2, v4, v2 ; GFX6-NEXT: v_min_i32_e32 v2, v2, v3 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v2 @@ -273,21 +273,21 @@ ; GFX8-NEXT: v_mov_b32_e32 v2, 8 ; GFX8-NEXT: v_lshrrev_b32_sdwa v3, v2, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; GFX8-NEXT: s_movk_i32 s5, 0x8000 -; GFX8-NEXT: v_min_i16_e32 v5, 0, v0 +; GFX8-NEXT: v_mov_b32_e32 v5, 0xffff8000 +; GFX8-NEXT: v_min_i16_e32 v7, 0, v0 ; GFX8-NEXT: v_lshrrev_b32_sdwa v2, v2, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX8-NEXT: s_movk_i32 s4, 0x7fff -; GFX8-NEXT: v_max_i16_e32 v4, 0, v0 -; GFX8-NEXT: v_sub_u16_e32 v5, s5, v5 -; GFX8-NEXT: v_sub_u16_e32 v4, s4, v4 -; GFX8-NEXT: v_max_i16_e32 v1, v5, v1 -; GFX8-NEXT: v_min_i16_e32 v1, v1, v4 -; GFX8-NEXT: v_min_i16_e32 v4, 0, v3 +; GFX8-NEXT: v_mov_b32_e32 v4, 0x7fff +; GFX8-NEXT: v_max_i16_e32 v6, 0, v0 +; GFX8-NEXT: v_sub_u16_e32 v7, v5, 
v7 +; GFX8-NEXT: v_sub_u16_e32 v6, v4, v6 +; GFX8-NEXT: v_max_i16_e32 v1, v7, v1 +; GFX8-NEXT: v_min_i16_e32 v1, v1, v6 ; GFX8-NEXT: v_add_u16_e32 v0, v0, v1 ; GFX8-NEXT: v_max_i16_e32 v1, 0, v3 -; GFX8-NEXT: v_sub_u16_e32 v4, s5, v4 -; GFX8-NEXT: v_sub_u16_e32 v1, s4, v1 +; GFX8-NEXT: v_sub_u16_e32 v1, v4, v1 +; GFX8-NEXT: v_min_i16_e32 v4, 0, v3 +; GFX8-NEXT: v_sub_u16_e32 v4, v5, v4 ; GFX8-NEXT: v_max_i16_e32 v2, v4, v2 ; GFX8-NEXT: v_min_i16_e32 v1, v2, v1 ; GFX8-NEXT: v_add_u16_e32 v1, v3, v1 @@ -310,8 +310,8 @@ ; GFX9-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1] ; GFX9-NEXT: v_pk_add_i16 v0, v0, v1 clamp ; GFX9-NEXT: v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1] -; GFX9-NEXT: s_movk_i32 s4, 0xff -; GFX9-NEXT: v_and_b32_sdwa v1, v0, s4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v1, 0xff +; GFX9-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -323,14 +323,14 @@ ; GFX10-NEXT: v_mov_b32_e32 v2, 0xffff ; GFX10-NEXT: v_lshrrev_b32_sdwa v3, s4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX10-NEXT: v_lshrrev_b32_sdwa v4, s4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: s_movk_i32 s4, 0xff ; GFX10-NEXT: v_and_or_b32 v0, v0, v2, v3 ; GFX10-NEXT: v_and_or_b32 v1, v1, v2, v4 ; GFX10-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1] ; GFX10-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1] ; GFX10-NEXT: v_pk_add_i16 v0, v0, v1 clamp +; GFX10-NEXT: v_mov_b32_e32 v1, 0xff ; GFX10-NEXT: v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1] -; GFX10-NEXT: v_and_b32_sdwa v1, v0, s4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-NEXT: s_setpc_b64 s[30:31] %lhs = bitcast i16 %lhs.arg to <2 x i8> @@ -439,8 +439,8 @@ ; GFX9-NEXT: v_mov_b32_e32 v0, s1 ; GFX9-NEXT: v_pk_add_i16 v0, s0, v0 clamp ; GFX9-NEXT: v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1] -; GFX9-NEXT: s_movk_i32 s0, 0xff -; GFX9-NEXT: v_and_b32_sdwa v1, v0, s0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v1, 0xff +; GFX9-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_readfirstlane_b32 s0, v0 ; GFX9-NEXT: ; return to shader part epilog @@ -460,10 +460,10 @@ ; GFX10-NEXT: s_lshl_b32 s2, s4, 8 ; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s3 ; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s2 +; GFX10-NEXT: v_mov_b32_e32 v1, 0xff ; GFX10-NEXT: v_pk_add_i16 v0, s0, s1 clamp -; GFX10-NEXT: s_movk_i32 s0, 0xff ; GFX10-NEXT: v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1] -; GFX10-NEXT: v_and_b32_sdwa v1, v0, s0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-NEXT: ; return to shader part epilog @@ -482,61 +482,59 @@ ; GFX6-NEXT: 
v_lshrrev_b32_e32 v3, 16, v0 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 24, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 24, v0 -; GFX6-NEXT: s_brev_b32 s5, 1 -; GFX6-NEXT: v_min_i32_e32 v10, 0, v0 +; GFX6-NEXT: v_bfrev_b32_e32 v9, 1 +; GFX6-NEXT: v_min_i32_e32 v11, 0, v0 ; GFX6-NEXT: v_lshrrev_b32_e32 v5, 8, v1 ; GFX6-NEXT: v_lshrrev_b32_e32 v6, 16, v1 ; GFX6-NEXT: v_lshrrev_b32_e32 v7, 24, v1 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 24, v1 -; GFX6-NEXT: s_brev_b32 s4, -2 -; GFX6-NEXT: v_max_i32_e32 v8, 0, v0 -; GFX6-NEXT: v_sub_i32_e32 v10, vcc, s5, v10 -; GFX6-NEXT: v_sub_i32_e32 v8, vcc, s4, v8 -; GFX6-NEXT: v_max_i32_e32 v1, v10, v1 -; GFX6-NEXT: v_min_i32_e32 v1, v1, v8 +; GFX6-NEXT: v_bfrev_b32_e32 v8, -2 +; GFX6-NEXT: v_max_i32_e32 v10, 0, v0 +; GFX6-NEXT: v_sub_i32_e32 v11, vcc, v9, v11 +; GFX6-NEXT: v_sub_i32_e32 v10, vcc, v8, v10 +; GFX6-NEXT: v_max_i32_e32 v1, v11, v1 +; GFX6-NEXT: v_min_i32_e32 v1, v1, v10 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 24, v2 -; GFX6-NEXT: v_min_i32_e32 v8, 0, v1 +; GFX6-NEXT: v_min_i32_e32 v10, 0, v1 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 24, v5 ; GFX6-NEXT: v_max_i32_e32 v5, 0, v1 -; GFX6-NEXT: v_sub_i32_e32 v8, vcc, s5, v8 -; GFX6-NEXT: v_sub_i32_e32 v5, vcc, s4, v5 -; GFX6-NEXT: v_max_i32_e32 v2, v8, v2 +; GFX6-NEXT: v_sub_i32_e32 v10, vcc, v9, v10 +; GFX6-NEXT: v_sub_i32_e32 v5, vcc, v8, v5 +; GFX6-NEXT: v_max_i32_e32 v2, v10, v2 ; GFX6-NEXT: v_min_i32_e32 v2, v2, v5 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 24, v3 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 24, v6 ; GFX6-NEXT: v_min_i32_e32 v6, 0, v2 -; GFX6-NEXT: v_bfrev_b32_e32 v9, -2 ; GFX6-NEXT: v_max_i32_e32 v5, 0, v2 -; GFX6-NEXT: v_sub_i32_e32 v6, vcc, s5, v6 -; GFX6-NEXT: v_sub_i32_e32 v5, vcc, v9, v5 +; GFX6-NEXT: v_sub_i32_e32 v6, vcc, v9, v6 +; GFX6-NEXT: v_sub_i32_e32 v5, vcc, v8, v5 ; GFX6-NEXT: v_max_i32_e32 v3, v6, v3 ; GFX6-NEXT: v_min_i32_e32 v3, v3, v5 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 24, v4 -; GFX6-NEXT: v_bfrev_b32_e32 v11, 1 ; GFX6-NEXT: v_min_i32_e32 v6, 0, v3 -; GFX6-NEXT: v_ashrrev_i32_e32 v1, 24, v1 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 24, v7 ; GFX6-NEXT: v_max_i32_e32 v5, 0, v3 -; GFX6-NEXT: v_sub_i32_e32 v6, vcc, v11, v6 -; GFX6-NEXT: s_movk_i32 s4, 0xff -; GFX6-NEXT: v_ashrrev_i32_e32 v0, 24, v0 -; GFX6-NEXT: v_sub_i32_e32 v5, vcc, v9, v5 +; GFX6-NEXT: v_sub_i32_e32 v6, vcc, v9, v6 +; GFX6-NEXT: v_sub_i32_e32 v5, vcc, v8, v5 ; GFX6-NEXT: v_max_i32_e32 v4, v6, v4 -; GFX6-NEXT: v_and_b32_e32 v1, s4, v1 -; GFX6-NEXT: v_ashrrev_i32_e32 v2, 24, v2 ; GFX6-NEXT: v_min_i32_e32 v4, v4, v5 -; GFX6-NEXT: v_and_b32_e32 v0, s4, v0 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; GFX6-NEXT: v_ashrrev_i32_e32 v1, 24, v1 ; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v4 +; GFX6-NEXT: v_mov_b32_e32 v4, 0xff +; GFX6-NEXT: v_ashrrev_i32_e32 v0, 24, v0 +; GFX6-NEXT: v_and_b32_e32 v1, v1, v4 +; GFX6-NEXT: v_ashrrev_i32_e32 v2, 24, v2 +; GFX6-NEXT: v_and_b32_e32 v0, v0, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX6-NEXT: v_and_b32_e32 v1, s4, v2 +; GFX6-NEXT: v_and_b32_e32 v1, v2, v4 ; GFX6-NEXT: v_ashrrev_i32_e32 v3, 24, v3 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX6-NEXT: v_and_b32_e32 v1, s4, v3 +; GFX6-NEXT: v_and_b32_e32 v1, v3, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 24, v1 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -549,33 +547,32 @@ ; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v0 ; GFX8-NEXT: 
v_lshrrev_b32_e32 v5, 24, v0 ; GFX8-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; GFX8-NEXT: s_movk_i32 s5, 0x8000 -; GFX8-NEXT: v_min_i16_e32 v10, 0, v0 +; GFX8-NEXT: v_mov_b32_e32 v9, 0xffff8000 +; GFX8-NEXT: v_min_i16_e32 v11, 0, v0 ; GFX8-NEXT: v_lshrrev_b32_sdwa v2, v2, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v1 ; GFX8-NEXT: v_lshrrev_b32_e32 v7, 24, v1 ; GFX8-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX8-NEXT: s_movk_i32 s4, 0x7fff -; GFX8-NEXT: v_max_i16_e32 v8, 0, v0 -; GFX8-NEXT: v_sub_u16_e32 v10, s5, v10 -; GFX8-NEXT: v_sub_u16_e32 v8, s4, v8 -; GFX8-NEXT: v_max_i16_e32 v1, v10, v1 -; GFX8-NEXT: v_min_i16_e32 v1, v1, v8 -; GFX8-NEXT: v_min_i16_e32 v8, 0, v3 +; GFX8-NEXT: v_mov_b32_e32 v8, 0x7fff +; GFX8-NEXT: v_max_i16_e32 v10, 0, v0 +; GFX8-NEXT: v_sub_u16_e32 v11, v9, v11 +; GFX8-NEXT: v_sub_u16_e32 v10, v8, v10 +; GFX8-NEXT: v_max_i16_e32 v1, v11, v1 +; GFX8-NEXT: v_min_i16_e32 v1, v1, v10 +; GFX8-NEXT: v_min_i16_e32 v10, 0, v3 ; GFX8-NEXT: v_add_u16_e32 v0, v0, v1 ; GFX8-NEXT: v_max_i16_e32 v1, 0, v3 -; GFX8-NEXT: v_sub_u16_e32 v8, s5, v8 -; GFX8-NEXT: v_sub_u16_e32 v1, s4, v1 -; GFX8-NEXT: v_max_i16_e32 v2, v8, v2 +; GFX8-NEXT: v_sub_u16_e32 v10, v9, v10 +; GFX8-NEXT: v_sub_u16_e32 v1, v8, v1 +; GFX8-NEXT: v_max_i16_e32 v2, v10, v2 ; GFX8-NEXT: v_min_i16_e32 v1, v2, v1 ; GFX8-NEXT: v_lshlrev_b16_e32 v2, 8, v4 ; GFX8-NEXT: v_add_u16_e32 v1, v3, v1 ; GFX8-NEXT: v_lshlrev_b16_e32 v3, 8, v6 ; GFX8-NEXT: v_min_i16_e32 v6, 0, v2 -; GFX8-NEXT: v_mov_b32_e32 v9, 0x7fff ; GFX8-NEXT: v_max_i16_e32 v4, 0, v2 -; GFX8-NEXT: v_sub_u16_e32 v6, s5, v6 -; GFX8-NEXT: v_sub_u16_e32 v4, v9, v4 +; GFX8-NEXT: v_sub_u16_e32 v6, v9, v6 +; GFX8-NEXT: v_sub_u16_e32 v4, v8, v4 ; GFX8-NEXT: v_max_i16_e32 v3, v6, v3 ; GFX8-NEXT: v_min_i16_e32 v3, v3, v4 ; GFX8-NEXT: v_add_u16_e32 v2, v2, v3 @@ -583,8 +580,8 @@ ; GFX8-NEXT: v_min_i16_e32 v6, 0, v3 ; GFX8-NEXT: v_lshlrev_b16_e32 v4, 8, v7 ; GFX8-NEXT: v_max_i16_e32 v5, 0, v3 -; GFX8-NEXT: v_sub_u16_e32 v6, 0x8000, v6 -; GFX8-NEXT: v_sub_u16_e32 v5, v9, v5 +; GFX8-NEXT: v_sub_u16_e32 v6, v9, v6 +; GFX8-NEXT: v_sub_u16_e32 v5, v8, v5 ; GFX8-NEXT: v_max_i16_e32 v4, v6, v4 ; GFX8-NEXT: v_min_i16_e32 v4, v4, v5 ; GFX8-NEXT: v_add_u16_e32 v3, v3, v4 @@ -623,12 +620,12 @@ ; GFX9-NEXT: v_pk_add_i16 v0, v0, v1 clamp ; GFX9-NEXT: v_pk_add_i16 v1, v2, v3 clamp ; GFX9-NEXT: v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1] -; GFX9-NEXT: v_mov_b32_e32 v2, 8 +; GFX9-NEXT: v_mov_b32_e32 v3, 8 ; GFX9-NEXT: v_pk_ashrrev_i16 v1, 8, v1 op_sel_hi:[0,1] -; GFX9-NEXT: s_movk_i32 s4, 0xff -; GFX9-NEXT: v_lshlrev_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_and_or_b32 v0, v0, s4, v2 -; GFX9-NEXT: v_and_b32_e32 v2, s4, v1 +; GFX9-NEXT: v_mov_b32_e32 v2, 0xff +; GFX9-NEXT: v_lshlrev_b32_sdwa v3, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_and_or_b32 v0, v0, v2, v3 +; GFX9-NEXT: v_and_b32_e32 v2, v1, v2 ; GFX9-NEXT: v_mov_b32_e32 v3, 24 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX9-NEXT: v_lshlrev_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 @@ -649,26 +646,26 @@ ; GFX10-NEXT: v_lshrrev_b32_e32 v8, 16, v1 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX10-NEXT: s_movk_i32 s4, 0xff ; GFX10-NEXT: v_and_or_b32 v0, v0, v7, v2 ; GFX10-NEXT: v_and_or_b32 v1, v1, v7, v6 ; GFX10-NEXT: v_and_or_b32 v2, v3, v7, v4 ; GFX10-NEXT: v_and_or_b32 v3, v8, v7, 
v5 -; GFX10-NEXT: v_mov_b32_e32 v4, 24 +; GFX10-NEXT: v_mov_b32_e32 v5, 24 ; GFX10-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1] ; GFX10-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1] ; GFX10-NEXT: v_pk_lshlrev_b16 v2, 8, v2 op_sel_hi:[0,1] ; GFX10-NEXT: v_pk_lshlrev_b16 v3, 8, v3 op_sel_hi:[0,1] ; GFX10-NEXT: v_pk_add_i16 v0, v0, v1 clamp ; GFX10-NEXT: v_pk_add_i16 v1, v2, v3 clamp -; GFX10-NEXT: v_mov_b32_e32 v2, 8 +; GFX10-NEXT: v_mov_b32_e32 v2, 0xff +; GFX10-NEXT: v_mov_b32_e32 v3, 8 ; GFX10-NEXT: v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1] ; GFX10-NEXT: v_pk_ashrrev_i16 v1, 8, v1 op_sel_hi:[0,1] -; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_and_b32_e32 v3, s4, v1 -; GFX10-NEXT: v_lshlrev_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_and_or_b32 v0, v0, s4, v2 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX10-NEXT: v_and_b32_e32 v4, v1, v2 +; GFX10-NEXT: v_lshlrev_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX10-NEXT: v_and_or_b32 v0, v0, v2, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v4 ; GFX10-NEXT: v_or3_b32 v0, v0, v2, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] %lhs = bitcast i32 %lhs.arg to <4 x i8> @@ -867,11 +864,11 @@ ; GFX9-NEXT: v_pk_add_i16 v1, s3, v1 clamp ; GFX9-NEXT: v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1] ; GFX9-NEXT: v_pk_ashrrev_i16 v1, 8, v1 op_sel_hi:[0,1] -; GFX9-NEXT: s_movk_i32 s0, 0xff -; GFX9-NEXT: v_lshlrev_b32_sdwa v2, s2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_mov_b32_e32 v2, 0xff +; GFX9-NEXT: v_lshlrev_b32_sdwa v3, s2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX9-NEXT: s_mov_b32 s5, 24 -; GFX9-NEXT: v_and_or_b32 v0, v0, s0, v2 -; GFX9-NEXT: v_and_b32_e32 v2, s0, v1 +; GFX9-NEXT: v_and_or_b32 v0, v0, v2, v3 +; GFX9-NEXT: v_and_b32_e32 v2, v1, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX9-NEXT: v_lshlrev_b32_sdwa v1, s5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX9-NEXT: v_or3_b32 v0, v0, v2, v1 @@ -909,16 +906,16 @@ ; GFX10-NEXT: s_pack_ll_b32_b16 s3, s3, s4 ; GFX10-NEXT: v_pk_add_i16 v0, s0, s1 clamp ; GFX10-NEXT: v_pk_add_i16 v1, s2, s3 clamp +; GFX10-NEXT: v_mov_b32_e32 v2, 0xff ; GFX10-NEXT: s_mov_b32 s0, 8 -; GFX10-NEXT: s_movk_i32 s1, 0xff ; GFX10-NEXT: v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1] ; GFX10-NEXT: v_pk_ashrrev_i16 v1, 8, v1 op_sel_hi:[0,1] -; GFX10-NEXT: v_lshlrev_b32_sdwa v2, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_and_b32_e32 v3, s1, v1 +; GFX10-NEXT: v_lshlrev_b32_sdwa v3, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX10-NEXT: v_and_b32_e32 v4, v1, v2 ; GFX10-NEXT: s_mov_b32 s0, 24 ; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_and_or_b32 v0, v0, s1, v2 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX10-NEXT: v_and_or_b32 v0, v0, v2, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v4 ; GFX10-NEXT: v_or3_b32 v0, v0, v2, v1 ; GFX10-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-NEXT: ; return to shader part epilog @@ -1209,19 +1206,19 @@ ; GFX6-LABEL: v_saddsat_v2i32: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_brev_b32 s5, 1 -; 
GFX6-NEXT: v_min_i32_e32 v5, 0, v0 -; GFX6-NEXT: s_brev_b32 s4, -2 -; GFX6-NEXT: v_max_i32_e32 v4, 0, v0 -; GFX6-NEXT: v_sub_i32_e32 v5, vcc, s5, v5 -; GFX6-NEXT: v_sub_i32_e32 v4, vcc, s4, v4 -; GFX6-NEXT: v_max_i32_e32 v2, v5, v2 -; GFX6-NEXT: v_min_i32_e32 v2, v2, v4 -; GFX6-NEXT: v_min_i32_e32 v4, 0, v1 +; GFX6-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX6-NEXT: v_min_i32_e32 v7, 0, v0 +; GFX6-NEXT: v_bfrev_b32_e32 v4, -2 +; GFX6-NEXT: v_max_i32_e32 v6, 0, v0 +; GFX6-NEXT: v_sub_i32_e32 v7, vcc, v5, v7 +; GFX6-NEXT: v_sub_i32_e32 v6, vcc, v4, v6 +; GFX6-NEXT: v_max_i32_e32 v2, v7, v2 +; GFX6-NEXT: v_min_i32_e32 v2, v2, v6 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GFX6-NEXT: v_max_i32_e32 v2, 0, v1 -; GFX6-NEXT: v_sub_i32_e32 v4, vcc, s5, v4 -; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s4, v2 +; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v4, v2 +; GFX6-NEXT: v_min_i32_e32 v4, 0, v1 +; GFX6-NEXT: v_sub_i32_e32 v4, vcc, v5, v4 ; GFX6-NEXT: v_max_i32_e32 v3, v4, v3 ; GFX6-NEXT: v_min_i32_e32 v2, v3, v2 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v2 @@ -1230,19 +1227,19 @@ ; GFX8-LABEL: v_saddsat_v2i32: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_brev_b32 s5, 1 -; GFX8-NEXT: v_min_i32_e32 v5, 0, v0 -; GFX8-NEXT: s_brev_b32 s4, -2 -; GFX8-NEXT: v_max_i32_e32 v4, 0, v0 -; GFX8-NEXT: v_sub_u32_e32 v5, vcc, s5, v5 -; GFX8-NEXT: v_sub_u32_e32 v4, vcc, s4, v4 -; GFX8-NEXT: v_max_i32_e32 v2, v5, v2 -; GFX8-NEXT: v_min_i32_e32 v2, v2, v4 -; GFX8-NEXT: v_min_i32_e32 v4, 0, v1 +; GFX8-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX8-NEXT: v_min_i32_e32 v7, 0, v0 +; GFX8-NEXT: v_bfrev_b32_e32 v4, -2 +; GFX8-NEXT: v_max_i32_e32 v6, 0, v0 +; GFX8-NEXT: v_sub_u32_e32 v7, vcc, v5, v7 +; GFX8-NEXT: v_sub_u32_e32 v6, vcc, v4, v6 +; GFX8-NEXT: v_max_i32_e32 v2, v7, v2 +; GFX8-NEXT: v_min_i32_e32 v2, v2, v6 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; GFX8-NEXT: v_max_i32_e32 v2, 0, v1 -; GFX8-NEXT: v_sub_u32_e32 v4, vcc, s5, v4 -; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s4, v2 +; GFX8-NEXT: v_sub_u32_e32 v2, vcc, v4, v2 +; GFX8-NEXT: v_min_i32_e32 v4, 0, v1 +; GFX8-NEXT: v_sub_u32_e32 v4, vcc, v5, v4 ; GFX8-NEXT: v_max_i32_e32 v3, v4, v3 ; GFX8-NEXT: v_min_i32_e32 v2, v3, v2 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2 @@ -1332,26 +1329,26 @@ ; GFX6-LABEL: v_saddsat_v3i32: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_brev_b32 s5, 1 -; GFX6-NEXT: v_min_i32_e32 v7, 0, v0 -; GFX6-NEXT: s_brev_b32 s4, -2 -; GFX6-NEXT: v_max_i32_e32 v6, 0, v0 -; GFX6-NEXT: v_sub_i32_e32 v7, vcc, s5, v7 -; GFX6-NEXT: v_sub_i32_e32 v6, vcc, s4, v6 -; GFX6-NEXT: v_max_i32_e32 v3, v7, v3 -; GFX6-NEXT: v_min_i32_e32 v3, v3, v6 -; GFX6-NEXT: v_min_i32_e32 v6, 0, v1 +; GFX6-NEXT: v_bfrev_b32_e32 v7, 1 +; GFX6-NEXT: v_min_i32_e32 v9, 0, v0 +; GFX6-NEXT: v_bfrev_b32_e32 v6, -2 +; GFX6-NEXT: v_max_i32_e32 v8, 0, v0 +; GFX6-NEXT: v_sub_i32_e32 v9, vcc, v7, v9 +; GFX6-NEXT: v_sub_i32_e32 v8, vcc, v6, v8 +; GFX6-NEXT: v_max_i32_e32 v3, v9, v3 +; GFX6-NEXT: v_min_i32_e32 v3, v3, v8 +; GFX6-NEXT: v_min_i32_e32 v8, 0, v1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v3 ; GFX6-NEXT: v_max_i32_e32 v3, 0, v1 -; GFX6-NEXT: v_sub_i32_e32 v6, vcc, s5, v6 -; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s4, v3 -; GFX6-NEXT: v_max_i32_e32 v4, v6, v4 +; GFX6-NEXT: v_sub_i32_e32 v8, vcc, v7, v8 +; GFX6-NEXT: v_sub_i32_e32 v3, vcc, v6, v3 +; GFX6-NEXT: v_max_i32_e32 v4, v8, v4 ; GFX6-NEXT: v_min_i32_e32 v3, v4, v3 ; GFX6-NEXT: v_min_i32_e32 v4, 0, v2 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v3 ; GFX6-NEXT: v_max_i32_e32 v3, 0, v2 -; 
GFX6-NEXT: v_sub_i32_e32 v4, vcc, s5, v4 -; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s4, v3 +; GFX6-NEXT: v_sub_i32_e32 v4, vcc, v7, v4 +; GFX6-NEXT: v_sub_i32_e32 v3, vcc, v6, v3 ; GFX6-NEXT: v_max_i32_e32 v4, v4, v5 ; GFX6-NEXT: v_min_i32_e32 v3, v4, v3 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 @@ -1360,26 +1357,26 @@ ; GFX8-LABEL: v_saddsat_v3i32: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_brev_b32 s5, 1 -; GFX8-NEXT: v_min_i32_e32 v7, 0, v0 -; GFX8-NEXT: s_brev_b32 s4, -2 -; GFX8-NEXT: v_max_i32_e32 v6, 0, v0 -; GFX8-NEXT: v_sub_u32_e32 v7, vcc, s5, v7 -; GFX8-NEXT: v_sub_u32_e32 v6, vcc, s4, v6 -; GFX8-NEXT: v_max_i32_e32 v3, v7, v3 -; GFX8-NEXT: v_min_i32_e32 v3, v3, v6 -; GFX8-NEXT: v_min_i32_e32 v6, 0, v1 +; GFX8-NEXT: v_bfrev_b32_e32 v7, 1 +; GFX8-NEXT: v_min_i32_e32 v9, 0, v0 +; GFX8-NEXT: v_bfrev_b32_e32 v6, -2 +; GFX8-NEXT: v_max_i32_e32 v8, 0, v0 +; GFX8-NEXT: v_sub_u32_e32 v9, vcc, v7, v9 +; GFX8-NEXT: v_sub_u32_e32 v8, vcc, v6, v8 +; GFX8-NEXT: v_max_i32_e32 v3, v9, v3 +; GFX8-NEXT: v_min_i32_e32 v3, v3, v8 +; GFX8-NEXT: v_min_i32_e32 v8, 0, v1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v3 ; GFX8-NEXT: v_max_i32_e32 v3, 0, v1 -; GFX8-NEXT: v_sub_u32_e32 v6, vcc, s5, v6 -; GFX8-NEXT: v_sub_u32_e32 v3, vcc, s4, v3 -; GFX8-NEXT: v_max_i32_e32 v4, v6, v4 +; GFX8-NEXT: v_sub_u32_e32 v8, vcc, v7, v8 +; GFX8-NEXT: v_sub_u32_e32 v3, vcc, v6, v3 +; GFX8-NEXT: v_max_i32_e32 v4, v8, v4 ; GFX8-NEXT: v_min_i32_e32 v3, v4, v3 ; GFX8-NEXT: v_min_i32_e32 v4, 0, v2 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v3 ; GFX8-NEXT: v_max_i32_e32 v3, 0, v2 -; GFX8-NEXT: v_sub_u32_e32 v4, vcc, s5, v4 -; GFX8-NEXT: v_sub_u32_e32 v3, vcc, s4, v3 +; GFX8-NEXT: v_sub_u32_e32 v4, vcc, v7, v4 +; GFX8-NEXT: v_sub_u32_e32 v3, vcc, v6, v3 ; GFX8-NEXT: v_max_i32_e32 v4, v4, v5 ; GFX8-NEXT: v_min_i32_e32 v3, v4, v3 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v3 @@ -1490,33 +1487,33 @@ ; GFX6-LABEL: v_saddsat_v4i32: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_brev_b32 s5, 1 -; GFX6-NEXT: v_min_i32_e32 v9, 0, v0 -; GFX6-NEXT: s_brev_b32 s4, -2 -; GFX6-NEXT: v_max_i32_e32 v8, 0, v0 -; GFX6-NEXT: v_sub_i32_e32 v9, vcc, s5, v9 -; GFX6-NEXT: v_sub_i32_e32 v8, vcc, s4, v8 -; GFX6-NEXT: v_max_i32_e32 v4, v9, v4 -; GFX6-NEXT: v_min_i32_e32 v4, v4, v8 -; GFX6-NEXT: v_min_i32_e32 v8, 0, v1 +; GFX6-NEXT: v_bfrev_b32_e32 v9, 1 +; GFX6-NEXT: v_min_i32_e32 v11, 0, v0 +; GFX6-NEXT: v_bfrev_b32_e32 v8, -2 +; GFX6-NEXT: v_max_i32_e32 v10, 0, v0 +; GFX6-NEXT: v_sub_i32_e32 v11, vcc, v9, v11 +; GFX6-NEXT: v_sub_i32_e32 v10, vcc, v8, v10 +; GFX6-NEXT: v_max_i32_e32 v4, v11, v4 +; GFX6-NEXT: v_min_i32_e32 v4, v4, v10 +; GFX6-NEXT: v_min_i32_e32 v10, 0, v1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v4 ; GFX6-NEXT: v_max_i32_e32 v4, 0, v1 -; GFX6-NEXT: v_sub_i32_e32 v8, vcc, s5, v8 -; GFX6-NEXT: v_sub_i32_e32 v4, vcc, s4, v4 -; GFX6-NEXT: v_max_i32_e32 v5, v8, v5 +; GFX6-NEXT: v_sub_i32_e32 v10, vcc, v9, v10 +; GFX6-NEXT: v_sub_i32_e32 v4, vcc, v8, v4 +; GFX6-NEXT: v_max_i32_e32 v5, v10, v5 ; GFX6-NEXT: v_min_i32_e32 v4, v5, v4 ; GFX6-NEXT: v_min_i32_e32 v5, 0, v2 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v4 ; GFX6-NEXT: v_max_i32_e32 v4, 0, v2 -; GFX6-NEXT: v_sub_i32_e32 v5, vcc, s5, v5 -; GFX6-NEXT: v_sub_i32_e32 v4, vcc, s4, v4 +; GFX6-NEXT: v_sub_i32_e32 v5, vcc, v9, v5 +; GFX6-NEXT: v_sub_i32_e32 v4, vcc, v8, v4 ; GFX6-NEXT: v_max_i32_e32 v5, v5, v6 ; GFX6-NEXT: v_min_i32_e32 v4, v5, v4 ; GFX6-NEXT: v_min_i32_e32 v5, 0, v3 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; 
GFX6-NEXT: v_max_i32_e32 v4, 0, v3 -; GFX6-NEXT: v_sub_i32_e32 v5, vcc, 0x80000000, v5 -; GFX6-NEXT: v_sub_i32_e32 v4, vcc, 0x7fffffff, v4 +; GFX6-NEXT: v_sub_i32_e32 v5, vcc, v9, v5 +; GFX6-NEXT: v_sub_i32_e32 v4, vcc, v8, v4 ; GFX6-NEXT: v_max_i32_e32 v5, v5, v7 ; GFX6-NEXT: v_min_i32_e32 v4, v5, v4 ; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v4 @@ -1525,33 +1522,33 @@ ; GFX8-LABEL: v_saddsat_v4i32: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_brev_b32 s5, 1 -; GFX8-NEXT: v_min_i32_e32 v9, 0, v0 -; GFX8-NEXT: s_brev_b32 s4, -2 -; GFX8-NEXT: v_max_i32_e32 v8, 0, v0 -; GFX8-NEXT: v_sub_u32_e32 v9, vcc, s5, v9 -; GFX8-NEXT: v_sub_u32_e32 v8, vcc, s4, v8 -; GFX8-NEXT: v_max_i32_e32 v4, v9, v4 -; GFX8-NEXT: v_min_i32_e32 v4, v4, v8 -; GFX8-NEXT: v_min_i32_e32 v8, 0, v1 +; GFX8-NEXT: v_bfrev_b32_e32 v9, 1 +; GFX8-NEXT: v_min_i32_e32 v11, 0, v0 +; GFX8-NEXT: v_bfrev_b32_e32 v8, -2 +; GFX8-NEXT: v_max_i32_e32 v10, 0, v0 +; GFX8-NEXT: v_sub_u32_e32 v11, vcc, v9, v11 +; GFX8-NEXT: v_sub_u32_e32 v10, vcc, v8, v10 +; GFX8-NEXT: v_max_i32_e32 v4, v11, v4 +; GFX8-NEXT: v_min_i32_e32 v4, v4, v10 +; GFX8-NEXT: v_min_i32_e32 v10, 0, v1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v4 ; GFX8-NEXT: v_max_i32_e32 v4, 0, v1 -; GFX8-NEXT: v_sub_u32_e32 v8, vcc, s5, v8 -; GFX8-NEXT: v_sub_u32_e32 v4, vcc, s4, v4 -; GFX8-NEXT: v_max_i32_e32 v5, v8, v5 +; GFX8-NEXT: v_sub_u32_e32 v10, vcc, v9, v10 +; GFX8-NEXT: v_sub_u32_e32 v4, vcc, v8, v4 +; GFX8-NEXT: v_max_i32_e32 v5, v10, v5 ; GFX8-NEXT: v_min_i32_e32 v4, v5, v4 ; GFX8-NEXT: v_min_i32_e32 v5, 0, v2 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v4 ; GFX8-NEXT: v_max_i32_e32 v4, 0, v2 -; GFX8-NEXT: v_sub_u32_e32 v5, vcc, s5, v5 -; GFX8-NEXT: v_sub_u32_e32 v4, vcc, s4, v4 +; GFX8-NEXT: v_sub_u32_e32 v5, vcc, v9, v5 +; GFX8-NEXT: v_sub_u32_e32 v4, vcc, v8, v4 ; GFX8-NEXT: v_max_i32_e32 v5, v5, v6 ; GFX8-NEXT: v_min_i32_e32 v4, v5, v4 ; GFX8-NEXT: v_min_i32_e32 v5, 0, v3 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v4 ; GFX8-NEXT: v_max_i32_e32 v4, 0, v3 -; GFX8-NEXT: v_sub_u32_e32 v5, vcc, 0x80000000, v5 -; GFX8-NEXT: v_sub_u32_e32 v4, vcc, 0x7fffffff, v4 +; GFX8-NEXT: v_sub_u32_e32 v5, vcc, v9, v5 +; GFX8-NEXT: v_sub_u32_e32 v4, vcc, v8, v4 ; GFX8-NEXT: v_max_i32_e32 v5, v5, v7 ; GFX8-NEXT: v_min_i32_e32 v4, v5, v4 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v4 @@ -1683,42 +1680,40 @@ ; GFX6-LABEL: v_saddsat_v5i32: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_brev_b32 s5, 1 -; GFX6-NEXT: v_min_i32_e32 v12, 0, v0 -; GFX6-NEXT: s_brev_b32 s4, -2 -; GFX6-NEXT: v_max_i32_e32 v10, 0, v0 -; GFX6-NEXT: v_sub_i32_e32 v12, vcc, s5, v12 -; GFX6-NEXT: v_sub_i32_e32 v10, vcc, s4, v10 -; GFX6-NEXT: v_max_i32_e32 v5, v12, v5 -; GFX6-NEXT: v_min_i32_e32 v5, v5, v10 -; GFX6-NEXT: v_min_i32_e32 v10, 0, v1 +; GFX6-NEXT: v_bfrev_b32_e32 v11, 1 +; GFX6-NEXT: v_min_i32_e32 v13, 0, v0 +; GFX6-NEXT: v_bfrev_b32_e32 v10, -2 +; GFX6-NEXT: v_max_i32_e32 v12, 0, v0 +; GFX6-NEXT: v_sub_i32_e32 v13, vcc, v11, v13 +; GFX6-NEXT: v_sub_i32_e32 v12, vcc, v10, v12 +; GFX6-NEXT: v_max_i32_e32 v5, v13, v5 +; GFX6-NEXT: v_min_i32_e32 v5, v5, v12 +; GFX6-NEXT: v_min_i32_e32 v12, 0, v1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v5 ; GFX6-NEXT: v_max_i32_e32 v5, 0, v1 -; GFX6-NEXT: v_sub_i32_e32 v10, vcc, s5, v10 -; GFX6-NEXT: v_sub_i32_e32 v5, vcc, s4, v5 -; GFX6-NEXT: v_max_i32_e32 v6, v10, v6 +; GFX6-NEXT: v_sub_i32_e32 v12, vcc, v11, v12 +; GFX6-NEXT: v_sub_i32_e32 v5, vcc, v10, v5 +; GFX6-NEXT: v_max_i32_e32 v6, v12, v6 ; GFX6-NEXT: 
v_min_i32_e32 v5, v6, v5 ; GFX6-NEXT: v_min_i32_e32 v6, 0, v2 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v5 ; GFX6-NEXT: v_max_i32_e32 v5, 0, v2 -; GFX6-NEXT: v_sub_i32_e32 v6, vcc, s5, v6 -; GFX6-NEXT: v_sub_i32_e32 v5, vcc, s4, v5 +; GFX6-NEXT: v_sub_i32_e32 v6, vcc, v11, v6 +; GFX6-NEXT: v_sub_i32_e32 v5, vcc, v10, v5 ; GFX6-NEXT: v_max_i32_e32 v6, v6, v7 -; GFX6-NEXT: v_bfrev_b32_e32 v13, 1 ; GFX6-NEXT: v_min_i32_e32 v5, v6, v5 ; GFX6-NEXT: v_min_i32_e32 v6, 0, v3 -; GFX6-NEXT: v_bfrev_b32_e32 v11, -2 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v5 ; GFX6-NEXT: v_max_i32_e32 v5, 0, v3 -; GFX6-NEXT: v_sub_i32_e32 v6, vcc, v13, v6 -; GFX6-NEXT: v_sub_i32_e32 v5, vcc, v11, v5 +; GFX6-NEXT: v_sub_i32_e32 v6, vcc, v11, v6 +; GFX6-NEXT: v_sub_i32_e32 v5, vcc, v10, v5 ; GFX6-NEXT: v_max_i32_e32 v6, v6, v8 ; GFX6-NEXT: v_min_i32_e32 v5, v6, v5 ; GFX6-NEXT: v_min_i32_e32 v6, 0, v4 ; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v5 ; GFX6-NEXT: v_max_i32_e32 v5, 0, v4 -; GFX6-NEXT: v_sub_i32_e32 v6, vcc, v13, v6 -; GFX6-NEXT: v_sub_i32_e32 v5, vcc, v11, v5 +; GFX6-NEXT: v_sub_i32_e32 v6, vcc, v11, v6 +; GFX6-NEXT: v_sub_i32_e32 v5, vcc, v10, v5 ; GFX6-NEXT: v_max_i32_e32 v6, v6, v9 ; GFX6-NEXT: v_min_i32_e32 v5, v6, v5 ; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v5 @@ -1727,42 +1722,40 @@ ; GFX8-LABEL: v_saddsat_v5i32: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_brev_b32 s5, 1 -; GFX8-NEXT: v_min_i32_e32 v12, 0, v0 -; GFX8-NEXT: s_brev_b32 s4, -2 -; GFX8-NEXT: v_max_i32_e32 v10, 0, v0 -; GFX8-NEXT: v_sub_u32_e32 v12, vcc, s5, v12 -; GFX8-NEXT: v_sub_u32_e32 v10, vcc, s4, v10 -; GFX8-NEXT: v_max_i32_e32 v5, v12, v5 -; GFX8-NEXT: v_min_i32_e32 v5, v5, v10 -; GFX8-NEXT: v_min_i32_e32 v10, 0, v1 +; GFX8-NEXT: v_bfrev_b32_e32 v11, 1 +; GFX8-NEXT: v_min_i32_e32 v13, 0, v0 +; GFX8-NEXT: v_bfrev_b32_e32 v10, -2 +; GFX8-NEXT: v_max_i32_e32 v12, 0, v0 +; GFX8-NEXT: v_sub_u32_e32 v13, vcc, v11, v13 +; GFX8-NEXT: v_sub_u32_e32 v12, vcc, v10, v12 +; GFX8-NEXT: v_max_i32_e32 v5, v13, v5 +; GFX8-NEXT: v_min_i32_e32 v5, v5, v12 +; GFX8-NEXT: v_min_i32_e32 v12, 0, v1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v5 ; GFX8-NEXT: v_max_i32_e32 v5, 0, v1 -; GFX8-NEXT: v_sub_u32_e32 v10, vcc, s5, v10 -; GFX8-NEXT: v_sub_u32_e32 v5, vcc, s4, v5 -; GFX8-NEXT: v_max_i32_e32 v6, v10, v6 +; GFX8-NEXT: v_sub_u32_e32 v12, vcc, v11, v12 +; GFX8-NEXT: v_sub_u32_e32 v5, vcc, v10, v5 +; GFX8-NEXT: v_max_i32_e32 v6, v12, v6 ; GFX8-NEXT: v_min_i32_e32 v5, v6, v5 ; GFX8-NEXT: v_min_i32_e32 v6, 0, v2 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v5 ; GFX8-NEXT: v_max_i32_e32 v5, 0, v2 -; GFX8-NEXT: v_sub_u32_e32 v6, vcc, s5, v6 -; GFX8-NEXT: v_sub_u32_e32 v5, vcc, s4, v5 +; GFX8-NEXT: v_sub_u32_e32 v6, vcc, v11, v6 +; GFX8-NEXT: v_sub_u32_e32 v5, vcc, v10, v5 ; GFX8-NEXT: v_max_i32_e32 v6, v6, v7 -; GFX8-NEXT: v_bfrev_b32_e32 v13, 1 ; GFX8-NEXT: v_min_i32_e32 v5, v6, v5 ; GFX8-NEXT: v_min_i32_e32 v6, 0, v3 -; GFX8-NEXT: v_bfrev_b32_e32 v11, -2 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v5 ; GFX8-NEXT: v_max_i32_e32 v5, 0, v3 -; GFX8-NEXT: v_sub_u32_e32 v6, vcc, v13, v6 -; GFX8-NEXT: v_sub_u32_e32 v5, vcc, v11, v5 +; GFX8-NEXT: v_sub_u32_e32 v6, vcc, v11, v6 +; GFX8-NEXT: v_sub_u32_e32 v5, vcc, v10, v5 ; GFX8-NEXT: v_max_i32_e32 v6, v6, v8 ; GFX8-NEXT: v_min_i32_e32 v5, v6, v5 ; GFX8-NEXT: v_min_i32_e32 v6, 0, v4 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v5 ; GFX8-NEXT: v_max_i32_e32 v5, 0, v4 -; GFX8-NEXT: v_sub_u32_e32 v6, vcc, v13, v6 -; GFX8-NEXT: v_sub_u32_e32 v5, vcc, v11, v5 +; GFX8-NEXT: v_sub_u32_e32 v6, vcc, v11, v6 
+; GFX8-NEXT: v_sub_u32_e32 v5, vcc, v10, v5
 ; GFX8-NEXT: v_max_i32_e32 v6, v6, v9
 ; GFX8-NEXT: v_min_i32_e32 v5, v6, v5
 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v5
@@ -1915,242 +1908,238 @@
 ; GFX6-LABEL: v_saddsat_v16i32:
 ; GFX6: ; %bb.0:
 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT: s_brev_b32 s4, 1
-; GFX6-NEXT: v_min_i32_e32 v32, 0, v0
-; GFX6-NEXT: v_sub_i32_e32 v32, vcc, s4, v32
-; GFX6-NEXT: v_max_i32_e32 v16, v32, v16
-; GFX6-NEXT: s_brev_b32 s5, -2
-; GFX6-NEXT: v_max_i32_e32 v32, 0, v0
-; GFX6-NEXT: v_sub_i32_e32 v32, vcc, s5, v32
-; GFX6-NEXT: v_min_i32_e32 v16, v16, v32
+; GFX6-NEXT: v_bfrev_b32_e32 v32, 1
+; GFX6-NEXT: v_min_i32_e32 v33, 0, v0
+; GFX6-NEXT: v_sub_i32_e32 v33, vcc, v32, v33
+; GFX6-NEXT: v_max_i32_e32 v16, v33, v16
+; GFX6-NEXT: v_bfrev_b32_e32 v33, -2
+; GFX6-NEXT: v_max_i32_e32 v34, 0, v0
+; GFX6-NEXT: v_sub_i32_e32 v34, vcc, v33, v34
+; GFX6-NEXT: v_min_i32_e32 v16, v16, v34
 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v16
 ; GFX6-NEXT: v_min_i32_e32 v16, 0, v1
-; GFX6-NEXT: v_sub_i32_e32 v16, vcc, s4, v16
+; GFX6-NEXT: v_sub_i32_e32 v16, vcc, v32, v16
 ; GFX6-NEXT: v_max_i32_e32 v16, v16, v17
 ; GFX6-NEXT: v_max_i32_e32 v17, 0, v1
-; GFX6-NEXT: v_sub_i32_e32 v17, vcc, s5, v17
+; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v33, v17
 ; GFX6-NEXT: v_min_i32_e32 v16, v16, v17
 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v16
 ; GFX6-NEXT: v_min_i32_e32 v16, 0, v2
-; GFX6-NEXT: v_sub_i32_e32 v16, vcc, s4, v16
+; GFX6-NEXT: v_sub_i32_e32 v16, vcc, v32, v16
 ; GFX6-NEXT: v_max_i32_e32 v17, 0, v2
 ; GFX6-NEXT: v_max_i32_e32 v16, v16, v18
-; GFX6-NEXT: v_sub_i32_e32 v17, vcc, s5, v17
+; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v33, v17
 ; GFX6-NEXT: v_min_i32_e32 v16, v16, v17
 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v16
-; GFX6-NEXT: v_bfrev_b32_e32 v16, 1
-; GFX6-NEXT: v_min_i32_e32 v17, 0, v3
-; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v16, v17
-; GFX6-NEXT: v_max_i32_e32 v17, v17, v19
-; GFX6-NEXT: v_bfrev_b32_e32 v18, -2
-; GFX6-NEXT: v_max_i32_e32 v19, 0, v3
-; GFX6-NEXT: v_sub_i32_e32 v19, vcc, v18, v19
-; GFX6-NEXT: v_min_i32_e32 v17, v17, v19
-; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v17
-; GFX6-NEXT: v_min_i32_e32 v17, 0, v4
-; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v16, v17
-; GFX6-NEXT: v_max_i32_e32 v19, 0, v4
-; GFX6-NEXT: v_max_i32_e32 v17, v17, v20
-; GFX6-NEXT: v_sub_i32_e32 v19, vcc, v18, v19
-; GFX6-NEXT: v_min_i32_e32 v17, v17, v19
-; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v17
-; GFX6-NEXT: v_min_i32_e32 v17, 0, v5
-; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v16, v17
-; GFX6-NEXT: v_max_i32_e32 v19, 0, v5
-; GFX6-NEXT: v_max_i32_e32 v17, v17, v21
-; GFX6-NEXT: v_sub_i32_e32 v19, vcc, v18, v19
-; GFX6-NEXT: v_min_i32_e32 v17, v17, v19
-; GFX6-NEXT: v_add_i32_e32 v5, vcc, v5, v17
-; GFX6-NEXT: v_min_i32_e32 v17, 0, v6
-; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v16, v17
-; GFX6-NEXT: v_max_i32_e32 v19, 0, v6
-; GFX6-NEXT: v_max_i32_e32 v17, v17, v22
-; GFX6-NEXT: v_sub_i32_e32 v19, vcc, v18, v19
-; GFX6-NEXT: v_min_i32_e32 v17, v17, v19
-; GFX6-NEXT: v_add_i32_e32 v6, vcc, v6, v17
-; GFX6-NEXT: v_min_i32_e32 v17, 0, v7
-; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v16, v17
-; GFX6-NEXT: v_max_i32_e32 v19, 0, v7
-; GFX6-NEXT: v_max_i32_e32 v17, v17, v23
-; GFX6-NEXT: v_sub_i32_e32 v19, vcc, v18, v19
-; GFX6-NEXT: v_min_i32_e32 v17, v17, v19
-; GFX6-NEXT: v_add_i32_e32 v7, vcc, v7, v17
-; GFX6-NEXT: v_min_i32_e32 v17, 0, v8
-; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v16, v17
-; GFX6-NEXT: v_max_i32_e32 v19, 0, v8
-; GFX6-NEXT: v_max_i32_e32 v17, v17, v24
-; GFX6-NEXT: v_sub_i32_e32 v19, vcc, v18, v19
-; GFX6-NEXT: v_min_i32_e32 v17, v17, v19
-; GFX6-NEXT: v_add_i32_e32 v8, vcc, v8, v17
-; GFX6-NEXT: v_min_i32_e32 v17, 0, v9
-; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v16, v17
-; GFX6-NEXT: v_max_i32_e32 v19, 0, v9
-; GFX6-NEXT: v_max_i32_e32 v17, v17, v25
-; GFX6-NEXT: v_sub_i32_e32 v19, vcc, v18, v19
-; GFX6-NEXT: v_min_i32_e32 v17, v17, v19
-; GFX6-NEXT: v_add_i32_e32 v9, vcc, v9, v17
-; GFX6-NEXT: v_min_i32_e32 v17, 0, v10
-; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v16, v17
-; GFX6-NEXT: v_max_i32_e32 v19, 0, v10
-; GFX6-NEXT: v_max_i32_e32 v17, v17, v26
-; GFX6-NEXT: v_sub_i32_e32 v19, vcc, v18, v19
-; GFX6-NEXT: v_min_i32_e32 v17, v17, v19
-; GFX6-NEXT: v_add_i32_e32 v10, vcc, v10, v17
-; GFX6-NEXT: v_min_i32_e32 v17, 0, v11
-; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v16, v17
-; GFX6-NEXT: v_max_i32_e32 v19, 0, v11
-; GFX6-NEXT: v_max_i32_e32 v17, v17, v27
-; GFX6-NEXT: v_sub_i32_e32 v19, vcc, v18, v19
-; GFX6-NEXT: v_min_i32_e32 v17, v17, v19
-; GFX6-NEXT: v_add_i32_e32 v11, vcc, v11, v17
-; GFX6-NEXT: v_min_i32_e32 v17, 0, v12
-; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v16, v17
-; GFX6-NEXT: v_max_i32_e32 v19, 0, v12
-; GFX6-NEXT: v_max_i32_e32 v17, v17, v28
-; GFX6-NEXT: v_sub_i32_e32 v19, vcc, v18, v19
-; GFX6-NEXT: v_min_i32_e32 v17, v17, v19
-; GFX6-NEXT: v_add_i32_e32 v12, vcc, v12, v17
-; GFX6-NEXT: v_min_i32_e32 v17, 0, v13
-; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v16, v17
-; GFX6-NEXT: v_max_i32_e32 v19, 0, v13
-; GFX6-NEXT: v_max_i32_e32 v17, v17, v29
-; GFX6-NEXT: v_sub_i32_e32 v19, vcc, v18, v19
-; GFX6-NEXT: v_min_i32_e32 v17, v17, v19
-; GFX6-NEXT: v_add_i32_e32 v13, vcc, v13, v17
-; GFX6-NEXT: v_min_i32_e32 v17, 0, v14
-; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v16, v17
-; GFX6-NEXT: v_max_i32_e32 v19, 0, v14
-; GFX6-NEXT: v_max_i32_e32 v17, v17, v30
-; GFX6-NEXT: v_sub_i32_e32 v19, vcc, v18, v19
-; GFX6-NEXT: v_min_i32_e32 v17, v17, v19
-; GFX6-NEXT: v_add_i32_e32 v14, vcc, v14, v17
-; GFX6-NEXT: v_max_i32_e32 v17, 0, v15
-; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v18, v17
-; GFX6-NEXT: v_min_i32_e32 v18, 0, v15
-; GFX6-NEXT: v_sub_i32_e32 v16, vcc, v16, v18
-; GFX6-NEXT: v_max_i32_e32 v16, v16, v31
+; GFX6-NEXT: v_min_i32_e32 v16, 0, v3
+; GFX6-NEXT: v_sub_i32_e32 v16, vcc, v32, v16
+; GFX6-NEXT: v_max_i32_e32 v17, 0, v3
+; GFX6-NEXT: v_max_i32_e32 v16, v16, v19
+; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v33, v17
+; GFX6-NEXT: v_min_i32_e32 v16, v16, v17
+; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v16
+; GFX6-NEXT: v_min_i32_e32 v16, 0, v4
+; GFX6-NEXT: v_sub_i32_e32 v16, vcc, v32, v16
+; GFX6-NEXT: v_max_i32_e32 v17, 0, v4
+; GFX6-NEXT: v_max_i32_e32 v16, v16, v20
+; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v33, v17
+; GFX6-NEXT: v_min_i32_e32 v16, v16, v17
+; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v16
+; GFX6-NEXT: v_min_i32_e32 v16, 0, v5
+; GFX6-NEXT: v_sub_i32_e32 v16, vcc, v32, v16
+; GFX6-NEXT: v_max_i32_e32 v17, 0, v5
+; GFX6-NEXT: v_max_i32_e32 v16, v16, v21
+; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v33, v17
+; GFX6-NEXT: v_min_i32_e32 v16, v16, v17
+; GFX6-NEXT: v_add_i32_e32 v5, vcc, v5, v16
+; GFX6-NEXT: v_min_i32_e32 v16, 0, v6
+; GFX6-NEXT: v_sub_i32_e32 v16, vcc, v32, v16
+; GFX6-NEXT: v_max_i32_e32 v17, 0, v6
+; GFX6-NEXT: v_max_i32_e32 v16, v16, v22
+; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v33, v17
+; GFX6-NEXT: v_min_i32_e32 v16, v16, v17
+; GFX6-NEXT: v_add_i32_e32 v6, vcc, v6, v16
+; GFX6-NEXT: v_min_i32_e32 v16, 0, v7
+; GFX6-NEXT: v_sub_i32_e32 v16, vcc, v32, v16
+; GFX6-NEXT: v_max_i32_e32 v17, 0, v7
+; GFX6-NEXT: v_max_i32_e32 v16, v16, v23
+; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v33, v17
+; GFX6-NEXT: v_min_i32_e32 v16, v16, v17
+; GFX6-NEXT: v_add_i32_e32 v7, vcc, v7, v16
+; GFX6-NEXT: v_min_i32_e32 v16, 0, v8
+; GFX6-NEXT: v_sub_i32_e32 v16, vcc, v32, v16
+; GFX6-NEXT: v_max_i32_e32 v17, 0, v8
+; GFX6-NEXT: v_max_i32_e32 v16, v16, v24
+; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v33, v17
+; GFX6-NEXT: v_min_i32_e32 v16, v16, v17
+; GFX6-NEXT: v_add_i32_e32 v8, vcc, v8, v16
+; GFX6-NEXT: v_min_i32_e32 v16, 0, v9
+; GFX6-NEXT: v_sub_i32_e32 v16, vcc, v32, v16
+; GFX6-NEXT: v_max_i32_e32 v17, 0, v9
+; GFX6-NEXT: v_max_i32_e32 v16, v16, v25
+; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v33, v17
+; GFX6-NEXT: v_min_i32_e32 v16, v16, v17
+; GFX6-NEXT: v_add_i32_e32 v9, vcc, v9, v16
+; GFX6-NEXT: v_min_i32_e32 v16, 0, v10
+; GFX6-NEXT: v_sub_i32_e32 v16, vcc, v32, v16
+; GFX6-NEXT: v_max_i32_e32 v17, 0, v10
+; GFX6-NEXT: v_max_i32_e32 v16, v16, v26
+; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v33, v17
 ; GFX6-NEXT: v_min_i32_e32 v16, v16, v17
+; GFX6-NEXT: v_add_i32_e32 v10, vcc, v10, v16
+; GFX6-NEXT: v_min_i32_e32 v16, 0, v11
+; GFX6-NEXT: v_sub_i32_e32 v16, vcc, v32, v16
+; GFX6-NEXT: v_max_i32_e32 v17, 0, v11
+; GFX6-NEXT: v_max_i32_e32 v16, v16, v27
+; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v33, v17
+; GFX6-NEXT: v_min_i32_e32 v16, v16, v17
+; GFX6-NEXT: v_add_i32_e32 v11, vcc, v11, v16
+; GFX6-NEXT: v_min_i32_e32 v16, 0, v12
+; GFX6-NEXT: v_sub_i32_e32 v16, vcc, v32, v16
+; GFX6-NEXT: v_max_i32_e32 v17, 0, v12
+; GFX6-NEXT: v_max_i32_e32 v16, v16, v28
+; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v33, v17
+; GFX6-NEXT: v_min_i32_e32 v16, v16, v17
+; GFX6-NEXT: v_add_i32_e32 v12, vcc, v12, v16
+; GFX6-NEXT: v_min_i32_e32 v16, 0, v13
+; GFX6-NEXT: v_sub_i32_e32 v16, vcc, v32, v16
+; GFX6-NEXT: v_max_i32_e32 v17, 0, v13
+; GFX6-NEXT: v_max_i32_e32 v16, v16, v29
+; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v33, v17
+; GFX6-NEXT: v_min_i32_e32 v16, v16, v17
+; GFX6-NEXT: v_add_i32_e32 v13, vcc, v13, v16
+; GFX6-NEXT: v_min_i32_e32 v16, 0, v14
+; GFX6-NEXT: v_sub_i32_e32 v16, vcc, v32, v16
+; GFX6-NEXT: v_max_i32_e32 v17, 0, v14
+; GFX6-NEXT: v_max_i32_e32 v16, v16, v30
+; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v33, v17
+; GFX6-NEXT: v_min_i32_e32 v16, v16, v17
+; GFX6-NEXT: v_min_i32_e32 v17, 0, v15
+; GFX6-NEXT: v_add_i32_e32 v14, vcc, v14, v16
+; GFX6-NEXT: v_max_i32_e32 v16, 0, v15
+; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v32, v17
+; GFX6-NEXT: v_sub_i32_e32 v16, vcc, v33, v16
+; GFX6-NEXT: v_max_i32_e32 v17, v17, v31
+; GFX6-NEXT: v_min_i32_e32 v16, v17, v16
 ; GFX6-NEXT: v_add_i32_e32 v15, vcc, v15, v16
 ; GFX6-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_saddsat_v16i32:
 ; GFX8: ; %bb.0:
 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: s_brev_b32 s4, 1
-; GFX8-NEXT: v_min_i32_e32 v32, 0, v0
-; GFX8-NEXT: v_sub_u32_e32 v32, vcc, s4, v32
-; GFX8-NEXT: v_max_i32_e32 v16, v32, v16
-; GFX8-NEXT: s_brev_b32 s5, -2
-; GFX8-NEXT: v_max_i32_e32 v32, 0, v0
-; GFX8-NEXT: v_sub_u32_e32 v32, vcc, s5, v32
-; GFX8-NEXT: v_min_i32_e32 v16, v16, v32
+; GFX8-NEXT: v_bfrev_b32_e32 v32, 1
+; GFX8-NEXT: v_min_i32_e32 v33, 0, v0
+; GFX8-NEXT: v_sub_u32_e32 v33, vcc, v32, v33
+; GFX8-NEXT: v_max_i32_e32 v16, v33, v16
+; GFX8-NEXT: v_bfrev_b32_e32 v33, -2
+; GFX8-NEXT: v_max_i32_e32 v34, 0, v0
+; GFX8-NEXT: v_sub_u32_e32 v34, vcc, v33, v34
+; GFX8-NEXT: v_min_i32_e32 v16, v16, v34
 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v16
 ; GFX8-NEXT: v_min_i32_e32 v16, 0, v1
-; GFX8-NEXT: v_sub_u32_e32 v16, vcc, s4, v16
+; GFX8-NEXT: v_sub_u32_e32 v16, vcc, v32, v16
 ; GFX8-NEXT: v_max_i32_e32 v16, v16, v17
 ; GFX8-NEXT: v_max_i32_e32 v17, 0, v1
-; GFX8-NEXT: v_sub_u32_e32 v17, vcc, s5, v17
+; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v33, v17
 ; GFX8-NEXT: v_min_i32_e32 v16, v16, v17
 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v16
 ; GFX8-NEXT: v_min_i32_e32 v16, 0, v2
-; GFX8-NEXT: v_sub_u32_e32 v16, vcc, s4, v16
+; GFX8-NEXT: v_sub_u32_e32 v16, vcc, v32, v16
 ; GFX8-NEXT: v_max_i32_e32 v17, 0, v2
 ; GFX8-NEXT: v_max_i32_e32 v16, v16, v18
-; GFX8-NEXT: v_sub_u32_e32 v17, vcc, s5, v17
+; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v33, v17
 ; GFX8-NEXT: v_min_i32_e32 v16, v16, v17
 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v16
-; GFX8-NEXT: v_bfrev_b32_e32 v16, 1
-; GFX8-NEXT: v_min_i32_e32 v17, 0, v3
-; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v16, v17
-; GFX8-NEXT: v_max_i32_e32 v17, v17, v19
-; GFX8-NEXT: v_bfrev_b32_e32 v18, -2
-; GFX8-NEXT: v_max_i32_e32 v19, 0, v3
-; GFX8-NEXT: v_sub_u32_e32 v19, vcc, v18, v19
-; GFX8-NEXT: v_min_i32_e32 v17, v17, v19
-; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v17
-; GFX8-NEXT: v_min_i32_e32 v17, 0, v4
-; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v16, v17
-; GFX8-NEXT: v_max_i32_e32 v19, 0, v4
-; GFX8-NEXT: v_max_i32_e32 v17, v17, v20
-; GFX8-NEXT: v_sub_u32_e32 v19, vcc, v18, v19
-; GFX8-NEXT: v_min_i32_e32 v17, v17, v19
-; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v17
-; GFX8-NEXT: v_min_i32_e32 v17, 0, v5
-; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v16, v17
-; GFX8-NEXT: v_max_i32_e32 v19, 0, v5
-; GFX8-NEXT: v_max_i32_e32 v17, v17, v21
-; GFX8-NEXT: v_sub_u32_e32 v19, vcc, v18, v19
-; GFX8-NEXT: v_min_i32_e32 v17, v17, v19
-; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v17
-; GFX8-NEXT: v_min_i32_e32 v17, 0, v6
-; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v16, v17
-; GFX8-NEXT: v_max_i32_e32 v19, 0, v6
-; GFX8-NEXT: v_max_i32_e32 v17, v17, v22
-; GFX8-NEXT: v_sub_u32_e32 v19, vcc, v18, v19
-; GFX8-NEXT: v_min_i32_e32 v17, v17, v19
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v17
-; GFX8-NEXT: v_min_i32_e32 v17, 0, v7
-; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v16, v17
-; GFX8-NEXT: v_max_i32_e32 v19, 0, v7
-; GFX8-NEXT: v_max_i32_e32 v17, v17, v23
-; GFX8-NEXT: v_sub_u32_e32 v19, vcc, v18, v19
-; GFX8-NEXT: v_min_i32_e32 v17, v17, v19
-; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v17
-; GFX8-NEXT: v_min_i32_e32 v17, 0, v8
-; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v16, v17
-; GFX8-NEXT: v_max_i32_e32 v19, 0, v8
-; GFX8-NEXT: v_max_i32_e32 v17, v17, v24
-; GFX8-NEXT: v_sub_u32_e32 v19, vcc, v18, v19
-; GFX8-NEXT: v_min_i32_e32 v17, v17, v19
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v17
-; GFX8-NEXT: v_min_i32_e32 v17, 0, v9
-; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v16, v17
-; GFX8-NEXT: v_max_i32_e32 v19, 0, v9
-; GFX8-NEXT: v_max_i32_e32 v17, v17, v25
-; GFX8-NEXT: v_sub_u32_e32 v19, vcc, v18, v19
-; GFX8-NEXT: v_min_i32_e32 v17, v17, v19
-; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v17
-; GFX8-NEXT: v_min_i32_e32 v17, 0, v10
-; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v16, v17
-; GFX8-NEXT: v_max_i32_e32 v19, 0, v10
-; GFX8-NEXT: v_max_i32_e32 v17, v17, v26
-; GFX8-NEXT: v_sub_u32_e32 v19, vcc, v18, v19
-; GFX8-NEXT: v_min_i32_e32 v17, v17, v19
-; GFX8-NEXT: v_add_u32_e32 v10, vcc, v10, v17
-; GFX8-NEXT: v_min_i32_e32 v17, 0, v11
-; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v16, v17
-; GFX8-NEXT: v_max_i32_e32 v19, 0, v11
-; GFX8-NEXT: v_max_i32_e32 v17, v17, v27
-; GFX8-NEXT: v_sub_u32_e32 v19, vcc, v18, v19
-; GFX8-NEXT: v_min_i32_e32 v17, v17, v19
-; GFX8-NEXT: v_add_u32_e32 v11, vcc, v11, v17
-; GFX8-NEXT: v_min_i32_e32 v17, 0, v12
-; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v16, v17
-; GFX8-NEXT: v_max_i32_e32 v19, 0, v12
-; GFX8-NEXT: v_max_i32_e32 v17, v17, v28
-; GFX8-NEXT: v_sub_u32_e32 v19, vcc, v18, v19
-; GFX8-NEXT: v_min_i32_e32 v17, v17, v19
-; GFX8-NEXT: v_add_u32_e32 v12, vcc, v12, v17
-; GFX8-NEXT: v_min_i32_e32 v17, 0, v13
-; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v16, v17
-; GFX8-NEXT: v_max_i32_e32 v19, 0, v13
-; GFX8-NEXT: v_max_i32_e32 v17, v17, v29
-; GFX8-NEXT: v_sub_u32_e32 v19, vcc, v18, v19
-; GFX8-NEXT: v_min_i32_e32 v17, v17, v19
-; GFX8-NEXT: v_add_u32_e32 v13, vcc, v13, v17
-; GFX8-NEXT: v_min_i32_e32 v17, 0, v14
-; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v16, v17
-; GFX8-NEXT: v_max_i32_e32 v19, 0, v14
-; GFX8-NEXT: v_max_i32_e32 v17, v17, v30
-; GFX8-NEXT: v_sub_u32_e32 v19, vcc, v18, v19
-; GFX8-NEXT: v_min_i32_e32 v17, v17, v19
-; GFX8-NEXT: v_add_u32_e32 v14, vcc, v14, v17
-; GFX8-NEXT: v_max_i32_e32 v17, 0, v15
-; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v18, v17
-; GFX8-NEXT: v_min_i32_e32 v18, 0, v15
-; GFX8-NEXT: v_sub_u32_e32 v16, vcc, v16, v18
-; GFX8-NEXT: v_max_i32_e32 v16, v16, v31
+; GFX8-NEXT: v_min_i32_e32 v16, 0, v3
+; GFX8-NEXT: v_sub_u32_e32 v16, vcc, v32, v16
+; GFX8-NEXT: v_max_i32_e32 v17, 0, v3
+; GFX8-NEXT: v_max_i32_e32 v16, v16, v19
+; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v33, v17
+; GFX8-NEXT: v_min_i32_e32 v16, v16, v17
+; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v16
+; GFX8-NEXT: v_min_i32_e32 v16, 0, v4
+; GFX8-NEXT: v_sub_u32_e32 v16, vcc, v32, v16
+; GFX8-NEXT: v_max_i32_e32 v17, 0, v4
+; GFX8-NEXT: v_max_i32_e32 v16, v16, v20
+; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v33, v17
+; GFX8-NEXT: v_min_i32_e32 v16, v16, v17
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v16
+; GFX8-NEXT: v_min_i32_e32 v16, 0, v5
+; GFX8-NEXT: v_sub_u32_e32 v16, vcc, v32, v16
+; GFX8-NEXT: v_max_i32_e32 v17, 0, v5
+; GFX8-NEXT: v_max_i32_e32 v16, v16, v21
+; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v33, v17
+; GFX8-NEXT: v_min_i32_e32 v16, v16, v17
+; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v16
+; GFX8-NEXT: v_min_i32_e32 v16, 0, v6
+; GFX8-NEXT: v_sub_u32_e32 v16, vcc, v32, v16
+; GFX8-NEXT: v_max_i32_e32 v17, 0, v6
+; GFX8-NEXT: v_max_i32_e32 v16, v16, v22
+; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v33, v17
+; GFX8-NEXT: v_min_i32_e32 v16, v16, v17
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v16
+; GFX8-NEXT: v_min_i32_e32 v16, 0, v7
+; GFX8-NEXT: v_sub_u32_e32 v16, vcc, v32, v16
+; GFX8-NEXT: v_max_i32_e32 v17, 0, v7
+; GFX8-NEXT: v_max_i32_e32 v16, v16, v23
+; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v33, v17
+; GFX8-NEXT: v_min_i32_e32 v16, v16, v17
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v16
+; GFX8-NEXT: v_min_i32_e32 v16, 0, v8
+; GFX8-NEXT: v_sub_u32_e32 v16, vcc, v32, v16
+; GFX8-NEXT: v_max_i32_e32 v17, 0, v8
+; GFX8-NEXT: v_max_i32_e32 v16, v16, v24
+; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v33, v17
+; GFX8-NEXT: v_min_i32_e32 v16, v16, v17
+; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v16
+; GFX8-NEXT: v_min_i32_e32 v16, 0, v9
+; GFX8-NEXT: v_sub_u32_e32 v16, vcc, v32, v16
+; GFX8-NEXT: v_max_i32_e32 v17, 0, v9
+; GFX8-NEXT: v_max_i32_e32 v16, v16, v25
+; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v33, v17
+; GFX8-NEXT: v_min_i32_e32 v16, v16, v17
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v16
+; GFX8-NEXT: v_min_i32_e32 v16, 0, v10
+; GFX8-NEXT: v_sub_u32_e32 v16, vcc, v32, v16
+; GFX8-NEXT: v_max_i32_e32 v17, 0, v10
+; GFX8-NEXT: v_max_i32_e32 v16, v16, v26
+; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v33, v17
+; GFX8-NEXT: v_min_i32_e32 v16, v16, v17
+; GFX8-NEXT: v_add_u32_e32 v10, vcc, v10, v16
+; GFX8-NEXT: v_min_i32_e32 v16, 0, v11
+; GFX8-NEXT: v_sub_u32_e32 v16, vcc, v32, v16
+; GFX8-NEXT: v_max_i32_e32 v17, 0, v11
+; GFX8-NEXT: v_max_i32_e32 v16, v16, v27
+; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v33, v17
+; GFX8-NEXT: v_min_i32_e32 v16, v16, v17
+; GFX8-NEXT: v_add_u32_e32 v11, vcc, v11, v16
+; GFX8-NEXT: v_min_i32_e32 v16, 0, v12
+; GFX8-NEXT: v_sub_u32_e32 v16, vcc, v32, v16
+; GFX8-NEXT: v_max_i32_e32 v17, 0, v12
+; GFX8-NEXT: v_max_i32_e32 v16, v16, v28
+; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v33, v17
+; GFX8-NEXT: v_min_i32_e32 v16, v16, v17
+; GFX8-NEXT: v_add_u32_e32 v12, vcc, v12, v16
+; GFX8-NEXT: v_min_i32_e32 v16, 0, v13
+; GFX8-NEXT: v_sub_u32_e32 v16, vcc, v32, v16
+; GFX8-NEXT: v_max_i32_e32 v17, 0, v13
+; GFX8-NEXT: v_max_i32_e32 v16, v16, v29
+; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v33, v17
+; GFX8-NEXT: v_min_i32_e32 v16, v16, v17
+; GFX8-NEXT: v_add_u32_e32 v13, vcc, v13, v16
+; GFX8-NEXT: v_min_i32_e32 v16, 0, v14
+; GFX8-NEXT: v_sub_u32_e32 v16, vcc, v32, v16
+; GFX8-NEXT: v_max_i32_e32 v17, 0, v14
+; GFX8-NEXT: v_max_i32_e32 v16, v16, v30
+; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v33, v17
 ; GFX8-NEXT: v_min_i32_e32 v16, v16, v17
+; GFX8-NEXT: v_min_i32_e32 v17, 0, v15
+; GFX8-NEXT: v_add_u32_e32 v14, vcc, v14, v16
+; GFX8-NEXT: v_max_i32_e32 v16, 0, v15
+; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v32, v17
+; GFX8-NEXT: v_sub_u32_e32 v16, vcc, v33, v16
+; GFX8-NEXT: v_max_i32_e32 v17, v17, v31
+; GFX8-NEXT: v_min_i32_e32 v16, v17, v16
 ; GFX8-NEXT: v_add_u32_e32 v15, vcc, v15, v16
 ; GFX8-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -2707,22 +2696,22 @@
 ; GFX6: ; %bb.0:
 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX6-NEXT: s_brev_b32 s5, 1
-; GFX6-NEXT: v_min_i32_e32 v5, 0, v0
+; GFX6-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX6-NEXT: v_min_i32_e32 v7, 0, v0
 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX6-NEXT: s_brev_b32 s4, -2
-; GFX6-NEXT: v_max_i32_e32 v4, 0, v0
-; GFX6-NEXT: v_sub_i32_e32 v5, vcc, s5, v5
-; GFX6-NEXT: v_sub_i32_e32 v4, vcc, s4, v4
-; GFX6-NEXT: v_max_i32_e32 v2, v5, v2
+; GFX6-NEXT: v_bfrev_b32_e32 v4, -2
+; GFX6-NEXT: v_max_i32_e32 v6, 0, v0
+; GFX6-NEXT: v_sub_i32_e32 v7, vcc, v5, v7
+; GFX6-NEXT: v_sub_i32_e32 v6, vcc, v4, v6
+; GFX6-NEXT: v_max_i32_e32 v2, v7, v2
+; GFX6-NEXT: v_min_i32_e32 v2, v2, v6
 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX6-NEXT: v_min_i32_e32 v2, v2, v4
-; GFX6-NEXT: v_min_i32_e32 v4, 0, v1
 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2
 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v3
 ; GFX6-NEXT: v_max_i32_e32 v3, 0, v1
-; GFX6-NEXT: v_sub_i32_e32 v4, vcc, s5, v4
-; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s4, v3
+; GFX6-NEXT: v_sub_i32_e32 v3, vcc, v4, v3
+; GFX6-NEXT: v_min_i32_e32 v4, 0, v1
+; GFX6-NEXT: v_sub_i32_e32 v4, vcc, v5, v4
 ; GFX6-NEXT: v_max_i32_e32 v2, v4, v2
 ; GFX6-NEXT: v_min_i32_e32 v2, v2, v3
 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v2
@@ -2733,22 +2722,22 @@
 ; GFX8-LABEL: v_saddsat_v2i16:
 ; GFX8: ; %bb.0:
 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: s_movk_i32 s5, 0x8000
-; GFX8-NEXT: v_min_i16_e32 v4, 0, v0
+; GFX8-NEXT: v_mov_b32_e32 v4, 0xffff8000
+; GFX8-NEXT: v_min_i16_e32 v6, 0, v0
+; GFX8-NEXT: v_mov_b32_e32 v3, 0x7fff
+; GFX8-NEXT: v_max_i16_e32 v5, 0, v0
+; GFX8-NEXT: v_sub_u16_e32 v6, v4, v6
 ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v0
-; GFX8-NEXT: s_movk_i32 s4, 0x7fff
-; GFX8-NEXT: v_max_i16_e32 v3, 0, v0
-; GFX8-NEXT: v_sub_u16_e32 v4, s5, v4
-; GFX8-NEXT: v_sub_u16_e32 v3, s4, v3
-; GFX8-NEXT: v_max_i16_e32 v4, v4, v1
-; GFX8-NEXT: v_min_i16_e32 v5, 0, v2
-; GFX8-NEXT: v_min_i16_e32 v3, v4, v3
-; GFX8-NEXT: v_max_i16_e32 v4, 0, v2
-; GFX8-NEXT: v_sub_u16_e32 v5, s5, v5
-; GFX8-NEXT: v_sub_u16_e32 v4, s4, v4
-; GFX8-NEXT: v_max_i16_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-NEXT: v_min_i16_e32 v1, v1, v4
-; GFX8-NEXT: v_add_u16_e32 v0, v0, v3
+; GFX8-NEXT: v_sub_u16_e32 v5, v3, v5
+; GFX8-NEXT: v_max_i16_e32 v6, v6, v1
+; GFX8-NEXT: v_min_i16_e32 v5, v6, v5
+; GFX8-NEXT: v_max_i16_e32 v6, 0, v2
+; GFX8-NEXT: v_sub_u16_e32 v3, v3, v6
+; GFX8-NEXT: v_min_i16_e32 v6, 0, v2
+; GFX8-NEXT: v_sub_u16_e32 v4, v4, v6
+; GFX8-NEXT: v_max_i16_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT: v_min_i16_e32 v1, v1, v3
+; GFX8-NEXT: v_add_u16_e32 v0, v0, v5
 ; GFX8-NEXT: v_add_u16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
 ; GFX8-NEXT: s_setpc_b64 s[30:31]
@@ -2879,10 +2868,10 @@
 ; GFX6-NEXT: v_min_i32_e32 v1, s1, v1
 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, s0, v1
 ; GFX6-NEXT: v_ashrrev_i32_e32 v1, 16, v1
-; GFX6-NEXT: s_mov_b32 s0, 0xffff
+; GFX6-NEXT: v_mov_b32_e32 v2, 0xffff
 ; GFX6-NEXT: v_ashrrev_i32_e32 v0, 16, v0
-; GFX6-NEXT: v_and_b32_e32 v1, s0, v1
-; GFX6-NEXT: v_and_b32_e32 v0, s0, v0
+; GFX6-NEXT: v_and_b32_e32 v1, v1, v2
+; GFX6-NEXT: v_and_b32_e32 v0, v0, v2
 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
 ; GFX6-NEXT: ; return to shader part epilog
@@ -2932,54 +2921,54 @@
 ; GFX6-LABEL: saddsat_v2i16_vs:
 ; GFX6: ; %bb.0:
 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX6-NEXT: s_brev_b32 s3, 1
-; GFX6-NEXT: v_min_i32_e32 v3, 0, v0
+; GFX6-NEXT: v_bfrev_b32_e32 v3, 1
+; GFX6-NEXT: v_min_i32_e32 v5, 0, v0
 ; GFX6-NEXT: s_lshl_b32 s0, s0, 16
-; GFX6-NEXT: s_brev_b32 s2, -2
-; GFX6-NEXT: v_max_i32_e32 v2, 0, v0
-; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s3, v3
-; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s2, v2
-; GFX6-NEXT: v_max_i32_e32 v3, s0, v3
+; GFX6-NEXT: v_bfrev_b32_e32 v2, -2
+; GFX6-NEXT: v_max_i32_e32 v4, 0, v0
+; GFX6-NEXT: v_sub_i32_e32 v5, vcc, v3, v5
+; GFX6-NEXT: v_sub_i32_e32 v4, vcc, v2, v4
+; GFX6-NEXT: v_max_i32_e32 v5, s0, v5
+; GFX6-NEXT: v_min_i32_e32 v4, v5, v4
 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX6-NEXT: v_min_i32_e32 v2, v3, v2
-; GFX6-NEXT: v_min_i32_e32 v3, 0, v1
-; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2
+; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v4
+; GFX6-NEXT: v_max_i32_e32 v4, 0, v1
+; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v4
+; GFX6-NEXT: v_min_i32_e32 v4, 0, v1
 ; GFX6-NEXT: s_lshl_b32 s0, s1, 16
-; GFX6-NEXT: v_max_i32_e32 v2, 0, v1
-; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s3, v3
-; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s2, v2
+; GFX6-NEXT: v_sub_i32_e32 v3, vcc, v3, v4
 ; GFX6-NEXT: v_max_i32_e32 v3, s0, v3
 ; GFX6-NEXT: v_min_i32_e32 v2, v3, v2
 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v2
 ; GFX6-NEXT: v_ashrrev_i32_e32 v1, 16, v1
-; GFX6-NEXT: s_mov_b32 s0, 0xffff
+; GFX6-NEXT: v_mov_b32_e32 v2, 0xffff
 ; GFX6-NEXT: v_ashrrev_i32_e32 v0, 16, v0
-; GFX6-NEXT: v_and_b32_e32 v1, s0, v1
-; GFX6-NEXT: v_and_b32_e32 v0, s0, v0
+; GFX6-NEXT: v_and_b32_e32 v1, v1, v2
+; GFX6-NEXT: v_and_b32_e32 v0, v0, v2
 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
 ; GFX6-NEXT: ; return to shader part epilog
 ;
 ; GFX8-LABEL: saddsat_v2i16_vs:
 ; GFX8: ; %bb.0:
-; GFX8-NEXT: s_movk_i32 s3, 0x8000
-; GFX8-NEXT: v_min_i16_e32 v3, 0, v0
+; GFX8-NEXT: v_mov_b32_e32 v3, 0xffff8000
+; GFX8-NEXT: v_min_i16_e32 v5, 0, v0
+; GFX8-NEXT: v_mov_b32_e32 v2, 0x7fff
+; GFX8-NEXT: v_max_i16_e32 v4, 0, v0
+; GFX8-NEXT: v_sub_u16_e32 v5, v3, v5
 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; GFX8-NEXT: s_movk_i32 s2, 0x7fff
-; GFX8-NEXT: v_max_i16_e32 v2, 0, v0
-; GFX8-NEXT: v_sub_u16_e32 v3, s3, v3
-; GFX8-NEXT: v_sub_u16_e32 v2, s2, v2
-; GFX8-NEXT: v_max_i16_e32 v3, s0, v3
-; GFX8-NEXT: v_min_i16_e32 v4, 0, v1
+; GFX8-NEXT: v_sub_u16_e32 v4, v2, v4
+; GFX8-NEXT: v_max_i16_e32 v5, s0, v5
+; GFX8-NEXT: v_min_i16_e32 v4, v5, v4
+; GFX8-NEXT: v_max_i16_e32 v5, 0, v1
+; GFX8-NEXT: v_sub_u16_e32 v2, v2, v5
+; GFX8-NEXT: v_min_i16_e32 v5, 0, v1
 ; GFX8-NEXT: s_lshr_b32 s1, s0, 16
+; GFX8-NEXT: v_sub_u16_e32 v3, v3, v5
+; GFX8-NEXT: v_max_i16_e32 v3, s1, v3
 ; GFX8-NEXT: v_min_i16_e32 v2, v3, v2
-; GFX8-NEXT: v_max_i16_e32 v3, 0, v1
-; GFX8-NEXT: v_sub_u16_e32 v4, s3, v4
-; GFX8-NEXT: v_sub_u16_e32 v3, s2, v3
-; GFX8-NEXT: v_max_i16_e32 v4, s1, v4
-; GFX8-NEXT: v_min_i16_e32 v3, v4, v3
-; GFX8-NEXT: v_add_u16_e32 v0, v0, v2
-; GFX8-NEXT: v_add_u16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_add_u16_e32 v0, v0, v4
+; GFX8-NEXT: v_add_u16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
 ; GFX8-NEXT: ; return to shader part epilog
 ;
@@ -3013,56 +3002,54 @@
 ; GFX6: ; %bb.0:
 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX6-NEXT: s_brev_b32 s5, 1
-; GFX6-NEXT: v_min_i32_e32 v10, 0, v0
+; GFX6-NEXT: v_bfrev_b32_e32 v9, 1
+; GFX6-NEXT: v_min_i32_e32 v11, 0, v0
 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; GFX6-NEXT: s_brev_b32 s4, -2
-; GFX6-NEXT: v_max_i32_e32 v8, 0, v0
-; GFX6-NEXT: v_sub_i32_e32 v10, vcc, s5, v10
-; GFX6-NEXT: v_sub_i32_e32 v8, vcc, s4, v8
-; GFX6-NEXT: v_max_i32_e32 v4, v10, v4
+; GFX6-NEXT: v_bfrev_b32_e32 v8, -2
+; GFX6-NEXT: v_max_i32_e32 v10, 0, v0
+; GFX6-NEXT: v_sub_i32_e32 v11, vcc, v9, v11
+; GFX6-NEXT: v_sub_i32_e32 v10, vcc, v8, v10
+; GFX6-NEXT: v_max_i32_e32 v4, v11, v4
 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX6-NEXT: v_min_i32_e32 v4, v4, v8
-; GFX6-NEXT: v_min_i32_e32 v8, 0, v1
+; GFX6-NEXT: v_min_i32_e32 v4, v4, v10
+; GFX6-NEXT: v_min_i32_e32 v10, 0, v1
 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v4
 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v5
 ; GFX6-NEXT: v_max_i32_e32 v5, 0, v1
-; GFX6-NEXT: v_sub_i32_e32 v8, vcc, s5, v8
-; GFX6-NEXT: v_sub_i32_e32 v5, vcc, s4, v5
-; GFX6-NEXT: v_max_i32_e32 v4, v8, v4
+; GFX6-NEXT: v_sub_i32_e32 v10, vcc, v9, v10
+; GFX6-NEXT: v_sub_i32_e32 v5, vcc, v8, v5
+; GFX6-NEXT: v_max_i32_e32 v4, v10, v4
 ; GFX6-NEXT: v_min_i32_e32 v4, v4, v5
 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2
 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v4
 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v6
 ; GFX6-NEXT: v_min_i32_e32 v6, 0, v2
-; GFX6-NEXT: v_bfrev_b32_e32 v9, -2
 ; GFX6-NEXT: v_max_i32_e32 v5, 0, v2
-; GFX6-NEXT: v_sub_i32_e32 v6, vcc, s5, v6
-; GFX6-NEXT: v_sub_i32_e32 v5, vcc, v9, v5
+; GFX6-NEXT: v_sub_i32_e32 v6, vcc, v9, v6
+; GFX6-NEXT: v_sub_i32_e32 v5, vcc, v8, v5
 ; GFX6-NEXT: v_max_i32_e32 v4, v6, v4
 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX6-NEXT: v_bfrev_b32_e32 v11, 1
 ; GFX6-NEXT: v_min_i32_e32 v4, v4, v5
 ; GFX6-NEXT: v_min_i32_e32 v6, 0, v3
 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4
 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v7
 ; GFX6-NEXT: v_max_i32_e32 v5, 0, v3
-; GFX6-NEXT: v_sub_i32_e32 v6, vcc, v11, v6
-; GFX6-NEXT: v_sub_i32_e32 v5, vcc, v9, v5
+; GFX6-NEXT: v_sub_i32_e32 v6, vcc, v9, v6
+; GFX6-NEXT: v_sub_i32_e32 v5, vcc, v8, v5
 ; GFX6-NEXT: v_max_i32_e32 v4, v6, v4
-; GFX6-NEXT: v_ashrrev_i32_e32 v1, 16, v1
 ; GFX6-NEXT: v_min_i32_e32 v4, v4, v5
-; GFX6-NEXT: s_mov_b32 s4, 0xffff
-; GFX6-NEXT: v_ashrrev_i32_e32 v0, 16, v0
+; GFX6-NEXT: v_ashrrev_i32_e32 v1, 16, v1
 ; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v4
-; GFX6-NEXT: v_and_b32_e32 v1, s4, v1
+; GFX6-NEXT: v_mov_b32_e32 v4, 0xffff
+; GFX6-NEXT: v_ashrrev_i32_e32 v0, 16, v0
+; GFX6-NEXT: v_and_b32_e32 v1, v1, v4
 ; GFX6-NEXT: v_ashrrev_i32_e32 v2, 16, v2
 ; GFX6-NEXT: v_ashrrev_i32_e32 v3, 16, v3
-; GFX6-NEXT: v_and_b32_e32 v0, s4, v0
+; GFX6-NEXT: v_and_b32_e32 v0, v0, v4
 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX6-NEXT: v_and_b32_e32 v1, s4, v2
-; GFX6-NEXT: v_and_b32_e32 v2, s4, v3
+; GFX6-NEXT: v_and_b32_e32 v1, v2, v4
+; GFX6-NEXT: v_and_b32_e32 v2, v3, v4
 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2
 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v2
 ; GFX6-NEXT: s_setpc_b64 s[30:31]
@@ -3070,38 +3057,38 @@
 ; GFX8-LABEL: v_saddsat_v4i16:
 ; GFX8: ; %bb.0:
 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: s_movk_i32 s5, 0x8000
-; GFX8-NEXT: v_min_i16_e32 v7, 0, v0
+; GFX8-NEXT: v_mov_b32_e32 v7, 0xffff8000
+; GFX8-NEXT: v_min_i16_e32 v9, 0, v0
 ; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v0
-; GFX8-NEXT: s_movk_i32 s4, 0x7fff
-; GFX8-NEXT: v_max_i16_e32 v6, 0, v0
-; GFX8-NEXT: v_sub_u16_e32 v7, s5, v7
-; GFX8-NEXT: v_sub_u16_e32 v6, s4, v6
-; GFX8-NEXT: v_max_i16_e32 v7, v7, v2
-; GFX8-NEXT: v_min_i16_e32 v8, 0, v4
-; GFX8-NEXT: v_min_i16_e32 v6, v7, v6
-; GFX8-NEXT: v_max_i16_e32 v7, 0, v4
-; GFX8-NEXT: v_sub_u16_e32 v8, s5, v8
-; GFX8-NEXT: v_sub_u16_e32 v7, s4, v7
-; GFX8-NEXT: v_max_i16_sdwa v2, v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-NEXT: v_min_i16_e32 v8, 0, v1
+; GFX8-NEXT: v_mov_b32_e32 v6, 0x7fff
+; GFX8-NEXT: v_max_i16_e32 v8, 0, v0
+; GFX8-NEXT: v_sub_u16_e32 v9, v7, v9
+; GFX8-NEXT: v_sub_u16_e32 v8, v6, v8
+; GFX8-NEXT: v_max_i16_e32 v9, v9, v2
+; GFX8-NEXT: v_min_i16_e32 v10, 0, v4
+; GFX8-NEXT: v_min_i16_e32 v8, v9, v8
+; GFX8-NEXT: v_max_i16_e32 v9, 0, v4
+; GFX8-NEXT: v_sub_u16_e32 v10, v7, v10
+; GFX8-NEXT: v_sub_u16_e32 v9, v6, v9
+; GFX8-NEXT: v_max_i16_sdwa v2, v10, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT: v_min_i16_e32 v10, 0, v1
+; GFX8-NEXT: v_min_i16_e32 v2, v2, v9
+; GFX8-NEXT: v_max_i16_e32 v9, 0, v1
+; GFX8-NEXT: v_sub_u16_e32 v10, v7, v10
 ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v1
-; GFX8-NEXT: v_min_i16_e32 v2, v2, v7
-; GFX8-NEXT: v_max_i16_e32 v7, 0, v1
-; GFX8-NEXT: v_sub_u16_e32 v8, s5, v8
-; GFX8-NEXT: v_sub_u16_e32 v7, s4, v7
-; GFX8-NEXT: v_max_i16_e32 v8, v8, v3
-; GFX8-NEXT: v_min_i16_e32 v9, 0, v5
-; GFX8-NEXT: v_min_i16_e32 v7, v8, v7
-; GFX8-NEXT: v_max_i16_e32 v8, 0, v5
-; GFX8-NEXT: v_sub_u16_e32 v9, s5, v9
-; GFX8-NEXT: v_sub_u16_e32 v8, s4, v8
-; GFX8-NEXT: v_max_i16_sdwa v3, v9, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-NEXT: v_min_i16_e32 v3, v3, v8
-; GFX8-NEXT: v_add_u16_e32 v0, v0, v6
+; GFX8-NEXT: v_sub_u16_e32 v9, v6, v9
+; GFX8-NEXT: v_max_i16_e32 v10, v10, v3
+; GFX8-NEXT: v_min_i16_e32 v9, v10, v9
+; GFX8-NEXT: v_max_i16_e32 v10, 0, v5
+; GFX8-NEXT: v_sub_u16_e32 v6, v6, v10
+; GFX8-NEXT: v_min_i16_e32 v10, 0, v5
+; GFX8-NEXT: v_sub_u16_e32 v7, v7, v10
+; GFX8-NEXT: v_max_i16_sdwa v3, v7, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT: v_min_i16_e32 v3, v3, v6
+; GFX8-NEXT: v_add_u16_e32 v0, v0, v8
 ; GFX8-NEXT: v_add_u16_sdwa v2, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v2
-; GFX8-NEXT: v_add_u16_e32 v1, v1, v7
+;
GFX8-NEXT: v_add_u16_e32 v1, v1, v9 ; GFX8-NEXT: v_add_u16_sdwa v2, v5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -3286,42 +3273,40 @@ ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: s_brev_b32 s5, 1 -; GFX6-NEXT: v_min_i32_e32 v14, 0, v0 +; GFX6-NEXT: v_bfrev_b32_e32 v13, 1 +; GFX6-NEXT: v_min_i32_e32 v15, 0, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX6-NEXT: s_brev_b32 s4, -2 -; GFX6-NEXT: v_max_i32_e32 v12, 0, v0 -; GFX6-NEXT: v_sub_i32_e32 v14, vcc, s5, v14 -; GFX6-NEXT: v_sub_i32_e32 v12, vcc, s4, v12 -; GFX6-NEXT: v_max_i32_e32 v6, v14, v6 +; GFX6-NEXT: v_bfrev_b32_e32 v12, -2 +; GFX6-NEXT: v_max_i32_e32 v14, 0, v0 +; GFX6-NEXT: v_sub_i32_e32 v15, vcc, v13, v15 +; GFX6-NEXT: v_sub_i32_e32 v14, vcc, v12, v14 +; GFX6-NEXT: v_max_i32_e32 v6, v15, v6 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_min_i32_e32 v6, v6, v12 -; GFX6-NEXT: v_min_i32_e32 v12, 0, v1 +; GFX6-NEXT: v_min_i32_e32 v6, v6, v14 +; GFX6-NEXT: v_min_i32_e32 v14, 0, v1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v6 ; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v7 ; GFX6-NEXT: v_max_i32_e32 v7, 0, v1 -; GFX6-NEXT: v_sub_i32_e32 v12, vcc, s5, v12 -; GFX6-NEXT: v_sub_i32_e32 v7, vcc, s4, v7 -; GFX6-NEXT: v_max_i32_e32 v6, v12, v6 +; GFX6-NEXT: v_sub_i32_e32 v14, vcc, v13, v14 +; GFX6-NEXT: v_sub_i32_e32 v7, vcc, v12, v7 +; GFX6-NEXT: v_max_i32_e32 v6, v14, v6 ; GFX6-NEXT: v_min_i32_e32 v6, v6, v7 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v6 ; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v8 ; GFX6-NEXT: v_min_i32_e32 v8, 0, v2 -; GFX6-NEXT: v_bfrev_b32_e32 v13, -2 ; GFX6-NEXT: v_max_i32_e32 v7, 0, v2 -; GFX6-NEXT: v_sub_i32_e32 v8, vcc, s5, v8 -; GFX6-NEXT: v_sub_i32_e32 v7, vcc, v13, v7 +; GFX6-NEXT: v_sub_i32_e32 v8, vcc, v13, v8 +; GFX6-NEXT: v_sub_i32_e32 v7, vcc, v12, v7 ; GFX6-NEXT: v_max_i32_e32 v6, v8, v6 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_bfrev_b32_e32 v15, 1 ; GFX6-NEXT: v_min_i32_e32 v6, v6, v7 ; GFX6-NEXT: v_min_i32_e32 v8, 0, v3 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v6 ; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v9 ; GFX6-NEXT: v_max_i32_e32 v7, 0, v3 -; GFX6-NEXT: v_sub_i32_e32 v8, vcc, v15, v8 -; GFX6-NEXT: v_sub_i32_e32 v7, vcc, v13, v7 +; GFX6-NEXT: v_sub_i32_e32 v8, vcc, v13, v8 +; GFX6-NEXT: v_sub_i32_e32 v7, vcc, v12, v7 ; GFX6-NEXT: v_max_i32_e32 v6, v8, v6 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX6-NEXT: v_min_i32_e32 v6, v6, v7 @@ -3329,8 +3314,8 @@ ; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v6 ; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v10 ; GFX6-NEXT: v_max_i32_e32 v7, 0, v4 -; GFX6-NEXT: v_sub_i32_e32 v8, vcc, v15, v8 -; GFX6-NEXT: v_sub_i32_e32 v7, vcc, v13, v7 +; GFX6-NEXT: v_sub_i32_e32 v8, vcc, v13, v8 +; GFX6-NEXT: v_sub_i32_e32 v7, vcc, v12, v7 ; GFX6-NEXT: v_max_i32_e32 v6, v8, v6 ; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; GFX6-NEXT: v_min_i32_e32 v6, v6, v7 @@ -3338,28 +3323,28 @@ ; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v6 ; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v11 ; GFX6-NEXT: v_max_i32_e32 v7, 0, v5 -; GFX6-NEXT: v_sub_i32_e32 v8, vcc, v15, v8 -; GFX6-NEXT: v_ashrrev_i32_e32 v1, 16, v1 -; GFX6-NEXT: v_sub_i32_e32 v7, vcc, v13, v7 +; GFX6-NEXT: v_sub_i32_e32 v8, vcc, v13, v8 +; GFX6-NEXT: v_sub_i32_e32 v7, vcc, v12, v7 ; GFX6-NEXT: v_max_i32_e32 v6, v8, v6 -; GFX6-NEXT: s_mov_b32 s4, 0xffff -; GFX6-NEXT: v_ashrrev_i32_e32 v0, 16, v0 ; GFX6-NEXT: 
v_min_i32_e32 v6, v6, v7 -; GFX6-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX6-NEXT: v_ashrrev_i32_e32 v1, 16, v1 +; GFX6-NEXT: v_add_i32_e32 v5, vcc, v5, v6 +; GFX6-NEXT: v_mov_b32_e32 v6, 0xffff +; GFX6-NEXT: v_ashrrev_i32_e32 v0, 16, v0 +; GFX6-NEXT: v_and_b32_e32 v1, v1, v6 ; GFX6-NEXT: v_ashrrev_i32_e32 v2, 16, v2 ; GFX6-NEXT: v_ashrrev_i32_e32 v3, 16, v3 -; GFX6-NEXT: v_add_i32_e32 v5, vcc, v5, v6 -; GFX6-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX6-NEXT: v_and_b32_e32 v0, v0, v6 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_ashrrev_i32_e32 v5, 16, v5 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX6-NEXT: v_and_b32_e32 v1, s4, v2 -; GFX6-NEXT: v_and_b32_e32 v2, s4, v3 +; GFX6-NEXT: v_and_b32_e32 v1, v2, v6 +; GFX6-NEXT: v_and_b32_e32 v2, v3, v6 ; GFX6-NEXT: v_ashrrev_i32_e32 v4, 16, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_and_b32_e32 v3, s4, v5 +; GFX6-NEXT: v_and_b32_e32 v3, v5, v6 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX6-NEXT: v_and_b32_e32 v2, s4, v4 +; GFX6-NEXT: v_and_b32_e32 v2, v4, v6 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX6-NEXT: v_or_b32_e32 v2, v2, v3 ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -3367,53 +3352,51 @@ ; GFX8-LABEL: v_saddsat_v6i16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_movk_i32 s5, 0x8000 -; GFX8-NEXT: v_min_i16_e32 v11, 0, v0 +; GFX8-NEXT: v_mov_b32_e32 v10, 0xffff8000 +; GFX8-NEXT: v_min_i16_e32 v12, 0, v0 ; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v0 -; GFX8-NEXT: s_movk_i32 s4, 0x7fff -; GFX8-NEXT: v_max_i16_e32 v9, 0, v0 -; GFX8-NEXT: v_sub_u16_e32 v11, s5, v11 -; GFX8-NEXT: v_sub_u16_e32 v9, s4, v9 -; GFX8-NEXT: v_max_i16_e32 v11, v11, v3 +; GFX8-NEXT: v_mov_b32_e32 v9, 0x7fff +; GFX8-NEXT: v_max_i16_e32 v11, 0, v0 +; GFX8-NEXT: v_sub_u16_e32 v12, v10, v12 +; GFX8-NEXT: v_sub_u16_e32 v11, v9, v11 +; GFX8-NEXT: v_max_i16_e32 v12, v12, v3 ; GFX8-NEXT: v_min_i16_e32 v13, 0, v6 -; GFX8-NEXT: v_min_i16_e32 v9, v11, v9 -; GFX8-NEXT: v_max_i16_e32 v11, 0, v6 -; GFX8-NEXT: v_sub_u16_e32 v13, s5, v13 -; GFX8-NEXT: v_sub_u16_e32 v11, s4, v11 +; GFX8-NEXT: v_min_i16_e32 v11, v12, v11 +; GFX8-NEXT: v_max_i16_e32 v12, 0, v6 +; GFX8-NEXT: v_sub_u16_e32 v13, v10, v13 +; GFX8-NEXT: v_sub_u16_e32 v12, v9, v12 ; GFX8-NEXT: v_max_i16_sdwa v3, v13, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_min_i16_e32 v13, 0, v1 ; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v1 -; GFX8-NEXT: v_min_i16_e32 v3, v3, v11 -; GFX8-NEXT: v_max_i16_e32 v11, 0, v1 -; GFX8-NEXT: v_sub_u16_e32 v13, s5, v13 -; GFX8-NEXT: v_sub_u16_e32 v11, s4, v11 +; GFX8-NEXT: v_min_i16_e32 v3, v3, v12 +; GFX8-NEXT: v_max_i16_e32 v12, 0, v1 +; GFX8-NEXT: v_sub_u16_e32 v13, v10, v13 +; GFX8-NEXT: v_sub_u16_e32 v12, v9, v12 ; GFX8-NEXT: v_max_i16_e32 v13, v13, v4 ; GFX8-NEXT: v_min_i16_e32 v14, 0, v7 -; GFX8-NEXT: v_min_i16_e32 v11, v13, v11 +; GFX8-NEXT: v_min_i16_e32 v12, v13, v12 ; GFX8-NEXT: v_max_i16_e32 v13, 0, v7 -; GFX8-NEXT: v_sub_u16_e32 v14, s5, v14 -; GFX8-NEXT: v_mov_b32_e32 v12, 0xffff8000 -; GFX8-NEXT: v_sub_u16_e32 v13, s4, v13 +; GFX8-NEXT: v_sub_u16_e32 v14, v10, v14 +; GFX8-NEXT: v_sub_u16_e32 v13, v9, v13 ; GFX8-NEXT: v_max_i16_sdwa v4, v14, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_min_i16_e32 v14, 0, v2 -; GFX8-NEXT: v_mov_b32_e32 v10, 0x7fff ; GFX8-NEXT: v_min_i16_e32 v4, v4, v13 ; GFX8-NEXT: v_max_i16_e32 v13, 0, v2 -; GFX8-NEXT: v_sub_u16_e32 v14, v12, v14 +; GFX8-NEXT: v_sub_u16_e32 v14, v10, v14 ; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v2 -; GFX8-NEXT: 
v_sub_u16_e32 v13, v10, v13 +; GFX8-NEXT: v_sub_u16_e32 v13, v9, v13 ; GFX8-NEXT: v_max_i16_e32 v14, v14, v5 ; GFX8-NEXT: v_min_i16_e32 v13, v14, v13 ; GFX8-NEXT: v_max_i16_e32 v14, 0, v8 -; GFX8-NEXT: v_sub_u16_e32 v10, v10, v14 +; GFX8-NEXT: v_sub_u16_e32 v9, v9, v14 ; GFX8-NEXT: v_min_i16_e32 v14, 0, v8 -; GFX8-NEXT: v_sub_u16_e32 v12, v12, v14 -; GFX8-NEXT: v_max_i16_sdwa v5, v12, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_add_u16_e32 v0, v0, v9 +; GFX8-NEXT: v_sub_u16_e32 v10, v10, v14 +; GFX8-NEXT: v_max_i16_sdwa v5, v10, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_add_u16_e32 v0, v0, v11 ; GFX8-NEXT: v_add_u16_sdwa v3, v6, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_min_i16_e32 v5, v5, v10 +; GFX8-NEXT: v_min_i16_e32 v5, v5, v9 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v3 -; GFX8-NEXT: v_add_u16_e32 v1, v1, v11 +; GFX8-NEXT: v_add_u16_e32 v1, v1, v12 ; GFX8-NEXT: v_add_u16_sdwa v3, v7, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-NEXT: v_or_b32_e32 v1, v1, v3 ; GFX8-NEXT: v_add_u16_e32 v2, v2, v13 @@ -3651,42 +3634,40 @@ ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: s_brev_b32 s5, 1 -; GFX6-NEXT: v_min_i32_e32 v18, 0, v0 +; GFX6-NEXT: v_bfrev_b32_e32 v17, 1 +; GFX6-NEXT: v_min_i32_e32 v19, 0, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GFX6-NEXT: s_brev_b32 s4, -2 -; GFX6-NEXT: v_max_i32_e32 v16, 0, v0 -; GFX6-NEXT: v_sub_i32_e32 v18, vcc, s5, v18 -; GFX6-NEXT: v_sub_i32_e32 v16, vcc, s4, v16 -; GFX6-NEXT: v_max_i32_e32 v8, v18, v8 +; GFX6-NEXT: v_bfrev_b32_e32 v16, -2 +; GFX6-NEXT: v_max_i32_e32 v18, 0, v0 +; GFX6-NEXT: v_sub_i32_e32 v19, vcc, v17, v19 +; GFX6-NEXT: v_sub_i32_e32 v18, vcc, v16, v18 +; GFX6-NEXT: v_max_i32_e32 v8, v19, v8 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_min_i32_e32 v8, v8, v16 -; GFX6-NEXT: v_min_i32_e32 v16, 0, v1 +; GFX6-NEXT: v_min_i32_e32 v8, v8, v18 +; GFX6-NEXT: v_min_i32_e32 v18, 0, v1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v8 ; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v9 ; GFX6-NEXT: v_max_i32_e32 v9, 0, v1 -; GFX6-NEXT: v_sub_i32_e32 v16, vcc, s5, v16 -; GFX6-NEXT: v_sub_i32_e32 v9, vcc, s4, v9 -; GFX6-NEXT: v_max_i32_e32 v8, v16, v8 +; GFX6-NEXT: v_sub_i32_e32 v18, vcc, v17, v18 +; GFX6-NEXT: v_sub_i32_e32 v9, vcc, v16, v9 +; GFX6-NEXT: v_max_i32_e32 v8, v18, v8 ; GFX6-NEXT: v_min_i32_e32 v8, v8, v9 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v8 ; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v10 ; GFX6-NEXT: v_min_i32_e32 v10, 0, v2 -; GFX6-NEXT: v_bfrev_b32_e32 v17, -2 ; GFX6-NEXT: v_max_i32_e32 v9, 0, v2 -; GFX6-NEXT: v_sub_i32_e32 v10, vcc, s5, v10 -; GFX6-NEXT: v_sub_i32_e32 v9, vcc, v17, v9 +; GFX6-NEXT: v_sub_i32_e32 v10, vcc, v17, v10 +; GFX6-NEXT: v_sub_i32_e32 v9, vcc, v16, v9 ; GFX6-NEXT: v_max_i32_e32 v8, v10, v8 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_bfrev_b32_e32 v19, 1 ; GFX6-NEXT: v_min_i32_e32 v8, v8, v9 ; GFX6-NEXT: v_min_i32_e32 v10, 0, v3 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v8 ; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v11 ; GFX6-NEXT: v_max_i32_e32 v9, 0, v3 -; GFX6-NEXT: v_sub_i32_e32 v10, vcc, v19, v10 -; GFX6-NEXT: v_sub_i32_e32 v9, vcc, v17, v9 +; GFX6-NEXT: v_sub_i32_e32 v10, vcc, v17, v10 +; GFX6-NEXT: v_sub_i32_e32 v9, vcc, v16, v9 ; GFX6-NEXT: v_max_i32_e32 v8, v10, v8 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX6-NEXT: 
v_min_i32_e32 v8, v8, v9 @@ -3694,8 +3675,8 @@ ; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v8 ; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v12 ; GFX6-NEXT: v_max_i32_e32 v9, 0, v4 -; GFX6-NEXT: v_sub_i32_e32 v10, vcc, v19, v10 -; GFX6-NEXT: v_sub_i32_e32 v9, vcc, v17, v9 +; GFX6-NEXT: v_sub_i32_e32 v10, vcc, v17, v10 +; GFX6-NEXT: v_sub_i32_e32 v9, vcc, v16, v9 ; GFX6-NEXT: v_max_i32_e32 v8, v10, v8 ; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; GFX6-NEXT: v_min_i32_e32 v8, v8, v9 @@ -3703,8 +3684,8 @@ ; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v8 ; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v13 ; GFX6-NEXT: v_max_i32_e32 v9, 0, v5 -; GFX6-NEXT: v_sub_i32_e32 v10, vcc, v19, v10 -; GFX6-NEXT: v_sub_i32_e32 v9, vcc, v17, v9 +; GFX6-NEXT: v_sub_i32_e32 v10, vcc, v17, v10 +; GFX6-NEXT: v_sub_i32_e32 v9, vcc, v16, v9 ; GFX6-NEXT: v_max_i32_e32 v8, v10, v8 ; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; GFX6-NEXT: v_min_i32_e32 v8, v8, v9 @@ -3712,43 +3693,43 @@ ; GFX6-NEXT: v_add_i32_e32 v5, vcc, v5, v8 ; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v14 ; GFX6-NEXT: v_max_i32_e32 v9, 0, v6 -; GFX6-NEXT: v_sub_i32_e32 v10, vcc, v19, v10 -; GFX6-NEXT: v_sub_i32_e32 v9, vcc, v17, v9 +; GFX6-NEXT: v_sub_i32_e32 v10, vcc, v17, v10 +; GFX6-NEXT: v_sub_i32_e32 v9, vcc, v16, v9 ; GFX6-NEXT: v_max_i32_e32 v8, v10, v8 ; GFX6-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; GFX6-NEXT: v_min_i32_e32 v8, v8, v9 ; GFX6-NEXT: v_min_i32_e32 v10, 0, v7 -; GFX6-NEXT: v_ashrrev_i32_e32 v1, 16, v1 ; GFX6-NEXT: v_add_i32_e32 v6, vcc, v6, v8 ; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v15 ; GFX6-NEXT: v_max_i32_e32 v9, 0, v7 -; GFX6-NEXT: v_sub_i32_e32 v10, vcc, v19, v10 -; GFX6-NEXT: s_mov_b32 s4, 0xffff -; GFX6-NEXT: v_ashrrev_i32_e32 v0, 16, v0 -; GFX6-NEXT: v_sub_i32_e32 v9, vcc, v17, v9 +; GFX6-NEXT: v_sub_i32_e32 v10, vcc, v17, v10 +; GFX6-NEXT: v_sub_i32_e32 v9, vcc, v16, v9 ; GFX6-NEXT: v_max_i32_e32 v8, v10, v8 -; GFX6-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX6-NEXT: v_min_i32_e32 v8, v8, v9 +; GFX6-NEXT: v_ashrrev_i32_e32 v1, 16, v1 +; GFX6-NEXT: v_add_i32_e32 v7, vcc, v7, v8 +; GFX6-NEXT: v_mov_b32_e32 v8, 0xffff +; GFX6-NEXT: v_ashrrev_i32_e32 v0, 16, v0 +; GFX6-NEXT: v_and_b32_e32 v1, v1, v8 ; GFX6-NEXT: v_ashrrev_i32_e32 v2, 16, v2 ; GFX6-NEXT: v_ashrrev_i32_e32 v3, 16, v3 -; GFX6-NEXT: v_min_i32_e32 v8, v8, v9 -; GFX6-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX6-NEXT: v_and_b32_e32 v0, v0, v8 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_ashrrev_i32_e32 v5, 16, v5 -; GFX6-NEXT: v_add_i32_e32 v7, vcc, v7, v8 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX6-NEXT: v_and_b32_e32 v1, s4, v2 -; GFX6-NEXT: v_and_b32_e32 v2, s4, v3 +; GFX6-NEXT: v_and_b32_e32 v1, v2, v8 +; GFX6-NEXT: v_and_b32_e32 v2, v3, v8 ; GFX6-NEXT: v_ashrrev_i32_e32 v4, 16, v4 ; GFX6-NEXT: v_ashrrev_i32_e32 v7, 16, v7 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_and_b32_e32 v3, s4, v5 +; GFX6-NEXT: v_and_b32_e32 v3, v5, v8 ; GFX6-NEXT: v_ashrrev_i32_e32 v6, 16, v6 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX6-NEXT: v_and_b32_e32 v2, s4, v4 +; GFX6-NEXT: v_and_b32_e32 v2, v4, v8 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_and_b32_e32 v4, s4, v7 +; GFX6-NEXT: v_and_b32_e32 v4, v7, v8 ; GFX6-NEXT: v_or_b32_e32 v2, v2, v3 -; GFX6-NEXT: v_and_b32_e32 v3, s4, v6 +; GFX6-NEXT: v_and_b32_e32 v3, v6, v8 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX6-NEXT: v_or_b32_e32 v3, v3, v4 ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -3756,67 +3737,65 @@ ; GFX8-LABEL: v_saddsat_v8i16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_movk_i32 s5, 0x8000 
-; GFX8-NEXT: v_min_i16_e32 v14, 0, v0 +; GFX8-NEXT: v_mov_b32_e32 v13, 0xffff8000 +; GFX8-NEXT: v_min_i16_e32 v15, 0, v0 ; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v0 -; GFX8-NEXT: s_movk_i32 s4, 0x7fff -; GFX8-NEXT: v_max_i16_e32 v12, 0, v0 -; GFX8-NEXT: v_sub_u16_e32 v14, s5, v14 -; GFX8-NEXT: v_sub_u16_e32 v12, s4, v12 -; GFX8-NEXT: v_max_i16_e32 v14, v14, v4 +; GFX8-NEXT: v_mov_b32_e32 v12, 0x7fff +; GFX8-NEXT: v_max_i16_e32 v14, 0, v0 +; GFX8-NEXT: v_sub_u16_e32 v15, v13, v15 +; GFX8-NEXT: v_sub_u16_e32 v14, v12, v14 +; GFX8-NEXT: v_max_i16_e32 v15, v15, v4 ; GFX8-NEXT: v_min_i16_e32 v16, 0, v8 -; GFX8-NEXT: v_min_i16_e32 v12, v14, v12 -; GFX8-NEXT: v_max_i16_e32 v14, 0, v8 -; GFX8-NEXT: v_sub_u16_e32 v16, s5, v16 -; GFX8-NEXT: v_sub_u16_e32 v14, s4, v14 +; GFX8-NEXT: v_min_i16_e32 v14, v15, v14 +; GFX8-NEXT: v_max_i16_e32 v15, 0, v8 +; GFX8-NEXT: v_sub_u16_e32 v16, v13, v16 +; GFX8-NEXT: v_sub_u16_e32 v15, v12, v15 ; GFX8-NEXT: v_max_i16_sdwa v4, v16, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_min_i16_e32 v16, 0, v1 ; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v1 -; GFX8-NEXT: v_min_i16_e32 v4, v4, v14 -; GFX8-NEXT: v_max_i16_e32 v14, 0, v1 -; GFX8-NEXT: v_sub_u16_e32 v16, s5, v16 -; GFX8-NEXT: v_sub_u16_e32 v14, s4, v14 +; GFX8-NEXT: v_min_i16_e32 v4, v4, v15 +; GFX8-NEXT: v_max_i16_e32 v15, 0, v1 +; GFX8-NEXT: v_sub_u16_e32 v16, v13, v16 +; GFX8-NEXT: v_sub_u16_e32 v15, v12, v15 ; GFX8-NEXT: v_max_i16_e32 v16, v16, v5 ; GFX8-NEXT: v_min_i16_e32 v17, 0, v9 -; GFX8-NEXT: v_min_i16_e32 v14, v16, v14 +; GFX8-NEXT: v_min_i16_e32 v15, v16, v15 ; GFX8-NEXT: v_max_i16_e32 v16, 0, v9 -; GFX8-NEXT: v_sub_u16_e32 v17, s5, v17 -; GFX8-NEXT: v_mov_b32_e32 v15, 0xffff8000 -; GFX8-NEXT: v_sub_u16_e32 v16, s4, v16 +; GFX8-NEXT: v_sub_u16_e32 v17, v13, v17 +; GFX8-NEXT: v_sub_u16_e32 v16, v12, v16 ; GFX8-NEXT: v_max_i16_sdwa v5, v17, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_min_i16_e32 v17, 0, v2 ; GFX8-NEXT: v_lshrrev_b32_e32 v10, 16, v2 -; GFX8-NEXT: v_mov_b32_e32 v13, 0x7fff ; GFX8-NEXT: v_min_i16_e32 v5, v5, v16 ; GFX8-NEXT: v_max_i16_e32 v16, 0, v2 -; GFX8-NEXT: v_sub_u16_e32 v17, v15, v17 -; GFX8-NEXT: v_sub_u16_e32 v16, v13, v16 +; GFX8-NEXT: v_sub_u16_e32 v17, v13, v17 +; GFX8-NEXT: v_sub_u16_e32 v16, v12, v16 ; GFX8-NEXT: v_max_i16_e32 v17, v17, v6 ; GFX8-NEXT: v_min_i16_e32 v18, 0, v10 ; GFX8-NEXT: v_min_i16_e32 v16, v17, v16 ; GFX8-NEXT: v_max_i16_e32 v17, 0, v10 -; GFX8-NEXT: v_sub_u16_e32 v18, v15, v18 -; GFX8-NEXT: v_sub_u16_e32 v17, v13, v17 +; GFX8-NEXT: v_sub_u16_e32 v18, v13, v18 +; GFX8-NEXT: v_sub_u16_e32 v17, v12, v17 ; GFX8-NEXT: v_max_i16_sdwa v6, v18, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_min_i16_e32 v18, 0, v3 ; GFX8-NEXT: v_min_i16_e32 v6, v6, v17 ; GFX8-NEXT: v_max_i16_e32 v17, 0, v3 -; GFX8-NEXT: v_sub_u16_e32 v18, v15, v18 +; GFX8-NEXT: v_sub_u16_e32 v18, v13, v18 ; GFX8-NEXT: v_lshrrev_b32_e32 v11, 16, v3 -; GFX8-NEXT: v_sub_u16_e32 v17, v13, v17 +; GFX8-NEXT: v_sub_u16_e32 v17, v12, v17 ; GFX8-NEXT: v_max_i16_e32 v18, v18, v7 ; GFX8-NEXT: v_min_i16_e32 v17, v18, v17 ; GFX8-NEXT: v_max_i16_e32 v18, 0, v11 -; GFX8-NEXT: v_sub_u16_e32 v13, v13, v18 +; GFX8-NEXT: v_sub_u16_e32 v12, v12, v18 ; GFX8-NEXT: v_min_i16_e32 v18, 0, v11 -; GFX8-NEXT: v_sub_u16_e32 v15, v15, v18 -; GFX8-NEXT: v_add_u16_e32 v0, v0, v12 +; GFX8-NEXT: v_sub_u16_e32 v13, v13, v18 +; GFX8-NEXT: v_add_u16_e32 v0, v0, v14 ; GFX8-NEXT: v_add_u16_sdwa v4, v8, v4 dst_sel:WORD_1 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_max_i16_sdwa v7, v15, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_max_i16_sdwa v7, v13, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v4 -; GFX8-NEXT: v_add_u16_e32 v1, v1, v14 +; GFX8-NEXT: v_add_u16_e32 v1, v1, v15 ; GFX8-NEXT: v_add_u16_sdwa v4, v9, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_min_i16_e32 v7, v7, v13 +; GFX8-NEXT: v_min_i16_e32 v7, v7, v12 ; GFX8-NEXT: v_or_b32_e32 v1, v1, v4 ; GFX8-NEXT: v_add_u16_e32 v2, v2, v16 ; GFX8-NEXT: v_add_u16_sdwa v4, v10, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -4538,19 +4517,20 @@ ; GFX10-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, v1, v5, vcc_lo ; GFX10-NEXT: v_add_co_u32 v10, vcc_lo, v2, v6 ; GFX10-NEXT: v_add_co_ci_u32_e32 v11, vcc_lo, v3, v7, vcc_lo -; GFX10-NEXT: v_ashrrev_i32_e32 v12, 31, v9 ; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[8:9], v[0:1] +; GFX10-NEXT: v_ashrrev_i32_e32 v0, 31, v9 +; GFX10-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX10-NEXT: v_cmp_gt_i64_e64 s4, 0, v[4:5] -; GFX10-NEXT: v_ashrrev_i32_e32 v0, 31, v11 +; GFX10-NEXT: v_ashrrev_i32_e32 v4, 31, v11 ; GFX10-NEXT: v_cmp_gt_i64_e64 s6, 0, v[6:7] -; GFX10-NEXT: v_add_co_u32 v1, s5, v12, 0 -; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s5, 0x80000000, v12, s5 +; GFX10-NEXT: v_add_co_u32 v5, s5, v0, 0 +; GFX10-NEXT: v_add_co_ci_u32_e64 v12, s5, v0, v1, s5 ; GFX10-NEXT: v_cmp_lt_i64_e64 s5, v[10:11], v[2:3] -; GFX10-NEXT: v_add_co_u32 v2, s7, v0, 0 -; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s7, 0x80000000, v0, s7 +; GFX10-NEXT: v_add_co_u32 v2, s7, v4, 0 +; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s7, v4, v1, s7 ; GFX10-NEXT: s_xor_b32 vcc_lo, s4, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v0, v8, v1, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v1, v9, v4, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v0, v8, v5, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v1, v9, v12, vcc_lo ; GFX10-NEXT: s_xor_b32 vcc_lo, s6, s5 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v10, v2, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v3, v11, v3, vcc_lo @@ -5048,7 +5028,6 @@ ; GFX6-NEXT: v_addc_u32_e32 v4, vcc, v4, v2, vcc ; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v5, v3, vcc ; GFX6-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[0:1] -; GFX6-NEXT: v_bfrev_b32_e32 v8, 1 ; GFX6-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; GFX6-NEXT: v_cmp_gt_i64_e32 vcc, s[2:3], v[4:5] ; GFX6-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc @@ -5059,15 +5038,16 @@ ; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] ; GFX6-NEXT: v_ashrrev_i32_e32 v3, 31, v5 ; GFX6-NEXT: v_cndmask_b32_e64 v2, v7, 0, vcc +; GFX6-NEXT: v_add_i32_e32 v7, vcc, 0, v3 +; GFX6-NEXT: v_addc_u32_e32 v8, vcc, 0, v3, vcc ; GFX6-NEXT: v_xor_b32_e32 v2, v2, v6 -; GFX6-NEXT: v_add_i32_e32 v6, vcc, 0, v3 -; GFX6-NEXT: v_addc_u32_e32 v7, vcc, 0, v3, vcc +; GFX6-NEXT: v_bfrev_b32_e32 v6, 1 ; GFX6-NEXT: v_addc_u32_e32 v9, vcc, 0, v3, vcc -; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v3, v8, vcc +; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v3, v6, vcc ; GFX6-NEXT: v_and_b32_e32 v2, 1, v2 ; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v2, v4, v9, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc ; GFX6-NEXT: ; return to shader part epilog @@ -5082,7 +5062,6 @@ ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, v4, v2, vcc ; 
GFX8-NEXT: v_addc_u32_e32 v5, vcc, v5, v3, vcc ; GFX8-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[0:1] -; GFX8-NEXT: v_bfrev_b32_e32 v8, 1 ; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; GFX8-NEXT: v_cmp_gt_i64_e32 vcc, s[2:3], v[4:5] ; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc @@ -5093,15 +5072,16 @@ ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] ; GFX8-NEXT: v_ashrrev_i32_e32 v3, 31, v5 ; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, 0, vcc +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0, v3 +; GFX8-NEXT: v_addc_u32_e32 v8, vcc, 0, v3, vcc ; GFX8-NEXT: v_xor_b32_e32 v2, v2, v6 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0, v3 -; GFX8-NEXT: v_addc_u32_e32 v7, vcc, 0, v3, vcc +; GFX8-NEXT: v_bfrev_b32_e32 v6, 1 ; GFX8-NEXT: v_addc_u32_e32 v9, vcc, 0, v3, vcc -; GFX8-NEXT: v_addc_u32_e32 v3, vcc, v3, v8, vcc +; GFX8-NEXT: v_addc_u32_e32 v3, vcc, v3, v6, vcc ; GFX8-NEXT: v_and_b32_e32 v2, 1, v2 ; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v2, v4, v9, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc ; GFX8-NEXT: ; return to shader part epilog @@ -5116,7 +5096,6 @@ ; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v2, vcc ; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v5, v3, vcc ; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[0:1] -; GFX9-NEXT: v_bfrev_b32_e32 v8, 1 ; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, s[2:3], v[4:5] ; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc @@ -5127,15 +5106,16 @@ ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] ; GFX9-NEXT: v_ashrrev_i32_e32 v3, 31, v5 ; GFX9-NEXT: v_cndmask_b32_e64 v2, v7, 0, vcc +; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, 0, v3 +; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, 0, v3, vcc ; GFX9-NEXT: v_xor_b32_e32 v2, v2, v6 -; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, 0, v3 -; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v3, vcc +; GFX9-NEXT: v_bfrev_b32_e32 v6, 1 ; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v3, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v8, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v6, vcc ; GFX9-NEXT: v_and_b32_e32 v2, 1, v2 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v2, v4, v9, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc ; GFX9-NEXT: ; return to shader part epilog @@ -5188,25 +5168,25 @@ ; GFX6-NEXT: v_cmp_lt_i64_e64 s[0:1], s[2:3], 0 ; GFX6-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, v[6:7], v[2:3] -; GFX6-NEXT: v_bfrev_b32_e32 v8, 1 ; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] +; GFX6-NEXT: v_bfrev_b32_e32 v2, 1 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] ; GFX6-NEXT: v_cmp_eq_u64_e64 s[0:1], s[2:3], 0 ; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, 0, s[0:1] ; GFX6-NEXT: v_xor_b32_e32 v0, v1, v0 ; GFX6-NEXT: v_ashrrev_i32_e32 v1, 31, v7 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, 0, v1 -; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc +; GFX6-NEXT: v_add_i32_e32 v3, vcc, 0, v1 +; GFX6-NEXT: v_addc_u32_e32 v8, vcc, 0, v1, vcc ; GFX6-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc -; GFX6-NEXT: v_addc_u32_e32 v8, vcc, v1, v8, vcc +; GFX6-NEXT: v_addc_u32_e32 v10, vcc, v1, v2, vcc ; GFX6-NEXT: v_and_b32_e32 
v0, 1, v0 ; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; GFX6-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v1, v5, v8, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v2, v6, v9, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v3, v7, v10, vcc ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: saddsat_i128_vs: @@ -5233,17 +5213,17 @@ ; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, 0, s[0:1] ; GFX8-NEXT: v_xor_b32_e32 v0, v1, v0 ; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v7 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0, v1 -; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc -; GFX8-NEXT: v_bfrev_b32_e32 v8, 1 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0, v1 +; GFX8-NEXT: v_addc_u32_e32 v8, vcc, 0, v1, vcc +; GFX8-NEXT: v_bfrev_b32_e32 v2, 1 ; GFX8-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc -; GFX8-NEXT: v_addc_u32_e32 v8, vcc, v1, v8, vcc +; GFX8-NEXT: v_addc_u32_e32 v10, vcc, v1, v2, vcc ; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v8, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v2, v6, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v3, v7, v10, vcc ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: saddsat_i128_vs: @@ -5270,17 +5250,17 @@ ; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, 0, s[0:1] ; GFX9-NEXT: v_xor_b32_e32 v0, v1, v0 ; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v7 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 0, v1 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc -; GFX9-NEXT: v_bfrev_b32_e32 v8, 1 +; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, 0, v1 +; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, 0, v1, vcc +; GFX9-NEXT: v_bfrev_b32_e32 v2, 1 ; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v1, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, v1, v8, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v10, vcc, v1, v2, vcc ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v8, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v2, v6, v9, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v10, vcc ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: saddsat_i128_vs: @@ -5510,6 +5490,7 @@ ; GFX10-NEXT: v_cmp_gt_i64_e32 vcc_lo, 0, v[10:11] ; GFX10-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc_lo ; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[16:17], v[2:3] +; GFX10-NEXT: v_bfrev_b32_e32 v3, 1 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo ; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[10:11] ; GFX10-NEXT: v_cndmask_b32_e64 v1, v18, 0, vcc_lo @@ -5525,31 +5506,31 @@ ; GFX10-NEXT: v_cmp_lt_i64_e64 s4, v[12:13], v[6:7] ; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v1, 0 -; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: v_add_co_ci_u32_e32 v18, vcc_lo, 0, v1, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s4 ; GFX10-NEXT: v_cmp_gt_i64_e64 s4, 0, v[14:15] ; GFX10-NEXT: v_ashrrev_i32_e32 v7, 31, v13 -; GFX10-NEXT: v_cndmask_b32_e64 v18, 0, 1, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v19, 0, 1, s4 ; GFX10-NEXT: v_cmp_ne_u32_e64 s4, 0, v0 ; GFX10-NEXT: v_cndmask_b32_e64 v0, v5, v4, s5 ; 
GFX10-NEXT: v_cmp_eq_u64_e64 s5, 0, v[14:15] ; GFX10-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo -; GFX10-NEXT: v_add_co_ci_u32_e32 v6, vcc_lo, 0x80000000, v1, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v1, v9, v3, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v4, v18, 0, s5 +; GFX10-NEXT: v_add_co_ci_u32_e32 v6, vcc_lo, v1, v3, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v1, v9, v18, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v4, v19, 0, s5 ; GFX10-NEXT: v_xor_b32_e32 v4, v4, v0 ; GFX10-NEXT: v_cndmask_b32_e64 v0, v8, v2, s4 ; GFX10-NEXT: v_cndmask_b32_e64 v2, v16, v5, s4 -; GFX10-NEXT: v_and_b32_e32 v3, 1, v4 -; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, v7, 0 -; GFX10-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v7, vcc_lo +; GFX10-NEXT: v_add_co_u32 v5, vcc_lo, v7, 0 +; GFX10-NEXT: v_and_b32_e32 v4, 1, v4 ; GFX10-NEXT: v_add_co_ci_u32_e32 v8, vcc_lo, 0, v7, vcc_lo -; GFX10-NEXT: v_cmp_ne_u32_e64 s5, 0, v3 -; GFX10-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, 0x80000000, v7, vcc_lo +; GFX10-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, 0, v7, vcc_lo +; GFX10-NEXT: v_cmp_ne_u32_e64 s5, 0, v4 +; GFX10-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, v7, v3, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v3, v17, v6, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v4, v10, v4, s5 -; GFX10-NEXT: v_cndmask_b32_e64 v5, v11, v5, s5 -; GFX10-NEXT: v_cndmask_b32_e64 v6, v12, v8, s5 +; GFX10-NEXT: v_cndmask_b32_e64 v4, v10, v5, s5 +; GFX10-NEXT: v_cndmask_b32_e64 v5, v11, v8, s5 +; GFX10-NEXT: v_cndmask_b32_e64 v6, v12, v9, s5 ; GFX10-NEXT: v_cndmask_b32_e64 v7, v13, v7, s5 ; GFX10-NEXT: s_setpc_b64 s[30:31] %result = call <2 x i128> @llvm.sadd.sat.v2i128(<2 x i128> %lhs, <2 x i128> %rhs) Index: llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i32.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i32.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i32.ll @@ -294,30 +294,30 @@ ; CHECK-LABEL: v_sdiv_i32_pow2k_denom: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_movk_i32 s6, 0x1000 -; CHECK-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; CHECK-NEXT: v_rcp_iflag_f32_e32 v2, 0x45800000 -; CHECK-NEXT: v_mov_b32_e32 v3, 0xfffff000 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; CHECK-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 -; CHECK-NEXT: v_xor_b32_e32 v0, v0, v1 -; CHECK-NEXT: v_cvt_u32_f32_e32 v2, v2 -; CHECK-NEXT: v_mul_lo_u32 v3, v3, v2 -; CHECK-NEXT: v_mul_hi_u32 v3, v2, v3 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3 -; CHECK-NEXT: v_mul_hi_u32 v2, v0, v2 -; CHECK-NEXT: v_lshlrev_b32_e32 v3, 12, v2 -; CHECK-NEXT: v_add_i32_e32 v4, vcc, 1, v2 -; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v3 -; CHECK-NEXT: v_cmp_le_u32_e32 vcc, s6, v0 -; CHECK-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; CHECK-NEXT: v_subrev_i32_e64 v3, s[4:5], s6, v0 -; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; CHECK-NEXT: v_add_i32_e32 v3, vcc, 1, v2 -; CHECK-NEXT: v_cmp_le_u32_e32 vcc, s6, v0 -; CHECK-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; CHECK-NEXT: v_xor_b32_e32 v0, v0, v1 -; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v1 +; CHECK-NEXT: v_mov_b32_e32 v1, 0x1000 +; CHECK-NEXT: v_ashrrev_i32_e32 v2, 31, v0 +; CHECK-NEXT: v_rcp_iflag_f32_e32 v3, 0x45800000 +; CHECK-NEXT: v_mov_b32_e32 v4, 0xfffff000 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; CHECK-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 +; CHECK-NEXT: v_xor_b32_e32 v0, v0, v2 +; CHECK-NEXT: v_cvt_u32_f32_e32 v3, v3 +; CHECK-NEXT: v_mul_lo_u32 v4, v4, v3 +; CHECK-NEXT: v_mul_hi_u32 v4, v3, v4 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v4 +; CHECK-NEXT: 
v_mul_hi_u32 v3, v0, v3 +; CHECK-NEXT: v_lshlrev_b32_e32 v4, 12, v3 +; CHECK-NEXT: v_add_i32_e32 v5, vcc, 1, v3 +; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v4 +; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v0, v1 +; CHECK-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc +; CHECK-NEXT: v_sub_i32_e64 v4, s[4:5], v0, v1 +; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc +; CHECK-NEXT: v_add_i32_e32 v4, vcc, 1, v3 +; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v0, v1 +; CHECK-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc +; CHECK-NEXT: v_xor_b32_e32 v0, v0, v2 +; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 ; CHECK-NEXT: s_setpc_b64 s[30:31] %result = sdiv i32 %num, 4096 ret i32 %result @@ -378,53 +378,51 @@ ; CGP-LABEL: v_sdiv_v2i32_pow2k_denom: ; CGP: ; %bb.0: ; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CGP-NEXT: s_movk_i32 s8, 0x1000 -; CGP-NEXT: v_ashrrev_i32_e32 v2, 31, v0 -; CGP-NEXT: v_rcp_iflag_f32_e32 v3, 0x45800000 -; CGP-NEXT: s_movk_i32 s4, 0xf000 -; CGP-NEXT: v_mov_b32_e32 v4, 0xfffff000 -; CGP-NEXT: v_mov_b32_e32 v5, 0x1000 +; CGP-NEXT: v_mov_b32_e32 v2, 0x1000 +; CGP-NEXT: v_ashrrev_i32_e32 v3, 31, v0 +; CGP-NEXT: v_rcp_iflag_f32_e32 v4, 0x45800000 +; CGP-NEXT: v_mov_b32_e32 v5, 0xfffff000 ; CGP-NEXT: v_ashrrev_i32_e32 v6, 31, v1 ; CGP-NEXT: v_rcp_iflag_f32_e32 v7, 0x45800000 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; CGP-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v3 +; CGP-NEXT: v_mul_f32_e32 v4, 0x4f7ffffe, v4 ; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v6 ; CGP-NEXT: v_mul_f32_e32 v7, 0x4f7ffffe, v7 -; CGP-NEXT: v_xor_b32_e32 v0, v0, v2 -; CGP-NEXT: v_cvt_u32_f32_e32 v3, v3 +; CGP-NEXT: v_xor_b32_e32 v0, v0, v3 +; CGP-NEXT: v_cvt_u32_f32_e32 v4, v4 ; CGP-NEXT: v_xor_b32_e32 v1, v1, v6 ; CGP-NEXT: v_cvt_u32_f32_e32 v7, v7 -; CGP-NEXT: v_mul_lo_u32 v8, s4, v3 -; CGP-NEXT: v_mul_lo_u32 v4, v4, v7 -; CGP-NEXT: v_mul_hi_u32 v8, v3, v8 -; CGP-NEXT: v_mul_hi_u32 v4, v7, v4 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v8 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v7, v4 -; CGP-NEXT: v_mul_hi_u32 v3, v0, v3 -; CGP-NEXT: v_mul_hi_u32 v4, v1, v4 -; CGP-NEXT: v_lshlrev_b32_e32 v7, 12, v3 -; CGP-NEXT: v_add_i32_e32 v8, vcc, 1, v3 -; CGP-NEXT: v_lshlrev_b32_e32 v9, 12, v4 -; CGP-NEXT: v_add_i32_e32 v10, vcc, 1, v4 +; CGP-NEXT: v_mul_lo_u32 v8, v5, v4 +; CGP-NEXT: v_mul_lo_u32 v5, v5, v7 +; CGP-NEXT: v_mul_hi_u32 v8, v4, v8 +; CGP-NEXT: v_mul_hi_u32 v5, v7, v5 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v8 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v7, v5 +; CGP-NEXT: v_mul_hi_u32 v4, v0, v4 +; CGP-NEXT: v_mul_hi_u32 v5, v1, v5 +; CGP-NEXT: v_lshlrev_b32_e32 v7, 12, v4 +; CGP-NEXT: v_add_i32_e32 v8, vcc, 1, v4 +; CGP-NEXT: v_lshlrev_b32_e32 v9, 12, v5 +; CGP-NEXT: v_add_i32_e32 v10, vcc, 1, v5 ; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v7 ; CGP-NEXT: v_sub_i32_e32 v1, vcc, v1, v9 -; CGP-NEXT: v_cmp_le_u32_e32 vcc, s8, v0 -; CGP-NEXT: v_cndmask_b32_e32 v3, v3, v8, vcc -; CGP-NEXT: v_subrev_i32_e64 v7, s[4:5], s8, v0 -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v1, v5 -; CGP-NEXT: v_cndmask_b32_e64 v4, v4, v10, s[4:5] -; CGP-NEXT: v_sub_i32_e64 v8, s[6:7], v1, v5 +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2 +; CGP-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc +; CGP-NEXT: v_sub_i32_e64 v7, s[4:5], v0, v2 +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v1, v2 +; CGP-NEXT: v_cndmask_b32_e64 v5, v5, v10, s[4:5] +; CGP-NEXT: v_sub_i32_e64 v8, s[6:7], v1, v2 ; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc -; CGP-NEXT: v_add_i32_e32 v7, vcc, 1, v3 +; CGP-NEXT: v_add_i32_e32 v7, vcc, 1, v4 ; CGP-NEXT: v_cndmask_b32_e64 v1, v1, v8, s[4:5] -; CGP-NEXT: 
v_add_i32_e32 v8, vcc, 1, v4 -; CGP-NEXT: v_cmp_le_u32_e32 vcc, s8, v0 -; CGP-NEXT: v_cndmask_b32_e32 v0, v3, v7, vcc -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v5 -; CGP-NEXT: v_cndmask_b32_e32 v1, v4, v8, vcc -; CGP-NEXT: v_xor_b32_e32 v0, v0, v2 +; CGP-NEXT: v_add_i32_e32 v8, vcc, 1, v5 +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2 +; CGP-NEXT: v_cndmask_b32_e32 v0, v4, v7, vcc +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v2 +; CGP-NEXT: v_cndmask_b32_e32 v1, v5, v8, vcc +; CGP-NEXT: v_xor_b32_e32 v0, v0, v3 ; CGP-NEXT: v_xor_b32_e32 v1, v1, v6 -; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 +; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v3 ; CGP-NEXT: v_sub_i32_e32 v1, vcc, v1, v6 ; CGP-NEXT: s_setpc_b64 s[30:31] %result = sdiv <2 x i32> %num, <i32 4096, i32 4096> @@ -435,30 +433,30 @@ ; CHECK-LABEL: v_sdiv_i32_oddk_denom: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_mov_b32 s6, 0x12d8fb -; CHECK-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; CHECK-NEXT: v_rcp_iflag_f32_e32 v2, 0x4996c7d8 -; CHECK-NEXT: v_mov_b32_e32 v3, 0xffed2705 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; CHECK-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 -; CHECK-NEXT: v_xor_b32_e32 v0, v0, v1 -; CHECK-NEXT: v_cvt_u32_f32_e32 v2, v2 -; CHECK-NEXT: v_mul_lo_u32 v3, v3, v2 -; CHECK-NEXT: v_mul_hi_u32 v3, v2, v3 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3 -; CHECK-NEXT: v_mul_hi_u32 v2, v0, v2 -; CHECK-NEXT: v_mul_lo_u32 v3, v2, s6 -; CHECK-NEXT: v_add_i32_e32 v4, vcc, 1, v2 -; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v3 -; CHECK-NEXT: v_cmp_le_u32_e32 vcc, s6, v0 -; CHECK-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; CHECK-NEXT: v_subrev_i32_e64 v3, s[4:5], s6, v0 -; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; CHECK-NEXT: v_add_i32_e32 v3, vcc, 1, v2 -; CHECK-NEXT: v_cmp_le_u32_e32 vcc, s6, v0 -; CHECK-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; CHECK-NEXT: v_xor_b32_e32 v0, v0, v1 -; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v1 +; CHECK-NEXT: v_mov_b32_e32 v1, 0x12d8fb +; CHECK-NEXT: v_ashrrev_i32_e32 v2, 31, v0 +; CHECK-NEXT: v_rcp_iflag_f32_e32 v3, 0x4996c7d8 +; CHECK-NEXT: v_mov_b32_e32 v4, 0xffed2705 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; CHECK-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 +; CHECK-NEXT: v_xor_b32_e32 v0, v0, v2 +; CHECK-NEXT: v_cvt_u32_f32_e32 v3, v3 +; CHECK-NEXT: v_mul_lo_u32 v4, v4, v3 +; CHECK-NEXT: v_mul_hi_u32 v4, v3, v4 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v4 +; CHECK-NEXT: v_mul_hi_u32 v3, v0, v3 +; CHECK-NEXT: v_mul_lo_u32 v4, v3, v1 +; CHECK-NEXT: v_add_i32_e32 v5, vcc, 1, v3 +; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v4 +; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v0, v1 +; CHECK-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc +; CHECK-NEXT: v_sub_i32_e64 v4, s[4:5], v0, v1 +; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc +; CHECK-NEXT: v_add_i32_e32 v4, vcc, 1, v3 +; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v0, v1 +; CHECK-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc +; CHECK-NEXT: v_xor_b32_e32 v0, v0, v2 +; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 ; CHECK-NEXT: s_setpc_b64 s[30:31] %result = sdiv i32 %num, 1235195 ret i32 %result @@ -519,53 +517,51 @@ ; CGP-LABEL: v_sdiv_v2i32_oddk_denom: ; CGP: ; %bb.0: ; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CGP-NEXT: s_mov_b32 s8, 0x12d8fb -; CGP-NEXT: v_ashrrev_i32_e32 v2, 31, v0 -; CGP-NEXT: v_rcp_iflag_f32_e32 v3, 0x4996c7d8 -; CGP-NEXT: s_mov_b32 s4, 0xffed2705 -; CGP-NEXT: v_mov_b32_e32 v4, 0xffed2705 -; CGP-NEXT: v_mov_b32_e32 v5, 0x12d8fb +; CGP-NEXT: v_mov_b32_e32 v2, 0x12d8fb +; CGP-NEXT: v_ashrrev_i32_e32 v3, 31, v0 +; CGP-NEXT: v_rcp_iflag_f32_e32 v4, 
0x4996c7d8 +; CGP-NEXT: v_mov_b32_e32 v5, 0xffed2705 ; CGP-NEXT: v_ashrrev_i32_e32 v6, 31, v1 ; CGP-NEXT: v_rcp_iflag_f32_e32 v7, 0x4996c7d8 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; CGP-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v3 +; CGP-NEXT: v_mul_f32_e32 v4, 0x4f7ffffe, v4 ; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v6 ; CGP-NEXT: v_mul_f32_e32 v7, 0x4f7ffffe, v7 -; CGP-NEXT: v_xor_b32_e32 v0, v0, v2 -; CGP-NEXT: v_cvt_u32_f32_e32 v3, v3 +; CGP-NEXT: v_xor_b32_e32 v0, v0, v3 +; CGP-NEXT: v_cvt_u32_f32_e32 v4, v4 ; CGP-NEXT: v_xor_b32_e32 v1, v1, v6 ; CGP-NEXT: v_cvt_u32_f32_e32 v7, v7 -; CGP-NEXT: v_mul_lo_u32 v8, s4, v3 -; CGP-NEXT: v_mul_lo_u32 v4, v4, v7 -; CGP-NEXT: v_mul_hi_u32 v8, v3, v8 -; CGP-NEXT: v_mul_hi_u32 v4, v7, v4 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v8 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v7, v4 -; CGP-NEXT: v_mul_hi_u32 v3, v0, v3 -; CGP-NEXT: v_mul_hi_u32 v4, v1, v4 -; CGP-NEXT: v_mul_lo_u32 v7, v3, s8 -; CGP-NEXT: v_add_i32_e32 v8, vcc, 1, v3 -; CGP-NEXT: v_mul_lo_u32 v9, v4, v5 -; CGP-NEXT: v_add_i32_e32 v10, vcc, 1, v4 +; CGP-NEXT: v_mul_lo_u32 v8, v5, v4 +; CGP-NEXT: v_mul_lo_u32 v5, v5, v7 +; CGP-NEXT: v_mul_hi_u32 v8, v4, v8 +; CGP-NEXT: v_mul_hi_u32 v5, v7, v5 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v8 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v7, v5 +; CGP-NEXT: v_mul_hi_u32 v4, v0, v4 +; CGP-NEXT: v_mul_hi_u32 v5, v1, v5 +; CGP-NEXT: v_mul_lo_u32 v7, v4, v2 +; CGP-NEXT: v_add_i32_e32 v8, vcc, 1, v4 +; CGP-NEXT: v_mul_lo_u32 v9, v5, v2 +; CGP-NEXT: v_add_i32_e32 v10, vcc, 1, v5 ; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v7 ; CGP-NEXT: v_sub_i32_e32 v1, vcc, v1, v9 -; CGP-NEXT: v_cmp_le_u32_e32 vcc, s8, v0 -; CGP-NEXT: v_cndmask_b32_e32 v3, v3, v8, vcc -; CGP-NEXT: v_subrev_i32_e64 v7, s[4:5], s8, v0 -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v1, v5 -; CGP-NEXT: v_cndmask_b32_e64 v4, v4, v10, s[4:5] -; CGP-NEXT: v_sub_i32_e64 v8, s[6:7], v1, v5 +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2 +; CGP-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc +; CGP-NEXT: v_sub_i32_e64 v7, s[4:5], v0, v2 +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v1, v2 +; CGP-NEXT: v_cndmask_b32_e64 v5, v5, v10, s[4:5] +; CGP-NEXT: v_sub_i32_e64 v8, s[6:7], v1, v2 ; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc -; CGP-NEXT: v_add_i32_e32 v7, vcc, 1, v3 +; CGP-NEXT: v_add_i32_e32 v7, vcc, 1, v4 ; CGP-NEXT: v_cndmask_b32_e64 v1, v1, v8, s[4:5] -; CGP-NEXT: v_add_i32_e32 v8, vcc, 1, v4 -; CGP-NEXT: v_cmp_le_u32_e32 vcc, s8, v0 -; CGP-NEXT: v_cndmask_b32_e32 v0, v3, v7, vcc -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v5 -; CGP-NEXT: v_cndmask_b32_e32 v1, v4, v8, vcc -; CGP-NEXT: v_xor_b32_e32 v0, v0, v2 +; CGP-NEXT: v_add_i32_e32 v8, vcc, 1, v5 +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2 +; CGP-NEXT: v_cndmask_b32_e32 v0, v4, v7, vcc +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v2 +; CGP-NEXT: v_cndmask_b32_e32 v1, v5, v8, vcc +; CGP-NEXT: v_xor_b32_e32 v0, v0, v3 ; CGP-NEXT: v_xor_b32_e32 v1, v1, v6 -; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 +; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v3 ; CGP-NEXT: v_sub_i32_e32 v1, vcc, v1, v6 ; CGP-NEXT: s_setpc_b64 s[30:31] %result = sdiv <2 x i32> %num, <i32 1235195, i32 1235195> @@ -615,137 +611,137 @@ ; GISEL-LABEL: v_sdiv_v2i32_pow2_shl_denom: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: s_movk_i32 s4, 0x1000 -; GISEL-NEXT: v_ashrrev_i32_e32 v4, 31, v0 -; GISEL-NEXT: v_ashrrev_i32_e32 v5, 31, v1 -; GISEL-NEXT: v_lshl_b32_e32 v2, s4, v2 -; GISEL-NEXT: v_lshl_b32_e32 v3, s4, v3 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v4 -; GISEL-NEXT: v_add_i32_e32 
v1, vcc, v1, v5 -; GISEL-NEXT: v_ashrrev_i32_e32 v6, 31, v2 -; GISEL-NEXT: v_xor_b32_e32 v0, v0, v4 +; GISEL-NEXT: v_mov_b32_e32 v4, 0x1000 +; GISEL-NEXT: v_ashrrev_i32_e32 v5, 31, v0 +; GISEL-NEXT: v_ashrrev_i32_e32 v6, 31, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v2, v2, v4 +; GISEL-NEXT: v_lshlrev_b32_e32 v3, v3, v4 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v5 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v6 +; GISEL-NEXT: v_ashrrev_i32_e32 v4, 31, v2 +; GISEL-NEXT: v_xor_b32_e32 v0, v0, v5 ; GISEL-NEXT: v_ashrrev_i32_e32 v7, 31, v3 -; GISEL-NEXT: v_xor_b32_e32 v1, v1, v5 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v6 -; GISEL-NEXT: v_xor_b32_e32 v4, v4, v6 +; GISEL-NEXT: v_xor_b32_e32 v1, v1, v6 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v4 +; GISEL-NEXT: v_xor_b32_e32 v5, v5, v4 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v7 -; GISEL-NEXT: v_xor_b32_e32 v5, v5, v7 -; GISEL-NEXT: v_xor_b32_e32 v2, v2, v6 +; GISEL-NEXT: v_xor_b32_e32 v6, v6, v7 +; GISEL-NEXT: v_xor_b32_e32 v2, v2, v4 ; GISEL-NEXT: v_xor_b32_e32 v3, v3, v7 -; GISEL-NEXT: v_cvt_f32_u32_e32 v6, v2 +; GISEL-NEXT: v_cvt_f32_u32_e32 v4, v2 ; GISEL-NEXT: v_sub_i32_e32 v7, vcc, 0, v2 ; GISEL-NEXT: v_cvt_f32_u32_e32 v8, v3 ; GISEL-NEXT: v_sub_i32_e32 v9, vcc, 0, v3 -; GISEL-NEXT: v_rcp_iflag_f32_e32 v6, v6 +; GISEL-NEXT: v_rcp_iflag_f32_e32 v4, v4 ; GISEL-NEXT: v_rcp_iflag_f32_e32 v8, v8 -; GISEL-NEXT: v_mul_f32_e32 v6, 0x4f7ffffe, v6 +; GISEL-NEXT: v_mul_f32_e32 v4, 0x4f7ffffe, v4 ; GISEL-NEXT: v_mul_f32_e32 v8, 0x4f7ffffe, v8 -; GISEL-NEXT: v_cvt_u32_f32_e32 v6, v6 +; GISEL-NEXT: v_cvt_u32_f32_e32 v4, v4 ; GISEL-NEXT: v_cvt_u32_f32_e32 v8, v8 -; GISEL-NEXT: v_mul_lo_u32 v7, v7, v6 +; GISEL-NEXT: v_mul_lo_u32 v7, v7, v4 ; GISEL-NEXT: v_mul_lo_u32 v9, v9, v8 -; GISEL-NEXT: v_mul_hi_u32 v7, v6, v7 +; GISEL-NEXT: v_mul_hi_u32 v7, v4, v7 ; GISEL-NEXT: v_mul_hi_u32 v9, v8, v9 -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v7 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v7 ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v8, v9 -; GISEL-NEXT: v_mul_hi_u32 v6, v0, v6 +; GISEL-NEXT: v_mul_hi_u32 v4, v0, v4 ; GISEL-NEXT: v_mul_hi_u32 v7, v1, v7 -; GISEL-NEXT: v_mul_lo_u32 v8, v6, v2 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, 1, v6 +; GISEL-NEXT: v_mul_lo_u32 v8, v4, v2 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, 1, v4 ; GISEL-NEXT: v_mul_lo_u32 v10, v7, v3 ; GISEL-NEXT: v_add_i32_e32 v11, vcc, 1, v7 ; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v8 ; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v1, v10 ; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2 -; GISEL-NEXT: v_cndmask_b32_e32 v6, v6, v9, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v4, v4, v9, vcc ; GISEL-NEXT: v_sub_i32_e64 v8, s[4:5], v0, v2 ; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v1, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v7, v7, v11, s[4:5] ; GISEL-NEXT: v_sub_i32_e64 v9, s[6:7], v1, v3 ; GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc -; GISEL-NEXT: v_add_i32_e32 v8, vcc, 1, v6 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, 1, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v1, v1, v9, s[4:5] ; GISEL-NEXT: v_add_i32_e32 v9, vcc, 1, v7 ; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2 -; GISEL-NEXT: v_cndmask_b32_e32 v0, v6, v8, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v0, v4, v8, vcc ; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v1, v3 ; GISEL-NEXT: v_cndmask_b32_e32 v1, v7, v9, vcc -; GISEL-NEXT: v_xor_b32_e32 v0, v0, v4 -; GISEL-NEXT: v_xor_b32_e32 v1, v1, v5 -; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v4 -; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v1, v5 +; GISEL-NEXT: v_xor_b32_e32 v0, v0, v5 +; GISEL-NEXT: v_xor_b32_e32 v1, v1, v6 +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v5 +; GISEL-NEXT: 
v_sub_i32_e32 v1, vcc, v1, v6 ; GISEL-NEXT: s_setpc_b64 s[30:31] ; ; CGP-LABEL: v_sdiv_v2i32_pow2_shl_denom: ; CGP: ; %bb.0: ; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CGP-NEXT: s_movk_i32 s4, 0x1000 -; CGP-NEXT: v_ashrrev_i32_e32 v4, 31, v0 -; CGP-NEXT: v_ashrrev_i32_e32 v5, 31, v1 -; CGP-NEXT: v_lshl_b32_e32 v2, s4, v2 -; CGP-NEXT: v_lshl_b32_e32 v3, s4, v3 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v4 -; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v5 -; CGP-NEXT: v_ashrrev_i32_e32 v6, 31, v2 -; CGP-NEXT: v_xor_b32_e32 v0, v0, v4 +; CGP-NEXT: v_mov_b32_e32 v4, 0x1000 +; CGP-NEXT: v_ashrrev_i32_e32 v5, 31, v0 +; CGP-NEXT: v_ashrrev_i32_e32 v6, 31, v1 +; CGP-NEXT: v_lshlrev_b32_e32 v2, v2, v4 +; CGP-NEXT: v_lshlrev_b32_e32 v3, v3, v4 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v5 +; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v6 +; CGP-NEXT: v_ashrrev_i32_e32 v4, 31, v2 +; CGP-NEXT: v_xor_b32_e32 v0, v0, v5 ; CGP-NEXT: v_ashrrev_i32_e32 v7, 31, v3 -; CGP-NEXT: v_xor_b32_e32 v1, v1, v5 -; CGP-NEXT: v_xor_b32_e32 v4, v4, v6 -; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v6 -; CGP-NEXT: v_xor_b32_e32 v5, v5, v7 +; CGP-NEXT: v_xor_b32_e32 v1, v1, v6 +; CGP-NEXT: v_xor_b32_e32 v5, v5, v4 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v4 +; CGP-NEXT: v_xor_b32_e32 v6, v6, v7 ; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v7 -; CGP-NEXT: v_xor_b32_e32 v2, v2, v6 +; CGP-NEXT: v_xor_b32_e32 v2, v2, v4 ; CGP-NEXT: v_xor_b32_e32 v3, v3, v7 -; CGP-NEXT: v_cvt_f32_u32_e32 v6, v2 +; CGP-NEXT: v_cvt_f32_u32_e32 v4, v2 ; CGP-NEXT: v_sub_i32_e32 v7, vcc, 0, v2 ; CGP-NEXT: v_cvt_f32_u32_e32 v8, v3 ; CGP-NEXT: v_sub_i32_e32 v9, vcc, 0, v3 -; CGP-NEXT: v_rcp_f32_e32 v6, v6 +; CGP-NEXT: v_rcp_f32_e32 v4, v4 ; CGP-NEXT: v_rcp_f32_e32 v8, v8 -; CGP-NEXT: v_mul_f32_e32 v6, 0x4f7ffffe, v6 +; CGP-NEXT: v_mul_f32_e32 v4, 0x4f7ffffe, v4 ; CGP-NEXT: v_mul_f32_e32 v8, 0x4f7ffffe, v8 -; CGP-NEXT: v_cvt_u32_f32_e32 v6, v6 +; CGP-NEXT: v_cvt_u32_f32_e32 v4, v4 ; CGP-NEXT: v_cvt_u32_f32_e32 v8, v8 -; CGP-NEXT: v_mul_lo_u32 v7, v7, v6 +; CGP-NEXT: v_mul_lo_u32 v7, v7, v4 ; CGP-NEXT: v_mul_lo_u32 v9, v9, v8 ; CGP-NEXT: v_mul_lo_u32 v10, 0, v7 -; CGP-NEXT: v_mul_hi_u32 v7, v6, v7 +; CGP-NEXT: v_mul_hi_u32 v7, v4, v7 ; CGP-NEXT: v_mul_lo_u32 v11, 0, v9 ; CGP-NEXT: v_mul_hi_u32 v9, v8, v9 ; CGP-NEXT: v_add_i32_e32 v7, vcc, v10, v7 ; CGP-NEXT: v_add_i32_e32 v9, vcc, v11, v9 -; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v7 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v7 ; CGP-NEXT: v_add_i32_e32 v7, vcc, v8, v9 -; CGP-NEXT: v_mul_lo_u32 v8, 0, v6 -; CGP-NEXT: v_mul_hi_u32 v6, v0, v6 +; CGP-NEXT: v_mul_lo_u32 v8, 0, v4 +; CGP-NEXT: v_mul_hi_u32 v4, v0, v4 ; CGP-NEXT: v_mul_lo_u32 v9, 0, v7 ; CGP-NEXT: v_mul_hi_u32 v7, v1, v7 -; CGP-NEXT: v_add_i32_e32 v6, vcc, v8, v6 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v8, v4 ; CGP-NEXT: v_add_i32_e32 v7, vcc, v9, v7 -; CGP-NEXT: v_mul_lo_u32 v8, v6, v2 -; CGP-NEXT: v_add_i32_e32 v9, vcc, 1, v6 +; CGP-NEXT: v_mul_lo_u32 v8, v4, v2 +; CGP-NEXT: v_add_i32_e32 v9, vcc, 1, v4 ; CGP-NEXT: v_mul_lo_u32 v10, v7, v3 ; CGP-NEXT: v_add_i32_e32 v11, vcc, 1, v7 ; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v8 ; CGP-NEXT: v_sub_i32_e32 v1, vcc, v1, v10 ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2 -; CGP-NEXT: v_cndmask_b32_e32 v6, v6, v9, vcc +; CGP-NEXT: v_cndmask_b32_e32 v4, v4, v9, vcc ; CGP-NEXT: v_sub_i32_e64 v8, s[4:5], v0, v2 ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v1, v3 ; CGP-NEXT: v_cndmask_b32_e64 v7, v7, v11, s[4:5] ; CGP-NEXT: v_sub_i32_e64 v9, s[6:7], v1, v3 ; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc -; CGP-NEXT: v_add_i32_e32 v8, vcc, 1, v6 +; 
CGP-NEXT: v_add_i32_e32 v8, vcc, 1, v4 ; CGP-NEXT: v_cndmask_b32_e64 v1, v1, v9, s[4:5] ; CGP-NEXT: v_add_i32_e32 v9, vcc, 1, v7 ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2 -; CGP-NEXT: v_cndmask_b32_e32 v0, v6, v8, vcc +; CGP-NEXT: v_cndmask_b32_e32 v0, v4, v8, vcc ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v3 ; CGP-NEXT: v_cndmask_b32_e32 v1, v7, v9, vcc -; CGP-NEXT: v_xor_b32_e32 v0, v0, v4 -; CGP-NEXT: v_xor_b32_e32 v1, v1, v5 -; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v4 -; CGP-NEXT: v_sub_i32_e32 v1, vcc, v1, v5 +; CGP-NEXT: v_xor_b32_e32 v0, v0, v5 +; CGP-NEXT: v_xor_b32_e32 v1, v1, v6 +; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v5 +; CGP-NEXT: v_sub_i32_e32 v1, vcc, v1, v6 ; CGP-NEXT: s_setpc_b64 s[30:31] %shl.y = shl <2 x i32> <i32 4096, i32 4096>, %y %r = sdiv <2 x i32> %x, %shl.y @@ -756,9 +752,9 @@ ; GISEL-LABEL: v_sdiv_i32_24bit: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: s_mov_b32 s4, 0xffffff -; GISEL-NEXT: v_and_b32_e32 v0, s4, v0 -; GISEL-NEXT: v_and_b32_e32 v1, s4, v1 +; GISEL-NEXT: v_mov_b32_e32 v2, 0xffffff +; GISEL-NEXT: v_and_b32_e32 v0, v0, v2 +; GISEL-NEXT: v_and_b32_e32 v1, v1, v2 ; GISEL-NEXT: v_ashrrev_i32_e32 v2, 31, v0 ; GISEL-NEXT: v_ashrrev_i32_e32 v3, 31, v1 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v2 @@ -792,9 +788,9 @@ ; CGP-LABEL: v_sdiv_i32_24bit: ; CGP: ; %bb.0: ; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CGP-NEXT: s_mov_b32 s4, 0xffffff -; CGP-NEXT: v_and_b32_e32 v0, s4, v0 -; CGP-NEXT: v_and_b32_e32 v1, s4, v1 +; CGP-NEXT: v_mov_b32_e32 v2, 0xffffff +; CGP-NEXT: v_and_b32_e32 v0, v0, v2 +; CGP-NEXT: v_and_b32_e32 v1, v1, v2 ; CGP-NEXT: v_cvt_f32_u32_e32 v2, v1 ; CGP-NEXT: v_sub_i32_e32 v3, vcc, 0, v1 ; CGP-NEXT: v_rcp_f32_e32 v2, v2 Index: llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll @@ -176,17 +176,17 @@ ; CHECK-NEXT: v_mul_hi_u32 v1, v0, v1 ; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; CHECK-NEXT: v_mul_hi_u32 v0, v4, v0 -; CHECK-NEXT: v_mul_lo_u32 v1, v0, v2 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, 1, v0 -; CHECK-NEXT: v_sub_i32_e32 v1, vcc, v4, v1 -; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v1, v2 -; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; CHECK-NEXT: v_sub_i32_e64 v3, s[4:5], v1, v2 -; CHECK-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; CHECK-NEXT: v_add_i32_e32 v3, vcc, 1, v0 -; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v1, v2 -; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; CHECK-NEXT: v_mov_b32_e32 v1, 0 +; CHECK-NEXT: v_mul_lo_u32 v3, v0, v2 +; CHECK-NEXT: v_add_i32_e32 v5, vcc, 1, v0 +; CHECK-NEXT: v_sub_i32_e32 v3, vcc, v4, v3 +; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v3, v2 +; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc +; CHECK-NEXT: v_sub_i32_e64 v4, s[4:5], v3, v2 +; CHECK-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc +; CHECK-NEXT: v_add_i32_e32 v4, vcc, 1, v0 +; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v3, v2 +; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc ; CHECK-NEXT: BB0_4: ; CHECK-NEXT: s_or_b64 exec, exec, s[6:7] ; CHECK-NEXT: s_setpc_b64 s[30:31] @@ -862,17 +862,17 @@ ; CGP-NEXT: v_mul_hi_u32 v1, v0, v1 ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; CGP-NEXT: v_mul_hi_u32 v0, v10, v0 -; CGP-NEXT: v_mul_lo_u32 v1, v0, v4 -; CGP-NEXT: v_add_i32_e32 v2, vcc, 1, v0 -; CGP-NEXT: v_sub_i32_e32 v1, vcc, v10, v1 -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v4 -; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; CGP-NEXT: v_sub_i32_e64 v2, s[4:5], v1, v4 -; CGP-NEXT: 
v_cndmask_b32_e32 v1, v1, v2, vcc -; CGP-NEXT: v_add_i32_e32 v2, vcc, 1, v0 -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v4 -; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; CGP-NEXT: v_mov_b32_e32 v1, 0 +; CGP-NEXT: v_mul_lo_u32 v2, v0, v4 +; CGP-NEXT: v_add_i32_e32 v3, vcc, 1, v0 +; CGP-NEXT: v_sub_i32_e32 v2, vcc, v10, v2 +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v2, v4 +; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc +; CGP-NEXT: v_sub_i32_e64 v3, s[4:5], v2, v4 +; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc +; CGP-NEXT: v_add_i32_e32 v3, vcc, 1, v0 +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v2, v4 +; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; CGP-NEXT: BB2_4: ; CGP-NEXT: s_or_b64 exec, exec, s[6:7] ; CGP-NEXT: v_or_b32_e32 v3, v9, v7 @@ -1041,17 +1041,17 @@ ; CGP-NEXT: v_mul_hi_u32 v3, v2, v3 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; CGP-NEXT: v_mul_hi_u32 v2, v8, v2 -; CGP-NEXT: v_mul_lo_u32 v3, v2, v6 -; CGP-NEXT: v_add_i32_e32 v4, vcc, 1, v2 -; CGP-NEXT: v_sub_i32_e32 v3, vcc, v8, v3 -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v3, v6 -; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; CGP-NEXT: v_sub_i32_e64 v4, s[4:5], v3, v6 -; CGP-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, 1, v2 -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v3, v6 -; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; CGP-NEXT: v_mov_b32_e32 v3, 0 +; CGP-NEXT: v_mul_lo_u32 v4, v2, v6 +; CGP-NEXT: v_add_i32_e32 v5, vcc, 1, v2 +; CGP-NEXT: v_sub_i32_e32 v4, vcc, v8, v4 +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v4, v6 +; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc +; CGP-NEXT: v_sub_i32_e64 v5, s[4:5], v4, v6 +; CGP-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc +; CGP-NEXT: v_add_i32_e32 v5, vcc, 1, v2 +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v4, v6 +; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc ; CGP-NEXT: BB2_8: ; CGP-NEXT: s_or_b64 exec, exec, s[6:7] ; CGP-NEXT: s_setpc_b64 s[30:31] @@ -1063,138 +1063,138 @@ ; CHECK-LABEL: v_sdiv_i64_pow2k_denom: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_cvt_f32_u32_e32 v3, 0x1000 -; CHECK-NEXT: v_cvt_f32_ubyte0_e32 v4, 0 -; CHECK-NEXT: s_movk_i32 s6, 0xf000 +; CHECK-NEXT: v_mov_b32_e32 v3, 0x1000 +; CHECK-NEXT: v_cvt_f32_u32_e32 v4, v3 +; CHECK-NEXT: v_cvt_f32_ubyte0_e32 v5, 0 +; CHECK-NEXT: v_mov_b32_e32 v6, 0xfffff000 ; CHECK-NEXT: v_ashrrev_i32_e32 v2, 31, v1 -; CHECK-NEXT: v_mac_f32_e32 v3, 0x4f800000, v4 -; CHECK-NEXT: v_rcp_iflag_f32_e32 v3, v3 +; CHECK-NEXT: v_mac_f32_e32 v4, 0x4f800000, v5 +; CHECK-NEXT: v_rcp_iflag_f32_e32 v4, v4 ; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v1, v2, vcc -; CHECK-NEXT: v_mul_f32_e32 v3, 0x5f7ffffc, v3 -; CHECK-NEXT: v_mul_f32_e32 v4, 0x2f800000, v3 -; CHECK-NEXT: v_trunc_f32_e32 v4, v4 -; CHECK-NEXT: v_mac_f32_e32 v3, 0xcf800000, v4 -; CHECK-NEXT: v_cvt_u32_f32_e32 v3, v3 +; CHECK-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 +; CHECK-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4 +; CHECK-NEXT: v_trunc_f32_e32 v5, v5 +; CHECK-NEXT: v_mac_f32_e32 v4, 0xcf800000, v5 ; CHECK-NEXT: v_cvt_u32_f32_e32 v4, v4 +; CHECK-NEXT: v_cvt_u32_f32_e32 v5, v5 ; CHECK-NEXT: v_xor_b32_e32 v0, v0, v2 ; CHECK-NEXT: v_xor_b32_e32 v1, v1, v2 -; CHECK-NEXT: v_mul_lo_u32 v5, -1, v3 -; CHECK-NEXT: v_mul_lo_u32 v6, s6, v4 -; CHECK-NEXT: v_mul_hi_u32 v8, s6, v3 -; CHECK-NEXT: v_mul_lo_u32 v7, s6, v3 -; CHECK-NEXT: s_bfe_i32 s7, -1, 0x10000 -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v6 -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v8 -; CHECK-NEXT: v_mul_lo_u32 v6, v4, v7 -; CHECK-NEXT: v_mul_lo_u32 v8, v3, v5 -; CHECK-NEXT: 
v_mul_hi_u32 v9, v3, v7 -; CHECK-NEXT: v_mul_hi_u32 v7, v4, v7 -; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v8 -; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v9 -; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CHECK-NEXT: v_mul_lo_u32 v9, v4, v5 -; CHECK-NEXT: v_add_i32_e32 v6, vcc, v8, v6 -; CHECK-NEXT: v_mul_hi_u32 v8, v3, v5 -; CHECK-NEXT: v_add_i32_e32 v7, vcc, v9, v7 -; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; CHECK-NEXT: v_mul_lo_u32 v7, -1, v4 +; CHECK-NEXT: v_mul_lo_u32 v8, v6, v5 +; CHECK-NEXT: v_mul_hi_u32 v10, v6, v4 +; CHECK-NEXT: v_mul_lo_u32 v9, v6, v4 +; CHECK-NEXT: s_bfe_i32 s6, -1, 0x10000 ; CHECK-NEXT: v_add_i32_e32 v7, vcc, v7, v8 +; CHECK-NEXT: v_add_i32_e32 v7, vcc, v7, v10 +; CHECK-NEXT: v_mul_lo_u32 v8, v5, v9 +; CHECK-NEXT: v_mul_lo_u32 v10, v4, v7 +; CHECK-NEXT: v_mul_hi_u32 v11, v4, v9 +; CHECK-NEXT: v_mul_hi_u32 v9, v5, v9 +; CHECK-NEXT: v_add_i32_e32 v8, vcc, v8, v10 +; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v8, vcc, v8, v11 ; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; CHECK-NEXT: v_mul_lo_u32 v11, v5, v7 +; CHECK-NEXT: v_add_i32_e32 v8, vcc, v10, v8 +; CHECK-NEXT: v_mul_hi_u32 v10, v4, v7 +; CHECK-NEXT: v_add_i32_e32 v9, vcc, v11, v9 +; CHECK-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v9, vcc, v9, v10 +; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v10, vcc, v11, v10 +; CHECK-NEXT: v_mul_hi_u32 v7, v5, v7 ; CHECK-NEXT: v_add_i32_e32 v8, vcc, v9, v8 -; CHECK-NEXT: v_mul_hi_u32 v5, v4, v5 -; CHECK-NEXT: v_add_i32_e32 v6, vcc, v7, v6 -; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v7, vcc, v8, v7 -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v7 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v6 -; CHECK-NEXT: v_addc_u32_e64 v6, s[4:5], v4, v5, vcc -; CHECK-NEXT: v_mul_lo_u32 v7, -1, v3 -; CHECK-NEXT: v_mul_lo_u32 v8, s6, v6 -; CHECK-NEXT: v_mul_hi_u32 v10, s6, v3 -; CHECK-NEXT: v_mul_lo_u32 v9, s6, v3 -; CHECK-NEXT: v_add_i32_e64 v4, s[4:5], v4, v5 -; CHECK-NEXT: v_add_i32_e64 v7, s[4:5], v7, v8 -; CHECK-NEXT: v_add_i32_e64 v7, s[4:5], v7, v10 -; CHECK-NEXT: v_mul_lo_u32 v8, v6, v9 -; CHECK-NEXT: v_mul_lo_u32 v10, v3, v7 -; CHECK-NEXT: v_mul_hi_u32 v5, v3, v9 -; CHECK-NEXT: v_mul_hi_u32 v9, v6, v9 -; CHECK-NEXT: s_movk_i32 s6, 0x1000 -; CHECK-NEXT: v_add_i32_e64 v8, s[4:5], v8, v10 -; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[4:5] -; CHECK-NEXT: v_add_i32_e64 v5, s[4:5], v8, v5 -; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[4:5] -; CHECK-NEXT: v_mul_lo_u32 v8, v6, v7 -; CHECK-NEXT: v_add_i32_e64 v5, s[4:5], v10, v5 -; CHECK-NEXT: v_mul_hi_u32 v10, v3, v7 -; CHECK-NEXT: v_add_i32_e64 v8, s[4:5], v8, v9 -; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[4:5] -; CHECK-NEXT: v_add_i32_e64 v8, s[4:5], v8, v10 +; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v9, vcc, v10, v9 +; CHECK-NEXT: v_add_i32_e32 v7, vcc, v7, v9 +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v8 +; CHECK-NEXT: v_addc_u32_e64 v8, s[4:5], v5, v7, vcc +; CHECK-NEXT: v_mul_lo_u32 v9, -1, v4 +; CHECK-NEXT: v_mul_lo_u32 v10, v6, v8 +; CHECK-NEXT: v_mul_lo_u32 v11, v6, v4 +; CHECK-NEXT: v_mul_hi_u32 v6, v6, v4 +; CHECK-NEXT: v_add_i32_e64 v5, s[4:5], v5, v7 +; CHECK-NEXT: v_add_i32_e64 v9, s[4:5], v9, v10 +; CHECK-NEXT: v_add_i32_e64 v6, s[4:5], v9, v6 +; CHECK-NEXT: v_mul_lo_u32 v9, v8, v11 +; CHECK-NEXT: v_mul_lo_u32 v10, v4, v6 +; CHECK-NEXT: v_mul_hi_u32 v7, v4, v11 +; CHECK-NEXT: v_mul_hi_u32 v11, v8, v11 +; CHECK-NEXT: 
v_add_i32_e64 v9, s[4:5], v9, v10 ; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[4:5] +; CHECK-NEXT: v_add_i32_e64 v7, s[4:5], v9, v7 +; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[4:5] +; CHECK-NEXT: v_mul_lo_u32 v9, v8, v6 +; CHECK-NEXT: v_add_i32_e64 v7, s[4:5], v10, v7 +; CHECK-NEXT: v_mul_hi_u32 v10, v4, v6 +; CHECK-NEXT: v_add_i32_e64 v9, s[4:5], v9, v11 +; CHECK-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[4:5] ; CHECK-NEXT: v_add_i32_e64 v9, s[4:5], v9, v10 -; CHECK-NEXT: v_mul_hi_u32 v6, v6, v7 -; CHECK-NEXT: v_add_i32_e64 v5, s[4:5], v8, v5 -; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[4:5] -; CHECK-NEXT: v_add_i32_e64 v7, s[4:5], v9, v8 -; CHECK-NEXT: v_add_i32_e64 v6, s[4:5], v6, v7 -; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v5 -; CHECK-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc -; CHECK-NEXT: v_mul_lo_u32 v5, v1, v3 -; CHECK-NEXT: v_mul_lo_u32 v6, v0, v4 -; CHECK-NEXT: v_mul_hi_u32 v7, v0, v3 -; CHECK-NEXT: v_mul_hi_u32 v3, v1, v3 -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v6 +; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[4:5] +; CHECK-NEXT: v_add_i32_e64 v10, s[4:5], v11, v10 +; CHECK-NEXT: v_mul_hi_u32 v6, v8, v6 +; CHECK-NEXT: v_add_i32_e64 v7, s[4:5], v9, v7 +; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[4:5] +; CHECK-NEXT: v_add_i32_e64 v8, s[4:5], v10, v9 +; CHECK-NEXT: v_add_i32_e64 v6, s[4:5], v6, v8 +; CHECK-NEXT: v_addc_u32_e32 v5, vcc, v5, v6, vcc +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v7 +; CHECK-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; CHECK-NEXT: v_mul_lo_u32 v6, v1, v4 +; CHECK-NEXT: v_mul_lo_u32 v7, v0, v5 +; CHECK-NEXT: v_mul_hi_u32 v8, v0, v4 +; CHECK-NEXT: v_mul_hi_u32 v4, v1, v4 +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v7 +; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v8 ; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v7 -; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CHECK-NEXT: v_mul_lo_u32 v7, v1, v4 -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; CHECK-NEXT: v_mul_hi_u32 v6, v0, v4 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v7, v3 +; CHECK-NEXT: v_mul_lo_u32 v8, v1, v5 +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v7, v6 +; CHECK-NEXT: v_mul_hi_u32 v7, v0, v5 +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v8, v4 +; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v7 ; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v6 +; CHECK-NEXT: v_add_i32_e32 v7, vcc, v8, v7 +; CHECK-NEXT: v_mul_hi_u32 v5, v1, v5 +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v6 ; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v6, vcc, v7, v6 -; CHECK-NEXT: v_mul_hi_u32 v4, v1, v4 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v5 -; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v5 -; CHECK-NEXT: v_mul_lo_u32 v5, 0, v3 -; CHECK-NEXT: v_mul_lo_u32 v6, s6, v4 -; CHECK-NEXT: v_mul_hi_u32 v8, s6, v3 -; CHECK-NEXT: v_mul_lo_u32 v7, s6, v3 ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v6 -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v8 -; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v7 -; CHECK-NEXT: v_subb_u32_e64 v6, s[4:5], v1, v5, vcc -; CHECK-NEXT: v_sub_i32_e64 v1, s[4:5], v1, v5 +; CHECK-NEXT: v_mul_lo_u32 v6, 0, v4 +; CHECK-NEXT: v_mul_lo_u32 v7, v3, v5 +; CHECK-NEXT: v_mul_hi_u32 v9, v3, v4 +; CHECK-NEXT: v_mul_lo_u32 v8, v3, v4 +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v7 +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v9 +; 
CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v8 +; CHECK-NEXT: v_subb_u32_e64 v7, s[4:5], v1, v6, vcc +; CHECK-NEXT: v_sub_i32_e64 v1, s[4:5], v1, v6 ; CHECK-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc -; CHECK-NEXT: v_cmp_le_u32_e64 s[4:5], s6, v0 -; CHECK-NEXT: v_subrev_i32_e32 v0, vcc, s6, v0 +; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v3 +; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v3 ; CHECK-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc -; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[4:5] -; CHECK-NEXT: v_mov_b32_e32 v7, s7 -; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v6 -; CHECK-NEXT: v_add_i32_e32 v6, vcc, 1, v3 -; CHECK-NEXT: v_cndmask_b32_e64 v5, v7, v5, s[4:5] -; CHECK-NEXT: v_addc_u32_e32 v7, vcc, 0, v4, vcc +; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5] +; CHECK-NEXT: v_mov_b32_e32 v8, s6 +; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v7 +; CHECK-NEXT: v_add_i32_e32 v7, vcc, 1, v4 +; CHECK-NEXT: v_cndmask_b32_e64 v6, v8, v6, s[4:5] +; CHECK-NEXT: v_addc_u32_e32 v8, vcc, 0, v5, vcc ; CHECK-NEXT: s_bfe_i32 s4, -1, 0x10000 -; CHECK-NEXT: v_cmp_le_u32_e32 vcc, s6, v0 +; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v0, v3 ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc -; CHECK-NEXT: v_mov_b32_e32 v8, s4 +; CHECK-NEXT: v_mov_b32_e32 v3, s4 ; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; CHECK-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc -; CHECK-NEXT: v_add_i32_e32 v1, vcc, 1, v6 -; CHECK-NEXT: v_addc_u32_e32 v8, vcc, 0, v7, vcc -; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; CHECK-NEXT: v_cndmask_b32_e32 v0, v6, v1, vcc -; CHECK-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc -; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 ; CHECK-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc -; CHECK-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc +; CHECK-NEXT: v_add_i32_e32 v1, vcc, 1, v7 +; CHECK-NEXT: v_addc_u32_e32 v3, vcc, 0, v8, vcc +; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; CHECK-NEXT: v_cndmask_b32_e32 v0, v7, v1, vcc +; CHECK-NEXT: v_cndmask_b32_e32 v1, v8, v3, vcc +; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; CHECK-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc +; CHECK-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc ; CHECK-NEXT: v_xor_b32_e32 v0, v0, v2 ; CHECK-NEXT: v_xor_b32_e32 v1, v1, v2 ; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 @@ -1508,275 +1508,275 @@ ; CGP-LABEL: v_sdiv_v2i64_pow2k_denom: ; CGP: ; %bb.0: ; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CGP-NEXT: v_cvt_f32_u32_e32 v5, 0x1000 -; CGP-NEXT: v_cvt_f32_ubyte0_e32 v6, 0 -; CGP-NEXT: s_movk_i32 s6, 0xf000 -; CGP-NEXT: v_ashrrev_i32_e32 v4, 31, v1 -; CGP-NEXT: v_mov_b32_e32 v7, v5 -; CGP-NEXT: v_mac_f32_e32 v7, 0x4f800000, v6 -; CGP-NEXT: v_rcp_iflag_f32_e32 v7, v7 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v4 -; CGP-NEXT: v_addc_u32_e32 v1, vcc, v1, v4, vcc -; CGP-NEXT: v_mul_f32_e32 v7, 0x5f7ffffc, v7 -; CGP-NEXT: v_mul_f32_e32 v8, 0x2f800000, v7 -; CGP-NEXT: v_trunc_f32_e32 v8, v8 -; CGP-NEXT: v_mac_f32_e32 v7, 0xcf800000, v8 -; CGP-NEXT: v_cvt_u32_f32_e32 v7, v7 +; CGP-NEXT: v_mov_b32_e32 v4, 0x1000 +; CGP-NEXT: v_cvt_f32_u32_e32 v6, v4 +; CGP-NEXT: v_cvt_f32_ubyte0_e32 v7, 0 +; CGP-NEXT: v_ashrrev_i32_e32 v5, 31, v1 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v5 +; CGP-NEXT: v_mov_b32_e32 v8, v6 +; CGP-NEXT: v_mac_f32_e32 v8, 0x4f800000, v7 +; CGP-NEXT: v_rcp_iflag_f32_e32 v8, v8 +; CGP-NEXT: v_xor_b32_e32 v10, v0, v5 +; CGP-NEXT: v_mov_b32_e32 v0, 0xfffff000 +; CGP-NEXT: v_addc_u32_e32 v1, vcc, v1, v5, vcc +; CGP-NEXT: v_mul_f32_e32 v8, 0x5f7ffffc, v8 +; CGP-NEXT: v_mul_f32_e32 v9, 0x2f800000, v8 +; CGP-NEXT: v_trunc_f32_e32 v9, v9 +; CGP-NEXT: v_mac_f32_e32 v8, 0xcf800000, v9 ; 
CGP-NEXT: v_cvt_u32_f32_e32 v8, v8 -; CGP-NEXT: v_xor_b32_e32 v0, v0, v4 -; CGP-NEXT: v_xor_b32_e32 v1, v1, v4 -; CGP-NEXT: v_mul_lo_u32 v9, -1, v7 -; CGP-NEXT: v_mul_lo_u32 v10, s6, v8 -; CGP-NEXT: v_mul_hi_u32 v12, s6, v7 -; CGP-NEXT: v_mul_lo_u32 v11, s6, v7 -; CGP-NEXT: s_movk_i32 s7, 0x1000 -; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v10 -; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v12 -; CGP-NEXT: v_mul_lo_u32 v10, v8, v11 -; CGP-NEXT: v_mul_lo_u32 v12, v7, v9 -; CGP-NEXT: v_mul_hi_u32 v13, v7, v11 -; CGP-NEXT: v_mul_hi_u32 v11, v8, v11 -; CGP-NEXT: s_bfe_i32 s8, -1, 0x10000 -; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v12 +; CGP-NEXT: v_cvt_u32_f32_e32 v9, v9 +; CGP-NEXT: v_xor_b32_e32 v1, v1, v5 +; CGP-NEXT: s_bfe_i32 s6, -1, 0x10000 +; CGP-NEXT: v_mul_lo_u32 v11, -1, v8 +; CGP-NEXT: v_mul_lo_u32 v12, v0, v9 +; CGP-NEXT: v_mul_hi_u32 v14, v0, v8 +; CGP-NEXT: v_mul_lo_u32 v13, v0, v8 +; CGP-NEXT: v_mac_f32_e32 v6, 0x4f800000, v7 +; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v12 +; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v14 +; CGP-NEXT: v_mul_lo_u32 v12, v9, v13 +; CGP-NEXT: v_mul_lo_u32 v14, v8, v11 +; CGP-NEXT: v_mul_hi_u32 v15, v8, v13 +; CGP-NEXT: v_mul_hi_u32 v13, v9, v13 +; CGP-NEXT: v_rcp_iflag_f32_e32 v6, v6 +; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v14 +; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v15 ; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v13 -; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v13, v8, v9 -; CGP-NEXT: v_add_i32_e32 v10, vcc, v12, v10 -; CGP-NEXT: v_mul_hi_u32 v12, v7, v9 -; CGP-NEXT: v_add_i32_e32 v11, vcc, v13, v11 +; CGP-NEXT: v_mul_lo_u32 v15, v9, v11 +; CGP-NEXT: v_add_i32_e32 v12, vcc, v14, v12 +; CGP-NEXT: v_mul_hi_u32 v14, v8, v11 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v15, v13 +; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v14 +; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v14, vcc, v15, v14 +; CGP-NEXT: v_mul_hi_u32 v11, v9, v11 +; CGP-NEXT: v_add_i32_e32 v12, vcc, v13, v12 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13 +; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v13 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v12 +; CGP-NEXT: v_addc_u32_e64 v12, s[4:5], v9, v11, vcc +; CGP-NEXT: v_mul_lo_u32 v13, -1, v8 +; CGP-NEXT: v_mul_lo_u32 v14, v0, v12 +; CGP-NEXT: v_mul_hi_u32 v16, v0, v8 +; CGP-NEXT: v_mul_lo_u32 v15, v0, v8 +; CGP-NEXT: v_add_i32_e64 v9, s[4:5], v9, v11 +; CGP-NEXT: v_add_i32_e64 v13, s[4:5], v13, v14 +; CGP-NEXT: v_add_i32_e64 v13, s[4:5], v13, v16 +; CGP-NEXT: v_mul_lo_u32 v14, v12, v15 +; CGP-NEXT: v_mul_lo_u32 v16, v8, v13 +; CGP-NEXT: v_mul_hi_u32 v11, v8, v15 +; CGP-NEXT: v_mul_hi_u32 v15, v12, v15 +; CGP-NEXT: v_mul_f32_e32 v6, 0x5f7ffffc, v6 +; CGP-NEXT: v_add_i32_e64 v14, s[4:5], v14, v16 +; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[4:5] +; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v14, v11 +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[4:5] +; CGP-NEXT: v_mul_lo_u32 v14, v12, v13 +; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v16, v11 +; CGP-NEXT: v_mul_hi_u32 v16, v8, v13 +; CGP-NEXT: v_add_i32_e64 v14, s[4:5], v14, v15 +; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[4:5] +; CGP-NEXT: v_add_i32_e64 v14, s[4:5], v14, v16 +; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[4:5] +; CGP-NEXT: v_add_i32_e64 v15, s[4:5], v15, v16 +; CGP-NEXT: v_mul_hi_u32 v12, v12, v13 +; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v14, v11 +; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5] +; 
CGP-NEXT: v_add_i32_e64 v13, s[4:5], v15, v14 +; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v12, v13 +; CGP-NEXT: v_addc_u32_e32 v9, vcc, v9, v12, vcc +; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v11 +; CGP-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; CGP-NEXT: v_mul_lo_u32 v11, v1, v8 +; CGP-NEXT: v_mul_lo_u32 v12, v10, v9 +; CGP-NEXT: v_mul_hi_u32 v13, v10, v8 +; CGP-NEXT: v_mul_hi_u32 v8, v1, v8 ; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v12 ; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v13 +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; CGP-NEXT: v_mul_lo_u32 v13, v1, v9 +; CGP-NEXT: v_add_i32_e32 v11, vcc, v12, v11 +; CGP-NEXT: v_mul_hi_u32 v12, v10, v9 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v13, v8 +; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v12 +; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v12, vcc, v13, v12 -; CGP-NEXT: v_mul_hi_u32 v9, v8, v9 -; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10 +; CGP-NEXT: v_mul_hi_u32 v9, v1, v9 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v11 ; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v11, vcc, v12, v11 ; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v11 -; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v10 -; CGP-NEXT: v_addc_u32_e64 v10, s[4:5], v8, v9, vcc -; CGP-NEXT: v_mul_lo_u32 v11, -1, v7 -; CGP-NEXT: v_mul_lo_u32 v12, s6, v10 -; CGP-NEXT: v_mul_hi_u32 v14, s6, v7 -; CGP-NEXT: v_mul_lo_u32 v13, s6, v7 -; CGP-NEXT: v_add_i32_e64 v8, s[4:5], v8, v9 -; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v11, v12 -; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v11, v14 -; CGP-NEXT: v_mul_lo_u32 v12, v10, v13 -; CGP-NEXT: v_mul_lo_u32 v14, v7, v11 -; CGP-NEXT: v_mul_hi_u32 v9, v7, v13 -; CGP-NEXT: v_mul_hi_u32 v13, v10, v13 -; CGP-NEXT: v_mac_f32_e32 v5, 0x4f800000, v6 -; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v12, v14 -; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v9, s[4:5], v12, v9 -; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[4:5] -; CGP-NEXT: v_mul_lo_u32 v12, v10, v11 -; CGP-NEXT: v_add_i32_e64 v9, s[4:5], v14, v9 -; CGP-NEXT: v_mul_hi_u32 v14, v7, v11 -; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v12, v13 -; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v12, v14 -; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v13, s[4:5], v13, v14 -; CGP-NEXT: v_mul_hi_u32 v10, v10, v11 -; CGP-NEXT: v_add_i32_e64 v9, s[4:5], v12, v9 -; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v13, v12 -; CGP-NEXT: v_add_i32_e64 v10, s[4:5], v10, v11 -; CGP-NEXT: v_addc_u32_e32 v8, vcc, v8, v10, vcc -; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v9 -; CGP-NEXT: v_addc_u32_e32 v8, vcc, 0, v8, vcc -; CGP-NEXT: v_mul_lo_u32 v9, v1, v7 -; CGP-NEXT: v_mul_lo_u32 v10, v0, v8 -; CGP-NEXT: v_mul_hi_u32 v11, v0, v7 -; CGP-NEXT: v_mul_hi_u32 v7, v1, v7 -; CGP-NEXT: v_rcp_iflag_f32_e32 v5, v5 -; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v10 -; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v11 -; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v11, v1, v8 -; CGP-NEXT: v_add_i32_e32 v9, vcc, v10, v9 -; CGP-NEXT: v_mul_hi_u32 v10, v0, v8 -; CGP-NEXT: v_add_i32_e32 v7, vcc, v11, v7 -; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v10 -; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; CGP-NEXT: v_mul_hi_u32 v8, v1, v8 -; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v9 -; 
CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v9, vcc, v10, v9 -; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v9 -; CGP-NEXT: v_mul_lo_u32 v9, 0, v7 -; CGP-NEXT: v_mul_lo_u32 v10, s7, v8 -; CGP-NEXT: v_mul_hi_u32 v12, s7, v7 -; CGP-NEXT: v_mul_lo_u32 v11, s7, v7 -; CGP-NEXT: v_mul_f32_e32 v5, 0x5f7ffffc, v5 -; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v10 -; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v12 -; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v11 -; CGP-NEXT: v_subb_u32_e64 v10, s[4:5], v1, v9, vcc -; CGP-NEXT: v_sub_i32_e64 v1, s[4:5], v1, v9 +; CGP-NEXT: v_mul_lo_u32 v11, 0, v8 +; CGP-NEXT: v_mul_lo_u32 v12, v4, v9 +; CGP-NEXT: v_mul_hi_u32 v14, v4, v8 +; CGP-NEXT: v_mul_lo_u32 v13, v4, v8 +; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v12 +; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v14 +; CGP-NEXT: v_sub_i32_e32 v10, vcc, v10, v13 +; CGP-NEXT: v_subb_u32_e64 v12, s[4:5], v1, v11, vcc +; CGP-NEXT: v_sub_i32_e64 v1, s[4:5], v1, v11 ; CGP-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc -; CGP-NEXT: v_cmp_le_u32_e64 s[4:5], s7, v0 -; CGP-NEXT: v_subrev_i32_e32 v0, vcc, s7, v0 +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v10, v4 +; CGP-NEXT: v_sub_i32_e32 v10, vcc, v10, v4 ; CGP-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc -; CGP-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5] -; CGP-NEXT: v_mov_b32_e32 v11, s8 -; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v10 -; CGP-NEXT: v_add_i32_e32 v10, vcc, 1, v7 -; CGP-NEXT: v_cndmask_b32_e64 v9, v11, v9, s[4:5] -; CGP-NEXT: v_addc_u32_e32 v11, vcc, 0, v8, vcc +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5] +; CGP-NEXT: v_mov_b32_e32 v13, s6 +; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v12 +; CGP-NEXT: v_add_i32_e32 v12, vcc, 1, v8 +; CGP-NEXT: v_cndmask_b32_e64 v11, v13, v11, s[4:5] +; CGP-NEXT: v_addc_u32_e32 v13, vcc, 0, v9, vcc ; CGP-NEXT: s_bfe_i32 s4, -1, 0x10000 -; CGP-NEXT: v_cmp_le_u32_e32 vcc, s7, v0 -; CGP-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc -; CGP-NEXT: v_mov_b32_e32 v12, s4 +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v10, v4 +; CGP-NEXT: v_cndmask_b32_e64 v10, 0, -1, vcc +; CGP-NEXT: v_mov_b32_e32 v14, s4 ; CGP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; CGP-NEXT: v_cndmask_b32_e32 v0, v12, v0, vcc -; CGP-NEXT: v_add_i32_e32 v1, vcc, 1, v10 -; CGP-NEXT: v_addc_u32_e32 v12, vcc, 0, v11, vcc -; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; CGP-NEXT: v_cndmask_b32_e32 v0, v10, v1, vcc -; CGP-NEXT: v_cndmask_b32_e32 v1, v11, v12, vcc -; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9 -; CGP-NEXT: v_cndmask_b32_e32 v0, v7, v0, vcc -; CGP-NEXT: v_mul_f32_e32 v7, 0x2f800000, v5 -; CGP-NEXT: v_trunc_f32_e32 v7, v7 -; CGP-NEXT: v_mac_f32_e32 v5, 0xcf800000, v7 -; CGP-NEXT: v_cvt_u32_f32_e32 v5, v5 -; CGP-NEXT: v_cvt_u32_f32_e32 v7, v7 +; CGP-NEXT: v_cndmask_b32_e32 v1, v14, v10, vcc +; CGP-NEXT: v_add_i32_e32 v10, vcc, 1, v12 +; CGP-NEXT: v_addc_u32_e32 v14, vcc, 0, v13, vcc +; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; CGP-NEXT: v_cndmask_b32_e32 v1, v12, v10, vcc +; CGP-NEXT: v_cndmask_b32_e32 v10, v13, v14, vcc +; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 ; CGP-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc -; CGP-NEXT: v_ashrrev_i32_e32 v6, 31, v3 -; CGP-NEXT: v_mul_lo_u32 v8, -1, v5 -; CGP-NEXT: v_mul_lo_u32 v9, s6, v7 -; CGP-NEXT: v_mul_hi_u32 v11, s6, v5 -; CGP-NEXT: v_mul_lo_u32 v10, s6, v5 -; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v6 -; CGP-NEXT: v_addc_u32_e32 v3, vcc, v3, v6, vcc -; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v9 -; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v11 -; CGP-NEXT: v_mul_lo_u32 v9, v7, v10 -; CGP-NEXT: v_mul_lo_u32 v11, v5, v8 -; CGP-NEXT: v_mul_hi_u32 v12, v5, v10 -; CGP-NEXT: v_mul_hi_u32 
v10, v7, v10 -; CGP-NEXT: v_xor_b32_e32 v0, v0, v4 -; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v11 -; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v12 -; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v12, v7, v8 -; CGP-NEXT: v_add_i32_e32 v9, vcc, v11, v9 -; CGP-NEXT: v_mul_hi_u32 v11, v5, v8 -; CGP-NEXT: v_add_i32_e32 v10, vcc, v12, v10 -; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; CGP-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc +; CGP-NEXT: v_mul_f32_e32 v9, 0x2f800000, v6 +; CGP-NEXT: v_trunc_f32_e32 v9, v9 +; CGP-NEXT: v_mac_f32_e32 v6, 0xcf800000, v9 +; CGP-NEXT: v_cvt_u32_f32_e32 v6, v6 +; CGP-NEXT: v_cvt_u32_f32_e32 v9, v9 +; CGP-NEXT: v_xor_b32_e32 v7, v8, v5 +; CGP-NEXT: v_ashrrev_i32_e32 v8, 31, v3 +; CGP-NEXT: v_mul_lo_u32 v10, -1, v6 +; CGP-NEXT: v_mul_lo_u32 v11, v0, v9 +; CGP-NEXT: v_mul_hi_u32 v13, v0, v6 +; CGP-NEXT: v_mul_lo_u32 v12, v0, v6 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v8 +; CGP-NEXT: v_addc_u32_e32 v3, vcc, v3, v8, vcc ; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v11 -; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v11, vcc, v12, v11 -; CGP-NEXT: v_mul_hi_u32 v8, v7, v8 -; CGP-NEXT: v_add_i32_e32 v9, vcc, v10, v9 -; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v10 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v9 -; CGP-NEXT: v_addc_u32_e64 v9, s[4:5], v7, v8, vcc -; CGP-NEXT: v_mul_lo_u32 v10, -1, v5 -; CGP-NEXT: v_mul_lo_u32 v11, s6, v9 -; CGP-NEXT: v_mul_hi_u32 v13, s6, v5 -; CGP-NEXT: v_mul_lo_u32 v12, s6, v5 -; CGP-NEXT: v_add_i32_e64 v7, s[4:5], v7, v8 -; CGP-NEXT: v_add_i32_e64 v10, s[4:5], v10, v11 -; CGP-NEXT: v_add_i32_e64 v10, s[4:5], v10, v13 +; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v13 ; CGP-NEXT: v_mul_lo_u32 v11, v9, v12 -; CGP-NEXT: v_mul_lo_u32 v13, v5, v10 -; CGP-NEXT: v_mul_hi_u32 v8, v5, v12 +; CGP-NEXT: v_mul_lo_u32 v13, v6, v10 +; CGP-NEXT: v_mul_hi_u32 v14, v6, v12 ; CGP-NEXT: v_mul_hi_u32 v12, v9, v12 -; CGP-NEXT: v_xor_b32_e32 v2, v2, v6 -; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v11, v13 -; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v8, s[4:5], v11, v8 -; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[4:5] -; CGP-NEXT: v_mul_lo_u32 v11, v9, v10 -; CGP-NEXT: v_add_i32_e64 v8, s[4:5], v13, v8 -; CGP-NEXT: v_mul_hi_u32 v13, v5, v10 -; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v11, v12 -; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v11, v13 +; CGP-NEXT: v_xor_b32_e32 v1, v1, v5 +; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v13 +; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v14 +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; CGP-NEXT: v_mul_lo_u32 v14, v9, v10 +; CGP-NEXT: v_add_i32_e32 v11, vcc, v13, v11 +; CGP-NEXT: v_mul_hi_u32 v13, v6, v10 +; CGP-NEXT: v_add_i32_e32 v12, vcc, v14, v12 +; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v13 +; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13 +; CGP-NEXT: v_mul_hi_u32 v10, v9, v10 +; CGP-NEXT: v_add_i32_e32 v11, vcc, v12, v11 +; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v12, vcc, v13, v12 +; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v12 +; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v11 +; CGP-NEXT: v_addc_u32_e64 v11, s[4:5], v9, v10, vcc +; CGP-NEXT: v_mul_lo_u32 v12, -1, v6 +; CGP-NEXT: v_mul_lo_u32 v13, v0, v11 +; CGP-NEXT: v_mul_lo_u32 v14, v0, v6 
+; CGP-NEXT: v_mul_hi_u32 v0, v0, v6 +; CGP-NEXT: v_add_i32_e64 v9, s[4:5], v9, v10 +; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v12, v13 +; CGP-NEXT: v_add_i32_e64 v0, s[4:5], v12, v0 +; CGP-NEXT: v_mul_lo_u32 v12, v11, v14 +; CGP-NEXT: v_mul_lo_u32 v13, v6, v0 +; CGP-NEXT: v_mul_hi_u32 v10, v6, v14 +; CGP-NEXT: v_mul_hi_u32 v14, v11, v14 +; CGP-NEXT: v_xor_b32_e32 v2, v2, v8 +; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v12, v13 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5] +; CGP-NEXT: v_add_i32_e64 v10, s[4:5], v12, v10 +; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[4:5] +; CGP-NEXT: v_mul_lo_u32 v12, v11, v0 +; CGP-NEXT: v_add_i32_e64 v10, s[4:5], v13, v10 +; CGP-NEXT: v_mul_hi_u32 v13, v6, v0 +; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v12, v14 +; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5] ; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v12, v13 -; CGP-NEXT: v_mul_hi_u32 v9, v9, v10 -; CGP-NEXT: v_add_i32_e64 v8, s[4:5], v11, v8 -; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v10, s[4:5], v12, v11 -; CGP-NEXT: v_add_i32_e64 v9, s[4:5], v9, v10 -; CGP-NEXT: v_addc_u32_e32 v7, vcc, v7, v9, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v8 -; CGP-NEXT: v_xor_b32_e32 v3, v3, v6 -; CGP-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc -; CGP-NEXT: v_xor_b32_e32 v1, v1, v4 -; CGP-NEXT: v_mul_lo_u32 v8, v3, v5 -; CGP-NEXT: v_mul_lo_u32 v9, v2, v7 -; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v4 -; CGP-NEXT: v_subb_u32_e32 v1, vcc, v1, v4, vcc -; CGP-NEXT: v_mul_hi_u32 v4, v2, v5 -; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v9 -; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v8, v4 -; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v8, v3, v7 -; CGP-NEXT: v_mul_hi_u32 v5, v3, v5 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v9, v4 -; CGP-NEXT: v_mul_hi_u32 v9, v2, v7 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v8, v5 -; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v9 -; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v9 -; CGP-NEXT: v_mul_hi_u32 v7, v3, v7 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v5, v4 -; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v8, v5 +; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5] +; CGP-NEXT: v_add_i32_e64 v13, s[4:5], v14, v13 +; CGP-NEXT: v_mul_hi_u32 v0, v11, v0 +; CGP-NEXT: v_add_i32_e64 v10, s[4:5], v12, v10 +; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] +; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v13, v12 +; CGP-NEXT: v_add_i32_e64 v0, s[4:5], v0, v11 +; CGP-NEXT: v_addc_u32_e32 v0, vcc, v9, v0, vcc +; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v10 +; CGP-NEXT: v_xor_b32_e32 v3, v3, v8 +; CGP-NEXT: v_addc_u32_e32 v9, vcc, 0, v0, vcc +; CGP-NEXT: v_mul_lo_u32 v10, v3, v6 +; CGP-NEXT: v_mul_lo_u32 v11, v2, v9 +; CGP-NEXT: v_sub_i32_e32 v0, vcc, v1, v5 +; CGP-NEXT: v_subb_u32_e32 v1, vcc, v7, v5, vcc +; CGP-NEXT: v_mul_hi_u32 v5, v2, v6 +; CGP-NEXT: v_add_i32_e32 v7, vcc, v10, v11 +; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v5, vcc, v7, v5 -; CGP-NEXT: v_mul_lo_u32 v7, 0, v4 -; CGP-NEXT: v_mul_lo_u32 v8, s7, v5 -; CGP-NEXT: v_mul_hi_u32 v10, s7, v4 -; CGP-NEXT: v_mul_lo_u32 v9, s7, v4 -; CGP-NEXT: s_bfe_i32 s6, -1, 0x10000 -; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v8 +; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; CGP-NEXT: v_mul_lo_u32 v7, v3, v9 +; CGP-NEXT: v_mul_hi_u32 v6, v3, v6 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v10, v5 +; CGP-NEXT: v_mul_hi_u32 v10, v2, v9 +; CGP-NEXT: v_add_i32_e32 v6, vcc, v7, v6 +; CGP-NEXT: 
v_cndmask_b32_e64 v7, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v10
+; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v10
+; CGP-NEXT: v_mul_hi_u32 v9, v3, v9
+; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v5
+; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v6, vcc, v7, v6
+; CGP-NEXT: v_add_i32_e32 v6, vcc, v9, v6
+; CGP-NEXT: v_mul_lo_u32 v7, 0, v5
+; CGP-NEXT: v_mul_lo_u32 v9, v4, v6
+; CGP-NEXT: v_mul_hi_u32 v11, v4, v5
+; CGP-NEXT: v_mul_lo_u32 v10, v4, v5
+; CGP-NEXT: s_bfe_i32 s6, -1, 0x10000
+; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v9
+; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v11
+; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v10
+; CGP-NEXT: v_subb_u32_e64 v9, s[4:5], v3, v7, vcc
; CGP-NEXT: v_sub_i32_e64 v3, s[4:5], v3, v7
; CGP-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc
-; CGP-NEXT: v_cmp_le_u32_e64 s[4:5], s7, v2
-; CGP-NEXT: v_subrev_i32_e32 v2, vcc, s7, v2
+; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v4
+; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v4
; CGP-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc
; CGP-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5]
-; CGP-NEXT: v_mov_b32_e32 v9, s6
-; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v8
-; CGP-NEXT: v_add_i32_e32 v8, vcc, 1, v4
-; CGP-NEXT: v_cndmask_b32_e64 v7, v9, v7, s[4:5]
-; CGP-NEXT: v_addc_u32_e32 v9, vcc, 0, v5, vcc
+; CGP-NEXT: v_mov_b32_e32 v10, s6
+; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v9
+; CGP-NEXT: v_add_i32_e32 v9, vcc, 1, v5
+; CGP-NEXT: v_cndmask_b32_e64 v7, v10, v7, s[4:5]
+; CGP-NEXT: v_addc_u32_e32 v10, vcc, 0, v6, vcc
; CGP-NEXT: s_bfe_i32 s4, -1, 0x10000
-; CGP-NEXT: v_cmp_le_u32_e32 vcc, s7, v2
+; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v2, v4
; CGP-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc
-; CGP-NEXT: v_mov_b32_e32 v10, s4
+; CGP-NEXT: v_mov_b32_e32 v4, s4
; CGP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3
-; CGP-NEXT: v_cndmask_b32_e32 v2, v10, v2, vcc
-; CGP-NEXT: v_add_i32_e32 v3, vcc, 1, v8
-; CGP-NEXT: v_addc_u32_e32 v10, vcc, 0, v9, vcc
+; CGP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
+; CGP-NEXT: v_add_i32_e32 v3, vcc, 1, v9
+; CGP-NEXT: v_addc_u32_e32 v4, vcc, 0, v10, vcc
; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
-; CGP-NEXT: v_cndmask_b32_e32 v2, v8, v3, vcc
-; CGP-NEXT: v_cndmask_b32_e32 v3, v9, v10, vcc
+; CGP-NEXT: v_cndmask_b32_e32 v2, v9, v3, vcc
+; CGP-NEXT: v_cndmask_b32_e32 v3, v10, v4, vcc
; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7
-; CGP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
-; CGP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
-; CGP-NEXT: v_xor_b32_e32 v2, v2, v6
-; CGP-NEXT: v_xor_b32_e32 v3, v3, v6
-; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v6
-; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v6, vcc
+; CGP-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc
+; CGP-NEXT: v_cndmask_b32_e32 v3, v6, v3, vcc
+; CGP-NEXT: v_xor_b32_e32 v2, v2, v8
+; CGP-NEXT: v_xor_b32_e32 v3, v3, v8
+; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v8
+; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v8, vcc
; CGP-NEXT: s_setpc_b64 s[30:31]
%result = sdiv <2 x i64> %num, <i64 4096, i64 4096>
ret <2 x i64> %result
@@ -1786,138 +1786,138 @@
; CHECK-LABEL: v_sdiv_i64_oddk_denom:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: v_cvt_f32_u32_e32 v3, 0x12d8fb
-; CHECK-NEXT: v_cvt_f32_ubyte0_e32 v4, 0
-; CHECK-NEXT: s_mov_b32 s6, 0xffed2705
+; CHECK-NEXT: v_mov_b32_e32 v3, 0x12d8fb
+; CHECK-NEXT: v_cvt_f32_u32_e32 v4, v3
+; CHECK-NEXT: v_cvt_f32_ubyte0_e32 v5, 0
+; CHECK-NEXT: v_mov_b32_e32 v6, 0xffed2705
; CHECK-NEXT: v_ashrrev_i32_e32 v2,
31, v1 -; CHECK-NEXT: v_mac_f32_e32 v3, 0x4f800000, v4 -; CHECK-NEXT: v_rcp_iflag_f32_e32 v3, v3 +; CHECK-NEXT: v_mac_f32_e32 v4, 0x4f800000, v5 +; CHECK-NEXT: v_rcp_iflag_f32_e32 v4, v4 ; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v1, v2, vcc -; CHECK-NEXT: v_mul_f32_e32 v3, 0x5f7ffffc, v3 -; CHECK-NEXT: v_mul_f32_e32 v4, 0x2f800000, v3 -; CHECK-NEXT: v_trunc_f32_e32 v4, v4 -; CHECK-NEXT: v_mac_f32_e32 v3, 0xcf800000, v4 -; CHECK-NEXT: v_cvt_u32_f32_e32 v3, v3 +; CHECK-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 +; CHECK-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4 +; CHECK-NEXT: v_trunc_f32_e32 v5, v5 +; CHECK-NEXT: v_mac_f32_e32 v4, 0xcf800000, v5 ; CHECK-NEXT: v_cvt_u32_f32_e32 v4, v4 +; CHECK-NEXT: v_cvt_u32_f32_e32 v5, v5 ; CHECK-NEXT: v_xor_b32_e32 v0, v0, v2 ; CHECK-NEXT: v_xor_b32_e32 v1, v1, v2 -; CHECK-NEXT: v_mul_lo_u32 v5, -1, v3 -; CHECK-NEXT: v_mul_lo_u32 v6, s6, v4 -; CHECK-NEXT: v_mul_hi_u32 v8, s6, v3 -; CHECK-NEXT: v_mul_lo_u32 v7, s6, v3 -; CHECK-NEXT: s_bfe_i32 s7, -1, 0x10000 -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v6 -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v8 -; CHECK-NEXT: v_mul_lo_u32 v6, v4, v7 -; CHECK-NEXT: v_mul_lo_u32 v8, v3, v5 -; CHECK-NEXT: v_mul_hi_u32 v9, v3, v7 -; CHECK-NEXT: v_mul_hi_u32 v7, v4, v7 -; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v8 -; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v9 -; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CHECK-NEXT: v_mul_lo_u32 v9, v4, v5 -; CHECK-NEXT: v_add_i32_e32 v6, vcc, v8, v6 -; CHECK-NEXT: v_mul_hi_u32 v8, v3, v5 -; CHECK-NEXT: v_add_i32_e32 v7, vcc, v9, v7 -; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; CHECK-NEXT: v_mul_lo_u32 v7, -1, v4 +; CHECK-NEXT: v_mul_lo_u32 v8, v6, v5 +; CHECK-NEXT: v_mul_hi_u32 v10, v6, v4 +; CHECK-NEXT: v_mul_lo_u32 v9, v6, v4 +; CHECK-NEXT: s_bfe_i32 s6, -1, 0x10000 ; CHECK-NEXT: v_add_i32_e32 v7, vcc, v7, v8 +; CHECK-NEXT: v_add_i32_e32 v7, vcc, v7, v10 +; CHECK-NEXT: v_mul_lo_u32 v8, v5, v9 +; CHECK-NEXT: v_mul_lo_u32 v10, v4, v7 +; CHECK-NEXT: v_mul_hi_u32 v11, v4, v9 +; CHECK-NEXT: v_mul_hi_u32 v9, v5, v9 +; CHECK-NEXT: v_add_i32_e32 v8, vcc, v8, v10 +; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v8, vcc, v8, v11 ; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; CHECK-NEXT: v_mul_lo_u32 v11, v5, v7 +; CHECK-NEXT: v_add_i32_e32 v8, vcc, v10, v8 +; CHECK-NEXT: v_mul_hi_u32 v10, v4, v7 +; CHECK-NEXT: v_add_i32_e32 v9, vcc, v11, v9 +; CHECK-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v9, vcc, v9, v10 +; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v10, vcc, v11, v10 +; CHECK-NEXT: v_mul_hi_u32 v7, v5, v7 ; CHECK-NEXT: v_add_i32_e32 v8, vcc, v9, v8 -; CHECK-NEXT: v_mul_hi_u32 v5, v4, v5 -; CHECK-NEXT: v_add_i32_e32 v6, vcc, v7, v6 -; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v7, vcc, v8, v7 -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v7 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v6 -; CHECK-NEXT: v_addc_u32_e64 v6, s[4:5], v4, v5, vcc -; CHECK-NEXT: v_mul_lo_u32 v7, -1, v3 -; CHECK-NEXT: v_mul_lo_u32 v8, s6, v6 -; CHECK-NEXT: v_mul_hi_u32 v10, s6, v3 -; CHECK-NEXT: v_mul_lo_u32 v9, s6, v3 -; CHECK-NEXT: v_add_i32_e64 v4, s[4:5], v4, v5 -; CHECK-NEXT: v_add_i32_e64 v7, s[4:5], v7, v8 -; CHECK-NEXT: v_add_i32_e64 v7, s[4:5], v7, v10 -; CHECK-NEXT: v_mul_lo_u32 v8, v6, v9 -; CHECK-NEXT: v_mul_lo_u32 v10, v3, v7 -; CHECK-NEXT: v_mul_hi_u32 v5, v3, v9 -; CHECK-NEXT: v_mul_hi_u32 v9, v6, v9 -; CHECK-NEXT: 
s_mov_b32 s6, 0x12d8fb -; CHECK-NEXT: v_add_i32_e64 v8, s[4:5], v8, v10 -; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[4:5] -; CHECK-NEXT: v_add_i32_e64 v5, s[4:5], v8, v5 -; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[4:5] -; CHECK-NEXT: v_mul_lo_u32 v8, v6, v7 -; CHECK-NEXT: v_add_i32_e64 v5, s[4:5], v10, v5 -; CHECK-NEXT: v_mul_hi_u32 v10, v3, v7 -; CHECK-NEXT: v_add_i32_e64 v8, s[4:5], v8, v9 -; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[4:5] -; CHECK-NEXT: v_add_i32_e64 v8, s[4:5], v8, v10 +; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v9, vcc, v10, v9 +; CHECK-NEXT: v_add_i32_e32 v7, vcc, v7, v9 +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v8 +; CHECK-NEXT: v_addc_u32_e64 v8, s[4:5], v5, v7, vcc +; CHECK-NEXT: v_mul_lo_u32 v9, -1, v4 +; CHECK-NEXT: v_mul_lo_u32 v10, v6, v8 +; CHECK-NEXT: v_mul_lo_u32 v11, v6, v4 +; CHECK-NEXT: v_mul_hi_u32 v6, v6, v4 +; CHECK-NEXT: v_add_i32_e64 v5, s[4:5], v5, v7 +; CHECK-NEXT: v_add_i32_e64 v9, s[4:5], v9, v10 +; CHECK-NEXT: v_add_i32_e64 v6, s[4:5], v9, v6 +; CHECK-NEXT: v_mul_lo_u32 v9, v8, v11 +; CHECK-NEXT: v_mul_lo_u32 v10, v4, v6 +; CHECK-NEXT: v_mul_hi_u32 v7, v4, v11 +; CHECK-NEXT: v_mul_hi_u32 v11, v8, v11 +; CHECK-NEXT: v_add_i32_e64 v9, s[4:5], v9, v10 ; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[4:5] +; CHECK-NEXT: v_add_i32_e64 v7, s[4:5], v9, v7 +; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[4:5] +; CHECK-NEXT: v_mul_lo_u32 v9, v8, v6 +; CHECK-NEXT: v_add_i32_e64 v7, s[4:5], v10, v7 +; CHECK-NEXT: v_mul_hi_u32 v10, v4, v6 +; CHECK-NEXT: v_add_i32_e64 v9, s[4:5], v9, v11 +; CHECK-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[4:5] ; CHECK-NEXT: v_add_i32_e64 v9, s[4:5], v9, v10 -; CHECK-NEXT: v_mul_hi_u32 v6, v6, v7 -; CHECK-NEXT: v_add_i32_e64 v5, s[4:5], v8, v5 -; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[4:5] -; CHECK-NEXT: v_add_i32_e64 v7, s[4:5], v9, v8 -; CHECK-NEXT: v_add_i32_e64 v6, s[4:5], v6, v7 -; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v5 -; CHECK-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc -; CHECK-NEXT: v_mul_lo_u32 v5, v1, v3 -; CHECK-NEXT: v_mul_lo_u32 v6, v0, v4 -; CHECK-NEXT: v_mul_hi_u32 v7, v0, v3 -; CHECK-NEXT: v_mul_hi_u32 v3, v1, v3 -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v6 +; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[4:5] +; CHECK-NEXT: v_add_i32_e64 v10, s[4:5], v11, v10 +; CHECK-NEXT: v_mul_hi_u32 v6, v8, v6 +; CHECK-NEXT: v_add_i32_e64 v7, s[4:5], v9, v7 +; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[4:5] +; CHECK-NEXT: v_add_i32_e64 v8, s[4:5], v10, v9 +; CHECK-NEXT: v_add_i32_e64 v6, s[4:5], v6, v8 +; CHECK-NEXT: v_addc_u32_e32 v5, vcc, v5, v6, vcc +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v7 +; CHECK-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; CHECK-NEXT: v_mul_lo_u32 v6, v1, v4 +; CHECK-NEXT: v_mul_lo_u32 v7, v0, v5 +; CHECK-NEXT: v_mul_hi_u32 v8, v0, v4 +; CHECK-NEXT: v_mul_hi_u32 v4, v1, v4 +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v7 +; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v8 ; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v7 -; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CHECK-NEXT: v_mul_lo_u32 v7, v1, v4 -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; CHECK-NEXT: v_mul_hi_u32 v6, v0, v4 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v7, v3 +; CHECK-NEXT: v_mul_lo_u32 v8, v1, v5 +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v7, v6 +; CHECK-NEXT: v_mul_hi_u32 v7, v0, v5 +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v8, v4 +; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; 
CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v7 ; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v6 +; CHECK-NEXT: v_add_i32_e32 v7, vcc, v8, v7 +; CHECK-NEXT: v_mul_hi_u32 v5, v1, v5 +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v6 ; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v6, vcc, v7, v6 -; CHECK-NEXT: v_mul_hi_u32 v4, v1, v4 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v5 -; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v5 -; CHECK-NEXT: v_mul_lo_u32 v5, 0, v3 -; CHECK-NEXT: v_mul_lo_u32 v6, s6, v4 -; CHECK-NEXT: v_mul_hi_u32 v8, s6, v3 -; CHECK-NEXT: v_mul_lo_u32 v7, s6, v3 ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v6 -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v8 -; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v7 -; CHECK-NEXT: v_subb_u32_e64 v6, s[4:5], v1, v5, vcc -; CHECK-NEXT: v_sub_i32_e64 v1, s[4:5], v1, v5 +; CHECK-NEXT: v_mul_lo_u32 v6, 0, v4 +; CHECK-NEXT: v_mul_lo_u32 v7, v3, v5 +; CHECK-NEXT: v_mul_hi_u32 v9, v3, v4 +; CHECK-NEXT: v_mul_lo_u32 v8, v3, v4 +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v7 +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v9 +; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v8 +; CHECK-NEXT: v_subb_u32_e64 v7, s[4:5], v1, v6, vcc +; CHECK-NEXT: v_sub_i32_e64 v1, s[4:5], v1, v6 ; CHECK-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc -; CHECK-NEXT: v_cmp_le_u32_e64 s[4:5], s6, v0 -; CHECK-NEXT: v_subrev_i32_e32 v0, vcc, s6, v0 +; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v3 +; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v3 ; CHECK-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc -; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[4:5] -; CHECK-NEXT: v_mov_b32_e32 v7, s7 -; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v6 -; CHECK-NEXT: v_add_i32_e32 v6, vcc, 1, v3 -; CHECK-NEXT: v_cndmask_b32_e64 v5, v7, v5, s[4:5] -; CHECK-NEXT: v_addc_u32_e32 v7, vcc, 0, v4, vcc +; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5] +; CHECK-NEXT: v_mov_b32_e32 v8, s6 +; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v7 +; CHECK-NEXT: v_add_i32_e32 v7, vcc, 1, v4 +; CHECK-NEXT: v_cndmask_b32_e64 v6, v8, v6, s[4:5] +; CHECK-NEXT: v_addc_u32_e32 v8, vcc, 0, v5, vcc ; CHECK-NEXT: s_bfe_i32 s4, -1, 0x10000 -; CHECK-NEXT: v_cmp_le_u32_e32 vcc, s6, v0 +; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v0, v3 ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc -; CHECK-NEXT: v_mov_b32_e32 v8, s4 +; CHECK-NEXT: v_mov_b32_e32 v3, s4 ; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; CHECK-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc -; CHECK-NEXT: v_add_i32_e32 v1, vcc, 1, v6 -; CHECK-NEXT: v_addc_u32_e32 v8, vcc, 0, v7, vcc -; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; CHECK-NEXT: v_cndmask_b32_e32 v0, v6, v1, vcc -; CHECK-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc -; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 ; CHECK-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc -; CHECK-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc +; CHECK-NEXT: v_add_i32_e32 v1, vcc, 1, v7 +; CHECK-NEXT: v_addc_u32_e32 v3, vcc, 0, v8, vcc +; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; CHECK-NEXT: v_cndmask_b32_e32 v0, v7, v1, vcc +; CHECK-NEXT: v_cndmask_b32_e32 v1, v8, v3, vcc +; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; CHECK-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc +; CHECK-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc ; CHECK-NEXT: v_xor_b32_e32 v0, v0, v2 ; CHECK-NEXT: v_xor_b32_e32 v1, v1, v2 ; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 @@ -2231,275 +2231,275 @@ ; CGP-LABEL: v_sdiv_v2i64_oddk_denom: ; CGP: ; %bb.0: ; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; 
CGP-NEXT: v_cvt_f32_u32_e32 v5, 0x12d8fb -; CGP-NEXT: v_cvt_f32_ubyte0_e32 v6, 0 -; CGP-NEXT: s_mov_b32 s6, 0xffed2705 -; CGP-NEXT: v_ashrrev_i32_e32 v4, 31, v1 -; CGP-NEXT: v_mov_b32_e32 v7, v5 -; CGP-NEXT: v_mac_f32_e32 v7, 0x4f800000, v6 -; CGP-NEXT: v_rcp_iflag_f32_e32 v7, v7 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v4 -; CGP-NEXT: v_addc_u32_e32 v1, vcc, v1, v4, vcc -; CGP-NEXT: v_mul_f32_e32 v7, 0x5f7ffffc, v7 -; CGP-NEXT: v_mul_f32_e32 v8, 0x2f800000, v7 -; CGP-NEXT: v_trunc_f32_e32 v8, v8 -; CGP-NEXT: v_mac_f32_e32 v7, 0xcf800000, v8 -; CGP-NEXT: v_cvt_u32_f32_e32 v7, v7 +; CGP-NEXT: v_mov_b32_e32 v4, 0x12d8fb +; CGP-NEXT: v_cvt_f32_u32_e32 v6, v4 +; CGP-NEXT: v_cvt_f32_ubyte0_e32 v7, 0 +; CGP-NEXT: v_ashrrev_i32_e32 v5, 31, v1 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v5 +; CGP-NEXT: v_mov_b32_e32 v8, v6 +; CGP-NEXT: v_mac_f32_e32 v8, 0x4f800000, v7 +; CGP-NEXT: v_rcp_iflag_f32_e32 v8, v8 +; CGP-NEXT: v_xor_b32_e32 v10, v0, v5 +; CGP-NEXT: v_mov_b32_e32 v0, 0xffed2705 +; CGP-NEXT: v_addc_u32_e32 v1, vcc, v1, v5, vcc +; CGP-NEXT: v_mul_f32_e32 v8, 0x5f7ffffc, v8 +; CGP-NEXT: v_mul_f32_e32 v9, 0x2f800000, v8 +; CGP-NEXT: v_trunc_f32_e32 v9, v9 +; CGP-NEXT: v_mac_f32_e32 v8, 0xcf800000, v9 ; CGP-NEXT: v_cvt_u32_f32_e32 v8, v8 -; CGP-NEXT: v_xor_b32_e32 v0, v0, v4 -; CGP-NEXT: v_xor_b32_e32 v1, v1, v4 -; CGP-NEXT: v_mul_lo_u32 v9, -1, v7 -; CGP-NEXT: v_mul_lo_u32 v10, s6, v8 -; CGP-NEXT: v_mul_hi_u32 v12, s6, v7 -; CGP-NEXT: v_mul_lo_u32 v11, s6, v7 -; CGP-NEXT: s_mov_b32 s7, 0x12d8fb -; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v10 -; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v12 -; CGP-NEXT: v_mul_lo_u32 v10, v8, v11 -; CGP-NEXT: v_mul_lo_u32 v12, v7, v9 -; CGP-NEXT: v_mul_hi_u32 v13, v7, v11 -; CGP-NEXT: v_mul_hi_u32 v11, v8, v11 -; CGP-NEXT: s_bfe_i32 s8, -1, 0x10000 -; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v12 +; CGP-NEXT: v_cvt_u32_f32_e32 v9, v9 +; CGP-NEXT: v_xor_b32_e32 v1, v1, v5 +; CGP-NEXT: s_bfe_i32 s6, -1, 0x10000 +; CGP-NEXT: v_mul_lo_u32 v11, -1, v8 +; CGP-NEXT: v_mul_lo_u32 v12, v0, v9 +; CGP-NEXT: v_mul_hi_u32 v14, v0, v8 +; CGP-NEXT: v_mul_lo_u32 v13, v0, v8 +; CGP-NEXT: v_mac_f32_e32 v6, 0x4f800000, v7 +; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v12 +; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v14 +; CGP-NEXT: v_mul_lo_u32 v12, v9, v13 +; CGP-NEXT: v_mul_lo_u32 v14, v8, v11 +; CGP-NEXT: v_mul_hi_u32 v15, v8, v13 +; CGP-NEXT: v_mul_hi_u32 v13, v9, v13 +; CGP-NEXT: v_rcp_iflag_f32_e32 v6, v6 +; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v14 +; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v15 ; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v13 -; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v13, v8, v9 -; CGP-NEXT: v_add_i32_e32 v10, vcc, v12, v10 -; CGP-NEXT: v_mul_hi_u32 v12, v7, v9 -; CGP-NEXT: v_add_i32_e32 v11, vcc, v13, v11 +; CGP-NEXT: v_mul_lo_u32 v15, v9, v11 +; CGP-NEXT: v_add_i32_e32 v12, vcc, v14, v12 +; CGP-NEXT: v_mul_hi_u32 v14, v8, v11 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v15, v13 +; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v14 +; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v14, vcc, v15, v14 +; CGP-NEXT: v_mul_hi_u32 v11, v9, v11 +; CGP-NEXT: v_add_i32_e32 v12, vcc, v13, v12 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13 +; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v13 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v12 +; CGP-NEXT: v_addc_u32_e64 v12, s[4:5], v9, v11, 
vcc +; CGP-NEXT: v_mul_lo_u32 v13, -1, v8 +; CGP-NEXT: v_mul_lo_u32 v14, v0, v12 +; CGP-NEXT: v_mul_hi_u32 v16, v0, v8 +; CGP-NEXT: v_mul_lo_u32 v15, v0, v8 +; CGP-NEXT: v_add_i32_e64 v9, s[4:5], v9, v11 +; CGP-NEXT: v_add_i32_e64 v13, s[4:5], v13, v14 +; CGP-NEXT: v_add_i32_e64 v13, s[4:5], v13, v16 +; CGP-NEXT: v_mul_lo_u32 v14, v12, v15 +; CGP-NEXT: v_mul_lo_u32 v16, v8, v13 +; CGP-NEXT: v_mul_hi_u32 v11, v8, v15 +; CGP-NEXT: v_mul_hi_u32 v15, v12, v15 +; CGP-NEXT: v_mul_f32_e32 v6, 0x5f7ffffc, v6 +; CGP-NEXT: v_add_i32_e64 v14, s[4:5], v14, v16 +; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[4:5] +; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v14, v11 +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[4:5] +; CGP-NEXT: v_mul_lo_u32 v14, v12, v13 +; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v16, v11 +; CGP-NEXT: v_mul_hi_u32 v16, v8, v13 +; CGP-NEXT: v_add_i32_e64 v14, s[4:5], v14, v15 +; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[4:5] +; CGP-NEXT: v_add_i32_e64 v14, s[4:5], v14, v16 +; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[4:5] +; CGP-NEXT: v_add_i32_e64 v15, s[4:5], v15, v16 +; CGP-NEXT: v_mul_hi_u32 v12, v12, v13 +; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v14, v11 +; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5] +; CGP-NEXT: v_add_i32_e64 v13, s[4:5], v15, v14 +; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v12, v13 +; CGP-NEXT: v_addc_u32_e32 v9, vcc, v9, v12, vcc +; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v11 +; CGP-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; CGP-NEXT: v_mul_lo_u32 v11, v1, v8 +; CGP-NEXT: v_mul_lo_u32 v12, v10, v9 +; CGP-NEXT: v_mul_hi_u32 v13, v10, v8 +; CGP-NEXT: v_mul_hi_u32 v8, v1, v8 ; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v12 ; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v13 +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; CGP-NEXT: v_mul_lo_u32 v13, v1, v9 +; CGP-NEXT: v_add_i32_e32 v11, vcc, v12, v11 +; CGP-NEXT: v_mul_hi_u32 v12, v10, v9 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v13, v8 +; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v12 +; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v12, vcc, v13, v12 -; CGP-NEXT: v_mul_hi_u32 v9, v8, v9 -; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10 +; CGP-NEXT: v_mul_hi_u32 v9, v1, v9 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v11 ; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v11, vcc, v12, v11 ; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v11 -; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v10 -; CGP-NEXT: v_addc_u32_e64 v10, s[4:5], v8, v9, vcc -; CGP-NEXT: v_mul_lo_u32 v11, -1, v7 -; CGP-NEXT: v_mul_lo_u32 v12, s6, v10 -; CGP-NEXT: v_mul_hi_u32 v14, s6, v7 -; CGP-NEXT: v_mul_lo_u32 v13, s6, v7 -; CGP-NEXT: v_add_i32_e64 v8, s[4:5], v8, v9 -; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v11, v12 -; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v11, v14 -; CGP-NEXT: v_mul_lo_u32 v12, v10, v13 -; CGP-NEXT: v_mul_lo_u32 v14, v7, v11 -; CGP-NEXT: v_mul_hi_u32 v9, v7, v13 -; CGP-NEXT: v_mul_hi_u32 v13, v10, v13 -; CGP-NEXT: v_mac_f32_e32 v5, 0x4f800000, v6 -; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v12, v14 -; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v9, s[4:5], v12, v9 -; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[4:5] -; CGP-NEXT: v_mul_lo_u32 v12, v10, v11 -; CGP-NEXT: v_add_i32_e64 v9, s[4:5], v14, v9 -; CGP-NEXT: v_mul_hi_u32 v14, v7, v11 -; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v12, v13 -; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v12, v14 -; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5] -; 
CGP-NEXT: v_add_i32_e64 v13, s[4:5], v13, v14 -; CGP-NEXT: v_mul_hi_u32 v10, v10, v11 -; CGP-NEXT: v_add_i32_e64 v9, s[4:5], v12, v9 -; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v13, v12 -; CGP-NEXT: v_add_i32_e64 v10, s[4:5], v10, v11 -; CGP-NEXT: v_addc_u32_e32 v8, vcc, v8, v10, vcc -; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v9 -; CGP-NEXT: v_addc_u32_e32 v8, vcc, 0, v8, vcc -; CGP-NEXT: v_mul_lo_u32 v9, v1, v7 -; CGP-NEXT: v_mul_lo_u32 v10, v0, v8 -; CGP-NEXT: v_mul_hi_u32 v11, v0, v7 -; CGP-NEXT: v_mul_hi_u32 v7, v1, v7 -; CGP-NEXT: v_rcp_iflag_f32_e32 v5, v5 -; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v10 -; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v11 -; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v11, v1, v8 -; CGP-NEXT: v_add_i32_e32 v9, vcc, v10, v9 -; CGP-NEXT: v_mul_hi_u32 v10, v0, v8 -; CGP-NEXT: v_add_i32_e32 v7, vcc, v11, v7 -; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v10 -; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; CGP-NEXT: v_mul_hi_u32 v8, v1, v8 -; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v9 -; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v9, vcc, v10, v9 -; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v9 -; CGP-NEXT: v_mul_lo_u32 v9, 0, v7 -; CGP-NEXT: v_mul_lo_u32 v10, s7, v8 -; CGP-NEXT: v_mul_hi_u32 v12, s7, v7 -; CGP-NEXT: v_mul_lo_u32 v11, s7, v7 -; CGP-NEXT: v_mul_f32_e32 v5, 0x5f7ffffc, v5 -; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v10 -; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v12 -; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v11 -; CGP-NEXT: v_subb_u32_e64 v10, s[4:5], v1, v9, vcc -; CGP-NEXT: v_sub_i32_e64 v1, s[4:5], v1, v9 +; CGP-NEXT: v_mul_lo_u32 v11, 0, v8 +; CGP-NEXT: v_mul_lo_u32 v12, v4, v9 +; CGP-NEXT: v_mul_hi_u32 v14, v4, v8 +; CGP-NEXT: v_mul_lo_u32 v13, v4, v8 +; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v12 +; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v14 +; CGP-NEXT: v_sub_i32_e32 v10, vcc, v10, v13 +; CGP-NEXT: v_subb_u32_e64 v12, s[4:5], v1, v11, vcc +; CGP-NEXT: v_sub_i32_e64 v1, s[4:5], v1, v11 ; CGP-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc -; CGP-NEXT: v_cmp_le_u32_e64 s[4:5], s7, v0 -; CGP-NEXT: v_subrev_i32_e32 v0, vcc, s7, v0 +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v10, v4 +; CGP-NEXT: v_sub_i32_e32 v10, vcc, v10, v4 ; CGP-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc -; CGP-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5] -; CGP-NEXT: v_mov_b32_e32 v11, s8 -; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v10 -; CGP-NEXT: v_add_i32_e32 v10, vcc, 1, v7 -; CGP-NEXT: v_cndmask_b32_e64 v9, v11, v9, s[4:5] -; CGP-NEXT: v_addc_u32_e32 v11, vcc, 0, v8, vcc +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5] +; CGP-NEXT: v_mov_b32_e32 v13, s6 +; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v12 +; CGP-NEXT: v_add_i32_e32 v12, vcc, 1, v8 +; CGP-NEXT: v_cndmask_b32_e64 v11, v13, v11, s[4:5] +; CGP-NEXT: v_addc_u32_e32 v13, vcc, 0, v9, vcc ; CGP-NEXT: s_bfe_i32 s4, -1, 0x10000 -; CGP-NEXT: v_cmp_le_u32_e32 vcc, s7, v0 -; CGP-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc -; CGP-NEXT: v_mov_b32_e32 v12, s4 +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v10, v4 +; CGP-NEXT: v_cndmask_b32_e64 v10, 0, -1, vcc +; CGP-NEXT: v_mov_b32_e32 v14, s4 ; CGP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; CGP-NEXT: v_cndmask_b32_e32 v0, v12, v0, vcc -; CGP-NEXT: v_add_i32_e32 v1, vcc, 1, v10 -; CGP-NEXT: v_addc_u32_e32 v12, vcc, 0, v11, vcc -; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; CGP-NEXT: v_cndmask_b32_e32 v0, v10, v1, vcc -; 
CGP-NEXT: v_cndmask_b32_e32 v1, v11, v12, vcc -; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9 -; CGP-NEXT: v_cndmask_b32_e32 v0, v7, v0, vcc -; CGP-NEXT: v_mul_f32_e32 v7, 0x2f800000, v5 -; CGP-NEXT: v_trunc_f32_e32 v7, v7 -; CGP-NEXT: v_mac_f32_e32 v5, 0xcf800000, v7 -; CGP-NEXT: v_cvt_u32_f32_e32 v5, v5 -; CGP-NEXT: v_cvt_u32_f32_e32 v7, v7 +; CGP-NEXT: v_cndmask_b32_e32 v1, v14, v10, vcc +; CGP-NEXT: v_add_i32_e32 v10, vcc, 1, v12 +; CGP-NEXT: v_addc_u32_e32 v14, vcc, 0, v13, vcc +; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; CGP-NEXT: v_cndmask_b32_e32 v1, v12, v10, vcc +; CGP-NEXT: v_cndmask_b32_e32 v10, v13, v14, vcc +; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 ; CGP-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc -; CGP-NEXT: v_ashrrev_i32_e32 v6, 31, v3 -; CGP-NEXT: v_mul_lo_u32 v8, -1, v5 -; CGP-NEXT: v_mul_lo_u32 v9, s6, v7 -; CGP-NEXT: v_mul_hi_u32 v11, s6, v5 -; CGP-NEXT: v_mul_lo_u32 v10, s6, v5 -; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v6 -; CGP-NEXT: v_addc_u32_e32 v3, vcc, v3, v6, vcc -; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v9 -; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v11 -; CGP-NEXT: v_mul_lo_u32 v9, v7, v10 -; CGP-NEXT: v_mul_lo_u32 v11, v5, v8 -; CGP-NEXT: v_mul_hi_u32 v12, v5, v10 -; CGP-NEXT: v_mul_hi_u32 v10, v7, v10 -; CGP-NEXT: v_xor_b32_e32 v0, v0, v4 -; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v11 -; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v12 -; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v12, v7, v8 -; CGP-NEXT: v_add_i32_e32 v9, vcc, v11, v9 -; CGP-NEXT: v_mul_hi_u32 v11, v5, v8 -; CGP-NEXT: v_add_i32_e32 v10, vcc, v12, v10 -; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; CGP-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc +; CGP-NEXT: v_mul_f32_e32 v9, 0x2f800000, v6 +; CGP-NEXT: v_trunc_f32_e32 v9, v9 +; CGP-NEXT: v_mac_f32_e32 v6, 0xcf800000, v9 +; CGP-NEXT: v_cvt_u32_f32_e32 v6, v6 +; CGP-NEXT: v_cvt_u32_f32_e32 v9, v9 +; CGP-NEXT: v_xor_b32_e32 v7, v8, v5 +; CGP-NEXT: v_ashrrev_i32_e32 v8, 31, v3 +; CGP-NEXT: v_mul_lo_u32 v10, -1, v6 +; CGP-NEXT: v_mul_lo_u32 v11, v0, v9 +; CGP-NEXT: v_mul_hi_u32 v13, v0, v6 +; CGP-NEXT: v_mul_lo_u32 v12, v0, v6 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v8 +; CGP-NEXT: v_addc_u32_e32 v3, vcc, v3, v8, vcc ; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v11 -; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v11, vcc, v12, v11 -; CGP-NEXT: v_mul_hi_u32 v8, v7, v8 -; CGP-NEXT: v_add_i32_e32 v9, vcc, v10, v9 -; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v10 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v9 -; CGP-NEXT: v_addc_u32_e64 v9, s[4:5], v7, v8, vcc -; CGP-NEXT: v_mul_lo_u32 v10, -1, v5 -; CGP-NEXT: v_mul_lo_u32 v11, s6, v9 -; CGP-NEXT: v_mul_hi_u32 v13, s6, v5 -; CGP-NEXT: v_mul_lo_u32 v12, s6, v5 -; CGP-NEXT: v_add_i32_e64 v7, s[4:5], v7, v8 -; CGP-NEXT: v_add_i32_e64 v10, s[4:5], v10, v11 -; CGP-NEXT: v_add_i32_e64 v10, s[4:5], v10, v13 +; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v13 ; CGP-NEXT: v_mul_lo_u32 v11, v9, v12 -; CGP-NEXT: v_mul_lo_u32 v13, v5, v10 -; CGP-NEXT: v_mul_hi_u32 v8, v5, v12 +; CGP-NEXT: v_mul_lo_u32 v13, v6, v10 +; CGP-NEXT: v_mul_hi_u32 v14, v6, v12 ; CGP-NEXT: v_mul_hi_u32 v12, v9, v12 -; CGP-NEXT: v_xor_b32_e32 v2, v2, v6 -; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v11, v13 -; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v8, s[4:5], v11, v8 -; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[4:5] -; CGP-NEXT: v_mul_lo_u32 v11, v9, v10 -; CGP-NEXT: 
v_add_i32_e64 v8, s[4:5], v13, v8 -; CGP-NEXT: v_mul_hi_u32 v13, v5, v10 -; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v11, v12 -; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v11, v13 +; CGP-NEXT: v_xor_b32_e32 v1, v1, v5 +; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v13 +; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v14 +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; CGP-NEXT: v_mul_lo_u32 v14, v9, v10 +; CGP-NEXT: v_add_i32_e32 v11, vcc, v13, v11 +; CGP-NEXT: v_mul_hi_u32 v13, v6, v10 +; CGP-NEXT: v_add_i32_e32 v12, vcc, v14, v12 +; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v13 +; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13 +; CGP-NEXT: v_mul_hi_u32 v10, v9, v10 +; CGP-NEXT: v_add_i32_e32 v11, vcc, v12, v11 +; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v12, vcc, v13, v12 +; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v12 +; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v11 +; CGP-NEXT: v_addc_u32_e64 v11, s[4:5], v9, v10, vcc +; CGP-NEXT: v_mul_lo_u32 v12, -1, v6 +; CGP-NEXT: v_mul_lo_u32 v13, v0, v11 +; CGP-NEXT: v_mul_lo_u32 v14, v0, v6 +; CGP-NEXT: v_mul_hi_u32 v0, v0, v6 +; CGP-NEXT: v_add_i32_e64 v9, s[4:5], v9, v10 +; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v12, v13 +; CGP-NEXT: v_add_i32_e64 v0, s[4:5], v12, v0 +; CGP-NEXT: v_mul_lo_u32 v12, v11, v14 +; CGP-NEXT: v_mul_lo_u32 v13, v6, v0 +; CGP-NEXT: v_mul_hi_u32 v10, v6, v14 +; CGP-NEXT: v_mul_hi_u32 v14, v11, v14 +; CGP-NEXT: v_xor_b32_e32 v2, v2, v8 +; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v12, v13 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5] +; CGP-NEXT: v_add_i32_e64 v10, s[4:5], v12, v10 +; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[4:5] +; CGP-NEXT: v_mul_lo_u32 v12, v11, v0 +; CGP-NEXT: v_add_i32_e64 v10, s[4:5], v13, v10 +; CGP-NEXT: v_mul_hi_u32 v13, v6, v0 +; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v12, v14 +; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5] ; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v12, v13 -; CGP-NEXT: v_mul_hi_u32 v9, v9, v10 -; CGP-NEXT: v_add_i32_e64 v8, s[4:5], v11, v8 -; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v10, s[4:5], v12, v11 -; CGP-NEXT: v_add_i32_e64 v9, s[4:5], v9, v10 -; CGP-NEXT: v_addc_u32_e32 v7, vcc, v7, v9, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v8 -; CGP-NEXT: v_xor_b32_e32 v3, v3, v6 -; CGP-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc -; CGP-NEXT: v_xor_b32_e32 v1, v1, v4 -; CGP-NEXT: v_mul_lo_u32 v8, v3, v5 -; CGP-NEXT: v_mul_lo_u32 v9, v2, v7 -; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v4 -; CGP-NEXT: v_subb_u32_e32 v1, vcc, v1, v4, vcc -; CGP-NEXT: v_mul_hi_u32 v4, v2, v5 -; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v9 -; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v8, v4 -; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v8, v3, v7 -; CGP-NEXT: v_mul_hi_u32 v5, v3, v5 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v9, v4 -; CGP-NEXT: v_mul_hi_u32 v9, v2, v7 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v8, v5 -; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v9 -; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v9 -; CGP-NEXT: v_mul_hi_u32 v7, v3, v7 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v5, v4 -; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v8, v5 +; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5] +; CGP-NEXT: v_add_i32_e64 v13, s[4:5], v14, v13 +; CGP-NEXT: 
v_mul_hi_u32 v0, v11, v0 +; CGP-NEXT: v_add_i32_e64 v10, s[4:5], v12, v10 +; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] +; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v13, v12 +; CGP-NEXT: v_add_i32_e64 v0, s[4:5], v0, v11 +; CGP-NEXT: v_addc_u32_e32 v0, vcc, v9, v0, vcc +; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v10 +; CGP-NEXT: v_xor_b32_e32 v3, v3, v8 +; CGP-NEXT: v_addc_u32_e32 v9, vcc, 0, v0, vcc +; CGP-NEXT: v_mul_lo_u32 v10, v3, v6 +; CGP-NEXT: v_mul_lo_u32 v11, v2, v9 +; CGP-NEXT: v_sub_i32_e32 v0, vcc, v1, v5 +; CGP-NEXT: v_subb_u32_e32 v1, vcc, v7, v5, vcc +; CGP-NEXT: v_mul_hi_u32 v5, v2, v6 +; CGP-NEXT: v_add_i32_e32 v7, vcc, v10, v11 +; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v5, vcc, v7, v5 -; CGP-NEXT: v_mul_lo_u32 v7, 0, v4 -; CGP-NEXT: v_mul_lo_u32 v8, s7, v5 -; CGP-NEXT: v_mul_hi_u32 v10, s7, v4 -; CGP-NEXT: v_mul_lo_u32 v9, s7, v4 -; CGP-NEXT: s_bfe_i32 s6, -1, 0x10000 -; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v8 +; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; CGP-NEXT: v_mul_lo_u32 v7, v3, v9 +; CGP-NEXT: v_mul_hi_u32 v6, v3, v6 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v10, v5 +; CGP-NEXT: v_mul_hi_u32 v10, v2, v9 +; CGP-NEXT: v_add_i32_e32 v6, vcc, v7, v6 +; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v10 +; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v10 -; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v9 -; CGP-NEXT: v_subb_u32_e64 v8, s[4:5], v3, v7, vcc +; CGP-NEXT: v_mul_hi_u32 v9, v3, v9 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v5 +; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v6, vcc, v7, v6 +; CGP-NEXT: v_add_i32_e32 v6, vcc, v9, v6 +; CGP-NEXT: v_mul_lo_u32 v7, 0, v5 +; CGP-NEXT: v_mul_lo_u32 v9, v4, v6 +; CGP-NEXT: v_mul_hi_u32 v11, v4, v5 +; CGP-NEXT: v_mul_lo_u32 v10, v4, v5 +; CGP-NEXT: s_bfe_i32 s6, -1, 0x10000 +; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v9 +; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v11 +; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v10 +; CGP-NEXT: v_subb_u32_e64 v9, s[4:5], v3, v7, vcc ; CGP-NEXT: v_sub_i32_e64 v3, s[4:5], v3, v7 ; CGP-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc -; CGP-NEXT: v_cmp_le_u32_e64 s[4:5], s7, v2 -; CGP-NEXT: v_subrev_i32_e32 v2, vcc, s7, v2 +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v4 +; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v4 ; CGP-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc ; CGP-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5] -; CGP-NEXT: v_mov_b32_e32 v9, s6 -; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v8 -; CGP-NEXT: v_add_i32_e32 v8, vcc, 1, v4 -; CGP-NEXT: v_cndmask_b32_e64 v7, v9, v7, s[4:5] -; CGP-NEXT: v_addc_u32_e32 v9, vcc, 0, v5, vcc +; CGP-NEXT: v_mov_b32_e32 v10, s6 +; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v9 +; CGP-NEXT: v_add_i32_e32 v9, vcc, 1, v5 +; CGP-NEXT: v_cndmask_b32_e64 v7, v10, v7, s[4:5] +; CGP-NEXT: v_addc_u32_e32 v10, vcc, 0, v6, vcc ; CGP-NEXT: s_bfe_i32 s4, -1, 0x10000 -; CGP-NEXT: v_cmp_le_u32_e32 vcc, s7, v2 +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v2, v4 ; CGP-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc -; CGP-NEXT: v_mov_b32_e32 v10, s4 +; CGP-NEXT: v_mov_b32_e32 v4, s4 ; CGP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 -; CGP-NEXT: v_cndmask_b32_e32 v2, v10, v2, vcc -; CGP-NEXT: v_add_i32_e32 v3, vcc, 1, v8 -; CGP-NEXT: v_addc_u32_e32 v10, vcc, 0, v9, vcc +; CGP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc +; CGP-NEXT: v_add_i32_e32 v3, vcc, 1, v9 +; CGP-NEXT: v_addc_u32_e32 v4, vcc, 0, v10, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; CGP-NEXT: v_cndmask_b32_e32 v2, v8, v3, vcc -; CGP-NEXT: v_cndmask_b32_e32 v3, v9, 
v10, vcc +; CGP-NEXT: v_cndmask_b32_e32 v2, v9, v3, vcc +; CGP-NEXT: v_cndmask_b32_e32 v3, v10, v4, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 -; CGP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc -; CGP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc -; CGP-NEXT: v_xor_b32_e32 v2, v2, v6 -; CGP-NEXT: v_xor_b32_e32 v3, v3, v6 -; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v6 -; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v6, vcc +; CGP-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc +; CGP-NEXT: v_cndmask_b32_e32 v3, v6, v3, vcc +; CGP-NEXT: v_xor_b32_e32 v2, v2, v8 +; CGP-NEXT: v_xor_b32_e32 v3, v3, v8 +; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v8 +; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v8, vcc ; CGP-NEXT: s_setpc_b64 s[30:31] %result = sdiv <2 x i64> %num, ret <2 x i64> %result @@ -2679,17 +2679,17 @@ ; CHECK-NEXT: v_mul_hi_u32 v1, v0, v1 ; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; CHECK-NEXT: v_mul_hi_u32 v0, v3, v0 -; CHECK-NEXT: v_mul_lo_u32 v1, v0, v5 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, 1, v0 -; CHECK-NEXT: v_sub_i32_e32 v1, vcc, v3, v1 -; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v1, v5 -; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; CHECK-NEXT: v_sub_i32_e64 v2, s[4:5], v1, v5 -; CHECK-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, 1, v0 -; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v1, v5 -; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; CHECK-NEXT: v_mov_b32_e32 v1, 0 +; CHECK-NEXT: v_mul_lo_u32 v2, v0, v5 +; CHECK-NEXT: v_add_i32_e32 v4, vcc, 1, v0 +; CHECK-NEXT: v_sub_i32_e32 v2, vcc, v3, v2 +; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v2, v5 +; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc +; CHECK-NEXT: v_sub_i32_e64 v3, s[4:5], v2, v5 +; CHECK-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc +; CHECK-NEXT: v_add_i32_e32 v3, vcc, 1, v0 +; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v2, v5 +; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; CHECK-NEXT: BB7_4: ; CHECK-NEXT: s_or_b64 exec, exec, s[6:7] ; CHECK-NEXT: s_setpc_b64 s[30:31] @@ -3167,17 +3167,17 @@ ; CGP-NEXT: v_mul_hi_u32 v1, v0, v1 ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; CGP-NEXT: v_mul_hi_u32 v0, v8, v0 -; CGP-NEXT: v_mul_lo_u32 v1, v0, v2 -; CGP-NEXT: v_add_i32_e32 v3, vcc, 1, v0 -; CGP-NEXT: v_sub_i32_e32 v1, vcc, v8, v1 -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v2 -; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; CGP-NEXT: v_sub_i32_e64 v3, s[4:5], v1, v2 -; CGP-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; CGP-NEXT: v_add_i32_e32 v3, vcc, 1, v0 -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v2 -; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; CGP-NEXT: v_mov_b32_e32 v1, 0 +; CGP-NEXT: v_mul_lo_u32 v3, v0, v2 +; CGP-NEXT: v_add_i32_e32 v4, vcc, 1, v0 +; CGP-NEXT: v_sub_i32_e32 v3, vcc, v8, v3 +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v3, v2 +; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc +; CGP-NEXT: v_sub_i32_e64 v4, s[4:5], v3, v2 +; CGP-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc +; CGP-NEXT: v_add_i32_e32 v4, vcc, 1, v0 +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v3, v2 +; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc ; CGP-NEXT: BB8_4: ; CGP-NEXT: s_or_b64 exec, exec, s[6:7] ; CGP-NEXT: v_or_b32_e32 v3, v7, v11 @@ -3346,17 +3346,17 @@ ; CGP-NEXT: v_mul_hi_u32 v3, v2, v3 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; CGP-NEXT: v_mul_hi_u32 v2, v5, v2 -; CGP-NEXT: v_mul_lo_u32 v3, v2, v10 -; CGP-NEXT: v_add_i32_e32 v4, vcc, 1, v2 -; CGP-NEXT: v_sub_i32_e32 v3, vcc, v5, v3 -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v3, v10 -; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; CGP-NEXT: v_sub_i32_e64 v4, s[4:5], v3, v10 -; CGP-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc -; CGP-NEXT: v_add_i32_e32 v4, 
vcc, 1, v2 -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v3, v10 -; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; CGP-NEXT: v_mov_b32_e32 v3, 0 +; CGP-NEXT: v_mul_lo_u32 v4, v2, v10 +; CGP-NEXT: v_add_i32_e32 v6, vcc, 1, v2 +; CGP-NEXT: v_sub_i32_e32 v4, vcc, v5, v4 +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v4, v10 +; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc +; CGP-NEXT: v_sub_i32_e64 v5, s[4:5], v4, v10 +; CGP-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc +; CGP-NEXT: v_add_i32_e32 v5, vcc, 1, v2 +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v4, v10 +; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc ; CGP-NEXT: BB8_8: ; CGP-NEXT: s_or_b64 exec, exec, s[6:7] ; CGP-NEXT: s_setpc_b64 s[30:31] Index: llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll @@ -421,7 +421,7 @@ ; GFX9-NEXT: v_mul_lo_u32 v3, s10, v1 ; GFX9-NEXT: v_mul_hi_u32 v5, s10, v0 ; GFX9-NEXT: v_mul_hi_u32 v0, s11, v0 -; GFX9-NEXT: v_mov_b32_e32 v4, s9 +; GFX9-NEXT: v_mov_b32_e32 v6, s9 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v3 ; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v5 @@ -442,7 +442,7 @@ ; GFX9-NEXT: v_mul_lo_u32 v3, s8, v1 ; GFX9-NEXT: v_mul_hi_u32 v5, s8, v0 ; GFX9-NEXT: v_mul_lo_u32 v7, s8, v0 -; GFX9-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: v_add3_u32 v2, v2, v3, v5 ; GFX9-NEXT: v_sub_co_u32_e32 v3, vcc, s10, v7 ; GFX9-NEXT: v_subb_co_u32_e64 v5, s[0:1], v8, v2, vcc @@ -452,7 +452,7 @@ ; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v3 ; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[0:1] ; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v5 -; GFX9-NEXT: v_subb_co_u32_e32 v2, vcc, v2, v4, vcc +; GFX9-NEXT: v_subb_co_u32_e32 v2, vcc, v2, v6, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v7, v7, v8, s[0:1] ; GFX9-NEXT: v_subrev_co_u32_e32 v8, vcc, s8, v3 ; GFX9-NEXT: v_subbrev_co_u32_e64 v9, s[0:1], 0, v2, vcc @@ -461,10 +461,10 @@ ; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v9 ; GFX9-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[0:1] ; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v8 -; GFX9-NEXT: v_subb_co_u32_e32 v2, vcc, v2, v4, vcc +; GFX9-NEXT: v_subb_co_u32_e32 v2, vcc, v2, v6, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[0:1] ; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v9 -; GFX9-NEXT: v_subrev_co_u32_e32 v4, vcc, s8, v8 +; GFX9-NEXT: v_subrev_co_u32_e32 v6, vcc, s8, v8 ; GFX9-NEXT: v_cndmask_b32_e64 v12, v12, v13, s[0:1] ; GFX9-NEXT: v_add_co_u32_e64 v13, s[0:1], 1, v10 ; GFX9-NEXT: v_subbrev_co_u32_e32 v2, vcc, 0, v2, vcc @@ -473,25 +473,25 @@ ; GFX9-NEXT: v_cndmask_b32_e32 v10, v10, v13, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v11, v11, v14, vcc ; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v7 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v6, v8, v6, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v2, v9, v2, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v10, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v11, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v4, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v6, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e64 v2, v5, v2, s[0:1] ; GFX9-NEXT: s_xor_b64 s[0:1], s[2:3], s[12:13] ; GFX9-NEXT: v_xor_b32_e32 v0, s0, v0 ; GFX9-NEXT: v_xor_b32_e32 v1, s1, v1 -; GFX9-NEXT: v_mov_b32_e32 v4, s1 +; GFX9-NEXT: v_mov_b32_e32 v5, s1 ; GFX9-NEXT: v_subrev_co_u32_e32 v0, vcc, s0, v0 -; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v4, vcc +; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v5, vcc ; GFX9-NEXT: v_xor_b32_e32 v3, 
s2, v3 -; GFX9-NEXT: v_xor_b32_e32 v4, s2, v2 -; GFX9-NEXT: v_mov_b32_e32 v5, s2 +; GFX9-NEXT: v_xor_b32_e32 v5, s2, v2 +; GFX9-NEXT: v_mov_b32_e32 v6, s2 ; GFX9-NEXT: v_subrev_co_u32_e32 v2, vcc, s2, v3 -; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v4, v5, vcc -; GFX9-NEXT: global_store_dwordx2 v6, v[0:1], s[4:5] -; GFX9-NEXT: global_store_dwordx2 v6, v[2:3], s[6:7] +; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v5, v6, vcc +; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[4:5] +; GFX9-NEXT: global_store_dwordx2 v4, v[2:3], s[6:7] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: sdivrem_i64: @@ -2579,17 +2579,17 @@ ; GFX8-NEXT: v_xor_b32_e32 v1, s0, v1 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX8-NEXT: v_subrev_u32_e32 v1, vcc, s0, v1 -; GFX8-NEXT: s_movk_i32 s0, 0xff -; GFX8-NEXT: v_and_b32_e32 v1, s0, v1 +; GFX8-NEXT: v_mov_b32_e32 v4, 0xff +; GFX8-NEXT: v_and_b32_e32 v1, v1, v4 ; GFX8-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX8-NEXT: v_xor_b32_e32 v3, s2, v3 -; GFX8-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v5, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, s2, v3 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 -; GFX8-NEXT: flat_store_short v[0:1], v4 -; GFX8-NEXT: v_and_b32_e32 v0, s0, v3 +; GFX8-NEXT: flat_store_short v[0:1], v5 +; GFX8-NEXT: v_and_b32_e32 v0, v3, v4 ; GFX8-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; GFX8-NEXT: v_or_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX8-NEXT: v_mov_b32_e32 v0, s6 @@ -2661,12 +2661,12 @@ ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s5, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc ; GFX9-NEXT: s_xor_b32 s4, s11, s9 -; GFX9-NEXT: v_xor_b32_e32 v1, s4, v1 ; GFX9-NEXT: v_subrev_u32_e32 v4, s5, v3 -; GFX9-NEXT: v_subrev_u32_e32 v1, s4, v1 -; GFX9-NEXT: s_movk_i32 s4, 0xff +; GFX9-NEXT: v_xor_b32_e32 v1, s4, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc -; GFX9-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX9-NEXT: v_subrev_u32_e32 v1, s4, v1 +; GFX9-NEXT: v_mov_b32_e32 v4, 0xff +; GFX9-NEXT: v_and_b32_e32 v1, v1, v4 ; GFX9-NEXT: v_subrev_u32_e32 v0, s6, v0 ; GFX9-NEXT: v_xor_b32_e32 v3, s11, v3 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 @@ -2676,7 +2676,7 @@ ; GFX9-NEXT: v_xor_b32_e32 v2, s10, v2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_short v1, v0, s[0:1] -; GFX9-NEXT: v_and_b32_e32 v0, s4, v3 +; GFX9-NEXT: v_and_b32_e32 v0, v3, v4 ; GFX9-NEXT: v_subrev_u32_e32 v2, s10, v2 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; GFX9-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -2688,17 +2688,16 @@ ; GFX10-NEXT: s_load_dword s0, s[4:5], 0x10 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_bfe_i32 s1, s0, 0x80018 -; GFX10-NEXT: s_bfe_i32 s2, s0, 0x80010 -; GFX10-NEXT: s_ashr_i32 s3, s1, 31 -; GFX10-NEXT: s_ashr_i32 s8, s2, 31 -; GFX10-NEXT: s_add_i32 s1, s1, s3 -; GFX10-NEXT: s_add_i32 s2, s2, s8 -; GFX10-NEXT: s_xor_b32 s1, s1, s3 -; GFX10-NEXT: s_xor_b32 s2, s2, s8 +; GFX10-NEXT: s_bfe_i32 s3, s0, 0x80010 +; GFX10-NEXT: s_ashr_i32 s2, s1, 31 +; GFX10-NEXT: s_ashr_i32 s8, s3, 31 +; GFX10-NEXT: s_add_i32 s1, s1, s2 +; GFX10-NEXT: s_add_i32 s3, s3, s8 +; GFX10-NEXT: s_xor_b32 s1, s1, s2 +; GFX10-NEXT: s_xor_b32 s3, s3, s8 ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s1 -; GFX10-NEXT: v_cvt_f32_u32_e32 v1, s2 +; GFX10-NEXT: v_cvt_f32_u32_e32 v1, s3 ; GFX10-NEXT: s_sub_i32 s6, 0, s1 
-; GFX10-NEXT: s_sub_i32 s7, 0, s2 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v1, v1 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 @@ -2706,59 +2705,60 @@ ; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX10-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX10-NEXT: v_mul_lo_u32 v2, s6, v0 -; GFX10-NEXT: v_mul_lo_u32 v3, s7, v1 -; GFX10-NEXT: s_sext_i32_i8 s6, s0 -; GFX10-NEXT: s_bfe_i32 s0, s0, 0x80008 +; GFX10-NEXT: s_sub_i32 s6, 0, s3 +; GFX10-NEXT: v_mul_lo_u32 v3, s6, v1 +; GFX10-NEXT: s_bfe_i32 s6, s0, 0x80008 +; GFX10-NEXT: s_sext_i32_i8 s0, s0 ; GFX10-NEXT: s_ashr_i32 s9, s6, 31 ; GFX10-NEXT: s_ashr_i32 s10, s0, 31 +; GFX10-NEXT: v_mul_hi_u32 v2, v0, v2 ; GFX10-NEXT: s_add_i32 s6, s6, s9 ; GFX10-NEXT: s_add_i32 s0, s0, s10 -; GFX10-NEXT: v_mul_hi_u32 v2, v0, v2 ; GFX10-NEXT: v_mul_hi_u32 v3, v1, v3 -; GFX10-NEXT: s_xor_b32 s0, s0, s10 ; GFX10-NEXT: s_xor_b32 s6, s6, s9 +; GFX10-NEXT: s_xor_b32 s0, s0, s10 ; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v2 ; GFX10-NEXT: v_add_nc_u32_e32 v1, v1, v3 -; GFX10-NEXT: v_mul_hi_u32 v0, s0, v0 -; GFX10-NEXT: v_mul_hi_u32 v1, s6, v1 +; GFX10-NEXT: v_mul_hi_u32 v0, s6, v0 +; GFX10-NEXT: v_mul_hi_u32 v1, s0, v1 ; GFX10-NEXT: v_mul_lo_u32 v2, v0, s1 -; GFX10-NEXT: v_mul_lo_u32 v3, v1, s2 -; GFX10-NEXT: v_add_nc_u32_e32 v5, 1, v0 -; GFX10-NEXT: v_add_nc_u32_e32 v4, 1, v1 -; GFX10-NEXT: v_sub_nc_u32_e32 v2, s0, v2 -; GFX10-NEXT: v_sub_nc_u32_e32 v3, s6, v3 +; GFX10-NEXT: v_add_nc_u32_e32 v4, 1, v0 +; GFX10-NEXT: v_mul_lo_u32 v3, v1, s3 +; GFX10-NEXT: v_add_nc_u32_e32 v6, 1, v1 +; GFX10-NEXT: v_sub_nc_u32_e32 v2, s6, v2 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 +; GFX10-NEXT: v_sub_nc_u32_e32 v3, s0, v3 +; GFX10-NEXT: v_subrev_nc_u32_e32 v5, s1, v2 ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s1, v2 -; GFX10-NEXT: v_subrev_nc_u32_e32 v6, s1, v2 -; GFX10-NEXT: v_subrev_nc_u32_e32 v7, s2, v3 -; GFX10-NEXT: v_cmp_le_u32_e64 s0, s2, v3 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v4, s0 +; GFX10-NEXT: v_cmp_le_u32_e64 s0, s3, v3 +; GFX10-NEXT: v_subrev_nc_u32_e32 v7, s3, v3 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v6, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v7, s0 -; GFX10-NEXT: v_add_nc_u32_e32 v5, 1, v0 +; GFX10-NEXT: v_add_nc_u32_e32 v4, 1, v0 ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s1, v2 -; GFX10-NEXT: v_subrev_nc_u32_e32 v6, s1, v2 -; GFX10-NEXT: v_add_nc_u32_e32 v4, 1, v1 -; GFX10-NEXT: v_cmp_le_u32_e64 s0, s2, v3 -; GFX10-NEXT: v_subrev_nc_u32_e32 v7, s2, v3 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc_lo -; GFX10-NEXT: s_xor_b32 s1, s10, s3 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v4, s0 +; GFX10-NEXT: v_subrev_nc_u32_e32 v5, s1, v2 +; GFX10-NEXT: v_add_nc_u32_e32 v6, 1, v1 +; GFX10-NEXT: v_cmp_le_u32_e64 s0, s3, v3 +; GFX10-NEXT: v_subrev_nc_u32_e32 v7, s3, v3 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc_lo +; GFX10-NEXT: s_xor_b32 s1, s9, s2 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v6, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v7, s0 ; GFX10-NEXT: v_xor_b32_e32 v0, s1, v0 -; GFX10-NEXT: v_xor_b32_e32 v2, s10, v2 -; GFX10-NEXT: s_xor_b32 s0, s9, s8 +; GFX10-NEXT: v_xor_b32_e32 v2, s9, v2 +; GFX10-NEXT: s_xor_b32 s0, s10, s8 +; GFX10-NEXT: v_mov_b32_e32 v4, 0xff ; GFX10-NEXT: v_xor_b32_e32 v1, s0, v1 ; GFX10-NEXT: 
v_subrev_nc_u32_e32 v0, s1, v0 -; GFX10-NEXT: v_xor_b32_e32 v3, s9, v3 -; GFX10-NEXT: v_subrev_nc_u32_e32 v2, s10, v2 -; GFX10-NEXT: s_movk_i32 s1, 0xff +; GFX10-NEXT: v_xor_b32_e32 v3, s10, v3 +; GFX10-NEXT: v_subrev_nc_u32_e32 v2, s9, v2 ; GFX10-NEXT: v_subrev_nc_u32_e32 v1, s0, v1 -; GFX10-NEXT: v_and_b32_sdwa v0, v0, s1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s9, v3 -; GFX10-NEXT: v_and_b32_sdwa v2, v2, s1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_and_b32_sdwa v0, v0, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s10, v3 +; GFX10-NEXT: v_and_b32_sdwa v2, v2, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX10-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -2992,19 +2992,19 @@ ; GFX8-NEXT: v_xor_b32_e32 v1, s0, v1 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX8-NEXT: v_subrev_u32_e32 v1, vcc, s0, v1 -; GFX8-NEXT: s_mov_b32 s0, 0xffff +; GFX8-NEXT: v_mov_b32_e32 v4, 0xffff ; GFX8-NEXT: v_xor_b32_e32 v3, s2, v3 -; GFX8-NEXT: v_and_b32_e32 v1, s0, v1 +; GFX8-NEXT: v_and_b32_e32 v1, v1, v4 ; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, s2, v3 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX8-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_e32 v0, s0, v3 +; GFX8-NEXT: v_or_b32_sdwa v5, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_e32 v0, v3, v4 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: v_or_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 -; GFX8-NEXT: flat_store_dword v[0:1], v4 +; GFX8-NEXT: flat_store_dword v[0:1], v5 ; GFX8-NEXT: v_mov_b32_e32 v0, s6 ; GFX8-NEXT: v_mov_b32_e32 v1, s7 ; GFX8-NEXT: flat_store_dword v[0:1], v2 @@ -3341,7 +3341,7 @@ ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x10 ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 -; GFX8-NEXT: s_mov_b32 s9, 0x7ffffff +; GFX8-NEXT: v_mov_b32_e32 v2, 0x7ffffff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_bfe_i32 s1, s1, 0x1b0000 ; GFX8-NEXT: s_ashr_i32 s2, s1, 31 @@ -3362,27 +3362,27 @@ ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 ; GFX8-NEXT: v_mul_hi_u32 v0, s0, v0 ; GFX8-NEXT: v_mul_lo_u32 v1, v0, s3 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, 1, v0 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 1, v0 ; GFX8-NEXT: v_sub_u32_e32 v1, vcc, s0, v1 ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s3, v1 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX8-NEXT: v_subrev_u32_e64 v2, s[0:1], s3, v1 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GFX8-NEXT: v_add_u32_e32 v2, vcc, 1, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc +; GFX8-NEXT: v_subrev_u32_e64 v3, s[0:1], s3, v1 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 1, v0 ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s3, v1 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX8-NEXT: v_subrev_u32_e64 v2, s[0:1], s3, v1 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc +; GFX8-NEXT: v_subrev_u32_e64 v3, s[0:1], s3, v1 +; GFX8-NEXT: v_cndmask_b32_e32 
v1, v1, v3, vcc ; GFX8-NEXT: v_xor_b32_e32 v0, s2, v0 ; GFX8-NEXT: v_subrev_u32_e32 v0, vcc, s2, v0 ; GFX8-NEXT: v_xor_b32_e32 v1, s8, v1 -; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, s8, v1 -; GFX8-NEXT: v_and_b32_e32 v3, s9, v0 +; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, s8, v1 +; GFX8-NEXT: v_and_b32_e32 v4, v0, v2 ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 -; GFX8-NEXT: flat_store_dword v[0:1], v3 +; GFX8-NEXT: flat_store_dword v[0:1], v4 ; GFX8-NEXT: v_mov_b32_e32 v0, s6 -; GFX8-NEXT: v_and_b32_e32 v2, s9, v2 +; GFX8-NEXT: v_and_b32_e32 v2, v3, v2 ; GFX8-NEXT: v_mov_b32_e32 v1, s7 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm @@ -3390,7 +3390,7 @@ ; GFX9-LABEL: sdivrem_i27: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x10 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_bfe_i32 s1, s1, 0x1b0000 ; GFX9-NEXT: s_ashr_i32 s6, s1, 31 @@ -3407,32 +3407,32 @@ ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_lo_u32 v1, s1, v0 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: s_xor_b32 s5, s8, s6 -; GFX9-NEXT: s_mov_b32 s4, 0x7ffffff +; GFX9-NEXT: s_xor_b32 s4, s8, s6 ; GFX9-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v1 ; GFX9-NEXT: v_mul_hi_u32 v0, s9, v0 -; GFX9-NEXT: v_mul_lo_u32 v1, v0, s7 -; GFX9-NEXT: v_add_u32_e32 v3, 1, v0 -; GFX9-NEXT: v_sub_u32_e32 v1, s9, v1 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s7, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX9-NEXT: v_subrev_u32_e32 v3, s7, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GFX9-NEXT: v_add_u32_e32 v3, 1, v0 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s7, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX9-NEXT: v_subrev_u32_e32 v3, s7, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GFX9-NEXT: v_xor_b32_e32 v0, s5, v0 -; GFX9-NEXT: v_subrev_u32_e32 v0, s5, v0 -; GFX9-NEXT: v_xor_b32_e32 v1, s8, v1 -; GFX9-NEXT: v_subrev_u32_e32 v1, s8, v1 -; GFX9-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, 0x7ffffff +; GFX9-NEXT: v_mul_lo_u32 v2, v0, s7 +; GFX9-NEXT: v_add_u32_e32 v4, 1, v0 +; GFX9-NEXT: v_sub_u32_e32 v2, s9, v2 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s7, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc +; GFX9-NEXT: v_subrev_u32_e32 v4, s7, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; GFX9-NEXT: v_add_u32_e32 v4, 1, v0 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s7, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc +; GFX9-NEXT: v_subrev_u32_e32 v4, s7, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; GFX9-NEXT: v_xor_b32_e32 v0, s4, v0 +; GFX9-NEXT: v_subrev_u32_e32 v0, s4, v0 +; GFX9-NEXT: v_xor_b32_e32 v2, s8, v2 +; GFX9-NEXT: v_subrev_u32_e32 v2, s8, v2 +; GFX9-NEXT: v_and_b32_e32 v0, v0, v1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_store_dword v2, v0, s[0:1] -; GFX9-NEXT: v_and_b32_e32 v0, s4, v1 -; GFX9-NEXT: global_store_dword v2, v0, s[2:3] +; GFX9-NEXT: global_store_dword v3, v0, s[0:1] +; GFX9-NEXT: v_and_b32_e32 v0, v2, v1 +; GFX9-NEXT: global_store_dword v3, v0, s[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: sdivrem_i27: @@ -3470,17 +3470,17 @@ ; GFX10-NEXT: s_xor_b32 s4, s7, s6 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo -; GFX10-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-NEXT: v_mov_b32_e32 v2, 0x7ffffff +; GFX10-NEXT: v_mov_b32_e32 v3, 0 ; GFX10-NEXT: v_xor_b32_e32 v0, s4, v0 ; GFX10-NEXT: v_xor_b32_e32 v1, s7, v1 ; GFX10-NEXT: v_subrev_nc_u32_e32 v0, s4, 
v0
 ; GFX10-NEXT: v_subrev_nc_u32_e32 v1, s7, v1
-; GFX10-NEXT: s_mov_b32 s4, 0x7ffffff
-; GFX10-NEXT: v_and_b32_e32 v0, s4, v0
-; GFX10-NEXT: v_and_b32_e32 v1, s4, v1
+; GFX10-NEXT: v_and_b32_e32 v0, v0, v2
+; GFX10-NEXT: v_and_b32_e32 v1, v1, v2
 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_store_dword v2, v0, s[0:1]
-; GFX10-NEXT: global_store_dword v2, v1, s[2:3]
+; GFX10-NEXT: global_store_dword v3, v0, s[0:1]
+; GFX10-NEXT: global_store_dword v3, v1, s[2:3]
 ; GFX10-NEXT: s_endpgm
  %div = sdiv i27 %x, %y
  store i27 %div, i27 addrspace(1)* %out0
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll
@@ -685,20 +685,20 @@
 ; GFX9-LABEL: v_shl_v2i32_zext_v2i16:
 ; GFX9: ; %bb.0:
 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: s_mov_b32 s4, 2
 ; GFX9-NEXT: v_and_b32_e32 v1, 0x3fff3fff, v0
-; GFX9-NEXT: v_lshlrev_b32_sdwa v0, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; GFX9-NEXT: v_lshlrev_b32_sdwa v1, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT: v_mov_b32_e32 v2, 2
+; GFX9-NEXT: v_lshlrev_b32_sdwa v0, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX9-NEXT: v_lshlrev_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_shl_v2i32_zext_v2i16:
 ; GFX10: ; %bb.0:
 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: v_and_b32_e32 v1, 0x3fff3fff, v0
-; GFX10-NEXT: s_mov_b32 s4, 2
-; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX10-NEXT: v_mov_b32_e32 v1, 2
+; GFX10-NEXT: v_and_b32_e32 v2, 0x3fff3fff, v0
+; GFX10-NEXT: v_lshlrev_b32_sdwa v0, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX10-NEXT: v_lshlrev_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
  %and = and <2 x i16> %x, <i16 16383, i16 16383>
  %ext = zext <2 x i16> %and to <2 x i32>
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll
@@ -730,10 +730,10 @@
 ; GFX6-LABEL: v_shl_v2i16:
 ; GFX6: ; %bb.0:
 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT: s_mov_b32 s4, 0xffff
-; GFX6-NEXT: v_and_b32_e32 v2, s4, v2
+; GFX6-NEXT: v_mov_b32_e32 v4, 0xffff
+; GFX6-NEXT: v_and_b32_e32 v2, v2, v4
 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, v2, v0
-; GFX6-NEXT: v_and_b32_e32 v2, s4, v3
+; GFX6-NEXT: v_and_b32_e32 v2, v3, v4
 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, v2, v1
 ; GFX6-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -847,13 +847,13 @@
 define amdgpu_ps float @shl_v2i16_sv(<2 x i16> inreg %value, <2 x i16> %amount) {
 ; GFX6-LABEL: shl_v2i16_sv:
 ; GFX6: ; %bb.0:
-; GFX6-NEXT: s_mov_b32 s2, 0xffff
-; GFX6-NEXT: v_and_b32_e32 v1, s2, v1
-; GFX6-NEXT: v_and_b32_e32 v0, s2, v0
+; GFX6-NEXT: v_mov_b32_e32 v2, 0xffff
+; GFX6-NEXT: v_and_b32_e32 v1, v1, v2
+; GFX6-NEXT: v_and_b32_e32 v0, v0, v2
 ; GFX6-NEXT: v_lshl_b32_e32 v1, s1, v1
 ; GFX6-NEXT: v_lshl_b32_e32 v0, s0, v0
-; GFX6-NEXT:
v_and_b32_e32 v1, s2, v1 -; GFX6-NEXT: v_and_b32_e32 v0, s2, v0 +; GFX6-NEXT: v_and_b32_e32 v1, v1, v2 +; GFX6-NEXT: v_and_b32_e32 v0, v0, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: ; return to shader part epilog @@ -933,21 +933,21 @@ ; GFX6-LABEL: v_shl_v4i16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s4, 0xffff -; GFX6-NEXT: v_and_b32_e32 v4, s4, v4 +; GFX6-NEXT: v_mov_b32_e32 v8, 0xffff +; GFX6-NEXT: v_and_b32_e32 v4, v4, v8 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, v4, v0 -; GFX6-NEXT: v_and_b32_e32 v4, s4, v5 +; GFX6-NEXT: v_and_b32_e32 v4, v5, v8 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, v4, v1 -; GFX6-NEXT: v_and_b32_e32 v4, s4, v6 +; GFX6-NEXT: v_and_b32_e32 v4, v6, v8 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX6-NEXT: v_and_b32_e32 v4, s4, v7 -; GFX6-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX6-NEXT: v_and_b32_e32 v4, v7, v8 +; GFX6-NEXT: v_and_b32_e32 v1, v1, v8 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, v4, v3 -; GFX6-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX6-NEXT: v_and_b32_e32 v0, v0, v8 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX6-NEXT: v_and_b32_e32 v1, s4, v2 -; GFX6-NEXT: v_and_b32_e32 v2, s4, v3 +; GFX6-NEXT: v_and_b32_e32 v1, v2, v8 +; GFX6-NEXT: v_and_b32_e32 v2, v3, v8 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -1086,26 +1086,25 @@ ; GFX6-LABEL: v_shl_v8i16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s4, 0xffff -; GFX6-NEXT: v_and_b32_e32 v8, s4, v8 +; GFX6-NEXT: v_mov_b32_e32 v16, 0xffff +; GFX6-NEXT: v_and_b32_e32 v8, v8, v16 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, v8, v0 -; GFX6-NEXT: v_and_b32_e32 v8, s4, v9 +; GFX6-NEXT: v_and_b32_e32 v8, v9, v16 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, v8, v1 -; GFX6-NEXT: v_and_b32_e32 v8, s4, v10 +; GFX6-NEXT: v_and_b32_e32 v8, v10, v16 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, v8, v2 -; GFX6-NEXT: v_and_b32_e32 v8, s4, v11 +; GFX6-NEXT: v_and_b32_e32 v8, v11, v16 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, v8, v3 -; GFX6-NEXT: v_and_b32_e32 v8, s4, v12 +; GFX6-NEXT: v_and_b32_e32 v8, v12, v16 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, v8, v4 -; GFX6-NEXT: v_and_b32_e32 v8, s4, v13 -; GFX6-NEXT: v_and_b32_e32 v1, s4, v1 -; GFX6-NEXT: v_mov_b32_e32 v16, 0xffff +; GFX6-NEXT: v_and_b32_e32 v8, v13, v16 +; GFX6-NEXT: v_and_b32_e32 v1, v1, v16 ; GFX6-NEXT: v_lshlrev_b32_e32 v5, v8, v5 -; GFX6-NEXT: v_and_b32_e32 v8, s4, v14 -; GFX6-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX6-NEXT: v_and_b32_e32 v8, v14, v16 +; GFX6-NEXT: v_and_b32_e32 v0, v0, v16 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_lshlrev_b32_e32 v6, v8, v6 -; GFX6-NEXT: v_and_b32_e32 v8, s4, v15 +; GFX6-NEXT: v_and_b32_e32 v8, v15, v16 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: v_and_b32_e32 v1, v2, v16 ; GFX6-NEXT: v_and_b32_e32 v2, v3, v16 Index: llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i32.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i32.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i32.ll @@ -270,28 +270,28 @@ ; CHECK-LABEL: v_srem_i32_pow2k_denom: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_movk_i32 s4, 0x1000 -; CHECK-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; CHECK-NEXT: v_rcp_iflag_f32_e32 v2, 0x45800000 -; CHECK-NEXT: v_mov_b32_e32 v3, 0xfffff000 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; CHECK-NEXT: v_mul_f32_e32 v2, 
0x4f7ffffe, v2 -; CHECK-NEXT: v_xor_b32_e32 v0, v0, v1 -; CHECK-NEXT: v_cvt_u32_f32_e32 v2, v2 -; CHECK-NEXT: v_mul_lo_u32 v3, v3, v2 -; CHECK-NEXT: v_mul_hi_u32 v3, v2, v3 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3 -; CHECK-NEXT: v_mul_hi_u32 v2, v0, v2 -; CHECK-NEXT: v_lshlrev_b32_e32 v2, 12, v2 +; CHECK-NEXT: v_mov_b32_e32 v1, 0x1000 +; CHECK-NEXT: v_ashrrev_i32_e32 v2, 31, v0 +; CHECK-NEXT: v_rcp_iflag_f32_e32 v3, 0x45800000 +; CHECK-NEXT: v_mov_b32_e32 v4, 0xfffff000 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; CHECK-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 +; CHECK-NEXT: v_xor_b32_e32 v0, v0, v2 +; CHECK-NEXT: v_cvt_u32_f32_e32 v3, v3 +; CHECK-NEXT: v_mul_lo_u32 v4, v4, v3 +; CHECK-NEXT: v_mul_hi_u32 v4, v3, v4 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v4 +; CHECK-NEXT: v_mul_hi_u32 v3, v0, v3 +; CHECK-NEXT: v_lshlrev_b32_e32 v3, 12, v3 +; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v3 +; CHECK-NEXT: v_sub_i32_e32 v3, vcc, v0, v1 +; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v0, v1 +; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc +; CHECK-NEXT: v_sub_i32_e32 v3, vcc, v0, v1 +; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v0, v1 +; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc +; CHECK-NEXT: v_xor_b32_e32 v0, v0, v2 ; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 -; CHECK-NEXT: v_subrev_i32_e32 v2, vcc, s4, v0 -; CHECK-NEXT: v_cmp_le_u32_e32 vcc, s4, v0 -; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; CHECK-NEXT: v_subrev_i32_e32 v2, vcc, s4, v0 -; CHECK-NEXT: v_cmp_le_u32_e32 vcc, s4, v0 -; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; CHECK-NEXT: v_xor_b32_e32 v0, v0, v1 -; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v1 ; CHECK-NEXT: s_setpc_b64 s[30:31] %result = srem i32 %num, 4096 ret i32 %result @@ -348,49 +348,47 @@ ; CGP-LABEL: v_srem_v2i32_pow2k_denom: ; CGP: ; %bb.0: ; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CGP-NEXT: s_movk_i32 s4, 0x1000 -; CGP-NEXT: v_ashrrev_i32_e32 v2, 31, v0 -; CGP-NEXT: v_rcp_iflag_f32_e32 v3, 0x45800000 -; CGP-NEXT: s_movk_i32 s5, 0xf000 -; CGP-NEXT: v_mov_b32_e32 v4, 0xfffff000 -; CGP-NEXT: v_mov_b32_e32 v5, 0x1000 +; CGP-NEXT: v_mov_b32_e32 v2, 0x1000 +; CGP-NEXT: v_ashrrev_i32_e32 v3, 31, v0 +; CGP-NEXT: v_rcp_iflag_f32_e32 v4, 0x45800000 +; CGP-NEXT: v_mov_b32_e32 v5, 0xfffff000 ; CGP-NEXT: v_ashrrev_i32_e32 v6, 31, v1 ; CGP-NEXT: v_rcp_iflag_f32_e32 v7, 0x45800000 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; CGP-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v3 +; CGP-NEXT: v_mul_f32_e32 v4, 0x4f7ffffe, v4 ; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v6 ; CGP-NEXT: v_mul_f32_e32 v7, 0x4f7ffffe, v7 -; CGP-NEXT: v_xor_b32_e32 v0, v0, v2 -; CGP-NEXT: v_cvt_u32_f32_e32 v3, v3 +; CGP-NEXT: v_xor_b32_e32 v0, v0, v3 +; CGP-NEXT: v_cvt_u32_f32_e32 v4, v4 ; CGP-NEXT: v_xor_b32_e32 v1, v1, v6 ; CGP-NEXT: v_cvt_u32_f32_e32 v7, v7 -; CGP-NEXT: v_mul_lo_u32 v8, s5, v3 -; CGP-NEXT: v_mul_lo_u32 v4, v4, v7 -; CGP-NEXT: v_mul_hi_u32 v8, v3, v8 -; CGP-NEXT: v_mul_hi_u32 v4, v7, v4 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v8 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v7, v4 -; CGP-NEXT: v_mul_hi_u32 v3, v0, v3 -; CGP-NEXT: v_mul_hi_u32 v4, v1, v4 -; CGP-NEXT: v_lshlrev_b32_e32 v3, 12, v3 +; CGP-NEXT: v_mul_lo_u32 v8, v5, v4 +; CGP-NEXT: v_mul_lo_u32 v5, v5, v7 +; CGP-NEXT: v_mul_hi_u32 v8, v4, v8 +; CGP-NEXT: v_mul_hi_u32 v5, v7, v5 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v8 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v7, v5 +; CGP-NEXT: v_mul_hi_u32 v4, v0, v4 +; CGP-NEXT: v_mul_hi_u32 v5, v1, v5 ; CGP-NEXT: v_lshlrev_b32_e32 v4, 12, v4 -; CGP-NEXT: v_sub_i32_e32 
v0, vcc, v0, v3
-; CGP-NEXT: v_sub_i32_e32 v1, vcc, v1, v4
-; CGP-NEXT: v_subrev_i32_e32 v3, vcc, s4, v0
-; CGP-NEXT: v_sub_i32_e32 v4, vcc, v1, v5
-; CGP-NEXT: v_cmp_le_u32_e32 vcc, s4, v0
-; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
-; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v5
-; CGP-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc
-; CGP-NEXT: v_subrev_i32_e32 v3, vcc, s4, v0
-; CGP-NEXT: v_sub_i32_e32 v4, vcc, v1, v5
-; CGP-NEXT: v_cmp_le_u32_e32 vcc, s4, v0
-; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
-; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v5
-; CGP-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc
-; CGP-NEXT: v_xor_b32_e32 v0, v0, v2
+; CGP-NEXT: v_lshlrev_b32_e32 v5, 12, v5
+; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v4
+; CGP-NEXT: v_sub_i32_e32 v1, vcc, v1, v5
+; CGP-NEXT: v_sub_i32_e32 v4, vcc, v0, v2
+; CGP-NEXT: v_sub_i32_e32 v5, vcc, v1, v2
+; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2
+; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
+; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v2
+; CGP-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc
+; CGP-NEXT: v_sub_i32_e32 v4, vcc, v0, v2
+; CGP-NEXT: v_sub_i32_e32 v5, vcc, v1, v2
+; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2
+; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
+; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v2
+; CGP-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc
+; CGP-NEXT: v_xor_b32_e32 v0, v0, v3
 ; CGP-NEXT: v_xor_b32_e32 v1, v1, v6
-; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v2
+; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v3
 ; CGP-NEXT: v_sub_i32_e32 v1, vcc, v1, v6
 ; CGP-NEXT: s_setpc_b64 s[30:31]
  %result = srem <2 x i32> %num, <i32 4096, i32 4096>
@@ -401,28 +399,28 @@
 ; CHECK-LABEL: v_srem_i32_oddk_denom:
 ; CHECK: ; %bb.0:
 ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: s_mov_b32 s4, 0x12d8fb
-; CHECK-NEXT: v_ashrrev_i32_e32 v1, 31, v0
-; CHECK-NEXT: v_rcp_iflag_f32_e32 v2, 0x4996c7d8
-; CHECK-NEXT: v_mov_b32_e32 v3, 0xffed2705
-; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v1
-; CHECK-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2
-; CHECK-NEXT: v_xor_b32_e32 v0, v0, v1
-; CHECK-NEXT: v_cvt_u32_f32_e32 v2, v2
-; CHECK-NEXT: v_mul_lo_u32 v3, v3, v2
-; CHECK-NEXT: v_mul_hi_u32 v3, v2, v3
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3
-; CHECK-NEXT: v_mul_hi_u32 v2, v0, v2
-; CHECK-NEXT: v_mul_lo_u32 v2, v2, s4
+; CHECK-NEXT: v_mov_b32_e32 v1, 0x12d8fb
+; CHECK-NEXT: v_ashrrev_i32_e32 v2, 31, v0
+; CHECK-NEXT: v_rcp_iflag_f32_e32 v3, 0x4996c7d8
+; CHECK-NEXT: v_mov_b32_e32 v4, 0xffed2705
+; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v2
+; CHECK-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3
+; CHECK-NEXT: v_xor_b32_e32 v0, v0, v2
+; CHECK-NEXT: v_cvt_u32_f32_e32 v3, v3
+; CHECK-NEXT: v_mul_lo_u32 v4, v4, v3
+; CHECK-NEXT: v_mul_hi_u32 v4, v3, v4
+; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v4
+; CHECK-NEXT: v_mul_hi_u32 v3, v0, v3
+; CHECK-NEXT: v_mul_lo_u32 v3, v3, v1
+; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v3
+; CHECK-NEXT: v_sub_i32_e32 v3, vcc, v0, v1
+; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v0, v1
+; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
+; CHECK-NEXT: v_sub_i32_e32 v3, vcc, v0, v1
+; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v0, v1
+; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
+; CHECK-NEXT: v_xor_b32_e32 v0, v0, v2
 ; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v2
-; CHECK-NEXT: v_subrev_i32_e32 v2, vcc, s4, v0
-; CHECK-NEXT: v_cmp_le_u32_e32 vcc, s4, v0
-; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
-; CHECK-NEXT: v_subrev_i32_e32 v2, vcc, s4, v0
-; CHECK-NEXT: v_cmp_le_u32_e32 vcc, s4, v0
-; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
-; CHECK-NEXT: v_xor_b32_e32 v0, v0, v1
-; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v1
 ; CHECK-NEXT: s_setpc_b64 s[30:31]
  %result = srem i32 %num, 1235195
  ret i32 %result
@@ -479,49 +477,47 @@
 ; CGP-LABEL: v_srem_v2i32_oddk_denom:
 ; CGP: ; %bb.0:
 ; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CGP-NEXT: s_mov_b32 s4, 0x12d8fb
-; CGP-NEXT: v_ashrrev_i32_e32 v2, 31, v0
-; CGP-NEXT: v_rcp_iflag_f32_e32 v3, 0x4996c7d8
-; CGP-NEXT: s_mov_b32 s5, 0xffed2705
-; CGP-NEXT: v_mov_b32_e32 v4, 0xffed2705
-; CGP-NEXT: v_mov_b32_e32 v5, 0x12d8fb
+; CGP-NEXT: v_mov_b32_e32 v2, 0x12d8fb
+; CGP-NEXT: v_ashrrev_i32_e32 v3, 31, v0
+; CGP-NEXT: v_rcp_iflag_f32_e32 v4, 0x4996c7d8
+; CGP-NEXT: v_mov_b32_e32 v5, 0xffed2705
 ; CGP-NEXT: v_ashrrev_i32_e32 v6, 31, v1
 ; CGP-NEXT: v_rcp_iflag_f32_e32 v7, 0x4996c7d8
-; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v2
-; CGP-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3
+; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v3
+; CGP-NEXT: v_mul_f32_e32 v4, 0x4f7ffffe, v4
 ; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v6
 ; CGP-NEXT: v_mul_f32_e32 v7, 0x4f7ffffe, v7
-; CGP-NEXT: v_xor_b32_e32 v0, v0, v2
-; CGP-NEXT: v_cvt_u32_f32_e32 v3, v3
+; CGP-NEXT: v_xor_b32_e32 v0, v0, v3
+; CGP-NEXT: v_cvt_u32_f32_e32 v4, v4
 ; CGP-NEXT: v_xor_b32_e32 v1, v1, v6
 ; CGP-NEXT: v_cvt_u32_f32_e32 v7, v7
-; CGP-NEXT: v_mul_lo_u32 v8, s5, v3
-; CGP-NEXT: v_mul_lo_u32 v4, v4, v7
-; CGP-NEXT: v_mul_hi_u32 v8, v3, v8
-; CGP-NEXT: v_mul_hi_u32 v4, v7, v4
-; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v8
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v7, v4
-; CGP-NEXT: v_mul_hi_u32 v3, v0, v3
-; CGP-NEXT: v_mul_hi_u32 v4, v1, v4
-; CGP-NEXT: v_mul_lo_u32 v3, v3, s4
-; CGP-NEXT: v_mul_lo_u32 v4, v4, s4
-; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v3
-; CGP-NEXT: v_sub_i32_e32 v1, vcc, v1, v4
-; CGP-NEXT: v_subrev_i32_e32 v3, vcc, s4, v0
-; CGP-NEXT: v_sub_i32_e32 v4, vcc, v1, v5
-; CGP-NEXT: v_cmp_le_u32_e32 vcc, s4, v0
-; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
-; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v5
-; CGP-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc
-; CGP-NEXT: v_subrev_i32_e32 v3, vcc, s4, v0
-; CGP-NEXT: v_sub_i32_e32 v4, vcc, v1, v5
-; CGP-NEXT: v_cmp_le_u32_e32 vcc, s4, v0
-; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
-; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v5
-; CGP-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc
-; CGP-NEXT: v_xor_b32_e32 v0, v0, v2
+; CGP-NEXT: v_mul_lo_u32 v8, v5, v4
+; CGP-NEXT: v_mul_lo_u32 v5, v5, v7
+; CGP-NEXT: v_mul_hi_u32 v8, v4, v8
+; CGP-NEXT: v_mul_hi_u32 v5, v7, v5
+; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v8
+; CGP-NEXT: v_add_i32_e32 v5, vcc, v7, v5
+; CGP-NEXT: v_mul_hi_u32 v4, v0, v4
+; CGP-NEXT: v_mul_hi_u32 v5, v1, v5
+; CGP-NEXT: v_mul_lo_u32 v4, v4, v2
+; CGP-NEXT: v_mul_lo_u32 v5, v5, v2
+; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v4
+; CGP-NEXT: v_sub_i32_e32 v1, vcc, v1, v5
+; CGP-NEXT: v_sub_i32_e32 v4, vcc, v0, v2
+; CGP-NEXT: v_sub_i32_e32 v5, vcc, v1, v2
+; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2
+; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
+; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v2
+; CGP-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc
+; CGP-NEXT: v_sub_i32_e32 v4, vcc, v0, v2
+; CGP-NEXT: v_sub_i32_e32 v5, vcc, v1, v2
+; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2
+; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
+; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v2
+; CGP-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc
+; CGP-NEXT: v_xor_b32_e32 v0, v0, v3
 ; CGP-NEXT: v_xor_b32_e32 v1, v1, v6
-; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v2
+; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v3
 ; CGP-NEXT: v_sub_i32_e32 v1, vcc, v1, v6
 ; CGP-NEXT: s_setpc_b64 s[30:31]
  %result = srem <2 x i32> %num, <i32 1235195, i32 1235195>
@@ -568,125 +564,125 @@
 ;
GISEL-LABEL: v_srem_v2i32_pow2_shl_denom: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: s_movk_i32 s4, 0x1000 -; GISEL-NEXT: v_ashrrev_i32_e32 v4, 31, v0 -; GISEL-NEXT: v_ashrrev_i32_e32 v5, 31, v1 -; GISEL-NEXT: v_lshl_b32_e32 v2, s4, v2 -; GISEL-NEXT: v_lshl_b32_e32 v3, s4, v3 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v4 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v5 -; GISEL-NEXT: v_ashrrev_i32_e32 v6, 31, v2 -; GISEL-NEXT: v_xor_b32_e32 v0, v0, v4 +; GISEL-NEXT: v_mov_b32_e32 v4, 0x1000 +; GISEL-NEXT: v_ashrrev_i32_e32 v5, 31, v0 +; GISEL-NEXT: v_ashrrev_i32_e32 v6, 31, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v2, v2, v4 +; GISEL-NEXT: v_lshlrev_b32_e32 v3, v3, v4 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v5 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v6 +; GISEL-NEXT: v_ashrrev_i32_e32 v4, 31, v2 +; GISEL-NEXT: v_xor_b32_e32 v0, v0, v5 ; GISEL-NEXT: v_ashrrev_i32_e32 v7, 31, v3 -; GISEL-NEXT: v_xor_b32_e32 v1, v1, v5 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v6 +; GISEL-NEXT: v_xor_b32_e32 v1, v1, v6 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v7 -; GISEL-NEXT: v_xor_b32_e32 v2, v2, v6 +; GISEL-NEXT: v_xor_b32_e32 v2, v2, v4 ; GISEL-NEXT: v_xor_b32_e32 v3, v3, v7 -; GISEL-NEXT: v_cvt_f32_u32_e32 v6, v2 +; GISEL-NEXT: v_cvt_f32_u32_e32 v4, v2 ; GISEL-NEXT: v_sub_i32_e32 v7, vcc, 0, v2 ; GISEL-NEXT: v_cvt_f32_u32_e32 v8, v3 ; GISEL-NEXT: v_sub_i32_e32 v9, vcc, 0, v3 -; GISEL-NEXT: v_rcp_iflag_f32_e32 v6, v6 +; GISEL-NEXT: v_rcp_iflag_f32_e32 v4, v4 ; GISEL-NEXT: v_rcp_iflag_f32_e32 v8, v8 -; GISEL-NEXT: v_mul_f32_e32 v6, 0x4f7ffffe, v6 +; GISEL-NEXT: v_mul_f32_e32 v4, 0x4f7ffffe, v4 ; GISEL-NEXT: v_mul_f32_e32 v8, 0x4f7ffffe, v8 -; GISEL-NEXT: v_cvt_u32_f32_e32 v6, v6 +; GISEL-NEXT: v_cvt_u32_f32_e32 v4, v4 ; GISEL-NEXT: v_cvt_u32_f32_e32 v8, v8 -; GISEL-NEXT: v_mul_lo_u32 v7, v7, v6 +; GISEL-NEXT: v_mul_lo_u32 v7, v7, v4 ; GISEL-NEXT: v_mul_lo_u32 v9, v9, v8 -; GISEL-NEXT: v_mul_hi_u32 v7, v6, v7 +; GISEL-NEXT: v_mul_hi_u32 v7, v4, v7 ; GISEL-NEXT: v_mul_hi_u32 v9, v8, v9 -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v7 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v7 ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v8, v9 -; GISEL-NEXT: v_mul_hi_u32 v6, v0, v6 +; GISEL-NEXT: v_mul_hi_u32 v4, v0, v4 ; GISEL-NEXT: v_mul_hi_u32 v7, v1, v7 -; GISEL-NEXT: v_mul_lo_u32 v6, v6, v2 +; GISEL-NEXT: v_mul_lo_u32 v4, v4, v2 ; GISEL-NEXT: v_mul_lo_u32 v7, v7, v3 -; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v6 +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v4 ; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v1, v7 -; GISEL-NEXT: v_sub_i32_e32 v6, vcc, v0, v2 +; GISEL-NEXT: v_sub_i32_e32 v4, vcc, v0, v2 ; GISEL-NEXT: v_sub_i32_e32 v7, vcc, v1, v3 ; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2 -; GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc ; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v1, v3 ; GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc -; GISEL-NEXT: v_sub_i32_e32 v6, vcc, v0, v2 +; GISEL-NEXT: v_sub_i32_e32 v4, vcc, v0, v2 ; GISEL-NEXT: v_sub_i32_e32 v7, vcc, v1, v3 ; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2 -; GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc ; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v1, v3 ; GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc -; GISEL-NEXT: v_xor_b32_e32 v0, v0, v4 -; GISEL-NEXT: v_xor_b32_e32 v1, v1, v5 -; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v4 -; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v1, v5 +; GISEL-NEXT: v_xor_b32_e32 v0, v0, v5 +; GISEL-NEXT: 
v_xor_b32_e32 v1, v1, v6 +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v5 +; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v1, v6 ; GISEL-NEXT: s_setpc_b64 s[30:31] ; ; CGP-LABEL: v_srem_v2i32_pow2_shl_denom: ; CGP: ; %bb.0: ; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CGP-NEXT: s_movk_i32 s4, 0x1000 -; CGP-NEXT: v_ashrrev_i32_e32 v4, 31, v0 -; CGP-NEXT: v_ashrrev_i32_e32 v5, 31, v1 -; CGP-NEXT: v_lshl_b32_e32 v2, s4, v2 -; CGP-NEXT: v_lshl_b32_e32 v3, s4, v3 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v4 -; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v5 -; CGP-NEXT: v_ashrrev_i32_e32 v6, 31, v2 -; CGP-NEXT: v_xor_b32_e32 v0, v0, v4 +; CGP-NEXT: v_mov_b32_e32 v4, 0x1000 +; CGP-NEXT: v_ashrrev_i32_e32 v5, 31, v0 +; CGP-NEXT: v_ashrrev_i32_e32 v6, 31, v1 +; CGP-NEXT: v_lshlrev_b32_e32 v2, v2, v4 +; CGP-NEXT: v_lshlrev_b32_e32 v3, v3, v4 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v5 +; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v6 +; CGP-NEXT: v_ashrrev_i32_e32 v4, 31, v2 +; CGP-NEXT: v_xor_b32_e32 v0, v0, v5 ; CGP-NEXT: v_ashrrev_i32_e32 v7, 31, v3 -; CGP-NEXT: v_xor_b32_e32 v1, v1, v5 -; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v6 +; CGP-NEXT: v_xor_b32_e32 v1, v1, v6 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v7 -; CGP-NEXT: v_xor_b32_e32 v2, v2, v6 +; CGP-NEXT: v_xor_b32_e32 v2, v2, v4 ; CGP-NEXT: v_xor_b32_e32 v3, v3, v7 -; CGP-NEXT: v_cvt_f32_u32_e32 v6, v2 +; CGP-NEXT: v_cvt_f32_u32_e32 v4, v2 ; CGP-NEXT: v_sub_i32_e32 v7, vcc, 0, v2 ; CGP-NEXT: v_cvt_f32_u32_e32 v8, v3 ; CGP-NEXT: v_sub_i32_e32 v9, vcc, 0, v3 -; CGP-NEXT: v_rcp_f32_e32 v6, v6 +; CGP-NEXT: v_rcp_f32_e32 v4, v4 ; CGP-NEXT: v_rcp_f32_e32 v8, v8 -; CGP-NEXT: v_mul_f32_e32 v6, 0x4f7ffffe, v6 +; CGP-NEXT: v_mul_f32_e32 v4, 0x4f7ffffe, v4 ; CGP-NEXT: v_mul_f32_e32 v8, 0x4f7ffffe, v8 -; CGP-NEXT: v_cvt_u32_f32_e32 v6, v6 +; CGP-NEXT: v_cvt_u32_f32_e32 v4, v4 ; CGP-NEXT: v_cvt_u32_f32_e32 v8, v8 -; CGP-NEXT: v_mul_lo_u32 v7, v7, v6 +; CGP-NEXT: v_mul_lo_u32 v7, v7, v4 ; CGP-NEXT: v_mul_lo_u32 v9, v9, v8 ; CGP-NEXT: v_mul_lo_u32 v10, 0, v7 -; CGP-NEXT: v_mul_hi_u32 v7, v6, v7 +; CGP-NEXT: v_mul_hi_u32 v7, v4, v7 ; CGP-NEXT: v_mul_lo_u32 v11, 0, v9 ; CGP-NEXT: v_mul_hi_u32 v9, v8, v9 ; CGP-NEXT: v_add_i32_e32 v7, vcc, v10, v7 ; CGP-NEXT: v_add_i32_e32 v9, vcc, v11, v9 -; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v7 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v7 ; CGP-NEXT: v_add_i32_e32 v7, vcc, v8, v9 -; CGP-NEXT: v_mul_lo_u32 v8, 0, v6 -; CGP-NEXT: v_mul_hi_u32 v6, v0, v6 +; CGP-NEXT: v_mul_lo_u32 v8, 0, v4 +; CGP-NEXT: v_mul_hi_u32 v4, v0, v4 ; CGP-NEXT: v_mul_lo_u32 v9, 0, v7 ; CGP-NEXT: v_mul_hi_u32 v7, v1, v7 -; CGP-NEXT: v_add_i32_e32 v6, vcc, v8, v6 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v8, v4 ; CGP-NEXT: v_add_i32_e32 v7, vcc, v9, v7 -; CGP-NEXT: v_mul_lo_u32 v6, v6, v2 +; CGP-NEXT: v_mul_lo_u32 v4, v4, v2 ; CGP-NEXT: v_mul_lo_u32 v7, v7, v3 -; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v6 +; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v4 ; CGP-NEXT: v_sub_i32_e32 v1, vcc, v1, v7 -; CGP-NEXT: v_sub_i32_e32 v6, vcc, v0, v2 +; CGP-NEXT: v_sub_i32_e32 v4, vcc, v0, v2 ; CGP-NEXT: v_sub_i32_e32 v7, vcc, v1, v3 ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2 -; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc +; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v3 ; CGP-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc -; CGP-NEXT: v_sub_i32_e32 v6, vcc, v0, v2 +; CGP-NEXT: v_sub_i32_e32 v4, vcc, v0, v2 ; CGP-NEXT: v_sub_i32_e32 v7, vcc, v1, v3 ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2 -; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v6, 
vcc
+; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
 ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v3
 ; CGP-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc
-; CGP-NEXT: v_xor_b32_e32 v0, v0, v4
-; CGP-NEXT: v_xor_b32_e32 v1, v1, v5
-; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v4
-; CGP-NEXT: v_sub_i32_e32 v1, vcc, v1, v5
+; CGP-NEXT: v_xor_b32_e32 v0, v0, v5
+; CGP-NEXT: v_xor_b32_e32 v1, v1, v6
+; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v5
+; CGP-NEXT: v_sub_i32_e32 v1, vcc, v1, v6
 ; CGP-NEXT: s_setpc_b64 s[30:31]
  %shl.y = shl <2 x i32> <i32 4096, i32 4096>, %y
  %r = srem <2 x i32> %x, %shl.y
@@ -697,9 +693,9 @@
 ; GISEL-LABEL: v_srem_i32_24bit:
 ; GISEL: ; %bb.0:
 ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: s_mov_b32 s4, 0xffffff
-; GISEL-NEXT: v_and_b32_e32 v0, s4, v0
-; GISEL-NEXT: v_and_b32_e32 v1, s4, v1
+; GISEL-NEXT: v_mov_b32_e32 v2, 0xffffff
+; GISEL-NEXT: v_and_b32_e32 v0, v0, v2
+; GISEL-NEXT: v_and_b32_e32 v1, v1, v2
 ; GISEL-NEXT: v_ashrrev_i32_e32 v2, 31, v0
 ; GISEL-NEXT: v_ashrrev_i32_e32 v3, 31, v1
 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v2
@@ -730,9 +726,9 @@
 ; CGP-LABEL: v_srem_i32_24bit:
 ; CGP: ; %bb.0:
 ; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CGP-NEXT: s_mov_b32 s4, 0xffffff
-; CGP-NEXT: v_and_b32_e32 v0, s4, v0
-; CGP-NEXT: v_and_b32_e32 v1, s4, v1
+; CGP-NEXT: v_mov_b32_e32 v2, 0xffffff
+; CGP-NEXT: v_and_b32_e32 v0, v0, v2
+; CGP-NEXT: v_and_b32_e32 v1, v1, v2
 ; CGP-NEXT: v_cvt_f32_u32_e32 v2, v1
 ; CGP-NEXT: v_sub_i32_e32 v3, vcc, 0, v1
 ; CGP-NEXT: v_rcp_f32_e32 v2, v2
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll
@@ -174,15 +174,15 @@
 ; CHECK-NEXT: v_mul_hi_u32 v1, v0, v1
 ; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v1
 ; CHECK-NEXT: v_mul_hi_u32 v0, v4, v0
+; CHECK-NEXT: v_mov_b32_e32 v1, 0
 ; CHECK-NEXT: v_mul_lo_u32 v0, v0, v2
 ; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v4, v0
-; CHECK-NEXT: v_sub_i32_e32 v1, vcc, v0, v2
+; CHECK-NEXT: v_sub_i32_e32 v3, vcc, v0, v2
 ; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2
-; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
-; CHECK-NEXT: v_sub_i32_e32 v1, vcc, v0, v2
+; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
+; CHECK-NEXT: v_sub_i32_e32 v3, vcc, v0, v2
 ; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2
-; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
-; CHECK-NEXT: v_mov_b32_e32 v1, 0
+; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
 ; CHECK-NEXT: BB0_4:
 ; CHECK-NEXT: s_or_b64 exec, exec, s[4:5]
 ; CHECK-NEXT: s_setpc_b64 s[30:31]
@@ -848,15 +848,15 @@
 ; CGP-NEXT: v_mul_hi_u32 v1, v0, v1
 ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v1
 ; CGP-NEXT: v_mul_hi_u32 v0, v10, v0
+; CGP-NEXT: v_mov_b32_e32 v1, 0
 ; CGP-NEXT: v_mul_lo_u32 v0, v0, v4
 ; CGP-NEXT: v_sub_i32_e32 v0, vcc, v10, v0
-; CGP-NEXT: v_sub_i32_e32 v1, vcc, v0, v4
+; CGP-NEXT: v_sub_i32_e32 v2, vcc, v0, v4
 ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v4
-; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
-; CGP-NEXT: v_sub_i32_e32 v1, vcc, v0, v4
+; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; CGP-NEXT: v_sub_i32_e32 v2, vcc, v0, v4
 ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v4
-; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
-; CGP-NEXT: v_mov_b32_e32 v1, 0
+; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
 ; CGP-NEXT: BB2_4:
 ; CGP-NEXT: s_or_b64 exec, exec, s[4:5]
 ; CGP-NEXT: v_or_b32_e32 v3, v9, v7
@@ -1023,15 +1023,15 @@
 ; CGP-NEXT: v_mul_hi_u32 v3, v2, v3
 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v3
 ; CGP-NEXT: v_mul_hi_u32 v2, v8, v2
+;
CGP-NEXT: v_mov_b32_e32 v3, 0 ; CGP-NEXT: v_mul_lo_u32 v2, v2, v6 ; CGP-NEXT: v_sub_i32_e32 v2, vcc, v8, v2 -; CGP-NEXT: v_sub_i32_e32 v3, vcc, v2, v6 +; CGP-NEXT: v_sub_i32_e32 v4, vcc, v2, v6 ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v2, v6 -; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc -; CGP-NEXT: v_sub_i32_e32 v3, vcc, v2, v6 +; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; CGP-NEXT: v_sub_i32_e32 v4, vcc, v2, v6 ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v2, v6 -; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc -; CGP-NEXT: v_mov_b32_e32 v3, 0 +; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; CGP-NEXT: BB2_8: ; CGP-NEXT: s_or_b64 exec, exec, s[4:5] ; CGP-NEXT: s_setpc_b64 s[30:31] @@ -1043,140 +1043,140 @@ ; CHECK-LABEL: v_srem_i64_pow2k_denom: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_cvt_f32_u32_e32 v3, 0x1000 -; CHECK-NEXT: v_cvt_f32_ubyte0_e32 v4, 0 -; CHECK-NEXT: s_movk_i32 s6, 0xf000 -; CHECK-NEXT: v_ashrrev_i32_e32 v2, 31, v1 -; CHECK-NEXT: v_mac_f32_e32 v3, 0x4f800000, v4 +; CHECK-NEXT: v_mov_b32_e32 v2, 0x1000 +; CHECK-NEXT: v_cvt_f32_u32_e32 v3, v2 +; CHECK-NEXT: v_cvt_f32_ubyte0_e32 v5, 0 +; CHECK-NEXT: v_mov_b32_e32 v6, 0xfffff000 +; CHECK-NEXT: v_ashrrev_i32_e32 v4, 31, v1 +; CHECK-NEXT: v_mac_f32_e32 v3, 0x4f800000, v5 ; CHECK-NEXT: v_rcp_iflag_f32_e32 v3, v3 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v1, v2, vcc +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v4 +; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v1, v4, vcc ; CHECK-NEXT: v_mul_f32_e32 v3, 0x5f7ffffc, v3 -; CHECK-NEXT: v_mul_f32_e32 v4, 0x2f800000, v3 -; CHECK-NEXT: v_trunc_f32_e32 v4, v4 -; CHECK-NEXT: v_mac_f32_e32 v3, 0xcf800000, v4 +; CHECK-NEXT: v_mul_f32_e32 v5, 0x2f800000, v3 +; CHECK-NEXT: v_trunc_f32_e32 v5, v5 +; CHECK-NEXT: v_mac_f32_e32 v3, 0xcf800000, v5 ; CHECK-NEXT: v_cvt_u32_f32_e32 v3, v3 -; CHECK-NEXT: v_cvt_u32_f32_e32 v4, v4 -; CHECK-NEXT: v_xor_b32_e32 v0, v0, v2 -; CHECK-NEXT: v_xor_b32_e32 v1, v1, v2 -; CHECK-NEXT: v_mul_lo_u32 v5, -1, v3 -; CHECK-NEXT: v_mul_lo_u32 v6, s6, v4 -; CHECK-NEXT: v_mul_hi_u32 v8, s6, v3 -; CHECK-NEXT: v_mul_lo_u32 v7, s6, v3 -; CHECK-NEXT: s_bfe_i32 s7, -1, 0x10000 -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v6 -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v8 -; CHECK-NEXT: v_mul_lo_u32 v6, v4, v7 -; CHECK-NEXT: v_mul_lo_u32 v8, v3, v5 -; CHECK-NEXT: v_mul_hi_u32 v9, v3, v7 -; CHECK-NEXT: v_mul_hi_u32 v7, v4, v7 -; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v8 -; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v9 -; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CHECK-NEXT: v_mul_lo_u32 v9, v4, v5 -; CHECK-NEXT: v_add_i32_e32 v6, vcc, v8, v6 -; CHECK-NEXT: v_mul_hi_u32 v8, v3, v5 -; CHECK-NEXT: v_add_i32_e32 v7, vcc, v9, v7 -; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v7, vcc, v7, v8 -; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v8, vcc, v9, v8 -; CHECK-NEXT: v_mul_hi_u32 v5, v4, v5 -; CHECK-NEXT: v_add_i32_e32 v6, vcc, v7, v6 -; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v7, vcc, v8, v7 -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v7 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v6 -; CHECK-NEXT: v_addc_u32_e64 v6, s[4:5], v4, v5, vcc +; CHECK-NEXT: v_cvt_u32_f32_e32 v5, v5 +; CHECK-NEXT: v_xor_b32_e32 v0, v0, v4 +; CHECK-NEXT: v_xor_b32_e32 v1, v1, v4 ; CHECK-NEXT: v_mul_lo_u32 v7, -1, v3 -; CHECK-NEXT: v_mul_lo_u32 v8, s6, v6 -; CHECK-NEXT: v_mul_hi_u32 v10, s6, v3 -; CHECK-NEXT: 
v_mul_lo_u32 v9, s6, v3 -; CHECK-NEXT: v_add_i32_e64 v4, s[4:5], v4, v5 -; CHECK-NEXT: v_add_i32_e64 v7, s[4:5], v7, v8 -; CHECK-NEXT: v_add_i32_e64 v7, s[4:5], v7, v10 -; CHECK-NEXT: v_mul_lo_u32 v8, v6, v9 +; CHECK-NEXT: v_mul_lo_u32 v8, v6, v5 +; CHECK-NEXT: v_mul_hi_u32 v10, v6, v3 +; CHECK-NEXT: v_mul_lo_u32 v9, v6, v3 +; CHECK-NEXT: s_bfe_i32 s6, -1, 0x10000 +; CHECK-NEXT: v_add_i32_e32 v7, vcc, v7, v8 +; CHECK-NEXT: v_add_i32_e32 v7, vcc, v7, v10 +; CHECK-NEXT: v_mul_lo_u32 v8, v5, v9 ; CHECK-NEXT: v_mul_lo_u32 v10, v3, v7 -; CHECK-NEXT: v_mul_hi_u32 v5, v3, v9 -; CHECK-NEXT: v_mul_hi_u32 v9, v6, v9 -; CHECK-NEXT: s_movk_i32 s6, 0x1000 -; CHECK-NEXT: v_add_i32_e64 v8, s[4:5], v8, v10 -; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[4:5] -; CHECK-NEXT: v_add_i32_e64 v5, s[4:5], v8, v5 -; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[4:5] -; CHECK-NEXT: v_mul_lo_u32 v8, v6, v7 -; CHECK-NEXT: v_add_i32_e64 v5, s[4:5], v10, v5 +; CHECK-NEXT: v_mul_hi_u32 v11, v3, v9 +; CHECK-NEXT: v_mul_hi_u32 v9, v5, v9 +; CHECK-NEXT: v_add_i32_e32 v8, vcc, v8, v10 +; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v8, vcc, v8, v11 +; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; CHECK-NEXT: v_mul_lo_u32 v11, v5, v7 +; CHECK-NEXT: v_add_i32_e32 v8, vcc, v10, v8 ; CHECK-NEXT: v_mul_hi_u32 v10, v3, v7 -; CHECK-NEXT: v_add_i32_e64 v8, s[4:5], v8, v9 -; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[4:5] -; CHECK-NEXT: v_add_i32_e64 v8, s[4:5], v8, v10 +; CHECK-NEXT: v_add_i32_e32 v9, vcc, v11, v9 +; CHECK-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v9, vcc, v9, v10 +; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v10, vcc, v11, v10 +; CHECK-NEXT: v_mul_hi_u32 v7, v5, v7 +; CHECK-NEXT: v_add_i32_e32 v8, vcc, v9, v8 +; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v9, vcc, v10, v9 +; CHECK-NEXT: v_add_i32_e32 v7, vcc, v7, v9 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v8 +; CHECK-NEXT: v_addc_u32_e64 v8, s[4:5], v5, v7, vcc +; CHECK-NEXT: v_mul_lo_u32 v9, -1, v3 +; CHECK-NEXT: v_mul_lo_u32 v10, v6, v8 +; CHECK-NEXT: v_mul_lo_u32 v11, v6, v3 +; CHECK-NEXT: v_mul_hi_u32 v6, v6, v3 +; CHECK-NEXT: v_add_i32_e64 v5, s[4:5], v5, v7 +; CHECK-NEXT: v_add_i32_e64 v9, s[4:5], v9, v10 +; CHECK-NEXT: v_add_i32_e64 v6, s[4:5], v9, v6 +; CHECK-NEXT: v_mul_lo_u32 v9, v8, v11 +; CHECK-NEXT: v_mul_lo_u32 v10, v3, v6 +; CHECK-NEXT: v_mul_hi_u32 v7, v3, v11 +; CHECK-NEXT: v_mul_hi_u32 v11, v8, v11 +; CHECK-NEXT: v_add_i32_e64 v9, s[4:5], v9, v10 ; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[4:5] +; CHECK-NEXT: v_add_i32_e64 v7, s[4:5], v9, v7 +; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[4:5] +; CHECK-NEXT: v_mul_lo_u32 v9, v8, v6 +; CHECK-NEXT: v_add_i32_e64 v7, s[4:5], v10, v7 +; CHECK-NEXT: v_mul_hi_u32 v10, v3, v6 +; CHECK-NEXT: v_add_i32_e64 v9, s[4:5], v9, v11 +; CHECK-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[4:5] ; CHECK-NEXT: v_add_i32_e64 v9, s[4:5], v9, v10 -; CHECK-NEXT: v_mul_hi_u32 v6, v6, v7 -; CHECK-NEXT: v_add_i32_e64 v5, s[4:5], v8, v5 -; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[4:5] -; CHECK-NEXT: v_add_i32_e64 v7, s[4:5], v9, v8 -; CHECK-NEXT: v_add_i32_e64 v6, s[4:5], v6, v7 -; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v5 -; CHECK-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc -; CHECK-NEXT: v_mul_lo_u32 v5, v1, v3 -; CHECK-NEXT: v_mul_lo_u32 v6, v0, v4 -; CHECK-NEXT: v_mul_hi_u32 v7, v0, v3 +; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[4:5] +; CHECK-NEXT: v_add_i32_e64 v10, 
s[4:5], v11, v10 +; CHECK-NEXT: v_mul_hi_u32 v6, v8, v6 +; CHECK-NEXT: v_add_i32_e64 v7, s[4:5], v9, v7 +; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[4:5] +; CHECK-NEXT: v_add_i32_e64 v8, s[4:5], v10, v9 +; CHECK-NEXT: v_add_i32_e64 v6, s[4:5], v6, v8 +; CHECK-NEXT: v_addc_u32_e32 v5, vcc, v5, v6, vcc +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v7 +; CHECK-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; CHECK-NEXT: v_mul_lo_u32 v6, v1, v3 +; CHECK-NEXT: v_mul_lo_u32 v7, v0, v5 +; CHECK-NEXT: v_mul_hi_u32 v8, v0, v3 ; CHECK-NEXT: v_mul_hi_u32 v3, v1, v3 -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v6 +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v7 +; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v8 ; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v7 -; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CHECK-NEXT: v_mul_lo_u32 v7, v1, v4 -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; CHECK-NEXT: v_mul_hi_u32 v6, v0, v4 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v7, v3 +; CHECK-NEXT: v_mul_lo_u32 v8, v1, v5 +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v7, v6 +; CHECK-NEXT: v_mul_hi_u32 v7, v0, v5 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v8, v3 +; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v7 ; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v7, vcc, v8, v7 +; CHECK-NEXT: v_mul_hi_u32 v5, v1, v5 ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v6 ; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v6, vcc, v7, v6 -; CHECK-NEXT: v_mul_hi_u32 v4, v1, v4 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v5 -; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v6 +; CHECK-NEXT: v_mul_lo_u32 v6, 0, v3 +; CHECK-NEXT: v_mul_lo_u32 v5, v2, v5 +; CHECK-NEXT: v_mul_lo_u32 v7, v2, v3 +; CHECK-NEXT: v_mul_hi_u32 v3, v2, v3 ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v5 -; CHECK-NEXT: v_mul_lo_u32 v5, 0, v3 -; CHECK-NEXT: v_mul_lo_u32 v4, s6, v4 -; CHECK-NEXT: v_mul_lo_u32 v6, s6, v3 -; CHECK-NEXT: v_mul_hi_u32 v3, s6, v3 -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v5, v4 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v6 -; CHECK-NEXT: v_subb_u32_e64 v4, s[4:5], v1, v3, vcc +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v5, v3 +; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v7 +; CHECK-NEXT: v_subb_u32_e64 v5, s[4:5], v1, v3, vcc ; CHECK-NEXT: v_sub_i32_e64 v1, s[4:5], v1, v3 -; CHECK-NEXT: v_cmp_le_u32_e64 s[4:5], s6, v0 +; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v2 ; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[4:5] -; CHECK-NEXT: v_mov_b32_e32 v5, s7 -; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v4 +; CHECK-NEXT: v_mov_b32_e32 v6, s6 +; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v5 ; CHECK-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc -; CHECK-NEXT: v_cndmask_b32_e64 v3, v5, v3, s[4:5] -; CHECK-NEXT: v_subrev_i32_e32 v5, vcc, s6, v0 +; CHECK-NEXT: v_cndmask_b32_e64 v3, v6, v3, s[4:5] +; CHECK-NEXT: v_sub_i32_e32 v6, vcc, v0, v2 ; CHECK-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc ; CHECK-NEXT: s_bfe_i32 s4, -1, 0x10000 -; CHECK-NEXT: v_cmp_le_u32_e32 vcc, s6, v5 -; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc -; CHECK-NEXT: v_mov_b32_e32 v7, s4 +; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v6, v2 +; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc +; CHECK-NEXT: v_mov_b32_e32 v8, s4 ; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; CHECK-NEXT: v_cndmask_b32_e32 v6, v7, v6, vcc -; CHECK-NEXT: v_subrev_i32_e32 v7, 
vcc, s6, v5 +; CHECK-NEXT: v_cndmask_b32_e32 v7, v8, v7, vcc +; CHECK-NEXT: v_sub_i32_e32 v2, vcc, v6, v2 ; CHECK-NEXT: v_subbrev_u32_e32 v8, vcc, 0, v1, vcc -; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 -; CHECK-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc +; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 +; CHECK-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc ; CHECK-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 -; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc -; CHECK-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc -; CHECK-NEXT: v_xor_b32_e32 v0, v0, v2 -; CHECK-NEXT: v_xor_b32_e32 v1, v1, v2 -; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 -; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v2, vcc +; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; CHECK-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; CHECK-NEXT: v_xor_b32_e32 v0, v0, v4 +; CHECK-NEXT: v_xor_b32_e32 v1, v1, v4 +; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v4 +; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v4, vcc ; CHECK-NEXT: s_setpc_b64 s[30:31] %result = srem i64 %num, 4096 ret i64 %result @@ -1484,271 +1484,271 @@ ; CGP-LABEL: v_srem_v2i64_pow2k_denom: ; CGP: ; %bb.0: ; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CGP-NEXT: v_cvt_f32_u32_e32 v5, 0x1000 -; CGP-NEXT: v_cvt_f32_ubyte0_e32 v6, 0 -; CGP-NEXT: s_movk_i32 s6, 0xf000 -; CGP-NEXT: v_ashrrev_i32_e32 v4, 31, v1 -; CGP-NEXT: v_mov_b32_e32 v7, v5 -; CGP-NEXT: v_mac_f32_e32 v7, 0x4f800000, v6 -; CGP-NEXT: v_rcp_iflag_f32_e32 v7, v7 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v4 -; CGP-NEXT: v_addc_u32_e32 v1, vcc, v1, v4, vcc -; CGP-NEXT: v_mul_f32_e32 v7, 0x5f7ffffc, v7 -; CGP-NEXT: v_mul_f32_e32 v8, 0x2f800000, v7 -; CGP-NEXT: v_trunc_f32_e32 v8, v8 -; CGP-NEXT: v_mac_f32_e32 v7, 0xcf800000, v8 -; CGP-NEXT: v_cvt_u32_f32_e32 v7, v7 +; CGP-NEXT: v_mov_b32_e32 v4, 0x1000 +; CGP-NEXT: v_cvt_f32_u32_e32 v6, v4 +; CGP-NEXT: v_cvt_f32_ubyte0_e32 v7, 0 +; CGP-NEXT: v_ashrrev_i32_e32 v5, 31, v1 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v5 +; CGP-NEXT: v_mov_b32_e32 v8, v6 +; CGP-NEXT: v_mac_f32_e32 v8, 0x4f800000, v7 +; CGP-NEXT: v_rcp_iflag_f32_e32 v8, v8 +; CGP-NEXT: v_xor_b32_e32 v10, v0, v5 +; CGP-NEXT: v_mov_b32_e32 v0, 0xfffff000 +; CGP-NEXT: v_addc_u32_e32 v1, vcc, v1, v5, vcc +; CGP-NEXT: v_mul_f32_e32 v8, 0x5f7ffffc, v8 +; CGP-NEXT: v_mul_f32_e32 v9, 0x2f800000, v8 +; CGP-NEXT: v_trunc_f32_e32 v9, v9 +; CGP-NEXT: v_mac_f32_e32 v8, 0xcf800000, v9 ; CGP-NEXT: v_cvt_u32_f32_e32 v8, v8 -; CGP-NEXT: v_xor_b32_e32 v0, v0, v4 -; CGP-NEXT: v_xor_b32_e32 v1, v1, v4 -; CGP-NEXT: v_mul_lo_u32 v9, -1, v7 -; CGP-NEXT: v_mul_lo_u32 v10, s6, v8 -; CGP-NEXT: v_mul_hi_u32 v12, s6, v7 -; CGP-NEXT: v_mul_lo_u32 v11, s6, v7 -; CGP-NEXT: s_movk_i32 s7, 0x1000 -; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v10 -; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v12 -; CGP-NEXT: v_mul_lo_u32 v10, v8, v11 -; CGP-NEXT: v_mul_lo_u32 v12, v7, v9 -; CGP-NEXT: v_mul_hi_u32 v13, v7, v11 -; CGP-NEXT: v_mul_hi_u32 v11, v8, v11 -; CGP-NEXT: s_bfe_i32 s8, -1, 0x10000 -; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v12 +; CGP-NEXT: v_cvt_u32_f32_e32 v9, v9 +; CGP-NEXT: v_xor_b32_e32 v1, v1, v5 +; CGP-NEXT: s_bfe_i32 s6, -1, 0x10000 +; CGP-NEXT: v_mul_lo_u32 v11, -1, v8 +; CGP-NEXT: v_mul_lo_u32 v12, v0, v9 +; CGP-NEXT: v_mul_hi_u32 v14, v0, v8 +; CGP-NEXT: v_mul_lo_u32 v13, v0, v8 +; CGP-NEXT: v_mac_f32_e32 v6, 0x4f800000, v7 +; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v12 +; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v14 +; CGP-NEXT: v_mul_lo_u32 v12, v9, v13 +; CGP-NEXT: v_mul_lo_u32 v14, v8, v11 +; CGP-NEXT: v_mul_hi_u32 v15, v8, v13 +; 
CGP-NEXT: v_mul_hi_u32 v13, v9, v13 +; CGP-NEXT: v_rcp_iflag_f32_e32 v6, v6 +; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v14 +; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v15 ; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v13 -; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v13, v8, v9 -; CGP-NEXT: v_add_i32_e32 v10, vcc, v12, v10 -; CGP-NEXT: v_mul_hi_u32 v12, v7, v9 -; CGP-NEXT: v_add_i32_e32 v11, vcc, v13, v11 +; CGP-NEXT: v_mul_lo_u32 v15, v9, v11 +; CGP-NEXT: v_add_i32_e32 v12, vcc, v14, v12 +; CGP-NEXT: v_mul_hi_u32 v14, v8, v11 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v15, v13 +; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v14 +; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v14, vcc, v15, v14 +; CGP-NEXT: v_mul_hi_u32 v11, v9, v11 +; CGP-NEXT: v_add_i32_e32 v12, vcc, v13, v12 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13 +; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v13 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v12 +; CGP-NEXT: v_addc_u32_e64 v12, s[4:5], v9, v11, vcc +; CGP-NEXT: v_mul_lo_u32 v13, -1, v8 +; CGP-NEXT: v_mul_lo_u32 v14, v0, v12 +; CGP-NEXT: v_mul_hi_u32 v16, v0, v8 +; CGP-NEXT: v_mul_lo_u32 v15, v0, v8 +; CGP-NEXT: v_add_i32_e64 v9, s[4:5], v9, v11 +; CGP-NEXT: v_add_i32_e64 v13, s[4:5], v13, v14 +; CGP-NEXT: v_add_i32_e64 v13, s[4:5], v13, v16 +; CGP-NEXT: v_mul_lo_u32 v14, v12, v15 +; CGP-NEXT: v_mul_lo_u32 v16, v8, v13 +; CGP-NEXT: v_mul_hi_u32 v11, v8, v15 +; CGP-NEXT: v_mul_hi_u32 v15, v12, v15 +; CGP-NEXT: v_mul_f32_e32 v6, 0x5f7ffffc, v6 +; CGP-NEXT: v_add_i32_e64 v14, s[4:5], v14, v16 +; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[4:5] +; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v14, v11 +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[4:5] +; CGP-NEXT: v_mul_lo_u32 v14, v12, v13 +; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v16, v11 +; CGP-NEXT: v_mul_hi_u32 v16, v8, v13 +; CGP-NEXT: v_add_i32_e64 v14, s[4:5], v14, v15 +; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[4:5] +; CGP-NEXT: v_add_i32_e64 v14, s[4:5], v14, v16 +; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[4:5] +; CGP-NEXT: v_add_i32_e64 v15, s[4:5], v15, v16 +; CGP-NEXT: v_mul_hi_u32 v12, v12, v13 +; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v14, v11 +; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5] +; CGP-NEXT: v_add_i32_e64 v13, s[4:5], v15, v14 +; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v12, v13 +; CGP-NEXT: v_addc_u32_e32 v9, vcc, v9, v12, vcc +; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v11 +; CGP-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; CGP-NEXT: v_mul_lo_u32 v11, v1, v8 +; CGP-NEXT: v_mul_lo_u32 v12, v10, v9 +; CGP-NEXT: v_mul_hi_u32 v13, v10, v8 +; CGP-NEXT: v_mul_hi_u32 v8, v1, v8 ; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v12 ; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v13 +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; CGP-NEXT: v_mul_lo_u32 v13, v1, v9 +; CGP-NEXT: v_add_i32_e32 v11, vcc, v12, v11 +; CGP-NEXT: v_mul_hi_u32 v12, v10, v9 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v13, v8 +; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v12 +; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v12, vcc, v13, v12 -; CGP-NEXT: v_mul_hi_u32 v9, v8, v9 -; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10 +; CGP-NEXT: v_mul_hi_u32 v9, v1, v9 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v11 ; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; CGP-NEXT: 
v_add_i32_e32 v11, vcc, v12, v11 ; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v11 -; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v10 -; CGP-NEXT: v_addc_u32_e64 v10, s[4:5], v8, v9, vcc -; CGP-NEXT: v_mul_lo_u32 v11, -1, v7 -; CGP-NEXT: v_mul_lo_u32 v12, s6, v10 -; CGP-NEXT: v_mul_hi_u32 v14, s6, v7 -; CGP-NEXT: v_mul_lo_u32 v13, s6, v7 -; CGP-NEXT: v_add_i32_e64 v8, s[4:5], v8, v9 -; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v11, v12 -; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v11, v14 -; CGP-NEXT: v_mul_lo_u32 v12, v10, v13 -; CGP-NEXT: v_mul_lo_u32 v14, v7, v11 -; CGP-NEXT: v_mul_hi_u32 v9, v7, v13 -; CGP-NEXT: v_mul_hi_u32 v13, v10, v13 -; CGP-NEXT: v_mac_f32_e32 v5, 0x4f800000, v6 -; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v12, v14 -; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v9, s[4:5], v12, v9 -; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[4:5] -; CGP-NEXT: v_mul_lo_u32 v12, v10, v11 -; CGP-NEXT: v_add_i32_e64 v9, s[4:5], v14, v9 -; CGP-NEXT: v_mul_hi_u32 v14, v7, v11 -; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v12, v13 -; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v12, v14 -; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v13, s[4:5], v13, v14 -; CGP-NEXT: v_mul_hi_u32 v10, v10, v11 -; CGP-NEXT: v_add_i32_e64 v9, s[4:5], v12, v9 -; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v13, v12 -; CGP-NEXT: v_add_i32_e64 v10, s[4:5], v10, v11 -; CGP-NEXT: v_addc_u32_e32 v8, vcc, v8, v10, vcc -; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v9 -; CGP-NEXT: v_addc_u32_e32 v8, vcc, 0, v8, vcc -; CGP-NEXT: v_mul_lo_u32 v9, v1, v7 -; CGP-NEXT: v_mul_lo_u32 v10, v0, v8 -; CGP-NEXT: v_mul_hi_u32 v11, v0, v7 -; CGP-NEXT: v_mul_hi_u32 v7, v1, v7 -; CGP-NEXT: v_rcp_iflag_f32_e32 v5, v5 -; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v10 -; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v11 -; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v11, v1, v8 -; CGP-NEXT: v_add_i32_e32 v9, vcc, v10, v9 -; CGP-NEXT: v_mul_hi_u32 v10, v0, v8 -; CGP-NEXT: v_add_i32_e32 v7, vcc, v11, v7 -; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v10 -; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; CGP-NEXT: v_mul_hi_u32 v8, v1, v8 -; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v9 -; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v9, vcc, v10, v9 -; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v9 -; CGP-NEXT: v_mul_lo_u32 v9, 0, v7 -; CGP-NEXT: v_mul_lo_u32 v8, s7, v8 -; CGP-NEXT: v_mul_lo_u32 v10, s7, v7 -; CGP-NEXT: v_mul_hi_u32 v7, s7, v7 -; CGP-NEXT: v_mul_f32_e32 v5, 0x5f7ffffc, v5 +; CGP-NEXT: v_mul_lo_u32 v11, 0, v8 +; CGP-NEXT: v_mul_lo_u32 v9, v4, v9 +; CGP-NEXT: v_mul_lo_u32 v12, v4, v8 +; CGP-NEXT: v_mul_hi_u32 v8, v4, v8 +; CGP-NEXT: v_add_i32_e32 v9, vcc, v11, v9 ; CGP-NEXT: v_add_i32_e32 v8, vcc, v9, v8 -; CGP-NEXT: v_add_i32_e32 v7, vcc, v8, v7 -; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v10 -; CGP-NEXT: v_subb_u32_e64 v8, s[4:5], v1, v7, vcc -; CGP-NEXT: v_sub_i32_e64 v1, s[4:5], v1, v7 -; CGP-NEXT: v_cmp_le_u32_e64 s[4:5], s7, v0 -; CGP-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5] -; CGP-NEXT: v_mov_b32_e32 v9, s8 -; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v8 +; CGP-NEXT: v_sub_i32_e32 v9, vcc, v10, v12 +; CGP-NEXT: v_subb_u32_e64 v10, s[4:5], v1, v8, vcc +; CGP-NEXT: v_sub_i32_e64 v1, s[4:5], v1, v8 +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v4 +; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 
-1, s[4:5] +; CGP-NEXT: v_mov_b32_e32 v11, s6 +; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v10 ; CGP-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc -; CGP-NEXT: v_cndmask_b32_e64 v7, v9, v7, s[4:5] -; CGP-NEXT: v_subrev_i32_e32 v9, vcc, s7, v0 +; CGP-NEXT: v_cndmask_b32_e64 v8, v11, v8, s[4:5] +; CGP-NEXT: v_sub_i32_e32 v11, vcc, v9, v4 ; CGP-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc ; CGP-NEXT: s_bfe_i32 s4, -1, 0x10000 -; CGP-NEXT: v_cmp_le_u32_e32 vcc, s7, v9 -; CGP-NEXT: v_cndmask_b32_e64 v10, 0, -1, vcc -; CGP-NEXT: v_mov_b32_e32 v11, s4 +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v11, v4 +; CGP-NEXT: v_cndmask_b32_e64 v12, 0, -1, vcc +; CGP-NEXT: v_mov_b32_e32 v13, s4 ; CGP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; CGP-NEXT: v_cndmask_b32_e32 v10, v11, v10, vcc -; CGP-NEXT: v_subrev_i32_e32 v11, vcc, s7, v9 -; CGP-NEXT: v_subbrev_u32_e32 v12, vcc, 0, v1, vcc -; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 -; CGP-NEXT: v_cndmask_b32_e32 v9, v9, v11, vcc -; CGP-NEXT: v_cndmask_b32_e32 v1, v1, v12, vcc -; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 -; CGP-NEXT: v_mul_f32_e32 v7, 0x2f800000, v5 -; CGP-NEXT: v_trunc_f32_e32 v7, v7 -; CGP-NEXT: v_mac_f32_e32 v5, 0xcf800000, v7 -; CGP-NEXT: v_cvt_u32_f32_e32 v5, v5 -; CGP-NEXT: v_cvt_u32_f32_e32 v7, v7 -; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v9, vcc -; CGP-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc -; CGP-NEXT: v_mul_lo_u32 v8, -1, v5 -; CGP-NEXT: v_mul_lo_u32 v9, s6, v7 -; CGP-NEXT: v_mul_hi_u32 v11, s6, v5 -; CGP-NEXT: v_ashrrev_i32_e32 v6, 31, v3 -; CGP-NEXT: v_mul_lo_u32 v10, s6, v5 -; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v6 -; CGP-NEXT: v_addc_u32_e32 v3, vcc, v3, v6, vcc -; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v9 -; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v11 -; CGP-NEXT: v_mul_lo_u32 v9, v7, v10 -; CGP-NEXT: v_mul_lo_u32 v11, v5, v8 -; CGP-NEXT: v_mul_hi_u32 v12, v5, v10 -; CGP-NEXT: v_mul_hi_u32 v10, v7, v10 -; CGP-NEXT: v_xor_b32_e32 v0, v0, v4 -; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v11 -; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v12 -; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v12, v7, v8 -; CGP-NEXT: v_add_i32_e32 v9, vcc, v11, v9 -; CGP-NEXT: v_mul_hi_u32 v11, v5, v8 -; CGP-NEXT: v_add_i32_e32 v10, vcc, v12, v10 -; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; CGP-NEXT: v_cndmask_b32_e32 v12, v13, v12, vcc +; CGP-NEXT: v_sub_i32_e32 v13, vcc, v11, v4 +; CGP-NEXT: v_subbrev_u32_e32 v14, vcc, 0, v1, vcc +; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; CGP-NEXT: v_cndmask_b32_e32 v11, v11, v13, vcc +; CGP-NEXT: v_cndmask_b32_e32 v1, v1, v14, vcc +; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; CGP-NEXT: v_cndmask_b32_e32 v8, v9, v11, vcc +; CGP-NEXT: v_mul_f32_e32 v9, 0x2f800000, v6 +; CGP-NEXT: v_trunc_f32_e32 v9, v9 +; CGP-NEXT: v_mac_f32_e32 v6, 0xcf800000, v9 +; CGP-NEXT: v_cvt_u32_f32_e32 v6, v6 +; CGP-NEXT: v_cvt_u32_f32_e32 v9, v9 +; CGP-NEXT: v_cndmask_b32_e32 v1, v10, v1, vcc +; CGP-NEXT: v_xor_b32_e32 v7, v8, v5 +; CGP-NEXT: v_mul_lo_u32 v10, -1, v6 +; CGP-NEXT: v_mul_lo_u32 v11, v0, v9 +; CGP-NEXT: v_mul_hi_u32 v13, v0, v6 +; CGP-NEXT: v_ashrrev_i32_e32 v8, 31, v3 +; CGP-NEXT: v_mul_lo_u32 v12, v0, v6 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v8 +; CGP-NEXT: v_addc_u32_e32 v3, vcc, v3, v8, vcc ; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v11 -; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v11, vcc, v12, v11 -; CGP-NEXT: v_mul_hi_u32 v8, v7, v8 -; CGP-NEXT: v_add_i32_e32 v9, vcc, v10, v9 -; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; 
CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v10 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v9 -; CGP-NEXT: v_addc_u32_e64 v9, s[4:5], v7, v8, vcc -; CGP-NEXT: v_mul_lo_u32 v10, -1, v5 -; CGP-NEXT: v_mul_lo_u32 v11, s6, v9 -; CGP-NEXT: v_mul_hi_u32 v13, s6, v5 -; CGP-NEXT: v_mul_lo_u32 v12, s6, v5 -; CGP-NEXT: v_add_i32_e64 v7, s[4:5], v7, v8 -; CGP-NEXT: v_add_i32_e64 v10, s[4:5], v10, v11 -; CGP-NEXT: v_add_i32_e64 v10, s[4:5], v10, v13 +; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v13 ; CGP-NEXT: v_mul_lo_u32 v11, v9, v12 -; CGP-NEXT: v_mul_lo_u32 v13, v5, v10 -; CGP-NEXT: v_mul_hi_u32 v8, v5, v12 +; CGP-NEXT: v_mul_lo_u32 v13, v6, v10 +; CGP-NEXT: v_mul_hi_u32 v14, v6, v12 ; CGP-NEXT: v_mul_hi_u32 v12, v9, v12 -; CGP-NEXT: v_xor_b32_e32 v2, v2, v6 -; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v11, v13 -; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v8, s[4:5], v11, v8 -; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[4:5] -; CGP-NEXT: v_mul_lo_u32 v11, v9, v10 -; CGP-NEXT: v_add_i32_e64 v8, s[4:5], v13, v8 -; CGP-NEXT: v_mul_hi_u32 v13, v5, v10 -; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v11, v12 -; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v11, v13 +; CGP-NEXT: v_xor_b32_e32 v2, v2, v8 +; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v13 +; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v14 +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; CGP-NEXT: v_mul_lo_u32 v14, v9, v10 +; CGP-NEXT: v_add_i32_e32 v11, vcc, v13, v11 +; CGP-NEXT: v_mul_hi_u32 v13, v6, v10 +; CGP-NEXT: v_add_i32_e32 v12, vcc, v14, v12 +; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v13 +; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13 +; CGP-NEXT: v_mul_hi_u32 v10, v9, v10 +; CGP-NEXT: v_add_i32_e32 v11, vcc, v12, v11 +; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v12, vcc, v13, v12 +; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v12 +; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v11 +; CGP-NEXT: v_addc_u32_e64 v11, s[4:5], v9, v10, vcc +; CGP-NEXT: v_mul_lo_u32 v12, -1, v6 +; CGP-NEXT: v_mul_lo_u32 v13, v0, v11 +; CGP-NEXT: v_mul_lo_u32 v14, v0, v6 +; CGP-NEXT: v_mul_hi_u32 v0, v0, v6 +; CGP-NEXT: v_add_i32_e64 v9, s[4:5], v9, v10 +; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v12, v13 +; CGP-NEXT: v_add_i32_e64 v0, s[4:5], v12, v0 +; CGP-NEXT: v_mul_lo_u32 v12, v11, v14 +; CGP-NEXT: v_mul_lo_u32 v13, v6, v0 +; CGP-NEXT: v_mul_hi_u32 v10, v6, v14 +; CGP-NEXT: v_mul_hi_u32 v14, v11, v14 +; CGP-NEXT: v_xor_b32_e32 v3, v3, v8 +; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v12, v13 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5] +; CGP-NEXT: v_add_i32_e64 v10, s[4:5], v12, v10 +; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[4:5] +; CGP-NEXT: v_mul_lo_u32 v12, v11, v0 +; CGP-NEXT: v_add_i32_e64 v10, s[4:5], v13, v10 +; CGP-NEXT: v_mul_hi_u32 v13, v6, v0 +; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v12, v14 +; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5] ; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v12, v13 -; CGP-NEXT: v_mul_hi_u32 v9, v9, v10 -; CGP-NEXT: v_add_i32_e64 v8, s[4:5], v11, v8 -; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v10, s[4:5], v12, v11 -; CGP-NEXT: v_add_i32_e64 v9, s[4:5], v9, v10 -; CGP-NEXT: v_addc_u32_e32 v7, vcc, v7, v9, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v8 -; CGP-NEXT: v_xor_b32_e32 v3, v3, v6 -; CGP-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc -; CGP-NEXT: v_xor_b32_e32 v1, v1, v4 -; CGP-NEXT: v_mul_lo_u32 v8, v3, v5 -; 
CGP-NEXT: v_mul_lo_u32 v9, v2, v7 -; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v4 -; CGP-NEXT: v_subb_u32_e32 v1, vcc, v1, v4, vcc -; CGP-NEXT: v_mul_hi_u32 v4, v2, v5 -; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v9 -; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v8, v4 -; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v8, v3, v7 -; CGP-NEXT: v_mul_hi_u32 v5, v3, v5 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v9, v4 -; CGP-NEXT: v_mul_hi_u32 v9, v2, v7 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v8, v5 -; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v9 -; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v9 -; CGP-NEXT: v_mul_hi_u32 v7, v3, v7 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v5, v4 -; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v8, v5 +; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5] +; CGP-NEXT: v_add_i32_e64 v13, s[4:5], v14, v13 +; CGP-NEXT: v_mul_hi_u32 v0, v11, v0 +; CGP-NEXT: v_add_i32_e64 v10, s[4:5], v12, v10 +; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] +; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v13, v12 +; CGP-NEXT: v_add_i32_e64 v0, s[4:5], v0, v11 +; CGP-NEXT: v_addc_u32_e32 v0, vcc, v9, v0, vcc +; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v10 +; CGP-NEXT: v_addc_u32_e32 v9, vcc, 0, v0, vcc +; CGP-NEXT: v_xor_b32_e32 v1, v1, v5 +; CGP-NEXT: v_mul_lo_u32 v10, v3, v6 +; CGP-NEXT: v_mul_lo_u32 v11, v2, v9 +; CGP-NEXT: v_sub_i32_e32 v0, vcc, v7, v5 +; CGP-NEXT: v_subb_u32_e32 v1, vcc, v1, v5, vcc +; CGP-NEXT: v_mul_hi_u32 v5, v2, v6 +; CGP-NEXT: v_add_i32_e32 v7, vcc, v10, v11 +; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v5, vcc, v7, v5 -; CGP-NEXT: v_mul_lo_u32 v7, 0, v4 -; CGP-NEXT: v_mul_lo_u32 v5, s7, v5 -; CGP-NEXT: v_mul_lo_u32 v8, s7, v4 -; CGP-NEXT: v_mul_hi_u32 v4, s7, v4 +; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; CGP-NEXT: v_mul_lo_u32 v7, v3, v9 +; CGP-NEXT: v_mul_hi_u32 v6, v3, v6 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v10, v5 +; CGP-NEXT: v_mul_hi_u32 v10, v2, v9 +; CGP-NEXT: v_add_i32_e32 v6, vcc, v7, v6 +; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v10 +; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v10 +; CGP-NEXT: v_mul_hi_u32 v9, v3, v9 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v5 +; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v6, vcc, v7, v6 +; CGP-NEXT: v_add_i32_e32 v6, vcc, v9, v6 +; CGP-NEXT: v_mul_lo_u32 v7, 0, v5 +; CGP-NEXT: v_mul_lo_u32 v6, v4, v6 +; CGP-NEXT: v_mul_lo_u32 v9, v4, v5 +; CGP-NEXT: v_mul_hi_u32 v5, v4, v5 ; CGP-NEXT: s_bfe_i32 s6, -1, 0x10000 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v7, v5 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v5, v4 -; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v8 -; CGP-NEXT: v_subb_u32_e64 v5, s[4:5], v3, v4, vcc -; CGP-NEXT: v_sub_i32_e64 v3, s[4:5], v3, v4 -; CGP-NEXT: v_cmp_le_u32_e64 s[4:5], s7, v2 -; CGP-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[4:5] +; CGP-NEXT: v_add_i32_e32 v6, vcc, v7, v6 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v5 +; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v9 +; CGP-NEXT: v_subb_u32_e64 v6, s[4:5], v3, v5, vcc +; CGP-NEXT: v_sub_i32_e64 v3, s[4:5], v3, v5 +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v4 +; CGP-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[4:5] ; CGP-NEXT: v_mov_b32_e32 v7, s6 -; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v5 +; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v6 ; CGP-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc -; CGP-NEXT: v_cndmask_b32_e64 v4, v7, v4, 
s[4:5] -; CGP-NEXT: v_subrev_i32_e32 v7, vcc, s7, v2 +; CGP-NEXT: v_cndmask_b32_e64 v5, v7, v5, s[4:5] +; CGP-NEXT: v_sub_i32_e32 v7, vcc, v2, v4 ; CGP-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc ; CGP-NEXT: s_bfe_i32 s4, -1, 0x10000 -; CGP-NEXT: v_cmp_le_u32_e32 vcc, s7, v7 -; CGP-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc -; CGP-NEXT: v_mov_b32_e32 v9, s4 +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v7, v4 +; CGP-NEXT: v_cndmask_b32_e64 v9, 0, -1, vcc +; CGP-NEXT: v_mov_b32_e32 v10, s4 ; CGP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 -; CGP-NEXT: v_cndmask_b32_e32 v8, v9, v8, vcc -; CGP-NEXT: v_subrev_i32_e32 v9, vcc, s7, v7 +; CGP-NEXT: v_cndmask_b32_e32 v9, v10, v9, vcc +; CGP-NEXT: v_sub_i32_e32 v4, vcc, v7, v4 ; CGP-NEXT: v_subbrev_u32_e32 v10, vcc, 0, v3, vcc -; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; CGP-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc +; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9 +; CGP-NEXT: v_cndmask_b32_e32 v4, v7, v4, vcc ; CGP-NEXT: v_cndmask_b32_e32 v3, v3, v10, vcc -; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc -; CGP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc -; CGP-NEXT: v_xor_b32_e32 v2, v2, v6 -; CGP-NEXT: v_xor_b32_e32 v3, v3, v6 -; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v6 -; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v6, vcc +; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 +; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; CGP-NEXT: v_cndmask_b32_e32 v3, v6, v3, vcc +; CGP-NEXT: v_xor_b32_e32 v2, v2, v8 +; CGP-NEXT: v_xor_b32_e32 v3, v3, v8 +; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v8 +; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v8, vcc ; CGP-NEXT: s_setpc_b64 s[30:31] %result = srem <2 x i64> %num, <i64 4096, i64 4096> ret <2 x i64> %result @@ -1758,140 +1758,140 @@ ; CHECK-LABEL: v_srem_i64_oddk_denom: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_cvt_f32_u32_e32 v3, 0x12d8fb -; CHECK-NEXT: v_cvt_f32_ubyte0_e32 v4, 0 -; CHECK-NEXT: s_mov_b32 s6, 0xffed2705 -; CHECK-NEXT: v_ashrrev_i32_e32 v2, 31, v1 -; CHECK-NEXT: v_mac_f32_e32 v3, 0x4f800000, v4 +; CHECK-NEXT: v_mov_b32_e32 v2, 0x12d8fb +; CHECK-NEXT: v_cvt_f32_u32_e32 v3, v2 +; CHECK-NEXT: v_cvt_f32_ubyte0_e32 v5, 0 +; CHECK-NEXT: v_mov_b32_e32 v6, 0xffed2705 +; CHECK-NEXT: v_ashrrev_i32_e32 v4, 31, v1 +; CHECK-NEXT: v_mac_f32_e32 v3, 0x4f800000, v5 ; CHECK-NEXT: v_rcp_iflag_f32_e32 v3, v3 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v1, v2, vcc +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v4 +; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v1, v4, vcc ; CHECK-NEXT: v_mul_f32_e32 v3, 0x5f7ffffc, v3 -; CHECK-NEXT: v_mul_f32_e32 v4, 0x2f800000, v3 -; CHECK-NEXT: v_trunc_f32_e32 v4, v4 -; CHECK-NEXT: v_mac_f32_e32 v3, 0xcf800000, v4 +; CHECK-NEXT: v_mul_f32_e32 v5, 0x2f800000, v3 +; CHECK-NEXT: v_trunc_f32_e32 v5, v5 +; CHECK-NEXT: v_mac_f32_e32 v3, 0xcf800000, v5 ; CHECK-NEXT: v_cvt_u32_f32_e32 v3, v3 -; CHECK-NEXT: v_cvt_u32_f32_e32 v4, v4 -; CHECK-NEXT: v_xor_b32_e32 v0, v0, v2 -; CHECK-NEXT: v_xor_b32_e32 v1, v1, v2 -; CHECK-NEXT: v_mul_lo_u32 v5, -1, v3 -; CHECK-NEXT: v_mul_lo_u32 v6, s6, v4 -; CHECK-NEXT: v_mul_hi_u32 v8, s6, v3 -; CHECK-NEXT: v_mul_lo_u32 v7, s6, v3 -; CHECK-NEXT: s_bfe_i32 s7, -1, 0x10000 -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v6 -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v8 -; CHECK-NEXT: v_mul_lo_u32 v6, v4, v7 -; CHECK-NEXT: v_mul_lo_u32 v8, v3, v5 -; CHECK-NEXT: v_mul_hi_u32 v9, v3, v7 -; CHECK-NEXT: v_mul_hi_u32 v7, v4, v7 -; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v8 -; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CHECK-NEXT:
v_add_i32_e32 v6, vcc, v6, v9 -; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CHECK-NEXT: v_mul_lo_u32 v9, v4, v5 -; CHECK-NEXT: v_add_i32_e32 v6, vcc, v8, v6 -; CHECK-NEXT: v_mul_hi_u32 v8, v3, v5 -; CHECK-NEXT: v_add_i32_e32 v7, vcc, v9, v7 -; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v7, vcc, v7, v8 -; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v8, vcc, v9, v8 -; CHECK-NEXT: v_mul_hi_u32 v5, v4, v5 -; CHECK-NEXT: v_add_i32_e32 v6, vcc, v7, v6 -; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v7, vcc, v8, v7 -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v7 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v6 -; CHECK-NEXT: v_addc_u32_e64 v6, s[4:5], v4, v5, vcc +; CHECK-NEXT: v_cvt_u32_f32_e32 v5, v5 +; CHECK-NEXT: v_xor_b32_e32 v0, v0, v4 +; CHECK-NEXT: v_xor_b32_e32 v1, v1, v4 ; CHECK-NEXT: v_mul_lo_u32 v7, -1, v3 -; CHECK-NEXT: v_mul_lo_u32 v8, s6, v6 -; CHECK-NEXT: v_mul_hi_u32 v10, s6, v3 -; CHECK-NEXT: v_mul_lo_u32 v9, s6, v3 -; CHECK-NEXT: v_add_i32_e64 v4, s[4:5], v4, v5 -; CHECK-NEXT: v_add_i32_e64 v7, s[4:5], v7, v8 -; CHECK-NEXT: v_add_i32_e64 v7, s[4:5], v7, v10 -; CHECK-NEXT: v_mul_lo_u32 v8, v6, v9 +; CHECK-NEXT: v_mul_lo_u32 v8, v6, v5 +; CHECK-NEXT: v_mul_hi_u32 v10, v6, v3 +; CHECK-NEXT: v_mul_lo_u32 v9, v6, v3 +; CHECK-NEXT: s_bfe_i32 s6, -1, 0x10000 +; CHECK-NEXT: v_add_i32_e32 v7, vcc, v7, v8 +; CHECK-NEXT: v_add_i32_e32 v7, vcc, v7, v10 +; CHECK-NEXT: v_mul_lo_u32 v8, v5, v9 ; CHECK-NEXT: v_mul_lo_u32 v10, v3, v7 -; CHECK-NEXT: v_mul_hi_u32 v5, v3, v9 -; CHECK-NEXT: v_mul_hi_u32 v9, v6, v9 -; CHECK-NEXT: s_mov_b32 s6, 0x12d8fb -; CHECK-NEXT: v_add_i32_e64 v8, s[4:5], v8, v10 -; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[4:5] -; CHECK-NEXT: v_add_i32_e64 v5, s[4:5], v8, v5 -; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[4:5] -; CHECK-NEXT: v_mul_lo_u32 v8, v6, v7 -; CHECK-NEXT: v_add_i32_e64 v5, s[4:5], v10, v5 +; CHECK-NEXT: v_mul_hi_u32 v11, v3, v9 +; CHECK-NEXT: v_mul_hi_u32 v9, v5, v9 +; CHECK-NEXT: v_add_i32_e32 v8, vcc, v8, v10 +; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v8, vcc, v8, v11 +; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; CHECK-NEXT: v_mul_lo_u32 v11, v5, v7 +; CHECK-NEXT: v_add_i32_e32 v8, vcc, v10, v8 ; CHECK-NEXT: v_mul_hi_u32 v10, v3, v7 -; CHECK-NEXT: v_add_i32_e64 v8, s[4:5], v8, v9 -; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[4:5] -; CHECK-NEXT: v_add_i32_e64 v8, s[4:5], v8, v10 +; CHECK-NEXT: v_add_i32_e32 v9, vcc, v11, v9 +; CHECK-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v9, vcc, v9, v10 +; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v10, vcc, v11, v10 +; CHECK-NEXT: v_mul_hi_u32 v7, v5, v7 +; CHECK-NEXT: v_add_i32_e32 v8, vcc, v9, v8 +; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v9, vcc, v10, v9 +; CHECK-NEXT: v_add_i32_e32 v7, vcc, v7, v9 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v8 +; CHECK-NEXT: v_addc_u32_e64 v8, s[4:5], v5, v7, vcc +; CHECK-NEXT: v_mul_lo_u32 v9, -1, v3 +; CHECK-NEXT: v_mul_lo_u32 v10, v6, v8 +; CHECK-NEXT: v_mul_lo_u32 v11, v6, v3 +; CHECK-NEXT: v_mul_hi_u32 v6, v6, v3 +; CHECK-NEXT: v_add_i32_e64 v5, s[4:5], v5, v7 +; CHECK-NEXT: v_add_i32_e64 v9, s[4:5], v9, v10 +; CHECK-NEXT: v_add_i32_e64 v6, s[4:5], v9, v6 +; CHECK-NEXT: v_mul_lo_u32 v9, v8, v11 +; CHECK-NEXT: v_mul_lo_u32 v10, v3, v6 +; CHECK-NEXT: v_mul_hi_u32 v7, v3, v11 +; CHECK-NEXT: v_mul_hi_u32 v11, v8, v11 +; CHECK-NEXT: v_add_i32_e64 v9, s[4:5], v9, v10 ; 
CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[4:5] +; CHECK-NEXT: v_add_i32_e64 v7, s[4:5], v9, v7 +; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[4:5] +; CHECK-NEXT: v_mul_lo_u32 v9, v8, v6 +; CHECK-NEXT: v_add_i32_e64 v7, s[4:5], v10, v7 +; CHECK-NEXT: v_mul_hi_u32 v10, v3, v6 +; CHECK-NEXT: v_add_i32_e64 v9, s[4:5], v9, v11 +; CHECK-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[4:5] ; CHECK-NEXT: v_add_i32_e64 v9, s[4:5], v9, v10 -; CHECK-NEXT: v_mul_hi_u32 v6, v6, v7 -; CHECK-NEXT: v_add_i32_e64 v5, s[4:5], v8, v5 -; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[4:5] -; CHECK-NEXT: v_add_i32_e64 v7, s[4:5], v9, v8 -; CHECK-NEXT: v_add_i32_e64 v6, s[4:5], v6, v7 -; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v5 -; CHECK-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc -; CHECK-NEXT: v_mul_lo_u32 v5, v1, v3 -; CHECK-NEXT: v_mul_lo_u32 v6, v0, v4 -; CHECK-NEXT: v_mul_hi_u32 v7, v0, v3 +; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[4:5] +; CHECK-NEXT: v_add_i32_e64 v10, s[4:5], v11, v10 +; CHECK-NEXT: v_mul_hi_u32 v6, v8, v6 +; CHECK-NEXT: v_add_i32_e64 v7, s[4:5], v9, v7 +; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[4:5] +; CHECK-NEXT: v_add_i32_e64 v8, s[4:5], v10, v9 +; CHECK-NEXT: v_add_i32_e64 v6, s[4:5], v6, v8 +; CHECK-NEXT: v_addc_u32_e32 v5, vcc, v5, v6, vcc +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v7 +; CHECK-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; CHECK-NEXT: v_mul_lo_u32 v6, v1, v3 +; CHECK-NEXT: v_mul_lo_u32 v7, v0, v5 +; CHECK-NEXT: v_mul_hi_u32 v8, v0, v3 ; CHECK-NEXT: v_mul_hi_u32 v3, v1, v3 -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v6 +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v7 +; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v8 ; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v7 -; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CHECK-NEXT: v_mul_lo_u32 v7, v1, v4 -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; CHECK-NEXT: v_mul_hi_u32 v6, v0, v4 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v7, v3 +; CHECK-NEXT: v_mul_lo_u32 v8, v1, v5 +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v7, v6 +; CHECK-NEXT: v_mul_hi_u32 v7, v0, v5 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v8, v3 +; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v7 ; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v7, vcc, v8, v7 +; CHECK-NEXT: v_mul_hi_u32 v5, v1, v5 ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v6 ; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v6, vcc, v7, v6 -; CHECK-NEXT: v_mul_hi_u32 v4, v1, v4 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v5 -; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v6 +; CHECK-NEXT: v_mul_lo_u32 v6, 0, v3 +; CHECK-NEXT: v_mul_lo_u32 v5, v2, v5 +; CHECK-NEXT: v_mul_lo_u32 v7, v2, v3 +; CHECK-NEXT: v_mul_hi_u32 v3, v2, v3 ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v5 -; CHECK-NEXT: v_mul_lo_u32 v5, 0, v3 -; CHECK-NEXT: v_mul_lo_u32 v4, s6, v4 -; CHECK-NEXT: v_mul_lo_u32 v6, s6, v3 -; CHECK-NEXT: v_mul_hi_u32 v3, s6, v3 -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v5, v4 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v6 -; CHECK-NEXT: v_subb_u32_e64 v4, s[4:5], v1, v3, vcc +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v5, v3 +; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v7 +; CHECK-NEXT: v_subb_u32_e64 v5, s[4:5], v1, v3, vcc ; CHECK-NEXT: v_sub_i32_e64 v1, s[4:5], v1, v3 -; CHECK-NEXT: v_cmp_le_u32_e64 
s[4:5], s6, v0 +; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v2 ; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[4:5] -; CHECK-NEXT: v_mov_b32_e32 v5, s7 -; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v4 +; CHECK-NEXT: v_mov_b32_e32 v6, s6 +; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v5 ; CHECK-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc -; CHECK-NEXT: v_cndmask_b32_e64 v3, v5, v3, s[4:5] -; CHECK-NEXT: v_subrev_i32_e32 v5, vcc, s6, v0 +; CHECK-NEXT: v_cndmask_b32_e64 v3, v6, v3, s[4:5] +; CHECK-NEXT: v_sub_i32_e32 v6, vcc, v0, v2 ; CHECK-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc ; CHECK-NEXT: s_bfe_i32 s4, -1, 0x10000 -; CHECK-NEXT: v_cmp_le_u32_e32 vcc, s6, v5 -; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc -; CHECK-NEXT: v_mov_b32_e32 v7, s4 +; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v6, v2 +; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc +; CHECK-NEXT: v_mov_b32_e32 v8, s4 ; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; CHECK-NEXT: v_cndmask_b32_e32 v6, v7, v6, vcc -; CHECK-NEXT: v_subrev_i32_e32 v7, vcc, s6, v5 +; CHECK-NEXT: v_cndmask_b32_e32 v7, v8, v7, vcc +; CHECK-NEXT: v_sub_i32_e32 v2, vcc, v6, v2 ; CHECK-NEXT: v_subbrev_u32_e32 v8, vcc, 0, v1, vcc -; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 -; CHECK-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc +; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 +; CHECK-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc ; CHECK-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 -; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc -; CHECK-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc -; CHECK-NEXT: v_xor_b32_e32 v0, v0, v2 -; CHECK-NEXT: v_xor_b32_e32 v1, v1, v2 -; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 -; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v2, vcc +; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; CHECK-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; CHECK-NEXT: v_xor_b32_e32 v0, v0, v4 +; CHECK-NEXT: v_xor_b32_e32 v1, v1, v4 +; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v4 +; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v4, vcc ; CHECK-NEXT: s_setpc_b64 s[30:31] %result = srem i64 %num, 1235195 ret i64 %result @@ -2199,271 +2199,271 @@ ; CGP-LABEL: v_srem_v2i64_oddk_denom: ; CGP: ; %bb.0: ; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CGP-NEXT: v_cvt_f32_u32_e32 v5, 0x12d8fb -; CGP-NEXT: v_cvt_f32_ubyte0_e32 v6, 0 -; CGP-NEXT: s_mov_b32 s6, 0xffed2705 -; CGP-NEXT: v_ashrrev_i32_e32 v4, 31, v1 -; CGP-NEXT: v_mov_b32_e32 v7, v5 -; CGP-NEXT: v_mac_f32_e32 v7, 0x4f800000, v6 -; CGP-NEXT: v_rcp_iflag_f32_e32 v7, v7 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v4 -; CGP-NEXT: v_addc_u32_e32 v1, vcc, v1, v4, vcc -; CGP-NEXT: v_mul_f32_e32 v7, 0x5f7ffffc, v7 -; CGP-NEXT: v_mul_f32_e32 v8, 0x2f800000, v7 -; CGP-NEXT: v_trunc_f32_e32 v8, v8 -; CGP-NEXT: v_mac_f32_e32 v7, 0xcf800000, v8 -; CGP-NEXT: v_cvt_u32_f32_e32 v7, v7 +; CGP-NEXT: v_mov_b32_e32 v4, 0x12d8fb +; CGP-NEXT: v_cvt_f32_u32_e32 v6, v4 +; CGP-NEXT: v_cvt_f32_ubyte0_e32 v7, 0 +; CGP-NEXT: v_ashrrev_i32_e32 v5, 31, v1 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v5 +; CGP-NEXT: v_mov_b32_e32 v8, v6 +; CGP-NEXT: v_mac_f32_e32 v8, 0x4f800000, v7 +; CGP-NEXT: v_rcp_iflag_f32_e32 v8, v8 +; CGP-NEXT: v_xor_b32_e32 v10, v0, v5 +; CGP-NEXT: v_mov_b32_e32 v0, 0xffed2705 +; CGP-NEXT: v_addc_u32_e32 v1, vcc, v1, v5, vcc +; CGP-NEXT: v_mul_f32_e32 v8, 0x5f7ffffc, v8 +; CGP-NEXT: v_mul_f32_e32 v9, 0x2f800000, v8 +; CGP-NEXT: v_trunc_f32_e32 v9, v9 +; CGP-NEXT: v_mac_f32_e32 v8, 0xcf800000, v9 ; CGP-NEXT: v_cvt_u32_f32_e32 v8, v8 -; CGP-NEXT: v_xor_b32_e32 v0, v0, v4 -; CGP-NEXT: v_xor_b32_e32 v1, v1, v4 -; CGP-NEXT: v_mul_lo_u32 v9, 
-1, v7 -; CGP-NEXT: v_mul_lo_u32 v10, s6, v8 -; CGP-NEXT: v_mul_hi_u32 v12, s6, v7 -; CGP-NEXT: v_mul_lo_u32 v11, s6, v7 -; CGP-NEXT: s_mov_b32 s7, 0x12d8fb -; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v10 -; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v12 -; CGP-NEXT: v_mul_lo_u32 v10, v8, v11 -; CGP-NEXT: v_mul_lo_u32 v12, v7, v9 -; CGP-NEXT: v_mul_hi_u32 v13, v7, v11 -; CGP-NEXT: v_mul_hi_u32 v11, v8, v11 -; CGP-NEXT: s_bfe_i32 s8, -1, 0x10000 -; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v12 +; CGP-NEXT: v_cvt_u32_f32_e32 v9, v9 +; CGP-NEXT: v_xor_b32_e32 v1, v1, v5 +; CGP-NEXT: s_bfe_i32 s6, -1, 0x10000 +; CGP-NEXT: v_mul_lo_u32 v11, -1, v8 +; CGP-NEXT: v_mul_lo_u32 v12, v0, v9 +; CGP-NEXT: v_mul_hi_u32 v14, v0, v8 +; CGP-NEXT: v_mul_lo_u32 v13, v0, v8 +; CGP-NEXT: v_mac_f32_e32 v6, 0x4f800000, v7 +; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v12 +; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v14 +; CGP-NEXT: v_mul_lo_u32 v12, v9, v13 +; CGP-NEXT: v_mul_lo_u32 v14, v8, v11 +; CGP-NEXT: v_mul_hi_u32 v15, v8, v13 +; CGP-NEXT: v_mul_hi_u32 v13, v9, v13 +; CGP-NEXT: v_rcp_iflag_f32_e32 v6, v6 +; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v14 +; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v15 ; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v13 -; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v13, v8, v9 -; CGP-NEXT: v_add_i32_e32 v10, vcc, v12, v10 -; CGP-NEXT: v_mul_hi_u32 v12, v7, v9 -; CGP-NEXT: v_add_i32_e32 v11, vcc, v13, v11 +; CGP-NEXT: v_mul_lo_u32 v15, v9, v11 +; CGP-NEXT: v_add_i32_e32 v12, vcc, v14, v12 +; CGP-NEXT: v_mul_hi_u32 v14, v8, v11 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v15, v13 +; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v14 +; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v14, vcc, v15, v14 +; CGP-NEXT: v_mul_hi_u32 v11, v9, v11 +; CGP-NEXT: v_add_i32_e32 v12, vcc, v13, v12 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13 +; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v13 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v12 +; CGP-NEXT: v_addc_u32_e64 v12, s[4:5], v9, v11, vcc +; CGP-NEXT: v_mul_lo_u32 v13, -1, v8 +; CGP-NEXT: v_mul_lo_u32 v14, v0, v12 +; CGP-NEXT: v_mul_hi_u32 v16, v0, v8 +; CGP-NEXT: v_mul_lo_u32 v15, v0, v8 +; CGP-NEXT: v_add_i32_e64 v9, s[4:5], v9, v11 +; CGP-NEXT: v_add_i32_e64 v13, s[4:5], v13, v14 +; CGP-NEXT: v_add_i32_e64 v13, s[4:5], v13, v16 +; CGP-NEXT: v_mul_lo_u32 v14, v12, v15 +; CGP-NEXT: v_mul_lo_u32 v16, v8, v13 +; CGP-NEXT: v_mul_hi_u32 v11, v8, v15 +; CGP-NEXT: v_mul_hi_u32 v15, v12, v15 +; CGP-NEXT: v_mul_f32_e32 v6, 0x5f7ffffc, v6 +; CGP-NEXT: v_add_i32_e64 v14, s[4:5], v14, v16 +; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[4:5] +; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v14, v11 +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[4:5] +; CGP-NEXT: v_mul_lo_u32 v14, v12, v13 +; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v16, v11 +; CGP-NEXT: v_mul_hi_u32 v16, v8, v13 +; CGP-NEXT: v_add_i32_e64 v14, s[4:5], v14, v15 +; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[4:5] +; CGP-NEXT: v_add_i32_e64 v14, s[4:5], v14, v16 +; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[4:5] +; CGP-NEXT: v_add_i32_e64 v15, s[4:5], v15, v16 +; CGP-NEXT: v_mul_hi_u32 v12, v12, v13 +; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v14, v11 +; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5] +; CGP-NEXT: v_add_i32_e64 v13, s[4:5], v15, v14 +; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v12, v13 +; CGP-NEXT: v_addc_u32_e32 v9, vcc, v9, 
v12, vcc +; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v11 +; CGP-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; CGP-NEXT: v_mul_lo_u32 v11, v1, v8 +; CGP-NEXT: v_mul_lo_u32 v12, v10, v9 +; CGP-NEXT: v_mul_hi_u32 v13, v10, v8 +; CGP-NEXT: v_mul_hi_u32 v8, v1, v8 ; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v12 ; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v13 +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; CGP-NEXT: v_mul_lo_u32 v13, v1, v9 +; CGP-NEXT: v_add_i32_e32 v11, vcc, v12, v11 +; CGP-NEXT: v_mul_hi_u32 v12, v10, v9 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v13, v8 +; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v12 +; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v12, vcc, v13, v12 -; CGP-NEXT: v_mul_hi_u32 v9, v8, v9 -; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10 +; CGP-NEXT: v_mul_hi_u32 v9, v1, v9 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v11 ; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v11, vcc, v12, v11 ; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v11 -; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v10 -; CGP-NEXT: v_addc_u32_e64 v10, s[4:5], v8, v9, vcc -; CGP-NEXT: v_mul_lo_u32 v11, -1, v7 -; CGP-NEXT: v_mul_lo_u32 v12, s6, v10 -; CGP-NEXT: v_mul_hi_u32 v14, s6, v7 -; CGP-NEXT: v_mul_lo_u32 v13, s6, v7 -; CGP-NEXT: v_add_i32_e64 v8, s[4:5], v8, v9 -; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v11, v12 -; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v11, v14 -; CGP-NEXT: v_mul_lo_u32 v12, v10, v13 -; CGP-NEXT: v_mul_lo_u32 v14, v7, v11 -; CGP-NEXT: v_mul_hi_u32 v9, v7, v13 -; CGP-NEXT: v_mul_hi_u32 v13, v10, v13 -; CGP-NEXT: v_mac_f32_e32 v5, 0x4f800000, v6 -; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v12, v14 -; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v9, s[4:5], v12, v9 -; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[4:5] -; CGP-NEXT: v_mul_lo_u32 v12, v10, v11 -; CGP-NEXT: v_add_i32_e64 v9, s[4:5], v14, v9 -; CGP-NEXT: v_mul_hi_u32 v14, v7, v11 -; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v12, v13 -; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v12, v14 -; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v13, s[4:5], v13, v14 -; CGP-NEXT: v_mul_hi_u32 v10, v10, v11 -; CGP-NEXT: v_add_i32_e64 v9, s[4:5], v12, v9 -; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v13, v12 -; CGP-NEXT: v_add_i32_e64 v10, s[4:5], v10, v11 -; CGP-NEXT: v_addc_u32_e32 v8, vcc, v8, v10, vcc -; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v9 -; CGP-NEXT: v_addc_u32_e32 v8, vcc, 0, v8, vcc -; CGP-NEXT: v_mul_lo_u32 v9, v1, v7 -; CGP-NEXT: v_mul_lo_u32 v10, v0, v8 -; CGP-NEXT: v_mul_hi_u32 v11, v0, v7 -; CGP-NEXT: v_mul_hi_u32 v7, v1, v7 -; CGP-NEXT: v_rcp_iflag_f32_e32 v5, v5 -; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v10 -; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v11 -; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v11, v1, v8 -; CGP-NEXT: v_add_i32_e32 v9, vcc, v10, v9 -; CGP-NEXT: v_mul_hi_u32 v10, v0, v8 -; CGP-NEXT: v_add_i32_e32 v7, vcc, v11, v7 -; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v10 -; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; CGP-NEXT: v_mul_hi_u32 v8, v1, v8 -; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v9 -; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v9, vcc, v10, v9 -; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v9 -; 
CGP-NEXT: v_mul_lo_u32 v9, 0, v7 -; CGP-NEXT: v_mul_lo_u32 v8, s7, v8 -; CGP-NEXT: v_mul_lo_u32 v10, s7, v7 -; CGP-NEXT: v_mul_hi_u32 v7, s7, v7 -; CGP-NEXT: v_mul_f32_e32 v5, 0x5f7ffffc, v5 +; CGP-NEXT: v_mul_lo_u32 v11, 0, v8 +; CGP-NEXT: v_mul_lo_u32 v9, v4, v9 +; CGP-NEXT: v_mul_lo_u32 v12, v4, v8 +; CGP-NEXT: v_mul_hi_u32 v8, v4, v8 +; CGP-NEXT: v_add_i32_e32 v9, vcc, v11, v9 ; CGP-NEXT: v_add_i32_e32 v8, vcc, v9, v8 -; CGP-NEXT: v_add_i32_e32 v7, vcc, v8, v7 -; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v10 -; CGP-NEXT: v_subb_u32_e64 v8, s[4:5], v1, v7, vcc -; CGP-NEXT: v_sub_i32_e64 v1, s[4:5], v1, v7 -; CGP-NEXT: v_cmp_le_u32_e64 s[4:5], s7, v0 -; CGP-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5] -; CGP-NEXT: v_mov_b32_e32 v9, s8 -; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v8 +; CGP-NEXT: v_sub_i32_e32 v9, vcc, v10, v12 +; CGP-NEXT: v_subb_u32_e64 v10, s[4:5], v1, v8, vcc +; CGP-NEXT: v_sub_i32_e64 v1, s[4:5], v1, v8 +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v4 +; CGP-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5] +; CGP-NEXT: v_mov_b32_e32 v11, s6 +; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v10 ; CGP-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc -; CGP-NEXT: v_cndmask_b32_e64 v7, v9, v7, s[4:5] -; CGP-NEXT: v_subrev_i32_e32 v9, vcc, s7, v0 +; CGP-NEXT: v_cndmask_b32_e64 v8, v11, v8, s[4:5] +; CGP-NEXT: v_sub_i32_e32 v11, vcc, v9, v4 ; CGP-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc ; CGP-NEXT: s_bfe_i32 s4, -1, 0x10000 -; CGP-NEXT: v_cmp_le_u32_e32 vcc, s7, v9 -; CGP-NEXT: v_cndmask_b32_e64 v10, 0, -1, vcc -; CGP-NEXT: v_mov_b32_e32 v11, s4 +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v11, v4 +; CGP-NEXT: v_cndmask_b32_e64 v12, 0, -1, vcc +; CGP-NEXT: v_mov_b32_e32 v13, s4 ; CGP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; CGP-NEXT: v_cndmask_b32_e32 v10, v11, v10, vcc -; CGP-NEXT: v_subrev_i32_e32 v11, vcc, s7, v9 -; CGP-NEXT: v_subbrev_u32_e32 v12, vcc, 0, v1, vcc -; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 -; CGP-NEXT: v_cndmask_b32_e32 v9, v9, v11, vcc -; CGP-NEXT: v_cndmask_b32_e32 v1, v1, v12, vcc -; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 -; CGP-NEXT: v_mul_f32_e32 v7, 0x2f800000, v5 -; CGP-NEXT: v_trunc_f32_e32 v7, v7 -; CGP-NEXT: v_mac_f32_e32 v5, 0xcf800000, v7 -; CGP-NEXT: v_cvt_u32_f32_e32 v5, v5 -; CGP-NEXT: v_cvt_u32_f32_e32 v7, v7 -; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v9, vcc -; CGP-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc -; CGP-NEXT: v_mul_lo_u32 v8, -1, v5 -; CGP-NEXT: v_mul_lo_u32 v9, s6, v7 -; CGP-NEXT: v_mul_hi_u32 v11, s6, v5 -; CGP-NEXT: v_ashrrev_i32_e32 v6, 31, v3 -; CGP-NEXT: v_mul_lo_u32 v10, s6, v5 -; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v6 -; CGP-NEXT: v_addc_u32_e32 v3, vcc, v3, v6, vcc -; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v9 -; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v11 -; CGP-NEXT: v_mul_lo_u32 v9, v7, v10 -; CGP-NEXT: v_mul_lo_u32 v11, v5, v8 -; CGP-NEXT: v_mul_hi_u32 v12, v5, v10 -; CGP-NEXT: v_mul_hi_u32 v10, v7, v10 -; CGP-NEXT: v_xor_b32_e32 v0, v0, v4 -; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v11 -; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v12 -; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v12, v7, v8 -; CGP-NEXT: v_add_i32_e32 v9, vcc, v11, v9 -; CGP-NEXT: v_mul_hi_u32 v11, v5, v8 -; CGP-NEXT: v_add_i32_e32 v10, vcc, v12, v10 -; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; CGP-NEXT: v_cndmask_b32_e32 v12, v13, v12, vcc +; CGP-NEXT: v_sub_i32_e32 v13, vcc, v11, v4 +; CGP-NEXT: v_subbrev_u32_e32 v14, vcc, 0, v1, vcc +; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; CGP-NEXT: v_cndmask_b32_e32 v11, v11, v13, vcc +; CGP-NEXT: 
v_cndmask_b32_e32 v1, v1, v14, vcc
+; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8
+; CGP-NEXT: v_cndmask_b32_e32 v8, v9, v11, vcc
+; CGP-NEXT: v_mul_f32_e32 v9, 0x2f800000, v6
+; CGP-NEXT: v_trunc_f32_e32 v9, v9
+; CGP-NEXT: v_mac_f32_e32 v6, 0xcf800000, v9
+; CGP-NEXT: v_cvt_u32_f32_e32 v6, v6
+; CGP-NEXT: v_cvt_u32_f32_e32 v9, v9
+; CGP-NEXT: v_cndmask_b32_e32 v1, v10, v1, vcc
+; CGP-NEXT: v_xor_b32_e32 v7, v8, v5
+; CGP-NEXT: v_mul_lo_u32 v10, -1, v6
+; CGP-NEXT: v_mul_lo_u32 v11, v0, v9
+; CGP-NEXT: v_mul_hi_u32 v13, v0, v6
+; CGP-NEXT: v_ashrrev_i32_e32 v8, 31, v3
+; CGP-NEXT: v_mul_lo_u32 v12, v0, v6
+; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v8
+; CGP-NEXT: v_addc_u32_e32 v3, vcc, v3, v8, vcc
 ; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v11
-; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v11, vcc, v12, v11
-; CGP-NEXT: v_mul_hi_u32 v8, v7, v8
-; CGP-NEXT: v_add_i32_e32 v9, vcc, v10, v9
-; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10
-; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v10
-; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v9
-; CGP-NEXT: v_addc_u32_e64 v9, s[4:5], v7, v8, vcc
-; CGP-NEXT: v_mul_lo_u32 v10, -1, v5
-; CGP-NEXT: v_mul_lo_u32 v11, s6, v9
-; CGP-NEXT: v_mul_hi_u32 v13, s6, v5
-; CGP-NEXT: v_mul_lo_u32 v12, s6, v5
-; CGP-NEXT: v_add_i32_e64 v7, s[4:5], v7, v8
-; CGP-NEXT: v_add_i32_e64 v10, s[4:5], v10, v11
-; CGP-NEXT: v_add_i32_e64 v10, s[4:5], v10, v13
+; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v13
 ; CGP-NEXT: v_mul_lo_u32 v11, v9, v12
-; CGP-NEXT: v_mul_lo_u32 v13, v5, v10
-; CGP-NEXT: v_mul_hi_u32 v8, v5, v12
+; CGP-NEXT: v_mul_lo_u32 v13, v6, v10
+; CGP-NEXT: v_mul_hi_u32 v14, v6, v12
 ; CGP-NEXT: v_mul_hi_u32 v12, v9, v12
-; CGP-NEXT: v_xor_b32_e32 v2, v2, v6
-; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v11, v13
-; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5]
-; CGP-NEXT: v_add_i32_e64 v8, s[4:5], v11, v8
-; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[4:5]
-; CGP-NEXT: v_mul_lo_u32 v11, v9, v10
-; CGP-NEXT: v_add_i32_e64 v8, s[4:5], v13, v8
-; CGP-NEXT: v_mul_hi_u32 v13, v5, v10
-; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v11, v12
-; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5]
-; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v11, v13
+; CGP-NEXT: v_xor_b32_e32 v2, v2, v8
+; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v13
+; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v14
+; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
+; CGP-NEXT: v_mul_lo_u32 v14, v9, v10
+; CGP-NEXT: v_add_i32_e32 v11, vcc, v13, v11
+; CGP-NEXT: v_mul_hi_u32 v13, v6, v10
+; CGP-NEXT: v_add_i32_e32 v12, vcc, v14, v12
+; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v13
+; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13
+; CGP-NEXT: v_mul_hi_u32 v10, v9, v10
+; CGP-NEXT: v_add_i32_e32 v11, vcc, v12, v11
+; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v12, vcc, v13, v12
+; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v12
+; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v11
+; CGP-NEXT: v_addc_u32_e64 v11, s[4:5], v9, v10, vcc
+; CGP-NEXT: v_mul_lo_u32 v12, -1, v6
+; CGP-NEXT: v_mul_lo_u32 v13, v0, v11
+; CGP-NEXT: v_mul_lo_u32 v14, v0, v6
+; CGP-NEXT: v_mul_hi_u32 v0, v0, v6
+; CGP-NEXT: v_add_i32_e64 v9, s[4:5], v9, v10
+; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v12, v13
+; CGP-NEXT: v_add_i32_e64 v0, s[4:5], v12, v0
+; CGP-NEXT: v_mul_lo_u32 v12, v11, v14
+; CGP-NEXT: v_mul_lo_u32 v13, v6, v0
+; CGP-NEXT: v_mul_hi_u32 v10, v6, v14
+; CGP-NEXT: v_mul_hi_u32 v14, v11, v14
+; CGP-NEXT: v_xor_b32_e32 v3, v3, v8
+; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v12, v13
 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5]
+; CGP-NEXT: v_add_i32_e64 v10, s[4:5], v12, v10
+; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[4:5]
+; CGP-NEXT: v_mul_lo_u32 v12, v11, v0
+; CGP-NEXT: v_add_i32_e64 v10, s[4:5], v13, v10
+; CGP-NEXT: v_mul_hi_u32 v13, v6, v0
+; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v12, v14
+; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5]
 ; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v12, v13
-; CGP-NEXT: v_mul_hi_u32 v9, v9, v10
-; CGP-NEXT: v_add_i32_e64 v8, s[4:5], v11, v8
-; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[4:5]
-; CGP-NEXT: v_add_i32_e64 v10, s[4:5], v12, v11
-; CGP-NEXT: v_add_i32_e64 v9, s[4:5], v9, v10
-; CGP-NEXT: v_addc_u32_e32 v7, vcc, v7, v9, vcc
-; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v8
-; CGP-NEXT: v_xor_b32_e32 v3, v3, v6
-; CGP-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc
-; CGP-NEXT: v_xor_b32_e32 v1, v1, v4
-; CGP-NEXT: v_mul_lo_u32 v8, v3, v5
-; CGP-NEXT: v_mul_lo_u32 v9, v2, v7
-; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v4
-; CGP-NEXT: v_subb_u32_e32 v1, vcc, v1, v4, vcc
-; CGP-NEXT: v_mul_hi_u32 v4, v2, v5
-; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v9
-; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v8, v4
-; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; CGP-NEXT: v_mul_lo_u32 v8, v3, v7
-; CGP-NEXT: v_mul_hi_u32 v5, v3, v5
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v9, v4
-; CGP-NEXT: v_mul_hi_u32 v9, v2, v7
-; CGP-NEXT: v_add_i32_e32 v5, vcc, v8, v5
-; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v9
-; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v9
-; CGP-NEXT: v_mul_hi_u32 v7, v3, v7
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v5, v4
-; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v5, vcc, v8, v5
+; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5]
+; CGP-NEXT: v_add_i32_e64 v13, s[4:5], v14, v13
+; CGP-NEXT: v_mul_hi_u32 v0, v11, v0
+; CGP-NEXT: v_add_i32_e64 v10, s[4:5], v12, v10
+; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5]
+; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v13, v12
+; CGP-NEXT: v_add_i32_e64 v0, s[4:5], v0, v11
+; CGP-NEXT: v_addc_u32_e32 v0, vcc, v9, v0, vcc
+; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v10
+; CGP-NEXT: v_addc_u32_e32 v9, vcc, 0, v0, vcc
+; CGP-NEXT: v_xor_b32_e32 v1, v1, v5
+; CGP-NEXT: v_mul_lo_u32 v10, v3, v6
+; CGP-NEXT: v_mul_lo_u32 v11, v2, v9
+; CGP-NEXT: v_sub_i32_e32 v0, vcc, v7, v5
+; CGP-NEXT: v_subb_u32_e32 v1, vcc, v1, v5, vcc
+; CGP-NEXT: v_mul_hi_u32 v5, v2, v6
+; CGP-NEXT: v_add_i32_e32 v7, vcc, v10, v11
+; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v7, v5
-; CGP-NEXT: v_mul_lo_u32 v7, 0, v4
-; CGP-NEXT: v_mul_lo_u32 v5, s7, v5
-; CGP-NEXT: v_mul_lo_u32 v8, s7, v4
-; CGP-NEXT: v_mul_hi_u32 v4, s7, v4
+; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
+; CGP-NEXT: v_mul_lo_u32 v7, v3, v9
+; CGP-NEXT: v_mul_hi_u32 v6, v3, v6
+; CGP-NEXT: v_add_i32_e32 v5, vcc, v10, v5
+; CGP-NEXT: v_mul_hi_u32 v10, v2, v9
+; CGP-NEXT: v_add_i32_e32 v6, vcc, v7, v6
+; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v10
+; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v10
+; CGP-NEXT: v_mul_hi_u32 v9, v3, v9
+; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v5
+; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v6, vcc, v7, v6
+; CGP-NEXT: v_add_i32_e32 v6, vcc, v9, v6
+; CGP-NEXT: v_mul_lo_u32 v7, 0, v5
+; CGP-NEXT: v_mul_lo_u32 v6, v4, v6
+; CGP-NEXT: v_mul_lo_u32 v9, v4, v5
+; CGP-NEXT: v_mul_hi_u32 v5, v4, v5
 ; CGP-NEXT: s_bfe_i32 s6, -1, 0x10000
-; CGP-NEXT: v_add_i32_e32 v5, vcc, v7, v5
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v5, v4
-; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v8
-; CGP-NEXT: v_subb_u32_e64 v5, s[4:5], v3, v4, vcc
-; CGP-NEXT: v_sub_i32_e64 v3, s[4:5], v3, v4
-; CGP-NEXT: v_cmp_le_u32_e64 s[4:5], s7, v2
-; CGP-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[4:5]
+; CGP-NEXT: v_add_i32_e32 v6, vcc, v7, v6
+; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v5
+; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v9
+; CGP-NEXT: v_subb_u32_e64 v6, s[4:5], v3, v5, vcc
+; CGP-NEXT: v_sub_i32_e64 v3, s[4:5], v3, v5
+; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v4
+; CGP-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[4:5]
 ; CGP-NEXT: v_mov_b32_e32 v7, s6
-; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v5
+; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v6
 ; CGP-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc
-; CGP-NEXT: v_cndmask_b32_e64 v4, v7, v4, s[4:5]
-; CGP-NEXT: v_subrev_i32_e32 v7, vcc, s7, v2
+; CGP-NEXT: v_cndmask_b32_e64 v5, v7, v5, s[4:5]
+; CGP-NEXT: v_sub_i32_e32 v7, vcc, v2, v4
 ; CGP-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc
 ; CGP-NEXT: s_bfe_i32 s4, -1, 0x10000
-; CGP-NEXT: v_cmp_le_u32_e32 vcc, s7, v7
-; CGP-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc
-; CGP-NEXT: v_mov_b32_e32 v9, s4
+; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v7, v4
+; CGP-NEXT: v_cndmask_b32_e64 v9, 0, -1, vcc
+; CGP-NEXT: v_mov_b32_e32 v10, s4
 ; CGP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3
-; CGP-NEXT: v_cndmask_b32_e32 v8, v9, v8, vcc
-; CGP-NEXT: v_subrev_i32_e32 v9, vcc, s7, v7
+; CGP-NEXT: v_cndmask_b32_e32 v9, v10, v9, vcc
+; CGP-NEXT: v_sub_i32_e32 v4, vcc, v7, v4
 ; CGP-NEXT: v_subbrev_u32_e32 v10, vcc, 0, v3, vcc
-; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8
-; CGP-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc
+; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9
+; CGP-NEXT: v_cndmask_b32_e32 v4, v7, v4, vcc
 ; CGP-NEXT: v_cndmask_b32_e32 v3, v3, v10, vcc
-; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4
-; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc
-; CGP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
-; CGP-NEXT: v_xor_b32_e32 v2, v2, v6
-; CGP-NEXT: v_xor_b32_e32 v3, v3, v6
-; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v6
-; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v6, vcc
+; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5
+; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
+; CGP-NEXT: v_cndmask_b32_e32 v3, v6, v3, vcc
+; CGP-NEXT: v_xor_b32_e32 v2, v2, v8
+; CGP-NEXT: v_xor_b32_e32 v3, v3, v8
+; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v8
+; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v8, vcc
 ; CGP-NEXT: s_setpc_b64 s[30:31]
 %result = srem <2 x i64> %num,
 ret <2 x i64> %result
@@ -2641,15 +2641,15 @@
 ; CHECK-NEXT: v_mul_hi_u32 v1, v0, v1
 ; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v1
 ; CHECK-NEXT: v_mul_hi_u32 v0, v3, v0
+; CHECK-NEXT: v_mov_b32_e32 v1, 0
 ; CHECK-NEXT: v_mul_lo_u32 v0, v0, v5
 ; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v3, v0
-; CHECK-NEXT: v_sub_i32_e32 v1, vcc, v0, v5
+; CHECK-NEXT: v_sub_i32_e32 v2, vcc, v0, v5
 ; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v0, v5
-; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
-; CHECK-NEXT: v_sub_i32_e32 v1, vcc, v0, v5
+; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; CHECK-NEXT: v_sub_i32_e32 v2, vcc, v0, v5
 ; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v0, v5
-; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
-; CHECK-NEXT: v_mov_b32_e32 v1, 0
+; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
 ; CHECK-NEXT: BB7_4:
 ; CHECK-NEXT: s_or_b64 exec, exec, s[4:5]
 ; CHECK-NEXT: s_setpc_b64 s[30:31]
@@ -3121,15 +3121,15 @@
 ; CGP-NEXT: v_mul_hi_u32 v1, v0, v1
 ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v1
 ; CGP-NEXT: v_mul_hi_u32 v0, v8, v0
+; CGP-NEXT: v_mov_b32_e32 v1, 0
 ; CGP-NEXT: v_mul_lo_u32 v0, v0, v2
 ; CGP-NEXT: v_sub_i32_e32 v0, vcc, v8, v0
-; CGP-NEXT: v_sub_i32_e32 v1, vcc, v0, v2
+; CGP-NEXT: v_sub_i32_e32 v3, vcc, v0, v2
 ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2
-; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
-; CGP-NEXT: v_sub_i32_e32 v1, vcc, v0, v2
+; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
+; CGP-NEXT: v_sub_i32_e32 v3, vcc, v0, v2
 ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2
-; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
-; CGP-NEXT: v_mov_b32_e32 v1, 0
+; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
 ; CGP-NEXT: BB8_4:
 ; CGP-NEXT: s_or_b64 exec, exec, s[4:5]
 ; CGP-NEXT: v_or_b32_e32 v3, v7, v11
@@ -3296,15 +3296,15 @@
 ; CGP-NEXT: v_mul_hi_u32 v3, v2, v3
 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v3
 ; CGP-NEXT: v_mul_hi_u32 v2, v5, v2
+; CGP-NEXT: v_mov_b32_e32 v3, 0
 ; CGP-NEXT: v_mul_lo_u32 v2, v2, v10
 ; CGP-NEXT: v_sub_i32_e32 v2, vcc, v5, v2
-; CGP-NEXT: v_sub_i32_e32 v3, vcc, v2, v10
+; CGP-NEXT: v_sub_i32_e32 v4, vcc, v2, v10
 ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v2, v10
-; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc
-; CGP-NEXT: v_sub_i32_e32 v3, vcc, v2, v10
+; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
+; CGP-NEXT: v_sub_i32_e32 v4, vcc, v2, v10
 ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v2, v10
-; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc
-; CGP-NEXT: v_mov_b32_e32 v3, 0
+; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
 ; CGP-NEXT: BB8_8:
 ; CGP-NEXT: s_or_b64 exec, exec, s[4:5]
 ; CGP-NEXT: s_setpc_b64 s[30:31]
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll
@@ -238,23 +238,23 @@
 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT: v_lshrrev_b32_e32 v2, 8, v0
 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 24, v0
-; GFX6-NEXT: s_brev_b32 s4, -2
-; GFX6-NEXT: v_max_i32_e32 v4, -1, v0
+; GFX6-NEXT: v_bfrev_b32_e32 v4, -2
+; GFX6-NEXT: v_max_i32_e32 v6, -1, v0
 ; GFX6-NEXT: v_lshrrev_b32_e32 v3, 8, v1
 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 24, v1
-; GFX6-NEXT: s_brev_b32 s5, 1
-; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, s4, v4
-; GFX6-NEXT: v_min_i32_e32 v5, -1, v0
-; GFX6-NEXT: v_subrev_i32_e32 v5, vcc, s5, v5
-; GFX6-NEXT: v_max_i32_e32 v1, v4, v1
-; GFX6-NEXT: v_min_i32_e32 v1, v1, v5
+; GFX6-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX6-NEXT: v_sub_i32_e32 v6, vcc, v6, v4
+; GFX6-NEXT: v_min_i32_e32 v7, -1, v0
+; GFX6-NEXT: v_sub_i32_e32 v7, vcc, v7, v5
+; GFX6-NEXT: v_max_i32_e32 v1, v6, v1
+; GFX6-NEXT: v_min_i32_e32 v1, v1, v7
 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v1
 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 24, v2
 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 24, v3
 ; GFX6-NEXT: v_max_i32_e32 v3, -1, v1
-; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s4, v3
+; GFX6-NEXT: v_sub_i32_e32 v3, vcc, v3, v4
 ; GFX6-NEXT: v_min_i32_e32 v4, -1, v1
-; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, s5, v4
+; GFX6-NEXT: v_sub_i32_e32 v4, vcc, v4, v5
 ; GFX6-NEXT: v_max_i32_e32 v2, v3, v2
 ; GFX6-NEXT: v_min_i32_e32 v2, v2, v4
 ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v2
@@ -273,21 +273,21 @@
 ; GFX8-NEXT: v_mov_b32_e32 v2, 8
 ; GFX8-NEXT: v_lshrrev_b32_sdwa v3, v2, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX8-NEXT: v_lshlrev_b16_e32 v0, 8, v0
-; GFX8-NEXT: s_movk_i32 s4, 0x7fff
-; GFX8-NEXT: v_max_i16_e32 v4, -1, v0
+; GFX8-NEXT: v_mov_b32_e32 v4, 0x7fff
+; GFX8-NEXT: v_max_i16_e32 v6, -1, v0
 ; GFX8-NEXT: v_lshrrev_b32_sdwa v2, v2, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX8-NEXT: v_lshlrev_b16_e32 v1, 8, v1
-; GFX8-NEXT: s_movk_i32 s5, 0x8000
-; GFX8-NEXT: v_subrev_u16_e32 v4, s4, v4
-; GFX8-NEXT: v_min_i16_e32 v5, -1, v0
-; GFX8-NEXT: v_subrev_u16_e32 v5, s5, v5
-; GFX8-NEXT: v_max_i16_e32 v1, v4, v1
-; GFX8-NEXT: v_min_i16_e32 v1, v1, v5
+; GFX8-NEXT: v_mov_b32_e32 v5, 0xffff8000
+; GFX8-NEXT: v_sub_u16_e32 v6, v6, v4
+; GFX8-NEXT: v_min_i16_e32 v7, -1, v0
+; GFX8-NEXT: v_sub_u16_e32 v7, v7, v5
+; GFX8-NEXT: v_max_i16_e32 v1, v6, v1
+; GFX8-NEXT: v_min_i16_e32 v1, v1, v7
 ; GFX8-NEXT: v_sub_u16_e32 v0, v0, v1
 ; GFX8-NEXT: v_max_i16_e32 v1, -1, v3
-; GFX8-NEXT: v_subrev_u16_e32 v1, s4, v1
+; GFX8-NEXT: v_sub_u16_e32 v1, v1, v4
 ; GFX8-NEXT: v_min_i16_e32 v4, -1, v3
-; GFX8-NEXT: v_subrev_u16_e32 v4, s5, v4
+; GFX8-NEXT: v_sub_u16_e32 v4, v4, v5
 ; GFX8-NEXT: v_max_i16_e32 v1, v1, v2
 ; GFX8-NEXT: v_min_i16_e32 v1, v1, v4
 ; GFX8-NEXT: v_sub_u16_e32 v1, v3, v1
@@ -310,8 +310,8 @@
 ; GFX9-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1]
 ; GFX9-NEXT: v_pk_sub_i16 v0, v0, v1 clamp
 ; GFX9-NEXT: v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1]
-; GFX9-NEXT: s_movk_i32 s4, 0xff
-; GFX9-NEXT: v_and_b32_sdwa v1, v0, s4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT: v_mov_b32_e32 v1, 0xff
+; GFX9-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -323,14 +323,14 @@
 ; GFX10-NEXT: v_mov_b32_e32 v2, 0xffff
 ; GFX10-NEXT: v_lshrrev_b32_sdwa v3, s4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX10-NEXT: v_lshrrev_b32_sdwa v4, s4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX10-NEXT: s_movk_i32 s4, 0xff
 ; GFX10-NEXT: v_and_or_b32 v0, v0, v2, v3
 ; GFX10-NEXT: v_and_or_b32 v1, v1, v2, v4
 ; GFX10-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1]
 ; GFX10-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1]
 ; GFX10-NEXT: v_pk_sub_i16 v0, v0, v1 clamp
+; GFX10-NEXT: v_mov_b32_e32 v1, 0xff
 ; GFX10-NEXT: v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1]
-; GFX10-NEXT: v_and_b32_sdwa v1, v0, s4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
 %lhs = bitcast i16 %lhs.arg to <2 x i8>
@@ -439,8 +439,8 @@
 ; GFX9-NEXT: v_mov_b32_e32 v0, s1
 ; GFX9-NEXT: v_pk_sub_i16 v0, s0, v0 clamp
 ; GFX9-NEXT: v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1]
-; GFX9-NEXT: s_movk_i32 s0, 0xff
-; GFX9-NEXT: v_and_b32_sdwa v1, v0, s0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT: v_mov_b32_e32 v1, 0xff
+; GFX9-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT: v_readfirstlane_b32 s0, v0
 ; GFX9-NEXT: ; return to shader part epilog
@@ -460,10 +460,10 @@
 ; GFX10-NEXT: s_lshl_b32 s2, s4, 8
 ; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s3
 ; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s2
+; GFX10-NEXT: v_mov_b32_e32 v1, 0xff
 ; GFX10-NEXT: v_pk_sub_i16 v0, s0, s1 clamp
-; GFX10-NEXT: s_movk_i32 s0, 0xff
 ; GFX10-NEXT: v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1]
-; GFX10-NEXT: v_and_b32_sdwa v1, v0, s0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX10-NEXT: v_readfirstlane_b32 s0, v0
 ; GFX10-NEXT: ; return to shader part epilog
@@ -482,61 +482,59 @@
 ; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v0
 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 24, v0
 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 24, v0
-; GFX6-NEXT: s_brev_b32 s4, -2
-; GFX6-NEXT: v_max_i32_e32 v8, -1, v0
+; GFX6-NEXT: v_bfrev_b32_e32 v8, -2
+; GFX6-NEXT: v_max_i32_e32 v10, -1, v0
 ; GFX6-NEXT: v_lshrrev_b32_e32 v5, 8, v1
 ; GFX6-NEXT: v_lshrrev_b32_e32 v6, 16, v1
 ; GFX6-NEXT: v_lshrrev_b32_e32 v7, 24, v1
 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 24, v1
-; GFX6-NEXT: s_brev_b32 s5, 1
-; GFX6-NEXT: v_subrev_i32_e32 v8, vcc, s4, v8
-; GFX6-NEXT: v_min_i32_e32 v10, -1, v0
-; GFX6-NEXT: v_subrev_i32_e32 v10, vcc, s5, v10
-; GFX6-NEXT: v_max_i32_e32 v1, v8, v1
-; GFX6-NEXT: v_min_i32_e32 v1, v1, v10
+; GFX6-NEXT: v_bfrev_b32_e32 v9, 1
+; GFX6-NEXT: v_sub_i32_e32 v10, vcc, v10, v8
+; GFX6-NEXT: v_min_i32_e32 v11, -1, v0
+; GFX6-NEXT: v_sub_i32_e32 v11, vcc, v11, v9
+; GFX6-NEXT: v_max_i32_e32 v1, v10, v1
+; GFX6-NEXT: v_min_i32_e32 v1, v1, v11
 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v1
 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 24, v2
 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 24, v5
 ; GFX6-NEXT: v_max_i32_e32 v5, -1, v1
-; GFX6-NEXT: v_subrev_i32_e32 v5, vcc, s4, v5
-; GFX6-NEXT: v_min_i32_e32 v8, -1, v1
-; GFX6-NEXT: v_subrev_i32_e32 v8, vcc, s5, v8
+; GFX6-NEXT: v_sub_i32_e32 v5, vcc, v5, v8
+; GFX6-NEXT: v_min_i32_e32 v10, -1, v1
+; GFX6-NEXT: v_sub_i32_e32 v10, vcc, v10, v9
 ; GFX6-NEXT: v_max_i32_e32 v2, v5, v2
-; GFX6-NEXT: v_min_i32_e32 v2, v2, v8
+; GFX6-NEXT: v_min_i32_e32 v2, v2, v10
 ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v2
 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 24, v3
-; GFX6-NEXT: v_bfrev_b32_e32 v9, -2
 ; GFX6-NEXT: v_max_i32_e32 v5, -1, v2
 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 24, v6
-; GFX6-NEXT: v_sub_i32_e32 v5, vcc, v5, v9
+; GFX6-NEXT: v_sub_i32_e32 v5, vcc, v5, v8
 ; GFX6-NEXT: v_min_i32_e32 v6, -1, v2
-; GFX6-NEXT: v_subrev_i32_e32 v6, vcc, s5, v6
+; GFX6-NEXT: v_sub_i32_e32 v6, vcc, v6, v9
 ; GFX6-NEXT: v_max_i32_e32 v3, v5, v3
 ; GFX6-NEXT: v_min_i32_e32 v3, v3, v6
 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v3
 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 24, v4
 ; GFX6-NEXT: v_max_i32_e32 v5, -1, v3
-; GFX6-NEXT: v_bfrev_b32_e32 v11, 1
-; GFX6-NEXT: v_ashrrev_i32_e32 v1, 24, v1
 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 24, v7
-; GFX6-NEXT: v_sub_i32_e32 v5, vcc, v5, v9
+; GFX6-NEXT: v_sub_i32_e32 v5, vcc, v5, v8
 ; GFX6-NEXT: v_min_i32_e32 v6, -1, v3
-; GFX6-NEXT: s_movk_i32 s4, 0xff
-; GFX6-NEXT: v_ashrrev_i32_e32 v0, 24, v0
-; GFX6-NEXT: v_sub_i32_e32 v6, vcc, v6, v11
+; GFX6-NEXT: v_sub_i32_e32 v6, vcc, v6, v9
 ; GFX6-NEXT: v_max_i32_e32 v4, v5, v4
-; GFX6-NEXT: v_and_b32_e32 v1, s4, v1
-; GFX6-NEXT: v_ashrrev_i32_e32 v2, 24, v2
 ; GFX6-NEXT: v_min_i32_e32 v4, v4, v6
-; GFX6-NEXT: v_and_b32_e32 v0, s4, v0
-; GFX6-NEXT: v_lshlrev_b32_e32 v1, 8, v1
+; GFX6-NEXT: v_ashrrev_i32_e32 v1, 24, v1
 ; GFX6-NEXT: v_sub_i32_e32 v3, vcc, v3, v4
+; GFX6-NEXT: v_mov_b32_e32 v4, 0xff
+; GFX6-NEXT: v_ashrrev_i32_e32 v0, 24, v0
+; GFX6-NEXT: v_and_b32_e32 v1, v1, v4
+; GFX6-NEXT: v_ashrrev_i32_e32 v2, 24, v2
+; GFX6-NEXT: v_and_b32_e32 v0, v0, v4
+; GFX6-NEXT: v_lshlrev_b32_e32 v1, 8, v1
 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX6-NEXT: v_and_b32_e32 v1, s4, v2
+; GFX6-NEXT: v_and_b32_e32 v1, v2, v4
 ; GFX6-NEXT: v_ashrrev_i32_e32 v3, 24, v3
 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX6-NEXT: v_and_b32_e32 v1, s4, v3
+; GFX6-NEXT: v_and_b32_e32 v1, v3, v4
 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 24, v1
 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
 ; GFX6-NEXT: s_setpc_b64 s[30:31]
@@ -549,42 +547,41 @@
 ; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v0
 ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 24, v0
 ; GFX8-NEXT: v_lshlrev_b16_e32 v0, 8, v0
-; GFX8-NEXT: s_movk_i32 s4, 0x7fff
-; GFX8-NEXT: v_max_i16_e32 v8, -1, v0
+; GFX8-NEXT: v_mov_b32_e32 v8, 0x7fff
+; GFX8-NEXT: v_max_i16_e32 v10, -1, v0
 ; GFX8-NEXT: v_lshrrev_b32_sdwa v2, v2, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v1
 ; GFX8-NEXT: v_lshrrev_b32_e32 v7, 24, v1
 ; GFX8-NEXT: v_lshlrev_b16_e32 v1, 8, v1
-; GFX8-NEXT: s_movk_i32 s5, 0x8000
-; GFX8-NEXT: v_subrev_u16_e32 v8, s4, v8
-; GFX8-NEXT: v_min_i16_e32 v10, -1, v0
-; GFX8-NEXT: v_subrev_u16_e32 v10, s5, v10
-; GFX8-NEXT: v_max_i16_e32 v1, v8, v1
-; GFX8-NEXT: v_min_i16_e32 v1, v1, v10
+; GFX8-NEXT: v_mov_b32_e32 v9, 0xffff8000
+; GFX8-NEXT: v_sub_u16_e32 v10, v10, v8
+; GFX8-NEXT: v_min_i16_e32 v11, -1, v0
+; GFX8-NEXT: v_sub_u16_e32 v11, v11, v9
+; GFX8-NEXT: v_max_i16_e32 v1, v10, v1
+; GFX8-NEXT: v_min_i16_e32 v1, v1, v11
 ; GFX8-NEXT: v_sub_u16_e32 v0, v0, v1
 ; GFX8-NEXT: v_max_i16_e32 v1, -1, v3
-; GFX8-NEXT: v_subrev_u16_e32 v1, s4, v1
-; GFX8-NEXT: v_min_i16_e32 v8, -1, v3
-; GFX8-NEXT: v_subrev_u16_e32 v8, s5, v8
+; GFX8-NEXT: v_sub_u16_e32 v1, v1, v8
+; GFX8-NEXT: v_min_i16_e32 v10, -1, v3
+; GFX8-NEXT: v_sub_u16_e32 v10, v10, v9
 ; GFX8-NEXT: v_max_i16_e32 v1, v1, v2
 ; GFX8-NEXT: v_lshlrev_b16_e32 v2, 8, v4
-; GFX8-NEXT: v_mov_b32_e32 v9, 0x7fff
-; GFX8-NEXT: v_min_i16_e32 v1, v1, v8
+; GFX8-NEXT: v_min_i16_e32 v1, v1, v10
 ; GFX8-NEXT: v_max_i16_e32 v4, -1, v2
 ; GFX8-NEXT: v_sub_u16_e32 v1, v3, v1
 ; GFX8-NEXT: v_lshlrev_b16_e32 v3, 8, v6
-; GFX8-NEXT: v_sub_u16_e32 v4, v4, v9
+; GFX8-NEXT: v_sub_u16_e32 v4, v4, v8
 ; GFX8-NEXT: v_min_i16_e32 v6, -1, v2
-; GFX8-NEXT: v_subrev_u16_e32 v6, s5, v6
+; GFX8-NEXT: v_sub_u16_e32 v6, v6, v9
 ; GFX8-NEXT: v_max_i16_e32 v3, v4, v3
 ; GFX8-NEXT: v_min_i16_e32 v3, v3, v6
 ; GFX8-NEXT: v_sub_u16_e32 v2, v2, v3
 ; GFX8-NEXT: v_lshlrev_b16_e32 v3, 8, v5
 ; GFX8-NEXT: v_max_i16_e32 v5, -1, v3
 ; GFX8-NEXT: v_lshlrev_b16_e32 v4, 8, v7
-; GFX8-NEXT: v_sub_u16_e32 v5, v5, v9
+; GFX8-NEXT: v_sub_u16_e32 v5, v5, v8
 ; GFX8-NEXT: v_min_i16_e32 v6, -1, v3
-; GFX8-NEXT: v_subrev_u16_e32 v6, 0x8000, v6
+; GFX8-NEXT: v_sub_u16_e32 v6, v6, v9
 ; GFX8-NEXT: v_max_i16_e32 v4, v5, v4
 ; GFX8-NEXT: v_min_i16_e32 v4, v4, v6
 ; GFX8-NEXT: v_sub_u16_e32 v3, v3, v4
@@ -623,12 +620,12 @@
 ; GFX9-NEXT: v_pk_sub_i16 v0, v0, v1 clamp
 ; GFX9-NEXT: v_pk_sub_i16 v1, v2, v3 clamp
 ; GFX9-NEXT: v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1]
-; GFX9-NEXT: v_mov_b32_e32 v2, 8
+; GFX9-NEXT: v_mov_b32_e32 v3, 8
 ; GFX9-NEXT: v_pk_ashrrev_i16 v1, 8, v1 op_sel_hi:[0,1]
-; GFX9-NEXT: s_movk_i32 s4, 0xff
-; GFX9-NEXT: v_lshlrev_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX9-NEXT: v_and_or_b32 v0, v0, s4, v2
-; GFX9-NEXT: v_and_b32_e32 v2, s4, v1
+; GFX9-NEXT: v_mov_b32_e32 v2, 0xff
+; GFX9-NEXT: v_lshlrev_b32_sdwa v3, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX9-NEXT: v_and_or_b32 v0, v0, v2, v3
+; GFX9-NEXT: v_and_b32_e32 v2, v1, v2
 ; GFX9-NEXT: v_mov_b32_e32 v3, 24
 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2
 ; GFX9-NEXT: v_lshlrev_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
@@ -649,26 +646,26 @@
 ; GFX10-NEXT: v_lshrrev_b32_e32 v8, 16, v1
 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v4
 ; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX10-NEXT: s_movk_i32 s4, 0xff
 ; GFX10-NEXT: v_and_or_b32 v0, v0, v7, v2
 ; GFX10-NEXT: v_and_or_b32 v1, v1, v7, v6
 ; GFX10-NEXT: v_and_or_b32 v2, v3, v7, v4
 ; GFX10-NEXT: v_and_or_b32 v3, v8, v7, v5
-; GFX10-NEXT: v_mov_b32_e32 v4, 24
+; GFX10-NEXT: v_mov_b32_e32 v5, 24
 ; GFX10-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1]
 ; GFX10-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1]
 ; GFX10-NEXT: v_pk_lshlrev_b16 v2, 8, v2 op_sel_hi:[0,1]
 ; GFX10-NEXT: v_pk_lshlrev_b16 v3, 8, v3 op_sel_hi:[0,1]
 ; GFX10-NEXT: v_pk_sub_i16 v0, v0, v1 clamp
 ; GFX10-NEXT: v_pk_sub_i16 v1, v2, v3 clamp
-; GFX10-NEXT: v_mov_b32_e32 v2, 8
+; GFX10-NEXT: v_mov_b32_e32 v2, 0xff
+; GFX10-NEXT: v_mov_b32_e32 v3, 8
 ; GFX10-NEXT: v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1]
 ; GFX10-NEXT: v_pk_ashrrev_i16 v1, 8, v1 op_sel_hi:[0,1]
-; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX10-NEXT: v_and_b32_e32 v3, s4, v1
-; GFX10-NEXT: v_lshlrev_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX10-NEXT: v_and_or_b32 v0, v0, s4, v2
-; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX10-NEXT: v_and_b32_e32 v4, v1, v2
+; GFX10-NEXT: v_lshlrev_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX10-NEXT: v_and_or_b32 v0, v0, v2, v3
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v4
 ; GFX10-NEXT: v_or3_b32 v0, v0, v2, v1
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
 %lhs = bitcast i32 %lhs.arg to <4 x i8>
@@ -867,11 +864,11 @@
 ; GFX9-NEXT: v_pk_sub_i16 v1, s3, v1 clamp
 ; GFX9-NEXT: v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1]
 ; GFX9-NEXT: v_pk_ashrrev_i16 v1, 8, v1 op_sel_hi:[0,1]
-; GFX9-NEXT: s_movk_i32 s0, 0xff
-; GFX9-NEXT: v_lshlrev_b32_sdwa v2, s2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX9-NEXT: v_mov_b32_e32 v2, 0xff
+; GFX9-NEXT: v_lshlrev_b32_sdwa v3, s2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX9-NEXT: s_mov_b32 s5, 24
-; GFX9-NEXT: v_and_or_b32 v0, v0, s0, v2
-; GFX9-NEXT: v_and_b32_e32 v2, s0, v1
+; GFX9-NEXT: v_and_or_b32 v0, v0, v2, v3
+; GFX9-NEXT: v_and_b32_e32 v2, v1, v2
 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2
 ; GFX9-NEXT: v_lshlrev_b32_sdwa v1, s5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX9-NEXT: v_or3_b32 v0, v0, v2, v1
@@ -909,16 +906,16 @@
 ; GFX10-NEXT: s_pack_ll_b32_b16 s3, s3, s4
 ; GFX10-NEXT: v_pk_sub_i16 v0, s0, s1 clamp
 ; GFX10-NEXT: v_pk_sub_i16 v1, s2, s3 clamp
+; GFX10-NEXT: v_mov_b32_e32 v2, 0xff
 ; GFX10-NEXT: s_mov_b32 s0, 8
-; GFX10-NEXT: s_movk_i32 s1, 0xff
 ; GFX10-NEXT: v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1]
 ; GFX10-NEXT: v_pk_ashrrev_i16 v1, 8, v1 op_sel_hi:[0,1]
-; GFX10-NEXT: v_lshlrev_b32_sdwa v2, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX10-NEXT: v_and_b32_e32 v3, s1, v1
+; GFX10-NEXT: v_lshlrev_b32_sdwa v3, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX10-NEXT: v_and_b32_e32 v4, v1, v2
 ; GFX10-NEXT: s_mov_b32 s0, 24
 ; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX10-NEXT: v_and_or_b32 v0, v0, s1, v2
-; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX10-NEXT: v_and_or_b32 v0, v0, v2, v3
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v4
 ; GFX10-NEXT: v_or3_b32 v0, v0, v2, v1
 ; GFX10-NEXT: v_readfirstlane_b32 s0, v0
 ; GFX10-NEXT: ; return to shader part epilog
@@ -1195,19 +1192,19 @@
 ; GFX6-LABEL: v_ssubsat_v2i32:
 ; GFX6: ; %bb.0:
 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT: s_brev_b32 s4, -2
-; GFX6-NEXT: v_max_i32_e32 v4, -1, v0
-; GFX6-NEXT: s_brev_b32 s5, 1
-; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, s4, v4
-; GFX6-NEXT: v_min_i32_e32 v5, -1, v0
-; GFX6-NEXT: v_subrev_i32_e32 v5, vcc, s5, v5
-; GFX6-NEXT: v_max_i32_e32 v2, v4, v2
-; GFX6-NEXT: v_min_i32_e32 v2, v2, v5
+; GFX6-NEXT: v_bfrev_b32_e32 v4, -2
+; GFX6-NEXT: v_max_i32_e32 v6, -1, v0
+; GFX6-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX6-NEXT: v_sub_i32_e32 v6, vcc, v6, v4
+; GFX6-NEXT: v_min_i32_e32 v7, -1, v0
+; GFX6-NEXT: v_sub_i32_e32 v7, vcc, v7, v5
+; GFX6-NEXT: v_max_i32_e32 v2, v6, v2
+; GFX6-NEXT: v_min_i32_e32 v2, v2, v7
 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v2
 ; GFX6-NEXT: v_max_i32_e32 v2, -1, v1
-; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s4, v2
+; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v4
 ; GFX6-NEXT: v_min_i32_e32 v4, -1, v1
-; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, s5, v4
+; GFX6-NEXT: v_sub_i32_e32 v4, vcc, v4, v5
 ; GFX6-NEXT: v_max_i32_e32 v2, v2, v3
 ; GFX6-NEXT: v_min_i32_e32 v2, v2, v4
 ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v2
@@ -1216,19 +1213,19 @@
 ; GFX8-LABEL: v_ssubsat_v2i32:
 ; GFX8: ; %bb.0:
 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: s_brev_b32 s4, -2
-; GFX8-NEXT: v_max_i32_e32 v4, -1, v0
-; GFX8-NEXT: s_brev_b32 s5, 1
-; GFX8-NEXT: v_subrev_u32_e32 v4, vcc, s4, v4
-; GFX8-NEXT: v_min_i32_e32 v5, -1, v0
-; GFX8-NEXT: v_subrev_u32_e32 v5, vcc, s5, v5
-; GFX8-NEXT: v_max_i32_e32 v2, v4, v2
-; GFX8-NEXT: v_min_i32_e32 v2, v2, v5
+; GFX8-NEXT: v_bfrev_b32_e32 v4, -2
+; GFX8-NEXT: v_max_i32_e32 v6, -1, v0
+; GFX8-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX8-NEXT: v_sub_u32_e32 v6, vcc, v6, v4
+; GFX8-NEXT: v_min_i32_e32 v7, -1, v0
+; GFX8-NEXT: v_sub_u32_e32 v7, vcc, v7, v5
+; GFX8-NEXT: v_max_i32_e32 v2, v6, v2
+; GFX8-NEXT: v_min_i32_e32 v2, v2, v7
 ; GFX8-NEXT: v_sub_u32_e32 v0, vcc, v0, v2
 ; GFX8-NEXT: v_max_i32_e32 v2, -1, v1
-; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, s4, v2
+; GFX8-NEXT: v_sub_u32_e32 v2, vcc, v2, v4
 ; GFX8-NEXT: v_min_i32_e32 v4, -1, v1
-; GFX8-NEXT: v_subrev_u32_e32 v4, vcc, s5, v4
+; GFX8-NEXT: v_sub_u32_e32 v4, vcc, v4, v5
 ; GFX8-NEXT: v_max_i32_e32 v2, v2, v3
 ; GFX8-NEXT: v_min_i32_e32 v2, v2, v4
 ; GFX8-NEXT: v_sub_u32_e32 v1, vcc, v1, v2
@@ -1318,26 +1315,26 @@
 ; GFX6-LABEL: v_ssubsat_v3i32:
 ; GFX6: ; %bb.0:
 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT: s_brev_b32 s4, -2
-; GFX6-NEXT: v_max_i32_e32 v6, -1, v0
-; GFX6-NEXT: s_brev_b32 s5, 1
-; GFX6-NEXT: v_subrev_i32_e32 v6, vcc, s4, v6
-; GFX6-NEXT: v_min_i32_e32 v7, -1, v0
-; GFX6-NEXT: v_subrev_i32_e32 v7, vcc, s5, v7
-; GFX6-NEXT: v_max_i32_e32 v3, v6, v3
-; GFX6-NEXT: v_min_i32_e32 v3, v3, v7
+; GFX6-NEXT: v_bfrev_b32_e32 v6, -2
+; GFX6-NEXT: v_max_i32_e32 v8, -1, v0
+; GFX6-NEXT: v_bfrev_b32_e32 v7, 1
+; GFX6-NEXT: v_sub_i32_e32 v8, vcc, v8, v6
+; GFX6-NEXT: v_min_i32_e32 v9, -1, v0
+; GFX6-NEXT: v_sub_i32_e32 v9, vcc, v9, v7
+; GFX6-NEXT: v_max_i32_e32 v3, v8, v3
+; GFX6-NEXT: v_min_i32_e32 v3, v3, v9
 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v3
 ; GFX6-NEXT: v_max_i32_e32 v3, -1, v1
-; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s4, v3
-; GFX6-NEXT: v_min_i32_e32 v6, -1, v1
-; GFX6-NEXT: v_subrev_i32_e32 v6, vcc, s5, v6
+; GFX6-NEXT: v_sub_i32_e32 v3, vcc, v3, v6
+; GFX6-NEXT: v_min_i32_e32 v8, -1, v1
+; GFX6-NEXT: v_sub_i32_e32 v8, vcc, v8, v7
 ; GFX6-NEXT: v_max_i32_e32 v3, v3, v4
-; GFX6-NEXT: v_min_i32_e32 v3, v3, v6
+; GFX6-NEXT: v_min_i32_e32 v3, v3, v8
 ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v3
 ; GFX6-NEXT: v_max_i32_e32 v3, -1, v2
-; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s4, v3
+; GFX6-NEXT: v_sub_i32_e32 v3, vcc, v3, v6
 ; GFX6-NEXT: v_min_i32_e32 v4, -1, v2
-; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, s5, v4
+; GFX6-NEXT: v_sub_i32_e32 v4, vcc, v4, v7
 ; GFX6-NEXT: v_max_i32_e32 v3, v3, v5
 ; GFX6-NEXT: v_min_i32_e32 v3, v3, v4
 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v3
@@ -1346,26 +1343,26 @@
 ; GFX8-LABEL: v_ssubsat_v3i32:
 ; GFX8: ; %bb.0:
 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: s_brev_b32 s4, -2
-; GFX8-NEXT: v_max_i32_e32 v6, -1, v0
-; GFX8-NEXT: s_brev_b32 s5, 1
-; GFX8-NEXT: v_subrev_u32_e32 v6, vcc, s4, v6
-; GFX8-NEXT: v_min_i32_e32 v7, -1, v0
-; GFX8-NEXT: v_subrev_u32_e32 v7, vcc, s5, v7
-; GFX8-NEXT: v_max_i32_e32 v3, v6, v3
-; GFX8-NEXT: v_min_i32_e32 v3, v3, v7
+; GFX8-NEXT: v_bfrev_b32_e32 v6, -2
+; GFX8-NEXT: v_max_i32_e32 v8, -1, v0
+; GFX8-NEXT: v_bfrev_b32_e32 v7, 1
+; GFX8-NEXT: v_sub_u32_e32 v8, vcc, v8, v6
+; GFX8-NEXT: v_min_i32_e32 v9, -1, v0
+; GFX8-NEXT: v_sub_u32_e32 v9, vcc, v9, v7
+; GFX8-NEXT: v_max_i32_e32 v3, v8, v3
+; GFX8-NEXT: v_min_i32_e32 v3, v3, v9
 ; GFX8-NEXT: v_sub_u32_e32 v0, vcc, v0, v3
 ; GFX8-NEXT: v_max_i32_e32 v3, -1, v1
-; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, s4, v3
-; GFX8-NEXT: v_min_i32_e32 v6, -1, v1
-; GFX8-NEXT: v_subrev_u32_e32 v6, vcc, s5, v6
+; GFX8-NEXT: v_sub_u32_e32 v3, vcc, v3, v6
+; GFX8-NEXT: v_min_i32_e32 v8, -1, v1
+; GFX8-NEXT: v_sub_u32_e32 v8, vcc, v8, v7
 ; GFX8-NEXT: v_max_i32_e32 v3, v3, v4
-; GFX8-NEXT: v_min_i32_e32 v3, v3, v6
+; GFX8-NEXT: v_min_i32_e32 v3, v3, v8
 ; GFX8-NEXT: v_sub_u32_e32 v1, vcc, v1, v3
 ; GFX8-NEXT: v_max_i32_e32 v3, -1, v2
-; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, s4, v3
+; GFX8-NEXT: v_sub_u32_e32 v3, vcc, v3, v6
 ; GFX8-NEXT: v_min_i32_e32 v4, -1, v2
-; GFX8-NEXT: v_subrev_u32_e32 v4, vcc, s5, v4
+; GFX8-NEXT: v_sub_u32_e32 v4, vcc, v4, v7
 ; GFX8-NEXT: v_max_i32_e32 v3, v3, v5
 ; GFX8-NEXT: v_min_i32_e32 v3, v3, v4
 ; GFX8-NEXT: v_sub_u32_e32 v2, vcc, v2, v3
@@ -1476,33 +1473,33 @@
 ; GFX6-LABEL: v_ssubsat_v4i32:
 ; GFX6: ; %bb.0:
 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT: s_brev_b32 s4, -2
-; GFX6-NEXT: v_max_i32_e32 v8, -1, v0
-; GFX6-NEXT: s_brev_b32 s5, 1
-; GFX6-NEXT: v_subrev_i32_e32 v8, vcc, s4, v8
-; GFX6-NEXT: v_min_i32_e32 v9, -1, v0
-; GFX6-NEXT: v_subrev_i32_e32 v9, vcc, s5, v9
-; GFX6-NEXT: v_max_i32_e32 v4, v8, v4
-; GFX6-NEXT: v_min_i32_e32 v4, v4, v9
+; GFX6-NEXT: v_bfrev_b32_e32 v8, -2
+; GFX6-NEXT: v_max_i32_e32 v10, -1, v0
+; GFX6-NEXT: v_bfrev_b32_e32 v9, 1
+; GFX6-NEXT: v_sub_i32_e32 v10, vcc, v10, v8
+; GFX6-NEXT: v_min_i32_e32 v11, -1, v0
+; GFX6-NEXT: v_sub_i32_e32 v11, vcc, v11, v9
+; GFX6-NEXT: v_max_i32_e32 v4, v10, v4
+; GFX6-NEXT: v_min_i32_e32 v4, v4, v11
 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v4
 ; GFX6-NEXT: v_max_i32_e32 v4, -1, v1
-; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, s4, v4
-; GFX6-NEXT: v_min_i32_e32 v8, -1, v1
-; GFX6-NEXT: v_subrev_i32_e32 v8, vcc, s5, v8
+; GFX6-NEXT: v_sub_i32_e32 v4, vcc, v4, v8
+; GFX6-NEXT: v_min_i32_e32 v10, -1, v1
+; GFX6-NEXT: v_sub_i32_e32 v10, vcc, v10, v9
 ; GFX6-NEXT: v_max_i32_e32 v4, v4, v5
-; GFX6-NEXT: v_min_i32_e32 v4, v4, v8
+; GFX6-NEXT: v_min_i32_e32 v4, v4, v10
 ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v4
 ; GFX6-NEXT: v_max_i32_e32 v4, -1, v2
-; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, s4, v4
+; GFX6-NEXT: v_sub_i32_e32 v4, vcc, v4, v8
 ; GFX6-NEXT: v_min_i32_e32 v5, -1, v2
-; GFX6-NEXT: v_subrev_i32_e32 v5, vcc, s5, v5
+; GFX6-NEXT: v_sub_i32_e32 v5, vcc, v5, v9
 ; GFX6-NEXT: v_max_i32_e32 v4, v4, v6
 ; GFX6-NEXT: v_min_i32_e32 v4, v4, v5
 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v4
 ; GFX6-NEXT: v_max_i32_e32 v4, -1, v3
-; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, 0x7fffffff, v4
+; GFX6-NEXT: v_sub_i32_e32 v4, vcc, v4, v8
 ; GFX6-NEXT: v_min_i32_e32 v5, -1, v3
-; GFX6-NEXT: v_subrev_i32_e32 v5, vcc, 0x80000000, v5
+; GFX6-NEXT: v_sub_i32_e32 v5, vcc, v5, v9
 ; GFX6-NEXT: v_max_i32_e32 v4, v4, v7
 ; GFX6-NEXT: v_min_i32_e32 v4, v4, v5
 ; GFX6-NEXT: v_sub_i32_e32 v3, vcc, v3, v4
@@ -1511,33 +1508,33 @@
 ; GFX8-LABEL: v_ssubsat_v4i32:
 ; GFX8: ; %bb.0:
 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: s_brev_b32 s4, -2
-; GFX8-NEXT: v_max_i32_e32 v8, -1, v0
-; GFX8-NEXT: s_brev_b32 s5, 1
-; GFX8-NEXT: v_subrev_u32_e32 v8, vcc, s4, v8
-; GFX8-NEXT: v_min_i32_e32 v9, -1, v0
-; GFX8-NEXT: v_subrev_u32_e32 v9, vcc, s5, v9
-; GFX8-NEXT: v_max_i32_e32 v4, v8, v4
-; GFX8-NEXT: v_min_i32_e32 v4, v4, v9
+; GFX8-NEXT: v_bfrev_b32_e32 v8, -2
+; GFX8-NEXT: v_max_i32_e32 v10, -1, v0
+; GFX8-NEXT: v_bfrev_b32_e32 v9, 1
+; GFX8-NEXT: v_sub_u32_e32 v10, vcc, v10, v8
+; GFX8-NEXT: v_min_i32_e32 v11, -1, v0
+; GFX8-NEXT: v_sub_u32_e32 v11, vcc, v11, v9
+; GFX8-NEXT: v_max_i32_e32 v4, v10, v4
+; GFX8-NEXT: v_min_i32_e32 v4, v4, v11
 ; GFX8-NEXT: v_sub_u32_e32 v0, vcc, v0, v4
 ; GFX8-NEXT: v_max_i32_e32 v4, -1, v1
-; GFX8-NEXT: v_subrev_u32_e32 v4, vcc, s4, v4
-; GFX8-NEXT: v_min_i32_e32 v8, -1, v1
-; GFX8-NEXT: v_subrev_u32_e32 v8, vcc, s5, v8
+; GFX8-NEXT: v_sub_u32_e32 v4, vcc, v4, v8
+; GFX8-NEXT: v_min_i32_e32 v10, -1, v1
+; GFX8-NEXT: v_sub_u32_e32 v10, vcc, v10, v9
 ; GFX8-NEXT: v_max_i32_e32 v4, v4, v5
-; GFX8-NEXT: v_min_i32_e32 v4, v4, v8
+; GFX8-NEXT: v_min_i32_e32 v4, v4, v10
 ; GFX8-NEXT: v_sub_u32_e32 v1, vcc, v1, v4
 ; GFX8-NEXT: v_max_i32_e32 v4, -1, v2
-; GFX8-NEXT: v_subrev_u32_e32 v4, vcc, s4, v4
+; GFX8-NEXT: v_sub_u32_e32 v4, vcc, v4, v8
 ; GFX8-NEXT: v_min_i32_e32 v5, -1, v2
-; GFX8-NEXT: v_subrev_u32_e32 v5, vcc, s5, v5
+; GFX8-NEXT: v_sub_u32_e32 v5, vcc, v5, v9
 ; GFX8-NEXT: v_max_i32_e32 v4, v4, v6
 ; GFX8-NEXT: v_min_i32_e32 v4, v4, v5
 ; GFX8-NEXT: v_sub_u32_e32 v2, vcc, v2, v4
 ; GFX8-NEXT: v_max_i32_e32 v4, -1, v3
-; GFX8-NEXT: v_subrev_u32_e32 v4, vcc, 0x7fffffff, v4
+; GFX8-NEXT: v_sub_u32_e32 v4, vcc, v4, v8
 ; GFX8-NEXT: v_min_i32_e32 v5, -1, v3
-; GFX8-NEXT: v_subrev_u32_e32 v5, vcc, 0x80000000, v5
+; GFX8-NEXT: v_sub_u32_e32 v5, vcc, v5, v9
 ; GFX8-NEXT: v_max_i32_e32 v4, v4, v7
 ; GFX8-NEXT: v_min_i32_e32 v4, v4, v5
 ; GFX8-NEXT: v_sub_u32_e32 v3, vcc, v3, v4
@@ -1669,42 +1666,40 @@
 ; GFX6-LABEL: v_ssubsat_v5i32:
 ; GFX6: ; %bb.0:
 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT: s_brev_b32 s4, -2
-; GFX6-NEXT: v_max_i32_e32 v10, -1, v0
-; GFX6-NEXT: s_brev_b32 s5, 1
-; GFX6-NEXT: v_subrev_i32_e32 v10, vcc, s4, v10
-; GFX6-NEXT: v_min_i32_e32 v12, -1, v0
-; GFX6-NEXT: v_subrev_i32_e32 v12, vcc, s5, v12
-; GFX6-NEXT: v_max_i32_e32 v5, v10, v5
-; GFX6-NEXT: v_min_i32_e32 v5, v5, v12
+; GFX6-NEXT: v_bfrev_b32_e32 v10, -2
+; GFX6-NEXT: v_max_i32_e32 v12, -1, v0
+; GFX6-NEXT: v_bfrev_b32_e32 v11, 1
+; GFX6-NEXT: v_sub_i32_e32 v12, vcc, v12, v10
+; GFX6-NEXT: v_min_i32_e32 v13, -1, v0
+; GFX6-NEXT: v_sub_i32_e32 v13, vcc, v13, v11
+; GFX6-NEXT: v_max_i32_e32 v5, v12, v5
+; GFX6-NEXT: v_min_i32_e32 v5, v5, v13
 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v5
 ; GFX6-NEXT: v_max_i32_e32 v5, -1, v1
-; GFX6-NEXT: v_subrev_i32_e32 v5, vcc, s4, v5
-; GFX6-NEXT: v_min_i32_e32 v10, -1, v1
-; GFX6-NEXT: v_subrev_i32_e32 v10, vcc, s5, v10
+; GFX6-NEXT: v_sub_i32_e32 v5, vcc, v5, v10
+; GFX6-NEXT: v_min_i32_e32 v12, -1, v1
+; GFX6-NEXT: v_sub_i32_e32 v12, vcc, v12, v11
 ; GFX6-NEXT: v_max_i32_e32 v5, v5, v6
-; GFX6-NEXT: v_min_i32_e32 v5, v5, v10
+; GFX6-NEXT: v_min_i32_e32 v5, v5, v12
 ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v5
 ; GFX6-NEXT: v_max_i32_e32 v5, -1, v2
-; GFX6-NEXT: v_subrev_i32_e32 v5, vcc, s4, v5
+; GFX6-NEXT: v_sub_i32_e32 v5, vcc, v5, v10
 ; GFX6-NEXT: v_min_i32_e32 v6, -1, v2
-; GFX6-NEXT: v_subrev_i32_e32 v6, vcc, s5, v6
+; GFX6-NEXT: v_sub_i32_e32 v6, vcc, v6, v11
 ; GFX6-NEXT: v_max_i32_e32 v5, v5, v7
 ; GFX6-NEXT: v_min_i32_e32 v5, v5, v6
-; GFX6-NEXT: v_bfrev_b32_e32 v11, -2
 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v5
 ; GFX6-NEXT: v_max_i32_e32 v5, -1, v3
-; GFX6-NEXT: v_bfrev_b32_e32 v13, 1
-; GFX6-NEXT: v_sub_i32_e32 v5, vcc, v5, v11
+; GFX6-NEXT: v_sub_i32_e32 v5, vcc, v5, v10
 ; GFX6-NEXT: v_min_i32_e32 v6, -1, v3
-; GFX6-NEXT: v_sub_i32_e32 v6, vcc, v6, v13
+; GFX6-NEXT: v_sub_i32_e32 v6, vcc, v6, v11
 ; GFX6-NEXT: v_max_i32_e32 v5, v5, v8
 ; GFX6-NEXT: v_min_i32_e32 v5, v5, v6
 ; GFX6-NEXT: v_sub_i32_e32 v3, vcc, v3, v5
 ; GFX6-NEXT: v_max_i32_e32 v5, -1, v4
-; GFX6-NEXT: v_sub_i32_e32 v5, vcc, v5, v11
+; GFX6-NEXT: v_sub_i32_e32 v5, vcc, v5, v10
 ; GFX6-NEXT: v_min_i32_e32 v6, -1, v4
-; GFX6-NEXT: v_sub_i32_e32 v6, vcc, v6, v13
+; GFX6-NEXT: v_sub_i32_e32 v6, vcc, v6, v11
 ; GFX6-NEXT: v_max_i32_e32 v5, v5, v9
 ; GFX6-NEXT: v_min_i32_e32 v5, v5, v6
 ; GFX6-NEXT: v_sub_i32_e32 v4, vcc, v4, v5
@@ -1713,42 +1708,40 @@
 ; GFX8-LABEL: v_ssubsat_v5i32:
 ; GFX8: ; %bb.0:
 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: s_brev_b32 s4, -2
-; GFX8-NEXT: v_max_i32_e32 v10, -1, v0
-; GFX8-NEXT: s_brev_b32 s5, 1
-; GFX8-NEXT: v_subrev_u32_e32 v10, vcc, s4, v10
-; GFX8-NEXT: v_min_i32_e32 v12, -1, v0
-; GFX8-NEXT: v_subrev_u32_e32 v12, vcc, s5, v12
-; GFX8-NEXT: v_max_i32_e32 v5, v10, v5
-; GFX8-NEXT: v_min_i32_e32 v5, v5, v12
+; GFX8-NEXT: v_bfrev_b32_e32 v10, -2
+; GFX8-NEXT: v_max_i32_e32 v12, -1, v0
+; GFX8-NEXT: v_bfrev_b32_e32 v11, 1
+; GFX8-NEXT: v_sub_u32_e32 v12, vcc, v12, v10
+; GFX8-NEXT: v_min_i32_e32 v13, -1, v0
+; GFX8-NEXT: v_sub_u32_e32 v13, vcc, v13, v11
+; GFX8-NEXT: v_max_i32_e32 v5, v12, v5
+; GFX8-NEXT: v_min_i32_e32 v5, v5, v13
 ; GFX8-NEXT: v_sub_u32_e32 v0, vcc, v0, v5
 ; GFX8-NEXT: v_max_i32_e32 v5, -1, v1
-; GFX8-NEXT: v_subrev_u32_e32 v5, vcc, s4, v5
-; GFX8-NEXT: v_min_i32_e32 v10, -1, v1
-; GFX8-NEXT: v_subrev_u32_e32 v10, vcc, s5, v10
+; GFX8-NEXT: v_sub_u32_e32 v5, vcc, v5, v10
+; GFX8-NEXT: v_min_i32_e32 v12, -1, v1
+; GFX8-NEXT: v_sub_u32_e32 v12, vcc, v12, v11
 ; GFX8-NEXT: v_max_i32_e32 v5, v5, v6
-; GFX8-NEXT: v_min_i32_e32 v5, v5, v10
+; GFX8-NEXT: v_min_i32_e32 v5, v5, v12
 ; GFX8-NEXT: v_sub_u32_e32 v1, vcc, v1, v5
 ; GFX8-NEXT: v_max_i32_e32 v5, -1, v2
-; GFX8-NEXT: v_subrev_u32_e32 v5, vcc, s4, v5
+; GFX8-NEXT: v_sub_u32_e32 v5, vcc, v5, v10
 ; GFX8-NEXT: v_min_i32_e32 v6, -1, v2
-; GFX8-NEXT: v_subrev_u32_e32 v6, vcc, s5, v6
+; GFX8-NEXT: v_sub_u32_e32 v6, vcc, v6, v11
 ; GFX8-NEXT: v_max_i32_e32 v5, v5, v7
 ; GFX8-NEXT: v_min_i32_e32 v5, v5, v6
-; GFX8-NEXT: v_bfrev_b32_e32 v11, -2
 ; GFX8-NEXT: v_sub_u32_e32 v2, vcc, v2, v5
 ; GFX8-NEXT: v_max_i32_e32 v5, -1, v3
-; GFX8-NEXT: v_bfrev_b32_e32 v13, 1
-; GFX8-NEXT: v_sub_u32_e32 v5, vcc, v5, v11
+; GFX8-NEXT: v_sub_u32_e32 v5, vcc, v5, v10
 ; GFX8-NEXT: v_min_i32_e32 v6, -1, v3
-; GFX8-NEXT: v_sub_u32_e32 v6, vcc, v6, v13
+; GFX8-NEXT: v_sub_u32_e32 v6, vcc, v6, v11
 ; GFX8-NEXT: v_max_i32_e32 v5, v5, v8
 ; GFX8-NEXT: v_min_i32_e32 v5, v5, v6
 ; GFX8-NEXT: v_sub_u32_e32 v3, vcc, v3, v5
 ; GFX8-NEXT: v_max_i32_e32 v5, -1, v4
-; GFX8-NEXT: v_sub_u32_e32 v5, vcc, v5, v11
+; GFX8-NEXT: v_sub_u32_e32 v5, vcc, v5, v10
 ; GFX8-NEXT: v_min_i32_e32 v6, -1, v4
-; GFX8-NEXT: v_sub_u32_e32 v6, vcc, v6, v13
+; GFX8-NEXT: v_sub_u32_e32 v6, vcc, v6, v11
 ; GFX8-NEXT: v_max_i32_e32 v5, v5, v9
 ; GFX8-NEXT: v_min_i32_e32 v5, v5, v6
 ; GFX8-NEXT: v_sub_u32_e32 v4, vcc, v4, v5
@@ -1901,119 +1894,117 @@
 ; GFX6-LABEL: v_ssubsat_v16i32:
 ; GFX6: ; %bb.0:
 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT: s_brev_b32 s4, -2
-; GFX6-NEXT: v_max_i32_e32 v32, -1, v0
-; GFX6-NEXT: v_subrev_i32_e32 v32, vcc, s4, v32
-; GFX6-NEXT: v_max_i32_e32 v16, v32, v16
-; GFX6-NEXT: s_brev_b32 s5, 1
-; GFX6-NEXT: v_min_i32_e32 v32, -1, v0
-; GFX6-NEXT: v_subrev_i32_e32 v32, vcc, s5, v32
-; GFX6-NEXT: v_min_i32_e32 v16, v16, v32
+; GFX6-NEXT: v_bfrev_b32_e32 v32, -2
+; GFX6-NEXT: v_max_i32_e32 v33, -1, v0
+; GFX6-NEXT: v_sub_i32_e32 v33, vcc, v33, v32
+; GFX6-NEXT: v_max_i32_e32 v16, v33, v16
+; GFX6-NEXT: v_bfrev_b32_e32 v33, 1
+; GFX6-NEXT: v_min_i32_e32 v34, -1, v0
+; GFX6-NEXT: v_sub_i32_e32 v34, vcc, v34, v33
+; GFX6-NEXT: v_min_i32_e32 v16, v16, v34
 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v16
 ; GFX6-NEXT: v_max_i32_e32 v16, -1, v1
-; GFX6-NEXT: v_subrev_i32_e32 v16, vcc, s4, v16
+; GFX6-NEXT: v_sub_i32_e32 v16, vcc, v16, v32
 ; GFX6-NEXT: v_max_i32_e32 v16, v16, v17
 ; GFX6-NEXT: v_min_i32_e32 v17, -1, v1
-; GFX6-NEXT: v_subrev_i32_e32 v17, vcc, s5, v17
+; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v17, v33
 ; GFX6-NEXT: v_min_i32_e32 v16, v16, v17
 ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v16
 ; GFX6-NEXT: v_max_i32_e32 v16, -1, v2
-; GFX6-NEXT: v_subrev_i32_e32 v16, vcc, s4, v16
+; GFX6-NEXT: v_sub_i32_e32 v16, vcc, v16, v32
 ; GFX6-NEXT: v_min_i32_e32 v17, -1, v2
 ; GFX6-NEXT: v_max_i32_e32 v16, v16, v18
-; GFX6-NEXT: v_subrev_i32_e32 v17, vcc, s5, v17
+; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v17, v33
 ; GFX6-NEXT: v_min_i32_e32 v16, v16, v17
 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v16
-; GFX6-NEXT: v_bfrev_b32_e32 v16, -2
-; GFX6-NEXT: v_max_i32_e32 v17, -1, v3
-; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v17, v16
-; GFX6-NEXT: v_max_i32_e32 v17, v17, v19
-; GFX6-NEXT: v_bfrev_b32_e32 v18, 1
-; GFX6-NEXT: v_min_i32_e32 v19, -1, v3
-; GFX6-NEXT: v_sub_i32_e32 v19, vcc, v19, v18
-; GFX6-NEXT: v_min_i32_e32 v17, v17, v19
-; GFX6-NEXT: v_sub_i32_e32 v3, vcc, v3, v17
-; GFX6-NEXT: v_max_i32_e32 v17, -1, v4
-; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v17, v16
-; GFX6-NEXT: v_min_i32_e32 v19, -1, v4
-; GFX6-NEXT: v_max_i32_e32 v17, v17, v20
-; GFX6-NEXT: v_sub_i32_e32 v19, vcc, v19, v18
-; GFX6-NEXT: v_min_i32_e32 v17, v17, v19
-; GFX6-NEXT: v_sub_i32_e32 v4, vcc, v4, v17
-; GFX6-NEXT: v_max_i32_e32 v17, -1, v5
-; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v17, v16
-; GFX6-NEXT: v_min_i32_e32 v19, -1, v5
-; GFX6-NEXT: v_max_i32_e32 v17, v17, v21
-; GFX6-NEXT: v_sub_i32_e32 v19, vcc, v19, v18
-; GFX6-NEXT: v_min_i32_e32 v17, v17, v19
-; GFX6-NEXT: v_sub_i32_e32 v5, vcc, v5, v17
-; GFX6-NEXT: v_max_i32_e32 v17, -1, v6
-; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v17, v16
-; GFX6-NEXT: v_min_i32_e32 v19, -1, v6
-; GFX6-NEXT: v_max_i32_e32 v17, v17, v22
-; GFX6-NEXT: v_sub_i32_e32 v19, vcc, v19, v18
-; GFX6-NEXT: v_min_i32_e32 v17, v17, v19
-; GFX6-NEXT: v_sub_i32_e32 v6, vcc, v6, v17
-; GFX6-NEXT: v_max_i32_e32 v17, -1, v7
-; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v17, v16
-; GFX6-NEXT: v_min_i32_e32 v19, -1, v7
-; GFX6-NEXT: v_max_i32_e32 v17, v17, v23
-; GFX6-NEXT: v_sub_i32_e32 v19, vcc, v19, v18
-; GFX6-NEXT: v_min_i32_e32 v17, v17, v19
-; GFX6-NEXT: v_sub_i32_e32 v7, vcc, v7, v17
-; GFX6-NEXT: v_max_i32_e32 v17, -1, v8
-; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v17, v16
-; GFX6-NEXT: v_min_i32_e32 v19, -1, v8
-; GFX6-NEXT: v_max_i32_e32 v17, v17, v24
-; GFX6-NEXT: v_sub_i32_e32 v19, vcc, v19, v18
-; GFX6-NEXT: v_min_i32_e32 v17, v17, v19
-; GFX6-NEXT: v_sub_i32_e32 v8, vcc, v8, v17
-; GFX6-NEXT: v_max_i32_e32 v17, -1, v9
-; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v17, v16
-; GFX6-NEXT: v_min_i32_e32 v19, -1, v9
-; GFX6-NEXT: v_max_i32_e32 v17, v17, v25
-; GFX6-NEXT: v_sub_i32_e32 v19, vcc, v19, v18
-; GFX6-NEXT: v_min_i32_e32 v17, v17, v19
-; GFX6-NEXT: v_sub_i32_e32 v9, vcc, v9, v17
-; GFX6-NEXT: v_max_i32_e32 v17, -1, v10
-; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v17, v16
-; GFX6-NEXT: v_min_i32_e32 v19, -1, v10
-; GFX6-NEXT: v_max_i32_e32 v17, v17, v26
-; GFX6-NEXT: v_sub_i32_e32 v19, vcc, v19, v18
-; GFX6-NEXT: v_min_i32_e32 v17, v17, v19
-; GFX6-NEXT: v_sub_i32_e32 v10, vcc, v10, v17
-; GFX6-NEXT: v_max_i32_e32 v17, -1, v11
-; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v17, v16
-; GFX6-NEXT: v_min_i32_e32 v19, -1, v11
-; GFX6-NEXT: v_max_i32_e32 v17, v17, v27
-; GFX6-NEXT: v_sub_i32_e32 v19, vcc, v19, v18
-; GFX6-NEXT: v_min_i32_e32 v17, v17, v19
-; GFX6-NEXT: v_sub_i32_e32 v11, vcc, v11, v17
-; GFX6-NEXT: v_max_i32_e32 v17, -1, v12
-; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v17, v16
-; GFX6-NEXT: v_min_i32_e32 v19, -1, v12
-; GFX6-NEXT: v_max_i32_e32 v17, v17, v28
-; GFX6-NEXT: v_sub_i32_e32 v19, vcc, v19, v18
-; GFX6-NEXT: v_min_i32_e32 v17, v17, v19
-; GFX6-NEXT: v_sub_i32_e32 v12, vcc, v12, v17
-; GFX6-NEXT: v_max_i32_e32 v17, -1, v13
-; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v17, v16
-; GFX6-NEXT: v_min_i32_e32 v19, -1, v13
-; GFX6-NEXT: v_max_i32_e32 v17, v17, v29
-; GFX6-NEXT: v_sub_i32_e32 v19, vcc, v19, v18
-; GFX6-NEXT: v_min_i32_e32 v17, v17, v19
-; GFX6-NEXT: v_sub_i32_e32 v13, vcc, v13, v17
-; GFX6-NEXT: v_max_i32_e32 v17, -1, v14
-; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v17, v16
-; GFX6-NEXT: v_min_i32_e32 v19, -1, v14
-; GFX6-NEXT: v_max_i32_e32 v17, v17, v30
-; GFX6-NEXT: v_sub_i32_e32 v19, vcc, v19, v18
-; GFX6-NEXT: v_min_i32_e32 v17, v17, v19
-; GFX6-NEXT: v_sub_i32_e32 v14, vcc, v14, v17
-; GFX6-NEXT: v_max_i32_e32 v17, -1, v15
-; GFX6-NEXT: v_sub_i32_e32 v16, vcc, v17, v16
+; GFX6-NEXT: v_max_i32_e32 v16, -1, v3
+; GFX6-NEXT: v_sub_i32_e32 v16, vcc, v16, v32
+; GFX6-NEXT: v_min_i32_e32 v17, -1, v3
+; GFX6-NEXT: v_max_i32_e32 v16, v16, v19
+; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v17, v33
+; GFX6-NEXT: v_min_i32_e32 v16, v16, v17
+; GFX6-NEXT: v_sub_i32_e32 v3, vcc, v3, v16
+; GFX6-NEXT: v_max_i32_e32 v16, -1, v4
+; GFX6-NEXT: v_sub_i32_e32 v16, vcc, v16, v32
+; GFX6-NEXT: v_min_i32_e32 v17, -1, v4
+; GFX6-NEXT: v_max_i32_e32 v16, v16, v20
+; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v17, v33
+; GFX6-NEXT: v_min_i32_e32 v16, v16, v17
+; GFX6-NEXT: v_sub_i32_e32 v4, vcc, v4, v16
+; GFX6-NEXT: v_max_i32_e32 v16, -1, v5
+; GFX6-NEXT: v_sub_i32_e32 v16, vcc, v16, v32
+; GFX6-NEXT: v_min_i32_e32 v17, -1, v5
+; GFX6-NEXT: v_max_i32_e32 v16, v16, v21
+; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v17, v33
+; GFX6-NEXT: v_min_i32_e32 v16, v16, v17
+; GFX6-NEXT: v_sub_i32_e32 v5, vcc, v5, v16
+; GFX6-NEXT: v_max_i32_e32 v16, -1, v6
+; GFX6-NEXT: v_sub_i32_e32 v16, vcc, v16, v32
+; GFX6-NEXT: v_min_i32_e32 v17, -1, v6
+; GFX6-NEXT: v_max_i32_e32 v16, v16, v22
+; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v17, v33
+; GFX6-NEXT: v_min_i32_e32 v16, v16, v17
+; GFX6-NEXT: v_sub_i32_e32 v6, vcc, v6, v16
+; GFX6-NEXT: v_max_i32_e32 v16, -1, v7
+; GFX6-NEXT: v_sub_i32_e32 v16, vcc, v16, v32
+; GFX6-NEXT: v_min_i32_e32 v17, -1, v7
+; GFX6-NEXT: v_max_i32_e32 v16, v16, v23
+; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v17, v33
+; GFX6-NEXT: v_min_i32_e32 v16, v16, v17
+; GFX6-NEXT: v_sub_i32_e32 v7, vcc, v7, v16
+; GFX6-NEXT: v_max_i32_e32 v16, -1, v8
+; GFX6-NEXT: v_sub_i32_e32 v16, vcc, v16, v32
+; GFX6-NEXT: v_min_i32_e32 v17, -1, v8
+; GFX6-NEXT: v_max_i32_e32 v16, v16, v24
+; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v17, v33
+; GFX6-NEXT: v_min_i32_e32 v16, v16, v17
+; GFX6-NEXT: v_sub_i32_e32 v8, vcc, v8, v16
+; GFX6-NEXT: v_max_i32_e32 v16, -1, v9
+; GFX6-NEXT: v_sub_i32_e32 v16, vcc, v16, v32
+; GFX6-NEXT: v_min_i32_e32 v17, -1, v9
+; GFX6-NEXT: v_max_i32_e32 v16, v16, v25
+; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v17, v33
+; GFX6-NEXT: v_min_i32_e32 v16, v16, v17
+; GFX6-NEXT: v_sub_i32_e32 v9, vcc, v9, v16
+; GFX6-NEXT: v_max_i32_e32 v16, -1, v10
+; GFX6-NEXT: v_sub_i32_e32 v16, vcc, v16, v32
+; GFX6-NEXT: v_min_i32_e32 v17, -1, v10
+; GFX6-NEXT: v_max_i32_e32 v16, v16, v26
+; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v17, v33
+; GFX6-NEXT: v_min_i32_e32 v16, v16, v17
+; GFX6-NEXT: v_sub_i32_e32 v10, vcc, v10, v16
+; GFX6-NEXT: v_max_i32_e32 v16, -1, v11
+; GFX6-NEXT: v_sub_i32_e32 v16, vcc, v16, v32
+; GFX6-NEXT: v_min_i32_e32 v17, -1, v11
+; GFX6-NEXT: v_max_i32_e32 v16, v16, v27
+; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v17, v33
+; GFX6-NEXT: v_min_i32_e32 v16, v16, v17
+; GFX6-NEXT: v_sub_i32_e32 v11, vcc, v11, v16
+; GFX6-NEXT: v_max_i32_e32 v16, -1, v12
+; GFX6-NEXT: v_sub_i32_e32 v16, vcc, v16, v32
+; GFX6-NEXT: v_min_i32_e32 v17, -1, v12
+; GFX6-NEXT: v_max_i32_e32 v16, v16, v28
+; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v17, v33
+; GFX6-NEXT: v_min_i32_e32 v16, v16, v17
+; GFX6-NEXT: v_sub_i32_e32 v12, vcc, v12, v16
+; GFX6-NEXT: v_max_i32_e32 v16, -1, v13
+; GFX6-NEXT: v_sub_i32_e32 v16, vcc, v16, v32
+; GFX6-NEXT: v_min_i32_e32 v17, -1, v13
+; GFX6-NEXT: v_max_i32_e32 v16, v16, v29
+; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v17, v33
+; GFX6-NEXT: v_min_i32_e32 v16, v16, v17
+; GFX6-NEXT: v_sub_i32_e32 v13, vcc, v13, v16
+; GFX6-NEXT: v_max_i32_e32 v16, -1, v14
+; GFX6-NEXT: v_sub_i32_e32 v16, vcc, v16, v32
+; GFX6-NEXT: v_min_i32_e32 v17, -1, v14
+; GFX6-NEXT: v_max_i32_e32 v16, v16, v30
+; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v17, v33
+; GFX6-NEXT: v_min_i32_e32 v16, v16, v17
+; GFX6-NEXT: v_sub_i32_e32 v14, vcc, v14, v16
+; GFX6-NEXT: v_max_i32_e32 v16, -1, v15
+; GFX6-NEXT: v_sub_i32_e32 v16, vcc, v16, v32
 ; GFX6-NEXT: v_min_i32_e32 v17, -1, v15
-; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v17, v18
+; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v17, v33
 ; GFX6-NEXT: v_max_i32_e32 v16, v16, v31
 ; GFX6-NEXT: v_min_i32_e32 v16, v16, v17
 ; GFX6-NEXT: v_sub_i32_e32 v15, vcc, v15, v16
@@ -2022,119 +2013,117 @@
 ; GFX8-LABEL: v_ssubsat_v16i32:
 ; GFX8: ; %bb.0:
 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: s_brev_b32 s4, -2
-; GFX8-NEXT: v_max_i32_e32 v32, -1, v0
-; GFX8-NEXT: v_subrev_u32_e32 v32, vcc, s4, v32
-; GFX8-NEXT: v_max_i32_e32 v16, v32, v16
-; GFX8-NEXT: s_brev_b32 s5, 1
-; GFX8-NEXT: v_min_i32_e32 v32, -1, v0
-; GFX8-NEXT: v_subrev_u32_e32 v32, vcc, s5, v32
-; GFX8-NEXT: v_min_i32_e32 v16, v16, v32
+; GFX8-NEXT: v_bfrev_b32_e32 v32, -2
+; GFX8-NEXT: v_max_i32_e32 v33, -1, v0
+; GFX8-NEXT: v_sub_u32_e32 v33, vcc, v33, v32
+; GFX8-NEXT: v_max_i32_e32 v16, v33, v16
+; GFX8-NEXT: v_bfrev_b32_e32 v33, 1
+; GFX8-NEXT: v_min_i32_e32 v34, -1, v0
+; GFX8-NEXT: v_sub_u32_e32 v34, vcc, v34, v33
+; GFX8-NEXT: v_min_i32_e32 v16, v16, v34
 ; GFX8-NEXT: v_sub_u32_e32 v0, vcc, v0, v16
 ; GFX8-NEXT: v_max_i32_e32 v16, -1, v1
-; GFX8-NEXT: v_subrev_u32_e32 v16, vcc, s4, v16
+; GFX8-NEXT: v_sub_u32_e32 v16, vcc, v16, v32
 ; GFX8-NEXT: v_max_i32_e32 v16, v16, v17
 ; GFX8-NEXT: v_min_i32_e32 v17, -1, v1
-; GFX8-NEXT: v_subrev_u32_e32 v17, vcc, s5, v17
+; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v17, v33
 ; GFX8-NEXT: v_min_i32_e32 v16, v16, v17
 ; GFX8-NEXT: v_sub_u32_e32 v1, vcc, v1, v16
 ; GFX8-NEXT: v_max_i32_e32 v16, -1, v2
-; GFX8-NEXT: v_subrev_u32_e32 v16, vcc, s4, v16
+; GFX8-NEXT: v_sub_u32_e32 v16, vcc, v16, v32
 ; GFX8-NEXT: v_min_i32_e32 v17, -1, v2
 ; GFX8-NEXT: v_max_i32_e32 v16, v16, v18
-; GFX8-NEXT: v_subrev_u32_e32 v17, vcc, s5, v17
+; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v17, v33
 ; GFX8-NEXT: v_min_i32_e32 v16, v16, v17
 ; GFX8-NEXT: v_sub_u32_e32 v2, vcc, v2, v16
-; GFX8-NEXT: v_bfrev_b32_e32 v16, -2
-; GFX8-NEXT: v_max_i32_e32 v17, -1, v3
-; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v17, v16
-; GFX8-NEXT: v_max_i32_e32 v17, v17, v19
-; GFX8-NEXT: v_bfrev_b32_e32 v18, 1
-; GFX8-NEXT: v_min_i32_e32 v19, -1, v3
-; GFX8-NEXT: v_sub_u32_e32 v19, vcc, v19, v18
-; GFX8-NEXT: v_min_i32_e32 v17, v17, v19
-; GFX8-NEXT: v_sub_u32_e32 v3, vcc, v3, v17
-; GFX8-NEXT: v_max_i32_e32 v17, -1, v4
-; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v17, v16
-; GFX8-NEXT: v_min_i32_e32 v19, -1, v4
-; GFX8-NEXT: v_max_i32_e32 v17, v17, v20
-; GFX8-NEXT: v_sub_u32_e32 v19, vcc, v19, v18
-; GFX8-NEXT: v_min_i32_e32 v17, v17, v19
-; GFX8-NEXT: v_sub_u32_e32 v4, vcc, v4, v17
-; GFX8-NEXT: v_max_i32_e32 v17, -1, v5
-; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v17, v16
-; GFX8-NEXT: v_min_i32_e32 v19, -1, v5
-; GFX8-NEXT: v_max_i32_e32 v17, v17, v21
-; GFX8-NEXT: v_sub_u32_e32 v19, vcc, v19, v18
-; GFX8-NEXT: v_min_i32_e32 v17, v17, v19
-; GFX8-NEXT: v_sub_u32_e32 v5, vcc, v5, v17
-; GFX8-NEXT: v_max_i32_e32 v17, -1, v6
-; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v17, v16
-; GFX8-NEXT: v_min_i32_e32 v19, -1, v6
-; GFX8-NEXT: v_max_i32_e32 v17, v17, v22
-; GFX8-NEXT: v_sub_u32_e32 v19, vcc, v19, v18
-; GFX8-NEXT: v_min_i32_e32 v17, v17, v19
-; GFX8-NEXT: v_sub_u32_e32 v6, vcc, v6, v17
-; GFX8-NEXT: v_max_i32_e32 v17, -1, v7
-; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v17, v16
-; GFX8-NEXT: v_min_i32_e32 v19, -1, v7
-; GFX8-NEXT: v_max_i32_e32 v17, v17, v23
-; GFX8-NEXT: v_sub_u32_e32 v19, vcc, v19, v18
-; GFX8-NEXT: v_min_i32_e32 v17, v17, v19
-; GFX8-NEXT: v_sub_u32_e32 v7, vcc, v7, v17
-; GFX8-NEXT: v_max_i32_e32 v17, -1, v8
-; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v17, v16
-; GFX8-NEXT: v_min_i32_e32 v19, -1, v8
-; GFX8-NEXT: v_max_i32_e32 v17, v17, v24
-; GFX8-NEXT: v_sub_u32_e32 v19, vcc, v19, v18
-; GFX8-NEXT: v_min_i32_e32 v17, v17, v19
-; GFX8-NEXT: v_sub_u32_e32 v8, vcc, v8, v17
-; GFX8-NEXT: v_max_i32_e32 v17, -1, v9
-; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v17, v16
-; GFX8-NEXT: v_min_i32_e32 v19, -1, v9
-; GFX8-NEXT: v_max_i32_e32 v17, v17, v25
-; GFX8-NEXT: v_sub_u32_e32 v19, vcc, v19, v18
-; GFX8-NEXT: v_min_i32_e32 v17, v17, v19
-; GFX8-NEXT: v_sub_u32_e32 v9, vcc, v9, v17
-; GFX8-NEXT: v_max_i32_e32 v17, -1, v10
-; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v17, v16
-; GFX8-NEXT: v_min_i32_e32 v19, -1, v10
-; GFX8-NEXT: v_max_i32_e32 v17, v17, v26
-; GFX8-NEXT: v_sub_u32_e32 v19, vcc, v19, v18
-; GFX8-NEXT: v_min_i32_e32 v17, v17, v19
-; GFX8-NEXT: v_sub_u32_e32 v10, vcc, v10, v17
-; GFX8-NEXT: v_max_i32_e32 v17, -1, v11
-; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v17, v16
-; GFX8-NEXT: v_min_i32_e32 v19, -1, v11
-; GFX8-NEXT: v_max_i32_e32 v17, v17, v27
-; GFX8-NEXT: v_sub_u32_e32 v19, vcc, v19, v18
-; GFX8-NEXT: v_min_i32_e32 v17, v17, v19
-; GFX8-NEXT: v_sub_u32_e32 v11, vcc, v11, v17
-; GFX8-NEXT: v_max_i32_e32 v17, -1, v12
-; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v17, v16
-; GFX8-NEXT: v_min_i32_e32 v19, -1, v12
-; GFX8-NEXT: v_max_i32_e32 v17, v17, v28
-; GFX8-NEXT: v_sub_u32_e32 v19, vcc, v19, v18
-; GFX8-NEXT: v_min_i32_e32 v17, v17, v19
-; GFX8-NEXT: v_sub_u32_e32 v12, vcc, v12, v17
-; GFX8-NEXT: v_max_i32_e32 v17, -1, v13
-; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v17, v16
-; GFX8-NEXT: v_min_i32_e32 v19, -1, v13
-; GFX8-NEXT: v_max_i32_e32 v17, v17, v29
-; GFX8-NEXT: v_sub_u32_e32 v19, vcc, v19, v18
-; GFX8-NEXT: v_min_i32_e32 v17, v17, v19
-; GFX8-NEXT: v_sub_u32_e32 v13, vcc, v13, v17
-; GFX8-NEXT: v_max_i32_e32 v17, -1, v14
-; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v17, v16
-; GFX8-NEXT: v_min_i32_e32 v19, -1, v14
-; GFX8-NEXT: v_max_i32_e32 v17, v17, v30
-; GFX8-NEXT: v_sub_u32_e32 v19, vcc, v19, v18
-; GFX8-NEXT: v_min_i32_e32 v17, v17, v19
-; GFX8-NEXT: v_sub_u32_e32 v14, vcc, v14, v17
-; GFX8-NEXT: v_max_i32_e32 v17, -1, v15
-; GFX8-NEXT: v_sub_u32_e32 v16, vcc, v17, v16
+; GFX8-NEXT: v_max_i32_e32 v16, -1, v3
+; GFX8-NEXT: v_sub_u32_e32 v16, vcc, v16, v32
+; GFX8-NEXT: v_min_i32_e32 v17, -1, v3
+; GFX8-NEXT: v_max_i32_e32 v16, v16, v19
+; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v17, v33
+; GFX8-NEXT: v_min_i32_e32 v16, v16, v17
+; GFX8-NEXT: v_sub_u32_e32 v3, vcc, v3, v16
+; GFX8-NEXT: v_max_i32_e32 v16, -1, v4
+; GFX8-NEXT: v_sub_u32_e32 v16, vcc, v16, v32
+; GFX8-NEXT: v_min_i32_e32 v17, -1, v4
+; GFX8-NEXT: v_max_i32_e32 v16, v16, v20
+; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v17, v33
+; GFX8-NEXT: v_min_i32_e32 v16, v16, v17
+; GFX8-NEXT: v_sub_u32_e32 v4, vcc, v4, v16
+; GFX8-NEXT: v_max_i32_e32 v16, -1, v5
+; GFX8-NEXT: v_sub_u32_e32 v16, vcc, v16, v32
+; GFX8-NEXT: v_min_i32_e32 v17, -1, v5
+; GFX8-NEXT: v_max_i32_e32 v16, v16, v21
+; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v17, v33
+; GFX8-NEXT: v_min_i32_e32 v16, v16, v17
+; GFX8-NEXT: v_sub_u32_e32 v5, vcc, v5, v16
+; GFX8-NEXT: v_max_i32_e32 v16, -1, v6
+; GFX8-NEXT: v_sub_u32_e32 v16, vcc, v16, v32
+; GFX8-NEXT: v_min_i32_e32 v17, -1, v6
+; GFX8-NEXT: v_max_i32_e32 v16, v16, v22
+; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v17, v33
+; GFX8-NEXT: v_min_i32_e32 v16, v16, v17
+; GFX8-NEXT: v_sub_u32_e32 v6, vcc, v6, v16
+; GFX8-NEXT: v_max_i32_e32 v16, -1, v7
+; GFX8-NEXT: v_sub_u32_e32 v16, vcc, v16, v32
+; GFX8-NEXT: v_min_i32_e32 v17, -1, v7
+; GFX8-NEXT: v_max_i32_e32 v16, v16, v23
+; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v17, v33
+; GFX8-NEXT: v_min_i32_e32 v16, v16, v17
+; GFX8-NEXT: v_sub_u32_e32 v7, vcc, v7, v16
+; GFX8-NEXT: v_max_i32_e32 v16, -1, v8
+; GFX8-NEXT: v_sub_u32_e32 v16, vcc, v16, v32
+; GFX8-NEXT: v_min_i32_e32 v17, -1, v8
+; GFX8-NEXT: v_max_i32_e32 v16, v16, v24
+; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v17, v33
+; GFX8-NEXT: v_min_i32_e32 v16, v16, v17
+; GFX8-NEXT: v_sub_u32_e32 v8, vcc, v8, v16
+; GFX8-NEXT: v_max_i32_e32 v16, -1, v9
+; GFX8-NEXT: v_sub_u32_e32 v16, vcc, v16, v32
+; GFX8-NEXT: v_min_i32_e32 v17, -1, v9
+; GFX8-NEXT: v_max_i32_e32 v16, v16, v25
+; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v17, v33
+; GFX8-NEXT: v_min_i32_e32 v16, v16, v17
+; GFX8-NEXT: v_sub_u32_e32 v9, vcc, v9, v16
+; GFX8-NEXT: v_max_i32_e32 v16, -1, v10
+; GFX8-NEXT: v_sub_u32_e32 v16, vcc, v16, v32
+; GFX8-NEXT: v_min_i32_e32 v17, -1, v10
+; GFX8-NEXT: v_max_i32_e32 v16, v16, v26
+; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v17, v33
+; GFX8-NEXT: v_min_i32_e32 v16, v16, v17
+; GFX8-NEXT: v_sub_u32_e32 v10, vcc, v10, v16
+; GFX8-NEXT: v_max_i32_e32 v16, -1, v11
+; GFX8-NEXT: v_sub_u32_e32 v16, vcc, v16, v32
+; GFX8-NEXT: v_min_i32_e32 v17, -1, v11
+; GFX8-NEXT: v_max_i32_e32 v16, v16, v27
+; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v17, v33
+; GFX8-NEXT: v_min_i32_e32 v16, v16, v17
+; GFX8-NEXT: v_sub_u32_e32 v11, vcc, v11, v16
+; GFX8-NEXT: v_max_i32_e32 v16, -1, v12
+; GFX8-NEXT: v_sub_u32_e32 v16, vcc, v16, v32
+; GFX8-NEXT: v_min_i32_e32 v17, -1, v12
+; GFX8-NEXT: v_max_i32_e32 v16, v16, v28
+; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v17, v33
+; GFX8-NEXT: v_min_i32_e32 v16, v16, v17
+; GFX8-NEXT: v_sub_u32_e32 v12, vcc, v12, v16
+; GFX8-NEXT: v_max_i32_e32 v16, -1, v13
+; GFX8-NEXT: v_sub_u32_e32 v16, vcc, v16, v32
+; GFX8-NEXT: v_min_i32_e32 v17, -1, v13
+; GFX8-NEXT: v_max_i32_e32 v16, v16, v29
+; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v17, v33
+; GFX8-NEXT: v_min_i32_e32 v16, v16, v17
+; GFX8-NEXT: v_sub_u32_e32 v13, vcc, v13, v16
+; GFX8-NEXT: v_max_i32_e32 v16, -1, v14
+; GFX8-NEXT: v_sub_u32_e32 v16, vcc, v16, v32
+; GFX8-NEXT: v_min_i32_e32 v17, -1, v14
+; GFX8-NEXT: v_max_i32_e32 v16, v16, v30
+; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v17, v33
+; GFX8-NEXT: v_min_i32_e32 v16, v16, v17
+; GFX8-NEXT: v_sub_u32_e32 v14, vcc, v14, v16
+; GFX8-NEXT: v_max_i32_e32 v16, -1, v15
+; GFX8-NEXT: v_sub_u32_e32 v16, vcc, v16, v32
 ; GFX8-NEXT: v_min_i32_e32 v17, -1, v15
-; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v17, v18
+; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v17, v33
 ; GFX8-NEXT: v_max_i32_e32 v16, v16, v31
 ; GFX8-NEXT: v_min_i32_e32 v16, v16, v17
 ; GFX8-NEXT: v_sub_u32_e32 v15, vcc, v15, v16
@@ -2693,22 +2682,22 @@
 ; GFX6: ; %bb.0:
 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX6-NEXT: s_brev_b32 s4, -2
-; GFX6-NEXT: v_max_i32_e32 v4, -1, v0
+; GFX6-NEXT: v_bfrev_b32_e32 v4, -2
+; GFX6-NEXT: v_max_i32_e32 v6, -1, v0
 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX6-NEXT: s_brev_b32 s5, 1
-; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, s4, v4
-; GFX6-NEXT: v_min_i32_e32 v5, -1, v0
-; GFX6-NEXT: v_subrev_i32_e32 v5, vcc, s5, v5
-; GFX6-NEXT: v_max_i32_e32 v2, v4, v2
-; GFX6-NEXT: v_min_i32_e32 v2, v2, v5
+; GFX6-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX6-NEXT: v_sub_i32_e32 v6, vcc, v6, v4
+; GFX6-NEXT: v_min_i32_e32 v7, -1, v0
+; GFX6-NEXT: v_sub_i32_e32 v7, vcc, v7, v5
+; GFX6-NEXT: v_max_i32_e32 v2, v6, v2
+; GFX6-NEXT: v_min_i32_e32 v2, v2, v7
 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v2
 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v3
 ; GFX6-NEXT: v_max_i32_e32 v3, -1, v1
-; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s4, v3
+; GFX6-NEXT: v_sub_i32_e32 v3, vcc, v3, v4
 ; GFX6-NEXT: v_min_i32_e32 v4, -1, v1
-; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, s5, v4
+; GFX6-NEXT: v_sub_i32_e32 v4, vcc, v4, v5
 ; GFX6-NEXT: v_max_i32_e32 v2, v3, v2
 ; GFX6-NEXT: v_min_i32_e32 v2, v2, v4
 ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v2
@@ -2719,22 +2708,22 @@
 ; GFX8-LABEL: v_ssubsat_v2i16:
 ; GFX8: ; %bb.0:
 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: s_movk_i32 s4, 0x7fff
-; GFX8-NEXT: v_max_i16_e32 v3, -1, v0
-; GFX8-NEXT: s_movk_i32 s5, 0x8000
-; GFX8-NEXT: v_subrev_u16_e32 v3, s4, v3
-; GFX8-NEXT: v_min_i16_e32 v4, -1, v0
+; GFX8-NEXT: v_mov_b32_e32 v3, 0x7fff
+; GFX8-NEXT: v_max_i16_e32 v5, -1, v0
+; GFX8-NEXT: v_mov_b32_e32 v4, 0xffff8000
+; GFX8-NEXT: v_sub_u16_e32 v5, v5, v3
+; GFX8-NEXT: v_min_i16_e32 v6, -1, v0
 ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v0
-; GFX8-NEXT: v_subrev_u16_e32 v4, s5, v4
-; GFX8-NEXT: v_max_i16_e32 v3, v3, v1
-; GFX8-NEXT: v_min_i16_e32 v3, v3, v4
-; GFX8-NEXT: v_max_i16_e32 v4, -1, v2
-; GFX8-NEXT: v_subrev_u16_e32 v4, s4, v4
-; GFX8-NEXT: v_min_i16_e32 v5, -1, v2
-; GFX8-NEXT: v_subrev_u16_e32 v5, s5, v5
-; GFX8-NEXT: v_max_i16_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-NEXT: v_min_i16_e32 v1, v1, v5
-; GFX8-NEXT: v_sub_u16_e32 v0, v0, v3
+; GFX8-NEXT: v_sub_u16_e32 v6, v6, v4
+; GFX8-NEXT: v_max_i16_e32 v5, v5, v1
+; GFX8-NEXT: v_min_i16_e32 v5, v5, v6
+; GFX8-NEXT: v_max_i16_e32 v6, -1, v2
+; GFX8-NEXT: v_sub_u16_e32 v3, v6, v3
+; GFX8-NEXT: v_min_i16_e32 v6, -1, v2
+; GFX8-NEXT: v_sub_u16_e32 v4, v6, v4
+; GFX8-NEXT: v_max_i16_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT: v_min_i16_e32 v1, v1, v4
+; GFX8-NEXT: v_sub_u16_e32 v0, v0, v5
 ; GFX8-NEXT: v_sub_u16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
 ; GFX8-NEXT: s_setpc_b64 s[30:31]
@@ -2865,10 +2854,10 @@
 ; GFX6-NEXT: v_min_i32_e32 v1, s2, v1
 ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s0, v1
 ; GFX6-NEXT: v_ashrrev_i32_e32 v1, 16, v1
-; GFX6-NEXT: s_mov_b32 s0, 0xffff
+; GFX6-NEXT: v_mov_b32_e32 v2, 0xffff
 ; GFX6-NEXT: v_ashrrev_i32_e32 v0, 16, v0
-; GFX6-NEXT: v_and_b32_e32 v1, s0, v1
-; GFX6-NEXT: v_and_b32_e32 v0, s0, v0
+; GFX6-NEXT: v_and_b32_e32 v1, v1, v2
+; GFX6-NEXT: v_and_b32_e32 v0, v0, v2
 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
 ; GFX6-NEXT: ; return to shader part epilog
@@ -2918,54 +2907,54 @@
 ; GFX6-LABEL: ssubsat_v2i16_vs:
 ; GFX6: ; %bb.0:
 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX6-NEXT: s_brev_b32 s2, -2
-; GFX6-NEXT: v_max_i32_e32 v2, -1, v0
+; GFX6-NEXT: v_bfrev_b32_e32 v2, -2
+; GFX6-NEXT: v_max_i32_e32 v4, -1, v0
 ; GFX6-NEXT: s_lshl_b32 s0, s0, 16
-; GFX6-NEXT: s_brev_b32 s3, 1
-; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s2, v2
-; GFX6-NEXT: v_min_i32_e32 v3, -1, v0
-; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s3, v3
-; GFX6-NEXT: v_max_i32_e32 v2, s0, v2
-; GFX6-NEXT: v_min_i32_e32 v2, v2, v3
+; GFX6-NEXT: v_bfrev_b32_e32 v3, 1
+; GFX6-NEXT: v_sub_i32_e32 v4, vcc, v4, v2
+; GFX6-NEXT: v_min_i32_e32 v5, -1, v0
+; GFX6-NEXT: v_sub_i32_e32 v5, vcc, v5, v3
+; GFX6-NEXT: v_max_i32_e32 v4, s0, v4
+; GFX6-NEXT: v_min_i32_e32 v4, v4, v5
 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v2
-; GFX6-NEXT: v_max_i32_e32 v2, -1, v1
+; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v4
+; GFX6-NEXT: v_max_i32_e32 v4, -1, v1
 ; GFX6-NEXT: s_lshl_b32 s0, s1, 16
-; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s2, v2
-; GFX6-NEXT: v_min_i32_e32 v3, -1, v1
-; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s3, v3
+; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v4, v2
+; GFX6-NEXT: v_min_i32_e32 v4, -1, v1
+; GFX6-NEXT: v_sub_i32_e32 v3, vcc, v4, v3
 ; GFX6-NEXT: v_max_i32_e32 v2, s0, v2
 ; GFX6-NEXT: v_min_i32_e32 v2, v2, v3
 ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v2
 ; GFX6-NEXT: v_ashrrev_i32_e32 v1, 16, v1
-; GFX6-NEXT: s_mov_b32 s0, 0xffff
+; GFX6-NEXT: v_mov_b32_e32 v2, 0xffff
 ; GFX6-NEXT: v_ashrrev_i32_e32 v0, 16, v0
-; GFX6-NEXT: v_and_b32_e32 v1, s0, v1
-; GFX6-NEXT: v_and_b32_e32 v0, s0, v0
+; GFX6-NEXT: v_and_b32_e32 v1, v1, v2
+; GFX6-NEXT: v_and_b32_e32 v0, v0, v2
 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
 ; GFX6-NEXT: ; return to shader part epilog
 ;
 ; GFX8-LABEL: ssubsat_v2i16_vs:
 ; GFX8: ; %bb.0:
-; GFX8-NEXT: s_movk_i32 s2, 0x7fff
-; GFX8-NEXT: v_max_i16_e32 v2, -1, v0
-; GFX8-NEXT: s_movk_i32 s3, 0x8000
-; GFX8-NEXT: v_subrev_u16_e32 v2, s2, v2
-; GFX8-NEXT: v_min_i16_e32 v3, -1, v0
+; GFX8-NEXT: v_mov_b32_e32 v2, 0x7fff
+; GFX8-NEXT: v_max_i16_e32 v4, -1, v0
+; GFX8-NEXT: v_mov_b32_e32 v3, 0xffff8000
+; GFX8-NEXT: v_sub_u16_e32 v4, v4, v2
+; GFX8-NEXT: v_min_i16_e32 v5, -1, v0
 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; GFX8-NEXT: v_subrev_u16_e32 v3, s3, v3
-; GFX8-NEXT: v_max_i16_e32 v2, s0, v2
-; GFX8-NEXT: v_min_i16_e32 v2, v2, v3
-; GFX8-NEXT: v_max_i16_e32 v3, -1, v1
+; GFX8-NEXT: v_sub_u16_e32 v5, v5, v3
+; GFX8-NEXT: v_max_i16_e32 v4, s0, v4
+; GFX8-NEXT: v_min_i16_e32 v4, v4, v5
+; GFX8-NEXT: v_max_i16_e32 v5, -1, v1
 ; GFX8-NEXT: s_lshr_b32 s1, s0, 16
-; GFX8-NEXT: v_subrev_u16_e32 v3, s2, v3
-; GFX8-NEXT: v_min_i16_e32 v4, -1, v1
-; GFX8-NEXT: v_subrev_u16_e32 v4, s3, v4
-; GFX8-NEXT: v_max_i16_e32 v3, s1, v3
-; GFX8-NEXT: v_min_i16_e32 v3, v3, v4
-; GFX8-NEXT: v_sub_u16_e32 v0, v0, v2
-; GFX8-NEXT: v_sub_u16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_sub_u16_e32 v2, v5, v2
+; GFX8-NEXT: v_min_i16_e32 v5, -1, v1
+; GFX8-NEXT: v_sub_u16_e32 v3, v5, v3
+; GFX8-NEXT: v_max_i16_e32 v2, s1, v2
+; GFX8-NEXT: v_min_i16_e32 v2, v2, v3
+; GFX8-NEXT: v_sub_u16_e32 v0, v0, v4
+; GFX8-NEXT: v_sub_u16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
 ; GFX8-NEXT: ; return to shader part epilog
 ;
@@ -2999,56 +2988,54 @@
 ; GFX6: ; %bb.0:
 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX6-NEXT: s_brev_b32 s4, -2
-; GFX6-NEXT: v_max_i32_e32 v8, -1, v0
+; GFX6-NEXT: v_bfrev_b32_e32 v8, -2
+; GFX6-NEXT: v_max_i32_e32 v10, -1, v0
 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; GFX6-NEXT: s_brev_b32 s5, 1
-; GFX6-NEXT: v_subrev_i32_e32 v8, vcc, s4, v8
-; GFX6-NEXT: v_min_i32_e32 v10, -1, v0
-; GFX6-NEXT: v_subrev_i32_e32 v10, vcc, s5, v10
-; GFX6-NEXT: v_max_i32_e32 v4, v8, v4
-; GFX6-NEXT: v_min_i32_e32 v4, v4, v10
+; GFX6-NEXT: v_bfrev_b32_e32 v9, 1
+; GFX6-NEXT: v_sub_i32_e32 v10, vcc, v10, v8
+; GFX6-NEXT: v_min_i32_e32 v11, -1, v0
+; GFX6-NEXT: v_sub_i32_e32 v11, vcc, v11, v9
+; GFX6-NEXT: v_max_i32_e32 v4, v10, v4
+; GFX6-NEXT: v_min_i32_e32 v4, v4, v11
 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v4
 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v5
 ; GFX6-NEXT: v_max_i32_e32 v5, -1, v1
-; GFX6-NEXT: v_subrev_i32_e32 v5, vcc, s4, v5
-; GFX6-NEXT: v_min_i32_e32 v8, -1, v1
-; GFX6-NEXT: v_subrev_i32_e32 v8, vcc, s5, v8
+; GFX6-NEXT: v_sub_i32_e32 v5, vcc, v5, v8
+; GFX6-NEXT: v_min_i32_e32 v10, -1, v1
+; GFX6-NEXT: v_sub_i32_e32 v10, vcc, v10, v9
 ; GFX6-NEXT: v_max_i32_e32 v4, v5, v4
 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX6-NEXT: v_bfrev_b32_e32 v9, -2
-; GFX6-NEXT: v_min_i32_e32 v4, v4, v8
+; GFX6-NEXT: v_min_i32_e32 v4, v4, v10
 ; GFX6-NEXT: v_max_i32_e32 v5, -1, v2
 ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v4
 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v6
-; GFX6-NEXT: v_sub_i32_e32 v5, vcc, v5, v9
+; GFX6-NEXT: v_sub_i32_e32 v5, vcc, v5, v8
 ; GFX6-NEXT: v_min_i32_e32 v6, -1, v2
-; GFX6-NEXT: v_subrev_i32_e32 v6, vcc, s5, v6
+; GFX6-NEXT: v_sub_i32_e32 v6, vcc, v6, v9
 ; GFX6-NEXT: v_max_i32_e32 v4, v5, v4
 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3
 ; GFX6-NEXT: v_min_i32_e32 v4, v4, v6
 ; GFX6-NEXT: v_max_i32_e32 v5, -1, v3
-; GFX6-NEXT: v_bfrev_b32_e32 v11, 1
 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v4
 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v7
-; GFX6-NEXT: v_sub_i32_e32 v5, vcc, v5, v9
+; GFX6-NEXT: v_sub_i32_e32 v5, vcc, v5, v8
 ; GFX6-NEXT: v_min_i32_e32 v6, -1, v3
-; GFX6-NEXT: v_sub_i32_e32 v6, vcc, v6, v11
+; GFX6-NEXT: v_sub_i32_e32 v6, vcc, v6, v9
 ; GFX6-NEXT: v_max_i32_e32 v4, v5, v4
-; GFX6-NEXT: v_ashrrev_i32_e32 v1, 16, v1
 ; GFX6-NEXT: v_min_i32_e32 v4, v4, v6
-; GFX6-NEXT: s_mov_b32 s4, 0xffff
-; GFX6-NEXT: v_ashrrev_i32_e32 v0, 16, v0
+; GFX6-NEXT: v_ashrrev_i32_e32 v1, 16, v1
 ; GFX6-NEXT: v_sub_i32_e32 v3, vcc, v3, v4
-; GFX6-NEXT: v_and_b32_e32 v1, s4, v1
+; GFX6-NEXT: v_mov_b32_e32 v4, 0xffff
+; GFX6-NEXT: v_ashrrev_i32_e32 v0, 16, v0
+; GFX6-NEXT: v_and_b32_e32 v1, v1, v4
 ; GFX6-NEXT: v_ashrrev_i32_e32 v2, 16, v2
 ; GFX6-NEXT: v_ashrrev_i32_e32 v3, 16, v3
-; GFX6-NEXT: v_and_b32_e32 v0, s4, v0
+; GFX6-NEXT: v_and_b32_e32 v0, v0, v4
 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX6-NEXT: v_and_b32_e32 v1, s4, v2
-; GFX6-NEXT: v_and_b32_e32 v2, s4, v3
+; GFX6-NEXT: v_and_b32_e32 v1, v2, v4
+; GFX6-NEXT: v_and_b32_e32 v2, v3, v4
 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2
 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v2
 ; GFX6-NEXT: s_setpc_b64 s[30:31]
@@ -3056,38 +3043,38 @@
 ; GFX8-LABEL: v_ssubsat_v4i16:
 ; GFX8: ; %bb.0:
 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: s_movk_i32 s4, 0x7fff
-; GFX8-NEXT: v_max_i16_e32 v6, -1, v0
-; GFX8-NEXT: s_movk_i32 s5, 0x8000
-; GFX8-NEXT: v_subrev_u16_e32 v6, s4, v6
-; GFX8-NEXT: v_min_i16_e32 v7, -1, v0
+; GFX8-NEXT: v_mov_b32_e32 v6, 0x7fff
+; GFX8-NEXT: v_max_i16_e32 v8, -1, v0
+; GFX8-NEXT: v_mov_b32_e32 v7, 0xffff8000
+; GFX8-NEXT: v_sub_u16_e32 v8, v8, v6
+; GFX8-NEXT: v_min_i16_e32 v9, -1, v0
 ; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v0
-; GFX8-NEXT: v_subrev_u16_e32 v7, s5, v7
-; GFX8-NEXT: v_max_i16_e32 v6, v6, v2
-; GFX8-NEXT: v_min_i16_e32 v6, v6, v7
-; GFX8-NEXT: v_max_i16_e32 v7, -1, v4
-; GFX8-NEXT: v_subrev_u16_e32 v7, s4, v7
-; GFX8-NEXT: v_min_i16_e32 v8, -1, v4
-; GFX8-NEXT: v_subrev_u16_e32 v8, s5, v8
-; GFX8-NEXT: v_max_i16_sdwa v2, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-NEXT: v_max_i16_e32 v7, -1, v1
-; GFX8-NEXT: v_min_i16_e32 v2, v2, v8
-; GFX8-NEXT: v_subrev_u16_e32 v7, s4, v7
-; GFX8-NEXT: v_min_i16_e32 v8, -1, v1
+; GFX8-NEXT: v_sub_u16_e32 v9, v9, v7
+; GFX8-NEXT: v_max_i16_e32 v8, v8, v2
+; GFX8-NEXT: v_min_i16_e32 v8, v8, v9
+; GFX8-NEXT: v_max_i16_e32 v9, -1, v4
+; GFX8-NEXT: v_sub_u16_e32 v9, v9, v6
+; GFX8-NEXT: v_min_i16_e32 v10, -1, v4
+; GFX8-NEXT: v_sub_u16_e32 v10, v10, v7
+; GFX8-NEXT: v_max_i16_sdwa v2, v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT: v_max_i16_e32 v9, -1, v1
+; GFX8-NEXT: v_min_i16_e32 v2, v2, v10
+; GFX8-NEXT: v_sub_u16_e32 v9, v9, v6
+; GFX8-NEXT: v_min_i16_e32 v10, -1, v1
 ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v1
-; GFX8-NEXT: v_subrev_u16_e32 v8, s5, v8
-; GFX8-NEXT: v_max_i16_e32 v7, v7, v3
-; GFX8-NEXT: v_min_i16_e32 v7, v7, v8
-; GFX8-NEXT: v_max_i16_e32 v8, -1, v5
-; GFX8-NEXT: v_subrev_u16_e32 v8, s4, v8
-; GFX8-NEXT: v_min_i16_e32 v9, -1, v5
-;
GFX8-NEXT: v_subrev_u16_e32 v9, s5, v9 -; GFX8-NEXT: v_max_i16_sdwa v3, v8, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_min_i16_e32 v3, v3, v9 -; GFX8-NEXT: v_sub_u16_e32 v0, v0, v6 +; GFX8-NEXT: v_sub_u16_e32 v10, v10, v7 +; GFX8-NEXT: v_max_i16_e32 v9, v9, v3 +; GFX8-NEXT: v_min_i16_e32 v9, v9, v10 +; GFX8-NEXT: v_max_i16_e32 v10, -1, v5 +; GFX8-NEXT: v_sub_u16_e32 v6, v10, v6 +; GFX8-NEXT: v_min_i16_e32 v10, -1, v5 +; GFX8-NEXT: v_sub_u16_e32 v7, v10, v7 +; GFX8-NEXT: v_max_i16_sdwa v3, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_min_i16_e32 v3, v3, v7 +; GFX8-NEXT: v_sub_u16_e32 v0, v0, v8 ; GFX8-NEXT: v_sub_u16_sdwa v2, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX8-NEXT: v_sub_u16_e32 v1, v1, v7 +; GFX8-NEXT: v_sub_u16_e32 v1, v1, v9 ; GFX8-NEXT: v_sub_u16_sdwa v2, v5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -3272,80 +3259,78 @@ ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: s_brev_b32 s4, -2 -; GFX6-NEXT: v_max_i32_e32 v12, -1, v0 +; GFX6-NEXT: v_bfrev_b32_e32 v12, -2 +; GFX6-NEXT: v_max_i32_e32 v14, -1, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX6-NEXT: s_brev_b32 s5, 1 -; GFX6-NEXT: v_subrev_i32_e32 v12, vcc, s4, v12 -; GFX6-NEXT: v_min_i32_e32 v14, -1, v0 -; GFX6-NEXT: v_subrev_i32_e32 v14, vcc, s5, v14 -; GFX6-NEXT: v_max_i32_e32 v6, v12, v6 -; GFX6-NEXT: v_min_i32_e32 v6, v6, v14 +; GFX6-NEXT: v_bfrev_b32_e32 v13, 1 +; GFX6-NEXT: v_sub_i32_e32 v14, vcc, v14, v12 +; GFX6-NEXT: v_min_i32_e32 v15, -1, v0 +; GFX6-NEXT: v_sub_i32_e32 v15, vcc, v15, v13 +; GFX6-NEXT: v_max_i32_e32 v6, v14, v6 +; GFX6-NEXT: v_min_i32_e32 v6, v6, v15 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v6 ; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v7 ; GFX6-NEXT: v_max_i32_e32 v7, -1, v1 -; GFX6-NEXT: v_subrev_i32_e32 v7, vcc, s4, v7 -; GFX6-NEXT: v_min_i32_e32 v12, -1, v1 -; GFX6-NEXT: v_subrev_i32_e32 v12, vcc, s5, v12 +; GFX6-NEXT: v_sub_i32_e32 v7, vcc, v7, v12 +; GFX6-NEXT: v_min_i32_e32 v14, -1, v1 +; GFX6-NEXT: v_sub_i32_e32 v14, vcc, v14, v13 ; GFX6-NEXT: v_max_i32_e32 v6, v7, v6 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_bfrev_b32_e32 v13, -2 -; GFX6-NEXT: v_min_i32_e32 v6, v6, v12 +; GFX6-NEXT: v_min_i32_e32 v6, v6, v14 ; GFX6-NEXT: v_max_i32_e32 v7, -1, v2 ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v6 ; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v8 -; GFX6-NEXT: v_sub_i32_e32 v7, vcc, v7, v13 +; GFX6-NEXT: v_sub_i32_e32 v7, vcc, v7, v12 ; GFX6-NEXT: v_min_i32_e32 v8, -1, v2 -; GFX6-NEXT: v_subrev_i32_e32 v8, vcc, s5, v8 +; GFX6-NEXT: v_sub_i32_e32 v8, vcc, v8, v13 ; GFX6-NEXT: v_max_i32_e32 v6, v7, v6 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX6-NEXT: v_min_i32_e32 v6, v6, v8 ; GFX6-NEXT: v_max_i32_e32 v7, -1, v3 -; GFX6-NEXT: v_bfrev_b32_e32 v15, 1 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v6 ; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v9 -; GFX6-NEXT: v_sub_i32_e32 v7, vcc, v7, v13 +; GFX6-NEXT: v_sub_i32_e32 v7, vcc, v7, v12 ; GFX6-NEXT: v_min_i32_e32 v8, -1, v3 -; GFX6-NEXT: v_sub_i32_e32 v8, vcc, v8, v15 +; GFX6-NEXT: v_sub_i32_e32 v8, vcc, v8, v13 ; GFX6-NEXT: v_max_i32_e32 v6, v7, v6 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX6-NEXT: v_min_i32_e32 v6, v6, v8 ; GFX6-NEXT: v_max_i32_e32 v7, -1, v4 ; GFX6-NEXT: 
v_sub_i32_e32 v3, vcc, v3, v6 ; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v10 -; GFX6-NEXT: v_sub_i32_e32 v7, vcc, v7, v13 +; GFX6-NEXT: v_sub_i32_e32 v7, vcc, v7, v12 ; GFX6-NEXT: v_min_i32_e32 v8, -1, v4 -; GFX6-NEXT: v_sub_i32_e32 v8, vcc, v8, v15 +; GFX6-NEXT: v_sub_i32_e32 v8, vcc, v8, v13 ; GFX6-NEXT: v_max_i32_e32 v6, v7, v6 ; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; GFX6-NEXT: v_min_i32_e32 v6, v6, v8 ; GFX6-NEXT: v_max_i32_e32 v7, -1, v5 ; GFX6-NEXT: v_sub_i32_e32 v4, vcc, v4, v6 ; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v11 -; GFX6-NEXT: v_sub_i32_e32 v7, vcc, v7, v13 +; GFX6-NEXT: v_sub_i32_e32 v7, vcc, v7, v12 ; GFX6-NEXT: v_min_i32_e32 v8, -1, v5 -; GFX6-NEXT: v_ashrrev_i32_e32 v1, 16, v1 -; GFX6-NEXT: v_sub_i32_e32 v8, vcc, v8, v15 +; GFX6-NEXT: v_sub_i32_e32 v8, vcc, v8, v13 ; GFX6-NEXT: v_max_i32_e32 v6, v7, v6 -; GFX6-NEXT: s_mov_b32 s4, 0xffff -; GFX6-NEXT: v_ashrrev_i32_e32 v0, 16, v0 ; GFX6-NEXT: v_min_i32_e32 v6, v6, v8 -; GFX6-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX6-NEXT: v_ashrrev_i32_e32 v1, 16, v1 +; GFX6-NEXT: v_sub_i32_e32 v5, vcc, v5, v6 +; GFX6-NEXT: v_mov_b32_e32 v6, 0xffff +; GFX6-NEXT: v_ashrrev_i32_e32 v0, 16, v0 +; GFX6-NEXT: v_and_b32_e32 v1, v1, v6 ; GFX6-NEXT: v_ashrrev_i32_e32 v2, 16, v2 ; GFX6-NEXT: v_ashrrev_i32_e32 v3, 16, v3 -; GFX6-NEXT: v_sub_i32_e32 v5, vcc, v5, v6 -; GFX6-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX6-NEXT: v_and_b32_e32 v0, v0, v6 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_ashrrev_i32_e32 v5, 16, v5 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX6-NEXT: v_and_b32_e32 v1, s4, v2 -; GFX6-NEXT: v_and_b32_e32 v2, s4, v3 +; GFX6-NEXT: v_and_b32_e32 v1, v2, v6 +; GFX6-NEXT: v_and_b32_e32 v2, v3, v6 ; GFX6-NEXT: v_ashrrev_i32_e32 v4, 16, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_and_b32_e32 v3, s4, v5 +; GFX6-NEXT: v_and_b32_e32 v3, v5, v6 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX6-NEXT: v_and_b32_e32 v2, s4, v4 +; GFX6-NEXT: v_and_b32_e32 v2, v4, v6 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX6-NEXT: v_or_b32_e32 v2, v2, v3 ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -3353,53 +3338,51 @@ ; GFX8-LABEL: v_ssubsat_v6i16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_movk_i32 s4, 0x7fff -; GFX8-NEXT: v_max_i16_e32 v9, -1, v0 -; GFX8-NEXT: s_movk_i32 s5, 0x8000 -; GFX8-NEXT: v_subrev_u16_e32 v9, s4, v9 -; GFX8-NEXT: v_min_i16_e32 v11, -1, v0 +; GFX8-NEXT: v_mov_b32_e32 v9, 0x7fff +; GFX8-NEXT: v_max_i16_e32 v11, -1, v0 +; GFX8-NEXT: v_mov_b32_e32 v10, 0xffff8000 +; GFX8-NEXT: v_sub_u16_e32 v11, v11, v9 +; GFX8-NEXT: v_min_i16_e32 v12, -1, v0 ; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v0 -; GFX8-NEXT: v_subrev_u16_e32 v11, s5, v11 -; GFX8-NEXT: v_max_i16_e32 v9, v9, v3 -; GFX8-NEXT: v_min_i16_e32 v9, v9, v11 -; GFX8-NEXT: v_max_i16_e32 v11, -1, v6 -; GFX8-NEXT: v_subrev_u16_e32 v11, s4, v11 +; GFX8-NEXT: v_sub_u16_e32 v12, v12, v10 +; GFX8-NEXT: v_max_i16_e32 v11, v11, v3 +; GFX8-NEXT: v_min_i16_e32 v11, v11, v12 +; GFX8-NEXT: v_max_i16_e32 v12, -1, v6 +; GFX8-NEXT: v_sub_u16_e32 v12, v12, v9 ; GFX8-NEXT: v_min_i16_e32 v13, -1, v6 -; GFX8-NEXT: v_subrev_u16_e32 v13, s5, v13 -; GFX8-NEXT: v_max_i16_sdwa v3, v11, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_max_i16_e32 v11, -1, v1 +; GFX8-NEXT: v_sub_u16_e32 v13, v13, v10 +; GFX8-NEXT: v_max_i16_sdwa v3, v12, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_max_i16_e32 v12, -1, v1 ; GFX8-NEXT: v_min_i16_e32 v3, v3, v13 -; GFX8-NEXT: v_subrev_u16_e32 v11, s4, v11 +; 
GFX8-NEXT: v_sub_u16_e32 v12, v12, v9 ; GFX8-NEXT: v_min_i16_e32 v13, -1, v1 ; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v1 -; GFX8-NEXT: v_subrev_u16_e32 v13, s5, v13 -; GFX8-NEXT: v_max_i16_e32 v11, v11, v4 -; GFX8-NEXT: v_min_i16_e32 v11, v11, v13 +; GFX8-NEXT: v_sub_u16_e32 v13, v13, v10 +; GFX8-NEXT: v_max_i16_e32 v12, v12, v4 +; GFX8-NEXT: v_min_i16_e32 v12, v12, v13 ; GFX8-NEXT: v_max_i16_e32 v13, -1, v7 -; GFX8-NEXT: v_subrev_u16_e32 v13, s4, v13 +; GFX8-NEXT: v_sub_u16_e32 v13, v13, v9 ; GFX8-NEXT: v_min_i16_e32 v14, -1, v7 -; GFX8-NEXT: v_mov_b32_e32 v10, 0x7fff -; GFX8-NEXT: v_subrev_u16_e32 v14, s5, v14 +; GFX8-NEXT: v_sub_u16_e32 v14, v14, v10 ; GFX8-NEXT: v_max_i16_sdwa v4, v13, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_max_i16_e32 v13, -1, v2 -; GFX8-NEXT: v_mov_b32_e32 v12, 0xffff8000 ; GFX8-NEXT: v_min_i16_e32 v4, v4, v14 -; GFX8-NEXT: v_sub_u16_e32 v13, v13, v10 +; GFX8-NEXT: v_sub_u16_e32 v13, v13, v9 ; GFX8-NEXT: v_min_i16_e32 v14, -1, v2 ; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v2 -; GFX8-NEXT: v_sub_u16_e32 v14, v14, v12 +; GFX8-NEXT: v_sub_u16_e32 v14, v14, v10 ; GFX8-NEXT: v_max_i16_e32 v13, v13, v5 ; GFX8-NEXT: v_min_i16_e32 v13, v13, v14 ; GFX8-NEXT: v_max_i16_e32 v14, -1, v8 -; GFX8-NEXT: v_sub_u16_e32 v10, v14, v10 +; GFX8-NEXT: v_sub_u16_e32 v9, v14, v9 ; GFX8-NEXT: v_min_i16_e32 v14, -1, v8 -; GFX8-NEXT: v_sub_u16_e32 v12, v14, v12 -; GFX8-NEXT: v_max_i16_sdwa v5, v10, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_sub_u16_e32 v0, v0, v9 +; GFX8-NEXT: v_sub_u16_e32 v10, v14, v10 +; GFX8-NEXT: v_max_i16_sdwa v5, v9, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_sub_u16_e32 v0, v0, v11 ; GFX8-NEXT: v_sub_u16_sdwa v3, v6, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_min_i16_e32 v5, v5, v12 +; GFX8-NEXT: v_min_i16_e32 v5, v5, v10 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v3 -; GFX8-NEXT: v_sub_u16_e32 v1, v1, v11 +; GFX8-NEXT: v_sub_u16_e32 v1, v1, v12 ; GFX8-NEXT: v_sub_u16_sdwa v3, v7, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-NEXT: v_or_b32_e32 v1, v1, v3 ; GFX8-NEXT: v_sub_u16_e32 v2, v2, v13 @@ -3637,104 +3620,102 @@ ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: s_brev_b32 s4, -2 -; GFX6-NEXT: v_max_i32_e32 v16, -1, v0 +; GFX6-NEXT: v_bfrev_b32_e32 v16, -2 +; GFX6-NEXT: v_max_i32_e32 v18, -1, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GFX6-NEXT: s_brev_b32 s5, 1 -; GFX6-NEXT: v_subrev_i32_e32 v16, vcc, s4, v16 -; GFX6-NEXT: v_min_i32_e32 v18, -1, v0 -; GFX6-NEXT: v_subrev_i32_e32 v18, vcc, s5, v18 -; GFX6-NEXT: v_max_i32_e32 v8, v16, v8 -; GFX6-NEXT: v_min_i32_e32 v8, v8, v18 +; GFX6-NEXT: v_bfrev_b32_e32 v17, 1 +; GFX6-NEXT: v_sub_i32_e32 v18, vcc, v18, v16 +; GFX6-NEXT: v_min_i32_e32 v19, -1, v0 +; GFX6-NEXT: v_sub_i32_e32 v19, vcc, v19, v17 +; GFX6-NEXT: v_max_i32_e32 v8, v18, v8 +; GFX6-NEXT: v_min_i32_e32 v8, v8, v19 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v8 ; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v9 ; GFX6-NEXT: v_max_i32_e32 v9, -1, v1 -; GFX6-NEXT: v_subrev_i32_e32 v9, vcc, s4, v9 -; GFX6-NEXT: v_min_i32_e32 v16, -1, v1 -; GFX6-NEXT: v_subrev_i32_e32 v16, vcc, s5, v16 +; GFX6-NEXT: v_sub_i32_e32 v9, vcc, v9, v16 +; GFX6-NEXT: v_min_i32_e32 v18, -1, v1 +; GFX6-NEXT: v_sub_i32_e32 v18, vcc, v18, v17 ; GFX6-NEXT: v_max_i32_e32 v8, v9, v8 ; GFX6-NEXT: 
v_lshlrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_bfrev_b32_e32 v17, -2 -; GFX6-NEXT: v_min_i32_e32 v8, v8, v16 +; GFX6-NEXT: v_min_i32_e32 v8, v8, v18 ; GFX6-NEXT: v_max_i32_e32 v9, -1, v2 ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v8 ; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v10 -; GFX6-NEXT: v_sub_i32_e32 v9, vcc, v9, v17 +; GFX6-NEXT: v_sub_i32_e32 v9, vcc, v9, v16 ; GFX6-NEXT: v_min_i32_e32 v10, -1, v2 -; GFX6-NEXT: v_subrev_i32_e32 v10, vcc, s5, v10 +; GFX6-NEXT: v_sub_i32_e32 v10, vcc, v10, v17 ; GFX6-NEXT: v_max_i32_e32 v8, v9, v8 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX6-NEXT: v_min_i32_e32 v8, v8, v10 ; GFX6-NEXT: v_max_i32_e32 v9, -1, v3 -; GFX6-NEXT: v_bfrev_b32_e32 v19, 1 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v8 ; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v11 -; GFX6-NEXT: v_sub_i32_e32 v9, vcc, v9, v17 +; GFX6-NEXT: v_sub_i32_e32 v9, vcc, v9, v16 ; GFX6-NEXT: v_min_i32_e32 v10, -1, v3 -; GFX6-NEXT: v_sub_i32_e32 v10, vcc, v10, v19 +; GFX6-NEXT: v_sub_i32_e32 v10, vcc, v10, v17 ; GFX6-NEXT: v_max_i32_e32 v8, v9, v8 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX6-NEXT: v_min_i32_e32 v8, v8, v10 ; GFX6-NEXT: v_max_i32_e32 v9, -1, v4 ; GFX6-NEXT: v_sub_i32_e32 v3, vcc, v3, v8 ; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v12 -; GFX6-NEXT: v_sub_i32_e32 v9, vcc, v9, v17 +; GFX6-NEXT: v_sub_i32_e32 v9, vcc, v9, v16 ; GFX6-NEXT: v_min_i32_e32 v10, -1, v4 -; GFX6-NEXT: v_sub_i32_e32 v10, vcc, v10, v19 +; GFX6-NEXT: v_sub_i32_e32 v10, vcc, v10, v17 ; GFX6-NEXT: v_max_i32_e32 v8, v9, v8 ; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; GFX6-NEXT: v_min_i32_e32 v8, v8, v10 ; GFX6-NEXT: v_max_i32_e32 v9, -1, v5 ; GFX6-NEXT: v_sub_i32_e32 v4, vcc, v4, v8 ; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v13 -; GFX6-NEXT: v_sub_i32_e32 v9, vcc, v9, v17 +; GFX6-NEXT: v_sub_i32_e32 v9, vcc, v9, v16 ; GFX6-NEXT: v_min_i32_e32 v10, -1, v5 -; GFX6-NEXT: v_sub_i32_e32 v10, vcc, v10, v19 +; GFX6-NEXT: v_sub_i32_e32 v10, vcc, v10, v17 ; GFX6-NEXT: v_max_i32_e32 v8, v9, v8 ; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; GFX6-NEXT: v_min_i32_e32 v8, v8, v10 ; GFX6-NEXT: v_max_i32_e32 v9, -1, v6 ; GFX6-NEXT: v_sub_i32_e32 v5, vcc, v5, v8 ; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v14 -; GFX6-NEXT: v_sub_i32_e32 v9, vcc, v9, v17 +; GFX6-NEXT: v_sub_i32_e32 v9, vcc, v9, v16 ; GFX6-NEXT: v_min_i32_e32 v10, -1, v6 -; GFX6-NEXT: v_sub_i32_e32 v10, vcc, v10, v19 +; GFX6-NEXT: v_sub_i32_e32 v10, vcc, v10, v17 ; GFX6-NEXT: v_max_i32_e32 v8, v9, v8 ; GFX6-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; GFX6-NEXT: v_min_i32_e32 v8, v8, v10 ; GFX6-NEXT: v_max_i32_e32 v9, -1, v7 -; GFX6-NEXT: v_ashrrev_i32_e32 v1, 16, v1 ; GFX6-NEXT: v_sub_i32_e32 v6, vcc, v6, v8 ; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v15 -; GFX6-NEXT: v_sub_i32_e32 v9, vcc, v9, v17 +; GFX6-NEXT: v_sub_i32_e32 v9, vcc, v9, v16 ; GFX6-NEXT: v_min_i32_e32 v10, -1, v7 -; GFX6-NEXT: s_mov_b32 s4, 0xffff -; GFX6-NEXT: v_ashrrev_i32_e32 v0, 16, v0 -; GFX6-NEXT: v_sub_i32_e32 v10, vcc, v10, v19 +; GFX6-NEXT: v_sub_i32_e32 v10, vcc, v10, v17 ; GFX6-NEXT: v_max_i32_e32 v8, v9, v8 -; GFX6-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX6-NEXT: v_min_i32_e32 v8, v8, v10 +; GFX6-NEXT: v_ashrrev_i32_e32 v1, 16, v1 +; GFX6-NEXT: v_sub_i32_e32 v7, vcc, v7, v8 +; GFX6-NEXT: v_mov_b32_e32 v8, 0xffff +; GFX6-NEXT: v_ashrrev_i32_e32 v0, 16, v0 +; GFX6-NEXT: v_and_b32_e32 v1, v1, v8 ; GFX6-NEXT: v_ashrrev_i32_e32 v2, 16, v2 ; GFX6-NEXT: v_ashrrev_i32_e32 v3, 16, v3 -; GFX6-NEXT: v_min_i32_e32 v8, v8, v10 -; GFX6-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX6-NEXT: v_and_b32_e32 v0, v0, v8 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; 
GFX6-NEXT: v_ashrrev_i32_e32 v5, 16, v5 -; GFX6-NEXT: v_sub_i32_e32 v7, vcc, v7, v8 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX6-NEXT: v_and_b32_e32 v1, s4, v2 -; GFX6-NEXT: v_and_b32_e32 v2, s4, v3 +; GFX6-NEXT: v_and_b32_e32 v1, v2, v8 +; GFX6-NEXT: v_and_b32_e32 v2, v3, v8 ; GFX6-NEXT: v_ashrrev_i32_e32 v4, 16, v4 ; GFX6-NEXT: v_ashrrev_i32_e32 v7, 16, v7 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_and_b32_e32 v3, s4, v5 +; GFX6-NEXT: v_and_b32_e32 v3, v5, v8 ; GFX6-NEXT: v_ashrrev_i32_e32 v6, 16, v6 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX6-NEXT: v_and_b32_e32 v2, s4, v4 +; GFX6-NEXT: v_and_b32_e32 v2, v4, v8 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_and_b32_e32 v4, s4, v7 +; GFX6-NEXT: v_and_b32_e32 v4, v7, v8 ; GFX6-NEXT: v_or_b32_e32 v2, v2, v3 -; GFX6-NEXT: v_and_b32_e32 v3, s4, v6 +; GFX6-NEXT: v_and_b32_e32 v3, v6, v8 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX6-NEXT: v_or_b32_e32 v3, v3, v4 ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -3742,67 +3723,65 @@ ; GFX8-LABEL: v_ssubsat_v8i16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_movk_i32 s4, 0x7fff -; GFX8-NEXT: v_max_i16_e32 v12, -1, v0 -; GFX8-NEXT: s_movk_i32 s5, 0x8000 -; GFX8-NEXT: v_subrev_u16_e32 v12, s4, v12 -; GFX8-NEXT: v_min_i16_e32 v14, -1, v0 +; GFX8-NEXT: v_mov_b32_e32 v12, 0x7fff +; GFX8-NEXT: v_max_i16_e32 v14, -1, v0 +; GFX8-NEXT: v_mov_b32_e32 v13, 0xffff8000 +; GFX8-NEXT: v_sub_u16_e32 v14, v14, v12 +; GFX8-NEXT: v_min_i16_e32 v15, -1, v0 ; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v0 -; GFX8-NEXT: v_subrev_u16_e32 v14, s5, v14 -; GFX8-NEXT: v_max_i16_e32 v12, v12, v4 -; GFX8-NEXT: v_min_i16_e32 v12, v12, v14 -; GFX8-NEXT: v_max_i16_e32 v14, -1, v8 -; GFX8-NEXT: v_subrev_u16_e32 v14, s4, v14 +; GFX8-NEXT: v_sub_u16_e32 v15, v15, v13 +; GFX8-NEXT: v_max_i16_e32 v14, v14, v4 +; GFX8-NEXT: v_min_i16_e32 v14, v14, v15 +; GFX8-NEXT: v_max_i16_e32 v15, -1, v8 +; GFX8-NEXT: v_sub_u16_e32 v15, v15, v12 ; GFX8-NEXT: v_min_i16_e32 v16, -1, v8 -; GFX8-NEXT: v_subrev_u16_e32 v16, s5, v16 -; GFX8-NEXT: v_max_i16_sdwa v4, v14, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_max_i16_e32 v14, -1, v1 +; GFX8-NEXT: v_sub_u16_e32 v16, v16, v13 +; GFX8-NEXT: v_max_i16_sdwa v4, v15, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_max_i16_e32 v15, -1, v1 ; GFX8-NEXT: v_min_i16_e32 v4, v4, v16 -; GFX8-NEXT: v_subrev_u16_e32 v14, s4, v14 +; GFX8-NEXT: v_sub_u16_e32 v15, v15, v12 ; GFX8-NEXT: v_min_i16_e32 v16, -1, v1 ; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v1 -; GFX8-NEXT: v_subrev_u16_e32 v16, s5, v16 -; GFX8-NEXT: v_max_i16_e32 v14, v14, v5 -; GFX8-NEXT: v_min_i16_e32 v14, v14, v16 +; GFX8-NEXT: v_sub_u16_e32 v16, v16, v13 +; GFX8-NEXT: v_max_i16_e32 v15, v15, v5 +; GFX8-NEXT: v_min_i16_e32 v15, v15, v16 ; GFX8-NEXT: v_max_i16_e32 v16, -1, v9 -; GFX8-NEXT: v_subrev_u16_e32 v16, s4, v16 +; GFX8-NEXT: v_sub_u16_e32 v16, v16, v12 ; GFX8-NEXT: v_min_i16_e32 v17, -1, v9 -; GFX8-NEXT: v_mov_b32_e32 v13, 0x7fff -; GFX8-NEXT: v_subrev_u16_e32 v17, s5, v17 +; GFX8-NEXT: v_sub_u16_e32 v17, v17, v13 ; GFX8-NEXT: v_max_i16_sdwa v5, v16, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_max_i16_e32 v16, -1, v2 -; GFX8-NEXT: v_mov_b32_e32 v15, 0xffff8000 ; GFX8-NEXT: v_min_i16_e32 v5, v5, v17 -; GFX8-NEXT: v_sub_u16_e32 v16, v16, v13 +; GFX8-NEXT: v_sub_u16_e32 v16, v16, v12 ; GFX8-NEXT: v_min_i16_e32 v17, -1, v2 ; GFX8-NEXT: v_lshrrev_b32_e32 v10, 16, v2 -; GFX8-NEXT: 
v_sub_u16_e32 v17, v17, v15 +; GFX8-NEXT: v_sub_u16_e32 v17, v17, v13 ; GFX8-NEXT: v_max_i16_e32 v16, v16, v6 ; GFX8-NEXT: v_min_i16_e32 v16, v16, v17 ; GFX8-NEXT: v_max_i16_e32 v17, -1, v10 -; GFX8-NEXT: v_sub_u16_e32 v17, v17, v13 +; GFX8-NEXT: v_sub_u16_e32 v17, v17, v12 ; GFX8-NEXT: v_min_i16_e32 v18, -1, v10 -; GFX8-NEXT: v_sub_u16_e32 v18, v18, v15 +; GFX8-NEXT: v_sub_u16_e32 v18, v18, v13 ; GFX8-NEXT: v_max_i16_sdwa v6, v17, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_max_i16_e32 v17, -1, v3 ; GFX8-NEXT: v_min_i16_e32 v6, v6, v18 -; GFX8-NEXT: v_sub_u16_e32 v17, v17, v13 +; GFX8-NEXT: v_sub_u16_e32 v17, v17, v12 ; GFX8-NEXT: v_min_i16_e32 v18, -1, v3 ; GFX8-NEXT: v_lshrrev_b32_e32 v11, 16, v3 -; GFX8-NEXT: v_sub_u16_e32 v18, v18, v15 +; GFX8-NEXT: v_sub_u16_e32 v18, v18, v13 ; GFX8-NEXT: v_max_i16_e32 v17, v17, v7 ; GFX8-NEXT: v_min_i16_e32 v17, v17, v18 ; GFX8-NEXT: v_max_i16_e32 v18, -1, v11 -; GFX8-NEXT: v_sub_u16_e32 v13, v18, v13 +; GFX8-NEXT: v_sub_u16_e32 v12, v18, v12 ; GFX8-NEXT: v_min_i16_e32 v18, -1, v11 -; GFX8-NEXT: v_sub_u16_e32 v0, v0, v12 +; GFX8-NEXT: v_sub_u16_e32 v0, v0, v14 ; GFX8-NEXT: v_sub_u16_sdwa v4, v8, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_sub_u16_e32 v15, v18, v15 -; GFX8-NEXT: v_max_i16_sdwa v7, v13, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_sub_u16_e32 v13, v18, v13 +; GFX8-NEXT: v_max_i16_sdwa v7, v12, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v4 -; GFX8-NEXT: v_sub_u16_e32 v1, v1, v14 +; GFX8-NEXT: v_sub_u16_e32 v1, v1, v15 ; GFX8-NEXT: v_sub_u16_sdwa v4, v9, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_min_i16_e32 v7, v7, v15 +; GFX8-NEXT: v_min_i16_e32 v7, v7, v13 ; GFX8-NEXT: v_or_b32_e32 v1, v1, v4 ; GFX8-NEXT: v_sub_u16_e32 v2, v2, v16 ; GFX8-NEXT: v_sub_u16_sdwa v4, v10, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -4524,19 +4503,20 @@ ; GFX10-NEXT: v_sub_co_ci_u32_e32 v9, vcc_lo, v1, v5, vcc_lo ; GFX10-NEXT: v_sub_co_u32 v10, vcc_lo, v2, v6 ; GFX10-NEXT: v_sub_co_ci_u32_e32 v11, vcc_lo, v3, v7, vcc_lo -; GFX10-NEXT: v_ashrrev_i32_e32 v12, 31, v9 ; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[8:9], v[0:1] +; GFX10-NEXT: v_ashrrev_i32_e32 v0, 31, v9 +; GFX10-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX10-NEXT: v_cmp_lt_i64_e64 s4, 0, v[4:5] -; GFX10-NEXT: v_ashrrev_i32_e32 v0, 31, v11 +; GFX10-NEXT: v_ashrrev_i32_e32 v4, 31, v11 ; GFX10-NEXT: v_cmp_lt_i64_e64 s6, 0, v[6:7] -; GFX10-NEXT: v_add_co_u32 v1, s5, v12, 0 -; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s5, 0x80000000, v12, s5 +; GFX10-NEXT: v_add_co_u32 v5, s5, v0, 0 +; GFX10-NEXT: v_add_co_ci_u32_e64 v12, s5, v0, v1, s5 ; GFX10-NEXT: v_cmp_lt_i64_e64 s5, v[10:11], v[2:3] -; GFX10-NEXT: v_add_co_u32 v2, s7, v0, 0 -; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s7, 0x80000000, v0, s7 +; GFX10-NEXT: v_add_co_u32 v2, s7, v4, 0 +; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s7, v4, v1, s7 ; GFX10-NEXT: s_xor_b32 vcc_lo, s4, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v0, v8, v1, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v1, v9, v4, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v0, v8, v5, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v1, v9, v12, vcc_lo ; GFX10-NEXT: s_xor_b32 vcc_lo, s6, s5 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v10, v2, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v3, v11, v3, vcc_lo @@ -5052,20 +5032,20 @@ ; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, 0, v[2:3] ; GFX6-NEXT: v_cndmask_b32_e64 
v1, 0, 1, vcc ; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] +; GFX6-NEXT: v_bfrev_b32_e32 v2, 1 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; GFX6-NEXT: v_ashrrev_i32_e32 v1, 31, v7 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, 0, v1 -; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc +; GFX6-NEXT: v_add_i32_e32 v3, vcc, 0, v1 ; GFX6-NEXT: v_xor_b32_e32 v0, v0, v8 -; GFX6-NEXT: v_bfrev_b32_e32 v8, 1 +; GFX6-NEXT: v_addc_u32_e32 v8, vcc, 0, v1, vcc ; GFX6-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc -; GFX6-NEXT: v_addc_u32_e32 v8, vcc, v1, v8, vcc +; GFX6-NEXT: v_addc_u32_e32 v10, vcc, v1, v2, vcc ; GFX6-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; GFX6-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v1, v5, v8, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v2, v6, v9, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v3, v7, v10, vcc ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: ssubsat_i128_sv: @@ -5088,20 +5068,20 @@ ; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, 0, v[2:3] ; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] +; GFX8-NEXT: v_bfrev_b32_e32 v2, 1 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v7 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0, v1 -; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0, v1 ; GFX8-NEXT: v_xor_b32_e32 v0, v0, v8 -; GFX8-NEXT: v_bfrev_b32_e32 v8, 1 +; GFX8-NEXT: v_addc_u32_e32 v8, vcc, 0, v1, vcc ; GFX8-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc -; GFX8-NEXT: v_addc_u32_e32 v8, vcc, v1, v8, vcc +; GFX8-NEXT: v_addc_u32_e32 v10, vcc, v1, v2, vcc ; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v8, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v2, v6, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v3, v7, v10, vcc ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: ssubsat_i128_sv: @@ -5124,20 +5104,20 @@ ; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, 0, v[2:3] ; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] +; GFX9-NEXT: v_bfrev_b32_e32 v2, 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v7 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 0, v1 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, 0, v1 ; GFX9-NEXT: v_xor_b32_e32 v0, v0, v8 -; GFX9-NEXT: v_bfrev_b32_e32 v8, 1 +; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, 0, v1, vcc ; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v1, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, v1, v8, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v10, vcc, v1, v2, vcc ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v8, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v2, v6, v9, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v10, vcc ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: ssubsat_i128_sv: @@ -5190,7 +5170,6 @@ ; GFX6-NEXT: v_cmp_gt_u64_e64 s[0:1], s[0:1], 0 ; GFX6-NEXT: 
v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, v[6:7], v[2:3] -; GFX6-NEXT: v_bfrev_b32_e32 v8, 1 ; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] ; GFX6-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc @@ -5201,16 +5180,17 @@ ; GFX6-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc ; GFX6-NEXT: v_xor_b32_e32 v0, v1, v0 ; GFX6-NEXT: v_ashrrev_i32_e32 v1, 31, v7 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, 0, v1 -; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc +; GFX6-NEXT: v_add_i32_e32 v3, vcc, 0, v1 +; GFX6-NEXT: v_addc_u32_e32 v8, vcc, 0, v1, vcc +; GFX6-NEXT: v_bfrev_b32_e32 v2, 1 ; GFX6-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc -; GFX6-NEXT: v_addc_u32_e32 v8, vcc, v1, v8, vcc +; GFX6-NEXT: v_addc_u32_e32 v10, vcc, v1, v2, vcc ; GFX6-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; GFX6-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v1, v5, v8, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v2, v6, v9, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v3, v7, v10, vcc ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: ssubsat_i128_vs: @@ -5233,23 +5213,23 @@ ; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] ; GFX8-NEXT: v_cmp_gt_i64_e64 s[0:1], s[2:3], 0 -; GFX8-NEXT: v_bfrev_b32_e32 v8, 1 ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] ; GFX8-NEXT: s_and_b32 s0, 1, s4 ; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc ; GFX8-NEXT: v_xor_b32_e32 v0, v1, v0 ; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v7 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0, v1 -; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0, v1 +; GFX8-NEXT: v_addc_u32_e32 v8, vcc, 0, v1, vcc +; GFX8-NEXT: v_bfrev_b32_e32 v2, 1 ; GFX8-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc -; GFX8-NEXT: v_addc_u32_e32 v8, vcc, v1, v8, vcc +; GFX8-NEXT: v_addc_u32_e32 v10, vcc, v1, v2, vcc ; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v8, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v2, v6, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v3, v7, v10, vcc ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: ssubsat_i128_vs: @@ -5272,23 +5252,23 @@ ; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] ; GFX9-NEXT: v_cmp_gt_i64_e64 s[0:1], s[2:3], 0 -; GFX9-NEXT: v_bfrev_b32_e32 v8, 1 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] ; GFX9-NEXT: s_and_b32 s0, 1, s4 ; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc ; GFX9-NEXT: v_xor_b32_e32 v0, v1, v0 ; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v7 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 0, v1 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, 0, v1 +; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, 0, v1, vcc +; GFX9-NEXT: v_bfrev_b32_e32 v2, 1 ; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v1, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, v1, v8, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v10, vcc, v1, v2, vcc ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc -; 
GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v8, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v2, v6, v9, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v10, vcc ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: ssubsat_i128_vs: @@ -5534,6 +5514,7 @@ ; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, 0, v[10:11] ; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc_lo ; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[18:19], v[2:3] +; GFX10-NEXT: v_bfrev_b32_e32 v3, 1 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo ; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[10:11] ; GFX10-NEXT: v_cndmask_b32_e32 v1, v9, v8, vcc_lo @@ -5549,7 +5530,7 @@ ; GFX10-NEXT: v_cmp_lt_i64_e64 s4, v[10:11], v[6:7] ; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v1, 0 -; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: v_add_co_ci_u32_e32 v20, vcc_lo, 0, v1, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s4 ; GFX10-NEXT: v_cmp_lt_u64_e64 s4, 0, v[12:13] ; GFX10-NEXT: v_ashrrev_i32_e32 v7, 31, v11 @@ -5560,22 +5541,22 @@ ; GFX10-NEXT: v_cndmask_b32_e64 v0, v5, v4, s5 ; GFX10-NEXT: v_cmp_eq_u64_e64 s5, 0, v[14:15] ; GFX10-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo -; GFX10-NEXT: v_add_co_ci_u32_e32 v6, vcc_lo, 0x80000000, v1, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v1, v17, v3, s4 +; GFX10-NEXT: v_add_co_ci_u32_e32 v6, vcc_lo, v1, v3, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v1, v17, v20, s4 ; GFX10-NEXT: v_cndmask_b32_e64 v4, v13, v12, s5 ; GFX10-NEXT: v_xor_b32_e32 v4, v4, v0 ; GFX10-NEXT: v_cndmask_b32_e64 v0, v16, v2, s4 ; GFX10-NEXT: v_cndmask_b32_e64 v2, v18, v5, s4 -; GFX10-NEXT: v_and_b32_e32 v3, 1, v4 -; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, v7, 0 -; GFX10-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v7, vcc_lo +; GFX10-NEXT: v_add_co_u32 v5, vcc_lo, v7, 0 +; GFX10-NEXT: v_and_b32_e32 v4, 1, v4 ; GFX10-NEXT: v_add_co_ci_u32_e32 v12, vcc_lo, 0, v7, vcc_lo -; GFX10-NEXT: v_cmp_ne_u32_e64 s5, 0, v3 -; GFX10-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, 0x80000000, v7, vcc_lo +; GFX10-NEXT: v_add_co_ci_u32_e32 v13, vcc_lo, 0, v7, vcc_lo +; GFX10-NEXT: v_cmp_ne_u32_e64 s5, 0, v4 +; GFX10-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, v7, v3, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v3, v19, v6, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v4, v8, v4, s5 -; GFX10-NEXT: v_cndmask_b32_e64 v5, v9, v5, s5 -; GFX10-NEXT: v_cndmask_b32_e64 v6, v10, v12, s5 +; GFX10-NEXT: v_cndmask_b32_e64 v4, v8, v5, s5 +; GFX10-NEXT: v_cndmask_b32_e64 v5, v9, v12, s5 +; GFX10-NEXT: v_cndmask_b32_e64 v6, v10, v13, s5 ; GFX10-NEXT: v_cndmask_b32_e64 v7, v11, v7, s5 ; GFX10-NEXT: s_setpc_b64 s[30:31] %result = call <2 x i128> @llvm.ssub.sat.v2i128(<2 x i128> %lhs, <2 x i128> %rhs) Index: llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll @@ -232,8 +232,8 @@ ; GFX9-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1] ; GFX9-NEXT: v_pk_add_u16 v0, v0, v1 clamp ; GFX9-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1] -; GFX9-NEXT: s_movk_i32 s4, 0xff -; GFX9-NEXT: v_and_b32_sdwa v1, v0, s4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v1, 0xff +; GFX9-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -245,14 +245,14 @@ ; GFX10-NEXT: v_mov_b32_e32 v2, 0xffff ; GFX10-NEXT: v_lshrrev_b32_sdwa v3, s4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX10-NEXT: v_lshrrev_b32_sdwa v4, s4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: s_movk_i32 s4, 0xff ; GFX10-NEXT: v_and_or_b32 v0, v0, v2, v3 ; GFX10-NEXT: v_and_or_b32 v1, v1, v2, v4 ; GFX10-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1] ; GFX10-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1] ; GFX10-NEXT: v_pk_add_u16 v0, v0, v1 clamp +; GFX10-NEXT: v_mov_b32_e32 v1, 0xff ; GFX10-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1] -; GFX10-NEXT: v_and_b32_sdwa v1, v0, s4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-NEXT: s_setpc_b64 s[30:31] %lhs = bitcast i16 %lhs.arg to <2 x i8> @@ -320,8 +320,8 @@ ; GFX9-NEXT: v_mov_b32_e32 v0, s1 ; GFX9-NEXT: v_pk_add_u16 v0, s0, v0 clamp ; GFX9-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1] -; GFX9-NEXT: s_movk_i32 s0, 0xff -; GFX9-NEXT: v_and_b32_sdwa v1, v0, s0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v1, 0xff +; GFX9-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_readfirstlane_b32 s0, v0 ; GFX9-NEXT: ; return to shader part epilog @@ -341,10 +341,10 @@ ; GFX10-NEXT: s_lshl_b32 s2, s4, 8 ; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s3 ; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s2 +; GFX10-NEXT: v_mov_b32_e32 v1, 0xff ; GFX10-NEXT: v_pk_add_u16 v0, s0, s1 clamp -; GFX10-NEXT: s_movk_i32 s0, 0xff ; GFX10-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1] -; GFX10-NEXT: v_and_b32_sdwa v1, v0, s0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-NEXT: ; return to shader part epilog @@ -452,12 +452,12 @@ ; GFX9-NEXT: v_pk_add_u16 v0, v0, v1 clamp ; GFX9-NEXT: v_pk_add_u16 v1, v2, v3 clamp ; GFX9-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1] -; GFX9-NEXT: v_mov_b32_e32 v2, 8 +; GFX9-NEXT: v_mov_b32_e32 v3, 8 ; GFX9-NEXT: v_pk_lshrrev_b16 v1, 8, v1 op_sel_hi:[0,1] -; GFX9-NEXT: s_movk_i32 s4, 0xff -; GFX9-NEXT: v_lshlrev_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_and_or_b32 v0, v0, s4, v2 -; GFX9-NEXT: v_and_b32_e32 v2, s4, v1 +; GFX9-NEXT: v_mov_b32_e32 v2, 0xff +; GFX9-NEXT: v_lshlrev_b32_sdwa v3, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_and_or_b32 v0, v0, v2, v3 +; GFX9-NEXT: v_and_b32_e32 v2, v1, v2 ; GFX9-NEXT: v_mov_b32_e32 v3, 24 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX9-NEXT: v_lshlrev_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 @@ -478,26 +478,26 @@ ; GFX10-NEXT: v_lshrrev_b32_e32 v8, 16, v1 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX10-NEXT: 
v_lshlrev_b32_e32 v5, 16, v5 -; GFX10-NEXT: s_movk_i32 s4, 0xff ; GFX10-NEXT: v_and_or_b32 v0, v0, v7, v2 ; GFX10-NEXT: v_and_or_b32 v1, v1, v7, v6 ; GFX10-NEXT: v_and_or_b32 v2, v3, v7, v4 ; GFX10-NEXT: v_and_or_b32 v3, v8, v7, v5 -; GFX10-NEXT: v_mov_b32_e32 v4, 24 +; GFX10-NEXT: v_mov_b32_e32 v5, 24 ; GFX10-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1] ; GFX10-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1] ; GFX10-NEXT: v_pk_lshlrev_b16 v2, 8, v2 op_sel_hi:[0,1] ; GFX10-NEXT: v_pk_lshlrev_b16 v3, 8, v3 op_sel_hi:[0,1] ; GFX10-NEXT: v_pk_add_u16 v0, v0, v1 clamp ; GFX10-NEXT: v_pk_add_u16 v1, v2, v3 clamp -; GFX10-NEXT: v_mov_b32_e32 v2, 8 +; GFX10-NEXT: v_mov_b32_e32 v2, 0xff +; GFX10-NEXT: v_mov_b32_e32 v3, 8 ; GFX10-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1] ; GFX10-NEXT: v_pk_lshrrev_b16 v1, 8, v1 op_sel_hi:[0,1] -; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_and_b32_e32 v3, s4, v1 -; GFX10-NEXT: v_lshlrev_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_and_or_b32 v0, v0, s4, v2 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX10-NEXT: v_and_b32_e32 v4, v1, v2 +; GFX10-NEXT: v_lshlrev_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX10-NEXT: v_and_or_b32 v0, v0, v2, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v4 ; GFX10-NEXT: v_or3_b32 v0, v0, v2, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] %lhs = bitcast i32 %lhs.arg to <4 x i8> @@ -621,11 +621,11 @@ ; GFX9-NEXT: v_pk_add_u16 v1, s3, v1 clamp ; GFX9-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1] ; GFX9-NEXT: v_pk_lshrrev_b16 v1, 8, v1 op_sel_hi:[0,1] -; GFX9-NEXT: s_movk_i32 s0, 0xff -; GFX9-NEXT: v_lshlrev_b32_sdwa v2, s2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_mov_b32_e32 v2, 0xff +; GFX9-NEXT: v_lshlrev_b32_sdwa v3, s2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX9-NEXT: s_mov_b32 s5, 24 -; GFX9-NEXT: v_and_or_b32 v0, v0, s0, v2 -; GFX9-NEXT: v_and_b32_e32 v2, s0, v1 +; GFX9-NEXT: v_and_or_b32 v0, v0, v2, v3 +; GFX9-NEXT: v_and_b32_e32 v2, v1, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX9-NEXT: v_lshlrev_b32_sdwa v1, s5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX9-NEXT: v_or3_b32 v0, v0, v2, v1 @@ -663,16 +663,16 @@ ; GFX10-NEXT: s_pack_ll_b32_b16 s3, s3, s4 ; GFX10-NEXT: v_pk_add_u16 v0, s0, s1 clamp ; GFX10-NEXT: v_pk_add_u16 v1, s2, s3 clamp +; GFX10-NEXT: v_mov_b32_e32 v2, 0xff ; GFX10-NEXT: s_mov_b32 s0, 8 -; GFX10-NEXT: s_movk_i32 s1, 0xff ; GFX10-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1] ; GFX10-NEXT: v_pk_lshrrev_b16 v1, 8, v1 op_sel_hi:[0,1] -; GFX10-NEXT: v_lshlrev_b32_sdwa v2, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_and_b32_e32 v3, s1, v1 +; GFX10-NEXT: v_lshlrev_b32_sdwa v3, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX10-NEXT: v_and_b32_e32 v4, v1, v2 ; GFX10-NEXT: s_mov_b32 s0, 24 ; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_and_or_b32 v0, v0, s1, v2 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX10-NEXT: v_and_or_b32 v0, v0, v2, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v4 ; GFX10-NEXT: v_or3_b32 v0, v0, v2, v1 ; GFX10-NEXT: 
v_readfirstlane_b32 s0, v0 ; GFX10-NEXT: ; return to shader part epilog @@ -2155,12 +2155,11 @@ ; GFX8-NEXT: v_add_u16_e64 v4, v2, v5 clamp ; GFX8-NEXT: v_add_u16_sdwa v2, v2, v5 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_mov_b32_e32 v5, 16 -; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX8-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_mov_b32_e32 v3, 16 ; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX8-NEXT: v_or_b32_sdwa v0, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -2366,7 +2365,6 @@ ; GFX8-NEXT: v_add_u16_sdwa v3, v3, v7 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_mov_b32_e32 v7, 16 ; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX8-NEXT: v_mov_b32_e32 v7, 16 ; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v7, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 Index: llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i32.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i32.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i32.ll @@ -258,15 +258,15 @@ ; CHECK-LABEL: v_udiv_v2i32_oddk_denom: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_mov_b32 s4, 0xb2a50881 -; CHECK-NEXT: v_mul_hi_u32 v2, v0, s4 -; CHECK-NEXT: v_mul_hi_u32 v3, v1, s4 -; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 -; CHECK-NEXT: v_sub_i32_e32 v1, vcc, v1, v3 +; CHECK-NEXT: v_mov_b32_e32 v2, 0xb2a50881 +; CHECK-NEXT: v_mul_hi_u32 v3, v0, v2 +; CHECK-NEXT: v_mul_hi_u32 v2, v1, v2 +; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v3 +; CHECK-NEXT: v_sub_i32_e32 v1, vcc, v1, v2 ; CHECK-NEXT: v_lshrrev_b32_e32 v0, 1, v0 ; CHECK-NEXT: v_lshrrev_b32_e32 v1, 1, v1 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v3 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v3 +; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v2 ; CHECK-NEXT: v_lshrrev_b32_e32 v0, 20, v0 ; CHECK-NEXT: v_lshrrev_b32_e32 v1, 20, v1 ; CHECK-NEXT: s_setpc_b64 s[30:31] @@ -308,9 +308,9 @@ ; GISEL-LABEL: v_udiv_v2i32_pow2_shl_denom: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: s_movk_i32 s4, 0x1000 -; GISEL-NEXT: v_lshl_b32_e32 v2, s4, v2 -; GISEL-NEXT: v_lshl_b32_e32 v3, s4, v3 +; GISEL-NEXT: v_mov_b32_e32 v4, 0x1000 +; GISEL-NEXT: v_lshlrev_b32_e32 v2, v2, v4 +; GISEL-NEXT: v_lshlrev_b32_e32 v3, v3, v4 ; GISEL-NEXT: 
v_cvt_f32_u32_e32 v4, v2 ; GISEL-NEXT: v_sub_i32_e32 v5, vcc, 0, v2 ; GISEL-NEXT: v_cvt_f32_u32_e32 v6, v3 @@ -354,9 +354,9 @@ ; CGP-LABEL: v_udiv_v2i32_pow2_shl_denom: ; CGP: ; %bb.0: ; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CGP-NEXT: s_movk_i32 s4, 0x1000 -; CGP-NEXT: v_lshl_b32_e32 v2, s4, v2 -; CGP-NEXT: v_lshl_b32_e32 v3, s4, v3 +; CGP-NEXT: v_mov_b32_e32 v4, 0x1000 +; CGP-NEXT: v_lshlrev_b32_e32 v2, v2, v4 +; CGP-NEXT: v_lshlrev_b32_e32 v3, v3, v4 ; CGP-NEXT: v_cvt_f32_u32_e32 v4, v2 ; CGP-NEXT: v_sub_i32_e32 v5, vcc, 0, v2 ; CGP-NEXT: v_cvt_f32_u32_e32 v6, v3 @@ -413,9 +413,9 @@ ; GISEL-LABEL: v_udiv_i32_24bit: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: s_mov_b32 s4, 0xffffff -; GISEL-NEXT: v_and_b32_e32 v0, s4, v0 -; GISEL-NEXT: v_and_b32_e32 v1, s4, v1 +; GISEL-NEXT: v_mov_b32_e32 v2, 0xffffff +; GISEL-NEXT: v_and_b32_e32 v0, v0, v2 +; GISEL-NEXT: v_and_b32_e32 v1, v1, v2 ; GISEL-NEXT: v_cvt_f32_u32_e32 v2, v1 ; GISEL-NEXT: v_sub_i32_e32 v3, vcc, 0, v1 ; GISEL-NEXT: v_rcp_iflag_f32_e32 v2, v2 @@ -440,9 +440,9 @@ ; CGP-LABEL: v_udiv_i32_24bit: ; CGP: ; %bb.0: ; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CGP-NEXT: s_mov_b32 s4, 0xffffff -; CGP-NEXT: v_and_b32_e32 v0, s4, v0 -; CGP-NEXT: v_and_b32_e32 v1, s4, v1 +; CGP-NEXT: v_mov_b32_e32 v2, 0xffffff +; CGP-NEXT: v_and_b32_e32 v0, v0, v2 +; CGP-NEXT: v_and_b32_e32 v1, v1, v2 ; CGP-NEXT: v_cvt_f32_u32_e32 v2, v1 ; CGP-NEXT: v_sub_i32_e32 v3, vcc, 0, v1 ; CGP-NEXT: v_rcp_f32_e32 v2, v2 Index: llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll @@ -153,25 +153,25 @@ ; CHECK-NEXT: s_cbranch_execz BB0_4 ; CHECK-NEXT: ; %bb.3: ; CHECK-NEXT: v_cvt_f32_u32_e32 v0, v2 -; CHECK-NEXT: v_sub_i32_e32 v1, vcc, 0, v2 +; CHECK-NEXT: v_mov_b32_e32 v1, 0 +; CHECK-NEXT: v_sub_i32_e32 v3, vcc, 0, v2 ; CHECK-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; CHECK-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; CHECK-NEXT: v_cvt_u32_f32_e32 v0, v0 -; CHECK-NEXT: v_mul_lo_u32 v1, v1, v0 -; CHECK-NEXT: v_mul_hi_u32 v1, v0, v1 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; CHECK-NEXT: v_mul_lo_u32 v3, v3, v0 +; CHECK-NEXT: v_mul_hi_u32 v3, v0, v3 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v3 ; CHECK-NEXT: v_mul_hi_u32 v0, v4, v0 -; CHECK-NEXT: v_mul_lo_u32 v1, v0, v2 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, 1, v0 -; CHECK-NEXT: v_sub_i32_e32 v1, vcc, v4, v1 -; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v1, v2 -; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; CHECK-NEXT: v_sub_i32_e64 v3, s[4:5], v1, v2 -; CHECK-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; CHECK-NEXT: v_add_i32_e32 v3, vcc, 1, v0 -; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v1, v2 -; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; CHECK-NEXT: v_mov_b32_e32 v1, 0 +; CHECK-NEXT: v_mul_lo_u32 v3, v0, v2 +; CHECK-NEXT: v_add_i32_e32 v5, vcc, 1, v0 +; CHECK-NEXT: v_sub_i32_e32 v3, vcc, v4, v3 +; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v3, v2 +; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc +; CHECK-NEXT: v_sub_i32_e64 v4, s[4:5], v3, v2 +; CHECK-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc +; CHECK-NEXT: v_add_i32_e32 v4, vcc, 1, v0 +; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v3, v2 +; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc ; CHECK-NEXT: BB0_4: ; CHECK-NEXT: s_or_b64 exec, exec, s[6:7] ; CHECK-NEXT: s_setpc_b64 s[30:31] @@ -775,25 +775,25 @@ ; CGP-NEXT: s_cbranch_execz BB2_4 ; CGP-NEXT: ; %bb.3: ; CGP-NEXT: 
v_cvt_f32_u32_e32 v0, v4 -; CGP-NEXT: v_sub_i32_e32 v1, vcc, 0, v4 +; CGP-NEXT: v_mov_b32_e32 v1, 0 +; CGP-NEXT: v_sub_i32_e32 v2, vcc, 0, v4 ; CGP-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; CGP-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; CGP-NEXT: v_cvt_u32_f32_e32 v0, v0 -; CGP-NEXT: v_mul_lo_u32 v1, v1, v0 -; CGP-NEXT: v_mul_hi_u32 v1, v0, v1 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; CGP-NEXT: v_mul_lo_u32 v2, v2, v0 +; CGP-NEXT: v_mul_hi_u32 v2, v0, v2 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; CGP-NEXT: v_mul_hi_u32 v0, v10, v0 -; CGP-NEXT: v_mul_lo_u32 v1, v0, v4 -; CGP-NEXT: v_add_i32_e32 v2, vcc, 1, v0 -; CGP-NEXT: v_sub_i32_e32 v1, vcc, v10, v1 -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v4 -; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; CGP-NEXT: v_sub_i32_e64 v2, s[4:5], v1, v4 -; CGP-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; CGP-NEXT: v_add_i32_e32 v2, vcc, 1, v0 -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v4 -; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; CGP-NEXT: v_mov_b32_e32 v1, 0 +; CGP-NEXT: v_mul_lo_u32 v2, v0, v4 +; CGP-NEXT: v_add_i32_e32 v3, vcc, 1, v0 +; CGP-NEXT: v_sub_i32_e32 v2, vcc, v10, v2 +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v2, v4 +; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc +; CGP-NEXT: v_sub_i32_e64 v3, s[4:5], v2, v4 +; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc +; CGP-NEXT: v_add_i32_e32 v3, vcc, 1, v0 +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v2, v4 +; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; CGP-NEXT: BB2_4: ; CGP-NEXT: s_or_b64 exec, exec, s[6:7] ; CGP-NEXT: v_or_b32_e32 v3, v9, v7 @@ -939,25 +939,25 @@ ; CGP-NEXT: s_cbranch_execz BB2_8 ; CGP-NEXT: ; %bb.7: ; CGP-NEXT: v_cvt_f32_u32_e32 v2, v6 -; CGP-NEXT: v_sub_i32_e32 v3, vcc, 0, v6 +; CGP-NEXT: v_mov_b32_e32 v3, 0 +; CGP-NEXT: v_sub_i32_e32 v4, vcc, 0, v6 ; CGP-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; CGP-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 ; CGP-NEXT: v_cvt_u32_f32_e32 v2, v2 -; CGP-NEXT: v_mul_lo_u32 v3, v3, v2 -; CGP-NEXT: v_mul_hi_u32 v3, v2, v3 -; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v3 +; CGP-NEXT: v_mul_lo_u32 v4, v4, v2 +; CGP-NEXT: v_mul_hi_u32 v4, v2, v4 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; CGP-NEXT: v_mul_hi_u32 v2, v8, v2 -; CGP-NEXT: v_mul_lo_u32 v3, v2, v6 -; CGP-NEXT: v_add_i32_e32 v4, vcc, 1, v2 -; CGP-NEXT: v_sub_i32_e32 v3, vcc, v8, v3 -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v3, v6 -; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; CGP-NEXT: v_sub_i32_e64 v4, s[4:5], v3, v6 -; CGP-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, 1, v2 -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v3, v6 -; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; CGP-NEXT: v_mov_b32_e32 v3, 0 +; CGP-NEXT: v_mul_lo_u32 v4, v2, v6 +; CGP-NEXT: v_add_i32_e32 v5, vcc, 1, v2 +; CGP-NEXT: v_sub_i32_e32 v4, vcc, v8, v4 +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v4, v6 +; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc +; CGP-NEXT: v_sub_i32_e64 v5, s[4:5], v4, v6 +; CGP-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc +; CGP-NEXT: v_add_i32_e32 v5, vcc, 1, v2 +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v4, v6 +; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc ; CGP-NEXT: BB2_8: ; CGP-NEXT: s_or_b64 exec, exec, s[6:7] ; CGP-NEXT: s_setpc_b64 s[30:31] @@ -990,28 +990,28 @@ ; CHECK-LABEL: v_udiv_i64_oddk_denom: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_mov_b32 s4, 0x1fb03c31 -; CHECK-NEXT: s_mov_b32 s5, 0xd9528440 -; CHECK-NEXT: v_mul_lo_u32 v2, v1, s4 -; CHECK-NEXT: v_mul_lo_u32 v3, v0, s5 -; CHECK-NEXT: v_mul_hi_u32 v4, v0, s4 -; CHECK-NEXT: v_mul_lo_u32 v5, v1, s5 -; CHECK-NEXT: v_mul_hi_u32 v6, v1, 
s4 -; CHECK-NEXT: v_mul_hi_u32 v0, v0, s5 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3 -; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v6 -; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v4 -; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v5, v0 +; CHECK-NEXT: v_mov_b32_e32 v2, 0x1fb03c31 +; CHECK-NEXT: v_mov_b32_e32 v3, 0xd9528440 +; CHECK-NEXT: v_mul_lo_u32 v4, v1, v2 +; CHECK-NEXT: v_mul_lo_u32 v5, v0, v3 +; CHECK-NEXT: v_mul_hi_u32 v6, v0, v2 +; CHECK-NEXT: v_mul_lo_u32 v7, v1, v3 +; CHECK-NEXT: v_mul_hi_u32 v2, v1, v2 +; CHECK-NEXT: v_mul_hi_u32 v0, v0, v3 +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v5 +; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v7, v2 +; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v6 ; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v6, v4 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v2, v0 ; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; CHECK-NEXT: v_mul_hi_u32 v1, v1, s5 +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v5, v4 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v7, v2 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v4 +; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v4 +; CHECK-NEXT: v_mul_hi_u32 v1, v1, v3 ; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v2 ; CHECK-NEXT: v_lshr_b64 v[0:1], v[0:1], 20 ; CHECK-NEXT: s_setpc_b64 s[30:31] @@ -1023,50 +1023,50 @@ ; CHECK-LABEL: v_udiv_v2i64_oddk_denom: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_mov_b32 s4, 0x1fb03c31 -; CHECK-NEXT: s_mov_b32 s5, 0xd9528440 -; CHECK-NEXT: v_mul_lo_u32 v4, v1, s4 -; CHECK-NEXT: v_mul_lo_u32 v5, v0, s5 -; CHECK-NEXT: v_mul_hi_u32 v6, v0, s4 -; CHECK-NEXT: v_mul_lo_u32 v7, v1, s5 -; CHECK-NEXT: v_mul_hi_u32 v8, v1, s4 -; CHECK-NEXT: v_mul_hi_u32 v0, v0, s5 -; CHECK-NEXT: v_mul_hi_u32 v1, v1, s5 -; CHECK-NEXT: v_mul_lo_u32 v9, v3, s4 -; CHECK-NEXT: v_mul_lo_u32 v10, v2, s5 -; CHECK-NEXT: v_mul_hi_u32 v11, v2, s4 -; CHECK-NEXT: v_mul_lo_u32 v12, v3, s5 -; CHECK-NEXT: v_mul_hi_u32 v13, v3, s4 -; CHECK-NEXT: v_mul_hi_u32 v2, v2, s5 -; CHECK-NEXT: v_mul_hi_u32 v3, v3, s5 -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v5 +; CHECK-NEXT: v_mov_b32_e32 v4, 0x1fb03c31 +; CHECK-NEXT: v_mov_b32_e32 v5, 0xd9528440 +; CHECK-NEXT: v_mul_lo_u32 v6, v1, v4 +; CHECK-NEXT: v_mul_lo_u32 v7, v0, v5 +; CHECK-NEXT: v_mul_hi_u32 v8, v0, v4 +; CHECK-NEXT: v_mul_lo_u32 v9, v1, v5 +; CHECK-NEXT: v_mul_hi_u32 v10, v1, v4 +; CHECK-NEXT: v_mul_hi_u32 v0, v0, v5 +; CHECK-NEXT: v_mul_hi_u32 v1, v1, v5 +; CHECK-NEXT: v_mul_lo_u32 v11, v3, v4 +; CHECK-NEXT: v_mul_lo_u32 v12, v2, v5 +; CHECK-NEXT: v_mul_hi_u32 v13, v2, v4 +; CHECK-NEXT: v_mul_lo_u32 v14, v3, v5 +; CHECK-NEXT: v_mul_hi_u32 v4, v3, v4 +; CHECK-NEXT: v_mul_hi_u32 v2, v2, v5 +; CHECK-NEXT: v_mul_hi_u32 v3, v3, v5 +; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v7 +; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v7, vcc, v9, v10 +; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v10, vcc, v11, v12 +; CHECK-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v14, v4 +; CHECK-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v8 ; CHECK-NEXT: 
v_cndmask_b32_e64 v5, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v7, vcc, v7, v8 -; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v9, vcc, v9, v10 -; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v12, vcc, v12, v13 -; CHECK-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v6 -; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v0, vcc, v7, v0 -; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v7, vcc, v9, v11 ; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v12, v2 -; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v5, v4 -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v8, v6 -; CHECK-NEXT: v_add_i32_e32 v6, vcc, v10, v7 -; CHECK-NEXT: v_add_i32_e32 v7, vcc, v13, v9 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v4 +; CHECK-NEXT: v_add_i32_e32 v8, vcc, v10, v13 +; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v4, v2 ; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v6 -; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v5, v4 -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v7, v6 -; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v4 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v5 +; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v5 +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v9, v7 +; CHECK-NEXT: v_add_i32_e32 v7, vcc, v11, v8 +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v12, v4 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v5 +; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v7 +; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v5 +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v7 +; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v5 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v4 ; CHECK-NEXT: v_lshr_b64 v[0:1], v[0:1], 20 ; CHECK-NEXT: v_lshr_b64 v[2:3], v[2:3], 20 ; CHECK-NEXT: s_setpc_b64 s[30:31] @@ -1225,25 +1225,25 @@ ; CHECK-NEXT: s_cbranch_execz BB7_4 ; CHECK-NEXT: ; %bb.3: ; CHECK-NEXT: v_cvt_f32_u32_e32 v0, v5 -; CHECK-NEXT: v_sub_i32_e32 v1, vcc, 0, v5 +; CHECK-NEXT: v_mov_b32_e32 v1, 0 +; CHECK-NEXT: v_sub_i32_e32 v2, vcc, 0, v5 ; CHECK-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; CHECK-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; CHECK-NEXT: v_cvt_u32_f32_e32 v0, v0 -; CHECK-NEXT: v_mul_lo_u32 v1, v1, v0 -; CHECK-NEXT: v_mul_hi_u32 v1, v0, v1 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; CHECK-NEXT: v_mul_lo_u32 v2, v2, v0 +; CHECK-NEXT: v_mul_hi_u32 v2, v0, v2 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; CHECK-NEXT: v_mul_hi_u32 v0, v3, v0 -; CHECK-NEXT: v_mul_lo_u32 v1, v0, v5 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, 1, v0 -; CHECK-NEXT: v_sub_i32_e32 v1, vcc, v3, v1 -; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v1, v5 -; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; CHECK-NEXT: v_sub_i32_e64 v2, s[4:5], v1, v5 -; CHECK-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, 1, v0 -; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v1, v5 -; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; CHECK-NEXT: v_mov_b32_e32 v1, 0 +; CHECK-NEXT: v_mul_lo_u32 v2, v0, v5 +; CHECK-NEXT: v_add_i32_e32 v4, vcc, 1, v0 +; CHECK-NEXT: v_sub_i32_e32 v2, vcc, v3, v2 +; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v2, v5 +; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc +; CHECK-NEXT: v_sub_i32_e64 v3, s[4:5], v2, v5 +; CHECK-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc +; CHECK-NEXT: v_add_i32_e32 v3, vcc, 1, v0 +; 
CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v2, v5 +; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; CHECK-NEXT: BB7_4: ; CHECK-NEXT: s_or_b64 exec, exec, s[6:7] ; CHECK-NEXT: s_setpc_b64 s[30:31] @@ -1668,25 +1668,25 @@ ; CGP-NEXT: s_cbranch_execz BB8_4 ; CGP-NEXT: ; %bb.3: ; CGP-NEXT: v_cvt_f32_u32_e32 v0, v2 -; CGP-NEXT: v_sub_i32_e32 v1, vcc, 0, v2 +; CGP-NEXT: v_mov_b32_e32 v1, 0 +; CGP-NEXT: v_sub_i32_e32 v3, vcc, 0, v2 ; CGP-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; CGP-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; CGP-NEXT: v_cvt_u32_f32_e32 v0, v0 -; CGP-NEXT: v_mul_lo_u32 v1, v1, v0 -; CGP-NEXT: v_mul_hi_u32 v1, v0, v1 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; CGP-NEXT: v_mul_lo_u32 v3, v3, v0 +; CGP-NEXT: v_mul_hi_u32 v3, v0, v3 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v3 ; CGP-NEXT: v_mul_hi_u32 v0, v8, v0 -; CGP-NEXT: v_mul_lo_u32 v1, v0, v2 -; CGP-NEXT: v_add_i32_e32 v3, vcc, 1, v0 -; CGP-NEXT: v_sub_i32_e32 v1, vcc, v8, v1 -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v2 -; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; CGP-NEXT: v_sub_i32_e64 v3, s[4:5], v1, v2 -; CGP-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; CGP-NEXT: v_add_i32_e32 v3, vcc, 1, v0 -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v2 -; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; CGP-NEXT: v_mov_b32_e32 v1, 0 +; CGP-NEXT: v_mul_lo_u32 v3, v0, v2 +; CGP-NEXT: v_add_i32_e32 v4, vcc, 1, v0 +; CGP-NEXT: v_sub_i32_e32 v3, vcc, v8, v3 +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v3, v2 +; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc +; CGP-NEXT: v_sub_i32_e64 v4, s[4:5], v3, v2 +; CGP-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc +; CGP-NEXT: v_add_i32_e32 v4, vcc, 1, v0 +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v3, v2 +; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc ; CGP-NEXT: BB8_4: ; CGP-NEXT: s_or_b64 exec, exec, s[6:7] ; CGP-NEXT: v_or_b32_e32 v3, v7, v11 @@ -1832,25 +1832,25 @@ ; CGP-NEXT: s_cbranch_execz BB8_8 ; CGP-NEXT: ; %bb.7: ; CGP-NEXT: v_cvt_f32_u32_e32 v2, v10 -; CGP-NEXT: v_sub_i32_e32 v3, vcc, 0, v10 +; CGP-NEXT: v_mov_b32_e32 v3, 0 +; CGP-NEXT: v_sub_i32_e32 v4, vcc, 0, v10 ; CGP-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; CGP-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 ; CGP-NEXT: v_cvt_u32_f32_e32 v2, v2 -; CGP-NEXT: v_mul_lo_u32 v3, v3, v2 -; CGP-NEXT: v_mul_hi_u32 v3, v2, v3 -; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v3 +; CGP-NEXT: v_mul_lo_u32 v4, v4, v2 +; CGP-NEXT: v_mul_hi_u32 v4, v2, v4 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; CGP-NEXT: v_mul_hi_u32 v2, v5, v2 -; CGP-NEXT: v_mul_lo_u32 v3, v2, v10 -; CGP-NEXT: v_add_i32_e32 v4, vcc, 1, v2 -; CGP-NEXT: v_sub_i32_e32 v3, vcc, v5, v3 -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v3, v10 -; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; CGP-NEXT: v_sub_i32_e64 v4, s[4:5], v3, v10 -; CGP-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, 1, v2 -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v3, v10 -; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; CGP-NEXT: v_mov_b32_e32 v3, 0 +; CGP-NEXT: v_mul_lo_u32 v4, v2, v10 +; CGP-NEXT: v_add_i32_e32 v6, vcc, 1, v2 +; CGP-NEXT: v_sub_i32_e32 v4, vcc, v5, v4 +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v4, v10 +; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc +; CGP-NEXT: v_sub_i32_e64 v5, s[4:5], v4, v10 +; CGP-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc +; CGP-NEXT: v_add_i32_e32 v5, vcc, 1, v2 +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v4, v10 +; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc ; CGP-NEXT: BB8_8: ; CGP-NEXT: s_or_b64 exec, exec, s[6:7] ; CGP-NEXT: s_setpc_b64 s[30:31] @@ -2180,33 +2180,34 @@ ; CGP-LABEL: v_udiv_v2i64_24bit: ; CGP: ; %bb.0: ; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) -; CGP-NEXT: s_mov_b32 s6, 0xffffff -; CGP-NEXT: v_and_b32_e32 v0, s6, v0 -; CGP-NEXT: v_and_b32_e32 v1, s6, v2 -; CGP-NEXT: v_and_b32_e32 v2, s6, v4 -; CGP-NEXT: v_and_b32_e32 v3, s6, v6 +; CGP-NEXT: s_mov_b32 s4, 0xffffff +; CGP-NEXT: v_mov_b32_e32 v1, 0xffffff +; CGP-NEXT: v_and_b32_e32 v0, s4, v0 +; CGP-NEXT: v_and_b32_e32 v2, s4, v2 +; CGP-NEXT: v_and_b32_e32 v3, s4, v4 +; CGP-NEXT: v_and_b32_e32 v4, s4, v6 ; CGP-NEXT: v_cvt_f32_u32_e32 v0, v0 -; CGP-NEXT: v_cvt_f32_u32_e32 v2, v2 -; CGP-NEXT: v_cvt_f32_u32_e32 v1, v1 ; CGP-NEXT: v_cvt_f32_u32_e32 v3, v3 -; CGP-NEXT: v_rcp_f32_e32 v4, v2 +; CGP-NEXT: v_cvt_f32_u32_e32 v2, v2 +; CGP-NEXT: v_cvt_f32_u32_e32 v4, v4 ; CGP-NEXT: v_rcp_f32_e32 v5, v3 -; CGP-NEXT: v_mul_f32_e32 v4, v0, v4 -; CGP-NEXT: v_mul_f32_e32 v5, v1, v5 -; CGP-NEXT: v_trunc_f32_e32 v4, v4 +; CGP-NEXT: v_rcp_f32_e32 v6, v4 +; CGP-NEXT: v_mul_f32_e32 v5, v0, v5 +; CGP-NEXT: v_mul_f32_e32 v6, v2, v6 ; CGP-NEXT: v_trunc_f32_e32 v5, v5 -; CGP-NEXT: v_mad_f32 v0, -v4, v2, v0 -; CGP-NEXT: v_cvt_u32_f32_e32 v4, v4 -; CGP-NEXT: v_mad_f32 v1, -v5, v3, v1 +; CGP-NEXT: v_trunc_f32_e32 v6, v6 +; CGP-NEXT: v_mad_f32 v0, -v5, v3, v0 ; CGP-NEXT: v_cvt_u32_f32_e32 v5, v5 -; CGP-NEXT: v_cmp_ge_f32_e64 s[4:5], |v0|, v2 +; CGP-NEXT: v_mad_f32 v2, -v6, v4, v2 +; CGP-NEXT: v_cvt_u32_f32_e32 v6, v6 +; CGP-NEXT: v_cmp_ge_f32_e64 s[4:5], |v0|, v3 ; CGP-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] -; CGP-NEXT: v_cmp_ge_f32_e64 s[4:5], |v1|, v3 -; CGP-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e32 v0, vcc, v4, v0 -; CGP-NEXT: v_add_i32_e32 v1, vcc, v5, v1 -; CGP-NEXT: v_and_b32_e32 v0, s6, v0 -; CGP-NEXT: v_and_b32_e32 v2, s6, v1 +; CGP-NEXT: v_cmp_ge_f32_e64 s[4:5], |v2|, v4 +; CGP-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[4:5] +; CGP-NEXT: v_add_i32_e32 v0, vcc, v5, v0 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v6, v2 +; CGP-NEXT: v_and_b32_e32 v0, v0, v1 +; CGP-NEXT: v_and_b32_e32 v2, v2, v1 ; CGP-NEXT: v_mov_b32_e32 v1, 0 ; CGP-NEXT: v_mov_b32_e32 v3, 0 ; CGP-NEXT: s_setpc_b64 s[30:31] Index: llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll @@ -345,7 +345,7 @@ ; GFX9-NEXT: v_mul_lo_u32 v3, s8, v1 ; GFX9-NEXT: v_mul_hi_u32 v5, s8, v0 ; GFX9-NEXT: v_mul_hi_u32 v0, s9, v0 -; GFX9-NEXT: v_mov_b32_e32 v4, s11 +; GFX9-NEXT: v_mov_b32_e32 v6, s11 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v3 ; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v5 @@ -366,7 +366,7 @@ ; GFX9-NEXT: v_mul_lo_u32 v3, s10, v1 ; GFX9-NEXT: v_mul_hi_u32 v5, s10, v0 ; GFX9-NEXT: v_mul_lo_u32 v7, s10, v0 -; GFX9-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: v_add3_u32 v2, v2, v3, v5 ; GFX9-NEXT: v_sub_co_u32_e32 v3, vcc, s8, v7 ; GFX9-NEXT: v_subb_co_u32_e64 v5, s[0:1], v8, v2, vcc @@ -376,7 +376,7 @@ ; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s10, v3 ; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[0:1] ; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s11, v5 -; GFX9-NEXT: v_subb_co_u32_e32 v2, vcc, v2, v4, vcc +; GFX9-NEXT: v_subb_co_u32_e32 v2, vcc, v2, v6, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v7, v7, v8, s[0:1] ; GFX9-NEXT: v_subrev_co_u32_e32 v8, vcc, s10, v3 ; GFX9-NEXT: v_subbrev_co_u32_e64 v9, s[0:1], 0, v2, vcc @@ -385,10 +385,10 @@ ; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s11, v9 ; GFX9-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[0:1] ; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s10, v8 -; GFX9-NEXT: 
v_subb_co_u32_e32 v2, vcc, v2, v4, vcc +; GFX9-NEXT: v_subb_co_u32_e32 v2, vcc, v2, v6, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[0:1] ; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s11, v9 -; GFX9-NEXT: v_subrev_co_u32_e32 v4, vcc, s10, v8 +; GFX9-NEXT: v_subrev_co_u32_e32 v6, vcc, s10, v8 ; GFX9-NEXT: v_cndmask_b32_e64 v12, v12, v13, s[0:1] ; GFX9-NEXT: v_add_co_u32_e64 v13, s[0:1], 1, v10 ; GFX9-NEXT: v_subbrev_co_u32_e32 v2, vcc, 0, v2, vcc @@ -399,12 +399,12 @@ ; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v7 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v10, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v11, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v6, v8, v6, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v7, v9, v2, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v2, v3, v4, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v2, v3, v6, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e64 v3, v5, v7, s[0:1] -; GFX9-NEXT: global_store_dwordx2 v6, v[0:1], s[4:5] -; GFX9-NEXT: global_store_dwordx2 v6, v[2:3], s[6:7] +; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[4:5] +; GFX9-NEXT: global_store_dwordx2 v4, v[2:3], s[6:7] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: udivrem_i64: @@ -2055,15 +2055,15 @@ ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s3, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc ; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s3, v3 -; GFX8-NEXT: s_movk_i32 s0, 0xff -; GFX8-NEXT: v_and_b32_e32 v1, s0, v1 -; GFX8-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc -; GFX8-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_mov_b32_e32 v4, 0xff +; GFX8-NEXT: v_and_b32_e32 v1, v1, v4 +; GFX8-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX8-NEXT: v_or_b32_sdwa v5, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX8-NEXT: v_mov_b32_e32 v0, s8 ; GFX8-NEXT: v_mov_b32_e32 v1, s9 -; GFX8-NEXT: flat_store_short v[0:1], v4 -; GFX8-NEXT: v_and_b32_e32 v0, s0, v3 +; GFX8-NEXT: flat_store_short v[0:1], v5 +; GFX8-NEXT: v_and_b32_e32 v0, v3, v4 ; GFX8-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; GFX8-NEXT: v_or_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX8-NEXT: v_mov_b32_e32 v0, s10 @@ -2098,7 +2098,6 @@ ; GFX9-NEXT: v_mul_hi_u32 v1, s8, v1 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v2 ; GFX9-NEXT: v_mul_hi_u32 v0, s9, v0 -; GFX9-NEXT: s_movk_i32 s4, 0xff ; GFX9-NEXT: v_mul_lo_u32 v3, v1, s6 ; GFX9-NEXT: v_add_u32_e32 v4, 1, v1 ; GFX9-NEXT: v_mul_lo_u32 v2, v0, s7 @@ -2121,15 +2120,16 @@ ; GFX9-NEXT: v_add_u32_e32 v4, 1, v0 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s7, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc -; GFX9-NEXT: v_and_b32_e32 v0, s4, v0 ; GFX9-NEXT: v_subrev_u32_e32 v4, s7, v2 -; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; GFX9-NEXT: v_mov_b32_e32 v4, 0xff +; GFX9-NEXT: v_and_b32_e32 v0, v0, v4 +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_short v1, v0, s[0:1] -; GFX9-NEXT: v_and_b32_e32 v0, s4, v2 +; GFX9-NEXT: v_and_b32_e32 v0, v2, v4 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; GFX9-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: global_store_short v1, v0, s[2:3] @@ -2145,17 +2145,17 @@ ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v1, s1 ; GFX10-NEXT: s_sub_i32 s3, 0, s2 ; 
GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 +; GFX10-NEXT: s_sub_i32 s6, 0, s1 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v1, v1 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX10-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 ; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX10-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX10-NEXT: v_mul_lo_u32 v2, s3, v0 -; GFX10-NEXT: s_sub_i32 s3, 0, s1 -; GFX10-NEXT: v_mul_lo_u32 v3, s3, v1 ; GFX10-NEXT: s_bfe_u32 s3, s0, 0x80008 ; GFX10-NEXT: s_and_b32 s0, s0, 0xff +; GFX10-NEXT: v_mul_lo_u32 v3, s6, v1 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 ; GFX10-NEXT: v_mul_hi_u32 v2, v0, v2 ; GFX10-NEXT: v_mul_hi_u32 v3, v1, v3 ; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v2 @@ -2165,30 +2165,30 @@ ; GFX10-NEXT: v_mul_lo_u32 v2, v0, s2 ; GFX10-NEXT: v_add_nc_u32_e32 v4, 1, v0 ; GFX10-NEXT: v_mul_lo_u32 v3, v1, s1 -; GFX10-NEXT: v_add_nc_u32_e32 v6, 1, v1 +; GFX10-NEXT: v_add_nc_u32_e32 v5, 1, v1 ; GFX10-NEXT: v_sub_nc_u32_e32 v2, s3, v2 ; GFX10-NEXT: v_sub_nc_u32_e32 v3, s0, v3 -; GFX10-NEXT: v_subrev_nc_u32_e32 v5, s2, v2 +; GFX10-NEXT: v_subrev_nc_u32_e32 v6, s2, v2 ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s2, v2 ; GFX10-NEXT: v_cmp_le_u32_e64 s0, s1, v3 ; GFX10-NEXT: v_subrev_nc_u32_e32 v7, s1, v3 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v6, s0 +; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v5, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v7, s0 ; GFX10-NEXT: v_add_nc_u32_e32 v4, 1, v0 ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s2, v2 -; GFX10-NEXT: v_subrev_nc_u32_e32 v5, s2, v2 -; GFX10-NEXT: v_add_nc_u32_e32 v6, 1, v1 +; GFX10-NEXT: v_subrev_nc_u32_e32 v6, s2, v2 +; GFX10-NEXT: v_add_nc_u32_e32 v5, 1, v1 ; GFX10-NEXT: v_cmp_le_u32_e64 s0, s1, v3 ; GFX10-NEXT: v_subrev_nc_u32_e32 v7, s1, v3 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc_lo -; GFX10-NEXT: s_movk_i32 s1, 0xff -; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v6, s0 +; GFX10-NEXT: v_mov_b32_e32 v4, 0xff +; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v5, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v7, s0 -; GFX10-NEXT: v_and_b32_sdwa v0, v0, s1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: v_and_b32_sdwa v2, v2, s1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_and_b32_sdwa v0, v0, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_and_b32_sdwa v2, v2, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX10-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD Index: llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i32.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i32.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i32.ll @@ -214,13 +214,21 @@ } define <2 x i32> @v_urem_v2i32_pow2k_denom(<2 x i32> %num) { -; CHECK-LABEL: v_urem_v2i32_pow2k_denom: -; CHECK: ; %bb.0: -; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_movk_i32 s4, 0xfff -; CHECK-NEXT: v_and_b32_e32 v0, s4, v0 -; CHECK-NEXT: v_and_b32_e32 v1, s4, v1 -; CHECK-NEXT: s_setpc_b64 s[30:31] +; 
GISEL-LABEL: v_urem_v2i32_pow2k_denom: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: s_movk_i32 s4, 0xfff +; GISEL-NEXT: v_and_b32_e32 v0, s4, v0 +; GISEL-NEXT: v_and_b32_e32 v1, s4, v1 +; GISEL-NEXT: s_setpc_b64 s[30:31] +; +; CGP-LABEL: v_urem_v2i32_pow2k_denom: +; CGP: ; %bb.0: +; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CGP-NEXT: v_mov_b32_e32 v2, 0xfff +; CGP-NEXT: v_and_b32_e32 v0, v0, v2 +; CGP-NEXT: v_and_b32_e32 v1, v1, v2 +; CGP-NEXT: s_setpc_b64 s[30:31] %result = urem <2 x i32> %num, <i32 4096, i32 4096> ret <2 x i32> %result } @@ -229,23 +237,23 @@ ; CHECK-LABEL: v_urem_i32_oddk_denom: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_mov_b32 s4, 0x12d8fb -; CHECK-NEXT: v_rcp_iflag_f32_e32 v1, 0x4996c7d8 -; CHECK-NEXT: v_mov_b32_e32 v2, 0xffed2705 -; CHECK-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 -; CHECK-NEXT: v_cvt_u32_f32_e32 v1, v1 +; CHECK-NEXT: v_mov_b32_e32 v1, 0x12d8fb +; CHECK-NEXT: v_rcp_iflag_f32_e32 v2, 0x4996c7d8 +; CHECK-NEXT: v_mov_b32_e32 v3, 0xffed2705 +; CHECK-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 +; CHECK-NEXT: v_cvt_u32_f32_e32 v2, v2 +; CHECK-NEXT: v_mul_lo_u32 v3, v3, v2 +; CHECK-NEXT: v_mul_hi_u32 v3, v2, v3 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3 +; CHECK-NEXT: v_mul_hi_u32 v2, v0, v2 ; CHECK-NEXT: v_mul_lo_u32 v2, v2, v1 -; CHECK-NEXT: v_mul_hi_u32 v2, v1, v2 -; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v2 -; CHECK-NEXT: v_mul_hi_u32 v1, v0, v1 -; CHECK-NEXT: v_mul_lo_u32 v1, v1, s4 -; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v1 -; CHECK-NEXT: v_subrev_i32_e32 v1, vcc, s4, v0 -; CHECK-NEXT: v_cmp_le_u32_e32 vcc, s4, v0 -; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; CHECK-NEXT: v_subrev_i32_e32 v1, vcc, s4, v0 -; CHECK-NEXT: v_cmp_le_u32_e32 vcc, s4, v0 -; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 +; CHECK-NEXT: v_sub_i32_e32 v2, vcc, v0, v1 +; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v0, v1 +; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; CHECK-NEXT: v_sub_i32_e32 v2, vcc, v0, v1 +; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v0, v1 +; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; CHECK-NEXT: s_setpc_b64 s[30:31] %result = urem i32 %num, 1235195 ret i32 %result @@ -255,10 +263,9 @@ ; GISEL-LABEL: v_urem_v2i32_oddk_denom: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: s_mov_b32 s4, 0x12d8fb ; GISEL-NEXT: v_mov_b32_e32 v2, 0x12d8fb ; GISEL-NEXT: v_mov_b32_e32 v3, 0xffed2705 -; GISEL-NEXT: v_cvt_f32_u32_e32 v4, s4 +; GISEL-NEXT: v_cvt_f32_u32_e32 v4, v2 ; GISEL-NEXT: v_rcp_iflag_f32_e32 v4, v4 ; GISEL-NEXT: v_mul_f32_e32 v4, 0x4f7ffffe, v4 ; GISEL-NEXT: v_cvt_u32_f32_e32 v4, v4 @@ -267,19 +274,19 @@ ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v4, v3 ; GISEL-NEXT: v_mul_hi_u32 v4, v0, v3 ; GISEL-NEXT: v_mul_hi_u32 v3, v1, v3 -; GISEL-NEXT: v_mul_lo_u32 v4, v4, s4 +; GISEL-NEXT: v_mul_lo_u32 v4, v4, v2 ; GISEL-NEXT: v_mul_lo_u32 v3, v3, v2 ; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v4 ; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v1, v3 -; GISEL-NEXT: v_subrev_i32_e32 v3, vcc, s4, v0 +; GISEL-NEXT: v_sub_i32_e32 v3, vcc, v0, v2 ; GISEL-NEXT: v_sub_i32_e32 v4, vcc, 
v1, v2 -; GISEL-NEXT: v_cmp_le_u32_e32 vcc, s4, v0 +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2 ; GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v1, v2 ; GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc @@ -288,32 +295,32 @@ ; CGP-LABEL: v_urem_v2i32_oddk_denom: ; CGP: ; %bb.0: ; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CGP-NEXT: s_mov_b32 s4, 0x12d8fb -; CGP-NEXT: v_rcp_iflag_f32_e32 v2, 0x4996c7d8 -; CGP-NEXT: s_mov_b32 s5, 0xffed2705 -; CGP-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 -; CGP-NEXT: v_cvt_u32_f32_e32 v2, v2 -; CGP-NEXT: v_mul_lo_u32 v3, s5, v2 -; CGP-NEXT: v_mul_hi_u32 v3, v2, v3 -; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v3 -; CGP-NEXT: v_mul_hi_u32 v3, v0, v2 -; CGP-NEXT: v_mul_hi_u32 v2, v1, v2 -; CGP-NEXT: v_mul_lo_u32 v3, v3, s4 -; CGP-NEXT: v_mul_lo_u32 v2, v2, s4 -; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v3 -; CGP-NEXT: v_sub_i32_e32 v1, vcc, v1, v2 -; CGP-NEXT: v_subrev_i32_e32 v2, vcc, s4, v0 -; CGP-NEXT: v_subrev_i32_e32 v3, vcc, s4, v1 -; CGP-NEXT: v_cmp_le_u32_e32 vcc, s4, v0 -; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; CGP-NEXT: v_cmp_le_u32_e32 vcc, s4, v1 -; CGP-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; CGP-NEXT: v_subrev_i32_e32 v2, vcc, s4, v0 -; CGP-NEXT: v_subrev_i32_e32 v3, vcc, 0x12d8fb, v1 -; CGP-NEXT: v_cmp_le_u32_e32 vcc, s4, v0 -; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; CGP-NEXT: v_cmp_le_u32_e32 vcc, s4, v1 -; CGP-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; CGP-NEXT: v_mov_b32_e32 v2, 0x12d8fb +; CGP-NEXT: v_rcp_iflag_f32_e32 v3, 0x4996c7d8 +; CGP-NEXT: v_mov_b32_e32 v4, 0xffed2705 +; CGP-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 +; CGP-NEXT: v_cvt_u32_f32_e32 v3, v3 +; CGP-NEXT: v_mul_lo_u32 v4, v4, v3 +; CGP-NEXT: v_mul_hi_u32 v4, v3, v4 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v4 +; CGP-NEXT: v_mul_hi_u32 v4, v0, v3 +; CGP-NEXT: v_mul_hi_u32 v3, v1, v3 +; CGP-NEXT: v_mul_lo_u32 v4, v4, v2 +; CGP-NEXT: v_mul_lo_u32 v3, v3, v2 +; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v4 +; CGP-NEXT: v_sub_i32_e32 v1, vcc, v1, v3 +; CGP-NEXT: v_sub_i32_e32 v3, vcc, v0, v2 +; CGP-NEXT: v_sub_i32_e32 v4, vcc, v1, v2 +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2 +; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v2 +; CGP-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc +; CGP-NEXT: v_sub_i32_e32 v3, vcc, v0, v2 +; CGP-NEXT: v_sub_i32_e32 v4, vcc, v1, v2 +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2 +; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v2 +; CGP-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc ; CGP-NEXT: s_setpc_b64 s[30:31] %result = urem <2 x i32> %num, <i32 1235195, i32 1235195> ret <2 x i32> %result @@ -351,18 +358,18 @@ ; GISEL-LABEL: v_urem_v2i32_pow2_shl_denom: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: s_movk_i32 s4, 0x1000 -; GISEL-NEXT: s_mov_b32 s5, 0x4f7ffffe -; GISEL-NEXT: v_lshl_b32_e32 v2, s4, v2 -; GISEL-NEXT: v_lshl_b32_e32 v3, s4, v3 +; GISEL-NEXT: v_mov_b32_e32 v4, 0x1000 +; GISEL-NEXT: s_mov_b32 s4, 0x4f7ffffe +; GISEL-NEXT: v_lshlrev_b32_e32 v2, v2, v4 +; GISEL-NEXT: v_lshlrev_b32_e32 v3, v3, v4 ; GISEL-NEXT: v_cvt_f32_u32_e32 v4, v2 ; GISEL-NEXT: v_sub_i32_e32 v5, vcc, 0, v2 ; GISEL-NEXT: v_cvt_f32_u32_e32 v6, v3 ; GISEL-NEXT: v_sub_i32_e32 v7, vcc, 0, v3 ; GISEL-NEXT: v_rcp_iflag_f32_e32 v4, v4 ; GISEL-NEXT: v_rcp_iflag_f32_e32 v6, v6 -; GISEL-NEXT: v_mul_f32_e32 v4, s5, v4 -; GISEL-NEXT: v_mul_f32_e32 v6, s5, v6 +; GISEL-NEXT: v_mul_f32_e32 v4, s4, v4 +; GISEL-NEXT: v_mul_f32_e32 v6, s4, v6 ; GISEL-NEXT: v_cvt_u32_f32_e32 v4, v4 
; GISEL-NEXT: v_cvt_u32_f32_e32 v6, v6 ; GISEL-NEXT: v_mul_lo_u32 v5, v5, v4 @@ -394,9 +401,9 @@ ; CGP-LABEL: v_urem_v2i32_pow2_shl_denom: ; CGP: ; %bb.0: ; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CGP-NEXT: s_movk_i32 s4, 0x1000 -; CGP-NEXT: v_lshl_b32_e32 v2, s4, v2 -; CGP-NEXT: v_lshl_b32_e32 v3, s4, v3 +; CGP-NEXT: v_mov_b32_e32 v4, 0x1000 +; CGP-NEXT: v_lshlrev_b32_e32 v2, v2, v4 +; CGP-NEXT: v_lshlrev_b32_e32 v3, v3, v4 ; CGP-NEXT: v_cvt_f32_u32_e32 v4, v2 ; CGP-NEXT: v_sub_i32_e32 v5, vcc, 0, v2 ; CGP-NEXT: v_cvt_f32_u32_e32 v6, v3 @@ -449,9 +456,9 @@ ; GISEL-LABEL: v_urem_i32_24bit: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: s_mov_b32 s4, 0xffffff -; GISEL-NEXT: v_and_b32_e32 v0, s4, v0 -; GISEL-NEXT: v_and_b32_e32 v1, s4, v1 +; GISEL-NEXT: v_mov_b32_e32 v2, 0xffffff +; GISEL-NEXT: v_and_b32_e32 v0, v0, v2 +; GISEL-NEXT: v_and_b32_e32 v1, v1, v2 ; GISEL-NEXT: v_cvt_f32_u32_e32 v2, v1 ; GISEL-NEXT: v_sub_i32_e32 v3, vcc, 0, v1 ; GISEL-NEXT: v_rcp_iflag_f32_e32 v2, v2 @@ -474,9 +481,9 @@ ; CGP-LABEL: v_urem_i32_24bit: ; CGP: ; %bb.0: ; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CGP-NEXT: s_mov_b32 s4, 0xffffff -; CGP-NEXT: v_and_b32_e32 v0, s4, v0 -; CGP-NEXT: v_and_b32_e32 v1, s4, v1 +; CGP-NEXT: v_mov_b32_e32 v2, 0xffffff +; CGP-NEXT: v_and_b32_e32 v0, v0, v2 +; CGP-NEXT: v_and_b32_e32 v1, v1, v2 ; CGP-NEXT: v_cvt_f32_u32_e32 v2, v1 ; CGP-NEXT: v_sub_i32_e32 v3, vcc, 0, v1 ; CGP-NEXT: v_rcp_f32_e32 v2, v2 Index: llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll @@ -152,23 +152,23 @@ ; CHECK-NEXT: s_cbranch_execz BB0_4 ; CHECK-NEXT: ; %bb.3: ; CHECK-NEXT: v_cvt_f32_u32_e32 v0, v2 -; CHECK-NEXT: v_sub_i32_e32 v1, vcc, 0, v2 +; CHECK-NEXT: v_mov_b32_e32 v1, 0 +; CHECK-NEXT: v_sub_i32_e32 v3, vcc, 0, v2 ; CHECK-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; CHECK-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; CHECK-NEXT: v_cvt_u32_f32_e32 v0, v0 -; CHECK-NEXT: v_mul_lo_u32 v1, v1, v0 -; CHECK-NEXT: v_mul_hi_u32 v1, v0, v1 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; CHECK-NEXT: v_mul_lo_u32 v3, v3, v0 +; CHECK-NEXT: v_mul_hi_u32 v3, v0, v3 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v3 ; CHECK-NEXT: v_mul_hi_u32 v0, v4, v0 ; CHECK-NEXT: v_mul_lo_u32 v0, v0, v2 ; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v4, v0 -; CHECK-NEXT: v_sub_i32_e32 v1, vcc, v0, v2 +; CHECK-NEXT: v_sub_i32_e32 v3, vcc, v0, v2 ; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2 -; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; CHECK-NEXT: v_sub_i32_e32 v1, vcc, v0, v2 +; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc +; CHECK-NEXT: v_sub_i32_e32 v3, vcc, v0, v2 ; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2 -; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; CHECK-NEXT: v_mov_b32_e32 v1, 0 +; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; CHECK-NEXT: BB0_4: ; CHECK-NEXT: s_or_b64 exec, exec, s[4:5] ; CHECK-NEXT: s_setpc_b64 s[30:31] @@ -766,23 +766,23 @@ ; CGP-NEXT: s_cbranch_execz BB2_4 ; CGP-NEXT: ; %bb.3: ; CGP-NEXT: v_cvt_f32_u32_e32 v0, v4 -; CGP-NEXT: v_sub_i32_e32 v1, vcc, 0, v4 +; CGP-NEXT: v_mov_b32_e32 v1, 0 +; CGP-NEXT: v_sub_i32_e32 v2, vcc, 0, v4 ; CGP-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; CGP-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; CGP-NEXT: v_cvt_u32_f32_e32 v0, v0 -; CGP-NEXT: v_mul_lo_u32 v1, v1, v0 -; CGP-NEXT: v_mul_hi_u32 v1, v0, v1 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; CGP-NEXT: 
v_mul_lo_u32 v2, v2, v0 +; CGP-NEXT: v_mul_hi_u32 v2, v0, v2 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; CGP-NEXT: v_mul_hi_u32 v0, v10, v0 ; CGP-NEXT: v_mul_lo_u32 v0, v0, v4 ; CGP-NEXT: v_sub_i32_e32 v0, vcc, v10, v0 -; CGP-NEXT: v_sub_i32_e32 v1, vcc, v0, v4 +; CGP-NEXT: v_sub_i32_e32 v2, vcc, v0, v4 ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v4 -; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; CGP-NEXT: v_sub_i32_e32 v1, vcc, v0, v4 +; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; CGP-NEXT: v_sub_i32_e32 v2, vcc, v0, v4 ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v4 -; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; CGP-NEXT: v_mov_b32_e32 v1, 0 +; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; CGP-NEXT: BB2_4: ; CGP-NEXT: s_or_b64 exec, exec, s[4:5] ; CGP-NEXT: v_or_b32_e32 v3, v9, v7 @@ -927,23 +927,23 @@ ; CGP-NEXT: s_cbranch_execz BB2_8 ; CGP-NEXT: ; %bb.7: ; CGP-NEXT: v_cvt_f32_u32_e32 v2, v6 -; CGP-NEXT: v_sub_i32_e32 v3, vcc, 0, v6 +; CGP-NEXT: v_mov_b32_e32 v3, 0 +; CGP-NEXT: v_sub_i32_e32 v4, vcc, 0, v6 ; CGP-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; CGP-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 ; CGP-NEXT: v_cvt_u32_f32_e32 v2, v2 -; CGP-NEXT: v_mul_lo_u32 v3, v3, v2 -; CGP-NEXT: v_mul_hi_u32 v3, v2, v3 -; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v3 +; CGP-NEXT: v_mul_lo_u32 v4, v4, v2 +; CGP-NEXT: v_mul_hi_u32 v4, v2, v4 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; CGP-NEXT: v_mul_hi_u32 v2, v8, v2 ; CGP-NEXT: v_mul_lo_u32 v2, v2, v6 ; CGP-NEXT: v_sub_i32_e32 v2, vcc, v8, v2 -; CGP-NEXT: v_sub_i32_e32 v3, vcc, v2, v6 +; CGP-NEXT: v_sub_i32_e32 v4, vcc, v2, v6 ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v2, v6 -; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc -; CGP-NEXT: v_sub_i32_e32 v3, vcc, v2, v6 +; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; CGP-NEXT: v_sub_i32_e32 v4, vcc, v2, v6 ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v2, v6 -; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc -; CGP-NEXT: v_mov_b32_e32 v3, 0 +; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; CGP-NEXT: BB2_8: ; CGP-NEXT: s_or_b64 exec, exec, s[4:5] ; CGP-NEXT: s_setpc_b64 s[30:31] @@ -980,131 +980,131 @@ ; CHECK-LABEL: v_urem_i64_oddk_denom: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_cvt_f32_u32_e32 v2, 0x12d8fb +; CHECK-NEXT: v_mov_b32_e32 v2, 0x12d8fb ; CHECK-NEXT: v_cvt_f32_ubyte0_e32 v3, 0 -; CHECK-NEXT: s_mov_b32 s6, 0xffed2705 -; CHECK-NEXT: s_mov_b32 s7, 0x12d8fb +; CHECK-NEXT: v_mov_b32_e32 v4, 0xffed2705 ; CHECK-NEXT: s_bfe_i32 s4, -1, 0x10000 ; CHECK-NEXT: s_bfe_i32 s5, -1, 0x10000 -; CHECK-NEXT: v_mac_f32_e32 v2, 0x4f800000, v3 -; CHECK-NEXT: v_mov_b32_e32 v3, s4 -; CHECK-NEXT: v_mov_b32_e32 v4, s5 -; CHECK-NEXT: v_rcp_iflag_f32_e32 v2, v2 -; CHECK-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2 -; CHECK-NEXT: v_mul_f32_e32 v5, 0x2f800000, v2 +; CHECK-NEXT: v_cvt_f32_u32_e32 v5, v2 +; CHECK-NEXT: v_mov_b32_e32 v6, s4 +; CHECK-NEXT: v_mov_b32_e32 v7, s5 +; CHECK-NEXT: v_mac_f32_e32 v5, 0x4f800000, v3 +; CHECK-NEXT: v_rcp_iflag_f32_e32 v3, v5 +; CHECK-NEXT: v_mul_f32_e32 v3, 0x5f7ffffc, v3 +; CHECK-NEXT: v_mul_f32_e32 v5, 0x2f800000, v3 ; CHECK-NEXT: v_trunc_f32_e32 v5, v5 -; CHECK-NEXT: v_mac_f32_e32 v2, 0xcf800000, v5 +; CHECK-NEXT: v_mac_f32_e32 v3, 0xcf800000, v5 ; CHECK-NEXT: v_cvt_u32_f32_e32 v5, v5 -; CHECK-NEXT: v_cvt_u32_f32_e32 v2, v2 -; CHECK-NEXT: v_mul_lo_u32 v6, s6, v5 -; CHECK-NEXT: v_mul_lo_u32 v7, s6, v2 -; CHECK-NEXT: v_mul_lo_u32 v8, -1, v2 -; CHECK-NEXT: v_mul_hi_u32 v9, s6, v2 -; CHECK-NEXT: v_add_i32_e32 v6, vcc, v8, v6 -; CHECK-NEXT: v_mul_lo_u32 v8, v5, v7 -; CHECK-NEXT: 
v_mul_hi_u32 v10, v2, v7 -; CHECK-NEXT: v_mul_hi_u32 v7, v5, v7 -; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v9 -; CHECK-NEXT: v_mul_lo_u32 v9, v2, v6 -; CHECK-NEXT: v_mul_lo_u32 v11, v5, v6 -; CHECK-NEXT: v_mul_hi_u32 v12, v2, v6 -; CHECK-NEXT: v_mul_hi_u32 v6, v5, v6 -; CHECK-NEXT: v_add_i32_e32 v8, vcc, v8, v9 -; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v7, vcc, v11, v7 +; CHECK-NEXT: v_cvt_u32_f32_e32 v3, v3 +; CHECK-NEXT: v_mul_lo_u32 v8, v4, v5 +; CHECK-NEXT: v_mul_lo_u32 v9, v4, v3 +; CHECK-NEXT: v_mul_lo_u32 v10, -1, v3 +; CHECK-NEXT: v_mul_hi_u32 v11, v4, v3 +; CHECK-NEXT: v_add_i32_e32 v8, vcc, v10, v8 +; CHECK-NEXT: v_mul_lo_u32 v10, v5, v9 +; CHECK-NEXT: v_mul_hi_u32 v12, v3, v9 +; CHECK-NEXT: v_mul_hi_u32 v9, v5, v9 +; CHECK-NEXT: v_add_i32_e32 v8, vcc, v8, v11 +; CHECK-NEXT: v_mul_lo_u32 v11, v3, v8 +; CHECK-NEXT: v_mul_lo_u32 v13, v5, v8 +; CHECK-NEXT: v_mul_hi_u32 v14, v3, v8 +; CHECK-NEXT: v_mul_hi_u32 v8, v5, v8 +; CHECK-NEXT: v_add_i32_e32 v10, vcc, v10, v11 ; CHECK-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v8, vcc, v8, v10 -; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v7, vcc, v7, v12 +; CHECK-NEXT: v_add_i32_e32 v9, vcc, v13, v9 +; CHECK-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v10, vcc, v10, v12 ; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v8, vcc, v9, v8 -; CHECK-NEXT: v_add_i32_e32 v9, vcc, v11, v10 -; CHECK-NEXT: v_add_i32_e32 v7, vcc, v7, v8 -; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v8, vcc, v9, v8 -; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v8 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v7 -; CHECK-NEXT: v_addc_u32_e64 v7, s[4:5], v5, v6, vcc -; CHECK-NEXT: v_add_i32_e64 v5, s[4:5], v5, v6 -; CHECK-NEXT: v_mul_lo_u32 v6, s6, v2 -; CHECK-NEXT: v_mul_lo_u32 v8, -1, v2 -; CHECK-NEXT: v_mul_hi_u32 v9, s6, v2 -; CHECK-NEXT: v_mul_lo_u32 v10, s6, v7 -; CHECK-NEXT: v_mul_lo_u32 v11, v7, v6 -; CHECK-NEXT: v_mul_hi_u32 v12, v2, v6 -; CHECK-NEXT: v_mul_hi_u32 v6, v7, v6 -; CHECK-NEXT: v_add_i32_e64 v8, s[4:5], v8, v10 -; CHECK-NEXT: v_add_i32_e64 v8, s[4:5], v8, v9 -; CHECK-NEXT: v_mul_lo_u32 v9, v2, v8 -; CHECK-NEXT: v_mul_lo_u32 v10, v7, v8 -; CHECK-NEXT: v_mul_hi_u32 v13, v2, v8 -; CHECK-NEXT: v_mul_hi_u32 v7, v7, v8 -; CHECK-NEXT: v_add_i32_e64 v8, s[4:5], v11, v9 -; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[4:5] -; CHECK-NEXT: v_add_i32_e64 v6, s[4:5], v10, v6 +; CHECK-NEXT: v_add_i32_e32 v9, vcc, v9, v14 +; CHECK-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v10, vcc, v11, v10 +; CHECK-NEXT: v_add_i32_e32 v11, vcc, v13, v12 +; CHECK-NEXT: v_add_i32_e32 v9, vcc, v9, v10 +; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v10, vcc, v11, v10 +; CHECK-NEXT: v_add_i32_e32 v8, vcc, v8, v10 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v9 +; CHECK-NEXT: v_addc_u32_e64 v9, s[4:5], v5, v8, vcc +; CHECK-NEXT: v_add_i32_e64 v5, s[4:5], v5, v8 +; CHECK-NEXT: v_mul_lo_u32 v8, v4, v3 +; CHECK-NEXT: v_mul_lo_u32 v10, -1, v3 +; CHECK-NEXT: v_mul_hi_u32 v11, v4, v3 +; CHECK-NEXT: v_mul_lo_u32 v4, v4, v9 +; CHECK-NEXT: v_mul_lo_u32 v12, v9, v8 +; CHECK-NEXT: v_mul_hi_u32 v13, v3, v8 +; CHECK-NEXT: v_mul_hi_u32 v8, v9, v8 +; CHECK-NEXT: v_add_i32_e64 v4, s[4:5], v10, v4 +; CHECK-NEXT: v_add_i32_e64 v4, s[4:5], v4, v11 +; CHECK-NEXT: v_mul_lo_u32 v10, v3, v4 +; CHECK-NEXT: v_mul_lo_u32 v11, v9, v4 +; CHECK-NEXT: v_mul_hi_u32 v14, v3, v4 +; CHECK-NEXT: v_mul_hi_u32 v4, v9, v4 
+; CHECK-NEXT: v_add_i32_e64 v9, s[4:5], v12, v10 ; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[4:5] -; CHECK-NEXT: v_add_i32_e64 v8, s[4:5], v8, v12 -; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[4:5] -; CHECK-NEXT: v_add_i32_e64 v6, s[4:5], v6, v13 +; CHECK-NEXT: v_add_i32_e64 v8, s[4:5], v11, v8 ; CHECK-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[4:5] -; CHECK-NEXT: v_add_i32_e64 v8, s[4:5], v9, v8 -; CHECK-NEXT: v_add_i32_e64 v9, s[4:5], v10, v11 -; CHECK-NEXT: v_add_i32_e64 v6, s[4:5], v6, v8 -; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[4:5] -; CHECK-NEXT: v_add_i32_e64 v8, s[4:5], v9, v8 -; CHECK-NEXT: v_add_i32_e64 v7, s[4:5], v7, v8 -; CHECK-NEXT: v_addc_u32_e32 v5, vcc, v5, v7, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v6 -; CHECK-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; CHECK-NEXT: v_mul_lo_u32 v6, v1, v2 -; CHECK-NEXT: v_mul_hi_u32 v7, v0, v2 -; CHECK-NEXT: v_mul_hi_u32 v2, v1, v2 -; CHECK-NEXT: v_mul_lo_u32 v8, v0, v5 -; CHECK-NEXT: v_mul_lo_u32 v9, v1, v5 -; CHECK-NEXT: v_mul_hi_u32 v10, v0, v5 -; CHECK-NEXT: v_mul_hi_u32 v5, v1, v5 -; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v8 -; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v9, v2 +; CHECK-NEXT: v_add_i32_e64 v9, s[4:5], v9, v13 +; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[4:5] +; CHECK-NEXT: v_add_i32_e64 v8, s[4:5], v8, v14 +; CHECK-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] +; CHECK-NEXT: v_add_i32_e64 v9, s[4:5], v10, v9 +; CHECK-NEXT: v_add_i32_e64 v10, s[4:5], v11, v12 +; CHECK-NEXT: v_add_i32_e64 v8, s[4:5], v8, v9 +; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[4:5] +; CHECK-NEXT: v_add_i32_e64 v9, s[4:5], v10, v9 +; CHECK-NEXT: v_add_i32_e64 v4, s[4:5], v4, v9 +; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v5, v4, vcc +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v8 +; CHECK-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc +; CHECK-NEXT: v_mul_lo_u32 v5, v1, v3 +; CHECK-NEXT: v_mul_hi_u32 v8, v0, v3 +; CHECK-NEXT: v_mul_hi_u32 v3, v1, v3 +; CHECK-NEXT: v_mul_lo_u32 v9, v0, v4 +; CHECK-NEXT: v_mul_lo_u32 v10, v1, v4 +; CHECK-NEXT: v_mul_hi_u32 v11, v0, v4 +; CHECK-NEXT: v_mul_hi_u32 v4, v1, v4 +; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v9 ; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v7 -; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v10 -; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v6, vcc, v8, v6 -; CHECK-NEXT: v_add_i32_e32 v7, vcc, v9, v7 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v6 -; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v6, vcc, v7, v6 -; CHECK-NEXT: v_mul_lo_u32 v7, s7, v2 -; CHECK-NEXT: v_mul_lo_u32 v8, 0, v2 -; CHECK-NEXT: v_mul_hi_u32 v2, s7, v2 -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v6 -; CHECK-NEXT: v_mul_lo_u32 v5, s7, v5 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v10, v3 +; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v8 +; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v11 +; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v5, vcc, v9, v5 +; CHECK-NEXT: v_add_i32_e32 v8, vcc, v10, v8 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v5 +; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v8, v5 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v5, v2 -; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v7 -; CHECK-NEXT: v_subb_u32_e64 v5, s[4:5], v1, v2, vcc -; CHECK-NEXT: v_sub_i32_e64 v1, s[4:5], v1, v2 -; CHECK-NEXT: v_cmp_le_u32_e64 s[4:5], 
s7, v0 -; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[4:5] -; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v5 -; CHECK-NEXT: v_cndmask_b32_e64 v2, v3, v2, s[4:5] +; CHECK-NEXT: v_mul_lo_u32 v8, v2, v3 +; CHECK-NEXT: v_mul_lo_u32 v9, 0, v3 +; CHECK-NEXT: v_mul_hi_u32 v3, v2, v3 +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v5 +; CHECK-NEXT: v_mul_lo_u32 v4, v2, v4 +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v9, v4 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v4, v3 +; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v8 +; CHECK-NEXT: v_subb_u32_e64 v4, s[4:5], v1, v3, vcc +; CHECK-NEXT: v_sub_i32_e64 v1, s[4:5], v1, v3 +; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v2 +; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[4:5] +; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v4 +; CHECK-NEXT: v_cndmask_b32_e64 v3, v6, v3, s[4:5] ; CHECK-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc -; CHECK-NEXT: v_subrev_i32_e32 v3, vcc, s7, v0 +; CHECK-NEXT: v_sub_i32_e32 v5, vcc, v0, v2 ; CHECK-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc -; CHECK-NEXT: v_cmp_le_u32_e32 vcc, s7, v3 +; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v5, v2 ; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc ; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; CHECK-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc -; CHECK-NEXT: v_subrev_i32_e32 v6, vcc, s7, v3 +; CHECK-NEXT: v_cndmask_b32_e32 v6, v7, v6, vcc +; CHECK-NEXT: v_sub_i32_e32 v2, vcc, v5, v2 ; CHECK-NEXT: v_subbrev_u32_e32 v7, vcc, 0, v1, vcc -; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; CHECK-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc +; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; CHECK-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc ; CHECK-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc -; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; CHECK-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; CHECK-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc ; CHECK-NEXT: s_setpc_b64 s[30:31] %result = urem i64 %num, 1235195 ret i64 %result @@ -1114,514 +1114,515 @@ ; GISEL-LABEL: v_urem_v2i64_oddk_denom: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: s_mov_b32 s12, 0x12d8fb -; GISEL-NEXT: v_cvt_f32_u32_e32 v4, s12 -; GISEL-NEXT: s_sub_u32 s8, 0, s12 -; GISEL-NEXT: s_cselect_b32 s4, 1, 0 -; GISEL-NEXT: v_cvt_f32_ubyte0_e32 v5, 0 -; GISEL-NEXT: v_mov_b32_e32 v6, v4 -; GISEL-NEXT: s_and_b32 s4, s4, 1 -; GISEL-NEXT: v_mac_f32_e32 v4, 0x4f800000, v5 -; GISEL-NEXT: v_mac_f32_e32 v6, 0x4f800000, v5 -; GISEL-NEXT: v_rcp_iflag_f32_e32 v4, v4 -; GISEL-NEXT: v_rcp_iflag_f32_e32 v5, v6 -; GISEL-NEXT: s_cmp_lg_u32 s4, 0 +; GISEL-NEXT: v_mov_b32_e32 v4, 0x12d8fb +; GISEL-NEXT: s_mov_b32 s4, 0x12d8fb +; GISEL-NEXT: v_cvt_f32_u32_e32 v5, v4 +; GISEL-NEXT: s_sub_u32 s8, 0, s4 +; GISEL-NEXT: s_cselect_b32 s5, 1, 0 +; GISEL-NEXT: v_cvt_f32_ubyte0_e32 v6, 0 +; GISEL-NEXT: v_mov_b32_e32 v7, v5 +; GISEL-NEXT: s_and_b32 s5, s5, 1 +; GISEL-NEXT: v_mac_f32_e32 v5, 0x4f800000, v6 +; GISEL-NEXT: v_mac_f32_e32 v7, 0x4f800000, v6 +; GISEL-NEXT: v_rcp_iflag_f32_e32 v5, v5 +; GISEL-NEXT: v_rcp_iflag_f32_e32 v6, v7 +; GISEL-NEXT: s_cmp_lg_u32 s5, 0 ; GISEL-NEXT: s_subb_u32 s9, 0, 0 ; GISEL-NEXT: s_bfe_i32 s10, -1, 0x10000 -; GISEL-NEXT: s_bfe_i32 s11, -1, 0x10000 -; GISEL-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 +; GISEL-NEXT: s_bfe_i32 s12, -1, 0x10000 ; GISEL-NEXT: v_mul_f32_e32 v5, 0x5f7ffffc, v5 -; GISEL-NEXT: v_mul_f32_e32 v6, 0x2f800000, v4 -; GISEL-NEXT: s_sub_u32 s13, 0, s12 -; GISEL-NEXT: s_cselect_b32 s4, 1, 0 +; GISEL-NEXT: v_mul_f32_e32 v6, 
0x5f7ffffc, v6 ; GISEL-NEXT: v_mul_f32_e32 v7, 0x2f800000, v5 -; GISEL-NEXT: v_trunc_f32_e32 v6, v6 -; GISEL-NEXT: s_and_b32 s4, s4, 1 +; GISEL-NEXT: s_sub_u32 s11, 0, s4 +; GISEL-NEXT: s_cselect_b32 s4, 1, 0 +; GISEL-NEXT: v_mul_f32_e32 v8, 0x2f800000, v6 ; GISEL-NEXT: v_trunc_f32_e32 v7, v7 -; GISEL-NEXT: v_mac_f32_e32 v4, 0xcf800000, v6 -; GISEL-NEXT: v_cvt_u32_f32_e32 v6, v6 +; GISEL-NEXT: s_and_b32 s4, s4, 1 +; GISEL-NEXT: v_trunc_f32_e32 v8, v8 ; GISEL-NEXT: v_mac_f32_e32 v5, 0xcf800000, v7 ; GISEL-NEXT: v_cvt_u32_f32_e32 v7, v7 -; GISEL-NEXT: v_cvt_u32_f32_e32 v4, v4 -; GISEL-NEXT: s_cmp_lg_u32 s4, 0 -; GISEL-NEXT: s_subb_u32 s6, 0, 0 -; GISEL-NEXT: v_mul_lo_u32 v8, s13, v6 +; GISEL-NEXT: v_mac_f32_e32 v6, 0xcf800000, v8 +; GISEL-NEXT: v_cvt_u32_f32_e32 v8, v8 ; GISEL-NEXT: v_cvt_u32_f32_e32 v5, v5 -; GISEL-NEXT: v_mul_lo_u32 v9, s8, v7 -; GISEL-NEXT: v_mul_lo_u32 v10, s13, v4 -; GISEL-NEXT: v_mul_lo_u32 v11, s6, v4 -; GISEL-NEXT: v_mul_hi_u32 v12, s13, v4 -; GISEL-NEXT: v_mul_lo_u32 v13, s8, v5 -; GISEL-NEXT: v_mul_lo_u32 v14, s9, v5 -; GISEL-NEXT: v_mul_hi_u32 v15, s8, v5 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v11, v8 -; GISEL-NEXT: v_mul_lo_u32 v11, v6, v10 -; GISEL-NEXT: v_mul_hi_u32 v16, v4, v10 -; GISEL-NEXT: v_mul_hi_u32 v10, v6, v10 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v14, v9 -; GISEL-NEXT: v_mul_lo_u32 v14, v7, v13 -; GISEL-NEXT: v_mul_hi_u32 v17, v5, v13 -; GISEL-NEXT: v_mul_hi_u32 v13, v7, v13 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v12 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v15 -; GISEL-NEXT: v_mul_lo_u32 v12, v4, v8 -; GISEL-NEXT: v_mul_lo_u32 v15, v6, v8 -; GISEL-NEXT: v_mul_hi_u32 v18, v4, v8 -; GISEL-NEXT: v_mul_hi_u32 v8, v6, v8 -; GISEL-NEXT: v_mul_lo_u32 v19, v5, v9 -; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v19 -; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v17 -; GISEL-NEXT: v_mul_lo_u32 v14, v7, v9 -; GISEL-NEXT: v_mul_hi_u32 v17, v5, v9 +; GISEL-NEXT: s_cmp_lg_u32 s4, 0 +; GISEL-NEXT: s_subb_u32 s13, 0, 0 +; GISEL-NEXT: v_mul_lo_u32 v9, s11, v7 +; GISEL-NEXT: v_cvt_u32_f32_e32 v6, v6 +; GISEL-NEXT: v_mul_lo_u32 v10, s8, v8 +; GISEL-NEXT: v_mul_lo_u32 v11, s11, v5 +; GISEL-NEXT: v_mul_lo_u32 v12, s13, v5 +; GISEL-NEXT: v_mul_hi_u32 v13, s11, v5 +; GISEL-NEXT: v_mul_lo_u32 v14, s8, v6 +; GISEL-NEXT: v_mul_lo_u32 v15, s9, v6 +; GISEL-NEXT: v_mul_hi_u32 v16, s8, v6 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v12, v9 +; GISEL-NEXT: v_mul_lo_u32 v12, v7, v11 +; GISEL-NEXT: v_mul_hi_u32 v17, v5, v11 +; GISEL-NEXT: v_mul_hi_u32 v11, v7, v11 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v15, v10 +; GISEL-NEXT: v_mul_lo_u32 v15, v8, v14 +; GISEL-NEXT: v_mul_hi_u32 v18, v6, v14 +; GISEL-NEXT: v_mul_hi_u32 v14, v8, v14 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v13 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v16 +; GISEL-NEXT: v_mul_lo_u32 v13, v5, v9 +; GISEL-NEXT: v_mul_lo_u32 v16, v7, v9 +; GISEL-NEXT: v_mul_hi_u32 v19, v5, v9 ; GISEL-NEXT: v_mul_hi_u32 v9, v7, v9 -; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v11, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], v15, v10 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v13 +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v17 +; GISEL-NEXT: v_mul_lo_u32 v12, v6, v10 +; GISEL-NEXT: v_mul_lo_u32 v17, v8, v10 +; GISEL-NEXT: v_add_i32_e64 v12, s[4:5], v15, v12 ; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v13, s[4:5], v14, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, 
s[4:5] -; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v11, v16 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], v10, v18 -; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v12, s[4:5], v12, v18 +; GISEL-NEXT: v_mul_hi_u32 v12, v6, v10 +; GISEL-NEXT: v_mul_hi_u32 v10, v8, v10 +; GISEL-NEXT: v_add_i32_e64 v11, s[6:7], v16, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[6:7] +; GISEL-NEXT: v_add_i32_e64 v14, s[6:7], v17, v14 +; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, s[6:7] ; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v17 -; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v15, v16 -; GISEL-NEXT: v_add_i32_e32 v15, vcc, v19, v18 -; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v17 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v15 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v19 +; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v18 +; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v14, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v16, vcc, v16, v19 +; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v18 +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v17, v14 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13 +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v15 ; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v14, v15 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v11 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v12 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v10 -; GISEL-NEXT: v_addc_u32_e64 v10, s[4:5], v6, v8, vcc -; GISEL-NEXT: v_mul_lo_u32 v11, s13, v4 -; GISEL-NEXT: v_mul_lo_u32 v12, s6, v4 -; GISEL-NEXT: v_mul_hi_u32 v14, s13, v4 -; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v5, v13 -; GISEL-NEXT: v_addc_u32_e64 v13, s[6:7], v7, v9, s[4:5] -; GISEL-NEXT: v_mul_lo_u32 v15, s8, v5 -; GISEL-NEXT: v_mul_lo_u32 v16, s9, v5 -; GISEL-NEXT: v_mul_hi_u32 v17, s8, v5 -; GISEL-NEXT: v_mul_lo_u32 v18, s8, v13 -; GISEL-NEXT: v_mul_lo_u32 v19, v13, v15 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v16, v13 +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v15 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v13 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v14 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v11 +; GISEL-NEXT: v_addc_u32_e64 v11, s[4:5], v7, v9, vcc +; GISEL-NEXT: v_mul_lo_u32 v13, s11, v5 +; GISEL-NEXT: v_mul_lo_u32 v14, s13, v5 +; GISEL-NEXT: v_add_i32_e64 v6, s[4:5], v6, v12 +; GISEL-NEXT: v_addc_u32_e64 v12, s[6:7], v8, v10, s[4:5] +; GISEL-NEXT: v_mul_lo_u32 v15, s8, v6 +; GISEL-NEXT: v_mul_lo_u32 v16, s9, v6 +; GISEL-NEXT: v_mul_hi_u32 v17, s8, v6 +; GISEL-NEXT: v_mul_lo_u32 v18, s8, v12 +; GISEL-NEXT: v_mul_lo_u32 v19, v12, v15 ; GISEL-NEXT: v_add_i32_e64 v16, s[6:7], v16, v18 -; GISEL-NEXT: v_mul_hi_u32 v18, v5, v15 +; GISEL-NEXT: v_mul_hi_u32 v18, v6, v15 ; GISEL-NEXT: v_add_i32_e64 v16, s[6:7], v16, v17 -; GISEL-NEXT: v_mul_lo_u32 v17, v5, v16 +; GISEL-NEXT: v_mul_lo_u32 v17, v6, v16 ; GISEL-NEXT: v_add_i32_e64 v17, s[6:7], v19, v17 ; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[6:7] ; GISEL-NEXT: v_add_i32_e64 v17, s[6:7], v17, v18 -; GISEL-NEXT: v_mul_lo_u32 v17, s13, v10 -; GISEL-NEXT: v_mul_lo_u32 v18, v10, 
v11 -; GISEL-NEXT: v_add_i32_e64 v12, s[8:9], v12, v17 -; GISEL-NEXT: v_mul_hi_u32 v17, v4, v11 -; GISEL-NEXT: v_add_i32_e64 v12, s[8:9], v12, v14 -; GISEL-NEXT: v_mul_lo_u32 v14, v4, v12 -; GISEL-NEXT: v_add_i32_e64 v14, s[8:9], v18, v14 -; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[8:9] +; GISEL-NEXT: v_mul_hi_u32 v17, s11, v5 +; GISEL-NEXT: v_mul_lo_u32 v18, s11, v11 +; GISEL-NEXT: v_add_i32_e64 v14, s[8:9], v14, v18 +; GISEL-NEXT: v_mul_lo_u32 v18, v11, v13 ; GISEL-NEXT: v_add_i32_e64 v14, s[8:9], v14, v17 -; GISEL-NEXT: v_mov_b32_e32 v14, s10 -; GISEL-NEXT: v_mov_b32_e32 v17, s11 -; GISEL-NEXT: s_bfe_i32 s13, -1, 0x10000 -; GISEL-NEXT: s_bfe_i32 s14, -1, 0x10000 -; GISEL-NEXT: v_add_i32_e64 v6, s[10:11], v6, v8 -; GISEL-NEXT: v_mov_b32_e32 v8, s13 +; GISEL-NEXT: v_mul_lo_u32 v17, v5, v14 +; GISEL-NEXT: v_add_i32_e64 v17, s[8:9], v18, v17 +; GISEL-NEXT: v_mul_hi_u32 v18, v5, v13 +; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[8:9] +; GISEL-NEXT: v_add_i32_e64 v17, s[8:9], v17, v18 +; GISEL-NEXT: v_mov_b32_e32 v17, s10 ; GISEL-NEXT: v_add_i32_e64 v7, s[10:11], v7, v9 -; GISEL-NEXT: v_mul_hi_u32 v9, v10, v11 -; GISEL-NEXT: v_mul_hi_u32 v11, v13, v15 +; GISEL-NEXT: v_mov_b32_e32 v9, s12 +; GISEL-NEXT: s_bfe_i32 s12, -1, 0x10000 +; GISEL-NEXT: s_bfe_i32 s13, -1, 0x10000 +; GISEL-NEXT: v_add_i32_e64 v8, s[10:11], v8, v10 +; GISEL-NEXT: v_mul_hi_u32 v10, v11, v13 +; GISEL-NEXT: v_mul_hi_u32 v13, v12, v15 ; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[8:9] -; GISEL-NEXT: v_add_i32_e64 v15, s[8:9], v18, v15 -; GISEL-NEXT: v_mul_lo_u32 v18, v10, v12 -; GISEL-NEXT: v_mul_hi_u32 v10, v10, v12 -; GISEL-NEXT: v_mul_hi_u32 v12, v4, v12 -; GISEL-NEXT: v_add_i32_e64 v9, s[8:9], v18, v9 +; GISEL-NEXT: v_add_i32_e64 v15, s[8:9], v20, v15 +; GISEL-NEXT: v_mul_lo_u32 v18, v11, v14 +; GISEL-NEXT: v_mul_hi_u32 v11, v11, v14 +; GISEL-NEXT: v_mul_hi_u32 v14, v5, v14 +; GISEL-NEXT: v_add_i32_e64 v10, s[8:9], v18, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[8:9] -; GISEL-NEXT: v_add_i32_e64 v9, s[8:9], v9, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[8:9] -; GISEL-NEXT: v_add_i32_e64 v12, s[8:9], v18, v12 +; GISEL-NEXT: v_add_i32_e64 v10, s[8:9], v10, v14 +; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[8:9] +; GISEL-NEXT: v_add_i32_e64 v14, s[8:9], v18, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[6:7] ; GISEL-NEXT: v_add_i32_e64 v18, s[6:7], v19, v18 -; GISEL-NEXT: v_mul_lo_u32 v19, v13, v16 -; GISEL-NEXT: v_mul_hi_u32 v13, v13, v16 -; GISEL-NEXT: v_mul_hi_u32 v16, v5, v16 -; GISEL-NEXT: v_add_i32_e64 v11, s[6:7], v19, v11 +; GISEL-NEXT: v_mul_lo_u32 v19, v12, v16 +; GISEL-NEXT: v_mul_hi_u32 v12, v12, v16 +; GISEL-NEXT: v_mul_hi_u32 v16, v6, v16 +; GISEL-NEXT: v_add_i32_e64 v13, s[6:7], v19, v13 ; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[6:7] -; GISEL-NEXT: v_add_i32_e64 v11, s[6:7], v11, v16 +; GISEL-NEXT: v_add_i32_e64 v13, s[6:7], v13, v16 ; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[6:7] ; GISEL-NEXT: v_add_i32_e64 v16, s[6:7], v19, v16 -; GISEL-NEXT: v_mov_b32_e32 v19, s14 -; GISEL-NEXT: v_add_i32_e64 v9, s[6:7], v9, v15 +; GISEL-NEXT: v_mov_b32_e32 v19, s12 +; GISEL-NEXT: v_add_i32_e64 v10, s[6:7], v10, v15 ; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[6:7] -; GISEL-NEXT: v_add_i32_e64 v11, s[6:7], v11, v18 +; GISEL-NEXT: v_add_i32_e64 v14, s[6:7], v14, v15 +; GISEL-NEXT: v_mov_b32_e32 v15, s13 +; GISEL-NEXT: v_add_i32_e64 v13, s[6:7], v13, v18 ; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[6:7] -; GISEL-NEXT: v_add_i32_e64 v12, s[6:7], v12, v15 -; GISEL-NEXT: v_add_i32_e64 v15, s[6:7], v16, v18 
-; GISEL-NEXT: v_add_i32_e64 v10, s[6:7], v10, v12 -; GISEL-NEXT: v_add_i32_e64 v12, s[6:7], v13, v15 -; GISEL-NEXT: v_addc_u32_e32 v6, vcc, v6, v10, vcc -; GISEL-NEXT: v_addc_u32_e64 v7, vcc, v7, v12, s[4:5] -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v9 -; GISEL-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc -; GISEL-NEXT: v_mul_lo_u32 v9, v3, v4 -; GISEL-NEXT: v_mul_hi_u32 v10, v2, v4 -; GISEL-NEXT: v_mul_hi_u32 v4, v3, v4 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v11 +; GISEL-NEXT: v_add_i32_e64 v16, s[6:7], v16, v18 +; GISEL-NEXT: v_add_i32_e64 v11, s[6:7], v11, v14 +; GISEL-NEXT: v_add_i32_e64 v12, s[6:7], v12, v16 +; GISEL-NEXT: v_addc_u32_e32 v7, vcc, v7, v11, vcc +; GISEL-NEXT: v_addc_u32_e64 v8, vcc, v8, v12, s[4:5] +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v10 ; GISEL-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc -; GISEL-NEXT: v_mul_lo_u32 v11, v1, v5 -; GISEL-NEXT: v_mul_hi_u32 v12, v0, v5 -; GISEL-NEXT: v_mul_hi_u32 v5, v1, v5 -; GISEL-NEXT: v_mul_lo_u32 v13, v2, v6 -; GISEL-NEXT: v_mul_lo_u32 v15, v3, v6 -; GISEL-NEXT: v_mul_hi_u32 v16, v2, v6 -; GISEL-NEXT: v_mul_hi_u32 v6, v3, v6 -; GISEL-NEXT: v_mul_lo_u32 v18, v0, v7 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v18 -; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12 -; GISEL-NEXT: v_mul_lo_u32 v11, v1, v7 -; GISEL-NEXT: v_mul_hi_u32 v12, v0, v7 -; GISEL-NEXT: v_mul_hi_u32 v7, v1, v7 -; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v9, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v4, s[4:5], v15, v4 -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v11, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v9, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v4, s[4:5], v4, v16 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[4:5] +; GISEL-NEXT: v_mul_lo_u32 v10, v3, v5 +; GISEL-NEXT: v_mul_hi_u32 v11, v2, v5 +; GISEL-NEXT: v_mul_hi_u32 v5, v3, v5 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v13 +; GISEL-NEXT: v_addc_u32_e32 v8, vcc, 0, v8, vcc +; GISEL-NEXT: v_mul_lo_u32 v12, v1, v6 +; GISEL-NEXT: v_mul_hi_u32 v13, v0, v6 +; GISEL-NEXT: v_mul_hi_u32 v6, v1, v6 +; GISEL-NEXT: v_mul_lo_u32 v14, v2, v7 +; GISEL-NEXT: v_mul_lo_u32 v16, v3, v7 +; GISEL-NEXT: v_mul_hi_u32 v18, v2, v7 +; GISEL-NEXT: v_mul_hi_u32 v7, v3, v7 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v14 +; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v11 +; GISEL-NEXT: v_mul_lo_u32 v10, v0, v8 +; GISEL-NEXT: v_mul_lo_u32 v11, v1, v8 +; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], v12, v10 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], v10, v13 +; GISEL-NEXT: v_mul_hi_u32 v10, v0, v8 +; GISEL-NEXT: v_mul_hi_u32 v8, v1, v8 +; GISEL-NEXT: v_add_i32_e64 v5, s[6:7], v16, v5 +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[6:7] +; GISEL-NEXT: v_add_i32_e64 v6, s[6:7], v11, v6 +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[6:7] ; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v13, v9 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v15, v10 -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v18, v16 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v9 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v13 +; GISEL-NEXT: v_add_i32_e32 v5, 
vcc, v5, v18 +; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v16 +; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v10 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v18 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v16 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v14 +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v12 ; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v9 -; GISEL-NEXT: v_mul_lo_u32 v10, s12, v4 -; GISEL-NEXT: v_mul_lo_u32 v13, 0, v4 -; GISEL-NEXT: v_mul_hi_u32 v4, s12, v4 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12 -; GISEL-NEXT: v_mul_lo_u32 v12, s12, v5 -; GISEL-NEXT: v_mul_lo_u32 v15, 0, v5 -; GISEL-NEXT: v_mul_hi_u32 v5, s12, v5 -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v9 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v13, v11 +; GISEL-NEXT: v_mul_lo_u32 v13, v4, v5 +; GISEL-NEXT: v_mul_lo_u32 v14, 0, v5 +; GISEL-NEXT: v_mul_hi_u32 v5, v4, v5 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v12 +; GISEL-NEXT: v_mul_lo_u32 v12, v4, v6 +; GISEL-NEXT: v_mul_lo_u32 v16, 0, v6 +; GISEL-NEXT: v_mul_hi_u32 v6, v4, v6 ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v11 -; GISEL-NEXT: v_mul_lo_u32 v6, s12, v6 -; GISEL-NEXT: v_mul_lo_u32 v7, s12, v7 -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v13, v6 -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v15, v7 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v6, v4 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v10 +; GISEL-NEXT: v_mul_lo_u32 v7, v4, v7 +; GISEL-NEXT: v_mul_lo_u32 v8, v4, v8 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v14, v7 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v16, v8 ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v7, v5 -; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v10 -; GISEL-NEXT: v_subb_u32_e64 v6, s[4:5], v3, v4, vcc -; GISEL-NEXT: v_sub_i32_e64 v3, s[4:5], v3, v4 -; GISEL-NEXT: v_cmp_le_u32_e64 s[4:5], s12, v2 -; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[4:5] +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v8, v6 +; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v13 +; GISEL-NEXT: v_subb_u32_e64 v7, s[4:5], v3, v5, vcc +; GISEL-NEXT: v_sub_i32_e64 v3, s[4:5], v3, v5 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v4 +; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[4:5] ; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], v0, v12 -; GISEL-NEXT: v_subb_u32_e64 v7, s[6:7], v1, v5, s[4:5] -; GISEL-NEXT: v_sub_i32_e64 v1, s[6:7], v1, v5 -; GISEL-NEXT: v_cmp_le_u32_e64 s[6:7], s12, v0 -; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[6:7] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v6 -; GISEL-NEXT: v_cndmask_b32_e64 v4, v8, v4, s[6:7] +; GISEL-NEXT: v_subb_u32_e64 v8, s[6:7], v1, v6, s[4:5] +; GISEL-NEXT: v_sub_i32_e64 v1, s[6:7], v1, v6 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v0, v4 +; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[6:7] +; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v5, v19, v5, s[6:7] ; GISEL-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc -; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7 -; GISEL-NEXT: v_cndmask_b32_e32 v5, v14, v5, vcc +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v8 +; GISEL-NEXT: v_cndmask_b32_e32 v6, v17, v6, vcc ; GISEL-NEXT: v_subbrev_u32_e64 v1, vcc, 0, v1, s[4:5] -; GISEL-NEXT: v_subrev_i32_e32 v8, vcc, s12, v2 +; GISEL-NEXT: v_sub_i32_e32 v10, vcc, v2, v4 ; GISEL-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc -; GISEL-NEXT: v_cmp_le_u32_e32 vcc, s12, v8 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, -1, vcc -; 
GISEL-NEXT: v_subrev_i32_e32 v10, vcc, s12, v0 -; GISEL-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc -; GISEL-NEXT: v_cmp_le_u32_e32 vcc, s12, v10 +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v10, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, -1, vcc +; GISEL-NEXT: v_sub_i32_e32 v12, vcc, v0, v4 +; GISEL-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v12, v4 +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, -1, vcc ; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 -; GISEL-NEXT: v_cndmask_b32_e32 v9, v19, v9, vcc -; GISEL-NEXT: v_subrev_i32_e32 v12, vcc, s12, v8 -; GISEL-NEXT: v_subbrev_u32_e32 v13, vcc, 0, v3, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v11, v15, v11, vcc +; GISEL-NEXT: v_sub_i32_e32 v14, vcc, v10, v4 +; GISEL-NEXT: v_subbrev_u32_e32 v15, vcc, 0, v3, vcc ; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GISEL-NEXT: v_cndmask_b32_e32 v11, v17, v11, vcc -; GISEL-NEXT: v_subrev_i32_e32 v14, vcc, s12, v10 -; GISEL-NEXT: v_subbrev_u32_e32 v15, vcc, 0, v1, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9 -; GISEL-NEXT: v_cndmask_b32_e32 v8, v8, v12, vcc -; GISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v9, v10, v14, s[4:5] -; GISEL-NEXT: v_cndmask_b32_e32 v3, v3, v13, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v1, v1, v15, s[4:5] -; GISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v0, v0, v9, s[4:5] -; GISEL-NEXT: v_cndmask_b32_e64 v1, v7, v1, s[4:5] -; GISEL-NEXT: v_cndmask_b32_e32 v3, v6, v3, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v9, v9, v13, vcc +; GISEL-NEXT: v_sub_i32_e32 v4, vcc, v12, v4 +; GISEL-NEXT: v_subbrev_u32_e32 v13, vcc, 0, v1, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 +; GISEL-NEXT: v_cndmask_b32_e32 v10, v10, v14, vcc +; GISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v4, v12, v4, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e32 v3, v3, v15, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 +; GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v10, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v1, v1, v13, s[4:5] +; GISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v6 +; GISEL-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v1, v8, v1, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc ; GISEL-NEXT: s_setpc_b64 s[30:31] ; ; CGP-LABEL: v_urem_v2i64_oddk_denom: ; CGP: ; %bb.0: ; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CGP-NEXT: v_cvt_f32_u32_e32 v4, 0x12d8fb -; CGP-NEXT: v_cvt_f32_ubyte0_e32 v5, 0 -; CGP-NEXT: s_mov_b32 s8, 0xffed2705 -; CGP-NEXT: s_mov_b32 s12, 0x12d8fb -; CGP-NEXT: s_bfe_i32 s10, -1, 0x10000 -; CGP-NEXT: s_bfe_i32 s11, -1, 0x10000 -; CGP-NEXT: s_bfe_i32 s13, -1, 0x10000 -; CGP-NEXT: s_bfe_i32 s14, -1, 0x10000 -; CGP-NEXT: v_mov_b32_e32 v6, v4 -; CGP-NEXT: v_mac_f32_e32 v4, 0x4f800000, v5 -; CGP-NEXT: v_mac_f32_e32 v6, 0x4f800000, v5 -; CGP-NEXT: v_rcp_iflag_f32_e32 v4, v4 -; CGP-NEXT: v_rcp_iflag_f32_e32 v5, v6 -; CGP-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 -; CGP-NEXT: v_mul_f32_e32 v5, 0x5f7ffffc, v5 -; CGP-NEXT: v_mul_f32_e32 v6, 0x2f800000, v4 -; CGP-NEXT: v_mul_f32_e32 v7, 0x2f800000, v5 -; CGP-NEXT: v_trunc_f32_e32 v6, v6 -; CGP-NEXT: v_trunc_f32_e32 v7, v7 -; CGP-NEXT: v_mac_f32_e32 v4, 0xcf800000, v6 +; CGP-NEXT: v_mov_b32_e32 v4, 0x12d8fb +; CGP-NEXT: v_cvt_f32_ubyte0_e32 v6, 0 +; CGP-NEXT: v_cvt_f32_u32_e32 v7, v4 +; CGP-NEXT: v_mov_b32_e32 v5, 0xffed2705 +; CGP-NEXT: v_mov_b32_e32 v8, v7 +; CGP-NEXT: v_mac_f32_e32 v7, 0x4f800000, v6 +; CGP-NEXT: v_mac_f32_e32 v8, 
0x4f800000, v6 +; CGP-NEXT: v_rcp_iflag_f32_e32 v6, v7 +; CGP-NEXT: v_rcp_iflag_f32_e32 v7, v8 +; CGP-NEXT: v_mul_f32_e32 v6, 0x5f7ffffc, v6 +; CGP-NEXT: v_mul_f32_e32 v7, 0x5f7ffffc, v7 +; CGP-NEXT: v_mul_f32_e32 v8, 0x2f800000, v6 +; CGP-NEXT: v_mul_f32_e32 v9, 0x2f800000, v7 +; CGP-NEXT: v_trunc_f32_e32 v8, v8 +; CGP-NEXT: v_trunc_f32_e32 v9, v9 +; CGP-NEXT: v_mac_f32_e32 v6, 0xcf800000, v8 +; CGP-NEXT: v_cvt_u32_f32_e32 v8, v8 +; CGP-NEXT: v_mac_f32_e32 v7, 0xcf800000, v9 +; CGP-NEXT: v_cvt_u32_f32_e32 v9, v9 ; CGP-NEXT: v_cvt_u32_f32_e32 v6, v6 -; CGP-NEXT: v_mac_f32_e32 v5, 0xcf800000, v7 +; CGP-NEXT: v_mul_lo_u32 v10, v5, v8 ; CGP-NEXT: v_cvt_u32_f32_e32 v7, v7 -; CGP-NEXT: v_cvt_u32_f32_e32 v4, v4 -; CGP-NEXT: v_mul_lo_u32 v8, s8, v6 -; CGP-NEXT: v_cvt_u32_f32_e32 v5, v5 -; CGP-NEXT: v_mul_lo_u32 v9, s8, v7 -; CGP-NEXT: v_mul_lo_u32 v10, s8, v4 -; CGP-NEXT: v_mul_lo_u32 v11, -1, v4 -; CGP-NEXT: v_mul_hi_u32 v12, s8, v4 -; CGP-NEXT: v_mul_lo_u32 v13, s8, v5 -; CGP-NEXT: v_mul_lo_u32 v14, -1, v5 -; CGP-NEXT: v_mul_hi_u32 v15, s8, v5 -; CGP-NEXT: v_add_i32_e32 v8, vcc, v11, v8 -; CGP-NEXT: v_mul_lo_u32 v11, v6, v10 -; CGP-NEXT: v_mul_hi_u32 v16, v4, v10 -; CGP-NEXT: v_mul_hi_u32 v10, v6, v10 -; CGP-NEXT: v_add_i32_e32 v9, vcc, v14, v9 -; CGP-NEXT: v_mul_lo_u32 v14, v7, v13 -; CGP-NEXT: v_mul_hi_u32 v17, v5, v13 -; CGP-NEXT: v_mul_hi_u32 v13, v7, v13 -; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v12 -; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v15 -; CGP-NEXT: v_mul_lo_u32 v12, v4, v8 -; CGP-NEXT: v_mul_lo_u32 v15, v6, v8 -; CGP-NEXT: v_mul_hi_u32 v18, v4, v8 -; CGP-NEXT: v_mul_hi_u32 v8, v6, v8 -; CGP-NEXT: v_mul_lo_u32 v19, v5, v9 -; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v19 -; CGP-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v17 -; CGP-NEXT: v_mul_lo_u32 v14, v7, v9 -; CGP-NEXT: v_mul_hi_u32 v17, v5, v9 -; CGP-NEXT: v_mul_hi_u32 v9, v7, v9 -; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v11, v12 -; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v10, s[4:5], v15, v10 -; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v13, s[4:5], v14, v13 +; CGP-NEXT: v_mul_lo_u32 v11, v5, v9 +; CGP-NEXT: v_mul_lo_u32 v12, v5, v6 +; CGP-NEXT: v_mul_lo_u32 v13, -1, v6 +; CGP-NEXT: v_mul_hi_u32 v14, v5, v6 +; CGP-NEXT: v_mul_lo_u32 v15, v5, v7 +; CGP-NEXT: v_mul_lo_u32 v16, -1, v7 +; CGP-NEXT: v_mul_hi_u32 v17, v5, v7 +; CGP-NEXT: v_add_i32_e32 v10, vcc, v13, v10 +; CGP-NEXT: v_mul_lo_u32 v13, v8, v12 +; CGP-NEXT: v_mul_hi_u32 v18, v6, v12 +; CGP-NEXT: v_mul_hi_u32 v12, v8, v12 +; CGP-NEXT: v_add_i32_e32 v11, vcc, v16, v11 +; CGP-NEXT: v_mul_lo_u32 v16, v9, v15 +; CGP-NEXT: v_mul_hi_u32 v19, v7, v15 +; CGP-NEXT: v_mul_hi_u32 v15, v9, v15 +; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v14 +; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v17 +; CGP-NEXT: v_mul_lo_u32 v14, v6, v10 +; CGP-NEXT: v_mul_lo_u32 v17, v7, v11 +; CGP-NEXT: v_add_i32_e32 v16, vcc, v16, v17 +; CGP-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v16, vcc, v16, v19 +; CGP-NEXT: v_mul_lo_u32 v16, v8, v10 +; CGP-NEXT: v_mul_hi_u32 v19, v6, v10 +; CGP-NEXT: v_mul_hi_u32 v10, v8, v10 +; CGP-NEXT: v_add_i32_e64 v13, s[4:5], v13, v14 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v11, v16 -; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v10, s[4:5], v10, v18 +; CGP-NEXT: v_add_i32_e64 v13, s[4:5], v13, v18 +; CGP-NEXT: v_mul_lo_u32 v13, v9, v11 +; CGP-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[4:5] +; 
CGP-NEXT: v_add_i32_e64 v14, s[4:5], v14, v18 +; CGP-NEXT: v_mul_hi_u32 v18, v7, v11 +; CGP-NEXT: v_mul_hi_u32 v11, v9, v11 +; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v16, v12 ; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[4:5] +; CGP-NEXT: v_add_i32_e64 v13, s[4:5], v13, v15 +; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[4:5] +; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v12, v19 +; CGP-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[4:5] +; CGP-NEXT: v_add_i32_e64 v16, s[4:5], v16, v19 +; CGP-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v18 ; CGP-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v17, vcc, v17, v19 +; CGP-NEXT: v_add_i32_e32 v15, vcc, v15, v18 +; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v14 +; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v17 ; CGP-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v11, vcc, v12, v11 -; CGP-NEXT: v_add_i32_e32 v12, vcc, v15, v16 -; CGP-NEXT: v_add_i32_e32 v15, vcc, v19, v18 -; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v17 -; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v11 -; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v15 -; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v11, vcc, v12, v11 -; CGP-NEXT: v_add_i32_e32 v12, vcc, v14, v15 -; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v11 -; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v12 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v10 -; CGP-NEXT: v_addc_u32_e64 v10, s[4:5], v6, v8, vcc -; CGP-NEXT: v_mul_lo_u32 v11, s8, v4 -; CGP-NEXT: v_mul_lo_u32 v12, -1, v4 -; CGP-NEXT: v_mul_hi_u32 v14, s8, v4 -; CGP-NEXT: v_add_i32_e64 v5, s[4:5], v5, v13 -; CGP-NEXT: v_addc_u32_e64 v13, s[6:7], v7, v9, s[4:5] -; CGP-NEXT: v_mul_lo_u32 v15, s8, v5 -; CGP-NEXT: v_mul_lo_u32 v16, -1, v5 -; CGP-NEXT: v_mul_hi_u32 v17, s8, v5 -; CGP-NEXT: v_mul_lo_u32 v18, s8, v13 +; CGP-NEXT: v_add_i32_e32 v14, vcc, v16, v14 +; CGP-NEXT: v_add_i32_e32 v15, vcc, v15, v17 +; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v14 +; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v15 +; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v12 +; CGP-NEXT: v_addc_u32_e64 v12, s[4:5], v8, v10, vcc +; CGP-NEXT: v_mul_lo_u32 v14, v5, v6 +; CGP-NEXT: v_add_i32_e64 v7, s[4:5], v7, v13 +; CGP-NEXT: v_addc_u32_e64 v13, s[6:7], v9, v11, s[4:5] +; CGP-NEXT: v_mul_lo_u32 v15, v5, v7 +; CGP-NEXT: v_mul_lo_u32 v16, -1, v7 +; CGP-NEXT: v_mul_hi_u32 v17, v5, v7 +; CGP-NEXT: v_mul_lo_u32 v18, v5, v13 ; CGP-NEXT: v_mul_lo_u32 v19, v13, v15 ; CGP-NEXT: v_add_i32_e64 v16, s[6:7], v16, v18 -; CGP-NEXT: v_mul_hi_u32 v18, v5, v15 +; CGP-NEXT: v_mul_hi_u32 v18, v7, v15 ; CGP-NEXT: v_add_i32_e64 v16, s[6:7], v16, v17 -; CGP-NEXT: v_mul_lo_u32 v17, v5, v16 +; CGP-NEXT: v_mul_lo_u32 v17, v7, v16 ; CGP-NEXT: v_add_i32_e64 v17, s[6:7], v19, v17 ; CGP-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[6:7] ; CGP-NEXT: v_add_i32_e64 v17, s[6:7], v17, v18 -; CGP-NEXT: v_mul_lo_u32 v17, s8, v10 -; CGP-NEXT: v_mul_lo_u32 v18, v10, v11 -; CGP-NEXT: v_add_i32_e64 v12, s[8:9], v12, v17 -; CGP-NEXT: v_mul_hi_u32 v17, v4, v11 -; CGP-NEXT: v_add_i32_e64 v12, s[8:9], v12, v14 -; CGP-NEXT: v_mul_lo_u32 v14, v4, v12 -; CGP-NEXT: v_add_i32_e64 v14, s[8:9], v18, v14 -; CGP-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[8:9] -; CGP-NEXT: v_add_i32_e64 v14, s[8:9], v14, v17 -; CGP-NEXT: v_mov_b32_e32 v14, s10 -; CGP-NEXT: v_mov_b32_e32 v17, s11 -; CGP-NEXT: v_add_i32_e64 v6, s[10:11], v6, v8 -; CGP-NEXT: v_mov_b32_e32 v8, s13 -; CGP-NEXT: v_add_i32_e64 v7, s[10:11], v7, v9 -; CGP-NEXT: v_mul_hi_u32 v9, v10, v11 -; 
CGP-NEXT: v_mul_hi_u32 v11, v13, v15 +; CGP-NEXT: v_mul_lo_u32 v17, -1, v6 +; CGP-NEXT: v_mul_hi_u32 v18, v5, v6 +; CGP-NEXT: v_mul_lo_u32 v5, v5, v12 +; CGP-NEXT: v_add_i32_e64 v5, s[8:9], v17, v5 +; CGP-NEXT: v_mul_lo_u32 v17, v12, v14 +; CGP-NEXT: v_add_i32_e64 v5, s[8:9], v5, v18 +; CGP-NEXT: v_mul_lo_u32 v18, v6, v5 +; CGP-NEXT: v_add_i32_e64 v17, s[8:9], v17, v18 +; CGP-NEXT: v_mul_hi_u32 v18, v6, v14 +; CGP-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[8:9] +; CGP-NEXT: v_add_i32_e64 v17, s[8:9], v17, v18 +; CGP-NEXT: s_bfe_i32 s10, -1, 0x10000 +; CGP-NEXT: s_bfe_i32 s12, -1, 0x10000 +; CGP-NEXT: s_bfe_i32 s13, -1, 0x10000 +; CGP-NEXT: s_bfe_i32 s14, -1, 0x10000 +; CGP-NEXT: v_mov_b32_e32 v17, s10 +; CGP-NEXT: v_add_i32_e64 v8, s[10:11], v8, v10 +; CGP-NEXT: v_mov_b32_e32 v10, s12 +; CGP-NEXT: v_add_i32_e64 v9, s[10:11], v9, v11 +; CGP-NEXT: v_mul_hi_u32 v11, v12, v14 +; CGP-NEXT: v_mul_hi_u32 v14, v13, v15 ; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[8:9] -; CGP-NEXT: v_add_i32_e64 v15, s[8:9], v18, v15 -; CGP-NEXT: v_mul_lo_u32 v18, v10, v12 -; CGP-NEXT: v_mul_hi_u32 v10, v10, v12 -; CGP-NEXT: v_mul_hi_u32 v12, v4, v12 -; CGP-NEXT: v_add_i32_e64 v9, s[8:9], v18, v9 +; CGP-NEXT: v_add_i32_e64 v15, s[8:9], v20, v15 +; CGP-NEXT: v_mul_lo_u32 v18, v12, v5 +; CGP-NEXT: v_mul_hi_u32 v12, v12, v5 +; CGP-NEXT: v_mul_hi_u32 v5, v6, v5 +; CGP-NEXT: v_add_i32_e64 v11, s[8:9], v18, v11 ; CGP-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[8:9] -; CGP-NEXT: v_add_i32_e64 v9, s[8:9], v9, v12 -; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[8:9] -; CGP-NEXT: v_add_i32_e64 v12, s[8:9], v18, v12 +; CGP-NEXT: v_add_i32_e64 v5, s[8:9], v11, v5 +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[8:9] +; CGP-NEXT: v_add_i32_e64 v11, s[8:9], v18, v11 ; CGP-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[6:7] ; CGP-NEXT: v_add_i32_e64 v18, s[6:7], v19, v18 ; CGP-NEXT: v_mul_lo_u32 v19, v13, v16 ; CGP-NEXT: v_mul_hi_u32 v13, v13, v16 -; CGP-NEXT: v_mul_hi_u32 v16, v5, v16 -; CGP-NEXT: v_add_i32_e64 v11, s[6:7], v19, v11 +; CGP-NEXT: v_mul_hi_u32 v16, v7, v16 +; CGP-NEXT: v_add_i32_e64 v14, s[6:7], v19, v14 ; CGP-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[6:7] -; CGP-NEXT: v_add_i32_e64 v11, s[6:7], v11, v16 +; CGP-NEXT: v_add_i32_e64 v14, s[6:7], v14, v16 ; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[6:7] ; CGP-NEXT: v_add_i32_e64 v16, s[6:7], v19, v16 -; CGP-NEXT: v_mov_b32_e32 v19, s14 -; CGP-NEXT: v_add_i32_e64 v9, s[6:7], v9, v15 +; CGP-NEXT: v_mov_b32_e32 v19, s13 +; CGP-NEXT: v_add_i32_e64 v5, s[6:7], v5, v15 ; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[6:7] -; CGP-NEXT: v_add_i32_e64 v11, s[6:7], v11, v18 +; CGP-NEXT: v_add_i32_e64 v11, s[6:7], v11, v15 +; CGP-NEXT: v_mov_b32_e32 v15, s14 +; CGP-NEXT: v_add_i32_e64 v14, s[6:7], v14, v18 ; CGP-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[6:7] -; CGP-NEXT: v_add_i32_e64 v12, s[6:7], v12, v15 -; CGP-NEXT: v_add_i32_e64 v15, s[6:7], v16, v18 -; CGP-NEXT: v_add_i32_e64 v10, s[6:7], v10, v12 -; CGP-NEXT: v_add_i32_e64 v12, s[6:7], v13, v15 -; CGP-NEXT: v_addc_u32_e32 v6, vcc, v6, v10, vcc -; CGP-NEXT: v_addc_u32_e64 v7, vcc, v7, v12, s[4:5] -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v9 -; CGP-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc -; CGP-NEXT: v_mul_lo_u32 v9, v3, v4 -; CGP-NEXT: v_mul_hi_u32 v10, v2, v4 -; CGP-NEXT: v_mul_hi_u32 v4, v3, v4 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v11 -; CGP-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc -; CGP-NEXT: v_mul_lo_u32 v11, v1, v5 -; CGP-NEXT: v_mul_hi_u32 v12, v0, v5 -; CGP-NEXT: v_mul_hi_u32 v5, v1, v5 -; CGP-NEXT: v_mul_lo_u32 v13, v2, v6 -; CGP-NEXT: v_mul_lo_u32 v15, v3, v6 
-; CGP-NEXT: v_mul_hi_u32 v16, v2, v6 -; CGP-NEXT: v_mul_hi_u32 v6, v3, v6 -; CGP-NEXT: v_mul_lo_u32 v18, v0, v7 -; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v18 -; CGP-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v12 -; CGP-NEXT: v_mul_lo_u32 v11, v1, v7 -; CGP-NEXT: v_mul_hi_u32 v12, v0, v7 +; CGP-NEXT: v_add_i32_e64 v16, s[6:7], v16, v18 +; CGP-NEXT: v_add_i32_e64 v11, s[6:7], v12, v11 +; CGP-NEXT: v_add_i32_e64 v12, s[6:7], v13, v16 +; CGP-NEXT: v_addc_u32_e32 v8, vcc, v8, v11, vcc +; CGP-NEXT: v_addc_u32_e64 v9, vcc, v9, v12, s[4:5] +; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v5 +; CGP-NEXT: v_addc_u32_e32 v6, vcc, 0, v8, vcc +; CGP-NEXT: v_mul_lo_u32 v8, v3, v5 +; CGP-NEXT: v_mul_hi_u32 v11, v2, v5 +; CGP-NEXT: v_mul_hi_u32 v5, v3, v5 +; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v14 +; CGP-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; CGP-NEXT: v_mul_lo_u32 v12, v1, v7 +; CGP-NEXT: v_mul_hi_u32 v13, v0, v7 ; CGP-NEXT: v_mul_hi_u32 v7, v1, v7 -; CGP-NEXT: v_add_i32_e64 v9, s[4:5], v9, v13 -; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v4, s[4:5], v15, v4 -; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v5, s[4:5], v11, v5 -; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v9, s[4:5], v9, v10 -; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v4, s[4:5], v4, v16 -; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[4:5] +; CGP-NEXT: v_mul_lo_u32 v14, v2, v6 +; CGP-NEXT: v_mul_lo_u32 v16, v3, v6 +; CGP-NEXT: v_mul_hi_u32 v18, v2, v6 +; CGP-NEXT: v_mul_hi_u32 v6, v3, v6 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v14 +; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v11 +; CGP-NEXT: v_mul_lo_u32 v8, v0, v9 +; CGP-NEXT: v_mul_lo_u32 v11, v1, v9 +; CGP-NEXT: v_add_i32_e64 v8, s[4:5], v12, v8 +; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] +; CGP-NEXT: v_add_i32_e64 v8, s[4:5], v8, v13 +; CGP-NEXT: v_mul_hi_u32 v8, v0, v9 +; CGP-NEXT: v_mul_hi_u32 v9, v1, v9 +; CGP-NEXT: v_add_i32_e64 v5, s[6:7], v16, v5 +; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[6:7] +; CGP-NEXT: v_add_i32_e64 v7, s[6:7], v11, v7 +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[6:7] ; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v12 -; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v9, vcc, v13, v9 -; CGP-NEXT: v_add_i32_e32 v10, vcc, v15, v10 -; CGP-NEXT: v_add_i32_e32 v13, vcc, v18, v16 -; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v12 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v9 -; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v13 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v18 +; CGP-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v16 +; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[4:5] +; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v8 +; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v18 +; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v16 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v11, v8 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v14 +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v12 ; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v9, vcc, v10, v9 -; CGP-NEXT: v_mul_lo_u32 v10, s12, v4 -; CGP-NEXT: v_mul_lo_u32 v13, 0, v4 -; CGP-NEXT: v_mul_hi_u32 v4, s12, v4 -; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v12 -; CGP-NEXT: v_mul_lo_u32 v12, s12, v5 -; CGP-NEXT: v_mul_lo_u32 v15, 0, v5 -; 
CGP-NEXT: v_mul_hi_u32 v5, s12, v5 -; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v9 -; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v11 -; CGP-NEXT: v_mul_lo_u32 v6, s12, v6 -; CGP-NEXT: v_mul_lo_u32 v7, s12, v7 -; CGP-NEXT: v_add_i32_e32 v6, vcc, v13, v6 -; CGP-NEXT: v_add_i32_e32 v7, vcc, v15, v7 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v6, v4 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v7, v5 -; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v10 -; CGP-NEXT: v_subb_u32_e64 v6, s[4:5], v3, v4, vcc -; CGP-NEXT: v_sub_i32_e64 v3, s[4:5], v3, v4 -; CGP-NEXT: v_cmp_le_u32_e64 s[4:5], s12, v2 -; CGP-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[4:5] +; CGP-NEXT: v_add_i32_e32 v11, vcc, v13, v11 +; CGP-NEXT: v_mul_lo_u32 v13, v4, v5 +; CGP-NEXT: v_mul_lo_u32 v14, 0, v5 +; CGP-NEXT: v_mul_hi_u32 v5, v4, v5 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v12 +; CGP-NEXT: v_mul_lo_u32 v12, v4, v7 +; CGP-NEXT: v_mul_lo_u32 v16, 0, v7 +; CGP-NEXT: v_mul_hi_u32 v7, v4, v7 +; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v11 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v9, v8 +; CGP-NEXT: v_mul_lo_u32 v6, v4, v6 +; CGP-NEXT: v_mul_lo_u32 v8, v4, v8 +; CGP-NEXT: v_add_i32_e32 v6, vcc, v14, v6 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v16, v8 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v5 +; CGP-NEXT: v_add_i32_e32 v6, vcc, v8, v7 +; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v13 +; CGP-NEXT: v_subb_u32_e64 v7, s[4:5], v3, v5, vcc +; CGP-NEXT: v_sub_i32_e64 v3, s[4:5], v3, v5 +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v4 +; CGP-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[4:5] ; CGP-NEXT: v_sub_i32_e64 v0, s[4:5], v0, v12 -; CGP-NEXT: v_subb_u32_e64 v7, s[6:7], v1, v5, s[4:5] -; CGP-NEXT: v_sub_i32_e64 v1, s[6:7], v1, v5 -; CGP-NEXT: v_cmp_le_u32_e64 s[6:7], s12, v0 -; CGP-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[6:7] -; CGP-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v6 -; CGP-NEXT: v_cndmask_b32_e64 v4, v8, v4, s[6:7] +; CGP-NEXT: v_subb_u32_e64 v8, s[6:7], v1, v6, s[4:5] +; CGP-NEXT: v_sub_i32_e64 v1, s[6:7], v1, v6 +; CGP-NEXT: v_cmp_ge_u32_e64 s[6:7], v0, v4 +; CGP-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[6:7] +; CGP-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v7 +; CGP-NEXT: v_cndmask_b32_e64 v5, v19, v5, s[6:7] ; CGP-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc -; CGP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7 -; CGP-NEXT: v_cndmask_b32_e32 v5, v14, v5, vcc +; CGP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v8 +; CGP-NEXT: v_cndmask_b32_e32 v6, v17, v6, vcc ; CGP-NEXT: v_subbrev_u32_e64 v1, vcc, 0, v1, s[4:5] -; CGP-NEXT: v_subrev_i32_e32 v8, vcc, s12, v2 +; CGP-NEXT: v_sub_i32_e32 v9, vcc, v2, v4 ; CGP-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc -; CGP-NEXT: v_cmp_le_u32_e32 vcc, s12, v8 -; CGP-NEXT: v_cndmask_b32_e64 v9, 0, -1, vcc -; CGP-NEXT: v_subrev_i32_e32 v10, vcc, s12, v0 -; CGP-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc -; CGP-NEXT: v_cmp_le_u32_e32 vcc, s12, v10 +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v9, v4 ; CGP-NEXT: v_cndmask_b32_e64 v11, 0, -1, vcc +; CGP-NEXT: v_sub_i32_e32 v12, vcc, v0, v4 +; CGP-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v12, v4 +; CGP-NEXT: v_cndmask_b32_e64 v13, 0, -1, vcc ; CGP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 -; CGP-NEXT: v_cndmask_b32_e32 v9, v19, v9, vcc -; CGP-NEXT: v_subrev_i32_e32 v12, vcc, s12, v8 -; CGP-NEXT: v_subbrev_u32_e32 v13, vcc, 0, v3, vcc +; CGP-NEXT: v_cndmask_b32_e32 v11, v15, v11, vcc +; CGP-NEXT: v_sub_i32_e32 v14, vcc, v9, v4 +; CGP-NEXT: v_subbrev_u32_e32 v15, vcc, 0, v3, vcc ; CGP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; CGP-NEXT: v_cndmask_b32_e32 v11, v17, v11, vcc -; CGP-NEXT: v_subrev_i32_e32 v14, vcc, s12, v10 -; CGP-NEXT: v_subbrev_u32_e32 v15, 
vcc, 0, v1, vcc
-; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9
-; CGP-NEXT: v_cndmask_b32_e32 v8, v8, v12, vcc
-; CGP-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v11
-; CGP-NEXT: v_cndmask_b32_e64 v9, v10, v14, s[4:5]
-; CGP-NEXT: v_cndmask_b32_e32 v3, v3, v13, vcc
-; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4
-; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc
-; CGP-NEXT: v_cndmask_b32_e64 v1, v1, v15, s[4:5]
-; CGP-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v5
-; CGP-NEXT: v_cndmask_b32_e64 v0, v0, v9, s[4:5]
-; CGP-NEXT: v_cndmask_b32_e64 v1, v7, v1, s[4:5]
-; CGP-NEXT: v_cndmask_b32_e32 v3, v6, v3, vcc
+; CGP-NEXT: v_cndmask_b32_e32 v10, v10, v13, vcc
+; CGP-NEXT: v_sub_i32_e32 v4, vcc, v12, v4
+; CGP-NEXT: v_subbrev_u32_e32 v13, vcc, 0, v1, vcc
+; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11
+; CGP-NEXT: v_cndmask_b32_e32 v9, v9, v14, vcc
+; CGP-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v10
+; CGP-NEXT: v_cndmask_b32_e64 v4, v12, v4, s[4:5]
+; CGP-NEXT: v_cndmask_b32_e32 v3, v3, v15, vcc
+; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5
+; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc
+; CGP-NEXT: v_cndmask_b32_e64 v1, v1, v13, s[4:5]
+; CGP-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v6
+; CGP-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[4:5]
+; CGP-NEXT: v_cndmask_b32_e64 v1, v8, v1, s[4:5]
+; CGP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc
; CGP-NEXT: s_setpc_b64 s[30:31]
%result = urem <2 x i64> %num, <i64 1235195, i64 1235195>
ret <2 x i64> %result
@@ -1777,23 +1778,23 @@
; CHECK-NEXT: s_cbranch_execz BB7_4
; CHECK-NEXT: ; %bb.3:
; CHECK-NEXT: v_cvt_f32_u32_e32 v0, v5
-; CHECK-NEXT: v_sub_i32_e32 v1, vcc, 0, v5
+; CHECK-NEXT: v_mov_b32_e32 v1, 0
+; CHECK-NEXT: v_sub_i32_e32 v2, vcc, 0, v5
; CHECK-NEXT: v_rcp_iflag_f32_e32 v0, v0
; CHECK-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
; CHECK-NEXT: v_cvt_u32_f32_e32 v0, v0
-; CHECK-NEXT: v_mul_lo_u32 v1, v1, v0
-; CHECK-NEXT: v_mul_hi_u32 v1, v0, v1
-; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v1
+; CHECK-NEXT: v_mul_lo_u32 v2, v2, v0
+; CHECK-NEXT: v_mul_hi_u32 v2, v0, v2
+; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v2
; CHECK-NEXT: v_mul_hi_u32 v0, v3, v0
; CHECK-NEXT: v_mul_lo_u32 v0, v0, v5
; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v3, v0
-; CHECK-NEXT: v_sub_i32_e32 v1, vcc, v0, v5
+; CHECK-NEXT: v_sub_i32_e32 v2, vcc, v0, v5
; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v0, v5
-; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
-; CHECK-NEXT: v_sub_i32_e32 v1, vcc, v0, v5
+; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; CHECK-NEXT: v_sub_i32_e32 v2, vcc, v0, v5
; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v0, v5
-; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
-; CHECK-NEXT: v_mov_b32_e32 v1, 0
+; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; CHECK-NEXT: BB7_4:
; CHECK-NEXT: s_or_b64 exec, exec, s[4:5]
; CHECK-NEXT: s_setpc_b64 s[30:31]
@@ -2215,23 +2216,23 @@
; CGP-NEXT: s_cbranch_execz BB8_4
; CGP-NEXT: ; %bb.3:
; CGP-NEXT: v_cvt_f32_u32_e32 v0, v2
-; CGP-NEXT: v_sub_i32_e32 v1, vcc, 0, v2
+; CGP-NEXT: v_mov_b32_e32 v1, 0
+; CGP-NEXT: v_sub_i32_e32 v3, vcc, 0, v2
; CGP-NEXT: v_rcp_iflag_f32_e32 v0, v0
; CGP-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
; CGP-NEXT: v_cvt_u32_f32_e32 v0, v0
-; CGP-NEXT: v_mul_lo_u32 v1, v1, v0
-; CGP-NEXT: v_mul_hi_u32 v1, v0, v1
-; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v1
+; CGP-NEXT: v_mul_lo_u32 v3, v3, v0
+; CGP-NEXT: v_mul_hi_u32 v3, v0, v3
+; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v3
; CGP-NEXT: v_mul_hi_u32 v0, v8, v0
; CGP-NEXT: v_mul_lo_u32 v0, v0, v2
; CGP-NEXT: v_sub_i32_e32 v0, vcc, v8, v0
-; CGP-NEXT: v_sub_i32_e32 v1, vcc, v0, v2
+; CGP-NEXT: v_sub_i32_e32 v3, vcc, v0, v2
; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2
-; CGP-NEXT:
v_cndmask_b32_e32 v0, v0, v1, vcc -; CGP-NEXT: v_sub_i32_e32 v1, vcc, v0, v2 +; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc +; CGP-NEXT: v_sub_i32_e32 v3, vcc, v0, v2 ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2 -; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; CGP-NEXT: v_mov_b32_e32 v1, 0 +; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; CGP-NEXT: BB8_4: ; CGP-NEXT: s_or_b64 exec, exec, s[4:5] ; CGP-NEXT: v_or_b32_e32 v3, v7, v11 @@ -2376,23 +2377,23 @@ ; CGP-NEXT: s_cbranch_execz BB8_8 ; CGP-NEXT: ; %bb.7: ; CGP-NEXT: v_cvt_f32_u32_e32 v2, v10 -; CGP-NEXT: v_sub_i32_e32 v3, vcc, 0, v10 +; CGP-NEXT: v_mov_b32_e32 v3, 0 +; CGP-NEXT: v_sub_i32_e32 v4, vcc, 0, v10 ; CGP-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; CGP-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 ; CGP-NEXT: v_cvt_u32_f32_e32 v2, v2 -; CGP-NEXT: v_mul_lo_u32 v3, v3, v2 -; CGP-NEXT: v_mul_hi_u32 v3, v2, v3 -; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v3 +; CGP-NEXT: v_mul_lo_u32 v4, v4, v2 +; CGP-NEXT: v_mul_hi_u32 v4, v2, v4 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; CGP-NEXT: v_mul_hi_u32 v2, v5, v2 ; CGP-NEXT: v_mul_lo_u32 v2, v2, v10 ; CGP-NEXT: v_sub_i32_e32 v2, vcc, v5, v2 -; CGP-NEXT: v_sub_i32_e32 v3, vcc, v2, v10 +; CGP-NEXT: v_sub_i32_e32 v4, vcc, v2, v10 ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v2, v10 -; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc -; CGP-NEXT: v_sub_i32_e32 v3, vcc, v2, v10 +; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; CGP-NEXT: v_sub_i32_e32 v4, vcc, v2, v10 ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v2, v10 -; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc -; CGP-NEXT: v_mov_b32_e32 v3, 0 +; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; CGP-NEXT: BB8_8: ; CGP-NEXT: s_or_b64 exec, exec, s[4:5] ; CGP-NEXT: s_setpc_b64 s[30:31] @@ -2718,37 +2719,38 @@ ; CGP-LABEL: v_urem_v2i64_24bit: ; CGP: ; %bb.0: ; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CGP-NEXT: s_mov_b32 s6, 0xffffff -; CGP-NEXT: v_and_b32_e32 v0, s6, v0 -; CGP-NEXT: v_and_b32_e32 v1, s6, v2 -; CGP-NEXT: v_and_b32_e32 v2, s6, v4 -; CGP-NEXT: v_and_b32_e32 v3, s6, v6 -; CGP-NEXT: v_cvt_f32_u32_e32 v4, v0 -; CGP-NEXT: v_cvt_f32_u32_e32 v5, v2 -; CGP-NEXT: v_cvt_f32_u32_e32 v6, v1 -; CGP-NEXT: v_cvt_f32_u32_e32 v7, v3 -; CGP-NEXT: v_rcp_f32_e32 v8, v5 -; CGP-NEXT: v_rcp_f32_e32 v9, v7 -; CGP-NEXT: v_mul_f32_e32 v8, v4, v8 -; CGP-NEXT: v_mul_f32_e32 v9, v6, v9 -; CGP-NEXT: v_trunc_f32_e32 v8, v8 +; CGP-NEXT: s_mov_b32 s4, 0xffffff +; CGP-NEXT: v_mov_b32_e32 v1, 0xffffff +; CGP-NEXT: v_and_b32_e32 v0, s4, v0 +; CGP-NEXT: v_and_b32_e32 v2, s4, v2 +; CGP-NEXT: v_and_b32_e32 v3, s4, v4 +; CGP-NEXT: v_and_b32_e32 v4, s4, v6 +; CGP-NEXT: v_cvt_f32_u32_e32 v5, v0 +; CGP-NEXT: v_cvt_f32_u32_e32 v6, v3 +; CGP-NEXT: v_cvt_f32_u32_e32 v7, v2 +; CGP-NEXT: v_cvt_f32_u32_e32 v8, v4 +; CGP-NEXT: v_rcp_f32_e32 v9, v6 +; CGP-NEXT: v_rcp_f32_e32 v10, v8 +; CGP-NEXT: v_mul_f32_e32 v9, v5, v9 +; CGP-NEXT: v_mul_f32_e32 v10, v7, v10 ; CGP-NEXT: v_trunc_f32_e32 v9, v9 -; CGP-NEXT: v_mad_f32 v4, -v8, v5, v4 -; CGP-NEXT: v_cvt_u32_f32_e32 v8, v8 -; CGP-NEXT: v_mad_f32 v6, -v9, v7, v6 +; CGP-NEXT: v_trunc_f32_e32 v10, v10 +; CGP-NEXT: v_mad_f32 v5, -v9, v6, v5 ; CGP-NEXT: v_cvt_u32_f32_e32 v9, v9 -; CGP-NEXT: v_cmp_ge_f32_e64 s[4:5], |v4|, v5 -; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[4:5] -; CGP-NEXT: v_cmp_ge_f32_e64 s[4:5], |v6|, v7 +; CGP-NEXT: v_mad_f32 v7, -v10, v8, v7 +; CGP-NEXT: v_cvt_u32_f32_e32 v10, v10 +; CGP-NEXT: v_cmp_ge_f32_e64 s[4:5], |v5|, v6 ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e32 v4, vcc, v8, v4 +; CGP-NEXT: v_cmp_ge_f32_e64 s[4:5], |v7|, v8 
+; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[4:5] ; CGP-NEXT: v_add_i32_e32 v5, vcc, v9, v5 -; CGP-NEXT: v_mul_lo_u32 v2, v4, v2 +; CGP-NEXT: v_add_i32_e32 v6, vcc, v10, v6 ; CGP-NEXT: v_mul_lo_u32 v3, v5, v3 -; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 -; CGP-NEXT: v_sub_i32_e32 v1, vcc, v1, v3 -; CGP-NEXT: v_and_b32_e32 v0, s6, v0 -; CGP-NEXT: v_and_b32_e32 v2, s6, v1 +; CGP-NEXT: v_mul_lo_u32 v4, v6, v4 +; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v3 +; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v4 +; CGP-NEXT: v_and_b32_e32 v0, v0, v1 +; CGP-NEXT: v_and_b32_e32 v2, v2, v1 ; CGP-NEXT: v_mov_b32_e32 v1, 0 ; CGP-NEXT: v_mov_b32_e32 v3, 0 ; CGP-NEXT: s_setpc_b64 s[30:31] Index: llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll @@ -226,8 +226,8 @@ ; GFX9-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1] ; GFX9-NEXT: v_pk_sub_u16 v0, v0, v1 clamp ; GFX9-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1] -; GFX9-NEXT: s_movk_i32 s4, 0xff -; GFX9-NEXT: v_and_b32_sdwa v1, v0, s4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v1, 0xff +; GFX9-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -239,14 +239,14 @@ ; GFX10-NEXT: v_mov_b32_e32 v2, 0xffff ; GFX10-NEXT: v_lshrrev_b32_sdwa v3, s4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX10-NEXT: v_lshrrev_b32_sdwa v4, s4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: s_movk_i32 s4, 0xff ; GFX10-NEXT: v_and_or_b32 v0, v0, v2, v3 ; GFX10-NEXT: v_and_or_b32 v1, v1, v2, v4 ; GFX10-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1] ; GFX10-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1] ; GFX10-NEXT: v_pk_sub_u16 v0, v0, v1 clamp +; GFX10-NEXT: v_mov_b32_e32 v1, 0xff ; GFX10-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1] -; GFX10-NEXT: v_and_b32_sdwa v1, v0, s4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-NEXT: s_setpc_b64 s[30:31] %lhs = bitcast i16 %lhs.arg to <2 x i8> @@ -312,8 +312,8 @@ ; GFX9-NEXT: v_mov_b32_e32 v0, s1 ; GFX9-NEXT: v_pk_sub_u16 v0, s0, v0 clamp ; GFX9-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1] -; GFX9-NEXT: s_movk_i32 s0, 0xff -; GFX9-NEXT: v_and_b32_sdwa v1, v0, s0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v1, 0xff +; GFX9-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_readfirstlane_b32 s0, v0 ; GFX9-NEXT: ; return to shader part epilog @@ -333,10 +333,10 @@ ; GFX10-NEXT: s_lshl_b32 s2, s4, 8 ; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s3 ; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s2 +; GFX10-NEXT: v_mov_b32_e32 v1, 0xff ; GFX10-NEXT: v_pk_sub_u16 v0, s0, s1 clamp -; GFX10-NEXT: s_movk_i32 s0, 0xff ; GFX10-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1] -; GFX10-NEXT: v_and_b32_sdwa v1, v0, s0 dst_sel:BYTE_1 
dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-NEXT: ; return to shader part epilog @@ -440,12 +440,12 @@ ; GFX9-NEXT: v_pk_sub_u16 v0, v0, v1 clamp ; GFX9-NEXT: v_pk_sub_u16 v1, v2, v3 clamp ; GFX9-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1] -; GFX9-NEXT: v_mov_b32_e32 v2, 8 +; GFX9-NEXT: v_mov_b32_e32 v3, 8 ; GFX9-NEXT: v_pk_lshrrev_b16 v1, 8, v1 op_sel_hi:[0,1] -; GFX9-NEXT: s_movk_i32 s4, 0xff -; GFX9-NEXT: v_lshlrev_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_and_or_b32 v0, v0, s4, v2 -; GFX9-NEXT: v_and_b32_e32 v2, s4, v1 +; GFX9-NEXT: v_mov_b32_e32 v2, 0xff +; GFX9-NEXT: v_lshlrev_b32_sdwa v3, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_and_or_b32 v0, v0, v2, v3 +; GFX9-NEXT: v_and_b32_e32 v2, v1, v2 ; GFX9-NEXT: v_mov_b32_e32 v3, 24 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX9-NEXT: v_lshlrev_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 @@ -466,26 +466,26 @@ ; GFX10-NEXT: v_lshrrev_b32_e32 v8, 16, v1 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX10-NEXT: s_movk_i32 s4, 0xff ; GFX10-NEXT: v_and_or_b32 v0, v0, v7, v2 ; GFX10-NEXT: v_and_or_b32 v1, v1, v7, v6 ; GFX10-NEXT: v_and_or_b32 v2, v3, v7, v4 ; GFX10-NEXT: v_and_or_b32 v3, v8, v7, v5 -; GFX10-NEXT: v_mov_b32_e32 v4, 24 +; GFX10-NEXT: v_mov_b32_e32 v5, 24 ; GFX10-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1] ; GFX10-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1] ; GFX10-NEXT: v_pk_lshlrev_b16 v2, 8, v2 op_sel_hi:[0,1] ; GFX10-NEXT: v_pk_lshlrev_b16 v3, 8, v3 op_sel_hi:[0,1] ; GFX10-NEXT: v_pk_sub_u16 v0, v0, v1 clamp ; GFX10-NEXT: v_pk_sub_u16 v1, v2, v3 clamp -; GFX10-NEXT: v_mov_b32_e32 v2, 8 +; GFX10-NEXT: v_mov_b32_e32 v2, 0xff +; GFX10-NEXT: v_mov_b32_e32 v3, 8 ; GFX10-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1] ; GFX10-NEXT: v_pk_lshrrev_b16 v1, 8, v1 op_sel_hi:[0,1] -; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_and_b32_e32 v3, s4, v1 -; GFX10-NEXT: v_lshlrev_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_and_or_b32 v0, v0, s4, v2 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX10-NEXT: v_and_b32_e32 v4, v1, v2 +; GFX10-NEXT: v_lshlrev_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX10-NEXT: v_and_or_b32 v0, v0, v2, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v4 ; GFX10-NEXT: v_or3_b32 v0, v0, v2, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] %lhs = bitcast i32 %lhs.arg to <4 x i8> @@ -605,11 +605,11 @@ ; GFX9-NEXT: v_pk_sub_u16 v1, s3, v1 clamp ; GFX9-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1] ; GFX9-NEXT: v_pk_lshrrev_b16 v1, 8, v1 op_sel_hi:[0,1] -; GFX9-NEXT: s_movk_i32 s0, 0xff -; GFX9-NEXT: v_lshlrev_b32_sdwa v2, s2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_mov_b32_e32 v2, 0xff +; GFX9-NEXT: v_lshlrev_b32_sdwa v3, s2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX9-NEXT: 
s_mov_b32 s5, 24 -; GFX9-NEXT: v_and_or_b32 v0, v0, s0, v2 -; GFX9-NEXT: v_and_b32_e32 v2, s0, v1 +; GFX9-NEXT: v_and_or_b32 v0, v0, v2, v3 +; GFX9-NEXT: v_and_b32_e32 v2, v1, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX9-NEXT: v_lshlrev_b32_sdwa v1, s5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX9-NEXT: v_or3_b32 v0, v0, v2, v1 @@ -647,16 +647,16 @@ ; GFX10-NEXT: s_pack_ll_b32_b16 s3, s3, s4 ; GFX10-NEXT: v_pk_sub_u16 v0, s0, s1 clamp ; GFX10-NEXT: v_pk_sub_u16 v1, s2, s3 clamp +; GFX10-NEXT: v_mov_b32_e32 v2, 0xff ; GFX10-NEXT: s_mov_b32 s0, 8 -; GFX10-NEXT: s_movk_i32 s1, 0xff ; GFX10-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1] ; GFX10-NEXT: v_pk_lshrrev_b16 v1, 8, v1 op_sel_hi:[0,1] -; GFX10-NEXT: v_lshlrev_b32_sdwa v2, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_and_b32_e32 v3, s1, v1 +; GFX10-NEXT: v_lshlrev_b32_sdwa v3, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX10-NEXT: v_and_b32_e32 v4, v1, v2 ; GFX10-NEXT: s_mov_b32 s0, 24 ; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_and_or_b32 v0, v0, s1, v2 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX10-NEXT: v_and_or_b32 v0, v0, v2, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v4 ; GFX10-NEXT: v_or3_b32 v0, v0, v2, v1 ; GFX10-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-NEXT: ; return to shader part epilog @@ -2047,12 +2047,11 @@ ; GFX8-NEXT: v_sub_u16_e64 v4, v2, v5 clamp ; GFX8-NEXT: v_sub_u16_sdwa v2, v2, v5 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_mov_b32_e32 v5, 16 -; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX8-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_mov_b32_e32 v3, 16 ; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX8-NEXT: v_or_b32_sdwa v0, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -2244,7 +2243,6 @@ ; GFX8-NEXT: v_sub_u16_sdwa v3, v3, v7 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_mov_b32_e32 v7, 16 ; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX8-NEXT: v_mov_b32_e32 v7, 16 ; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v7, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 Index: llvm/test/CodeGen/AMDGPU/ctlz.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/ctlz.ll +++ 
llvm/test/CodeGen/AMDGPU/ctlz.ll @@ -977,6 +977,7 @@ ; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: global_load_dword v0, v0, s[2:3] ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) @@ -984,8 +985,7 @@ ; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX10-GISEL-NEXT: v_min_u32_e32 v1, 32, v1 ; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, v1, -1, vcc_lo -; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-GISEL-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid @@ -1073,6 +1073,7 @@ ; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: global_load_dword v0, v0, s[2:3] ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) @@ -1080,8 +1081,7 @@ ; GFX10-GISEL-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 ; GFX10-GISEL-NEXT: v_min_u32_e32 v1, 32, v1 ; GFX10-GISEL-NEXT: v_cndmask_b32_e32 v0, -1, v1, vcc_lo -; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-GISEL-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid @@ -1391,6 +1391,7 @@ ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, s3 ; GFX10-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v1, v0 ; GFX10-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v2, v3, vcc_lo +; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: global_load_ubyte v0, v[0:1], off ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v1, v0 @@ -1398,8 +1399,7 @@ ; GFX10-GISEL-NEXT: v_min_u32_e32 v1, 32, v1 ; GFX10-GISEL-NEXT: v_subrev_nc_u32_e32 v1, 24, v1 ; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, v1, -1, vcc_lo -; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-GISEL-NEXT: global_store_byte v1, v0, s[0:1] +; GFX10-GISEL-NEXT: global_store_byte v2, v0, s[0:1] ; GFX10-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %valptr.gep = getelementptr i8, i8 addrspace(1)* %valptr, i32 %tid @@ -1496,17 +1496,16 @@ ; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v3, 0xffff ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: global_load_ushort v1, v0, s[2:3] -; GFX10-GISEL-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-GISEL-NEXT: s_mov_b32 s2, 0xffff ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v2, v1 ; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 ; GFX10-GISEL-NEXT: v_min_u32_e32 v2, 32, v2 ; GFX10-GISEL-NEXT: v_subrev_nc_u32_e32 v2, 16, v2 -; GFX10-GISEL-NEXT: v_and_b32_e32 v2, s2, v2 -; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v1, v2, s2, vcc_lo +; GFX10-GISEL-NEXT: v_and_b32_e32 v2, v2, v3 +; GFX10-GISEL-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo ; GFX10-GISEL-NEXT: global_store_short v0, v1, s[0:1] ; GFX10-GISEL-NEXT: s_endpgm %val = load i16, i16 addrspace(1)* %valptr @@ -1602,20 +1601,20 @@ ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: 
v_mov_b32_e32 v1, s2 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, s3 -; GFX10-GISEL-NEXT: s_movk_i32 s2, 0x7f ; GFX10-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v1, v0 ; GFX10-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v2, v3, vcc_lo ; GFX10-GISEL-NEXT: global_load_ubyte v0, v[0:1], off +; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0x7f ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX10-GISEL-NEXT: v_and_b32_e32 v0, s2, v0 -; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v1, v0 +; GFX10-GISEL-NEXT: v_and_b32_e32 v0, v0, v1 +; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v2, v0 ; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10-GISEL-NEXT: v_min_u32_e32 v1, 32, v1 -; GFX10-GISEL-NEXT: v_subrev_nc_u32_e32 v1, 25, v1 -; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, v1, 0x7f, vcc_lo -; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-GISEL-NEXT: v_and_b32_e32 v0, s2, v0 -; GFX10-GISEL-NEXT: global_store_byte v1, v0, s[0:1] +; GFX10-GISEL-NEXT: v_min_u32_e32 v2, 32, v2 +; GFX10-GISEL-NEXT: v_subrev_nc_u32_e32 v2, 25, v2 +; GFX10-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo +; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-GISEL-NEXT: v_and_b32_e32 v0, v0, v1 +; GFX10-GISEL-NEXT: global_store_byte v2, v0, s[0:1] ; GFX10-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %valptr.gep = getelementptr i7, i7 addrspace(1)* %valptr, i32 %tid Index: llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll +++ llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll @@ -773,13 +773,13 @@ ; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: global_load_dword v0, v0, s[4:5] ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v1, v0 +; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v2, v0 ; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v0, v1, -1, vcc -; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v0, v2, -1, vcc ; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[2:3] ; GFX9-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -853,13 +853,13 @@ ; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: global_load_dword v0, v0, s[4:5] ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v1, v0 +; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v2, v0 ; GFX9-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, -1, v1, vcc -; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, -1, v2, vcc ; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[2:3] ; GFX9-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -944,12 +944,12 @@ ; GFX9-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v1, v0 ; GFX9-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, v2, v3, vcc ; GFX9-GISEL-NEXT: global_load_ubyte v0, v[0:1], off +; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v1, v0 -; GFX9-GISEL-NEXT: v_subrev_u32_e32 v1, 24, v1 +; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v2, v0 +; GFX9-GISEL-NEXT: v_subrev_u32_e32 v2, 24, v2 ; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; 
GFX9-GISEL-NEXT: v_cndmask_b32_e64 v0, v1, -1, vcc -; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v0, v2, -1, vcc ; GFX9-GISEL-NEXT: global_store_byte v1, v0, s[2:3] ; GFX9-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -1131,13 +1131,13 @@ ; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: global_load_dword v0, v0, s[4:5] ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v1, v0 +; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v2, v0 ; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v0, v1, 0, vcc -; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc ; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[2:3] ; GFX9-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -1216,13 +1216,13 @@ ; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: global_load_dword v0, v0, s[4:5] ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v1, v0 +; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v2, v0 ; GFX9-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc -; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc ; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[2:3] ; GFX9-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -1302,13 +1302,13 @@ ; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: global_load_dword v0, v0, s[4:5] ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v1, v0 +; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v2, v0 ; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v0, v1, 0, vcc -; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc ; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[2:3] ; GFX9-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -1388,13 +1388,13 @@ ; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: global_load_dword v0, v0, s[4:5] ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v1, v0 +; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v2, v0 ; GFX9-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 1, v0 -; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc -; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc ; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[2:3] ; GFX9-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() Index: llvm/test/CodeGen/AMDGPU/cttz.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/cttz.ll +++ llvm/test/CodeGen/AMDGPU/cttz.ll @@ -970,6 +970,7 @@ ; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; 
GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: global_load_dword v0, v0, s[2:3] ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) @@ -977,8 +978,7 @@ ; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX10-GISEL-NEXT: v_min_u32_e32 v1, 32, v1 ; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, v1, -1, vcc_lo -; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-GISEL-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid @@ -1066,6 +1066,7 @@ ; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: global_load_dword v0, v0, s[2:3] ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) @@ -1073,8 +1074,7 @@ ; GFX10-GISEL-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 ; GFX10-GISEL-NEXT: v_min_u32_e32 v1, 32, v1 ; GFX10-GISEL-NEXT: v_cndmask_b32_e32 v0, -1, v1, vcc_lo -; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-GISEL-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid @@ -1488,16 +1488,15 @@ ; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v3, 0xffff ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: global_load_ushort v1, v0, s[2:3] -; GFX10-GISEL-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-GISEL-NEXT: s_mov_b32 s2, 0xffff ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10-GISEL-NEXT: v_or_b32_e32 v2, 0x10000, v1 ; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 ; GFX10-GISEL-NEXT: v_ffbl_b32_e32 v2, v2 -; GFX10-GISEL-NEXT: v_and_b32_e32 v2, s2, v2 -; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v1, v2, s2, vcc_lo +; GFX10-GISEL-NEXT: v_and_b32_e32 v2, v2, v3 +; GFX10-GISEL-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo ; GFX10-GISEL-NEXT: global_store_short v0, v1, s[0:1] ; GFX10-GISEL-NEXT: s_endpgm %val = load i16, i16 addrspace(1)* %valptr @@ -1593,19 +1592,19 @@ ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, s3 -; GFX10-GISEL-NEXT: s_movk_i32 s2, 0x7f ; GFX10-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v1, v0 ; GFX10-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v2, v3, vcc_lo ; GFX10-GISEL-NEXT: global_load_ubyte v0, v[0:1], off +; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0x7f ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX10-GISEL-NEXT: v_or_b32_e32 v1, 0x80, v0 -; GFX10-GISEL-NEXT: v_and_b32_e32 v0, s2, v0 -; GFX10-GISEL-NEXT: v_ffbl_b32_e32 v1, v1 +; GFX10-GISEL-NEXT: v_or_b32_e32 v2, 0x80, v0 +; GFX10-GISEL-NEXT: v_and_b32_e32 v0, v0, v1 +; GFX10-GISEL-NEXT: v_ffbl_b32_e32 v2, v2 ; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, v1, 0x7f, vcc_lo -; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-GISEL-NEXT: v_and_b32_e32 v0, s2, v0 -; GFX10-GISEL-NEXT: global_store_byte v1, v0, s[0:1] +; GFX10-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo +; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; 
GFX10-GISEL-NEXT: v_and_b32_e32 v0, v0, v1 +; GFX10-GISEL-NEXT: global_store_byte v2, v0, s[0:1] ; GFX10-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %valptr.gep = getelementptr i7, i7 addrspace(1)* %valptr, i32 %tid Index: llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll +++ llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll @@ -1030,7 +1030,7 @@ ; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-GISEL-NEXT: s_movk_i32 s0, 0xff +; GFX9-GISEL-NEXT: v_mov_b32_e32 v9, 0xff ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: global_load_ubyte v0, v1, s[4:5] ; GFX9-GISEL-NEXT: global_load_ubyte v2, v1, s[4:5] offset:1 @@ -1041,16 +1041,16 @@ ; GFX9-GISEL-NEXT: global_load_ubyte v7, v1, s[4:5] offset:6 ; GFX9-GISEL-NEXT: global_load_ubyte v8, v1, s[4:5] offset:7 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(6) -; GFX9-GISEL-NEXT: v_and_b32_e32 v2, s0, v2 +; GFX9-GISEL-NEXT: v_and_b32_e32 v2, v2, v9 ; GFX9-GISEL-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(4) -; GFX9-GISEL-NEXT: v_and_b32_e32 v4, s0, v4 +; GFX9-GISEL-NEXT: v_and_b32_e32 v4, v4, v9 ; GFX9-GISEL-NEXT: v_lshlrev_b16_e32 v4, 8, v4 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(2) -; GFX9-GISEL-NEXT: v_and_b32_e32 v6, s0, v6 +; GFX9-GISEL-NEXT: v_and_b32_e32 v6, v6, v9 ; GFX9-GISEL-NEXT: v_lshlrev_b16_e32 v6, 8, v6 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX9-GISEL-NEXT: v_and_b32_e32 v8, s0, v8 +; GFX9-GISEL-NEXT: v_and_b32_e32 v8, v8, v9 ; GFX9-GISEL-NEXT: v_lshlrev_b16_e32 v8, 8, v8 ; GFX9-GISEL-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-GISEL-NEXT: v_or_b32_sdwa v2, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -1523,7 +1523,7 @@ ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: v_or_b32_e32 v3, 0x100, v1 ; GFX9-GISEL-NEXT: v_ffbl_b32_e32 v3, v3 -; GFX9-GISEL-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX9-GISEL-NEXT: v_and_b32_e32 v3, v3, v2 ; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 ; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc ; GFX9-GISEL-NEXT: global_store_byte v0, v1, s[2:3] @@ -1625,7 +1625,7 @@ ; GFX9-GISEL-NEXT: v_lshl_or_b32 v1, v2, 8, v1 ; GFX9-GISEL-NEXT: v_or_b32_e32 v2, 0x10000, v1 ; GFX9-GISEL-NEXT: v_ffbl_b32_e32 v2, v2 -; GFX9-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX9-GISEL-NEXT: v_and_b32_e32 v2, v2, v3 ; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 ; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc ; GFX9-GISEL-NEXT: global_store_short v0, v1, s[2:3] Index: llvm/test/CodeGen/AMDGPU/remat-fp64-constants.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/remat-fp64-constants.ll +++ llvm/test/CodeGen/AMDGPU/remat-fp64-constants.ll @@ -1,10 +1,11 @@ -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs --stress-regalloc=10 < %s | FileCheck -check-prefix=GCN %s -; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs --stress-regalloc=10 < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs --stress-regalloc=10 < %s | FileCheck -check-prefixes=GCN,SDAG %s +; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs --stress-regalloc=10 < %s | FileCheck -check-prefixes=GCN,GISEL %s ; 
GCN-LABEL: {{^}}test_remat_sgpr: ; GCN-NOT: v_writelane_b32 ; GCN: {{^}}[[LOOP:BB[0-9_]+]]: -; GCN-COUNT-6: s_mov_b32 s{{[0-9]+}}, 0x +; SDAG-COUNT-6: s_mov_b32 s{{[0-9]+}}, 0x +; GISEL-COUNT-4: s_mov_b32 s{{[0-9]+}}, 0x ; GCN-NOT: v_writelane_b32 ; GCN: s_cbranch_{{[^ ]+}} [[LOOP]] ; GCN: .sgpr_spill_count: 0