diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
--- a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
@@ -88,6 +88,14 @@
    [{ return RegBankHelper.matchFPMed3ToClamp(*${fmed3}, ${matchinfo}); }]),
   (apply [{ RegBankHelper.applyClamp(*${fmed3}, ${matchinfo}); }])>;
 
+def simple_copy_hoisting_matchdata : GIDefMatchData<"MachineInstr *">;
+
+def simple_copy_hoisting : GICombineRule<
+  (defs root:$ffn, simple_copy_hoisting_matchdata:$matchinfo),
+  (match (wip_match_opcode COPY):$ffn,
+    [{ return RegBankHelper.matchSimpleCopyHoisting(*${ffn}, ${matchinfo}); }]),
+  (apply [{ RegBankHelper.applySimpleCopyHoisting(*${ffn}, ${matchinfo}); }])>;
+
 def remove_fcanonicalize_matchinfo : GIDefMatchData<"Register">;
 
 def remove_fcanonicalize : GICombineRule<
@@ -128,7 +136,8 @@
 def AMDGPURegBankCombinerHelper : GICombinerHelper<
   "AMDGPUGenRegBankCombinerHelper", [zext_trunc_fold, int_minmax_to_med3, ptr_add_immed_chain,
-   fp_minmax_to_clamp, fp_minmax_to_med3, fmed3_intrinsic_to_clamp]> {
+   fp_minmax_to_clamp, fp_minmax_to_med3, fmed3_intrinsic_to_clamp,
+   simple_copy_hoisting]> {
   let DisableRuleOption = "amdgpuregbankcombiner-disable-rule";
   let StateClass = "AMDGPURegBankCombinerHelperState";
   let AdditionalArguments = [];
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp
@@ -50,8 +50,12 @@
         TII(*Subtarget.getInstrInfo()), Helper(Helper){};
 
   bool isVgprRegBank(Register Reg);
+  bool isSgprRegBank(Register Reg);
   Register getAsVgpr(Register Reg);
 
+  bool isVgprToSgprCopy(const MachineInstr &MI);
+  bool isSgprToVgprCopy(const MachineInstr &MI);
+
   struct MinMaxMedOpc {
     unsigned Min, Max, Med;
   };
@@ -74,6 +78,9 @@
   void applyMed3(MachineInstr &MI, Med3MatchInfo &MatchInfo);
   void applyClamp(MachineInstr &MI, Register &Reg);
 
+  bool matchSimpleCopyHoisting(MachineInstr &MI, MachineInstr *&MatchInfo);
+  void applySimpleCopyHoisting(MachineInstr &MI, MachineInstr *&MatchInfo);
+
 private:
   AMDGPU::SIModeRegisterDefaults getMode();
   bool getIEEE();
@@ -87,6 +94,10 @@
   return RBI.getRegBank(Reg, MRI, TRI)->getID() == AMDGPU::VGPRRegBankID;
 }
 
+bool AMDGPURegBankCombinerHelper::isSgprRegBank(Register Reg) {
+  return RBI.getRegBank(Reg, MRI, TRI)->getID() == AMDGPU::SGPRRegBankID;
+}
+
 Register AMDGPURegBankCombinerHelper::getAsVgpr(Register Reg) {
   if (isVgprRegBank(Reg))
     return Reg;
@@ -94,8 +105,11 @@
   // Search for existing copy of Reg to vgpr.
   for (MachineInstr &Use : MRI.use_instructions(Reg)) {
     Register Def = Use.getOperand(0).getReg();
-    if (Use.getOpcode() == AMDGPU::COPY && isVgprRegBank(Def))
+    if (Use.getOpcode() == AMDGPU::COPY && isVgprRegBank(Def)) {
+      // Make sure the use dominates the insertion point.
+      Use.moveBefore(&*B.getInsertPt());
       return Def;
+    }
   }
 
   // Copy Reg to vgpr.
@@ -104,6 +118,28 @@
   return VgprReg;
 }
 
+bool AMDGPURegBankCombinerHelper::isVgprToSgprCopy(const MachineInstr &MI) {
+  if (MI.getOpcode() != AMDGPU::COPY) {
+    return false;
+  }
+
+  Register DstReg = MI.getOperand(0).getReg();
+  Register SrcReg = MI.getOperand(1).getReg();
+  return isVgprRegBank(SrcReg) && isSgprRegBank(DstReg) &&
+         MRI.getType(DstReg) == MRI.getType(SrcReg);
+}
+
+bool AMDGPURegBankCombinerHelper::isSgprToVgprCopy(const MachineInstr &MI) {
+  if (MI.getOpcode() != AMDGPU::COPY) {
+    return false;
+  }
+
+  Register DstReg = MI.getOperand(0).getReg();
+  Register SrcReg = MI.getOperand(1).getReg();
+  return isSgprRegBank(SrcReg) && isVgprRegBank(DstReg) &&
+         MRI.getType(DstReg) == MRI.getType(SrcReg);
+}
+
 AMDGPURegBankCombinerHelper::MinMaxMedOpc
 AMDGPURegBankCombinerHelper::getMinMaxPair(unsigned Opc) {
   switch (Opc) {
@@ -328,6 +364,147 @@
   MI.eraseFromParent();
 }
 
+static bool IsEligibleForSimpleCopyHoisting(MachineInstr &MI) {
+
+  // This combine is currently targeted at improving generation of BFI
+  // instructions, where copies can be inserted in-between a chain of
+  // bitwise operations, preventing pattern matching.
+  //
+  // More operations can be added if needed for other purposes, as long
+  // as the copy hoisting combine (which can transform an operation that
+  // uses/returns an SGPR to use VGPRs instead) won't break them.
+  //
+  // In short, these instructions should be able to run on both the SALU and
+  // the VALU.
+  switch (MI.getOpcode()) {
+  case AMDGPU::G_XOR:
+  case AMDGPU::G_AND:
+  case AMDGPU::G_OR:
+    return true;
+  default:
+    return false;
+  }
+}
+
+bool AMDGPURegBankCombinerHelper::matchSimpleCopyHoisting(
+    MachineInstr &MI, MachineInstr *&MatchInfo) {
+  // When we have an SGPR -> VGPR copy whose input is defined by an eligible
+  // instruction, we may be able to hoist the copy to the operands of the
+  // instruction that defines the COPY's input.
+  //
+  // For this combine to be applied, the following criteria must be satisfied:
+  //   - The COPY input must be eligible according to
+  //     `IsEligibleForSimpleCopyHoisting`.
+  //   - The COPY must be the only user of its input register.
+  //   - The COPY's output register must only be used by another eligible
+  //     instruction.
+  //   - There are also some additional constraints on the operands of the
+  //     COPY input's defining instruction. For instance, some defining
+  //     instructions are disallowed to avoid cases where hoisting the copy
+  //     would worsen codegen.
+  //
+  // For example, in:
+  //
+  //   %4:sgpr(s32) = G_XOR %1:sgpr, %2:sgpr
+  //   %7:vgpr(s32) = COPY %4:sgpr(s32)
+  //   %5:vgpr(s32) = G_AND %0:vgpr, %7:vgpr
+  //
+  // %4 is only used by the COPY, G_XOR has only SGPR operands and
+  // G_AND has only VGPR operands. We can hoist the copy out into the G_XOR
+  // operands to make both instructions use all-VGPR operands, making
+  // matching easier in GISel:
+  //
+  //   %4:vgpr(s32) = COPY %1:sgpr
+  //   %5:vgpr(s32) = COPY %2:sgpr
+  //   %6:vgpr(s32) = G_XOR %4:vgpr, %5:vgpr
+  //   %7:vgpr(s32) = G_AND %0:vgpr, %6:vgpr
+  //
+  // Now G_XOR and G_AND chain neatly together, making pattern matching
+  // easier.
+
+  Register DstReg = MI.getOperand(0).getReg();
+  Register InputReg = MI.getOperand(1).getReg();
+
+  // Check that this is a simple SGPR -> VGPR copy.
+  if (!isSgprToVgprCopy(MI)) {
+    return false;
+  }
+
+  // Input/Output register must have exactly one user.
+  if (!MRI.hasOneNonDBGUse(InputReg) || !MRI.hasOneNonDBGUse(DstReg)) {
+    return false;
+  }
+
+  // The instruction that defines the input register AND the instruction
+  // that uses the output register must both be considered eligible.
+  MachineInstr &InputInstr = *getDefIgnoringCopies(InputReg, MRI);
+  MachineInstr &DefUserInstr = *MRI.use_instr_begin(DstReg);
+  if (!IsEligibleForSimpleCopyHoisting(InputInstr) ||
+      !IsEligibleForSimpleCopyHoisting(DefUserInstr)) {
+    return false;
+  }
+
+  // Check all instructions are in the same basic block.
+  if (InputInstr.getParent() != MI.getParent() ||
+      MI.getParent() != DefUserInstr.getParent()) {
+    return false;
+  }
+
+  // Check that all input operands of InputInstr are SGPRs, and check
+  // for undesirable patterns.
+  for (std::size_t k = 1; k < InputInstr.getNumOperands(); ++k) {
+    MachineOperand &MO = InputInstr.getOperand(k);
+    Register Reg = MO.getReg();
+    if (!isSgprRegBank(Reg)) {
+      return false;
+    }
+
+    // G_CONSTANT is not allowed because it can prevent matching of
+    // common instructions, like s_not.
+    //
+    // G_BITCAST is not allowed because we assume that it's there for a
+    // reason, and forcing its result to be a VGPR when the destination
+    // of the BITCAST is an SGPR worsens codegen.
+    //
+    // A VGPR->SGPR COPY is not allowed for similar reasons (it is likely
+    // there for a reason and affects codegen), but also because it would
+    // introduce a pair of useless copies.
+    MachineInstr *Def = MRI.getVRegDef(Reg);
+    if (Def->getOpcode() == AMDGPU::G_CONSTANT ||
+        Def->getOpcode() == AMDGPU::G_BITCAST || isVgprToSgprCopy(*Def)) {
+      return false;
+    }
+  }
+
+  MatchInfo = &InputInstr;
+  return true;
+}
+
+void AMDGPURegBankCombinerHelper::applySimpleCopyHoisting(
+    MachineInstr &MI, MachineInstr *&MatchInfo) {
+  assert(&MI != MatchInfo);
+
+  B.setInstrAndDebugLoc(*MatchInfo);
+  const RegisterBank &VGPRRegBank = RBI.getRegBank(AMDGPU::VGPRRegBankID);
+
+  // Make all operands of MatchInfo into VGPRs.
+  for (std::size_t k = 1; k < MatchInfo->getNumOperands(); ++k) {
+    MachineOperand &MO = MatchInfo->getOperand(k);
+    Register Reg = MO.getReg();
+    assert(isSgprRegBank(Reg));
+    MO.setReg(getAsVgpr(Reg));
+  }
+
+  // Make the result of MatchInfo into a VGPR.
+  Register MatchInfoDst = MatchInfo->getOperand(0).getReg();
+  MRI.setRegBank(MatchInfoDst, VGPRRegBank);
+
+  // Replace all users of the COPY's result with MatchInfoDst.
+  MRI.replaceRegWith(MI.getOperand(0).getReg(), MatchInfoDst);
+
+  // Eliminate the copy.
+  MI.removeFromParent();
+}
+
 AMDGPU::SIModeRegisterDefaults AMDGPURegBankCombinerHelper::getMode() {
   return MF.getInfo<SIMachineFunctionInfo>()->getMode();
 }
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -643,9 +643,31 @@
                                                 Register Reg) const {
   assert(HalfTy.getSizeInBits() == 32);
   MachineRegisterInfo *MRI = B.getMRI();
+  const RegisterBank *Bank = getRegBank(Reg, *MRI, *TRI);
+
+  // Check if Reg is already defined by a G_MERGE_VALUES/G_BUILD_VECTOR; if
+  // so, don't bother generating an unmerge and just reuse its operands.
+  MachineInstr *RegInst = getDefIgnoringCopies(Reg, *MRI);
+  if ((RegInst->getOpcode() == AMDGPU::G_MERGE_VALUES ||
+       RegInst->getOpcode() == AMDGPU::G_BUILD_VECTOR) &&
+      RegInst->getNumOperands() == 3) {
+    Register Lo = RegInst->getOperand(1).getReg();
+    Register Hi = RegInst->getOperand(2).getReg();
+
+    // FIXME: Do we need to insert copies to fix types?
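+    //
+    // Illustrative MIR sketch (hypothetical, not taken from the tests): given
+    //   %2:vgpr(s64) = G_MERGE_VALUES %0:vgpr(s32), %1:vgpr(s32)
+    // a request for the 32-bit halves of %2 can return %0 and %1 directly
+    // instead of emitting a new G_UNMERGE_VALUES of %2, provided both halves
+    // already live in the expected register bank.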
+ assert(MRI->getType(Lo) == HalfTy && MRI->getType(Hi) == HalfTy); + + if (MRI->getRegBankOrNull(Lo) == Bank && + MRI->getRegBankOrNull(Hi) == Bank) { + Regs.push_back(Lo); + Regs.push_back(Hi); + return; + } + } + Register LoLHS = MRI->createGenericVirtualRegister(HalfTy); Register HiLHS = MRI->createGenericVirtualRegister(HalfTy); - const RegisterBank *Bank = getRegBank(Reg, *MRI, *TRI); MRI->setRegBank(LoLHS, *Bank); MRI->setRegBank(HiLHS, *Bank); diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td --- a/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -1948,7 +1948,9 @@ // z ^ (x & (y ^ z)) def : AMDGPUPat < (DivergentBinFrag i32:$z, (and i32:$x, (xor i32:$y, i32:$z))), - (V_BFI_B32_e64 VSrc_b32:$x, VSrc_b32:$y, VSrc_b32:$z) + (V_BFI_B32_e64 (COPY_TO_REGCLASS VSrc_b32:$x, VGPR_32), + (COPY_TO_REGCLASS VSrc_b32:$y, VGPR_32), + (COPY_TO_REGCLASS VSrc_b32:$z, VGPR_32)) >; // 64-bit version @@ -2958,7 +2960,10 @@ def : AMDGPUPat < (DivergentBinFrag (and i32:$x, i32:$z), (and i32:$y, (or i32:$x, i32:$z))), - (V_BFI_B32_e64 (V_XOR_B32_e64 VSrc_b32:$x, VSrc_b32:$y), VSrc_b32:$z, VSrc_b32:$y) + (V_BFI_B32_e64 (V_XOR_B32_e64 (COPY_TO_REGCLASS VSrc_b32:$x, VGPR_32), + (COPY_TO_REGCLASS VSrc_b32:$y, VGPR_32)), + (COPY_TO_REGCLASS VSrc_b32:$z, VGPR_32), + (COPY_TO_REGCLASS VSrc_b32:$y, VGPR_32)) >; def : AMDGPUPat < diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll @@ -5248,8 +5248,7 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v4, v0 ; GFX9-NEXT: v_lshrrev_b64 v[0:1], 16, v[2:3] -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; GFX9-NEXT: v_or_b32_e32 v1, v2, v1 +; GFX9-NEXT: v_lshl_or_b32 v1, v4, 16, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fshl_i64_48: @@ -5258,8 +5257,7 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_mov_b32_e32 v4, v0 ; GFX10-NEXT: v_lshrrev_b64 v[0:1], 16, v[2:3] -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; GFX10-NEXT: v_or_b32_e32 v1, v2, v1 +; GFX10-NEXT: v_lshl_or_b32 v1, v4, 16, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_fshl_i64_48: @@ -5268,9 +5266,8 @@ ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: v_mov_b32_e32 v4, v0 ; GFX11-NEXT: v_lshrrev_b64 v[0:1], 16, v[2:3] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; GFX11-NEXT: v_or_b32_e32 v1, v2, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_lshl_or_b32 v1, v4, 16, v1 ; GFX11-NEXT: s_setpc_b64 s[30:31] %result = call i64 @llvm.fshl.i64(i64 %lhs, i64 %rhs, i64 48) ret i64 %result @@ -6023,14 +6020,13 @@ ; GFX9-NEXT: v_cndmask_b32_e32 v10, 0, v12, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v11, 0, v13, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v8, v1, v9, vcc ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v14 ; GFX9-NEXT: v_cndmask_b32_e32 v12, v0, v2, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v13, v1, v3, vcc ; GFX9-NEXT: v_lshrrev_b64 v[0:1], 1, v[4:5] -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 31, v6 -; GFX9-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v13, v8, v3, vcc ; GFX9-NEXT: v_lshrrev_b64 v[2:3], 1, v[6:7] +; GFX9-NEXT: v_lshl_or_b32 v1, v6, 31, v1 ; GFX9-NEXT: v_sub_u32_e32 v6, 64, v15 ; GFX9-NEXT: v_subrev_u32_e32 
v14, 64, v15 ; GFX9-NEXT: v_lshrrev_b64 v[4:5], v15, v[0:1] @@ -6060,103 +6056,97 @@ ; GFX10-NEXT: v_and_b32_e32 v18, 0x7f, v8 ; GFX10-NEXT: v_xor_b32_e32 v8, -1, v8 ; GFX10-NEXT: v_lshrrev_b64 v[4:5], 1, v[4:5] -; GFX10-NEXT: v_lshlrev_b32_e32 v12, 31, v6 -; GFX10-NEXT: v_lshrrev_b64 v[6:7], 1, v[6:7] -; GFX10-NEXT: v_sub_nc_u32_e32 v9, 64, v18 +; GFX10-NEXT: v_lshrrev_b64 v[12:13], 1, v[6:7] +; GFX10-NEXT: v_sub_nc_u32_e32 v10, 64, v18 ; GFX10-NEXT: v_and_b32_e32 v19, 0x7f, v8 -; GFX10-NEXT: v_lshlrev_b64 v[10:11], v18, v[2:3] -; GFX10-NEXT: v_or_b32_e32 v5, v5, v12 +; GFX10-NEXT: v_lshlrev_b64 v[8:9], v18, v[2:3] +; GFX10-NEXT: v_lshl_or_b32 v5, v6, 31, v5 ; GFX10-NEXT: v_subrev_nc_u32_e32 v20, 64, v18 -; GFX10-NEXT: v_lshrrev_b64 v[8:9], v9, v[0:1] +; GFX10-NEXT: v_lshrrev_b64 v[10:11], v10, v[0:1] ; GFX10-NEXT: v_sub_nc_u32_e32 v16, 64, v19 -; GFX10-NEXT: v_lshlrev_b64 v[12:13], v18, v[0:1] +; GFX10-NEXT: v_lshlrev_b64 v[6:7], v18, v[0:1] ; GFX10-NEXT: v_lshrrev_b64 v[14:15], v19, v[4:5] ; GFX10-NEXT: v_lshlrev_b64 v[0:1], v20, v[0:1] ; GFX10-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v18 -; GFX10-NEXT: v_or_b32_e32 v10, v8, v10 +; GFX10-NEXT: v_or_b32_e32 v10, v10, v8 ; GFX10-NEXT: v_subrev_nc_u32_e32 v8, 64, v19 -; GFX10-NEXT: v_lshlrev_b64 v[16:17], v16, v[6:7] -; GFX10-NEXT: v_or_b32_e32 v11, v9, v11 +; GFX10-NEXT: v_lshlrev_b64 v[16:17], v16, v[12:13] +; GFX10-NEXT: v_or_b32_e32 v11, v11, v9 ; GFX10-NEXT: v_cmp_gt_u32_e64 s4, 64, v19 ; GFX10-NEXT: v_cndmask_b32_e32 v10, v0, v10, vcc_lo -; GFX10-NEXT: v_lshrrev_b64 v[8:9], v8, v[6:7] +; GFX10-NEXT: v_lshrrev_b64 v[8:9], v8, v[12:13] ; GFX10-NEXT: v_cmp_eq_u32_e64 s5, 0, v19 ; GFX10-NEXT: v_or_b32_e32 v14, v14, v16 ; GFX10-NEXT: v_or_b32_e32 v15, v15, v17 ; GFX10-NEXT: v_cndmask_b32_e32 v11, v1, v11, vcc_lo -; GFX10-NEXT: v_lshrrev_b64 v[0:1], v19, v[6:7] +; GFX10-NEXT: v_lshrrev_b64 v[0:1], v19, v[12:13] ; GFX10-NEXT: v_cmp_eq_u32_e64 s6, 0, v18 ; GFX10-NEXT: v_cndmask_b32_e64 v8, v8, v14, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v6, v9, v15, s4 -; GFX10-NEXT: v_cndmask_b32_e32 v12, 0, v12, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v7, 0, v13, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v9, v9, v15, s4 +; GFX10-NEXT: v_cndmask_b32_e32 v6, 0, v6, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v7, 0, v7, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v2, v10, v2, s6 ; GFX10-NEXT: v_cndmask_b32_e64 v3, v11, v3, s6 ; GFX10-NEXT: v_cndmask_b32_e64 v4, v8, v4, s5 -; GFX10-NEXT: v_cndmask_b32_e64 v5, v6, v5, s5 -; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, v0, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, v1, s4 -; GFX10-NEXT: v_or_b32_e32 v0, v12, v4 +; GFX10-NEXT: v_cndmask_b32_e64 v5, v9, v5, s5 +; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, v0, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, v1, s4 +; GFX10-NEXT: v_or_b32_e32 v0, v6, v4 ; GFX10-NEXT: v_or_b32_e32 v1, v7, v5 -; GFX10-NEXT: v_or_b32_e32 v2, v2, v6 -; GFX10-NEXT: v_or_b32_e32 v3, v3, v8 +; GFX10-NEXT: v_or_b32_e32 v2, v2, v8 +; GFX10-NEXT: v_or_b32_e32 v3, v3, v9 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_fshl_i128: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_lshrrev_b64 v[4:5], 1, v[4:5] ; GFX11-NEXT: v_and_b32_e32 v18, 0x7f, v8 ; GFX11-NEXT: v_xor_b32_e32 v8, -1, v8 -; GFX11-NEXT: v_lshrrev_b64 v[4:5], 1, v[4:5] +; GFX11-NEXT: v_lshrrev_b64 v[12:13], 1, v[6:7] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-NEXT: v_sub_nc_u32_e32 v9, 64, v18 -; GFX11-NEXT: v_lshlrev_b64 v[10:11], v18, v[2:3] -; GFX11-NEXT: 
v_subrev_nc_u32_e32 v20, 64, v18 +; GFX11-NEXT: v_sub_nc_u32_e32 v10, 64, v18 +; GFX11-NEXT: v_lshl_or_b32 v5, v6, 31, v5 +; GFX11-NEXT: v_lshlrev_b64 v[6:7], v18, v[0:1] ; GFX11-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v18 -; GFX11-NEXT: v_lshlrev_b32_e32 v12, 31, v6 ; GFX11-NEXT: v_and_b32_e32 v19, 0x7f, v8 -; GFX11-NEXT: v_lshrrev_b64 v[8:9], v9, v[0:1] -; GFX11-NEXT: v_lshrrev_b64 v[6:7], 1, v[6:7] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_or_b32_e32 v5, v5, v12 -; GFX11-NEXT: v_lshlrev_b64 v[12:13], v18, v[0:1] -; GFX11-NEXT: v_lshlrev_b64 v[0:1], v20, v[0:1] -; GFX11-NEXT: v_or_b32_e32 v10, v8, v10 -; GFX11-NEXT: v_or_b32_e32 v11, v9, v11 -; GFX11-NEXT: v_cndmask_b32_e32 v12, 0, v12, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-NEXT: v_cndmask_b32_e32 v10, v0, v10, vcc_lo +; GFX11-NEXT: v_lshlrev_b64 v[8:9], v18, v[2:3] +; GFX11-NEXT: v_lshrrev_b64 v[10:11], v10, v[0:1] +; GFX11-NEXT: v_subrev_nc_u32_e32 v20, 64, v18 +; GFX11-NEXT: v_cndmask_b32_e32 v6, 0, v6, vcc_lo ; GFX11-NEXT: v_sub_nc_u32_e32 v16, 64, v19 -; GFX11-NEXT: v_subrev_nc_u32_e32 v8, 64, v19 ; GFX11-NEXT: v_lshrrev_b64 v[14:15], v19, v[4:5] ; GFX11-NEXT: v_cmp_gt_u32_e64 s0, 64, v19 -; GFX11-NEXT: v_cndmask_b32_e32 v11, v1, v11, vcc_lo -; GFX11-NEXT: v_lshlrev_b64 v[16:17], v16, v[6:7] -; GFX11-NEXT: v_lshrrev_b64 v[8:9], v8, v[6:7] -; GFX11-NEXT: v_lshrrev_b64 v[0:1], v19, v[6:7] +; GFX11-NEXT: v_or_b32_e32 v10, v10, v8 +; GFX11-NEXT: v_subrev_nc_u32_e32 v8, 64, v19 +; GFX11-NEXT: v_lshlrev_b64 v[16:17], v16, v[12:13] +; GFX11-NEXT: v_lshlrev_b64 v[0:1], v20, v[0:1] +; GFX11-NEXT: v_or_b32_e32 v11, v11, v9 ; GFX11-NEXT: v_cmp_eq_u32_e64 s1, 0, v19 -; GFX11-NEXT: v_cmp_eq_u32_e64 s2, 0, v18 -; GFX11-NEXT: v_cndmask_b32_e32 v7, 0, v13, vcc_lo +; GFX11-NEXT: v_lshrrev_b64 v[8:9], v8, v[12:13] +; GFX11-NEXT: v_cndmask_b32_e32 v7, 0, v7, vcc_lo ; GFX11-NEXT: v_or_b32_e32 v14, v14, v16 ; GFX11-NEXT: v_or_b32_e32 v15, v15, v17 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b32_e64 v2, v10, v2, s2 -; GFX11-NEXT: v_cndmask_b32_e64 v3, v11, v3, s2 +; GFX11-NEXT: v_dual_cndmask_b32 v10, v0, v10 :: v_dual_cndmask_b32 v11, v1, v11 +; GFX11-NEXT: v_lshrrev_b64 v[0:1], v19, v[12:13] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_cndmask_b32_e64 v8, v8, v14, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e64 v6, v9, v15, s0 +; GFX11-NEXT: v_cmp_eq_u32_e64 s2, 0, v18 +; GFX11-NEXT: v_cndmask_b32_e64 v9, v9, v15, s0 ; GFX11-NEXT: v_cndmask_b32_e64 v4, v8, v4, s1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b32_e64 v5, v6, v5, s1 -; GFX11-NEXT: v_cndmask_b32_e64 v6, 0, v0, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v8, 0, v1, s0 -; GFX11-NEXT: v_or_b32_e32 v0, v12, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e64 v2, v10, v2, s2 +; GFX11-NEXT: v_cndmask_b32_e64 v3, v11, v3, s2 +; GFX11-NEXT: v_cndmask_b32_e64 v5, v9, v5, s1 +; GFX11-NEXT: v_cndmask_b32_e64 v8, 0, v0, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v9, 0, v1, s0 +; GFX11-NEXT: v_or_b32_e32 v0, v6, v4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_or_b32_e32 v1, v7, v5 -; GFX11-NEXT: v_or_b32_e32 v2, v2, v6 +; GFX11-NEXT: 
v_or_b32_e32 v2, v2, v8 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_or_b32_e32 v3, v3, v8 +; GFX11-NEXT: v_or_b32_e32 v3, v3, v9 ; GFX11-NEXT: s_setpc_b64 s[30:31] %result = call i128 @llvm.fshl.i128(i128 %lhs, i128 %rhs, i128 %amt) ret i128 %result @@ -6556,16 +6546,15 @@ ; GFX9-NEXT: s_or_b64 s[8:9], s[10:11], s[8:9] ; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], s5 ; GFX9-NEXT: s_cmp_lg_u32 s12, 0 +; GFX9-NEXT: v_lshrrev_b64 v[0:1], 1, v[0:1] ; GFX9-NEXT: s_cselect_b64 s[6:7], s[6:7], 0 ; GFX9-NEXT: s_cselect_b64 s[0:1], s[8:9], s[0:1] ; GFX9-NEXT: s_cmp_lg_u32 s13, 0 -; GFX9-NEXT: v_lshrrev_b64 v[0:1], 1, v[0:1] ; GFX9-NEXT: s_cselect_b64 s[0:1], s[2:3], s[0:1] -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 31, v2 +; GFX9-NEXT: v_lshl_or_b32 v1, v2, 31, v1 ; GFX9-NEXT: v_lshrrev_b64 v[2:3], 1, v[2:3] ; GFX9-NEXT: s_sub_i32 s2, s4, 64 ; GFX9-NEXT: s_sub_i32 s3, 64, s4 -; GFX9-NEXT: v_or_b32_e32 v1, v1, v4 ; GFX9-NEXT: s_cmp_lt_u32 s4, 64 ; GFX9-NEXT: s_cselect_b32 s5, 1, 0 ; GFX9-NEXT: s_cmp_eq_u32 s4, 0 @@ -6603,7 +6592,7 @@ ; GFX10-NEXT: s_sub_i32 s5, s8, 64 ; GFX10-NEXT: s_sub_i32 s6, 64, s8 ; GFX10-NEXT: s_cmp_lt_u32 s8, 64 -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 31, v2 +; GFX10-NEXT: v_lshl_or_b32 v1, v2, 31, v1 ; GFX10-NEXT: s_cselect_b32 s12, 1, 0 ; GFX10-NEXT: s_cmp_eq_u32 s8, 0 ; GFX10-NEXT: v_lshrrev_b64 v[2:3], 1, v[2:3] @@ -6614,13 +6603,12 @@ ; GFX10-NEXT: s_or_b64 s[6:7], s[6:7], s[10:11] ; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], s5 ; GFX10-NEXT: s_cmp_lg_u32 s12, 0 -; GFX10-NEXT: v_or_b32_e32 v1, v1, v4 +; GFX10-NEXT: v_lshrrev_b64 v[4:5], s4, v[0:1] ; GFX10-NEXT: s_cselect_b64 s[8:9], s[8:9], 0 ; GFX10-NEXT: s_cselect_b64 s[0:1], s[6:7], s[0:1] ; GFX10-NEXT: s_cmp_lg_u32 s13, 0 ; GFX10-NEXT: s_cselect_b64 s[2:3], s[2:3], s[0:1] ; GFX10-NEXT: s_sub_i32 s0, 64, s4 -; GFX10-NEXT: v_lshrrev_b64 v[4:5], s4, v[0:1] ; GFX10-NEXT: v_lshlrev_b64 v[6:7], s0, v[2:3] ; GFX10-NEXT: s_sub_i32 s0, s4, 64 ; GFX10-NEXT: s_cmp_lt_u32 s4, 64 @@ -6658,7 +6646,7 @@ ; GFX11-NEXT: s_sub_i32 s5, s8, 64 ; GFX11-NEXT: s_sub_i32 s6, 64, s8 ; GFX11-NEXT: s_cmp_lt_u32 s8, 64 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 31, v2 +; GFX11-NEXT: v_lshl_or_b32 v1, v2, 31, v1 ; GFX11-NEXT: s_cselect_b32 s12, 1, 0 ; GFX11-NEXT: s_cmp_eq_u32 s8, 0 ; GFX11-NEXT: v_lshrrev_b64 v[2:3], 1, v[2:3] @@ -6669,13 +6657,13 @@ ; GFX11-NEXT: s_or_b64 s[6:7], s[6:7], s[10:11] ; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], s5 ; GFX11-NEXT: s_cmp_lg_u32 s12, 0 -; GFX11-NEXT: v_or_b32_e32 v1, v1, v4 +; GFX11-NEXT: v_lshrrev_b64 v[4:5], s4, v[0:1] ; GFX11-NEXT: s_cselect_b64 s[8:9], s[8:9], 0 ; GFX11-NEXT: s_cselect_b64 s[0:1], s[6:7], s[0:1] ; GFX11-NEXT: s_cmp_lg_u32 s13, 0 ; GFX11-NEXT: s_cselect_b64 s[2:3], s[2:3], s[0:1] ; GFX11-NEXT: s_sub_i32 s0, 64, s4 -; GFX11-NEXT: v_lshrrev_b64 v[4:5], s4, v[0:1] +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_lshlrev_b64 v[6:7], s0, v[2:3] ; GFX11-NEXT: s_sub_i32 s0, s4, 64 ; GFX11-NEXT: s_cmp_lt_u32 s4, 64 @@ -6690,16 +6678,16 @@ ; GFX11-NEXT: s_and_b32 s0, 1, s5 ; GFX11-NEXT: s_and_b32 s1, 1, s1 ; GFX11-NEXT: v_lshrrev_b64 v[2:3], s4, v[2:3] -; GFX11-NEXT: v_dual_cndmask_b32 v5, v9, v5 :: v_dual_cndmask_b32 v4, v8, v4 +; GFX11-NEXT: v_dual_cndmask_b32 v4, v8, v4 :: v_dual_cndmask_b32 v5, v9, v5 ; GFX11-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 ; GFX11-NEXT: v_cmp_ne_u32_e64 s0, 0, s1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_dual_cndmask_b32 v1, v5, v1 :: v_dual_cndmask_b32 v0, v4, v0 +; GFX11-NEXT: v_dual_cndmask_b32 v0, 
v4, v0 :: v_dual_cndmask_b32 v1, v5, v1 ; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, v2, s0 ; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, v3, s0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_or_b32_e32 v1, s9, v1 ; GFX11-NEXT: v_or_b32_e32 v0, s8, v0 +; GFX11-NEXT: v_or_b32_e32 v1, s9, v1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_or_b32_e32 v2, s2, v2 ; GFX11-NEXT: v_or_b32_e32 v3, s3, v3 @@ -7746,17 +7734,16 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_and_b32_e32 v23, 0x7f, v16 +; GFX9-NEXT: v_lshrrev_b64 v[8:9], 1, v[8:9] ; GFX9-NEXT: v_sub_u32_e32 v17, 64, v23 +; GFX9-NEXT: v_xor_b32_e32 v16, -1, v16 ; GFX9-NEXT: v_lshrrev_b64 v[17:18], v17, v[0:1] ; GFX9-NEXT: v_lshlrev_b64 v[21:22], v23, v[2:3] -; GFX9-NEXT: v_lshrrev_b64 v[8:9], 1, v[8:9] -; GFX9-NEXT: v_xor_b32_e32 v16, -1, v16 -; GFX9-NEXT: v_or_b32_e32 v21, v17, v21 -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 31, v10 ; GFX9-NEXT: v_and_b32_e32 v24, 0x7f, v16 +; GFX9-NEXT: v_lshl_or_b32 v9, v10, 31, v9 ; GFX9-NEXT: v_lshrrev_b64 v[10:11], 1, v[10:11] -; GFX9-NEXT: v_or_b32_e32 v9, v9, v17 ; GFX9-NEXT: v_sub_u32_e32 v16, 64, v24 +; GFX9-NEXT: v_or_b32_e32 v21, v17, v21 ; GFX9-NEXT: v_or_b32_e32 v22, v18, v22 ; GFX9-NEXT: v_lshlrev_b64 v[16:17], v16, v[10:11] ; GFX9-NEXT: v_lshrrev_b64 v[18:19], v24, v[8:9] @@ -7803,22 +7790,21 @@ ; GFX9-NEXT: v_cndmask_b32_e32 v18, 0, v8, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v19, 0, v9, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v10, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v11, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v8, v5, v11, vcc ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v16 ; GFX9-NEXT: v_cndmask_b32_e32 v16, v4, v6, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v20, v5, v7, vcc ; GFX9-NEXT: v_lshrrev_b64 v[4:5], 1, v[12:13] -; GFX9-NEXT: v_lshlrev_b32_e32 v6, 31, v14 -; GFX9-NEXT: v_or_b32_e32 v5, v5, v6 +; GFX9-NEXT: v_cndmask_b32_e32 v12, v8, v7, vcc ; GFX9-NEXT: v_lshrrev_b64 v[6:7], 1, v[14:15] +; GFX9-NEXT: v_lshl_or_b32 v5, v14, 31, v5 ; GFX9-NEXT: v_sub_u32_e32 v10, 64, v17 ; GFX9-NEXT: v_lshrrev_b64 v[8:9], v17, v[4:5] ; GFX9-NEXT: v_lshlrev_b64 v[10:11], v10, v[6:7] -; GFX9-NEXT: v_subrev_u32_e32 v12, 64, v17 +; GFX9-NEXT: v_subrev_u32_e32 v13, 64, v17 ; GFX9-NEXT: v_or_b32_e32 v10, v8, v10 ; GFX9-NEXT: v_or_b32_e32 v11, v9, v11 ; GFX9-NEXT: v_lshrrev_b64 v[8:9], v17, v[6:7] -; GFX9-NEXT: v_lshrrev_b64 v[6:7], v12, v[6:7] +; GFX9-NEXT: v_lshrrev_b64 v[6:7], v13, v[6:7] ; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v17 ; GFX9-NEXT: v_cndmask_b32_e32 v6, v6, v10, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v7, v7, v11, vcc @@ -7830,7 +7816,7 @@ ; GFX9-NEXT: v_or_b32_e32 v4, v18, v4 ; GFX9-NEXT: v_or_b32_e32 v5, v19, v5 ; GFX9-NEXT: v_or_b32_e32 v6, v16, v6 -; GFX9-NEXT: v_or_b32_e32 v7, v20, v7 +; GFX9-NEXT: v_or_b32_e32 v7, v12, v7 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fshl_v2i128: @@ -7840,76 +7826,74 @@ ; GFX10-NEXT: v_and_b32_e32 v27, 0x7f, v16 ; GFX10-NEXT: v_xor_b32_e32 v16, -1, v16 ; GFX10-NEXT: v_lshrrev_b64 v[8:9], 1, v[8:9] -; GFX10-NEXT: v_lshlrev_b32_e32 v21, 31, v10 -; GFX10-NEXT: v_lshrrev_b64 v[10:11], 1, v[10:11] ; GFX10-NEXT: v_sub_nc_u32_e32 v17, 64, v27 ; GFX10-NEXT: v_and_b32_e32 v28, 0x7f, v16 ; GFX10-NEXT: v_lshlrev_b64 v[18:19], v27, v[2:3] -; GFX10-NEXT: v_or_b32_e32 v9, v9, v21 -; GFX10-NEXT: v_subrev_nc_u32_e32 v29, 64, v27 +; GFX10-NEXT: v_lshl_or_b32 v9, v10, 31, v9 +; GFX10-NEXT: v_lshrrev_b64 v[10:11], 1, v[10:11] ; GFX10-NEXT: 
v_lshrrev_b64 v[16:17], v17, v[0:1] ; GFX10-NEXT: v_sub_nc_u32_e32 v25, 64, v28 -; GFX10-NEXT: v_lshlrev_b64 v[21:22], v27, v[0:1] +; GFX10-NEXT: v_subrev_nc_u32_e32 v29, 64, v27 ; GFX10-NEXT: v_lshrrev_b64 v[23:24], v28, v[8:9] -; GFX10-NEXT: v_lshlrev_b64 v[0:1], v29, v[0:1] +; GFX10-NEXT: v_lshlrev_b64 v[21:22], v27, v[0:1] ; GFX10-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v27 ; GFX10-NEXT: v_or_b32_e32 v18, v16, v18 ; GFX10-NEXT: v_subrev_nc_u32_e32 v16, 64, v28 ; GFX10-NEXT: v_lshlrev_b64 v[25:26], v25, v[10:11] +; GFX10-NEXT: v_lshlrev_b64 v[0:1], v29, v[0:1] ; GFX10-NEXT: v_or_b32_e32 v19, v17, v19 ; GFX10-NEXT: v_cmp_gt_u32_e64 s4, 64, v28 -; GFX10-NEXT: v_cmp_eq_u32_e64 s5, 0, v28 ; GFX10-NEXT: v_lshrrev_b64 v[16:17], v16, v[10:11] -; GFX10-NEXT: v_cmp_eq_u32_e64 s6, 0, v27 +; GFX10-NEXT: v_cndmask_b32_e32 v21, 0, v21, vcc_lo ; GFX10-NEXT: v_or_b32_e32 v23, v23, v25 -; GFX10-NEXT: v_or_b32_e32 v24, v24, v26 -; GFX10-NEXT: v_cndmask_b32_e32 v19, v1, v19, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v18, v0, v18, vcc_lo -; GFX10-NEXT: v_lshrrev_b64 v[0:1], v28, v[10:11] +; GFX10-NEXT: v_or_b32_e32 v0, v24, v26 +; GFX10-NEXT: v_cndmask_b32_e32 v22, 0, v22, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v19, v1, v19, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v16, v16, v23, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v10, v17, v24, s4 -; GFX10-NEXT: v_cndmask_b32_e32 v21, 0, v21, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v11, 0, v22, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v22, v19, v3, s6 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v27 +; GFX10-NEXT: v_cmp_eq_u32_e64 s5, 0, v28 +; GFX10-NEXT: v_cndmask_b32_e64 v17, v17, v0, s4 +; GFX10-NEXT: v_lshrrev_b64 v[0:1], v28, v[10:11] +; GFX10-NEXT: v_and_b32_e32 v24, 0x7f, v20 +; GFX10-NEXT: v_cndmask_b32_e32 v23, v19, v3, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v3, v16, v8, s5 -; GFX10-NEXT: v_cndmask_b32_e64 v2, v18, v2, s6 -; GFX10-NEXT: v_cndmask_b32_e64 v8, v10, v9, s5 -; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, v0, s4 -; GFX10-NEXT: v_and_b32_e32 v23, 0x7f, v20 +; GFX10-NEXT: v_cndmask_b32_e64 v8, v17, v9, s5 +; GFX10-NEXT: v_cndmask_b32_e32 v2, v18, v2, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v10, 0, v0, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v25, 0, v1, s4 ; GFX10-NEXT: v_or_b32_e32 v0, v21, v3 ; GFX10-NEXT: v_xor_b32_e32 v3, -1, v20 -; GFX10-NEXT: v_cndmask_b32_e64 v24, 0, v1, s4 -; GFX10-NEXT: v_or_b32_e32 v1, v11, v8 -; GFX10-NEXT: v_sub_nc_u32_e32 v10, 64, v23 -; GFX10-NEXT: v_or_b32_e32 v2, v2, v9 +; GFX10-NEXT: v_or_b32_e32 v1, v22, v8 ; GFX10-NEXT: v_lshrrev_b64 v[8:9], 1, v[12:13] -; GFX10-NEXT: v_lshlrev_b32_e32 v16, 31, v14 -; GFX10-NEXT: v_and_b32_e32 v25, 0x7f, v3 -; GFX10-NEXT: v_lshrrev_b64 v[10:11], v10, v[4:5] -; GFX10-NEXT: v_lshlrev_b64 v[12:13], v23, v[6:7] +; GFX10-NEXT: v_sub_nc_u32_e32 v11, 64, v24 +; GFX10-NEXT: v_or_b32_e32 v2, v2, v10 +; GFX10-NEXT: v_and_b32_e32 v22, 0x7f, v3 +; GFX10-NEXT: v_lshlrev_b64 v[12:13], v24, v[6:7] +; GFX10-NEXT: v_subrev_nc_u32_e32 v3, 64, v24 +; GFX10-NEXT: v_lshrrev_b64 v[10:11], v11, v[4:5] +; GFX10-NEXT: v_lshl_or_b32 v9, v14, 31, v9 ; GFX10-NEXT: v_lshrrev_b64 v[14:15], 1, v[14:15] -; GFX10-NEXT: v_or_b32_e32 v9, v9, v16 -; GFX10-NEXT: v_sub_nc_u32_e32 v20, 64, v25 -; GFX10-NEXT: v_subrev_nc_u32_e32 v3, 64, v23 -; GFX10-NEXT: v_lshlrev_b64 v[16:17], v23, v[4:5] +; GFX10-NEXT: v_sub_nc_u32_e32 v20, 64, v22 +; GFX10-NEXT: v_lshlrev_b64 v[16:17], v24, v[4:5] +; GFX10-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v24 ; GFX10-NEXT: v_or_b32_e32 v12, v10, v12 -; GFX10-NEXT: v_subrev_nc_u32_e32 v10, 64, v25 -; GFX10-NEXT: 
v_lshrrev_b64 v[18:19], v25, v[8:9] +; GFX10-NEXT: v_subrev_nc_u32_e32 v10, 64, v22 +; GFX10-NEXT: v_lshrrev_b64 v[18:19], v22, v[8:9] ; GFX10-NEXT: v_lshlrev_b64 v[20:21], v20, v[14:15] -; GFX10-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v23 ; GFX10-NEXT: v_lshlrev_b64 v[3:4], v3, v[4:5] ; GFX10-NEXT: v_or_b32_e32 v5, v11, v13 ; GFX10-NEXT: v_lshrrev_b64 v[10:11], v10, v[14:15] -; GFX10-NEXT: v_cmp_gt_u32_e64 s4, 64, v25 ; GFX10-NEXT: v_cndmask_b32_e32 v13, 0, v16, vcc_lo +; GFX10-NEXT: v_cmp_gt_u32_e64 s4, 64, v22 ; GFX10-NEXT: v_or_b32_e32 v16, v18, v20 ; GFX10-NEXT: v_or_b32_e32 v18, v19, v21 ; GFX10-NEXT: v_cndmask_b32_e32 v12, v3, v12, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v5, v4, v5, vcc_lo -; GFX10-NEXT: v_lshrrev_b64 v[3:4], v25, v[14:15] +; GFX10-NEXT: v_lshrrev_b64 v[3:4], v22, v[14:15] ; GFX10-NEXT: v_cndmask_b32_e64 v10, v10, v16, s4 -; GFX10-NEXT: v_cmp_eq_u32_e64 s5, 0, v25 -; GFX10-NEXT: v_cmp_eq_u32_e64 s6, 0, v23 +; GFX10-NEXT: v_cmp_eq_u32_e64 s5, 0, v22 +; GFX10-NEXT: v_cmp_eq_u32_e64 s6, 0, v24 ; GFX10-NEXT: v_cndmask_b32_e64 v11, v11, v18, s4 ; GFX10-NEXT: v_cndmask_b32_e32 v14, 0, v17, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v6, v12, v6, s6 @@ -7918,7 +7902,7 @@ ; GFX10-NEXT: v_cndmask_b32_e64 v8, v11, v9, s5 ; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, v3, s4 ; GFX10-NEXT: v_cndmask_b32_e64 v10, 0, v4, s4 -; GFX10-NEXT: v_or_b32_e32 v3, v22, v24 +; GFX10-NEXT: v_or_b32_e32 v3, v23, v25 ; GFX10-NEXT: v_or_b32_e32 v4, v13, v5 ; GFX10-NEXT: v_or_b32_e32 v5, v14, v8 ; GFX10-NEXT: v_or_b32_e32 v6, v6, v9 @@ -7929,89 +7913,93 @@ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: v_lshrrev_b64 v[8:9], 1, v[8:9] ; GFX11-NEXT: v_and_b32_e32 v27, 0x7f, v16 -; GFX11-NEXT: v_lshlrev_b32_e32 v21, 31, v10 ; GFX11-NEXT: v_xor_b32_e32 v16, -1, v16 -; GFX11-NEXT: v_lshrrev_b64 v[10:11], 1, v[10:11] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_sub_nc_u32_e32 v17, 64, v27 -; GFX11-NEXT: v_or_b32_e32 v9, v9, v21 +; GFX11-NEXT: v_lshrrev_b64 v[8:9], 1, v[8:9] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_lshlrev_b64 v[21:22], v27, v[0:1] ; GFX11-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v27 ; GFX11-NEXT: v_and_b32_e32 v28, 0x7f, v16 -; GFX11-NEXT: v_lshrrev_b64 v[16:17], v17, v[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_lshl_or_b32 v9, v10, 31, v9 +; GFX11-NEXT: v_lshrrev_b64 v[10:11], 1, v[10:11] +; GFX11-NEXT: v_dual_cndmask_b32 v21, 0, v21 :: v_dual_cndmask_b32 v22, 0, v22 +; GFX11-NEXT: v_sub_nc_u32_e32 v17, 64, v27 ; GFX11-NEXT: v_lshlrev_b64 v[18:19], v27, v[2:3] -; GFX11-NEXT: v_subrev_nc_u32_e32 v29, 64, v27 -; GFX11-NEXT: v_cndmask_b32_e32 v21, 0, v21, vcc_lo ; GFX11-NEXT: v_sub_nc_u32_e32 v25, 64, v28 +; GFX11-NEXT: v_subrev_nc_u32_e32 v29, 64, v27 ; GFX11-NEXT: v_lshrrev_b64 v[23:24], v28, v[8:9] +; GFX11-NEXT: v_lshrrev_b64 v[16:17], v17, v[0:1] ; GFX11-NEXT: v_cmp_gt_u32_e64 s0, 64, v28 -; GFX11-NEXT: v_or_b32_e32 v18, v16, v18 -; GFX11-NEXT: v_subrev_nc_u32_e32 v16, 64, v28 ; GFX11-NEXT: v_lshlrev_b64 v[25:26], v25, v[10:11] ; GFX11-NEXT: v_lshlrev_b64 v[0:1], v29, v[0:1] -; GFX11-NEXT: v_or_b32_e32 v19, v17, v19 ; GFX11-NEXT: v_cmp_eq_u32_e64 s1, 0, v28 -; GFX11-NEXT: v_lshrrev_b64 v[16:17], v16, v[10:11] +; GFX11-NEXT: v_or_b32_e32 v18, v16, v18 +; GFX11-NEXT: v_subrev_nc_u32_e32 v16, 64, v28 +; GFX11-NEXT: v_or_b32_e32 v19, v17, v19 ; GFX11-NEXT: v_or_b32_e32 v23, v23, v25 -; 
GFX11-NEXT: v_or_b32_e32 v24, v24, v26 -; GFX11-NEXT: v_dual_cndmask_b32 v18, v0, v18 :: v_dual_cndmask_b32 v19, v1, v19 -; GFX11-NEXT: v_cmp_eq_u32_e64 s2, 0, v27 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v18, v0, v18, vcc_lo +; GFX11-NEXT: v_lshrrev_b64 v[16:17], v16, v[10:11] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v19, v1, v19, vcc_lo +; GFX11-NEXT: v_or_b32_e32 v0, v24, v26 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v27 +; GFX11-NEXT: v_and_b32_e32 v24, 0x7f, v20 ; GFX11-NEXT: v_cndmask_b32_e64 v16, v16, v23, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v17, v17, v0, s0 ; GFX11-NEXT: v_lshrrev_b64 v[0:1], v28, v[10:11] -; GFX11-NEXT: v_cndmask_b32_e64 v10, v17, v24, s0 -; GFX11-NEXT: v_cndmask_b32_e32 v11, 0, v22, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v22, v19, v3, s2 +; GFX11-NEXT: v_dual_cndmask_b32 v2, v18, v2 :: v_dual_cndmask_b32 v23, v19, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_cndmask_b32_e64 v3, v16, v8, s1 -; GFX11-NEXT: v_cndmask_b32_e64 v2, v18, v2, s2 -; GFX11-NEXT: v_cndmask_b32_e64 v8, v10, v9, s1 -; GFX11-NEXT: v_cndmask_b32_e64 v9, 0, v0, s0 -; GFX11-NEXT: v_and_b32_e32 v23, 0x7f, v20 +; GFX11-NEXT: v_cndmask_b32_e64 v8, v17, v9, s1 +; GFX11-NEXT: v_sub_nc_u32_e32 v11, 64, v24 +; GFX11-NEXT: v_cndmask_b32_e64 v10, 0, v0, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v25, 0, v1, s0 ; GFX11-NEXT: v_or_b32_e32 v0, v21, v3 ; GFX11-NEXT: v_xor_b32_e32 v3, -1, v20 -; GFX11-NEXT: v_cndmask_b32_e64 v24, 0, v1, s0 -; GFX11-NEXT: v_or_b32_e32 v1, v11, v8 -; GFX11-NEXT: v_sub_nc_u32_e32 v10, 64, v23 -; GFX11-NEXT: v_or_b32_e32 v2, v2, v9 +; GFX11-NEXT: v_or_b32_e32 v1, v22, v8 ; GFX11-NEXT: v_lshrrev_b64 v[8:9], 1, v[12:13] -; GFX11-NEXT: v_lshlrev_b32_e32 v16, 31, v14 -; GFX11-NEXT: v_and_b32_e32 v25, 0x7f, v3 -; GFX11-NEXT: v_lshrrev_b64 v[10:11], v10, v[4:5] -; GFX11-NEXT: v_lshlrev_b64 v[12:13], v23, v[6:7] +; GFX11-NEXT: v_or_b32_e32 v2, v2, v10 +; GFX11-NEXT: v_lshrrev_b64 v[10:11], v11, v[4:5] +; GFX11-NEXT: v_lshlrev_b64 v[12:13], v24, v[6:7] +; GFX11-NEXT: v_lshlrev_b64 v[16:17], v24, v[4:5] +; GFX11-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v24 +; GFX11-NEXT: v_and_b32_e32 v22, 0x7f, v3 +; GFX11-NEXT: v_subrev_nc_u32_e32 v3, 64, v24 +; GFX11-NEXT: v_lshl_or_b32 v9, v14, 31, v9 ; GFX11-NEXT: v_lshrrev_b64 v[14:15], 1, v[14:15] -; GFX11-NEXT: v_or_b32_e32 v9, v9, v16 -; GFX11-NEXT: v_sub_nc_u32_e32 v20, 64, v25 -; GFX11-NEXT: v_subrev_nc_u32_e32 v3, 64, v23 -; GFX11-NEXT: v_lshlrev_b64 v[16:17], v23, v[4:5] ; GFX11-NEXT: v_or_b32_e32 v12, v10, v12 -; GFX11-NEXT: v_subrev_nc_u32_e32 v10, 64, v25 -; GFX11-NEXT: v_lshrrev_b64 v[18:19], v25, v[8:9] -; GFX11-NEXT: v_lshlrev_b64 v[20:21], v20, v[14:15] -; GFX11-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v23 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX11-NEXT: v_lshlrev_b64 v[3:4], v3, v[4:5] ; GFX11-NEXT: v_or_b32_e32 v5, v11, v13 -; GFX11-NEXT: v_lshrrev_b64 v[10:11], v10, v[14:15] -; GFX11-NEXT: v_cmp_gt_u32_e64 s0, 64, v25 ; GFX11-NEXT: v_cndmask_b32_e32 v13, 0, v16, vcc_lo +; GFX11-NEXT: v_sub_nc_u32_e32 v20, 64, v22 +; GFX11-NEXT: v_subrev_nc_u32_e32 v10, 64, v22 +; GFX11-NEXT: v_lshrrev_b64 v[18:19], v22, v[8:9] +; GFX11-NEXT: v_cmp_gt_u32_e64 s0, 64, v22 +; GFX11-NEXT: v_cndmask_b32_e32 v12, v3, v12, vcc_lo +; GFX11-NEXT: v_lshlrev_b64 v[20:21], v20, v[14:15] +; GFX11-NEXT: 
v_lshrrev_b64 v[10:11], v10, v[14:15] +; GFX11-NEXT: v_cndmask_b32_e32 v5, v4, v5, vcc_lo +; GFX11-NEXT: v_lshrrev_b64 v[3:4], v22, v[14:15] +; GFX11-NEXT: v_cmp_eq_u32_e64 s1, 0, v22 +; GFX11-NEXT: v_cmp_eq_u32_e64 s2, 0, v24 ; GFX11-NEXT: v_or_b32_e32 v16, v18, v20 ; GFX11-NEXT: v_or_b32_e32 v18, v19, v21 -; GFX11-NEXT: v_dual_cndmask_b32 v12, v3, v12 :: v_dual_cndmask_b32 v5, v4, v5 -; GFX11-NEXT: v_lshrrev_b64 v[3:4], v25, v[14:15] ; GFX11-NEXT: v_cndmask_b32_e32 v14, 0, v17, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e64 v6, v12, v6, s2 ; GFX11-NEXT: v_cndmask_b32_e64 v10, v10, v16, s0 -; GFX11-NEXT: v_cmp_eq_u32_e64 s1, 0, v25 -; GFX11-NEXT: v_cmp_eq_u32_e64 s2, 0, v23 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_cndmask_b32_e64 v11, v11, v18, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b32_e64 v6, v12, v6, s2 ; GFX11-NEXT: v_cndmask_b32_e64 v7, v5, v7, s2 ; GFX11-NEXT: v_cndmask_b32_e64 v5, v10, v8, s1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX11-NEXT: v_cndmask_b32_e64 v8, v11, v9, s1 ; GFX11-NEXT: v_cndmask_b32_e64 v9, 0, v3, s0 ; GFX11-NEXT: v_cndmask_b32_e64 v10, 0, v4, s0 -; GFX11-NEXT: v_or_b32_e32 v3, v22, v24 +; GFX11-NEXT: v_or_b32_e32 v3, v23, v25 ; GFX11-NEXT: v_or_b32_e32 v4, v13, v5 ; GFX11-NEXT: v_or_b32_e32 v5, v14, v8 ; GFX11-NEXT: v_or_b32_e32 v6, v6, v9 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll @@ -5231,8 +5231,7 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v4, v0 ; GFX9-NEXT: v_lshrrev_b64 v[0:1], 5, v[2:3] -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 27, v4 -; GFX9-NEXT: v_or_b32_e32 v1, v2, v1 +; GFX9-NEXT: v_lshl_or_b32 v1, v4, 27, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fshr_i64_5: @@ -5241,8 +5240,7 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_mov_b32_e32 v4, v0 ; GFX10-NEXT: v_lshrrev_b64 v[0:1], 5, v[2:3] -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 27, v4 -; GFX10-NEXT: v_or_b32_e32 v1, v2, v1 +; GFX10-NEXT: v_lshl_or_b32 v1, v4, 27, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_fshr_i64_5: @@ -5251,9 +5249,8 @@ ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: v_mov_b32_e32 v4, v0 ; GFX11-NEXT: v_lshrrev_b64 v[0:1], 5, v[2:3] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 27, v4 -; GFX11-NEXT: v_or_b32_e32 v1, v2, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_lshl_or_b32 v1, v4, 27, v1 ; GFX11-NEXT: s_setpc_b64 s[30:31] %result = call i64 @llvm.fshr.i64(i64 %lhs, i64 %rhs, i64 5) ret i64 %result @@ -7134,12 +7131,12 @@ ; GFX9-LABEL: v_fshr_i128_65: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 31, v0 -; GFX9-NEXT: v_lshlrev_b32_e32 v5, 31, v2 +; GFX9-NEXT: v_mov_b32_e32 v8, v2 ; GFX9-NEXT: v_lshrrev_b64 v[2:3], 1, v[0:1] -; GFX9-NEXT: v_lshrrev_b64 v[0:1], 1, v[6:7] -; GFX9-NEXT: v_or_b32_e32 v3, v5, v3 -; GFX9-NEXT: v_or_b32_e32 v1, v4, v1 +; GFX9-NEXT: v_lshrrev_b64 v[4:5], 1, v[6:7] +; GFX9-NEXT: v_lshl_or_b32 v3, v8, 31, v3 +; GFX9-NEXT: v_lshl_or_b32 v1, v0, 31, v5 +; GFX9-NEXT: v_mov_b32_e32 v0, v4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: 
v_fshr_i128_65: @@ -7149,10 +7146,8 @@ ; GFX10-NEXT: v_mov_b32_e32 v8, v2 ; GFX10-NEXT: v_lshrrev_b64 v[4:5], 1, v[6:7] ; GFX10-NEXT: v_lshrrev_b64 v[2:3], 1, v[0:1] -; GFX10-NEXT: v_lshlrev_b32_e32 v9, 31, v0 -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 31, v8 -; GFX10-NEXT: v_or_b32_e32 v1, v9, v5 -; GFX10-NEXT: v_or_b32_e32 v3, v0, v3 +; GFX10-NEXT: v_lshl_or_b32 v1, v0, 31, v5 +; GFX10-NEXT: v_lshl_or_b32 v3, v8, 31, v3 ; GFX10-NEXT: v_mov_b32_e32 v0, v4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -7160,14 +7155,13 @@ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: v_dual_mov_b32 v8, v2 :: v_dual_lshlrev_b32 v9, 31, v0 +; GFX11-NEXT: v_mov_b32_e32 v8, v2 ; GFX11-NEXT: v_lshrrev_b64 v[4:5], 1, v[6:7] ; GFX11-NEXT: v_lshrrev_b64 v[2:3], 1, v[0:1] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 31, v8 -; GFX11-NEXT: v_or_b32_e32 v1, v9, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_or_b32_e32 v3, v0, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_lshl_or_b32 v1, v0, 31, v5 +; GFX11-NEXT: v_lshl_or_b32 v3, v8, 31, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX11-NEXT: v_mov_b32_e32 v0, v4 ; GFX11-NEXT: s_setpc_b64 s[30:31] %result = call i128 @llvm.fshr.i128(i128 %lhs, i128 %rhs, i128 65) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll @@ -912,20 +912,20 @@ ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX9-NEXT: s_lshr_b32 s2, s4, 1 ; GFX9-NEXT: s_cmp_eq_u32 s2, 1 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s2, 0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_cselect_b32 s3, s1, s0 ; GFX9-NEXT: s_and_b32 s4, s4, 1 ; GFX9-NEXT: s_lshl_b32 s4, s4, 4 -; GFX9-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX9-NEXT: s_andn2_b32 s3, s3, s5 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_lshl_or_b32 v4, v0, s4, v1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX9-NEXT: s_lshl_b32 s4, 0xffff, s4 +; GFX9-NEXT: s_not_b32 s4, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: v_and_or_b32 v4, s3, v1, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc -; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s2, 1 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc @@ -945,9 +945,11 @@ ; GFX8-NEXT: s_lshl_b32 s4, s4, 4 ; GFX8-NEXT: v_mov_b32_e32 v1, s4 ; GFX8-NEXT: s_lshl_b32 s4, 0xffff, s4 +; GFX8-NEXT: s_not_b32 s4, s4 ; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX8-NEXT: s_andn2_b32 s3, s3, s4 -; GFX8-NEXT: v_or_b32_e32 v4, s3, v0 +; GFX8-NEXT: v_mov_b32_e32 v1, s4 +; GFX8-NEXT: v_and_b32_e32 v1, s3, v1 +; GFX8-NEXT: v_or_b32_e32 v4, v1, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc @@ -970,8 +972,10 @@ ; GFX7-NEXT: s_lshl_b32 s4, s4, 4 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, s4, v0 ; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s4 -; GFX7-NEXT: s_andn2_b32 s3, s3, s4 -; GFX7-NEXT: v_or_b32_e32 v4, s3, v0 
+; GFX7-NEXT: s_not_b32 s4, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: v_and_b32_e32 v1, s3, v1 +; GFX7-NEXT: v_or_b32_e32 v4, v1, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc @@ -986,18 +990,18 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX10-NEXT: s_lshr_b32 s2, s4, 1 -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v0 ; GFX10-NEXT: s_cmp_eq_u32 s2, 1 ; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_cselect_b32 s3, s1, s0 ; GFX10-NEXT: s_and_b32 s4, s4, 1 -; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: s_lshl_b32 s4, s4, 4 +; GFX10-NEXT: v_lshlrev_b32_sdwa v2, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX10-NEXT: s_lshl_b32 s4, 0xffff, s4 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: s_not_b32 s4, s4 ; GFX10-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX10-NEXT: s_andn2_b32 s3, s3, s5 -; GFX10-NEXT: v_lshl_or_b32 v4, v2, s4, s3 +; GFX10-NEXT: v_and_or_b32 v4, s3, s4, v2 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: v_mov_b32_e32 v3, 0 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc_lo @@ -1010,21 +1014,23 @@ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: s_lshr_b32 s2, s4, 1 -; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v0 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-NEXT: s_cmp_eq_u32 s2, 1 ; GFX11-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_cselect_b32 s3, s1, s0 ; GFX11-NEXT: s_and_b32 s4, s4, 1 -; GFX11-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-NEXT: s_lshl_b32 s4, s4, 4 -; GFX11-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-NEXT: s_lshl_b32 s5, 0xffff, s4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 s3, s3, s5 -; GFX11-NEXT: v_lshl_or_b32 v4, v2, s4, s3 +; GFX11-NEXT: s_lshl_b32 s4, s4, 4 +; GFX11-NEXT: v_lshlrev_b32_e32 v2, s4, v0 +; GFX11-NEXT: s_lshl_b32 s4, 0xffff, s4 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_not_b32 s4, s4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_and_or_b32 v4, s3, s4, v2 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_cndmask_b32 v0, v0, v4 ; GFX11-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 1 ; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo @@ -2024,8 +2030,8 @@ ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 ; GFX9-NEXT: s_lshr_b32 s5, s4, 1 ; GFX9-NEXT: s_cmp_eq_u32 s5, 1 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s5, 0 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_cselect_b32 s6, s1, s0 ; GFX9-NEXT: s_cmp_eq_u32 s5, 2 @@ -2034,10 +2040,11 @@ ; GFX9-NEXT: s_cselect_b32 s6, s3, s6 ; GFX9-NEXT: s_and_b32 s4, s4, 1 ; GFX9-NEXT: s_lshl_b32 s4, s4, 4 -; GFX9-NEXT: s_lshl_b32 s7, 0xffff, s4 -; GFX9-NEXT: s_andn2_b32 s6, s6, s7 -; GFX9-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-NEXT: v_lshl_or_b32 v6, v0, s4, v1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX9-NEXT: s_lshl_b32 s4, 0xffff, s4 +; GFX9-NEXT: s_not_b32 s4, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: v_and_or_b32 v6, s6, v1, v0 
; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc @@ -2047,7 +2054,6 @@ ; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s5, 2 ; GFX9-NEXT: v_mov_b32_e32 v3, s3 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc -; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s5, 3 ; GFX9-NEXT: v_mov_b32_e32 v5, 0 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc @@ -2071,9 +2077,11 @@ ; GFX8-NEXT: s_lshl_b32 s4, s4, 4 ; GFX8-NEXT: v_mov_b32_e32 v1, s4 ; GFX8-NEXT: s_lshl_b32 s4, 0xffff, s4 +; GFX8-NEXT: s_not_b32 s4, s4 ; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX8-NEXT: s_andn2_b32 s4, s6, s4 -; GFX8-NEXT: v_or_b32_e32 v6, s4, v0 +; GFX8-NEXT: v_mov_b32_e32 v1, s4 +; GFX8-NEXT: v_and_b32_e32 v1, s6, v1 +; GFX8-NEXT: v_or_b32_e32 v6, v1, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc @@ -2106,8 +2114,10 @@ ; GFX7-NEXT: s_lshl_b32 s4, s4, 4 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, s4, v0 ; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s4 -; GFX7-NEXT: s_andn2_b32 s4, s6, s4 -; GFX7-NEXT: v_or_b32_e32 v4, s4, v0 +; GFX7-NEXT: s_not_b32 s4, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: v_and_b32_e32 v1, s6, v1 +; GFX7-NEXT: v_or_b32_e32 v4, v1, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc @@ -2129,24 +2139,24 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 ; GFX10-NEXT: s_lshr_b32 s5, s4, 1 -; GFX10-NEXT: v_and_b32_e32 v4, 0xffff, v0 ; GFX10-NEXT: s_cmp_eq_u32 s5, 1 ; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s5, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_cselect_b32 s6, s1, s0 ; GFX10-NEXT: s_cmp_eq_u32 s5, 2 -; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: s_cselect_b32 s6, s2, s6 ; GFX10-NEXT: s_cmp_eq_u32 s5, 3 -; GFX10-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-NEXT: s_cselect_b32 s6, s3, s6 ; GFX10-NEXT: s_and_b32 s4, s4, 1 -; GFX10-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-NEXT: s_lshl_b32 s4, s4, 4 +; GFX10-NEXT: v_lshlrev_b32_sdwa v4, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX10-NEXT: s_lshl_b32 s4, 0xffff, s4 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: s_not_b32 s4, s4 +; GFX10-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-NEXT: v_and_or_b32 v6, s6, s4, v4 +; GFX10-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-NEXT: v_mov_b32_e32 v3, s3 -; GFX10-NEXT: s_lshl_b32 s7, 0xffff, s4 -; GFX10-NEXT: s_andn2_b32 s6, s6, s7 -; GFX10-NEXT: v_lshl_or_b32 v6, v4, s4, s6 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: v_mov_b32_e32 v5, 0 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc_lo @@ -2163,31 +2173,31 @@ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 ; GFX11-NEXT: s_lshr_b32 s5, s4, 1 -; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v0 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-NEXT: s_cmp_eq_u32 s5, 1 ; GFX11-NEXT: v_cmp_eq_u32_e64 vcc_lo, s5, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_cselect_b32 s6, s1, s0 ; GFX11-NEXT: s_cmp_eq_u32 s5, 2 -; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 ; GFX11-NEXT: s_cselect_b32 s6, s2, s6 ; GFX11-NEXT: s_cmp_eq_u32 s5, 3 -; GFX11-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-NEXT: s_cselect_b32 s6, s3, s6 ; GFX11-NEXT: s_and_b32 s4, s4, 1 -; GFX11-NEXT: v_mov_b32_e32 v2, s2 -; GFX11-NEXT: s_lshl_b32 s4, s4, 4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_lshl_b32 s7, 0xffff, s4 
-; GFX11-NEXT: s_and_not1_b32 s6, s6, s7 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshl_or_b32 v6, v4, s4, s6 -; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc_lo +; GFX11-NEXT: s_lshl_b32 s4, s4, 4 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, s4, v0 +; GFX11-NEXT: s_lshl_b32 s4, 0xffff, s4 +; GFX11-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-NEXT: s_not_b32 s4, s4 +; GFX11-NEXT: v_mov_b32_e32 v3, s3 +; GFX11-NEXT: v_and_or_b32 v6, s6, s4, v4 +; GFX11-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_dual_mov_b32 v5, 0 :: v_dual_cndmask_b32 v0, v0, v6 ; GFX11-NEXT: v_cmp_eq_u32_e64 vcc_lo, s5, 1 -; GFX11-NEXT: v_mov_b32_e32 v4, 0 -; GFX11-NEXT: v_mov_b32_e32 v5, 0 -; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc_lo +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_cndmask_b32 v1, v1, v6 ; GFX11-NEXT: v_cmp_eq_u32_e64 vcc_lo, s5, 2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc_lo ; GFX11-NEXT: v_cmp_eq_u32_e64 vcc_lo, s5, 3 ; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc_lo @@ -3378,8 +3388,8 @@ ; GFX9-NEXT: s_load_dwordx8 s[8:15], s[2:3], 0x0 ; GFX9-NEXT: s_lshr_b32 s2, s4, 1 ; GFX9-NEXT: s_cmp_eq_u32 s2, 1 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s2, 0 +; GFX9-NEXT: v_mov_b32_e32 v10, 16 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_cselect_b32 s0, s9, s8 ; GFX9-NEXT: s_cmp_eq_u32 s2, 2 @@ -3396,10 +3406,11 @@ ; GFX9-NEXT: s_cselect_b32 s0, s15, s0 ; GFX9-NEXT: s_and_b32 s1, s4, 1 ; GFX9-NEXT: s_lshl_b32 s1, s1, 4 -; GFX9-NEXT: s_lshl_b32 s3, 0xffff, s1 -; GFX9-NEXT: s_andn2_b32 s0, s0, s3 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: v_lshl_or_b32 v8, v0, s1, v1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v0, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX9-NEXT: s_lshl_b32 s1, 0xffff, s1 +; GFX9-NEXT: s_not_b32 s1, s1 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_and_or_b32 v8, s0, v1, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s8 ; GFX9-NEXT: v_mov_b32_e32 v1, s9 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc @@ -3425,7 +3436,6 @@ ; GFX9-NEXT: v_cndmask_b32_e32 v7, v7, v8, vcc ; GFX9-NEXT: v_mov_b32_e32 v8, 0 ; GFX9-NEXT: v_mov_b32_e32 v9, 0 -; GFX9-NEXT: v_mov_b32_e32 v10, 16 ; GFX9-NEXT: v_mov_b32_e32 v11, 0 ; GFX9-NEXT: global_store_dwordx4 v[8:9], v[0:3], off ; GFX9-NEXT: global_store_dwordx4 v[10:11], v[4:7], off @@ -3435,15 +3445,17 @@ ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx8 s[8:15], s[2:3], 0x0 ; GFX8-NEXT: s_and_b32 s1, s4, 1 -; GFX8-NEXT: s_lshr_b32 m0, s4, 1 ; GFX8-NEXT: s_lshl_b32 s1, s1, 4 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: s_lshl_b32 s1, 0xffff, s1 +; GFX8-NEXT: s_lshr_b32 m0, s4, 1 +; GFX8-NEXT: s_not_b32 s1, s1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_movrels_b32 s0, s8 -; GFX8-NEXT: s_lshl_b32 s1, 0xffff, s1 ; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX8-NEXT: s_andn2_b32 s0, s0, s1 -; GFX8-NEXT: v_or_b32_e32 v8, s0, v0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_and_b32_e32 v1, s0, v1 +; GFX8-NEXT: v_or_b32_e32 v8, v1, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s8 ; GFX8-NEXT: v_mov_b32_e32 v1, s9 ; GFX8-NEXT: v_mov_b32_e32 v2, s10 @@ -3465,15 +3477,17 @@ ; GFX7: ; %bb.0: ; GFX7-NEXT: s_load_dwordx8 s[8:15], s[2:3], 0x0 ; GFX7-NEXT: s_and_b32 s1, s4, 1 -; GFX7-NEXT: s_lshr_b32 m0, s4, 1 ; GFX7-NEXT: s_lshl_b32 s1, s1, 4 
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_movrels_b32 s0, s8 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, s1, v0 ; GFX7-NEXT: s_lshl_b32 s1, 0xffff, s1 -; GFX7-NEXT: s_andn2_b32 s0, s0, s1 -; GFX7-NEXT: v_or_b32_e32 v8, s0, v0 +; GFX7-NEXT: s_lshr_b32 m0, s4, 1 +; GFX7-NEXT: s_not_b32 s1, s1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_movrels_b32 s0, s8 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_and_b32_e32 v1, s0, v1 +; GFX7-NEXT: v_or_b32_e32 v8, v1, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s8 ; GFX7-NEXT: v_mov_b32_e32 v1, s9 ; GFX7-NEXT: v_mov_b32_e32 v2, s10 @@ -3497,16 +3511,16 @@ ; GFX10-NEXT: s_and_b32 s0, s4, 1 ; GFX10-NEXT: s_lshr_b32 m0, s4, 1 ; GFX10-NEXT: s_lshl_b32 s0, s0, 4 -; GFX10-NEXT: v_and_b32_e32 v8, 0xffff, v0 -; GFX10-NEXT: s_lshl_b32 s1, 0xffff, s0 ; GFX10-NEXT: v_mov_b32_e32 v10, 16 +; GFX10-NEXT: v_lshlrev_b32_sdwa v8, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX10-NEXT: s_lshl_b32 s0, 0xffff, s0 ; GFX10-NEXT: v_mov_b32_e32 v11, 0 +; GFX10-NEXT: s_not_b32 s0, s0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_movrels_b32 s2, s8 +; GFX10-NEXT: s_movrels_b32 s1, s8 ; GFX10-NEXT: v_mov_b32_e32 v0, s8 -; GFX10-NEXT: s_andn2_b32 s1, s2, s1 +; GFX10-NEXT: v_and_or_b32 v12, s1, s0, v8 ; GFX10-NEXT: v_mov_b32_e32 v1, s9 -; GFX10-NEXT: v_lshl_or_b32 v12, v8, s0, s1 ; GFX10-NEXT: v_mov_b32_e32 v2, s10 ; GFX10-NEXT: v_mov_b32_e32 v3, s11 ; GFX10-NEXT: v_mov_b32_e32 v4, s12 @@ -3523,21 +3537,22 @@ ; GFX11-LABEL: insertelement_s_v16i16_v_s: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_load_b256 s[8:15], s[2:3], 0x0 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-NEXT: s_and_b32 s0, s4, 1 ; GFX11-NEXT: s_lshr_b32 m0, s4, 1 ; GFX11-NEXT: s_lshl_b32 s0, s0, 4 -; GFX11-NEXT: v_and_b32_e32 v8, 0xffff, v0 -; GFX11-NEXT: s_lshl_b32 s1, 0xffff, s0 ; GFX11-NEXT: v_mov_b32_e32 v10, 16 +; GFX11-NEXT: v_lshlrev_b32_e32 v8, s0, v0 +; GFX11-NEXT: s_lshl_b32 s1, 0xffff, s0 ; GFX11-NEXT: v_mov_b32_e32 v11, 0 +; GFX11-NEXT: s_not_b32 s0, s1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_movrels_b32 s2, s8 -; GFX11-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v3, s11 -; GFX11-NEXT: s_and_not1_b32 s1, s2, s1 -; GFX11-NEXT: v_mov_b32_e32 v1, s9 -; GFX11-NEXT: v_lshl_or_b32 v12, v8, s0, s1 -; GFX11-NEXT: v_dual_mov_b32 v2, s10 :: v_dual_mov_b32 v5, s13 -; GFX11-NEXT: v_dual_mov_b32 v4, s12 :: v_dual_mov_b32 v7, s15 +; GFX11-NEXT: s_movrels_b32 s1, s8 +; GFX11-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v5, s13 +; GFX11-NEXT: v_mov_b32_e32 v3, s11 +; GFX11-NEXT: v_and_or_b32 v12, s1, s0, v8 +; GFX11-NEXT: v_dual_mov_b32 v1, s9 :: v_dual_mov_b32 v2, s10 +; GFX11-NEXT: v_dual_mov_b32 v7, s15 :: v_dual_mov_b32 v4, s12 ; GFX11-NEXT: v_mov_b32_e32 v6, s14 ; GFX11-NEXT: v_mov_b32_e32 v8, 0 ; GFX11-NEXT: v_mov_b32_e32 v9, 0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll @@ -1079,27 +1079,27 @@ ; GFX9-LABEL: insertelement_s_v4i8_v_s: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX9-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX9-NEXT: s_mov_b32 s1, 8 ; GFX9-NEXT: s_mov_b32 s2, 16 ; GFX9-NEXT: s_movk_i32 s5, 0xff ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_bfe_u32 s7, s0, 0x80008 ; GFX9-NEXT: s_lshr_b32 s3, s0, 24 -; GFX9-NEXT: s_and_b32 s6, s0, 0xff ; GFX9-NEXT: s_lshl_b32 s7, s7, 8 +; GFX9-NEXT: 
s_and_b32 s6, s0, 0xff +; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: s_bfe_u32 s0, s0, 0x80010 -; GFX9-NEXT: s_or_b32 s6, s6, s7 -; GFX9-NEXT: s_lshl_b32 s0, s0, 16 -; GFX9-NEXT: s_or_b32 s0, s6, s0 ; GFX9-NEXT: s_lshl_b32 s3, s3, 24 -; GFX9-NEXT: s_or_b32 s0, s0, s3 -; GFX9-NEXT: s_and_b32 s3, s4, 3 -; GFX9-NEXT: s_lshl_b32 s3, s3, 3 -; GFX9-NEXT: s_lshl_b32 s4, 0xff, s3 -; GFX9-NEXT: s_andn2_b32 s0, s0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: v_lshl_or_b32 v0, v0, s3, v1 +; GFX9-NEXT: v_or_b32_e32 v1, s6, v1 +; GFX9-NEXT: s_lshl_b32 s0, s0, 16 +; GFX9-NEXT: v_mov_b32_e32 v2, s3 +; GFX9-NEXT: v_or3_b32 v1, v1, s0, v2 +; GFX9-NEXT: s_and_b32 s0, s4, 3 +; GFX9-NEXT: s_lshl_b32 s0, s0, 3 +; GFX9-NEXT: v_lshlrev_b32_sdwa v0, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_lshl_b32 s0, 0xff, s0 +; GFX9-NEXT: s_not_b32 s0, s0 +; GFX9-NEXT: v_and_or_b32 v0, v1, s0, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 24, v0 ; GFX9-NEXT: v_lshlrev_b32_sdwa v2, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX9-NEXT: v_and_or_b32 v2, v0, s5, v2 @@ -1114,26 +1114,28 @@ ; GFX8-LABEL: insertelement_s_v4i8_v_s: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX8-NEXT: v_mov_b32_e32 v2, 8 ; GFX8-NEXT: v_mov_b32_e32 v3, 16 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_bfe_u32 s3, s0, 0x80008 +; GFX8-NEXT: s_lshl_b32 s3, s3, 8 ; GFX8-NEXT: s_lshr_b32 s1, s0, 24 ; GFX8-NEXT: s_and_b32 s2, s0, 0xff -; GFX8-NEXT: s_lshl_b32 s3, s3, 8 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: s_bfe_u32 s0, s0, 0x80010 -; GFX8-NEXT: s_or_b32 s2, s2, s3 +; GFX8-NEXT: v_or_b32_e32 v1, s2, v1 ; GFX8-NEXT: s_lshl_b32 s0, s0, 16 -; GFX8-NEXT: s_or_b32 s0, s2, s0 -; GFX8-NEXT: s_lshl_b32 s1, s1, 24 -; GFX8-NEXT: s_or_b32 s0, s0, s1 -; GFX8-NEXT: s_and_b32 s1, s4, 3 -; GFX8-NEXT: s_lshl_b32 s1, s1, 3 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: s_lshl_b32 s1, 0xff, s1 -; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: s_andn2_b32 s0, s0, s1 -; GFX8-NEXT: v_or_b32_e32 v0, s0, v0 +; GFX8-NEXT: v_or_b32_e32 v1, s0, v1 +; GFX8-NEXT: s_lshl_b32 s0, s1, 24 +; GFX8-NEXT: v_or_b32_e32 v1, s0, v1 +; GFX8-NEXT: s_and_b32 s0, s4, 3 +; GFX8-NEXT: s_lshl_b32 s0, s0, 3 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: s_lshl_b32 s0, 0xff, s0 +; GFX8-NEXT: s_not_b32 s0, s0 +; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_and_b32_e32 v1, s0, v1 +; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v2, 8 ; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 24, v0 ; GFX8-NEXT: v_or_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -1152,21 +1154,23 @@ ; GFX7-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_bfe_u32 s3, s0, 0x80008 +; GFX7-NEXT: s_lshl_b32 s3, s3, 8 ; GFX7-NEXT: s_lshr_b32 s1, s0, 24 ; GFX7-NEXT: s_and_b32 s2, s0, 0xff -; GFX7-NEXT: s_lshl_b32 s3, s3, 8 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 ; GFX7-NEXT: s_bfe_u32 s0, s0, 0x80010 -; GFX7-NEXT: s_or_b32 s2, s2, s3 +; GFX7-NEXT: v_or_b32_e32 v1, s2, v1 ; GFX7-NEXT: s_lshl_b32 s0, s0, 16 -; GFX7-NEXT: s_or_b32 s0, s2, s0 -; GFX7-NEXT: s_lshl_b32 s1, s1, 24 -; GFX7-NEXT: s_or_b32 s0, s0, s1 -; GFX7-NEXT: s_and_b32 s1, s4, 3 -; GFX7-NEXT: s_lshl_b32 s1, s1, 3 -; GFX7-NEXT: 
v_lshlrev_b32_e32 v0, s1, v0 -; GFX7-NEXT: s_lshl_b32 s1, 0xff, s1 -; GFX7-NEXT: s_andn2_b32 s0, s0, s1 -; GFX7-NEXT: v_or_b32_e32 v0, s0, v0 +; GFX7-NEXT: v_or_b32_e32 v1, s0, v1 +; GFX7-NEXT: s_lshl_b32 s0, s1, 24 +; GFX7-NEXT: v_or_b32_e32 v1, s0, v1 +; GFX7-NEXT: s_and_b32 s0, s4, 3 +; GFX7-NEXT: s_lshl_b32 s0, s0, 3 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, s0, v0 +; GFX7-NEXT: s_lshl_b32 s0, 0xff, s0 +; GFX7-NEXT: s_not_b32 s0, s0 +; GFX7-NEXT: v_and_b32_e32 v1, s0, v1 +; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX7-NEXT: v_bfe_u32 v3, v0, 8, 8 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 24, v0 ; GFX7-NEXT: v_and_b32_e32 v2, 0xff, v0 @@ -1187,22 +1191,21 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX10-NEXT: s_and_b32 s1, s4, 3 -; GFX10-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX10-NEXT: s_lshl_b32 s1, s1, 3 +; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_bfe_u32 s4, s0, 0x80008 -; GFX10-NEXT: s_lshr_b32 s2, s0, 24 +; GFX10-NEXT: s_bfe_u32 s2, s0, 0x80008 ; GFX10-NEXT: s_and_b32 s3, s0, 0xff -; GFX10-NEXT: s_bfe_u32 s0, s0, 0x80010 -; GFX10-NEXT: s_lshl_b32 s4, s4, 8 -; GFX10-NEXT: s_lshl_b32 s0, s0, 16 -; GFX10-NEXT: s_or_b32 s3, s3, s4 -; GFX10-NEXT: s_lshl_b32 s2, s2, 24 -; GFX10-NEXT: s_or_b32 s0, s3, s0 -; GFX10-NEXT: s_lshl_b32 s3, 0xff, s1 -; GFX10-NEXT: s_or_b32 s0, s0, s2 -; GFX10-NEXT: s_andn2_b32 s0, s0, s3 -; GFX10-NEXT: v_lshl_or_b32 v0, v0, s1, s0 +; GFX10-NEXT: s_lshl_b32 s2, s2, 8 +; GFX10-NEXT: s_bfe_u32 s4, s0, 0x80010 +; GFX10-NEXT: v_or_b32_e64 v1, s3, s2 +; GFX10-NEXT: s_lshr_b32 s0, s0, 24 +; GFX10-NEXT: s_lshl_b32 s2, s4, 16 +; GFX10-NEXT: s_lshl_b32 s0, s0, 24 +; GFX10-NEXT: v_or3_b32 v1, v1, s2, s0 +; GFX10-NEXT: s_lshl_b32 s0, 0xff, s1 +; GFX10-NEXT: s_not_b32 s0, s0 +; GFX10-NEXT: v_and_or_b32 v0, v1, s0, v0 ; GFX10-NEXT: s_mov_b32 s0, 8 ; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX10-NEXT: v_lshrrev_b32_e32 v2, 24, v0 @@ -1219,24 +1222,28 @@ ; GFX11-LABEL: insertelement_s_v4i8_v_s: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-NEXT: s_and_b32 s1, s4, 3 ; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-NEXT: s_and_b32 s1, s4, 3 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_lshl_b32 s1, s1, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v0, s1, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_bfe_u32 s4, s0, 0x80008 -; GFX11-NEXT: s_lshr_b32 s2, s0, 24 +; GFX11-NEXT: s_bfe_u32 s2, s0, 0x80008 ; GFX11-NEXT: s_and_b32 s3, s0, 0xff -; GFX11-NEXT: s_bfe_u32 s0, s0, 0x80010 -; GFX11-NEXT: s_lshl_b32 s4, s4, 8 -; GFX11-NEXT: s_lshl_b32 s0, s0, 16 -; GFX11-NEXT: s_or_b32 s3, s3, s4 -; GFX11-NEXT: s_lshl_b32 s2, s2, 24 -; GFX11-NEXT: s_or_b32 s0, s3, s0 -; GFX11-NEXT: s_lshl_b32 s3, 0xff, s1 -; GFX11-NEXT: s_or_b32 s0, s0, s2 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 s0, s0, s3 -; GFX11-NEXT: v_lshl_or_b32 v0, v0, s1, s0 +; GFX11-NEXT: s_lshl_b32 s2, s2, 8 +; GFX11-NEXT: s_bfe_u32 s4, s0, 0x80010 +; GFX11-NEXT: v_or_b32_e64 v1, s3, s2 +; GFX11-NEXT: s_lshr_b32 s0, s0, 24 +; GFX11-NEXT: s_lshl_b32 s2, s4, 16 +; GFX11-NEXT: s_lshl_b32 s0, s0, 24 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_or3_b32 v1, v1, s2, s0 +; GFX11-NEXT: s_lshl_b32 s0, 0xff, s1 +; 
GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_not_b32 s0, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_and_or_b32 v0, v1, s0, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_bfe_u32 v1, v0, 8, 8 ; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 8 @@ -1270,19 +1277,20 @@ ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_bfe_u32 s7, s0, 0x80008 ; GFX9-NEXT: s_lshr_b32 s3, s0, 24 -; GFX9-NEXT: s_and_b32 s6, s0, 0xff ; GFX9-NEXT: s_lshl_b32 s7, s7, 8 +; GFX9-NEXT: s_and_b32 s6, s0, 0xff +; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: s_bfe_u32 s0, s0, 0x80010 -; GFX9-NEXT: s_or_b32 s6, s6, s7 -; GFX9-NEXT: s_lshl_b32 s0, s0, 16 -; GFX9-NEXT: s_or_b32 s0, s6, s0 ; GFX9-NEXT: s_lshl_b32 s3, s3, 24 -; GFX9-NEXT: s_or_b32 s0, s0, s3 -; GFX9-NEXT: s_and_b32 s3, s4, 0xff -; GFX9-NEXT: v_lshlrev_b32_e64 v1, v0, s3 +; GFX9-NEXT: v_or_b32_e32 v1, s6, v1 +; GFX9-NEXT: s_lshl_b32 s0, s0, 16 +; GFX9-NEXT: v_mov_b32_e32 v2, s3 +; GFX9-NEXT: v_or3_b32 v1, v1, s0, v2 +; GFX9-NEXT: s_and_b32 s0, s4, 0xff +; GFX9-NEXT: v_lshlrev_b32_e64 v2, v0, s0 ; GFX9-NEXT: v_lshlrev_b32_e64 v0, v0, s5 ; GFX9-NEXT: v_xor_b32_e32 v0, -1, v0 -; GFX9-NEXT: v_and_or_b32 v0, s0, v0, v1 +; GFX9-NEXT: v_and_or_b32 v0, v1, v0, v2 ; GFX9-NEXT: s_mov_b32 s2, 16 ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 24, v0 ; GFX9-NEXT: v_lshlrev_b32_sdwa v2, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 @@ -1301,26 +1309,27 @@ ; GFX8-NEXT: v_and_b32_e32 v0, 3, v0 ; GFX8-NEXT: s_movk_i32 s1, 0xff ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; GFX8-NEXT: v_mov_b32_e32 v2, 8 +; GFX8-NEXT: v_mov_b32_e32 v3, 16 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_bfe_u32 s5, s0, 0x80008 +; GFX8-NEXT: s_lshl_b32 s5, s5, 8 ; GFX8-NEXT: s_lshr_b32 s2, s0, 24 ; GFX8-NEXT: s_and_b32 s3, s0, 0xff -; GFX8-NEXT: s_lshl_b32 s5, s5, 8 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: s_bfe_u32 s0, s0, 0x80010 -; GFX8-NEXT: s_or_b32 s3, s3, s5 +; GFX8-NEXT: v_or_b32_e32 v1, s3, v1 ; GFX8-NEXT: s_lshl_b32 s0, s0, 16 -; GFX8-NEXT: s_or_b32 s0, s3, s0 -; GFX8-NEXT: s_lshl_b32 s2, s2, 24 -; GFX8-NEXT: s_or_b32 s0, s0, s2 -; GFX8-NEXT: s_and_b32 s2, s4, 0xff -; GFX8-NEXT: v_lshlrev_b32_e64 v1, v0, s2 +; GFX8-NEXT: v_or_b32_e32 v1, s0, v1 +; GFX8-NEXT: s_lshl_b32 s0, s2, 24 +; GFX8-NEXT: v_or_b32_e32 v1, s0, v1 +; GFX8-NEXT: s_and_b32 s0, s4, 0xff +; GFX8-NEXT: v_lshlrev_b32_e64 v2, v0, s0 ; GFX8-NEXT: v_lshlrev_b32_e64 v0, v0, s1 ; GFX8-NEXT: v_xor_b32_e32 v0, -1, v0 -; GFX8-NEXT: v_and_b32_e32 v0, s0, v0 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX8-NEXT: v_and_b32_e32 v0, v1, v0 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX8-NEXT: v_mov_b32_e32 v2, 8 ; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX8-NEXT: v_mov_b32_e32 v3, 16 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 24, v0 ; GFX8-NEXT: v_or_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 @@ -1339,21 +1348,22 @@ ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_bfe_u32 s3, s0, 0x80008 +; GFX7-NEXT: s_lshl_b32 s3, s3, 8 ; GFX7-NEXT: s_lshr_b32 s1, s0, 24 ; GFX7-NEXT: s_and_b32 s2, s0, 0xff -; GFX7-NEXT: s_lshl_b32 s3, s3, 8 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 ; GFX7-NEXT: s_bfe_u32 s0, s0, 0x80010 -; GFX7-NEXT: s_or_b32 s2, s2, s3 +; GFX7-NEXT: v_or_b32_e32 
v1, s2, v1 ; GFX7-NEXT: s_lshl_b32 s0, s0, 16 -; GFX7-NEXT: s_or_b32 s0, s2, s0 -; GFX7-NEXT: s_lshl_b32 s1, s1, 24 -; GFX7-NEXT: s_or_b32 s0, s0, s1 -; GFX7-NEXT: s_and_b32 s1, s4, 0xff -; GFX7-NEXT: v_lshl_b32_e32 v1, s1, v0 +; GFX7-NEXT: v_or_b32_e32 v1, s0, v1 +; GFX7-NEXT: s_lshl_b32 s0, s1, 24 +; GFX7-NEXT: v_or_b32_e32 v1, s0, v1 +; GFX7-NEXT: s_and_b32 s0, s4, 0xff +; GFX7-NEXT: v_lshl_b32_e32 v2, s0, v0 ; GFX7-NEXT: v_lshl_b32_e32 v0, 0xff, v0 ; GFX7-NEXT: v_xor_b32_e32 v0, -1, v0 -; GFX7-NEXT: v_and_b32_e32 v0, s0, v0 -; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7-NEXT: v_and_b32_e32 v0, v1, v0 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX7-NEXT: v_bfe_u32 v3, v0, 8, 8 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 24, v0 ; GFX7-NEXT: v_and_b32_e32 v2, 0xff, v0 @@ -1376,22 +1386,21 @@ ; GFX10-NEXT: v_and_b32_e32 v0, 3, v0 ; GFX10-NEXT: s_and_b32 s1, s4, 0xff ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; GFX10-NEXT: v_lshlrev_b32_e64 v1, v0, 0xff -; GFX10-NEXT: v_lshlrev_b32_e64 v0, v0, s1 -; GFX10-NEXT: v_xor_b32_e32 v1, -1, v1 +; GFX10-NEXT: v_lshlrev_b32_e64 v1, v0, s1 +; GFX10-NEXT: v_lshlrev_b32_e64 v0, v0, 0xff +; GFX10-NEXT: v_xor_b32_e32 v0, -1, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_bfe_u32 s3, s0, 0x80008 -; GFX10-NEXT: s_lshr_b32 s1, s0, 24 +; GFX10-NEXT: s_bfe_u32 s1, s0, 0x80008 ; GFX10-NEXT: s_and_b32 s2, s0, 0xff -; GFX10-NEXT: s_bfe_u32 s0, s0, 0x80010 -; GFX10-NEXT: s_lshl_b32 s3, s3, 8 -; GFX10-NEXT: s_lshl_b32 s0, s0, 16 -; GFX10-NEXT: s_or_b32 s2, s2, s3 -; GFX10-NEXT: s_lshl_b32 s1, s1, 24 -; GFX10-NEXT: s_or_b32 s0, s2, s0 -; GFX10-NEXT: s_or_b32 s0, s0, s1 -; GFX10-NEXT: v_and_or_b32 v0, s0, v1, v0 +; GFX10-NEXT: s_lshl_b32 s1, s1, 8 +; GFX10-NEXT: s_bfe_u32 s3, s0, 0x80010 +; GFX10-NEXT: v_or_b32_e64 v2, s2, s1 +; GFX10-NEXT: s_lshr_b32 s0, s0, 24 +; GFX10-NEXT: s_lshl_b32 s1, s3, 16 +; GFX10-NEXT: s_lshl_b32 s0, s0, 24 +; GFX10-NEXT: v_or3_b32 v2, v2, s1, s0 ; GFX10-NEXT: s_mov_b32 s0, 8 +; GFX10-NEXT: v_and_or_b32 v0, v2, v0, v1 ; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX10-NEXT: v_lshrrev_b32_e32 v2, 24, v0 ; GFX10-NEXT: s_mov_b32 s0, 16 @@ -1411,35 +1420,35 @@ ; GFX11-NEXT: s_and_b32 s1, s4, 0xff ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; GFX11-NEXT: v_lshlrev_b32_e64 v1, v0, 0xff -; GFX11-NEXT: v_lshlrev_b32_e64 v0, v0, s1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_xor_b32_e32 v1, -1, v1 +; GFX11-NEXT: v_lshlrev_b32_e64 v1, v0, s1 +; GFX11-NEXT: v_lshlrev_b32_e64 v0, v0, 0xff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_xor_b32_e32 v0, -1, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_bfe_u32 s3, s0, 0x80008 -; GFX11-NEXT: s_lshr_b32 s1, s0, 24 +; GFX11-NEXT: s_bfe_u32 s1, s0, 0x80008 ; GFX11-NEXT: s_and_b32 s2, s0, 0xff -; GFX11-NEXT: s_bfe_u32 s0, s0, 0x80010 -; GFX11-NEXT: s_lshl_b32 s3, s3, 8 -; GFX11-NEXT: s_lshl_b32 s0, s0, 16 -; GFX11-NEXT: s_or_b32 s2, s2, s3 -; GFX11-NEXT: s_lshl_b32 s1, s1, 24 -; GFX11-NEXT: s_or_b32 s0, s2, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_or_b32 s0, s0, s1 -; GFX11-NEXT: v_and_or_b32 v0, s0, v1, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-NEXT: s_bfe_u32 s3, s0, 0x80010 +; GFX11-NEXT: v_or_b32_e64 v2, s2, s1 +; GFX11-NEXT: s_lshr_b32 s0, s0, 24 +; 
GFX11-NEXT: s_lshl_b32 s1, s3, 16 +; GFX11-NEXT: s_lshl_b32 s0, s0, 24 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_or3_b32 v2, v2, s1, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_or_b32 v0, v2, v0, v1 ; GFX11-NEXT: v_bfe_u32 v1, v0, 8, 8 ; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 8 ; GFX11-NEXT: v_lshrrev_b32_e32 v3, 24, v0 -; GFX11-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_and_or_b32 v4, v0, 0xff, v1 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX11-NEXT: v_or3_b32 v2, v4, v2, v3 ; GFX11-NEXT: global_store_b32 v[0:1], v2, off ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -1461,18 +1470,19 @@ ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_bfe_u32 s6, s0, 0x80008 ; GFX9-NEXT: s_lshr_b32 s3, s0, 24 -; GFX9-NEXT: s_and_b32 s5, s0, 0xff ; GFX9-NEXT: s_lshl_b32 s6, s6, 8 +; GFX9-NEXT: s_and_b32 s5, s0, 0xff +; GFX9-NEXT: v_mov_b32_e32 v2, s6 ; GFX9-NEXT: s_bfe_u32 s0, s0, 0x80010 -; GFX9-NEXT: s_or_b32 s5, s5, s6 -; GFX9-NEXT: s_lshl_b32 s0, s0, 16 -; GFX9-NEXT: s_or_b32 s0, s5, s0 ; GFX9-NEXT: s_lshl_b32 s3, s3, 24 +; GFX9-NEXT: v_or_b32_e32 v2, s5, v2 +; GFX9-NEXT: s_lshl_b32 s0, s0, 16 +; GFX9-NEXT: v_mov_b32_e32 v3, s3 ; GFX9-NEXT: v_lshlrev_b32_e64 v1, v1, s4 -; GFX9-NEXT: s_or_b32 s0, s0, s3 +; GFX9-NEXT: v_or3_b32 v2, v2, s0, v3 ; GFX9-NEXT: v_xor_b32_e32 v1, -1, v1 ; GFX9-NEXT: s_mov_b32 s1, 8 -; GFX9-NEXT: v_and_or_b32 v0, s0, v1, v0 +; GFX9-NEXT: v_and_or_b32 v0, v2, v1, v0 ; GFX9-NEXT: s_mov_b32 s2, 16 ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 24, v0 ; GFX9-NEXT: v_lshlrev_b32_sdwa v2, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 @@ -1494,18 +1504,19 @@ ; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_bfe_u32 s4, s0, 0x80008 +; GFX8-NEXT: s_lshl_b32 s4, s4, 8 ; GFX8-NEXT: s_lshr_b32 s2, s0, 24 ; GFX8-NEXT: s_and_b32 s3, s0, 0xff -; GFX8-NEXT: s_lshl_b32 s4, s4, 8 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: s_bfe_u32 s0, s0, 0x80010 -; GFX8-NEXT: s_or_b32 s3, s3, s4 +; GFX8-NEXT: v_or_b32_e32 v2, s3, v2 ; GFX8-NEXT: s_lshl_b32 s0, s0, 16 -; GFX8-NEXT: s_or_b32 s0, s3, s0 -; GFX8-NEXT: s_lshl_b32 s2, s2, 24 +; GFX8-NEXT: v_or_b32_e32 v2, s0, v2 +; GFX8-NEXT: s_lshl_b32 s0, s2, 24 ; GFX8-NEXT: v_lshlrev_b32_e64 v1, v1, s1 -; GFX8-NEXT: s_or_b32 s0, s0, s2 +; GFX8-NEXT: v_or_b32_e32 v2, s0, v2 ; GFX8-NEXT: v_xor_b32_e32 v1, -1, v1 -; GFX8-NEXT: v_and_b32_e32 v1, s0, v1 +; GFX8-NEXT: v_and_b32_e32 v1, v2, v1 ; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX8-NEXT: v_mov_b32_e32 v2, 8 ; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 @@ -1530,18 +1541,19 @@ ; GFX7-NEXT: v_lshlrev_b32_e32 v0, v1, v0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_bfe_u32 s3, s0, 0x80008 +; GFX7-NEXT: s_lshl_b32 s3, s3, 8 ; GFX7-NEXT: s_lshr_b32 s1, s0, 24 ; GFX7-NEXT: s_and_b32 s2, s0, 0xff -; GFX7-NEXT: s_lshl_b32 s3, s3, 8 +; GFX7-NEXT: 
v_mov_b32_e32 v2, s3 ; GFX7-NEXT: s_bfe_u32 s0, s0, 0x80010 -; GFX7-NEXT: s_or_b32 s2, s2, s3 +; GFX7-NEXT: v_or_b32_e32 v2, s2, v2 ; GFX7-NEXT: s_lshl_b32 s0, s0, 16 -; GFX7-NEXT: s_or_b32 s0, s2, s0 -; GFX7-NEXT: s_lshl_b32 s1, s1, 24 +; GFX7-NEXT: v_or_b32_e32 v2, s0, v2 +; GFX7-NEXT: s_lshl_b32 s0, s1, 24 ; GFX7-NEXT: v_lshl_b32_e32 v1, 0xff, v1 -; GFX7-NEXT: s_or_b32 s0, s0, s1 +; GFX7-NEXT: v_or_b32_e32 v2, s0, v2 ; GFX7-NEXT: v_xor_b32_e32 v1, -1, v1 -; GFX7-NEXT: v_and_b32_e32 v1, s0, v1 +; GFX7-NEXT: v_and_b32_e32 v1, v2, v1 ; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX7-NEXT: v_bfe_u32 v3, v0, 8, 8 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 24, v0 @@ -1564,22 +1576,21 @@ ; GFX10-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX10-NEXT: v_and_b32_e32 v1, 3, v1 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 3, v1 -; GFX10-NEXT: v_lshlrev_b32_e64 v2, v1, 0xff ; GFX10-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX10-NEXT: v_xor_b32_e32 v1, -1, v2 +; GFX10-NEXT: v_lshlrev_b32_e64 v1, v1, 0xff +; GFX10-NEXT: v_xor_b32_e32 v1, -1, v1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_bfe_u32 s3, s0, 0x80008 -; GFX10-NEXT: s_lshr_b32 s1, s0, 24 +; GFX10-NEXT: s_bfe_u32 s1, s0, 0x80008 ; GFX10-NEXT: s_and_b32 s2, s0, 0xff -; GFX10-NEXT: s_bfe_u32 s0, s0, 0x80010 -; GFX10-NEXT: s_lshl_b32 s3, s3, 8 -; GFX10-NEXT: s_lshl_b32 s0, s0, 16 -; GFX10-NEXT: s_or_b32 s2, s2, s3 -; GFX10-NEXT: s_lshl_b32 s1, s1, 24 -; GFX10-NEXT: s_or_b32 s0, s2, s0 -; GFX10-NEXT: s_or_b32 s0, s0, s1 -; GFX10-NEXT: v_and_or_b32 v0, s0, v1, v0 +; GFX10-NEXT: s_lshl_b32 s1, s1, 8 +; GFX10-NEXT: s_bfe_u32 s3, s0, 0x80010 +; GFX10-NEXT: v_or_b32_e64 v2, s2, s1 +; GFX10-NEXT: s_lshr_b32 s0, s0, 24 +; GFX10-NEXT: s_lshl_b32 s1, s3, 16 +; GFX10-NEXT: s_lshl_b32 s0, s0, 24 +; GFX10-NEXT: v_or3_b32 v2, v2, s1, s0 ; GFX10-NEXT: s_mov_b32 s0, 8 +; GFX10-NEXT: v_and_or_b32 v0, v2, v1, v0 ; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX10-NEXT: v_lshrrev_b32_e32 v2, 24, v0 ; GFX10-NEXT: s_mov_b32 s0, 16 @@ -1599,35 +1610,35 @@ ; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 3, v1 -; GFX11-NEXT: v_lshlrev_b32_e64 v2, v1, 0xff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, v1, v0 -; GFX11-NEXT: v_xor_b32_e32 v1, -1, v2 +; GFX11-NEXT: v_lshlrev_b32_e64 v1, v1, 0xff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_xor_b32_e32 v1, -1, v1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_bfe_u32 s3, s0, 0x80008 -; GFX11-NEXT: s_lshr_b32 s1, s0, 24 +; GFX11-NEXT: s_bfe_u32 s1, s0, 0x80008 ; GFX11-NEXT: s_and_b32 s2, s0, 0xff -; GFX11-NEXT: s_bfe_u32 s0, s0, 0x80010 -; GFX11-NEXT: s_lshl_b32 s3, s3, 8 -; GFX11-NEXT: s_lshl_b32 s0, s0, 16 -; GFX11-NEXT: s_or_b32 s2, s2, s3 -; GFX11-NEXT: s_lshl_b32 s1, s1, 24 -; GFX11-NEXT: s_or_b32 s0, s2, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_or_b32 s0, s0, s1 -; GFX11-NEXT: v_and_or_b32 v0, s0, v1, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-NEXT: s_bfe_u32 s3, s0, 0x80010 +; GFX11-NEXT: v_or_b32_e64 v2, s2, s1 +; GFX11-NEXT: s_lshr_b32 s0, s0, 24 +; GFX11-NEXT: s_lshl_b32 s1, s3, 16 +; GFX11-NEXT: s_lshl_b32 s0, s0, 24 +; GFX11-NEXT: 
s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_or3_b32 v2, v2, s1, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_or_b32 v0, v2, v1, v0 ; GFX11-NEXT: v_bfe_u32 v1, v0, 8, 8 ; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 8 ; GFX11-NEXT: v_lshrrev_b32_e32 v3, 24, v0 -; GFX11-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_and_or_b32 v4, v0, 0xff, v1 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX11-NEXT: v_or3_b32 v2, v4, v2, v3 ; GFX11-NEXT: global_store_b32 v[0:1], v2, off ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -2820,7 +2831,6 @@ ; GFX9-LABEL: insertelement_s_v8i8_v_s: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX9-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX9-NEXT: s_mov_b32 s2, 8 ; GFX9-NEXT: s_mov_b32 s3, 16 ; GFX9-NEXT: v_mov_b32_e32 v2, 0xff @@ -2850,10 +2860,11 @@ ; GFX9-NEXT: s_cselect_b32 s6, s1, s0 ; GFX9-NEXT: s_and_b32 s4, s4, 3 ; GFX9-NEXT: s_lshl_b32 s4, s4, 3 -; GFX9-NEXT: s_lshl_b32 s7, 0xff, s4 -; GFX9-NEXT: s_andn2_b32 s6, s6, s7 -; GFX9-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-NEXT: v_lshl_or_b32 v3, v0, s4, v1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_lshl_b32 s4, 0xff, s4 +; GFX9-NEXT: s_not_b32 s4, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: v_and_or_b32 v3, s6, v1, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s5, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 @@ -2910,9 +2921,11 @@ ; GFX8-NEXT: s_lshl_b32 s4, s4, 3 ; GFX8-NEXT: v_mov_b32_e32 v1, s4 ; GFX8-NEXT: s_lshl_b32 s4, 0xff, s4 +; GFX8-NEXT: s_not_b32 s4, s4 ; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: s_andn2_b32 s3, s3, s4 -; GFX8-NEXT: v_or_b32_e32 v2, s3, v0 +; GFX8-NEXT: v_mov_b32_e32 v1, s4 +; GFX8-NEXT: v_and_b32_e32 v1, s3, v1 +; GFX8-NEXT: v_or_b32_e32 v2, v1, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s2, 0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc @@ -2970,8 +2983,10 @@ ; GFX7-NEXT: s_lshl_b32 s4, s4, 3 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, s4, v0 ; GFX7-NEXT: s_lshl_b32 s4, 0xff, s4 -; GFX7-NEXT: s_andn2_b32 s3, s3, s4 -; GFX7-NEXT: v_or_b32_e32 v2, s3, v0 +; GFX7-NEXT: s_not_b32 s4, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: v_and_b32_e32 v1, s3, v1 +; GFX7-NEXT: v_or_b32_e32 v2, v1, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s2, 0 ; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc @@ -3008,7 +3023,6 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX10-NEXT: s_lshr_b32 s2, s4, 2 -; GFX10-NEXT: v_and_b32_e32 v2, 0xff, v0 ; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_bfe_u32 s7, s0, 0x80008 @@ -3032,16 +3046,17 @@ ; GFX10-NEXT: s_or_b32 s0, s0, s3 ; GFX10-NEXT: s_or_b32 s1, s1, s5 ; GFX10-NEXT: s_cmp_eq_u32 s2, 1 -; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: s_cselect_b32 s3, s1, s0 ; GFX10-NEXT: s_and_b32 s4, s4, 3 
-; GFX10-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-NEXT: s_lshl_b32 s4, s4, 3 +; GFX10-NEXT: v_lshlrev_b32_sdwa v2, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: s_lshl_b32 s4, 0xff, s4 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: s_not_b32 s4, s4 +; GFX10-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-NEXT: v_and_or_b32 v2, s3, s4, v2 ; GFX10-NEXT: s_mov_b32 s0, 8 -; GFX10-NEXT: s_lshl_b32 s5, 0xff, s4 ; GFX10-NEXT: s_mov_b32 s1, 16 -; GFX10-NEXT: s_andn2_b32 s3, s3, s5 -; GFX10-NEXT: v_lshl_or_b32 v2, v2, s4, s3 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 1 ; GFX10-NEXT: v_lshlrev_b32_sdwa v3, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 @@ -3090,23 +3105,27 @@ ; GFX11-NEXT: s_or_b32 s0, s0, s3 ; GFX11-NEXT: s_or_b32 s1, s1, s5 ; GFX11-NEXT: s_cmp_eq_u32 s2, 1 -; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v0 +; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX11-NEXT: s_cselect_b32 s3, s1, s0 ; GFX11-NEXT: s_and_b32 s4, s4, 3 -; GFX11-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_lshl_b32 s4, s4, 3 -; GFX11-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-NEXT: s_lshl_b32 s5, 0xff, s4 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 s3, s3, s5 -; GFX11-NEXT: v_lshl_or_b32 v2, v2, s4, s3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v2, s4, v0 +; GFX11-NEXT: s_lshl_b32 s4, 0xff, s4 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_not_b32 s4, s4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_and_or_b32 v2, s3, s4, v2 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo ; GFX11-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 1 ; GFX11-NEXT: v_bfe_u32 v3, v0, 8, 8 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_bfe_u32 v5, v1, 8, 8 ; GFX11-NEXT: v_lshrrev_b32_e32 v2, 24, v0 ; GFX11-NEXT: v_bfe_u32 v4, v0, 16, 8 @@ -5700,7 +5719,6 @@ ; GFX9-LABEL: insertelement_s_v16i8_v_s: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 -; GFX9-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX9-NEXT: s_mov_b32 s5, 8 ; GFX9-NEXT: s_mov_b32 s6, 16 ; GFX9-NEXT: v_mov_b32_e32 v4, 0xff @@ -5754,10 +5772,11 @@ ; GFX9-NEXT: s_cselect_b32 s8, s3, s8 ; GFX9-NEXT: s_and_b32 s4, s4, 3 ; GFX9-NEXT: s_lshl_b32 s4, s4, 3 -; GFX9-NEXT: s_lshl_b32 s9, 0xff, s4 -; GFX9-NEXT: s_andn2_b32 s8, s8, s9 -; GFX9-NEXT: v_mov_b32_e32 v1, s8 -; GFX9-NEXT: v_lshl_or_b32 v5, v0, s4, v1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_lshl_b32 s4, 0xff, s4 +; GFX9-NEXT: s_not_b32 s4, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: v_and_or_b32 v5, s8, v1, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s7, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 @@ -5858,9 +5877,11 @@ ; GFX8-NEXT: s_lshl_b32 s4, s4, 3 ; GFX8-NEXT: v_mov_b32_e32 v1, s4 ; GFX8-NEXT: s_lshl_b32 s4, 0xff, s4 +; GFX8-NEXT: s_not_b32 s4, s4 ; GFX8-NEXT: 
v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: s_andn2_b32 s4, s6, s4 -; GFX8-NEXT: v_or_b32_e32 v4, s4, v0 +; GFX8-NEXT: v_mov_b32_e32 v1, s4 +; GFX8-NEXT: v_and_b32_e32 v1, s6, v1 +; GFX8-NEXT: v_or_b32_e32 v4, v1, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s5, 0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 @@ -5964,8 +5985,10 @@ ; GFX7-NEXT: s_lshl_b32 s4, s4, 3 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, s4, v0 ; GFX7-NEXT: s_lshl_b32 s4, 0xff, s4 -; GFX7-NEXT: s_andn2_b32 s4, s6, s4 -; GFX7-NEXT: v_or_b32_e32 v4, s4, v0 +; GFX7-NEXT: s_not_b32 s4, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: v_and_b32_e32 v1, s6, v1 +; GFX7-NEXT: v_or_b32_e32 v4, v1, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s5, 0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 @@ -6027,7 +6050,6 @@ ; GFX10-LABEL: insertelement_s_v16i8_v_s: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 -; GFX10-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX10-NEXT: v_mov_b32_e32 v10, 8 ; GFX10-NEXT: v_mov_b32_e32 v12, 16 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -6081,9 +6103,10 @@ ; GFX10-NEXT: s_cselect_b32 s5, s3, s5 ; GFX10-NEXT: s_and_b32 s4, s4, 3 ; GFX10-NEXT: s_lshl_b32 s4, s4, 3 -; GFX10-NEXT: s_lshl_b32 s7, 0xff, s4 -; GFX10-NEXT: s_andn2_b32 s5, s5, s7 -; GFX10-NEXT: v_lshl_or_b32 v4, v0, s4, s5 +; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: s_lshl_b32 s4, 0xff, s4 +; GFX10-NEXT: s_not_b32 s4, s4 +; GFX10-NEXT: v_and_or_b32 v4, s5, s4, v0 ; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-NEXT: v_mov_b32_e32 v2, s2 @@ -6182,27 +6205,28 @@ ; GFX11-NEXT: s_and_b32 s4, s4, 3 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_lshl_b32 s4, s4, 3 -; GFX11-NEXT: s_lshl_b32 s7, 0xff, s4 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 s5, s5, s7 -; GFX11-NEXT: v_lshl_or_b32 v4, v0, s4, s5 -; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_cndmask_b32 v0, v0, v4 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, s4, v0 +; GFX11-NEXT: s_lshl_b32 s4, 0xff, s4 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_not_b32 s4, s4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_and_or_b32 v4, s5, s4, v0 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v3, s3 :: v_dual_mov_b32 v2, s2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc_lo ; GFX11-NEXT: v_cmp_eq_u32_e64 vcc_lo, s6, 1 -; GFX11-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-NEXT: v_bfe_u32 v5, v0, 8, 8 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo ; GFX11-NEXT: v_cmp_eq_u32_e64 vcc_lo, s6, 2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_lshlrev_b32_e32 v5, 8, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_bfe_u32 v7, v1, 8, 8 ; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc_lo ; GFX11-NEXT: v_cmp_eq_u32_e64 vcc_lo, s6, 3 -; GFX11-NEXT: 
v_lshrrev_b32_e32 v6, 24, v1 ; GFX11-NEXT: v_bfe_u32 v8, v0, 16, 8 -; GFX11-NEXT: v_dual_cndmask_b32 v3, v3, v4 :: v_dual_lshlrev_b32 v6, 24, v6 +; GFX11-NEXT: v_lshrrev_b32_e32 v6, 24, v1 +; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo ; GFX11-NEXT: v_lshrrev_b32_e32 v4, 24, v0 ; GFX11-NEXT: v_and_or_b32 v0, 0xff, v0, v5 ; GFX11-NEXT: v_lshlrev_b32_e32 v5, 8, v7 @@ -6219,6 +6243,7 @@ ; GFX11-NEXT: v_lshlrev_b32_e32 v5, 8, v5 ; GFX11-NEXT: v_lshlrev_b32_e32 v4, 24, v4 ; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX11-NEXT: v_lshlrev_b32_e32 v6, 24, v6 ; GFX11-NEXT: v_and_or_b32 v2, 0xff, v2, v11 ; GFX11-NEXT: v_lshlrev_b32_e32 v11, 16, v12 ; GFX11-NEXT: v_lshlrev_b32_e32 v9, 24, v9 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankcombiner-clamp-minmax-const.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankcombiner-clamp-minmax-const.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankcombiner-clamp-minmax-const.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankcombiner-clamp-minmax-const.mir @@ -483,8 +483,8 @@ ; CHECK-NEXT: [[FMUL:%[0-9]+]]:vgpr(s32) = G_FMUL [[COPY]], [[COPY1]] ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_FCONSTANT float 0.000000e+00 ; CHECK-NEXT: [[FCANONICALIZE:%[0-9]+]]:vgpr(s32) = G_FCANONICALIZE [[FMUL]] - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[C1]](s32) ; CHECK-NEXT: [[C2:%[0-9]+]]:sgpr(s32) = G_FCONSTANT float 1.000000e+00 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[C1]](s32) ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[C2]](s32) ; CHECK-NEXT: [[AMDGPU_FMED3_:%[0-9]+]]:vgpr(s32) = G_AMDGPU_FMED3 [[FCANONICALIZE]], [[COPY2]], [[COPY3]] ; CHECK-NEXT: $vgpr0 = COPY [[AMDGPU_FMED3_]](s32) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankcombiner-fmed3-minmax-const.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankcombiner-fmed3-minmax-const.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankcombiner-fmed3-minmax-const.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankcombiner-fmed3-minmax-const.mir @@ -18,8 +18,8 @@ ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_FCONSTANT float 2.000000e+00 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_FCONSTANT float 4.000000e+00 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[C1]](s32) ; CHECK-NEXT: [[AMDGPU_FMED3_:%[0-9]+]]:vgpr(s32) = nnan G_AMDGPU_FMED3 [[COPY]], [[COPY1]], [[COPY2]] ; CHECK-NEXT: $vgpr0 = COPY [[AMDGPU_FMED3_]](s32) @@ -50,8 +50,8 @@ ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_FCONSTANT float 2.000000e+00 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_FCONSTANT float 4.000000e+00 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[C1]](s32) ; CHECK-NEXT: [[AMDGPU_FMED3_:%[0-9]+]]:vgpr(s32) = nnan G_AMDGPU_FMED3 [[COPY]], [[COPY1]], [[COPY2]] ; CHECK-NEXT: $vgpr0 = COPY [[AMDGPU_FMED3_]](s32) @@ -84,8 +84,8 @@ ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:vgpr(s16) = G_TRUNC [[COPY]](s32) ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s16) = G_FCONSTANT half 0xH4000 ; CHECK-NEXT: [[FCANONICALIZE:%[0-9]+]]:vgpr(s16) = G_FCANONICALIZE [[TRUNC]] - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s16) = COPY [[C]](s16) ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s16) = G_FCONSTANT half 0xH4400 + ; CHECK-NEXT: 
[[COPY1:%[0-9]+]]:vgpr(s16) = COPY [[C]](s16) ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s16) = COPY [[C1]](s16) ; CHECK-NEXT: [[AMDGPU_FMED3_:%[0-9]+]]:vgpr(s16) = G_AMDGPU_FMED3 [[FCANONICALIZE]], [[COPY1]], [[COPY2]] ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:vgpr(s32) = G_ANYEXT [[AMDGPU_FMED3_]](s16) @@ -122,8 +122,8 @@ ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:vgpr(s16) = G_TRUNC [[COPY]](s32) ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s16) = G_FCONSTANT half 0xH4000 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s16) = COPY [[C]](s16) ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s16) = G_FCONSTANT half 0xH4400 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s16) = COPY [[C]](s16) ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s16) = COPY [[C1]](s16) ; CHECK-NEXT: [[AMDGPU_FMED3_:%[0-9]+]]:vgpr(s16) = nnan G_AMDGPU_FMED3 [[TRUNC]], [[COPY1]], [[COPY2]] ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:vgpr(s32) = G_ANYEXT [[AMDGPU_FMED3_]](s16) @@ -157,10 +157,10 @@ ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_FCONSTANT float 4.000000e+00 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_FCONSTANT float 2.000000e+00 - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[C1]](s32) - ; CHECK-NEXT: [[AMDGPU_FMED3_:%[0-9]+]]:vgpr(s32) = nnan G_AMDGPU_FMED3 [[COPY]], [[COPY2]], [[COPY1]] + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY [[C1]](s32) + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) + ; CHECK-NEXT: [[AMDGPU_FMED3_:%[0-9]+]]:vgpr(s32) = nnan G_AMDGPU_FMED3 [[COPY]], [[COPY1]], [[COPY2]] ; CHECK-NEXT: $vgpr0 = COPY [[AMDGPU_FMED3_]](s32) %0:vgpr(s32) = COPY $vgpr0 %2:sgpr(s32) = G_FCONSTANT float 4.000000e+00 @@ -189,10 +189,10 @@ ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_FCONSTANT float 4.000000e+00 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_FCONSTANT float 2.000000e+00 - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[C1]](s32) - ; CHECK-NEXT: [[AMDGPU_FMED3_:%[0-9]+]]:vgpr(s32) = nnan G_AMDGPU_FMED3 [[COPY]], [[COPY2]], [[COPY1]] + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY [[C1]](s32) + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) + ; CHECK-NEXT: [[AMDGPU_FMED3_:%[0-9]+]]:vgpr(s32) = nnan G_AMDGPU_FMED3 [[COPY]], [[COPY1]], [[COPY2]] ; CHECK-NEXT: $vgpr0 = COPY [[AMDGPU_FMED3_]](s32) %0:vgpr(s32) = COPY $vgpr0 %2:sgpr(s32) = G_FCONSTANT float 4.000000e+00 @@ -222,10 +222,10 @@ ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:vgpr(s16) = G_TRUNC [[COPY]](s32) ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s16) = G_FCONSTANT half 0xH4400 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s16) = COPY [[C]](s16) ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s16) = G_FCONSTANT half 0xH4000 - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s16) = COPY [[C1]](s16) - ; CHECK-NEXT: [[AMDGPU_FMED3_:%[0-9]+]]:vgpr(s16) = nnan G_AMDGPU_FMED3 [[TRUNC]], [[COPY2]], [[COPY1]] + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s16) = COPY [[C1]](s16) + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s16) = COPY [[C]](s16) + ; CHECK-NEXT: [[AMDGPU_FMED3_:%[0-9]+]]:vgpr(s16) = nnan G_AMDGPU_FMED3 [[TRUNC]], [[COPY1]], [[COPY2]] ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:vgpr(s32) = G_ANYEXT [[AMDGPU_FMED3_]](s16) ; CHECK-NEXT: $vgpr0 = COPY [[ANYEXT]](s32) %2:vgpr(s32) = COPY $vgpr0 @@ -258,10 +258,10 @@ ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 ; CHECK-NEXT: 
[[TRUNC:%[0-9]+]]:vgpr(s16) = G_TRUNC [[COPY]](s32) ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s16) = G_FCONSTANT half 0xH4400 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s16) = COPY [[C]](s16) ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s16) = G_FCONSTANT half 0xH4000 - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s16) = COPY [[C1]](s16) - ; CHECK-NEXT: [[AMDGPU_FMED3_:%[0-9]+]]:vgpr(s16) = nnan G_AMDGPU_FMED3 [[TRUNC]], [[COPY2]], [[COPY1]] + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s16) = COPY [[C1]](s16) + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s16) = COPY [[C]](s16) + ; CHECK-NEXT: [[AMDGPU_FMED3_:%[0-9]+]]:vgpr(s16) = nnan G_AMDGPU_FMED3 [[TRUNC]], [[COPY1]], [[COPY2]] ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:vgpr(s32) = G_ANYEXT [[AMDGPU_FMED3_]](s16) ; CHECK-NEXT: $vgpr0 = COPY [[ANYEXT]](s32) %2:vgpr(s32) = COPY $vgpr0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankcombiner-smed3.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankcombiner-smed3.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankcombiner-smed3.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankcombiner-smed3.mir @@ -16,8 +16,8 @@ ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 -12 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 17 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[C1]](s32) ; CHECK-NEXT: [[AMDGPU_SMED3_:%[0-9]+]]:vgpr(s32) = G_AMDGPU_SMED3 [[COPY]], [[COPY1]], [[COPY2]] ; CHECK-NEXT: $vgpr0 = COPY [[AMDGPU_SMED3_]](s32) @@ -47,8 +47,8 @@ ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 -12 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 17 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[C1]](s32) ; CHECK-NEXT: [[AMDGPU_SMED3_:%[0-9]+]]:vgpr(s32) = G_AMDGPU_SMED3 [[COPY]], [[COPY1]], [[COPY2]] ; CHECK-NEXT: $vgpr0 = COPY [[AMDGPU_SMED3_]](s32) @@ -78,8 +78,8 @@ ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 -12 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 17 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[C1]](s32) ; CHECK-NEXT: [[AMDGPU_SMED3_:%[0-9]+]]:vgpr(s32) = G_AMDGPU_SMED3 [[COPY]], [[COPY1]], [[COPY2]] ; CHECK-NEXT: $vgpr0 = COPY [[AMDGPU_SMED3_]](s32) @@ -109,8 +109,8 @@ ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 -12 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 17 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[C1]](s32) ; CHECK-NEXT: [[AMDGPU_SMED3_:%[0-9]+]]:vgpr(s32) = G_AMDGPU_SMED3 [[COPY]], [[COPY1]], [[COPY2]] ; CHECK-NEXT: $vgpr0 = COPY [[AMDGPU_SMED3_]](s32) @@ -140,10 +140,10 @@ ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 17 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 -12 - ; CHECK-NEXT: 
[[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[C1]](s32) - ; CHECK-NEXT: [[AMDGPU_SMED3_:%[0-9]+]]:vgpr(s32) = G_AMDGPU_SMED3 [[COPY]], [[COPY2]], [[COPY1]] + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY [[C1]](s32) + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) + ; CHECK-NEXT: [[AMDGPU_SMED3_:%[0-9]+]]:vgpr(s32) = G_AMDGPU_SMED3 [[COPY]], [[COPY1]], [[COPY2]] ; CHECK-NEXT: $vgpr0 = COPY [[AMDGPU_SMED3_]](s32) ; CHECK-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0 %0:vgpr(s32) = COPY $vgpr0 @@ -171,10 +171,10 @@ ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 17 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 -12 - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[C1]](s32) - ; CHECK-NEXT: [[AMDGPU_SMED3_:%[0-9]+]]:vgpr(s32) = G_AMDGPU_SMED3 [[COPY]], [[COPY2]], [[COPY1]] + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY [[C1]](s32) + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) + ; CHECK-NEXT: [[AMDGPU_SMED3_:%[0-9]+]]:vgpr(s32) = G_AMDGPU_SMED3 [[COPY]], [[COPY1]], [[COPY2]] ; CHECK-NEXT: $vgpr0 = COPY [[AMDGPU_SMED3_]](s32) ; CHECK-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0 %0:vgpr(s32) = COPY $vgpr0 @@ -202,10 +202,10 @@ ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 17 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 -12 - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[C1]](s32) - ; CHECK-NEXT: [[AMDGPU_SMED3_:%[0-9]+]]:vgpr(s32) = G_AMDGPU_SMED3 [[COPY]], [[COPY2]], [[COPY1]] + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY [[C1]](s32) + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) + ; CHECK-NEXT: [[AMDGPU_SMED3_:%[0-9]+]]:vgpr(s32) = G_AMDGPU_SMED3 [[COPY]], [[COPY1]], [[COPY2]] ; CHECK-NEXT: $vgpr0 = COPY [[AMDGPU_SMED3_]](s32) ; CHECK-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0 %0:vgpr(s32) = COPY $vgpr0 @@ -233,10 +233,10 @@ ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 17 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 -12 - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[C1]](s32) - ; CHECK-NEXT: [[AMDGPU_SMED3_:%[0-9]+]]:vgpr(s32) = G_AMDGPU_SMED3 [[COPY]], [[COPY2]], [[COPY1]] + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY [[C1]](s32) + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) + ; CHECK-NEXT: [[AMDGPU_SMED3_:%[0-9]+]]:vgpr(s32) = G_AMDGPU_SMED3 [[COPY]], [[COPY1]], [[COPY2]] ; CHECK-NEXT: $vgpr0 = COPY [[AMDGPU_SMED3_]](s32) ; CHECK-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0 %0:vgpr(s32) = COPY $vgpr0 @@ -332,8 +332,8 @@ ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 -12 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 65 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[C1]](s32) ; CHECK-NEXT: [[AMDGPU_SMED3_:%[0-9]+]]:vgpr(s32) = G_AMDGPU_SMED3 [[COPY]], [[COPY1]], [[COPY2]] ; CHECK-NEXT: $vgpr0 = COPY [[AMDGPU_SMED3_]](s32) diff --git 
a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankcombiner-umed3.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankcombiner-umed3.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankcombiner-umed3.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankcombiner-umed3.mir @@ -16,8 +16,8 @@ ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 12 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 17 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[C1]](s32) ; CHECK-NEXT: [[AMDGPU_UMED3_:%[0-9]+]]:vgpr(s32) = G_AMDGPU_UMED3 [[COPY]], [[COPY1]], [[COPY2]] ; CHECK-NEXT: $vgpr0 = COPY [[AMDGPU_UMED3_]](s32) @@ -47,8 +47,8 @@ ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 12 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 17 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[C1]](s32) ; CHECK-NEXT: [[AMDGPU_UMED3_:%[0-9]+]]:vgpr(s32) = G_AMDGPU_UMED3 [[COPY]], [[COPY1]], [[COPY2]] ; CHECK-NEXT: $vgpr0 = COPY [[AMDGPU_UMED3_]](s32) @@ -78,8 +78,8 @@ ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 12 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 17 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[C1]](s32) ; CHECK-NEXT: [[AMDGPU_UMED3_:%[0-9]+]]:vgpr(s32) = G_AMDGPU_UMED3 [[COPY]], [[COPY1]], [[COPY2]] ; CHECK-NEXT: $vgpr0 = COPY [[AMDGPU_UMED3_]](s32) @@ -109,8 +109,8 @@ ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 12 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 17 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[C1]](s32) ; CHECK-NEXT: [[AMDGPU_UMED3_:%[0-9]+]]:vgpr(s32) = G_AMDGPU_UMED3 [[COPY]], [[COPY1]], [[COPY2]] ; CHECK-NEXT: $vgpr0 = COPY [[AMDGPU_UMED3_]](s32) @@ -140,10 +140,10 @@ ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 17 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 12 - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[C1]](s32) - ; CHECK-NEXT: [[AMDGPU_UMED3_:%[0-9]+]]:vgpr(s32) = G_AMDGPU_UMED3 [[COPY]], [[COPY2]], [[COPY1]] + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY [[C1]](s32) + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) + ; CHECK-NEXT: [[AMDGPU_UMED3_:%[0-9]+]]:vgpr(s32) = G_AMDGPU_UMED3 [[COPY]], [[COPY1]], [[COPY2]] ; CHECK-NEXT: $vgpr0 = COPY [[AMDGPU_UMED3_]](s32) ; CHECK-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0 %0:vgpr(s32) = COPY $vgpr0 @@ -171,10 +171,10 @@ ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 17 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 12 - ; CHECK-NEXT: 
[[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[C1]](s32) - ; CHECK-NEXT: [[AMDGPU_UMED3_:%[0-9]+]]:vgpr(s32) = G_AMDGPU_UMED3 [[COPY]], [[COPY2]], [[COPY1]] + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY [[C1]](s32) + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) + ; CHECK-NEXT: [[AMDGPU_UMED3_:%[0-9]+]]:vgpr(s32) = G_AMDGPU_UMED3 [[COPY]], [[COPY1]], [[COPY2]] ; CHECK-NEXT: $vgpr0 = COPY [[AMDGPU_UMED3_]](s32) ; CHECK-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0 %0:vgpr(s32) = COPY $vgpr0 @@ -202,10 +202,10 @@ ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 17 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 12 - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[C1]](s32) - ; CHECK-NEXT: [[AMDGPU_UMED3_:%[0-9]+]]:vgpr(s32) = G_AMDGPU_UMED3 [[COPY]], [[COPY2]], [[COPY1]] + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY [[C1]](s32) + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) + ; CHECK-NEXT: [[AMDGPU_UMED3_:%[0-9]+]]:vgpr(s32) = G_AMDGPU_UMED3 [[COPY]], [[COPY1]], [[COPY2]] ; CHECK-NEXT: $vgpr0 = COPY [[AMDGPU_UMED3_]](s32) ; CHECK-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0 %0:vgpr(s32) = COPY $vgpr0 @@ -233,10 +233,10 @@ ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 17 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 12 - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[C1]](s32) - ; CHECK-NEXT: [[AMDGPU_UMED3_:%[0-9]+]]:vgpr(s32) = G_AMDGPU_UMED3 [[COPY]], [[COPY2]], [[COPY1]] + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY [[C1]](s32) + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) + ; CHECK-NEXT: [[AMDGPU_UMED3_:%[0-9]+]]:vgpr(s32) = G_AMDGPU_UMED3 [[COPY]], [[COPY1]], [[COPY2]] ; CHECK-NEXT: $vgpr0 = COPY [[AMDGPU_UMED3_]](s32) ; CHECK-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0 %0:vgpr(s32) = COPY $vgpr0 @@ -333,8 +333,8 @@ ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 12 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 65 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[C1]](s32) ; CHECK-NEXT: [[AMDGPU_UMED3_:%[0-9]+]]:vgpr(s32) = G_AMDGPU_UMED3 [[COPY]], [[COPY1]], [[COPY2]] ; CHECK-NEXT: $vgpr0 = COPY [[AMDGPU_UMED3_]](s32) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-and.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-and.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-and.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-and.mir @@ -227,10 +227,8 @@ ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr3 ; CHECK-NEXT: [[MV:%[0-9]+]]:vgpr(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32) ; CHECK-NEXT: [[MV1:%[0-9]+]]:vgpr(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32) - ; CHECK-NEXT: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[MV]](s64) - ; CHECK-NEXT: [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[MV1]](s64) - ; CHECK-NEXT: [[AND:%[0-9]+]]:vgpr(s32) = G_AND [[UV]], [[UV2]] - ; CHECK-NEXT: [[AND1:%[0-9]+]]:vgpr(s32) = G_AND [[UV1]], [[UV3]] + ; 
CHECK-NEXT: [[AND:%[0-9]+]]:vgpr(s32) = G_AND [[COPY]], [[COPY2]] + ; CHECK-NEXT: [[AND1:%[0-9]+]]:vgpr(s32) = G_AND [[COPY1]], [[COPY3]] ; CHECK-NEXT: [[MV2:%[0-9]+]]:vgpr(s64) = G_MERGE_VALUES [[AND]](s32), [[AND1]](s32) ; CHECK-NEXT: S_NOP 0, implicit [[MV2]](s64) %0:_(s32) = COPY $vgpr0 @@ -259,9 +257,8 @@ ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[COPY1]](s32) ; CHECK-NEXT: [[MV:%[0-9]+]]:vgpr(s64) = G_MERGE_VALUES [[COPY3]](s32), [[COPY2]](s32) ; CHECK-NEXT: [[UV:%[0-9]+]]:sgpr(s32), [[UV1:%[0-9]+]]:sgpr(s32) = G_UNMERGE_VALUES [[COPY]](s64) - ; CHECK-NEXT: [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[MV]](s64) - ; CHECK-NEXT: [[AND:%[0-9]+]]:vgpr(s32) = G_AND [[UV]], [[UV2]] - ; CHECK-NEXT: [[AND1:%[0-9]+]]:vgpr(s32) = G_AND [[UV1]], [[UV3]] + ; CHECK-NEXT: [[AND:%[0-9]+]]:vgpr(s32) = G_AND [[UV]], [[COPY3]] + ; CHECK-NEXT: [[AND1:%[0-9]+]]:vgpr(s32) = G_AND [[UV1]], [[COPY2]] ; CHECK-NEXT: [[MV1:%[0-9]+]]:vgpr(s64) = G_MERGE_VALUES [[AND]](s32), [[AND1]](s32) ; CHECK-NEXT: S_NOP 0, implicit [[MV1]](s64) %0:_(s64) = COPY $sgpr0_sgpr1 @@ -288,9 +285,8 @@ ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[COPY1]](s32) ; CHECK-NEXT: [[MV:%[0-9]+]]:vgpr(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32) ; CHECK-NEXT: [[UV:%[0-9]+]]:sgpr(s32), [[UV1:%[0-9]+]]:sgpr(s32) = G_UNMERGE_VALUES [[COPY]](s64) - ; CHECK-NEXT: [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[MV]](s64) - ; CHECK-NEXT: [[AND:%[0-9]+]]:vgpr(s32) = G_AND [[UV]], [[UV2]] - ; CHECK-NEXT: [[AND1:%[0-9]+]]:vgpr(s32) = G_AND [[UV1]], [[UV3]] + ; CHECK-NEXT: [[AND:%[0-9]+]]:vgpr(s32) = G_AND [[UV]], [[COPY2]] + ; CHECK-NEXT: [[AND1:%[0-9]+]]:vgpr(s32) = G_AND [[UV1]], [[COPY3]] ; CHECK-NEXT: [[MV1:%[0-9]+]]:vgpr(s64) = G_MERGE_VALUES [[AND]](s32), [[AND1]](s32) ; CHECK-NEXT: S_NOP 0, implicit [[MV1]](s64) %0:_(s64) = COPY $sgpr0_sgpr1 @@ -319,10 +315,8 @@ ; CHECK-NEXT: [[MV:%[0-9]+]]:vgpr(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY2]](s32) ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[COPY1]](s32) ; CHECK-NEXT: [[MV1:%[0-9]+]]:vgpr(s64) = G_MERGE_VALUES [[COPY5]](s32), [[COPY3]](s32) - ; CHECK-NEXT: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[MV]](s64) - ; CHECK-NEXT: [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[MV1]](s64) - ; CHECK-NEXT: [[AND:%[0-9]+]]:vgpr(s32) = G_AND [[UV]], [[UV2]] - ; CHECK-NEXT: [[AND1:%[0-9]+]]:vgpr(s32) = G_AND [[UV1]], [[UV3]] + ; CHECK-NEXT: [[AND:%[0-9]+]]:vgpr(s32) = G_AND [[COPY4]], [[COPY5]] + ; CHECK-NEXT: [[AND1:%[0-9]+]]:vgpr(s32) = G_AND [[COPY2]], [[COPY3]] ; CHECK-NEXT: [[MV2:%[0-9]+]]:vgpr(s64) = G_MERGE_VALUES [[AND]](s32), [[AND1]](s32) ; CHECK-NEXT: S_NOP 0, implicit [[MV2]](s64) %0:_(s32) = COPY $sgpr0 @@ -353,10 +347,8 @@ ; CHECK-NEXT: [[MV:%[0-9]+]]:vgpr(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY2]](s32) ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[COPY1]](s32) ; CHECK-NEXT: [[MV1:%[0-9]+]]:vgpr(s64) = G_MERGE_VALUES [[COPY3]](s32), [[COPY5]](s32) - ; CHECK-NEXT: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[MV]](s64) - ; CHECK-NEXT: [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[MV1]](s64) - ; CHECK-NEXT: [[AND:%[0-9]+]]:vgpr(s32) = G_AND [[UV]], [[UV2]] - ; CHECK-NEXT: [[AND1:%[0-9]+]]:vgpr(s32) = G_AND [[UV1]], [[UV3]] + ; CHECK-NEXT: [[AND:%[0-9]+]]:vgpr(s32) = G_AND [[COPY4]], [[COPY3]] + ; CHECK-NEXT: [[AND1:%[0-9]+]]:vgpr(s32) = G_AND [[COPY2]], [[COPY5]] ; CHECK-NEXT: 
[[MV2:%[0-9]+]]:vgpr(s64) = G_MERGE_VALUES [[AND]](s32), [[AND1]](s32)
; CHECK-NEXT: S_NOP 0, implicit [[MV2]](s64)
%0:_(s32) = COPY $sgpr0
@@ -388,9 +380,8 @@
; CHECK-NEXT: [[AND1:%[0-9]+]]:vgpr(s32) = G_AND [[UV1]], [[UV3]]
; CHECK-NEXT: [[MV:%[0-9]+]]:vgpr(s64) = G_MERGE_VALUES [[AND]](s32), [[AND1]](s32)
; CHECK-NEXT: [[UV4:%[0-9]+]]:sgpr(s32), [[UV5:%[0-9]+]]:sgpr(s32) = G_UNMERGE_VALUES [[COPY1]](s64)
- ; CHECK-NEXT: [[UV6:%[0-9]+]]:vgpr(s32), [[UV7:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[MV]](s64)
- ; CHECK-NEXT: [[AND2:%[0-9]+]]:vgpr(s32) = G_AND [[UV4]], [[UV6]]
- ; CHECK-NEXT: [[AND3:%[0-9]+]]:vgpr(s32) = G_AND [[UV5]], [[UV7]]
+ ; CHECK-NEXT: [[AND2:%[0-9]+]]:vgpr(s32) = G_AND [[UV4]], [[AND]]
+ ; CHECK-NEXT: [[AND3:%[0-9]+]]:vgpr(s32) = G_AND [[UV5]], [[AND1]]
; CHECK-NEXT: [[MV1:%[0-9]+]]:vgpr(s64) = G_MERGE_VALUES [[AND2]](s32), [[AND3]](s32)
; CHECK-NEXT: S_NOP 0, implicit [[MV1]](s64)
%0:_(s64) = COPY $sgpr0_sgpr1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-or.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-or.mir
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-or.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-or.mir
@@ -394,10 +394,8 @@
; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr3
; CHECK-NEXT: [[MV:%[0-9]+]]:vgpr(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
; CHECK-NEXT: [[MV1:%[0-9]+]]:vgpr(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
- ; CHECK-NEXT: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[MV]](s64)
- ; CHECK-NEXT: [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[MV1]](s64)
- ; CHECK-NEXT: [[OR:%[0-9]+]]:vgpr(s32) = G_OR [[UV]], [[UV2]]
- ; CHECK-NEXT: [[OR1:%[0-9]+]]:vgpr(s32) = G_OR [[UV1]], [[UV3]]
+ ; CHECK-NEXT: [[OR:%[0-9]+]]:vgpr(s32) = G_OR [[COPY]], [[COPY2]]
+ ; CHECK-NEXT: [[OR1:%[0-9]+]]:vgpr(s32) = G_OR [[COPY1]], [[COPY3]]
; CHECK-NEXT: [[MV2:%[0-9]+]]:vgpr(s64) = G_MERGE_VALUES [[OR]](s32), [[OR1]](s32)
; CHECK-NEXT: S_NOP 0, implicit [[MV2]](s64)
%0:_(s32) = COPY $vgpr0
@@ -426,9 +424,8 @@
; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[COPY1]](s32)
; CHECK-NEXT: [[MV:%[0-9]+]]:vgpr(s64) = G_MERGE_VALUES [[COPY3]](s32), [[COPY2]](s32)
; CHECK-NEXT: [[UV:%[0-9]+]]:sgpr(s32), [[UV1:%[0-9]+]]:sgpr(s32) = G_UNMERGE_VALUES [[COPY]](s64)
- ; CHECK-NEXT: [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[MV]](s64)
- ; CHECK-NEXT: [[OR:%[0-9]+]]:vgpr(s32) = G_OR [[UV]], [[UV2]]
- ; CHECK-NEXT: [[OR1:%[0-9]+]]:vgpr(s32) = G_OR [[UV1]], [[UV3]]
+ ; CHECK-NEXT: [[OR:%[0-9]+]]:vgpr(s32) = G_OR [[UV]], [[COPY3]]
+ ; CHECK-NEXT: [[OR1:%[0-9]+]]:vgpr(s32) = G_OR [[UV1]], [[COPY2]]
; CHECK-NEXT: [[MV1:%[0-9]+]]:vgpr(s64) = G_MERGE_VALUES [[OR]](s32), [[OR1]](s32)
; CHECK-NEXT: S_NOP 0, implicit [[MV1]](s64)
%0:_(s64) = COPY $sgpr0_sgpr1
@@ -455,9 +452,8 @@
; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[COPY1]](s32)
; CHECK-NEXT: [[MV:%[0-9]+]]:vgpr(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
; CHECK-NEXT: [[UV:%[0-9]+]]:sgpr(s32), [[UV1:%[0-9]+]]:sgpr(s32) = G_UNMERGE_VALUES [[COPY]](s64)
- ; CHECK-NEXT: [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[MV]](s64)
- ; CHECK-NEXT: [[OR:%[0-9]+]]:vgpr(s32) = G_OR [[UV]], [[UV2]]
- ; CHECK-NEXT: [[OR1:%[0-9]+]]:vgpr(s32) = G_OR [[UV1]], [[UV3]]
+ ; CHECK-NEXT: [[OR:%[0-9]+]]:vgpr(s32) = G_OR [[UV]], [[COPY2]]
+ ; CHECK-NEXT: [[OR1:%[0-9]+]]:vgpr(s32) = G_OR [[UV1]], [[COPY3]]
; CHECK-NEXT: [[MV1:%[0-9]+]]:vgpr(s64) = G_MERGE_VALUES [[OR]](s32), [[OR1]](s32)
; CHECK-NEXT: S_NOP 0, implicit [[MV1]](s64)
%0:_(s64) = COPY $sgpr0_sgpr1
@@ -486,10 +482,8 @@
; CHECK-NEXT: [[MV:%[0-9]+]]:vgpr(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY2]](s32)
; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[COPY1]](s32)
; CHECK-NEXT: [[MV1:%[0-9]+]]:vgpr(s64) = G_MERGE_VALUES [[COPY5]](s32), [[COPY3]](s32)
- ; CHECK-NEXT: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[MV]](s64)
- ; CHECK-NEXT: [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[MV1]](s64)
- ; CHECK-NEXT: [[OR:%[0-9]+]]:vgpr(s32) = G_OR [[UV]], [[UV2]]
- ; CHECK-NEXT: [[OR1:%[0-9]+]]:vgpr(s32) = G_OR [[UV1]], [[UV3]]
+ ; CHECK-NEXT: [[OR:%[0-9]+]]:vgpr(s32) = G_OR [[COPY4]], [[COPY5]]
+ ; CHECK-NEXT: [[OR1:%[0-9]+]]:vgpr(s32) = G_OR [[COPY2]], [[COPY3]]
; CHECK-NEXT: [[MV2:%[0-9]+]]:vgpr(s64) = G_MERGE_VALUES [[OR]](s32), [[OR1]](s32)
; CHECK-NEXT: S_NOP 0, implicit [[MV2]](s64)
%0:_(s32) = COPY $sgpr0
@@ -520,10 +514,8 @@
; CHECK-NEXT: [[MV:%[0-9]+]]:vgpr(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY2]](s32)
; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[COPY1]](s32)
; CHECK-NEXT: [[MV1:%[0-9]+]]:vgpr(s64) = G_MERGE_VALUES [[COPY3]](s32), [[COPY5]](s32)
- ; CHECK-NEXT: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[MV]](s64)
- ; CHECK-NEXT: [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[MV1]](s64)
- ; CHECK-NEXT: [[OR:%[0-9]+]]:vgpr(s32) = G_OR [[UV]], [[UV2]]
- ; CHECK-NEXT: [[OR1:%[0-9]+]]:vgpr(s32) = G_OR [[UV1]], [[UV3]]
+ ; CHECK-NEXT: [[OR:%[0-9]+]]:vgpr(s32) = G_OR [[COPY4]], [[COPY3]]
+ ; CHECK-NEXT: [[OR1:%[0-9]+]]:vgpr(s32) = G_OR [[COPY2]], [[COPY5]]
; CHECK-NEXT: [[MV2:%[0-9]+]]:vgpr(s64) = G_MERGE_VALUES [[OR]](s32), [[OR1]](s32)
; CHECK-NEXT: S_NOP 0, implicit [[MV2]](s64)
%0:_(s32) = COPY $sgpr0
@@ -555,9 +547,8 @@
; CHECK-NEXT: [[OR1:%[0-9]+]]:vgpr(s32) = G_OR [[UV1]], [[UV3]]
; CHECK-NEXT: [[MV:%[0-9]+]]:vgpr(s64) = G_MERGE_VALUES [[OR]](s32), [[OR1]](s32)
; CHECK-NEXT: [[UV4:%[0-9]+]]:sgpr(s32), [[UV5:%[0-9]+]]:sgpr(s32) = G_UNMERGE_VALUES [[COPY1]](s64)
- ; CHECK-NEXT: [[UV6:%[0-9]+]]:vgpr(s32), [[UV7:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[MV]](s64)
- ; CHECK-NEXT: [[OR2:%[0-9]+]]:vgpr(s32) = G_OR [[UV4]], [[UV6]]
- ; CHECK-NEXT: [[OR3:%[0-9]+]]:vgpr(s32) = G_OR [[UV5]], [[UV7]]
+ ; CHECK-NEXT: [[OR2:%[0-9]+]]:vgpr(s32) = G_OR [[UV4]], [[OR]]
+ ; CHECK-NEXT: [[OR3:%[0-9]+]]:vgpr(s32) = G_OR [[UV5]], [[OR1]]
; CHECK-NEXT: [[MV1:%[0-9]+]]:vgpr(s64) = G_MERGE_VALUES [[OR2]](s32), [[OR3]](s32)
; CHECK-NEXT: S_NOP 0, implicit [[MV1]](s64)
%0:_(s64) = COPY $sgpr0_sgpr1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-xor.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-xor.mir
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-xor.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-xor.mir
@@ -394,10 +394,8 @@
; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr3
; CHECK-NEXT: [[MV:%[0-9]+]]:vgpr(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
; CHECK-NEXT: [[MV1:%[0-9]+]]:vgpr(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
- ; CHECK-NEXT: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[MV]](s64)
- ; CHECK-NEXT: [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[MV1]](s64)
- ; CHECK-NEXT: [[XOR:%[0-9]+]]:vgpr(s32) = G_XOR [[UV]], [[UV2]]
- ; CHECK-NEXT: [[XOR1:%[0-9]+]]:vgpr(s32) = G_XOR [[UV1]], [[UV3]]
+ ; CHECK-NEXT: [[XOR:%[0-9]+]]:vgpr(s32) = G_XOR [[COPY]], [[COPY2]]
+ ; CHECK-NEXT: [[XOR1:%[0-9]+]]:vgpr(s32) = G_XOR [[COPY1]], [[COPY3]]
; CHECK-NEXT: [[MV2:%[0-9]+]]:vgpr(s64) = G_MERGE_VALUES [[XOR]](s32), [[XOR1]](s32)
; CHECK-NEXT: S_NOP 0, implicit [[MV2]](s64)
%0:_(s32) = COPY $vgpr0
@@ -426,9 +424,8 @@
; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[COPY1]](s32)
; CHECK-NEXT: [[MV:%[0-9]+]]:vgpr(s64) = G_MERGE_VALUES [[COPY3]](s32), [[COPY2]](s32)
; CHECK-NEXT: [[UV:%[0-9]+]]:sgpr(s32), [[UV1:%[0-9]+]]:sgpr(s32) = G_UNMERGE_VALUES [[COPY]](s64)
- ; CHECK-NEXT: [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[MV]](s64)
- ; CHECK-NEXT: [[XOR:%[0-9]+]]:vgpr(s32) = G_XOR [[UV]], [[UV2]]
- ; CHECK-NEXT: [[XOR1:%[0-9]+]]:vgpr(s32) = G_XOR [[UV1]], [[UV3]]
+ ; CHECK-NEXT: [[XOR:%[0-9]+]]:vgpr(s32) = G_XOR [[UV]], [[COPY3]]
+ ; CHECK-NEXT: [[XOR1:%[0-9]+]]:vgpr(s32) = G_XOR [[UV1]], [[COPY2]]
; CHECK-NEXT: [[MV1:%[0-9]+]]:vgpr(s64) = G_MERGE_VALUES [[XOR]](s32), [[XOR1]](s32)
; CHECK-NEXT: S_NOP 0, implicit [[MV1]](s64)
%0:_(s64) = COPY $sgpr0_sgpr1
@@ -455,9 +452,8 @@
; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[COPY1]](s32)
; CHECK-NEXT: [[MV:%[0-9]+]]:vgpr(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
; CHECK-NEXT: [[UV:%[0-9]+]]:sgpr(s32), [[UV1:%[0-9]+]]:sgpr(s32) = G_UNMERGE_VALUES [[COPY]](s64)
- ; CHECK-NEXT: [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[MV]](s64)
- ; CHECK-NEXT: [[XOR:%[0-9]+]]:vgpr(s32) = G_XOR [[UV]], [[UV2]]
- ; CHECK-NEXT: [[XOR1:%[0-9]+]]:vgpr(s32) = G_XOR [[UV1]], [[UV3]]
+ ; CHECK-NEXT: [[XOR:%[0-9]+]]:vgpr(s32) = G_XOR [[UV]], [[COPY2]]
+ ; CHECK-NEXT: [[XOR1:%[0-9]+]]:vgpr(s32) = G_XOR [[UV1]], [[COPY3]]
; CHECK-NEXT: [[MV1:%[0-9]+]]:vgpr(s64) = G_MERGE_VALUES [[XOR]](s32), [[XOR1]](s32)
; CHECK-NEXT: S_NOP 0, implicit [[MV1]](s64)
%0:_(s64) = COPY $sgpr0_sgpr1
@@ -486,10 +482,8 @@
; CHECK-NEXT: [[MV:%[0-9]+]]:vgpr(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY2]](s32)
; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[COPY1]](s32)
; CHECK-NEXT: [[MV1:%[0-9]+]]:vgpr(s64) = G_MERGE_VALUES [[COPY5]](s32), [[COPY3]](s32)
- ; CHECK-NEXT: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[MV]](s64)
- ; CHECK-NEXT: [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[MV1]](s64)
- ; CHECK-NEXT: [[XOR:%[0-9]+]]:vgpr(s32) = G_XOR [[UV]], [[UV2]]
- ; CHECK-NEXT: [[XOR1:%[0-9]+]]:vgpr(s32) = G_XOR [[UV1]], [[UV3]]
+ ; CHECK-NEXT: [[XOR:%[0-9]+]]:vgpr(s32) = G_XOR [[COPY4]], [[COPY5]]
+ ; CHECK-NEXT: [[XOR1:%[0-9]+]]:vgpr(s32) = G_XOR [[COPY2]], [[COPY3]]
; CHECK-NEXT: [[MV2:%[0-9]+]]:vgpr(s64) = G_MERGE_VALUES [[XOR]](s32), [[XOR1]](s32)
; CHECK-NEXT: S_NOP 0, implicit [[MV2]](s64)
%0:_(s32) = COPY $sgpr0
@@ -520,10 +514,8 @@
; CHECK-NEXT: [[MV:%[0-9]+]]:vgpr(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY2]](s32)
; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[COPY1]](s32)
; CHECK-NEXT: [[MV1:%[0-9]+]]:vgpr(s64) = G_MERGE_VALUES [[COPY3]](s32), [[COPY5]](s32)
- ; CHECK-NEXT: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[MV]](s64)
- ; CHECK-NEXT: [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[MV1]](s64)
- ; CHECK-NEXT: [[XOR:%[0-9]+]]:vgpr(s32) = G_XOR [[UV]], [[UV2]]
- ; CHECK-NEXT: [[XOR1:%[0-9]+]]:vgpr(s32) = G_XOR [[UV1]], [[UV3]]
+ ; CHECK-NEXT: [[XOR:%[0-9]+]]:vgpr(s32) = G_XOR [[COPY4]], [[COPY3]]
+ ; CHECK-NEXT: [[XOR1:%[0-9]+]]:vgpr(s32) = G_XOR [[COPY2]], [[COPY5]]
; CHECK-NEXT: [[MV2:%[0-9]+]]:vgpr(s64) = G_MERGE_VALUES [[XOR]](s32), [[XOR1]](s32)
; CHECK-NEXT: S_NOP 0, implicit [[MV2]](s64)
%0:_(s32) = COPY $sgpr0
@@ -555,9 +547,8 @@
; CHECK-NEXT: [[XOR1:%[0-9]+]]:vgpr(s32) = G_XOR [[UV1]], [[UV3]]
; CHECK-NEXT: [[MV:%[0-9]+]]:vgpr(s64) = G_MERGE_VALUES [[XOR]](s32), [[XOR1]](s32)
; CHECK-NEXT: [[UV4:%[0-9]+]]:sgpr(s32), [[UV5:%[0-9]+]]:sgpr(s32) = G_UNMERGE_VALUES [[COPY1]](s64)
- ; CHECK-NEXT: [[UV6:%[0-9]+]]:vgpr(s32), [[UV7:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[MV]](s64)
- ; CHECK-NEXT: [[XOR2:%[0-9]+]]:vgpr(s32) = G_XOR [[UV4]], [[UV6]]
- ; CHECK-NEXT: [[XOR3:%[0-9]+]]:vgpr(s32) = G_XOR [[UV5]], [[UV7]]
+ ; CHECK-NEXT: [[XOR2:%[0-9]+]]:vgpr(s32) = G_XOR [[UV4]], [[XOR]]
+ ; CHECK-NEXT: [[XOR3:%[0-9]+]]:vgpr(s32) = G_XOR [[UV5]], [[XOR1]]
; CHECK-NEXT: [[MV1:%[0-9]+]]:vgpr(s64) = G_MERGE_VALUES [[XOR2]](s32), [[XOR3]](s32)
; CHECK-NEXT: S_NOP 0, implicit [[MV1]](s64)
%0:_(s64) = COPY $sgpr0_sgpr1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/xnor.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/xnor.ll
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/xnor.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/xnor.ll
@@ -265,10 +265,8 @@
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: v_xor_b32_e32 v0, v0, v2
-; GFX10-NEXT: v_xor_b32_e32 v1, v1, v3
-; GFX10-NEXT: v_xor_b32_e32 v0, -1, v0
-; GFX10-NEXT: v_xor_b32_e32 v1, -1, v1
+; GFX10-NEXT: v_xor3_b32 v0, v0, v2, -1
+; GFX10-NEXT: v_xor3_b32 v1, v1, v3, -1
; GFX10-NEXT: s_setpc_b64 s[30:31]
entry:
%xor = xor i64 %a, %b
@@ -384,10 +382,8 @@
; GFX10-LABEL: xnor_i64_s_v_one_use:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: v_lshlrev_b64 v[0:1], 29, v[0:1]
-; GFX10-NEXT: v_xor_b32_e32 v0, s0, v0
-; GFX10-NEXT: v_xor_b32_e32 v1, s1, v1
-; GFX10-NEXT: v_xor_b32_e32 v0, -1, v0
-; GFX10-NEXT: v_xor_b32_e32 v1, -1, v1
+; GFX10-NEXT: v_xor3_b32 v0, s0, v0, -1
+; GFX10-NEXT: v_xor3_b32 v1, s1, v1, -1
; GFX10-NEXT: ; return to shader part epilog
entry:
%b = shl i64 %b64, 29
@@ -437,10 +433,8 @@
; GFX10-LABEL: xnor_i64_v_s_one_use:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_lshlrev_b64 v[0:1], 29, v[0:1]
-; GFX10-NEXT: v_xor_b32_e32 v0, s0, v0
-; GFX10-NEXT: v_xor_b32_e32 v1, s1, v1
-; GFX10-NEXT: v_xor_b32_e32 v0, -1, v0
-; GFX10-NEXT: v_xor_b32_e32 v1, -1, v1
+; GFX10-NEXT: v_xor3_b32 v0, v0, s0, -1
+; GFX10-NEXT: v_xor3_b32 v1, v1, s1, -1
; GFX10-NEXT: ; return to shader part epilog
%b = shl i64 %b64, 29
%xor = xor i64 %b, %a
diff --git a/llvm/test/CodeGen/AMDGPU/bfi_int.ll b/llvm/test/CodeGen/AMDGPU/bfi_int.ll
--- a/llvm/test/CodeGen/AMDGPU/bfi_int.ll
+++ b/llvm/test/CodeGen/AMDGPU/bfi_int.ll
@@ -253,14 +253,14 @@
define amdgpu_ps float @v_s_s_bfi_sha256_ch(i32 %x, i32 inreg %y, i32 inreg %z) {
; GFX7-LABEL: v_s_s_bfi_sha256_ch:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: v_mov_b32_e32 v1, s1
-; GFX7-NEXT: v_bfi_b32 v0, v0, s0, v1
+; GFX7-NEXT: v_mov_b32_e32 v1, s0
+; GFX7-NEXT: v_bfi_b32 v0, v0, v1, s1
; GFX7-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: v_s_s_bfi_sha256_ch:
; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_bfi_b32 v0, v0, s0, v1
+; GFX8-NEXT: v_mov_b32_e32 v1, s0
+; GFX8-NEXT: v_bfi_b32 v0, v0, v1, s1
; GFX8-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: v_s_s_bfi_sha256_ch:
@@ -270,16 +270,13 @@
;
; GFX8-GISEL-LABEL: v_s_s_bfi_sha256_ch:
; GFX8-GISEL: ; %bb.0: ; %entry
-; GFX8-GISEL-NEXT: s_xor_b32 s0, s0, s1
-; GFX8-GISEL-NEXT: v_and_b32_e32 v0, s0, v0
-; GFX8-GISEL-NEXT: v_xor_b32_e32 v0, s1, v0
+; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-GISEL-NEXT: v_bfi_b32 v0, v0, s0, v1
; GFX8-GISEL-NEXT: ; return to shader part epilog
;
; GFX10-GISEL-LABEL: v_s_s_bfi_sha256_ch:
; GFX10-GISEL: ; %bb.0: ; %entry
-; GFX10-GISEL-NEXT: s_xor_b32 s0, s0, s1
-; GFX10-GISEL-NEXT: v_and_b32_e32 v0, s0, v0
-; GFX10-GISEL-NEXT: v_xor_b32_e32 v0, s1, v0
+; GFX10-GISEL-NEXT: v_bfi_b32 v0, v0, s0, s1
; GFX10-GISEL-NEXT: ; return to shader part epilog
entry:
%xor0 = xor i32 %y, %z
@@ -292,14 +289,14 @@
define amdgpu_ps float @s_v_s_bfi_sha256_ch(i32 inreg %x, i32 %y, i32 inreg %z) {
; GFX7-LABEL: s_v_s_bfi_sha256_ch:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: v_mov_b32_e32 v1, s1
-; GFX7-NEXT: v_bfi_b32 v0, s0, v0, v1
+; GFX7-NEXT: v_mov_b32_e32 v1, s0
+; GFX7-NEXT: v_bfi_b32 v0, v1, v0, s1
; GFX7-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: s_v_s_bfi_sha256_ch:
; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_bfi_b32 v0, s0, v0, v1
+; GFX8-NEXT: v_mov_b32_e32 v1, s0
+; GFX8-NEXT: v_bfi_b32 v0, v1, v0, s1
; GFX8-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: s_v_s_bfi_sha256_ch:
@@ -331,14 +328,14 @@
define amdgpu_ps float @s_s_v_bfi_sha256_ch(i32 inreg %x, i32 inreg %y, i32 %z) {
; GFX7-LABEL: s_s_v_bfi_sha256_ch:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: v_mov_b32_e32 v1, s1
-; GFX7-NEXT: v_bfi_b32 v0, s0, v1, v0
+; GFX7-NEXT: v_mov_b32_e32 v1, s0
+; GFX7-NEXT: v_bfi_b32 v0, v1, s1, v0
; GFX7-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: s_s_v_bfi_sha256_ch:
; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_bfi_b32 v0, s0, v1, v0
+; GFX8-NEXT: v_mov_b32_e32 v1, s0
+; GFX8-NEXT: v_bfi_b32 v0, v1, s1, v0
; GFX8-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: s_s_v_bfi_sha256_ch:
@@ -626,24 +623,16 @@
; GFX8-GISEL-LABEL: v_bitselect_v2i32_pat1:
; GFX8-GISEL: ; %bb.0:
; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-GISEL-NEXT: v_xor_b32_e32 v0, v0, v4
-; GFX8-GISEL-NEXT: v_xor_b32_e32 v1, v1, v5
-; GFX8-GISEL-NEXT: v_and_b32_e32 v0, v0, v2
-; GFX8-GISEL-NEXT: v_and_b32_e32 v1, v1, v3
-; GFX8-GISEL-NEXT: v_xor_b32_e32 v0, v0, v4
-; GFX8-GISEL-NEXT: v_xor_b32_e32 v1, v1, v5
+; GFX8-GISEL-NEXT: v_bfi_b32 v0, v2, v0, v4
+; GFX8-GISEL-NEXT: v_bfi_b32 v1, v3, v1, v5
; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-GISEL-LABEL: v_bitselect_v2i32_pat1:
; GFX10-GISEL: ; %bb.0:
; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-GISEL-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-GISEL-NEXT: v_xor_b32_e32 v0, v0, v4
-; GFX10-GISEL-NEXT: v_xor_b32_e32 v1, v1, v5
-; GFX10-GISEL-NEXT: v_and_b32_e32 v0, v0, v2
-; GFX10-GISEL-NEXT: v_and_b32_e32 v1, v1, v3
-; GFX10-GISEL-NEXT: v_xor_b32_e32 v0, v0, v4
-; GFX10-GISEL-NEXT: v_xor_b32_e32 v1, v1, v5
+; GFX10-GISEL-NEXT: v_bfi_b32 v0, v2, v0, v4
+; GFX10-GISEL-NEXT: v_bfi_b32 v1, v3, v1, v5
; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
%xor.0 = xor <2 x i32> %a, %mask
%and = and <2 x i32> %xor.0, %b
@@ -693,12 +682,10 @@
; GFX10-GISEL-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-GISEL-NEXT: v_xor_b32_e32 v6, -1, v0
; GFX10-GISEL-NEXT: v_xor_b32_e32 v7, -1, v1
-; GFX10-GISEL-NEXT: v_and_b32_e32 v0, v0, v2
-; GFX10-GISEL-NEXT: v_and_b32_e32 v1, v1, v3
-; GFX10-GISEL-NEXT: v_and_b32_e32 v2, v6, v4
-; GFX10-GISEL-NEXT: v_and_b32_e32 v3, v7, v5
-; GFX10-GISEL-NEXT: v_or_b32_e32 v0, v0, v2
-; GFX10-GISEL-NEXT: v_or_b32_e32 v1, v1, v3
+; GFX10-GISEL-NEXT: v_and_b32_e32 v4, v6, v4
+; GFX10-GISEL-NEXT: v_and_b32_e32 v5, v7, v5
+; GFX10-GISEL-NEXT: v_and_or_b32 v0, v0, v2, v4
+; GFX10-GISEL-NEXT: v_and_or_b32 v1, v1, v3, v5
; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
%and0 = and i64 %a, %b
%not.a = xor i64 %a, -1
@@ -746,12 +733,10 @@
; GFX10-GISEL: ; %bb.0:
; GFX10-GISEL-NEXT: v_xor_b32_e32 v2, -1, v0
; GFX10-GISEL-NEXT: v_xor_b32_e32 v3, -1, v1
-; GFX10-GISEL-NEXT: v_and_b32_e32 v0, s0, v0
-; GFX10-GISEL-NEXT: v_and_b32_e32 v1, s1, v1
; GFX10-GISEL-NEXT: v_and_b32_e32 v2, s2, v2
; GFX10-GISEL-NEXT: v_and_b32_e32 v3, s3, v3
-; GFX10-GISEL-NEXT: v_or_b32_e32 v0, v0, v2
-; GFX10-GISEL-NEXT: v_or_b32_e32 v1, v1, v3
+; GFX10-GISEL-NEXT: v_and_or_b32 v0, v0, s0, v2
+; GFX10-GISEL-NEXT: v_and_or_b32 v1, v1, s1, v3
; GFX10-GISEL-NEXT: ; return to shader part epilog
%and0 = and i64 %a, %b
%not.a = xor i64 %a, -1
@@ -795,11 +780,9 @@
;
; GFX10-GISEL-LABEL: s_v_s_bitselect_i64_pat_0:
; GFX10-GISEL: ; %bb.0:
-; GFX10-GISEL-NEXT: v_and_b32_e32 v0, s0, v0
-; GFX10-GISEL-NEXT: v_and_b32_e32 v1, s1, v1
-; GFX10-GISEL-NEXT: s_andn2_b64 s[0:1], s[2:3], s[0:1]
-; GFX10-GISEL-NEXT: v_or_b32_e32 v0, s0, v0
-; GFX10-GISEL-NEXT: v_or_b32_e32 v1, s1, v1
+; GFX10-GISEL-NEXT: s_andn2_b64 s[2:3], s[2:3], s[0:1]
+; GFX10-GISEL-NEXT: v_and_or_b32 v0, s0, v0, s2
+; GFX10-GISEL-NEXT: v_and_or_b32 v1, s1, v1, s3
; GFX10-GISEL-NEXT: ; return to shader part epilog
%and0 = and i64 %a, %b
%not.a = xor i64 %a, -1
@@ -844,12 +827,10 @@
;
; GFX10-GISEL-LABEL: s_s_v_bitselect_i64_pat_0:
; GFX10-GISEL: ; %bb.0:
-; GFX10-GISEL-NEXT: s_not_b64 s[4:5], s[0:1]
-; GFX10-GISEL-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3]
-; GFX10-GISEL-NEXT: v_and_b32_e32 v0, s4, v0
-; GFX10-GISEL-NEXT: v_and_b32_e32 v1, s5, v1
-; GFX10-GISEL-NEXT: v_or_b32_e32 v0, s0, v0
-; GFX10-GISEL-NEXT: v_or_b32_e32 v1, s1, v1
+; GFX10-GISEL-NEXT: s_and_b64 s[2:3], s[0:1], s[2:3]
+; GFX10-GISEL-NEXT: s_not_b64 s[0:1], s[0:1]
+; GFX10-GISEL-NEXT: v_and_or_b32 v0, s0, v0, s2
+; GFX10-GISEL-NEXT: v_and_or_b32 v1, s1, v1, s3
; GFX10-GISEL-NEXT: ; return to shader part epilog
%and0 = and i64 %a, %b
%not.a = xor i64 %a, -1
@@ -894,12 +875,10 @@
; GFX10-GISEL: ; %bb.0:
; GFX10-GISEL-NEXT: v_xor_b32_e32 v4, -1, v0
; GFX10-GISEL-NEXT: v_xor_b32_e32 v5, -1, v1
-; GFX10-GISEL-NEXT: v_and_b32_e32 v0, v0, v2
-; GFX10-GISEL-NEXT: v_and_b32_e32 v1, v1, v3
-; GFX10-GISEL-NEXT: v_and_b32_e32 v2, s0, v4
-; GFX10-GISEL-NEXT: v_and_b32_e32 v3, s1, v5
-; GFX10-GISEL-NEXT: v_or_b32_e32 v0, v0, v2
-; GFX10-GISEL-NEXT: v_or_b32_e32 v1, v1, v3
+; GFX10-GISEL-NEXT: v_and_b32_e32 v4, s0, v4
+; GFX10-GISEL-NEXT: v_and_b32_e32 v5, s1, v5
+; GFX10-GISEL-NEXT: v_and_or_b32 v0, v0, v2, v4
+; GFX10-GISEL-NEXT: v_and_or_b32 v1, v1, v3, v5
; GFX10-GISEL-NEXT: ; return to shader part epilog
%and0 = and i64 %a, %b
%not.a = xor i64 %a, -1
@@ -944,12 +923,10 @@
; GFX10-GISEL: ; %bb.0:
; GFX10-GISEL-NEXT: v_xor_b32_e32 v4, -1, v0
; GFX10-GISEL-NEXT: v_xor_b32_e32 v5, -1, v1
-; GFX10-GISEL-NEXT: v_and_b32_e32 v0, s0, v0
-; GFX10-GISEL-NEXT: v_and_b32_e32 v1, s1, v1
; GFX10-GISEL-NEXT: v_and_b32_e32 v2, v4, v2
; GFX10-GISEL-NEXT: v_and_b32_e32 v3, v5, v3
-; GFX10-GISEL-NEXT: v_or_b32_e32 v0, v0, v2
-; GFX10-GISEL-NEXT: v_or_b32_e32 v1, v1, v3
+; GFX10-GISEL-NEXT: v_and_or_b32 v0, v0, s0, v2
+; GFX10-GISEL-NEXT: v_and_or_b32 v1, v1, s1, v3
; GFX10-GISEL-NEXT: ; return to shader part epilog
%and0 = and i64 %a, %b
%not.a = xor i64 %a, -1
@@ -992,12 +969,10 @@
; GFX10-GISEL-LABEL: s_v_v_bitselect_i64_pat_0:
; GFX10-GISEL: ; %bb.0:
; GFX10-GISEL-NEXT: s_not_b64 s[2:3], s[0:1]
-; GFX10-GISEL-NEXT: v_and_b32_e32 v0, s0, v0
-; GFX10-GISEL-NEXT: v_and_b32_e32 v1, s1, v1
; GFX10-GISEL-NEXT: v_and_b32_e32 v2, s2, v2
; GFX10-GISEL-NEXT: v_and_b32_e32 v3, s3, v3
-; GFX10-GISEL-NEXT: v_or_b32_e32 v0, v0, v2
-; GFX10-GISEL-NEXT: v_or_b32_e32 v1, v1, v3
+; GFX10-GISEL-NEXT: v_and_or_b32 v0, s0, v0, v2
+; GFX10-GISEL-NEXT: v_and_or_b32 v1, s1, v1, v3
; GFX10-GISEL-NEXT: ; return to shader part epilog
%and0 = and i64 %a, %b
%not.a = xor i64 %a, -1
@@ -1033,24 +1008,16 @@
; GFX8-GISEL-LABEL: v_bitselect_i64_pat_1:
; GFX8-GISEL: ; %bb.0:
; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-GISEL-NEXT: v_xor_b32_e32 v0, v0, v4
-; GFX8-GISEL-NEXT: v_xor_b32_e32 v1, v1, v5
-; GFX8-GISEL-NEXT: v_and_b32_e32 v0, v0, v2
-; GFX8-GISEL-NEXT: v_and_b32_e32 v1, v1, v3
-; GFX8-GISEL-NEXT: v_xor_b32_e32 v0, v0, v4
-; GFX8-GISEL-NEXT: v_xor_b32_e32 v1, v1, v5
+; GFX8-GISEL-NEXT: v_bfi_b32 v0, v2, v0, v4
+; GFX8-GISEL-NEXT: v_bfi_b32 v1, v3, v1, v5
; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-GISEL-LABEL: v_bitselect_i64_pat_1:
; GFX10-GISEL: ; %bb.0:
; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-GISEL-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-GISEL-NEXT: v_xor_b32_e32 v0, v0, v4
-; GFX10-GISEL-NEXT: v_xor_b32_e32 v1, v1, v5
-; GFX10-GISEL-NEXT: v_and_b32_e32 v0, v0, v2
-; GFX10-GISEL-NEXT: v_and_b32_e32 v1, v1, v3
-; GFX10-GISEL-NEXT: v_xor_b32_e32 v0, v0, v4
-; GFX10-GISEL-NEXT: v_xor_b32_e32 v1, v1, v5
+; GFX10-GISEL-NEXT: v_bfi_b32 v0, v2, v0, v4
+; GFX10-GISEL-NEXT: v_bfi_b32 v1, v3, v1, v5
; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
%xor.0 = xor i64 %a, %mask
%and = and i64 %xor.0, %b
@@ -1083,22 +1050,16 @@
;
; GFX8-GISEL-LABEL: v_s_s_bitselect_i64_pat_1:
; GFX8-GISEL: ; %bb.0:
-; GFX8-GISEL-NEXT: v_xor_b32_e32 v0, s2, v0
-; GFX8-GISEL-NEXT: v_xor_b32_e32 v1, s3, v1
-; GFX8-GISEL-NEXT: v_and_b32_e32 v0, s0, v0
-; GFX8-GISEL-NEXT: v_and_b32_e32 v1, s1, v1
-; GFX8-GISEL-NEXT: v_xor_b32_e32 v0, s2, v0
-; GFX8-GISEL-NEXT: v_xor_b32_e32 v1, s3, v1
+; GFX8-GISEL-NEXT: v_mov_b32_e32 v2, s0
+; GFX8-GISEL-NEXT: v_bfi_b32 v0, v2, v0, s2
+; GFX8-GISEL-NEXT: v_mov_b32_e32 v2, s1
+; GFX8-GISEL-NEXT: v_bfi_b32 v1, v2, v1, s3
; GFX8-GISEL-NEXT: ; return to shader part epilog
;
; GFX10-GISEL-LABEL: v_s_s_bitselect_i64_pat_1:
; GFX10-GISEL: ; %bb.0:
-; GFX10-GISEL-NEXT: v_xor_b32_e32 v0, s2, v0
-; GFX10-GISEL-NEXT: v_xor_b32_e32 v1, s3, v1
-; GFX10-GISEL-NEXT: v_and_b32_e32 v0, s0, v0
-; GFX10-GISEL-NEXT: v_and_b32_e32 v1, s1, v1
-; GFX10-GISEL-NEXT: v_xor_b32_e32 v0, s2, v0
-; GFX10-GISEL-NEXT: v_xor_b32_e32 v1, s3, v1
+; GFX10-GISEL-NEXT: v_bfi_b32 v0, s0, v0, s2
+; GFX10-GISEL-NEXT: v_bfi_b32 v1, s1, v1, s3
; GFX10-GISEL-NEXT: ; return to shader part epilog
%xor.0 = xor i64 %a, %mask
%and = and i64 %xor.0, %b
@@ -1132,22 +1093,16 @@
;
; GFX8-GISEL-LABEL: s_s_v_bitselect_i64_pat_1:
; GFX8-GISEL: ; %bb.0:
-; GFX8-GISEL-NEXT: v_xor_b32_e32 v2, s0, v0
-; GFX8-GISEL-NEXT: v_xor_b32_e32 v3, s1, v1
-; GFX8-GISEL-NEXT: v_and_b32_e32 v2, s2, v2
-; GFX8-GISEL-NEXT: v_and_b32_e32 v3, s3, v3
-; GFX8-GISEL-NEXT: v_xor_b32_e32 v0, v2, v0
-; GFX8-GISEL-NEXT: v_xor_b32_e32 v1, v3, v1
+; GFX8-GISEL-NEXT: v_mov_b32_e32 v2, s2
+; GFX8-GISEL-NEXT: v_bfi_b32 v0, v2, s0, v0
+; GFX8-GISEL-NEXT: v_mov_b32_e32 v2, s3
+; GFX8-GISEL-NEXT: v_bfi_b32 v1, v2, s1, v1
; GFX8-GISEL-NEXT: ; return to shader part epilog
;
; GFX10-GISEL-LABEL: s_s_v_bitselect_i64_pat_1:
; GFX10-GISEL: ; %bb.0:
-; GFX10-GISEL-NEXT: v_xor_b32_e32 v2, s0, v0
-; GFX10-GISEL-NEXT: v_xor_b32_e32 v3, s1, v1
-; GFX10-GISEL-NEXT: v_and_b32_e32 v2, s2, v2
-; GFX10-GISEL-NEXT: v_and_b32_e32 v3, s3, v3
-; GFX10-GISEL-NEXT: v_xor_b32_e32 v0, v2, v0
-; GFX10-GISEL-NEXT: v_xor_b32_e32 v1, v3, v1
+; GFX10-GISEL-NEXT: v_bfi_b32 v0, s2, s0, v0
+; GFX10-GISEL-NEXT: v_bfi_b32 v1, s3, s1, v1
; GFX10-GISEL-NEXT: ; return to shader part epilog
%xor.0 = xor i64 %a, %mask
%and = and i64 %xor.0, %b
@@ -1229,24 +1184,16 @@
; GFX8-GISEL-LABEL: v_bitselect_i64_pat_2:
; GFX8-GISEL: ; %bb.0:
; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-GISEL-NEXT: v_xor_b32_e32 v0, v0, v4
-; GFX8-GISEL-NEXT: v_xor_b32_e32 v1, v1, v5
-; GFX8-GISEL-NEXT: v_and_b32_e32 v0, v0, v2
-; GFX8-GISEL-NEXT: v_and_b32_e32 v1, v1, v3
-; GFX8-GISEL-NEXT: v_xor_b32_e32 v0, v0, v4
-; GFX8-GISEL-NEXT: v_xor_b32_e32 v1, v1, v5
+; GFX8-GISEL-NEXT: v_bfi_b32 v0, v2, v0, v4
+; GFX8-GISEL-NEXT: v_bfi_b32 v1, v3, v1, v5
; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-GISEL-LABEL: v_bitselect_i64_pat_2:
; GFX10-GISEL: ; %bb.0:
; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-GISEL-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-GISEL-NEXT: v_xor_b32_e32 v0, v0, v4
-; GFX10-GISEL-NEXT: v_xor_b32_e32 v1, v1, v5
-; GFX10-GISEL-NEXT: v_and_b32_e32 v0, v0, v2
-; GFX10-GISEL-NEXT: v_and_b32_e32 v1, v1, v3
-; GFX10-GISEL-NEXT: v_xor_b32_e32 v0, v0, v4
-; GFX10-GISEL-NEXT: v_xor_b32_e32 v1, v1, v5
+; GFX10-GISEL-NEXT: v_bfi_b32 v0, v2, v0, v4
+; GFX10-GISEL-NEXT: v_bfi_b32 v1, v3, v1, v5
; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
%xor.0 = xor i64 %a, %mask
%and = and i64 %xor.0, %b
@@ -1286,28 +1233,20 @@
; GFX8-GISEL-LABEL: v_bfi_sha256_ma_i64:
; GFX8-GISEL: ; %bb.0: ; %entry
; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-GISEL-NEXT: v_and_b32_e32 v6, v0, v4
-; GFX8-GISEL-NEXT: v_and_b32_e32 v7, v1, v5
-; GFX8-GISEL-NEXT: v_or_b32_e32 v0, v0, v4
-; GFX8-GISEL-NEXT: v_or_b32_e32 v1, v1, v5
-; GFX8-GISEL-NEXT: v_and_b32_e32 v0, v2, v0
-; GFX8-GISEL-NEXT: v_and_b32_e32 v1, v3, v1
-; GFX8-GISEL-NEXT: v_or_b32_e32 v0, v6, v0
-; GFX8-GISEL-NEXT: v_or_b32_e32 v1, v7, v1
+; GFX8-GISEL-NEXT: v_xor_b32_e32 v0, v0, v2
+; GFX8-GISEL-NEXT: v_xor_b32_e32 v1, v1, v3
+; GFX8-GISEL-NEXT: v_bfi_b32 v0, v0, v4, v2
+; GFX8-GISEL-NEXT: v_bfi_b32 v1, v1, v5, v3
; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-GISEL-LABEL: v_bfi_sha256_ma_i64:
; GFX10-GISEL: ; %bb.0: ; %entry
; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-GISEL-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-GISEL-NEXT: v_or_b32_e32 v6, v0, v4
-; GFX10-GISEL-NEXT: v_or_b32_e32 v7, v1, v5
-; GFX10-GISEL-NEXT: v_and_b32_e32 v0, v0, v4
-; GFX10-GISEL-NEXT: v_and_b32_e32 v1, v1, v5
-; GFX10-GISEL-NEXT: v_and_b32_e32 v2, v2, v6
-; GFX10-GISEL-NEXT: v_and_b32_e32 v3, v3, v7
-; GFX10-GISEL-NEXT: v_or_b32_e32 v0, v0, v2
-; GFX10-GISEL-NEXT: v_or_b32_e32 v1, v1, v3
+; GFX10-GISEL-NEXT: v_xor_b32_e32 v0, v0, v2
+; GFX10-GISEL-NEXT: v_xor_b32_e32 v1, v1, v3
+; GFX10-GISEL-NEXT: v_bfi_b32 v0, v0, v4, v2
+; GFX10-GISEL-NEXT: v_bfi_b32 v1, v1, v5, v3
; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
entry:
%and0 = and i64 %x, %z
@@ -1348,26 +1287,20 @@
;
; GFX8-GISEL-LABEL: v_s_s_bfi_sha256_ma_i64:
; GFX8-GISEL: ; %bb.0: ; %entry
-; GFX8-GISEL-NEXT: v_and_b32_e32 v2, s2, v0
-; GFX8-GISEL-NEXT: v_and_b32_e32 v3, s3, v1
-; GFX8-GISEL-NEXT: v_or_b32_e32 v0, s2, v0
-; GFX8-GISEL-NEXT: v_or_b32_e32 v1, s3, v1
-; GFX8-GISEL-NEXT: v_and_b32_e32 v0, s0, v0
-; GFX8-GISEL-NEXT: v_and_b32_e32 v1, s1, v1
-; GFX8-GISEL-NEXT: v_or_b32_e32 v0, v2, v0
-; GFX8-GISEL-NEXT: v_or_b32_e32 v1, v3, v1
+; GFX8-GISEL-NEXT: v_mov_b32_e32 v2, s2
+; GFX8-GISEL-NEXT: v_xor_b32_e32 v0, s0, v0
+; GFX8-GISEL-NEXT: v_bfi_b32 v0, v0, v2, s0
+; GFX8-GISEL-NEXT: v_mov_b32_e32 v2, s3
+; GFX8-GISEL-NEXT: v_xor_b32_e32 v1, s1, v1
+; GFX8-GISEL-NEXT: v_bfi_b32 v1, v1, v2, s1
; GFX8-GISEL-NEXT: ; return to shader part epilog
;
; GFX10-GISEL-LABEL: v_s_s_bfi_sha256_ma_i64:
; GFX10-GISEL: ; %bb.0: ; %entry
-; GFX10-GISEL-NEXT: v_or_b32_e32 v2, s2, v0
-; GFX10-GISEL-NEXT: v_or_b32_e32 v3, s3, v1
-; GFX10-GISEL-NEXT: v_and_b32_e32 v0, s2, v0
-; GFX10-GISEL-NEXT: v_and_b32_e32 v1, s3, v1
-; GFX10-GISEL-NEXT: v_and_b32_e32 v2, s0, v2
-; GFX10-GISEL-NEXT: v_and_b32_e32 v3, s1, v3
-; GFX10-GISEL-NEXT: v_or_b32_e32 v0, v0, v2
-; GFX10-GISEL-NEXT: v_or_b32_e32 v1, v1, v3
+; GFX10-GISEL-NEXT: v_xor_b32_e32 v0, s0, v0
+; GFX10-GISEL-NEXT: v_xor_b32_e32 v1, s1, v1
+; GFX10-GISEL-NEXT: v_bfi_b32 v0, v0, s2, s0
+; GFX10-GISEL-NEXT: v_bfi_b32 v1, v1, s3, s1
; GFX10-GISEL-NEXT: ; return to shader part epilog
entry:
%and0 = and i64 %x, %z
@@ -1415,12 +1348,10 @@
;
; GFX10-GISEL-LABEL: s_v_s_bfi_sha256_ma_i64:
; GFX10-GISEL: ; %bb.0: ; %entry
-; GFX10-GISEL-NEXT: s_or_b64 s[4:5], s[0:1], s[2:3]
-; GFX10-GISEL-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3]
-; GFX10-GISEL-NEXT: v_and_b32_e32 v0, s4, v0
-; GFX10-GISEL-NEXT: v_and_b32_e32 v1, s5, v1
-; GFX10-GISEL-NEXT: v_or_b32_e32 v0, s0, v0
-; GFX10-GISEL-NEXT: v_or_b32_e32 v1, s1, v1
+; GFX10-GISEL-NEXT: s_and_b64 s[4:5], s[0:1], s[2:3]
+; GFX10-GISEL-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3]
+; GFX10-GISEL-NEXT: v_and_or_b32 v0, v0, s0, s4
+; GFX10-GISEL-NEXT: v_and_or_b32 v1, v1, s1, s5
; GFX10-GISEL-NEXT: ; return to shader part epilog
entry:
%and0 = and i64 %x, %z
@@ -1462,26 +1393,20 @@
;
; GFX8-GISEL-LABEL: s_s_v_bfi_sha256_ma_i64:
; GFX8-GISEL: ; %bb.0: ; %entry
-; GFX8-GISEL-NEXT: v_and_b32_e32 v2, s0, v0
-; GFX8-GISEL-NEXT: v_and_b32_e32 v3, s1, v1
-; GFX8-GISEL-NEXT: v_or_b32_e32 v0, s0, v0
-; GFX8-GISEL-NEXT: v_or_b32_e32 v1, s1, v1
-; GFX8-GISEL-NEXT: v_and_b32_e32 v0, s2, v0
-; GFX8-GISEL-NEXT: v_and_b32_e32 v1, s3, v1
-; GFX8-GISEL-NEXT: v_or_b32_e32 v0, v2, v0
-; GFX8-GISEL-NEXT: v_or_b32_e32 v1, v3, v1
+; GFX8-GISEL-NEXT: v_mov_b32_e32 v2, s0
+; GFX8-GISEL-NEXT: v_xor_b32_e32 v2, s2, v2
+; GFX8-GISEL-NEXT: v_bfi_b32 v0, v2, v0, s2
+; GFX8-GISEL-NEXT: v_mov_b32_e32 v2, s1
+; GFX8-GISEL-NEXT: v_xor_b32_e32 v2, s3, v2
+; GFX8-GISEL-NEXT: v_bfi_b32 v1, v2, v1, s3
; GFX8-GISEL-NEXT: ; return to shader part epilog
;
; GFX10-GISEL-LABEL: s_s_v_bfi_sha256_ma_i64:
; GFX10-GISEL: ; %bb.0: ; %entry
-; GFX10-GISEL-NEXT: v_or_b32_e32 v2, s0, v0
-; GFX10-GISEL-NEXT: v_or_b32_e32 v3, s1, v1
-; GFX10-GISEL-NEXT: v_and_b32_e32 v0, s0, v0
-; GFX10-GISEL-NEXT: v_and_b32_e32 v1, s1, v1
-; GFX10-GISEL-NEXT: v_and_b32_e32 v2, s2, v2
-; GFX10-GISEL-NEXT: v_and_b32_e32 v3, s3, v3
-; GFX10-GISEL-NEXT: v_or_b32_e32 v0, v0, v2
-; GFX10-GISEL-NEXT: v_or_b32_e32 v1, v1, v3
+; GFX10-GISEL-NEXT: v_xor_b32_e64 v2, s0, s2
+; GFX10-GISEL-NEXT: v_xor_b32_e64 v3, s1, s3
+; GFX10-GISEL-NEXT: v_bfi_b32 v0, v2, v0, s2
+; GFX10-GISEL-NEXT: v_bfi_b32 v1, v3, v1, s3
; GFX10-GISEL-NEXT: ; return to shader part epilog
entry:
%and0 = and i64 %x, %z
@@ -1519,26 +1444,18 @@
;
; GFX8-GISEL-LABEL: v_s_v_bfi_sha256_ma_i64:
; GFX8-GISEL: ; %bb.0: ; %entry
-; GFX8-GISEL-NEXT: v_and_b32_e32 v4, v0, v2
-; GFX8-GISEL-NEXT: v_and_b32_e32 v5, v1, v3
-; GFX8-GISEL-NEXT: v_or_b32_e32 v0, v0, v2
-; GFX8-GISEL-NEXT: v_or_b32_e32 v1, v1, v3
-; GFX8-GISEL-NEXT: v_and_b32_e32 v0, s0, v0
-; GFX8-GISEL-NEXT: v_and_b32_e32 v1, s1, v1
-; GFX8-GISEL-NEXT: v_or_b32_e32 v0, v4, v0
-; GFX8-GISEL-NEXT: v_or_b32_e32 v1, v5, v1
+; GFX8-GISEL-NEXT: v_xor_b32_e32 v0, s0, v0
+; GFX8-GISEL-NEXT: v_xor_b32_e32 v1, s1, v1
+; GFX8-GISEL-NEXT: v_bfi_b32 v0, v0, v2, s0
+; GFX8-GISEL-NEXT: v_bfi_b32 v1, v1, v3, s1
; GFX8-GISEL-NEXT: ; return to shader part epilog
;
; GFX10-GISEL-LABEL: v_s_v_bfi_sha256_ma_i64:
; GFX10-GISEL: ; %bb.0: ; %entry
-; GFX10-GISEL-NEXT: v_or_b32_e32 v4, v0, v2
-; GFX10-GISEL-NEXT: v_or_b32_e32 v5, v1, v3
-; GFX10-GISEL-NEXT: v_and_b32_e32 v0, v0, v2
-; GFX10-GISEL-NEXT: v_and_b32_e32 v1, v1, v3
-; GFX10-GISEL-NEXT: v_and_b32_e32 v2, s0, v4
-; GFX10-GISEL-NEXT: v_and_b32_e32 v3, s1, v5
-; GFX10-GISEL-NEXT: v_or_b32_e32 v0, v0, v2
-; GFX10-GISEL-NEXT: v_or_b32_e32 v1, v1, v3
+; GFX10-GISEL-NEXT: v_xor_b32_e32 v0, s0, v0
+; GFX10-GISEL-NEXT: v_xor_b32_e32 v1, s1, v1
+; GFX10-GISEL-NEXT: v_bfi_b32 v0, v0, v2, s0
+; GFX10-GISEL-NEXT: v_bfi_b32 v1, v1, v3, s1
; GFX10-GISEL-NEXT: ; return to shader part epilog
entry:
%and0 = and i64 %x, %z
diff --git a/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll b/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll
--- a/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll
+++ b/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll
@@ -1033,8 +1033,8 @@
; GFX9-GISEL-NEXT: global_load_ubyte v4, v1, s[2:3] offset:3
; GFX9-GISEL-NEXT: global_load_ubyte v5, v1, s[2:3] offset:4
; GFX9-GISEL-NEXT: global_load_ubyte v6, v1, s[2:3] offset:5
-; GFX9-GISEL-NEXT: global_load_ubyte v7, v1, s[2:3] offset:7
-; GFX9-GISEL-NEXT: global_load_ubyte v8, v1, s[2:3] offset:6
+; GFX9-GISEL-NEXT: global_load_ubyte v7, v1, s[2:3] offset:6
+; GFX9-GISEL-NEXT: global_load_ubyte v8, v1, s[2:3] offset:7
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(6)
; GFX9-GISEL-NEXT: v_lshl_or_b32 v0, v2, 8, v0
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(5)
@@ -1045,10 +1045,10 @@
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(2)
; GFX9-GISEL-NEXT: v_lshl_or_b32 v4, v6, 8, v5
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(1)
-; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v5, 24, v7
+; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v5, 16, v7
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v6, 16, v8
-; GFX9-GISEL-NEXT: v_or3_b32 v3, v5, v6, v4
+; GFX9-GISEL-NEXT: v_lshl_or_b32 v0, v8, 24, v5
+; GFX9-GISEL-NEXT: v_or3_b32 v3, v0, v4, 0
; GFX9-GISEL-NEXT: v_ffbl_b32_e32 v4, v3
; GFX9-GISEL-NEXT: v_ffbl_b32_e32 v0, v2
; GFX9-GISEL-NEXT: v_add_u32_e32 v4, 32, v4