Index: llvm/include/llvm/CodeGen/GlobalISel/InstructionSelector.h
===================================================================
--- llvm/include/llvm/CodeGen/GlobalISel/InstructionSelector.h
+++ llvm/include/llvm/CodeGen/GlobalISel/InstructionSelector.h
@@ -393,6 +393,7 @@
   GIU_MergeMemOperands_EndOfList = -1,
 };
 
+using NewMIVector = SmallVector<MachineInstrBuilder, 4>;
 /// Provides the logic to select generic machine instructions.
 class InstructionSelector {
 public:
@@ -409,6 +410,7 @@
   ///   for I in all mutated/inserted instructions:
   ///     !isPreISelGenericOpcode(I.getOpcode())
   virtual bool select(MachineInstr &I) = 0;
+  virtual void adjustInstrPostInstrSelection(NewMIVector &OutMIs) const {}
 
   CodeGenCoverage *CoverageInfo = nullptr;
   GISelKnownBits *KnownBits = nullptr;
@@ -432,7 +434,6 @@
   using ComplexRendererFns =
       Optional<SmallVector<std::function<void(MachineInstrBuilder &)>, 4>>;
   using RecordedMIVector = SmallVector<MachineInstr *, 4>;
-  using NewMIVector = SmallVector<MachineInstrBuilder, 4>;
 
   struct MatcherState {
     std::vector<ComplexRendererFns::value_type> Renderers;
Index: llvm/include/llvm/CodeGen/GlobalISel/InstructionSelectorImpl.h
===================================================================
--- llvm/include/llvm/CodeGen/GlobalISel/InstructionSelectorImpl.h
+++ llvm/include/llvm/CodeGen/GlobalISel/InstructionSelectorImpl.h
@@ -160,6 +160,16 @@
       assert(State.MIs[InsnID] != nullptr && "Used insn before defined");
       unsigned Opcode = State.MIs[InsnID]->getOpcode();
 
+      if (Opcode == TargetOpcode::COPY) {
+        Register CopySrc = State.MIs[InsnID]->getOperand(1).getReg();
+        if (!CopySrc.isPhysical()) {
+          // Look through copies so the opcode check applies to the real def.
+          if (MachineInstr *DefMI = getDefIgnoringCopies(CopySrc, MRI)) {
+            State.MIs[InsnID] = DefMI;
+            Opcode = DefMI->getOpcode();
+          }
+        }
+      }
       DEBUG_WITH_TYPE(TgtInstructionSelector::getName(),
                       dbgs() << CurrentIdx << ": GIM_CheckOpcode(MIs[" << InsnID
@@ -1113,6 +1123,7 @@
       DEBUG_WITH_TYPE(TgtInstructionSelector::getName(),
                       dbgs() << CurrentIdx << ": GIR_Done\n");
       propagateFlags(OutMIs);
+      adjustInstrPostInstrSelection(OutMIs);
       return true;
 
     default:
Index: llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
+++ llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
@@ -57,6 +57,7 @@
                             const AMDGPUTargetMachine &TM);
 
   bool select(MachineInstr &I) override;
+  void adjustInstrPostInstrSelection(NewMIVector &OutMIs) const override;
   static const char *getName();
 
   void setupMF(MachineFunction &MF, GISelKnownBits &KB,
Index: llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -2850,6 +2850,14 @@
   return true;
 }
 
+void AMDGPUInstructionSelector::adjustInstrPostInstrSelection(
+    NewMIVector &OutMIs) const {
+  if (OutMIs.size() != 1)
+    return;
+  TII.AdjustInstrPostInstrSelectionBase(*OutMIs[0], nullptr,
+                                        STI.getRegisterInfo());
+}
+
 bool AMDGPUInstructionSelector::select(MachineInstr &I) {
   if (I.isPHI())
     return selectPHI(I);
Index: llvm/lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -10898,80 +10898,8 @@ /// bits set in the writemask
 void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
                                                      SDNode *Node) const {
-  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
-
-  MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
-
-  if (TII->isVOP3(MI.getOpcode())) {
-    // Make sure constant bus requirements are respected.
-    TII->legalizeOperandsVOP3(MRI, MI);
-
-    // Prefer VGPRs over AGPRs in mAI instructions where possible.
-    // This saves a chain-copy of registers and better ballance register
-    // use between vgpr and agpr as agpr tuples tend to be big.
-    if (const MCOperandInfo *OpInfo = MI.getDesc().OpInfo) {
-      unsigned Opc = MI.getOpcode();
-      const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
-      for (auto I : { AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0),
-                      AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1) }) {
-        if (I == -1)
-          break;
-        MachineOperand &Op = MI.getOperand(I);
-        if ((OpInfo[I].RegClass != llvm::AMDGPU::AV_64RegClassID &&
-             OpInfo[I].RegClass != llvm::AMDGPU::AV_32RegClassID) ||
-            !Register::isVirtualRegister(Op.getReg()) ||
-            !TRI->isAGPR(MRI, Op.getReg()))
-          continue;
-        auto *Src = MRI.getUniqueVRegDef(Op.getReg());
-        if (!Src || !Src->isCopy() ||
-            !TRI->isSGPRReg(MRI, Src->getOperand(1).getReg()))
-          continue;
-        auto *RC = TRI->getRegClassForReg(MRI, Op.getReg());
-        auto *NewRC = TRI->getEquivalentVGPRClass(RC);
-        // All uses of agpr64 and agpr32 can also accept vgpr except for
-        // v_accvgpr_read, but we do not produce agpr reads during selection,
-        // so no use checks are needed.
-        MRI.setRegClass(Op.getReg(), NewRC);
-      }
-    }
-
-    return;
-  }
-
-  // Replace unused atomics with the no return version.
-  int NoRetAtomicOp = AMDGPU::getAtomicNoRetOp(MI.getOpcode());
-  if (NoRetAtomicOp != -1) {
-    if (!Node->hasAnyUseOfValue(0)) {
-      MI.setDesc(TII->get(NoRetAtomicOp));
-      MI.RemoveOperand(0);
-      return;
-    }
-
-    // For mubuf_atomic_cmpswap, we need to have tablegen use an extract_subreg
-    // instruction, because the return type of these instructions is a vec2 of
-    // the memory type, so it can be tied to the input operand.
-    // This means these instructions always have a use, so we need to add a
-    // special case to check if the atomic has only one extract_subreg use,
-    // which itself has no uses.
-    if ((Node->hasNUsesOfValue(1, 0) &&
-         Node->use_begin()->isMachineOpcode() &&
-         Node->use_begin()->getMachineOpcode() == AMDGPU::EXTRACT_SUBREG &&
-         !Node->use_begin()->hasAnyUseOfValue(0))) {
-      Register Def = MI.getOperand(0).getReg();
-
-      // Change this into a noret atomic.
-      MI.setDesc(TII->get(NoRetAtomicOp));
-      MI.RemoveOperand(0);
-
-      // If we only remove the def operand from the atomic instruction, the
-      // extract_subreg will be left with a use of a vreg without a def.
-      // So we need to insert an implicit_def to avoid machine verifier
-      // errors.
-      BuildMI(*MI.getParent(), MI, MI.getDebugLoc(),
-              TII->get(AMDGPU::IMPLICIT_DEF), Def);
-    }
-    return;
-  }
+  getSubtarget()->getInstrInfo()->AdjustInstrPostInstrSelectionBase(
+      MI, Node, Subtarget->getRegisterInfo());
 }
 
 static SDValue buildSMovImm32(SelectionDAG &DAG, const SDLoc &DL,
Index: llvm/lib/Target/AMDGPU/SIInstrInfo.h
===================================================================
--- llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -872,6 +872,9 @@
   /// copy of src1.
   void legalizeOperandsVOP2(MachineRegisterInfo &MRI, MachineInstr &MI) const;
 
+  /// Fix \p MI operands post isel.
+  void AdjustInstrPostInstrSelectionBase(MachineInstr &MI, SDNode *Node,
+                                         const SIRegisterInfo *TRI) const;
   /// Fix operands in \p MI to satisfy constant bus requirements.
   void legalizeOperandsVOP3(MachineRegisterInfo &MRI, MachineInstr &MI) const;
Index: llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -4529,6 +4529,82 @@
   fixImplicitOperands(MI);
 }
 
+/// Fix up \p MI after instruction selection: legalize VOP3 operands and
+/// replace unused atomics with their no-return variants.
+void SIInstrInfo::AdjustInstrPostInstrSelectionBase(
+    MachineInstr &MI, SDNode *Node, const SIRegisterInfo *TRI) const {
+  MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
+
+  if (isVOP3(MI.getOpcode())) {
+    // Make sure constant bus requirements are respected.
+    legalizeOperandsVOP3(MRI, MI);
+
+    // Prefer VGPRs over AGPRs in mAI instructions where possible.
+    // This saves a chain-copy of registers and better balances register
+    // use between vgpr and agpr as agpr tuples tend to be big.
+    if (const MCOperandInfo *OpInfo = MI.getDesc().OpInfo) {
+      unsigned Opc = MI.getOpcode();
+      for (auto I : {AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0),
+                     AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1)}) {
+        if (I == -1)
+          break;
+        MachineOperand &Op = MI.getOperand(I);
+        if ((OpInfo[I].RegClass != llvm::AMDGPU::AV_64RegClassID &&
+             OpInfo[I].RegClass != llvm::AMDGPU::AV_32RegClassID) ||
+            !Register::isVirtualRegister(Op.getReg()) ||
+            !TRI->isAGPR(MRI, Op.getReg()))
+          continue;
+        auto *Src = MRI.getUniqueVRegDef(Op.getReg());
+        if (!Src || !Src->isCopy() ||
+            !TRI->isSGPRReg(MRI, Src->getOperand(1).getReg()))
+          continue;
+        auto *RC = TRI->getRegClassForReg(MRI, Op.getReg());
+        auto *NewRC = TRI->getEquivalentVGPRClass(RC);
+        // All uses of agpr64 and agpr32 can also accept vgpr except for
+        // v_accvgpr_read, but we do not produce agpr reads during selection,
+        // so no use checks are needed.
+        MRI.setRegClass(Op.getReg(), NewRC);
+      }
+    }
+
+    return;
+  }
+
+  // Replace unused atomics with the no return version.
+  int NoRetAtomicOp = AMDGPU::getAtomicNoRetOp(MI.getOpcode());
+  if (Node && NoRetAtomicOp != -1) {
+    if (!Node->hasAnyUseOfValue(0)) {
+      MI.setDesc(get(NoRetAtomicOp));
+      MI.RemoveOperand(0);
+      return;
+    }
+
+    // For mubuf_atomic_cmpswap, we need to have tablegen use an extract_subreg
+    // instruction, because the return type of these instructions is a vec2 of
+    // the memory type, so it can be tied to the input operand.
+    // This means these instructions always have a use, so we need to add a
+    // special case to check if the atomic has only one extract_subreg use,
+    // which itself has no uses.
+    if ((Node->hasNUsesOfValue(1, 0) && Node->use_begin()->isMachineOpcode() &&
+         Node->use_begin()->getMachineOpcode() == AMDGPU::EXTRACT_SUBREG &&
+         !Node->use_begin()->hasAnyUseOfValue(0))) {
+      Register Def = MI.getOperand(0).getReg();
+
+      // Change this into a noret atomic.
+      MI.setDesc(get(NoRetAtomicOp));
+      MI.RemoveOperand(0);
+
+      // If we only remove the def operand from the atomic instruction, the
+      // extract_subreg will be left with a use of a vreg without a def.
+      // So we need to insert an implicit_def to avoid machine verifier
+      // errors.
+      BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), get(AMDGPU::IMPLICIT_DEF),
+              Def);
+    }
+    return;
+  }
+}
+
 // Legalize VOP3 operands. All operand types are supported for any operand
 // but only one literal constant and only starting from GFX10.
 void SIInstrInfo::legalizeOperandsVOP3(MachineRegisterInfo &MRI,
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/add.v2i16.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/GlobalISel/add.v2i16.ll
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/add.v2i16.ll
@@ -98,10 +98,9 @@
 ; GFX8-LABEL: v_add_v2i16_neg_inline_imm_splat:
 ; GFX8: ; %bb.0:
 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: s_movk_i32 s4, 0xffc0
-; GFX8-NEXT: v_mov_b32_e32 v2, s4
-; GFX8-NEXT: v_add_u16_e32 v1, s4, v0
-; GFX8-NEXT: v_add_u16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT: v_mov_b32_e32 v2, 64
+; GFX8-NEXT: v_subrev_u16_e32 v1, 64, v0
+; GFX8-NEXT: v_sub_u16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX8-NEXT: v_or_b32_e32 v0, v1, v0
 ; GFX8-NEXT: s_setpc_b64 s[30:31]
   %add = add <2 x i16> %a,
@@ -120,7 +119,7 @@
 ; GFX8: ; %bb.0:
 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT: v_mov_b32_e32 v2, 4
-; GFX8-NEXT: v_add_u16_e32 v1, 0xffc0, v0
+; GFX8-NEXT: v_subrev_u16_e32 v1, 64, v0
 ; GFX8-NEXT: v_add_u16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX8-NEXT: v_or_b32_e32 v0, v1, v0
 ; GFX8-NEXT: s_setpc_b64 s[30:31]
@@ -139,10 +138,10 @@
 ; GFX8-LABEL: v_add_v2i16_neg_inline_imm_hi:
 ; GFX8: ; %bb.0:
 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, 0xffffffc0
-; GFX8-NEXT: v_add_u16_e32 v2, 4, v0
-; GFX8-NEXT: v_add_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-NEXT: v_or_b32_e32 v0, v2, v0
+; GFX8-NEXT: v_mov_b32_e32 v2, 64
+; GFX8-NEXT: v_add_u16_e32 v1, 4, v0
+; GFX8-NEXT: v_sub_u16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT: v_or_b32_e32 v0, v1, v0
 ; GFX8-NEXT: s_setpc_b64 s[30:31]
   %add = add <2 x i16> %a,
   ret <2 x i16> %add
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll
@@ -126,16 +126,16 @@
 define amdgpu_ps void @insertelement_s_v2i16_v_s(<2 x i16> addrspace(4)* inreg %ptr, i16 %val, i32 inreg %idx) {
 ; GFX9-LABEL: insertelement_s_v2i16_v_s:
 ; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0
 ; GFX9-NEXT: s_and_b32 s1, s4, 1
-; GFX9-NEXT: s_mov_b32 s2, 0xffff
+; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0
 ; GFX9-NEXT: s_lshl_b32 s1, s1, 4
-; GFX9-NEXT: v_and_b32_e32 v0, s2, v0
-; GFX9-NEXT: s_lshl_b32 s2, s2, s1
+; GFX9-NEXT: s_mov_b32 s2, 0xffff
+; GFX9-NEXT: v_lshlrev_b32_sdwa v0, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX9-NEXT: s_lshl_b32 s1, s2, s1
+; GFX9-NEXT: s_not_b32 s1, s1
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_andn2_b32 s0, s0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s0
-; GFX9-NEXT: v_lshl_or_b32 v2, v0, s1, v1
+; GFX9-NEXT: v_and_or_b32 v2, s0, v1, v0
 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
 ; GFX9-NEXT: v_mov_b32_e32 v1, 0
 ; GFX9-NEXT: global_store_dword v[0:1], v2, off
@@ -612,17 +612,17 @@
 ; GFX9-NEXT: s_lshr_b32 s2, s4, 1
 ; GFX9-NEXT: s_cmp_eq_u32 s2, 1
 ; GFX9-NEXT: s_mov_b32 s5, 0xffff
-; GFX9-NEXT: v_and_b32_e32 v0, s5, v0
+; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s2, 0
 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT: s_cselect_b32 s3, s1, s0
 ; GFX9-NEXT: s_and_b32 s4, s4, 1
 ; GFX9-NEXT: s_lshl_b32 s4, s4, 4
-; GFX9-NEXT: s_lshl_b32 s5, s5, s4
-; GFX9-NEXT: s_andn2_b32 s3, s3, s5
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: v_lshl_or_b32 v2, v0, s4, v1
+; GFX9-NEXT: v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX9-NEXT: s_lshl_b32 s4, s5, s4
+; GFX9-NEXT: s_not_b32 s4, s4
+; GFX9-NEXT: v_mov_b32_e32 v1, s4
+; GFX9-NEXT: v_and_or_b32 v2, s3, v1, v0
 ; GFX9-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s2, 0
 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
 ; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s2, 1
@@ -1313,7 +1313,7 @@
 ; GFX9-NEXT: s_lshr_b32 s5, s4, 1
 ; GFX9-NEXT: s_cmp_eq_u32 s5, 1
 ; GFX9-NEXT: s_mov_b32 s7, 0xffff
-; GFX9-NEXT: v_and_b32_e32 v0, s7, v0
+; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s5, 0
 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT: s_cselect_b32 s6, s1, s0
 ; GFX9-NEXT: s_cmp_eq_u32 s5, 2
@@ -1322,12 +1322,12 @@
 ; GFX9-NEXT: s_cselect_b32 s6, s3, s6
 ; GFX9-NEXT: s_and_b32 s4, s4, 1
 ; GFX9-NEXT: s_lshl_b32 s4, s4, 4
-; GFX9-NEXT: s_lshl_b32 s7, s7, s4
-; GFX9-NEXT: s_andn2_b32 s6, s6, s7
-; GFX9-NEXT: v_mov_b32_e32 v1, s6
-; GFX9-NEXT: v_lshl_or_b32 v4, v0, s4, v1
+; GFX9-NEXT: v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX9-NEXT: s_lshl_b32 s4, s7, s4
+; GFX9-NEXT: s_not_b32 s4, s4
+; GFX9-NEXT: v_mov_b32_e32 v1, s4
+; GFX9-NEXT: v_and_or_b32 v4, s6, v1, v0
 ; GFX9-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s5, 0
 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
 ; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s5, 1
@@ -2293,7 +2293,7 @@
 ; GFX9-NEXT: s_lshr_b32 s2, s4, 1
 ; GFX9-NEXT: s_cmp_eq_u32 s2, 1
 ; GFX9-NEXT: s_mov_b32 s3, 0xffff
-; GFX9-NEXT: v_and_b32_e32 v0, s3, v0
+; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s2, 0
 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT: s_cselect_b32 s0, s9, s8
 ; GFX9-NEXT: s_cmp_eq_u32 s2, 2
@@ -2310,12 +2310,12 @@
 ; GFX9-NEXT: s_cselect_b32 s0, s15, s0
 ; GFX9-NEXT: s_and_b32 s1, s4, 1
 ; GFX9-NEXT: s_lshl_b32 s1, s1, 4
-; GFX9-NEXT: s_lshl_b32 s3, s3, s1
-; GFX9-NEXT: s_andn2_b32 s0, s0, s3
-; GFX9-NEXT: v_mov_b32_e32 v1, s0
-; GFX9-NEXT: v_lshl_or_b32 v8, v0, s1, v1
+; GFX9-NEXT: v_lshlrev_b32_sdwa v0, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX9-NEXT: s_lshl_b32 s1, s3, s1
+; GFX9-NEXT: s_not_b32 s1, s1
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: v_and_or_b32 v8, s0, v1, v0
 ; GFX9-NEXT: v_mov_b32_e32 v0, s8
-; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s2, 0
 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc
 ; GFX9-NEXT: v_mov_b32_e32 v1, s9
 ; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s2, 1
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll
@@ -704,7 +704,6 @@
 ; GFX9: ; %bb.0:
 ; GFX9-NEXT: s_load_dword s1, s[2:3], 0x0
 ; GFX9-NEXT: s_movk_i32 s6, 0xff
-; GFX9-NEXT: v_and_b32_e32 v0, s6, v0
 ; GFX9-NEXT: s_mov_b32 s0, 8
 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT: s_lshr_b32 s2, s1, 8
@@ -722,10 +721,11 @@
 ; GFX9-NEXT: s_or_b32 s1, s1, s2
 ; GFX9-NEXT: s_and_b32 s2, s4, 3
 ; GFX9-NEXT: s_lshl_b32 s2, s2, 3
-; GFX9-NEXT: s_lshl_b32 s3, s6, s2
-; GFX9-NEXT: s_andn2_b32 s1, s1, s3
-; GFX9-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-NEXT: v_lshl_or_b32 v0, v0, s2, v1
+; GFX9-NEXT: v_lshlrev_b32_sdwa v0, s2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: s_lshl_b32 s2, s6, s2
+; GFX9-NEXT: s_not_b32 s2, s2
+; GFX9-NEXT: v_mov_b32_e32 v1, s2
+; GFX9-NEXT: v_and_or_b32 v0, s1, v1, v0
 ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v0
 ; GFX9-NEXT: v_lshlrev_b32_sdwa v1, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX9-NEXT: v_and_or_b32 v1, v0, s6, v1
@@ -1868,7 +1868,6 @@
 ; GFX9: ; %bb.0:
 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
 ; GFX9-NEXT: s_movk_i32 s10, 0xff
-; GFX9-NEXT: v_and_b32_e32 v0, s10, v0
 ; GFX9-NEXT: s_mov_b32 s5, 8
 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT: s_lshr_b32 s2, s0, 8
@@ -1902,10 +1901,11 @@
 ; GFX9-NEXT: s_cselect_b32 s3, s1, s0
 ; GFX9-NEXT: s_and_b32 s4, s4, 3
 ; GFX9-NEXT: s_lshl_b32 s4, s4, 3
-; GFX9-NEXT: s_lshl_b32 s6, s10, s4
-; GFX9-NEXT: s_andn2_b32 s3, s3, s6
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: v_lshl_or_b32 v2, v0, s4, v1
+; GFX9-NEXT: v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: s_lshl_b32 s4, s10, s4
+; GFX9-NEXT: s_not_b32 s4, s4
+; GFX9-NEXT: v_mov_b32_e32 v1, s4
+; GFX9-NEXT: v_and_or_b32 v2, s3, v1, v0
 ; GFX9-NEXT: v_mov_b32_e32 v0, s0
 ; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s2, 0
 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
@@ -3835,7 +3835,6 @@
 ; GFX9: ; %bb.0:
 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0
 ; GFX9-NEXT: s_movk_i32 s18, 0xff
-; GFX9-NEXT: v_and_b32_e32 v0, s18, v0
 ; GFX9-NEXT: s_mov_b32 s5, 8
 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT: s_lshr_b32 s6, s0, 8
@@ -3899,10 +3898,11 @@
 ; GFX9-NEXT: s_cselect_b32 s7, s3, s7
 ; GFX9-NEXT: s_and_b32 s4, s4, 3
 ; GFX9-NEXT: s_lshl_b32 s4, s4, 3
-; GFX9-NEXT: s_lshl_b32 s8, s18, s4
-; GFX9-NEXT: s_andn2_b32 s7, s7, s8
-; GFX9-NEXT: v_mov_b32_e32 v1, s7
-; GFX9-NEXT: v_lshl_or_b32 v4, v0, s4, v1
+; GFX9-NEXT: v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: s_lshl_b32 s4, s18, s4
+; GFX9-NEXT: s_not_b32 s4, s4
+; GFX9-NEXT: v_mov_b32_e32 v1, s4
+; GFX9-NEXT: v_and_or_b32 v4, s7, v1, v0
 ; GFX9-NEXT: v_mov_b32_e32 v0, s0
 ; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s6, 0
 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-pattern-and-or.mir
===================================================================
--- llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-pattern-and-or.mir
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-pattern-and-or.mir
@@ -152,20 +152,17 @@
     ; GFX9: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
     ; GFX9: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
     ; GFX9: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX9: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[COPY]], [[COPY1]], implicit-def $scc
-    ; GFX9: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[S_AND_B32_]]
-    ; GFX9: [[V_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[COPY3]], [[COPY2]], implicit $exec
-    ; GFX9: S_ENDPGM 0, implicit [[V_OR_B32_e64_]]
+    ; GFX9: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY1]]
+    ; GFX9: [[V_AND_OR_B32_:%[0-9]+]]:vgpr_32 = V_AND_OR_B32 [[COPY]], [[COPY3]], [[COPY2]], implicit $exec
+    ; GFX9: S_ENDPGM 0, implicit [[V_AND_OR_B32_]]
     ; GFX10-LABEL: name: and_or_s32_sgpr_sgpr_vgpr
     ; GFX10: liveins: $sgpr0, $sgpr1, $vgpr0
    ; GFX10: $vcc_hi = IMPLICIT_DEF
     ; GFX10: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
     ; GFX10: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
     ; GFX10: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX10: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[COPY]], [[COPY1]], implicit-def $scc
-    ; GFX10: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[S_AND_B32_]]
-    ; GFX10: [[V_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[COPY3]], [[COPY2]], implicit $exec
-    ; GFX10: S_ENDPGM 0, implicit [[V_OR_B32_e64_]]
+    ; GFX10: [[V_AND_OR_B32_:%[0-9]+]]:vgpr_32 = V_AND_OR_B32 [[COPY]], [[COPY1]], [[COPY2]], implicit $exec
+    ; GFX10: S_ENDPGM 0, implicit [[V_AND_OR_B32_]]
     %0:sgpr(s32) = COPY $sgpr0
     %1:sgpr(s32) = COPY $sgpr1
     %2:vgpr(s32) = COPY $vgpr0
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-pattern-xor3.mir
===================================================================
--- llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-pattern-xor3.mir
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-pattern-xor3.mir
@@ -126,10 +126,8 @@
     ; GFX10: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
     ; GFX10: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
     ; GFX10: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX10: [[S_XOR_B32_:%[0-9]+]]:sreg_32 = S_XOR_B32 [[COPY]], [[COPY1]], implicit-def $scc
-    ; GFX10: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[S_XOR_B32_]]
-    ; GFX10: [[V_XOR_B32_e64_:%[0-9]+]]:vgpr_32 = V_XOR_B32_e64 [[COPY3]], [[COPY2]], implicit $exec
-    ; GFX10: S_ENDPGM 0, implicit [[V_XOR_B32_e64_]]
+    ; GFX10: [[V_XOR3_B32_:%[0-9]+]]:vgpr_32 = V_XOR3_B32 [[COPY]], [[COPY1]], [[COPY2]], implicit $exec
+    ; GFX10: S_ENDPGM 0, implicit [[V_XOR3_B32_]]
     %0:sgpr(s32) = COPY $sgpr0
     %1:sgpr(s32) = COPY $sgpr1
     %2:vgpr(s32) = COPY $vgpr0
@@ -174,10 +172,8 @@
     ; GFX10: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
     ; GFX10: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
     ; GFX10: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX10: [[S_XOR_B32_:%[0-9]+]]:sreg_32 = S_XOR_B32 [[COPY]], [[COPY1]], implicit-def $scc
-    ; GFX10: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[S_XOR_B32_]]
-    ; GFX10: [[V_XOR_B32_e64_:%[0-9]+]]:vgpr_32 = V_XOR_B32_e64 [[COPY2]], [[COPY3]], implicit $exec
-    ; GFX10: S_ENDPGM 0, implicit [[V_XOR_B32_e64_]]
+    ; GFX10: [[V_XOR3_B32_:%[0-9]+]]:vgpr_32 = V_XOR3_B32 [[COPY]], [[COPY1]], [[COPY2]], implicit $exec
+    ; GFX10: S_ENDPGM 0, implicit [[V_XOR3_B32_]]
     %0:sgpr(s32) = COPY $sgpr0
     %1:sgpr(s32) = COPY $sgpr1
     %2:vgpr(s32) = COPY $vgpr0
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/shlN_add.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/GlobalISel/shlN_add.ll
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/shlN_add.ll
@@ -174,8 +174,7 @@
 define amdgpu_ps float @shl1_add_u32_vgpr1(i32 inreg %src0, i32 %src1) {
 ; GFX9-LABEL: shl1_add_u32_vgpr1:
 ; GFX9: ; %bb.0:
-; GFX9-NEXT: s_lshl_b32 s0, s0, 1
-; GFX9-NEXT: v_add_u32_e32 v0, s0, v0
+; GFX9-NEXT: v_lshl_add_u32 v0, s0, 1, v0
 ; GFX9-NEXT: ; return to shader part epilog
 ;
 ; GFX8-LABEL: shl1_add_u32_vgpr1:
@@ -192,8 +191,7 @@
 define amdgpu_ps float @shl2_add_u32_vgpr1(i32 inreg %src0, i32 %src1) {
 ; GFX9-LABEL: shl2_add_u32_vgpr1:
 ; GFX9: ; %bb.0:
-; GFX9-NEXT: s_lshl_b32 s0, s0, 2
-; GFX9-NEXT: v_add_u32_e32 v0, s0, v0
+; GFX9-NEXT: v_lshl_add_u32 v0, s0, 2, v0
 ; GFX9-NEXT: ; return to shader part epilog
 ;
 ; GFX8-LABEL: shl2_add_u32_vgpr1:
@@ -210,8 +208,7 @@
 define amdgpu_ps float @shl3_add_u32_vgpr1(i32 inreg %src0, i32 %src1) {
 ; GFX9-LABEL: shl3_add_u32_vgpr1:
 ; GFX9: ; %bb.0:
-; GFX9-NEXT: s_lshl_b32 s0, s0, 3
-; GFX9-NEXT: v_add_u32_e32 v0, s0, v0
+; GFX9-NEXT: v_lshl_add_u32 v0, s0, 3, v0
 ; GFX9-NEXT: ; return to shader part epilog
 ;
 ; GFX8-LABEL: shl3_add_u32_vgpr1:
@@ -228,8 +225,7 @@
 define amdgpu_ps float @shl4_add_u32_vgpr1(i32 inreg %src0, i32 %src1) {
 ; GFX9-LABEL: shl4_add_u32_vgpr1:
 ; GFX9: ; %bb.0:
-; GFX9-NEXT: s_lshl_b32 s0, s0, 4
-; GFX9-NEXT: v_add_u32_e32 v0, s0, v0
+; GFX9-NEXT: v_lshl_add_u32 v0, s0, 4, v0
 ; GFX9-NEXT: ; return to shader part epilog
 ;
 ; GFX8-LABEL: shl4_add_u32_vgpr1:
@@ -246,8 +242,7 @@
 define amdgpu_ps float @shl5_add_u32_vgpr1(i32 inreg %src0, i32 %src1) {
 ; GFX9-LABEL: shl5_add_u32_vgpr1:
 ; GFX9: ; %bb.0:
-; GFX9-NEXT: s_lshl_b32 s0, s0, 5
-; GFX9-NEXT: v_add_u32_e32 v0, s0, v0
+; GFX9-NEXT: v_lshl_add_u32 v0, s0, 5, v0
 ; GFX9-NEXT: ; return to shader part epilog
 ;
 ; GFX8-LABEL: shl5_add_u32_vgpr1:
Index: llvm/test/CodeGen/Mips/GlobalISel/llvm-ir/select.ll
===================================================================
--- llvm/test/CodeGen/Mips/GlobalISel/llvm-ir/select.ll
+++ llvm/test/CodeGen/Mips/GlobalISel/llvm-ir/select.ll
@@ -56,9 +56,8 @@
 define i32 @select_with_negation(i32 %a, i32 %b, i32 %x, i32 %y) {
 ; MIPS32-LABEL: select_with_negation:
 ; MIPS32: # %bb.0: # %entry
-; MIPS32-NEXT: ori $1, $zero, 1
-; MIPS32-NEXT: slt $2, $4, $5
-; MIPS32-NEXT: xor $1, $2, $1
+; MIPS32-NEXT: slt $1, $4, $5
+; MIPS32-NEXT: xori $1, $1, 1
 ; MIPS32-NEXT: andi $1, $1, 1
 ; MIPS32-NEXT: movn $7, $6, $1
 ; MIPS32-NEXT: move $2, $7
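
The sketch below is illustrative only and is not part of the patch: it shows how some other GlobalISel target could pick up the new hook. Only adjustInstrPostInstrSelection, NewMIVector, and the call site after propagateFlags() come from this change; the MyTargetInstructionSelector class and the loop body are assumed placeholders.

#include "llvm/CodeGen/GlobalISel/InstructionSelector.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"

using namespace llvm;

class MyTargetInstructionSelector : public InstructionSelector {
public:
  bool select(MachineInstr &I) override;

  // Runs once per successfully applied TableGen rule, after propagateFlags(),
  // with every instruction that rule emitted.
  void adjustInstrPostInstrSelection(NewMIVector &OutMIs) const override {
    for (const MachineInstrBuilder &MIB : OutMIs) {
      MachineInstr *MI = MIB.getInstr();
      // Target-specific post-selection fixups would go here, e.g. legalizing
      // operands or switching to a different opcode variant.
      (void)MI;
    }
  }
};

Because the base-class implementation is an empty default, targets that do not override it are unaffected; the AMDGPU override in this patch only forwards single-instruction results to SIInstrInfo::AdjustInstrPostInstrSelectionBase, so the SelectionDAG and GlobalISel paths share one set of post-isel fixups.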