diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -1263,10 +1263,6 @@
   LLVM_READONLY
   int getMFMAEarlyClobberOp(uint16_t Opcode);
 
-  /// \returns v_cmpx version of a v_cmp instruction.
-  LLVM_READONLY
-  int getVCMPXOpFromVCMP(uint16_t Opcode);
-
   const uint64_t RSRC_DATA_FORMAT = 0xf00000000000LL;
   const uint64_t RSRC_ELEMENT_SIZE_SHIFT = (32 + 19);
   const uint64_t RSRC_INDEX_STRIDE_SHIFT = (32 + 21);
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -3856,19 +3856,18 @@
 MachineInstr *SIInstrInfo::buildShrunkInst(MachineInstr &MI,
                                            unsigned Op32) const {
-  MachineBasicBlock *MBB = MI.getParent();
+  MachineBasicBlock *MBB = MI.getParent();
   MachineInstrBuilder Inst32 =
       BuildMI(*MBB, MI, MI.getDebugLoc(), get(Op32))
          .setMIFlags(MI.getFlags());
 
   // Add the dst operand if the 32-bit encoding also has an explicit $vdst.
   // For VOPC instructions, this is replaced by an implicit def of vcc.
-  if (AMDGPU::getNamedOperandIdx(Op32, AMDGPU::OpName::vdst) != -1) {
+  int Op32DstIdx = AMDGPU::getNamedOperandIdx(Op32, AMDGPU::OpName::vdst);
+  if (Op32DstIdx != -1) {
     // dst
     Inst32.add(MI.getOperand(0));
-  } else if (AMDGPU::getNamedOperandIdx(Op32, AMDGPU::OpName::sdst) != -1) {
-    // VOPCX instructions won't be writing to an explicit dst, so this should
-    // not fail for these instructions.
+  } else {
     assert(((MI.getOperand(0).getReg() == AMDGPU::VCC) ||
             (MI.getOperand(0).getReg() == AMDGPU::VCC_LO)) &&
            "Unexpected case");
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -2669,15 +2669,6 @@
   let ValueCols = [["0"]];
 }
 
-// Maps a v_cmp instruction to its v_cmpx equivalent.
-def getVCMPXOpFromVCMP : InstrMapping {
-  let FilterClass = "VCMPVCMPXTable";
-  let RowFields = ["VCMPOp"];
-  let ColFields = ["IsVCMPX"];
-  let KeyCol = ["0"];
-  let ValueCols = [["1"]];
-}
-
 include "SIInstructions.td"
 
 include "DSInstructions.td"
diff --git a/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp b/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp
--- a/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp
+++ b/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp
@@ -11,7 +11,6 @@
 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/InitializePasses.h"
-#include "llvm/CodeGen/LivePhysRegs.h"
 
 using namespace llvm;
 
@@ -293,183 +292,6 @@
   return false;
 }
 
-// Backwards-iterate from Origin (for n=MaxInstructions iterations) until either
-// the beginning of the BB is reached or Pred evaluates to true - which can be
-// an arbitrary condition based on the current MachineInstr, for instance a
-// target instruction. Breaks prematurely by returning nullptr if one of the
-// registers given in NonModifiableRegs is modified by the current instruction.
-static MachineInstr *
-findInstrBackwards(MachineInstr &Origin,
-                   std::function<bool(MachineInstr *)> Pred,
-                   ArrayRef<MCRegister> NonModifiableRegs,
-                   const SIRegisterInfo *TRI, unsigned MaxInstructions = 5) {
-  MachineBasicBlock::reverse_iterator A = Origin.getReverseIterator(),
-                                      E = Origin.getParent()->rend();
-  unsigned CurrentIteration = 0;
-
-  for (++A; CurrentIteration < MaxInstructions && A != E; ++A) {
-    if (Pred(&*A))
-      return &*A;
-
-    for (MCRegister Reg : NonModifiableRegs) {
-      if (A->modifiesRegister(Reg, TRI))
-        return nullptr;
-    }
-
-    ++CurrentIteration;
-  }
-
-  return nullptr;
-}
-
-// Determine if a register Reg is not re-defined and still in use
-// in the range (Stop..BB.end].
-// It does so by backwards calculating liveness from the end of the BB until
-// either Stop or the beginning of the BB is reached.
-// After liveness is calculated, we can determine if Reg is still in use and not
-// defined in between the instructions.
-static bool isRegisterInUseAfter(MachineInstr &Stop, MCRegister Reg,
-                                 const SIRegisterInfo *TRI,
-                                 MachineRegisterInfo &MRI) {
-  LivePhysRegs LR(*TRI);
-  LR.addLiveOuts(*Stop.getParent());
-
-  for (auto A = Stop.getParent()->rbegin();
-       A != Stop.getParent()->rend() && A != Stop; ++A) {
-    LR.stepBackward(*A);
-  }
-
-  return !LR.available(MRI, Reg);
-}
-
-// Tries to find a possibility to optimize a v_cmp ..., s_and_saveexec sequence
-// by looking at an instance of a s_and_saveexec instruction. Returns a pointer
-// to the v_cmp instruction if it is safe to replace the sequence (see the
-// conditions in the function body). This is after register allocation, so some
-// checks on operand dependencies need to be considered.
-static MachineInstr *findPossibleVCMPVCMPXOptimization(
-    MachineInstr &SaveExec, MCRegister Exec, const SIRegisterInfo *TRI,
-    const SIInstrInfo *TII, MachineRegisterInfo &MRI) {
-
-  MachineInstr *VCmp = nullptr;
-
-  Register SaveExecDest = SaveExec.getOperand(0).getReg();
-  if (!TRI->isSGPRReg(MRI, SaveExecDest))
-    return nullptr;
-
-  MachineOperand *SaveExecSrc0 =
-      TII->getNamedOperand(SaveExec, AMDGPU::OpName::src0);
-  if (!SaveExecSrc0->isReg())
-    return nullptr;
-
-  // Try to find the last v_cmp instruction that defs the saveexec input
-  // operand without any write to Exec in between.
-  VCmp = findInstrBackwards(
-      SaveExec,
-      [&](MachineInstr *Check) {
-        return AMDGPU::getVCMPXOpFromVCMP(Check->getOpcode()) != -1 &&
-               Check->modifiesRegister(SaveExecSrc0->getReg(), TRI);
-      },
-      {Exec, SaveExecSrc0->getReg()}, TRI);
-
-  if (!VCmp)
-    return nullptr;
-
-  MachineOperand *VCmpDest = TII->getNamedOperand(*VCmp, AMDGPU::OpName::sdst);
-  assert(VCmpDest && "Should have an sdst operand!");
-
-  // Check if any of the v_cmp source operands is written by the saveexec.
-  MachineOperand *Src0 = TII->getNamedOperand(*VCmp, AMDGPU::OpName::src0);
-  if (Src0->isReg() && TRI->isSGPRReg(MRI, Src0->getReg()) &&
-      SaveExec.modifiesRegister(Src0->getReg(), TRI))
-    return nullptr;
-
-  MachineOperand *Src1 = TII->getNamedOperand(*VCmp, AMDGPU::OpName::src1);
-  if (Src1->isReg() && TRI->isSGPRReg(MRI, Src1->getReg()) &&
-      SaveExec.modifiesRegister(Src1->getReg(), TRI))
-    return nullptr;
-
-  // Don't do the transformation if the destination operand is included in
-  // its MBB Live-outs, meaning it's used in any of its successors, leading
-  // to incorrect code if the v_cmp and therefore the def of
-  // the dest operand is removed.
-  if (isLiveOut(*VCmp->getParent(), VCmpDest->getReg()))
-    return nullptr;
-
-  // If the v_cmp target is in use after the s_and_saveexec, skip the
-  // optimization.
-  if (isRegisterInUseAfter(SaveExec, VCmpDest->getReg(), TRI,
-                           MRI))
-    return nullptr;
-
-  // Try to determine if there is a write to any of the VCmp
-  // operands between the saveexec and the vcmp.
-  // If yes, additional VGPR spilling might need to be inserted. In this case,
-  // it's not worth replacing the instruction sequence.
-  SmallVector<MCRegister, 2> NonDefRegs;
-  if (Src0->isReg())
-    NonDefRegs.push_back(Src0->getReg());
-
-  if (Src1->isReg())
-    NonDefRegs.push_back(Src1->getReg());
-
-  if (!findInstrBackwards(
-          SaveExec, [&](MachineInstr *Check) { return Check == VCmp; },
-          NonDefRegs, TRI))
-    return nullptr;
-
-  return VCmp;
-}
-
-// Inserts the optimized s_mov_b32 / v_cmpx sequence based on the
-// operands extracted from a v_cmp ..., s_and_saveexec pattern.
-static bool optimizeVCMPSaveExecSequence(MachineInstr &SaveExecInstr,
-                                         MachineInstr &VCmp, MCRegister Exec,
-                                         const SIInstrInfo *TII,
-                                         const SIRegisterInfo *TRI,
-                                         MachineRegisterInfo &MRI) {
-  const int NewOpcode = AMDGPU::getVCMPXOpFromVCMP(VCmp.getOpcode());
-
-  if (NewOpcode == -1)
-    return false;
-
-  MachineOperand *Src0 = TII->getNamedOperand(VCmp, AMDGPU::OpName::src0);
-  MachineOperand *Src1 = TII->getNamedOperand(VCmp, AMDGPU::OpName::src1);
-
-  Register MoveDest = SaveExecInstr.getOperand(0).getReg();
-
-  MachineBasicBlock::instr_iterator InsertPosIt = SaveExecInstr.getIterator();
-  if (!SaveExecInstr.uses().empty()) {
-    bool isSGPR32 = TRI->getRegSizeInBits(MoveDest, MRI) == 32;
-    unsigned MovOpcode = isSGPR32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
-    BuildMI(*SaveExecInstr.getParent(), InsertPosIt,
-            SaveExecInstr.getDebugLoc(), TII->get(MovOpcode), MoveDest)
-        .addReg(Exec);
-  }
-
-  // Omit dst as V_CMPX is implicitly writing to EXEC.
-  // Add dummy src and clamp modifiers, if needed.
-  auto Builder = BuildMI(*VCmp.getParent(), std::next(InsertPosIt),
-                         VCmp.getDebugLoc(), TII->get(NewOpcode));
-
-  if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::src0_modifiers) !=
-      -1)
-    Builder.addImm(0);
-
-  Builder.add(*Src0);
-
-  if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::src1_modifiers) !=
-      -1)
-    Builder.addImm(0);
-
-  Builder.add(*Src1);
-
-  if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::clamp) != -1)
-    Builder.addImm(0);
-
-  return true;
-}
-
 bool SIOptimizeExecMasking::runOnMachineFunction(MachineFunction &MF) {
   if (skipFunction(MF.getFunction()))
     return false;
@@ -477,7 +299,6 @@
   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
   const SIRegisterInfo *TRI = ST.getRegisterInfo();
   const SIInstrInfo *TII = ST.getInstrInfo();
-  MachineRegisterInfo *MRI = &MF.getRegInfo();
   MCRegister Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
 
   // Optimize sequences emitted for control flow lowering. They are originally
@@ -641,45 +462,5 @@
     Changed = true;
   }
 
-  // After all s_op_saveexec instructions are inserted,
-  // replace (on GFX10.3 and later)
-  // v_cmp_*      SGPR, IMM, VGPR
-  // s_and_saveexec_b32 EXEC_SGPR_DEST, SGPR
-  // with
-  // s_mov_b32    EXEC_SGPR_DEST, exec_lo
-  // v_cmpx_*     IMM, VGPR
-  // to reduce pipeline stalls.
-  if (ST.hasGFX10_3Insts()) {
-    DenseMap<MachineInstr *, MachineInstr *> SaveExecVCmpMapping;
-    const unsigned AndSaveExecOpcode =
-        ST.isWave32() ? AMDGPU::S_AND_SAVEEXEC_B32 : AMDGPU::S_AND_SAVEEXEC_B64;
-
-    for (MachineBasicBlock &MBB : MF) {
-      for (MachineInstr &MI : MBB) {
-        // Record relevant v_cmp / s_and_saveexec instruction pairs for
-        // replacement.
-        if (MI.getOpcode() != AndSaveExecOpcode)
-          continue;
-
-        if (MachineInstr *VCmp =
-                findPossibleVCMPVCMPXOptimization(MI, Exec, TRI, TII, *MRI))
-          SaveExecVCmpMapping[&MI] = VCmp;
-      }
-    }
-
-    for (const auto &Entry : SaveExecVCmpMapping) {
-      MachineInstr *SaveExecInstr = Entry.getFirst();
-      MachineInstr *VCmpInstr = Entry.getSecond();
-
-      if (optimizeVCMPSaveExecSequence(*SaveExecInstr, *VCmpInstr, Exec, TII,
-                                       TRI, *MRI)) {
-        SaveExecInstr->eraseFromParent();
-        VCmpInstr->eraseFromParent();
-
-        Changed = true;
-      }
-    }
-  }
-
   return Changed;
 }
diff --git a/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp b/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
--- a/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
+++ b/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
@@ -728,27 +728,21 @@
       int Op32 = AMDGPU::getVOPe32(MI.getOpcode());
 
       if (TII->isVOPC(Op32)) {
-        MachineOperand &Op0 = MI.getOperand(0);
-        if (Op0.isReg()) {
-          // Exclude VOPCX instructions as these don't explicitly write a
-          // dst.
-          Register DstReg = Op0.getReg();
-          if (DstReg.isVirtual()) {
-            // VOPC instructions can only write to the VCC register. We can't
-            // force them to use VCC here, because this is only one register and
-            // cannot deal with sequences which would require multiple copies of
-            // VCC, e.g. S_AND_B64 (vcc = V_CMP_...), (vcc = V_CMP_...)
-            //
-            // So, instead of forcing the instruction to write to VCC, we
-            // provide a hint to the register allocator to use VCC and then we
-            // will run this pass again after RA and shrink it if it outputs to
-            // VCC.
-            MRI.setRegAllocationHint(DstReg, 0, VCCReg);
-            continue;
-          }
-          if (DstReg != VCCReg)
-            continue;
+        Register DstReg = MI.getOperand(0).getReg();
+        if (DstReg.isVirtual()) {
+          // VOPC instructions can only write to the VCC register. We can't
+          // force them to use VCC here, because this is only one register and
+          // cannot deal with sequences which would require multiple copies of
+          // VCC, e.g. S_AND_B64 (vcc = V_CMP_...), (vcc = V_CMP_...)
+          //
+          // So, instead of forcing the instruction to write to VCC, we provide
+          // a hint to the register allocator to use VCC and then we will run
+          // this pass again after RA and shrink it if it outputs to VCC.
+          MRI.setRegAllocationHint(MI.getOperand(0).getReg(), 0, VCCReg);
+          continue;
         }
+        if (DstReg != VCCReg)
+          continue;
       }
 
       if (Op32 == AMDGPU::V_CNDMASK_B32_e32) {
diff --git a/llvm/lib/Target/AMDGPU/VOPCInstructions.td b/llvm/lib/Target/AMDGPU/VOPCInstructions.td
--- a/llvm/lib/Target/AMDGPU/VOPCInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOPCInstructions.td
@@ -205,11 +205,6 @@
   string NoSDstOp = Name;
 }
 
-class VCMPVCMPXTable <string Name> {
-  bit IsVCMPX = 0;
-  string VCMPOp = Name;
-}
-
 multiclass VOPC_Pseudos , Commutable_REV,
-             VCMPXNoSDstTable<1, opName#"_e32">,
-             VCMPVCMPXTable {
+             VCMPXNoSDstTable<1, opName#"_e32"> {
     let Defs = !if(DefExec, [VCC, EXEC], [VCC]);
     let SchedRW = P.Schedule;
     let isConvergent = DefExec;
@@ -229,8 +223,7 @@
   def _e64 : VOP3_Pseudo.ret>,
              Commutable_REV,
-             VCMPXNoSDstTable<1, opName#"_e64">,
-             VCMPVCMPXTable {
+             VCMPXNoSDstTable<1, opName#"_e64"> {
     let Defs = !if(DefExec, [EXEC], []);
     let SchedRW = P.Schedule;
     let isCompare = 1;
@@ -255,27 +248,23 @@
   def _nosdst_e32 : VOPC_Pseudo ,
              Commutable_REV,
-             VCMPXNoSDstTable<0, opName#"_e32">,
-             VCMPVCMPXTable {
+             VCMPXNoSDstTable<0, opName#"_e32"> {
     let Defs = [EXEC];
     let SchedRW = P_NoSDst.Schedule;
     let isConvergent = 1;
     let isCompare = 1;
     let isCommutable = 1;
     let SubtargetPredicate = HasNoSdstCMPX;
-    let IsVCMPX = 1;
   }
 
   def _nosdst_e64 : VOP3_Pseudo,
              Commutable_REV,
-             VCMPXNoSDstTable<0, opName#"_e64">,
-             VCMPVCMPXTable {
+             VCMPXNoSDstTable<0, opName#"_e64"> {
     let Defs = [EXEC];
     let SchedRW = P_NoSDst.Schedule;
     let isCompare = 1;
     let isCommutable = 1;
     let SubtargetPredicate = HasNoSdstCMPX;
-    let IsVCMPX = 1;
   }
 
   foreach _ = BoolToList.ret in
diff --git a/llvm/test/CodeGen/AMDGPU/branch-relaxation-gfx10-branch-offset-bug.ll b/llvm/test/CodeGen/AMDGPU/branch-relaxation-gfx10-branch-offset-bug.ll
--- a/llvm/test/CodeGen/AMDGPU/branch-relaxation-gfx10-branch-offset-bug.ll
+++ b/llvm/test/CodeGen/AMDGPU/branch-relaxation-gfx10-branch-offset-bug.ll
@@ -6,7 +6,7 @@
 ; GCN-LABEL: long_forward_scc_branch_3f_offset_bug:
 ; GFX1030: s_cmp_lg_u32
-; GFX1030: s_cbranch_scc1 [[ENDBB:.LBB[0-9]+_[0-9]+]]
+; GFX1030-NEXT: s_cbranch_scc1 [[ENDBB:.LBB[0-9]+_[0-9]+]]
 
 ; GFX1010: s_cmp_lg_u32
 ; GFX1010-NEXT: s_cbranch_scc0 [[RELAX_BB:.LBB[0-9]+_[0-9]+]]
@@ -51,9 +51,9 @@
 }
 
 ; GCN-LABEL: {{^}}long_forward_exec_branch_3f_offset_bug:
-; GFX1030: s_mov_b32
-; GFX1030: v_cmpx_eq_u32
-; GFX1030: s_cbranch_execnz [[RELAX_BB:.LBB[0-9]+_[0-9]+]]
+; GFX1030: v_cmp_eq_u32
+; GFX1030: s_and_saveexec_b32
+; GFX1030-NEXT: s_cbranch_execnz [[RELAX_BB:.LBB[0-9]+_[0-9]+]]
 
 ; GFX1010: v_cmp_eq_u32
 ; GFX1010: s_and_saveexec_b32
diff --git a/llvm/test/CodeGen/AMDGPU/vcmp-saveexec-to-vcmpx.ll b/llvm/test/CodeGen/AMDGPU/vcmp-saveexec-to-vcmpx.ll
deleted file mode 100644
--- a/llvm/test/CodeGen/AMDGPU/vcmp-saveexec-to-vcmpx.ll
+++ /dev/null
@@ -1,167 +0,0 @@
-; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX1010 %s
-; RUN: llc -march=amdgcn -mcpu=gfx1030 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX1030 %s
-
-; GCN-LABEL: {{^}}test_insert_vcmpx_pattern_lt:
-; GFX1010: v_cmp_lt_i32_e32 vcc_lo, 15, v{{.*}}
-; GFX1010-NEXT: s_and_saveexec_b32 s{{.*}}, vcc_lo
-; GFX1030: s_mov_b32 s{{.*}}, exec_lo
-; GFX1030-NEXT: v_cmpx_lt_i32_e32 15, v{{.*}}
-define i32 @test_insert_vcmpx_pattern_lt(i32 %x) {
-entry:
-  %bc = icmp slt i32 %x, 16
-  br i1 %bc, label %endif, label %if
-
-if:
-  %ret = shl i32 %x, 2
-  ret i32 %ret
-
-endif:
-  ret i32 %x
-}
-
-; GCN-LABEL: {{^}}test_insert_vcmpx_pattern_gt:
-; GFX1010: v_cmp_gt_i32_e32 vcc_lo, 17, v{{.*}}
-; GFX1010-NEXT: s_and_saveexec_b32 s{{.*}}, vcc_lo
-; GFX1030: s_mov_b32 s{{.*}}, exec_lo
-; GFX1030-NEXT: v_cmpx_gt_i32_e32 17, v{{.*}}
-define i32 @test_insert_vcmpx_pattern_gt(i32 %x) {
-entry:
-  %bc = icmp sgt i32 %x, 16
-  br i1 %bc, label %endif, label %if
-
-if:
-  %ret = shl i32 %x, 2
-  ret i32 %ret
-
-endif:
-  ret i32 %x
-}
-
-; GCN-LABEL: {{^}}test_insert_vcmpx_pattern_eq:
-; GFX1010: v_cmp_ne_u32_e32 vcc_lo, 16, v{{.*}}
-; GFX1010-NEXT: s_and_saveexec_b32 s{{.*}}, vcc_lo
-; GFX1030: s_mov_b32 s{{.*}}, exec_lo
-; GFX1030-NEXT: v_cmpx_ne_u32_e32 16, v{{.*}}
-define i32 @test_insert_vcmpx_pattern_eq(i32 %x) {
-entry:
-  %bc = icmp eq i32 %x, 16
-  br i1 %bc, label %endif, label %if
-
-if:
-  %ret = shl i32 %x, 2
-  ret i32 %ret
-
-endif:
-  ret i32 %x
-}
-
-; GCN-LABEL: {{^}}test_insert_vcmpx_pattern_ne:
-; GFX1010: v_cmp_eq_u32_e32 vcc_lo, 16, v{{.*}}
-; GFX1010-NEXT: s_and_saveexec_b32 s{{.*}}, vcc_lo
-; GFX1030: s_mov_b32 s{{.*}}, exec_lo
-; GFX1030-NEXT: v_cmpx_eq_u32_e32 16, v{{.*}}
-define i32 @test_insert_vcmpx_pattern_ne(i32 %x) {
-entry:
-  %bc = icmp ne i32 %x, 16
-  br i1 %bc, label %endif, label %if
-
-if:
-  %ret = shl i32 %x, 2
-  ret i32 %ret
-
-endif:
-  ret i32 %x
-}
-
-; GCN-LABEL: {{^}}test_insert_vcmpx_pattern_le:
-; GFX1010: v_cmp_lt_i32_e32 vcc_lo, 16, v{{.*}}
-; GFX1010-NEXT: s_and_saveexec_b32 s{{.*}}, vcc_lo
-; GFX1030: s_mov_b32 s{{.*}}, exec_lo
-; GFX1030-NEXT: v_cmpx_lt_i32_e32 16, v{{.*}}
-define i32 @test_insert_vcmpx_pattern_le(i32 %x) {
-entry:
-  %bc = icmp sle i32 %x, 16
-  br i1 %bc, label %endif, label %if
-
-if:
-  %ret = shl i32 %x, 2
-  ret i32 %ret
-
-endif:
-  ret i32 %x
-}
-
-; GCN-LABEL: {{^}}test_insert_vcmpx_pattern_ge:
-; GFX1010: v_cmp_gt_i32_e32 vcc_lo, 16, v{{.*}}
-; GFX1010-NEXT: s_and_saveexec_b32 s{{.*}}, vcc_lo
-; GFX1030: s_mov_b32 s{{.*}}, exec_lo
-; GFX1030-NEXT: v_cmpx_gt_i32_e32 16, v{{.*}}
-define i32 @test_insert_vcmpx_pattern_ge(i32 %x) {
-entry:
-  %bc = icmp sge i32 %x, 16
-  br i1 %bc, label %endif, label %if
-
-if:
-  %ret = shl i32 %x, 2
-  ret i32 %ret
-
-endif:
-  ret i32 %x
-}
-
-declare amdgpu_gfx void @check_live_outs_helper(i64) #0
-
-; In cases where the output operand cannot be safely removed,
-; don't apply the v_cmpx transformation.
-
-; GCN-LABEL: {{^}}check_live_outs:
-; GFX1010: v_cmp_eq_u32_e64 s{{.*}}, v{{.*}}, v{{.*}}
-; GFX1010: s_and_saveexec_b32 s{{.*}}, s{{.*}}
-; GFX1030: v_cmp_eq_u32_e64 s{{.*}}, v{{.*}}, v{{.*}}
-; GFX1030: s_and_saveexec_b32 s{{.*}}, s{{.*}}
-define amdgpu_cs void @check_live_outs(i32 %a, i32 %b) {
-  %cond = icmp eq i32 %a, %b
-  %result = call i64 @llvm.amdgcn.icmp.i32(i32 %a, i32 %b, i32 32)
-  br i1 %cond, label %l1, label %l2
-l1:
-  call amdgpu_gfx void @check_live_outs_helper(i64 %result)
-  br label %l2
-l2:
-  ret void
-}
-
-; Omit the transformation if the s_and_saveexec instruction overwrites
-; any of the v_cmp source operands.
-
-; GCN-LABEL: check_saveexec_overwrites_vcmp_source:
-; GCN: ; %bb.1: ; %then
-; GFX1010: v_cmp_ge_i32_e32 vcc_lo, s[[A:[0-9]+]], v{{.*}}
-; GFX1010-NEXT: v_mov_b32_e32 {{.*}}, s[[A]]
-; GFX1010-NEXT: s_and_saveexec_b32 s[[A]], vcc_lo
-; GFX1030: v_cmp_ge_i32_e32 vcc_lo, s[[A:[0-9]+]], v{{.*}}
-; GFX1030-NEXT: v_mov_b32_e32 {{.*}}, s[[A]]
-; GFX1030-NEXT: s_and_saveexec_b32 s[[A]], vcc_lo
-define i32 @check_saveexec_overwrites_vcmp_source(i32 inreg %a, i32 inreg %b) {
-entry:
-  %0 = icmp sge i32 %a, 0
-  br i1 %0, label %if, label %then
-
-if:
-  %1 = shl i32 %a, 2
-  %2 = or i32 %1, %b
-  ret i32 %2
-
-then:
-  %3 = call i64 @llvm.amdgcn.icmp.i32(i32 %a, i32 %b, i32 32)
-  %4 = trunc i64 %3 to i32
-  %5 = icmp slt i32 %4, %b
-  br i1 %5, label %after, label %end
-
-after:
-  ret i32 %4
-
-end:
-  ret i32 %a
-}
-
-declare i64 @llvm.amdgcn.icmp.i32(i32, i32, i32) #0
diff --git a/llvm/test/CodeGen/AMDGPU/vcmp-saveexec-to-vcmpx.mir b/llvm/test/CodeGen/AMDGPU/vcmp-saveexec-to-vcmpx.mir
deleted file mode 100644
--- a/llvm/test/CodeGen/AMDGPU/vcmp-saveexec-to-vcmpx.mir
+++ /dev/null
@@ -1,24 +0,0 @@
-# RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 %s -o - | FileCheck -check-prefix=GCN %s
-# RUN: llc -march=amdgcn -mcpu=gfx1030 -mattr=-wavefrontsize32,+wavefrontsize64 %s -o - | FileCheck -check-prefix=GCN %s
-
----
-
-# After the Optimize exec masking (post-RA) pass, there's a chance of having v_cmpx instructions
-# being introduced whenever there's a sequence of v_cmp and s_and_saveexec instructions
-# which can be safely replaced in various cases.
-# However, it is not safe to do so when the generated code sequence would omit part of the EXEC mask
-# which could occur when a subset of EXEC is used as input operand in the v_cmp instruction.
-# The idea behind this test is to check if the subregisters are correctly handled here.
-
-# GCN-LABEL: vcmp_saveexec_to_mov_vcmpx_exec_subreg:
-# GCN: v_cmp_gt_u32_e64 s[[[SDST_LO:[0-9]+]]:[[SDST_HI:[0-9]+]]], exec_lo, v{{.*}}
-# GCN: s_and_saveexec_b64 s[[[EXEC_LO:[0-9]+]]:[[EXEC_HI:[0-9]+]]], s[[[SDST_LO]]:[[SDST_HI]]]
-name: vcmp_saveexec_to_mov_vcmpx_exec_subreg
-tracksRegLiveness: true
-body: |
-  bb.0:
-    liveins: $vgpr0
-    renamable $sgpr0_sgpr1 = V_CMP_GT_U32_e64 $exec_lo, killed $vgpr0, implicit $exec
-    $sgpr2_sgpr3 = COPY $exec, implicit-def $exec
-    $sgpr2_sgpr3 = S_AND_B64 killed renamable $sgpr2_sgpr3, killed renamable $sgpr0_sgpr1, implicit-def dead $scc
-    $exec = S_MOV_B64_term killed renamable $sgpr2_sgpr3
diff --git a/llvm/test/CodeGen/AMDGPU/wqm.ll b/llvm/test/CodeGen/AMDGPU/wqm.ll
--- a/llvm/test/CodeGen/AMDGPU/wqm.ll
+++ b/llvm/test/CodeGen/AMDGPU/wqm.ll
@@ -1250,8 +1250,8 @@
 ; GFX10-W32: ; %bb.0: ; %main_body
 ; GFX10-W32-NEXT: s_mov_b32 s12, exec_lo
 ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
-; GFX10-W32-NEXT: s_mov_b32 s13, exec_lo
-; GFX10-W32-NEXT: v_cmpx_ne_u32_e32 0, v1
+; GFX10-W32-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v1
+; GFX10-W32-NEXT: s_and_saveexec_b32 s13, vcc_lo
 ; GFX10-W32-NEXT: s_xor_b32 s13, exec_lo, s13
 ; GFX10-W32-NEXT: s_cbranch_execz .LBB23_2
 ; GFX10-W32-NEXT: ; %bb.1: ; %ELSE
@@ -1329,8 +1329,8 @@
 ; GFX10-W32: ; %bb.0: ; %main_body
 ; GFX10-W32-NEXT: s_mov_b32 s12, exec_lo
 ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
-; GFX10-W32-NEXT: s_mov_b32 s13, exec_lo
-; GFX10-W32-NEXT: v_cmpx_ne_u32_e32 0, v1
+; GFX10-W32-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v1
+; GFX10-W32-NEXT: s_and_saveexec_b32 s13, vcc_lo
 ; GFX10-W32-NEXT: s_xor_b32 s13, exec_lo, s13
 ; GFX10-W32-NEXT: s_cbranch_execz .LBB24_2
 ; GFX10-W32-NEXT: ; %bb.1: ; %IF
@@ -1508,10 +1508,10 @@
 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-W32-NEXT: image_sample v1, v1, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
+; GFX10-W32-NEXT: v_cmp_nlt_f32_e32 vcc_lo, 0, v1
 ; GFX10-W32-NEXT: buffer_store_dword v1, v0, s[0:3], 0 idxen
-; GFX10-W32-NEXT: s_mov_b32 s0, exec_lo
 ; GFX10-W32-NEXT: ; implicit-def: $vgpr0
-; GFX10-W32-NEXT: v_cmpx_nlt_f32_e32 0, v1
+; GFX10-W32-NEXT: s_and_saveexec_b32 s0, vcc_lo
 ; GFX10-W32-NEXT: s_xor_b32 s0, exec_lo, s0
 ; GFX10-W32-NEXT: ; %bb.1: ; %ELSE
 ; GFX10-W32-NEXT: v_mul_f32_e32 v0, 4.0, v1
@@ -1577,8 +1577,8 @@
 ; GFX10-W32: ; %bb.0: ; %main_body
 ; GFX10-W32-NEXT: s_mov_b32 s12, exec_lo
 ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
-; GFX10-W32-NEXT: s_mov_b32 s13, exec_lo
-; GFX10-W32-NEXT: v_cmpx_eq_u32_e32 0, v1
+; GFX10-W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX10-W32-NEXT: s_and_saveexec_b32 s13, vcc_lo
 ; GFX10-W32-NEXT: s_cbranch_execz .LBB27_2
 ; GFX10-W32-NEXT: ; %bb.1: ; %IF
 ; GFX10-W32-NEXT: s_and_saveexec_b32 s14, s12
@@ -2960,9 +2960,9 @@
 ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
 ; GFX10-W32-NEXT: v_mov_b32_e32 v2, v0
 ; GFX10-W32-NEXT: s_mov_b32 exec_lo, s13
+; GFX10-W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1
 ; GFX10-W32-NEXT: v_mov_b32_e32 v0, 0
-; GFX10-W32-NEXT: s_mov_b32 s13, exec_lo
-; GFX10-W32-NEXT: v_cmpx_eq_u32_e32 0, v1
+; GFX10-W32-NEXT: s_and_saveexec_b32 s13, vcc_lo
 ; GFX10-W32-NEXT: s_cbranch_execz .LBB46_2
 ; GFX10-W32-NEXT: ; %bb.1: ; %IF
 ; GFX10-W32-NEXT: s_mov_b32 s14, exec_lo