diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -4625,10 +4625,9 @@
           .addMBB(UncondBrTarget);
       } else {
         B.buildInstr(AMDGPU::SI_ELSE)
-          .addDef(Def)
-          .addUse(Use)
-          .addMBB(UncondBrTarget)
-          .addImm(0);
+            .addDef(Def)
+            .addUse(Use)
+            .addMBB(UncondBrTarget);
       }
 
       if (Br) {
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -325,7 +325,7 @@
 
 def SI_ELSE : CFPseudoInstSI <
   (outs SReg_1:$dst),
-  (ins SReg_1:$src, brtarget:$target, i1imm:$execfix), [], 1, 1> {
+  (ins SReg_1:$src, brtarget:$target), [], 1, 1> {
   let Size = 12;
   let hasSideEffects = 1;
 }
@@ -745,7 +745,7 @@
 
 def : GCNPat<
   (AMDGPUelse i1:$src, bb:$target),
-  (SI_ELSE $src, $target, 0)
+  (SI_ELSE $src, $target)
 >;
 
 def : Pat <
diff --git a/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp b/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
--- a/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
+++ b/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
@@ -333,13 +333,11 @@
   Register DstReg = MI.getOperand(0).getReg();
-  bool ExecModified = MI.getOperand(3).getImm() != 0;
 
   MachineBasicBlock::iterator Start = MBB.begin();
 
   // This must be inserted before phis and any spill code inserted before the
   // else.
-  Register SaveReg = ExecModified ?
-    MRI->createVirtualRegister(BoolRC) : DstReg;
+  Register SaveReg = MRI->createVirtualRegister(BoolRC);
   MachineInstr *OrSaveExec =
     BuildMI(MBB, Start, DL, TII->get(OrSaveExecOpc), SaveReg)
     .add(MI.getOperand(1)); // Saved EXEC
@@ -348,15 +346,14 @@
 
   MachineBasicBlock::iterator ElsePt(MI);
 
-  if (ExecModified) {
-    MachineInstr *And =
-      BuildMI(MBB, ElsePt, DL, TII->get(AndOpc), DstReg)
-      .addReg(Exec)
-      .addReg(SaveReg);
+  // This accounts for any modification of the EXEC mask within the block and
+  // can be optimized out pre-RA when not required.
+  MachineInstr *And = BuildMI(MBB, ElsePt, DL, TII->get(AndOpc), DstReg)
+                          .addReg(Exec)
+                          .addReg(SaveReg);
 
-    if (LIS)
-      LIS->InsertMachineInstrInMaps(*And);
-  }
+  if (LIS)
+    LIS->InsertMachineInstrInMaps(*And);
 
   MachineInstr *Xor =
     BuildMI(MBB, ElsePt, DL, TII->get(XorTermrOpc), Exec)
@@ -386,8 +383,7 @@
 
   LIS->removeInterval(DstReg);
   LIS->createAndComputeVirtRegInterval(DstReg);
-  if (ExecModified)
-    LIS->createAndComputeVirtRegInterval(SaveReg);
+  LIS->createAndComputeVirtRegInterval(SaveReg);
 
   // Let this be recomputed.
   LIS->removeAllRegUnitsForPhysReg(AMDGPU::EXEC);
diff --git a/llvm/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp b/llvm/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp
--- a/llvm/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp
+++ b/llvm/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp
@@ -35,10 +35,13 @@
 
   unsigned AndOpc;
   unsigned Andn2Opc;
+  unsigned OrSaveExecOpc;
+  unsigned XorTermrOpc;
   Register CondReg;
   Register ExecReg;
 
   Register optimizeVcndVcmpPair(MachineBasicBlock &MBB);
+  bool optimizeElseBranch(MachineBasicBlock &MBB);
 
 public:
   static char ID;
@@ -224,6 +227,81 @@
   return CCReg;
 }
 
+// Optimize sequence
+//    %dst = S_OR_SAVEEXEC %src
+//    ... instructions not modifying exec ...
+//    %tmp = S_AND $exec, %dst
+//    $exec = S_XOR_term $exec, %tmp
+// =>
+//    %dst = S_OR_SAVEEXEC %src
+//    ... instructions not modifying exec ...
+//    $exec = S_XOR_term $exec, %dst
+//
+// Clean up potentially unnecessary code added for safety during
+// control flow lowering.
+//
+// Return whether any changes were made to MBB.
+bool SIOptimizeExecMaskingPreRA::optimizeElseBranch(MachineBasicBlock &MBB) {
+  if (MBB.empty())
+    return false;
+
+  // Check this is an else block.
+  auto First = MBB.begin();
+  MachineInstr &SaveExecMI = *First;
+  if (SaveExecMI.getOpcode() != OrSaveExecOpc)
+    return false;
+
+  auto I = llvm::find_if(MBB.terminators(), [this](const MachineInstr &MI) {
+    return MI.getOpcode() == XorTermrOpc;
+  });
+  if (I == MBB.terminators().end())
+    return false;
+
+  MachineInstr &XorTermMI = *I;
+  if (XorTermMI.getOperand(1).getReg() != ExecReg)
+    return false;
+
+  Register SavedExecReg = SaveExecMI.getOperand(0).getReg();
+  Register DstReg = XorTermMI.getOperand(2).getReg();
+
+  // Find potentially unnecessary S_AND
+  MachineInstr *AndExecMI = nullptr;
+  I--;
+  while (I != First && !AndExecMI) {
+    if (I->getOpcode() == AndOpc && I->getOperand(0).getReg() == DstReg &&
+        I->getOperand(1).getReg() == ExecReg)
+      AndExecMI = &*I;
+    I--;
+  }
+  if (!AndExecMI)
+    return false;
+
+  // Check for exec modifying instructions.
+  // Note: exec defs do not create live ranges beyond the
+  // instruction so isDefBetween cannot be used.
+  // Instead just check that the def segments are adjacent.
+  SlotIndex StartIdx = LIS->getInstructionIndex(SaveExecMI);
+  SlotIndex EndIdx = LIS->getInstructionIndex(*AndExecMI);
+  for (MCRegUnitIterator UI(ExecReg, TRI); UI.isValid(); ++UI) {
+    LiveRange &RegUnit = LIS->getRegUnit(*UI);
+    if (RegUnit.find(StartIdx) != std::prev(RegUnit.find(EndIdx)))
+      return false;
+  }
+
+  // Remove unnecessary S_AND
+  LIS->removeInterval(SavedExecReg);
+  LIS->removeInterval(DstReg);
+
+  SaveExecMI.getOperand(0).setReg(DstReg);
+
+  LIS->RemoveMachineInstrFromMaps(*AndExecMI);
+  AndExecMI->eraseFromParent();
+
+  LIS->createAndComputeVirtRegInterval(DstReg);
+
+  return true;
+}
+
 bool SIOptimizeExecMaskingPreRA::runOnMachineFunction(MachineFunction &MF) {
   if (skipFunction(MF.getFunction()))
     return false;
@@ -237,6 +315,9 @@
   const bool Wave32 = ST.isWave32();
   AndOpc = Wave32 ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
   Andn2Opc = Wave32 ? AMDGPU::S_ANDN2_B32 : AMDGPU::S_ANDN2_B64;
+  OrSaveExecOpc =
+      Wave32 ? AMDGPU::S_OR_SAVEEXEC_B32 : AMDGPU::S_OR_SAVEEXEC_B64;
+  XorTermrOpc = Wave32 ? AMDGPU::S_XOR_B32_term : AMDGPU::S_XOR_B64_term;
   CondReg = Wave32 ? AMDGPU::VCC_LO : AMDGPU::VCC;
   ExecReg = Wave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
 
@@ -245,6 +326,11 @@
 
   for (MachineBasicBlock &MBB : MF) {
 
+    if (optimizeElseBranch(MBB)) {
+      RecalcRegs.insert(AMDGPU::SCC);
+      Changed = true;
+    }
+
     if (Register Reg = optimizeVcndVcmpPair(MBB)) {
       RecalcRegs.insert(Reg);
       RecalcRegs.insert(AMDGPU::VCC_LO);
diff --git a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
--- a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
+++ b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
@@ -730,9 +730,6 @@
       if (MI.isTerminator() && OutNeeds == StateExact)
         Needs = StateExact;
 
-      if (MI.getOpcode() == AMDGPU::SI_ELSE && BI.OutNeeds == StateExact)
-        MI.getOperand(3).setImm(1);
-
       ++Next;
     } else {
       // End of basic block
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-brcond.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-brcond.mir
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-brcond.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-brcond.mir
@@ -140,7 +140,7 @@
     ; WAVE64: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; WAVE64: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
    ; WAVE64: [[ICMP:%[0-9]+]]:sreg_64_xexec(s1) = G_ICMP intpred(ne), [[COPY]](s32), [[COPY1]]
-    ; WAVE64: [[SI_ELSE:%[0-9]+]]:sreg_64_xexec(s64) = SI_ELSE [[ICMP]](s1), %bb.1, 0, implicit-def $exec, implicit-def $scc, implicit $exec
+    ; WAVE64: [[SI_ELSE:%[0-9]+]]:sreg_64_xexec(s64) = SI_ELSE [[ICMP]](s1), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec
     ; WAVE64: G_BR %bb.1
     ; WAVE64: bb.1:
     ; WAVE32-LABEL: name: brcond_si_else
@@ -149,7 +149,7 @@
     ; WAVE32: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; WAVE32: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
    ; WAVE32: [[ICMP:%[0-9]+]]:sreg_32_xm0_xexec(s1) = G_ICMP intpred(ne), [[COPY]](s32), [[COPY1]]
-    ; WAVE32: [[SI_ELSE:%[0-9]+]]:sreg_32_xm0_xexec(s64) = SI_ELSE [[ICMP]](s1), %bb.1, 0, implicit-def $exec, implicit-def $scc, implicit $exec
+    ; WAVE32: [[SI_ELSE:%[0-9]+]]:sreg_32_xm0_xexec(s64) = SI_ELSE [[ICMP]](s1), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec
     ; WAVE32: G_BR %bb.1
     ; WAVE32: bb.1:
   bb.0:
diff --git a/llvm/test/CodeGen/AMDGPU/collapse-endcf.mir b/llvm/test/CodeGen/AMDGPU/collapse-endcf.mir
--- a/llvm/test/CodeGen/AMDGPU/collapse-endcf.mir
+++ b/llvm/test/CodeGen/AMDGPU/collapse-endcf.mir
@@ -475,13 +475,14 @@
     ; GCN: bb.2:
     ; GCN: successors: %bb.3(0x40000000), %bb.6(0x40000000)
    ; GCN: [[S_OR_SAVEEXEC_B64_:%[0-9]+]]:sreg_64 = S_OR_SAVEEXEC_B64 [[S_XOR_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec
-    ; GCN: $exec = S_XOR_B64_term $exec, [[S_OR_SAVEEXEC_B64_]], implicit-def $scc
+    ; GCN: [[S_AND_B64_1:%[0-9]+]]:sreg_64 = S_AND_B64 $exec, [[S_OR_SAVEEXEC_B64_]], implicit-def $scc
+    ; GCN: $exec = S_XOR_B64_term $exec, [[S_AND_B64_1]], implicit-def $scc
     ; GCN: S_CBRANCH_EXECZ %bb.6, implicit $exec
     ; GCN: bb.3:
     ; GCN: successors: %bb.3(0x40000000), %bb.4(0x40000000)
     ; GCN: [[COPY1:%[0-9]+]]:sreg_64 = COPY $exec, implicit-def $exec
-    ; GCN: [[S_AND_B64_1:%[0-9]+]]:sreg_64 = S_AND_B64 [[COPY1]], undef %4:sreg_64, implicit-def dead $scc
-    ; GCN: $exec = S_MOV_B64_term killed [[S_AND_B64_1]]
+    ; GCN: [[S_AND_B64_2:%[0-9]+]]:sreg_64 = S_AND_B64 [[COPY1]], undef %4:sreg_64, implicit-def dead $scc
+    ; GCN: $exec = S_MOV_B64_term killed [[S_AND_B64_2]]
     ; GCN: S_CBRANCH_EXECZ %bb.3, implicit $exec
     ; GCN: bb.4:
     ; GCN: successors: %bb.5(0x80000000)
@@ -489,7 +490,7 @@
     ; GCN: successors: %bb.6(0x80000000)
     ; GCN: $exec = S_OR_B64 $exec, [[COPY1]], implicit-def $scc
     ; GCN: bb.6:
-    ; GCN: $exec = S_OR_B64 $exec, [[S_OR_SAVEEXEC_B64_]], implicit-def $scc
+    ; GCN: $exec = S_OR_B64 $exec, [[S_AND_B64_1]], implicit-def $scc
     ; GCN: S_ENDPGM 0
   bb.0:
     successors: %bb.1, %bb.2
@@ -502,7 +503,7 @@
 
   bb.2:
     successors: %bb.3, %bb.6
-    %2:sreg_64 = SI_ELSE %0:sreg_64, %bb.6, 0, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+    %2:sreg_64 = SI_ELSE %0:sreg_64, %bb.6, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
 
   bb.3:
     successors: %bb.3, %bb.4
diff --git a/llvm/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll b/llvm/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll
--- a/llvm/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll
+++ b/llvm/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll
@@ -196,19 +196,20 @@
 
 ; Regular spill value restored after exec modification
 ; GCN: buffer_load_dword [[FLOW_VAL:v[0-9]+]], off, s[0:3], 0 offset:[[FLOW_VAL_OFFSET:[0-9]+]] ; 4-byte Folded Reload
+; Followed by spill
+; GCN: buffer_store_dword [[FLOW_VAL]], off, s[0:3], 0 offset:[[RESULT_OFFSET:[0-9]+]] ; 4-byte Folded Spill
+; GCN: s_and_b64 s{{\[}}[[FLOW_AND_EXEC_LO:[0-9]+]]:[[FLOW_AND_EXEC_HI:[0-9]+]]{{\]}}, exec, s{{\[}}[[FLOW_S_RELOAD_SAVEEXEC_LO_SAVEEXEC]]:[[FLOW_S_RELOAD_SAVEEXEC_HI_SAVEEXEC]]{{\]}}
 
 ; Spill saved exec
-; VGPR: v_writelane_b32 [[SPILL_VGPR]], s[[FLOW_S_RELOAD_SAVEEXEC_LO_SAVEEXEC]], [[FLOW_SAVEEXEC_LO_LANE:[0-9]+]]
-; VGPR: v_writelane_b32 [[SPILL_VGPR]], s[[FLOW_S_RELOAD_SAVEEXEC_HI_SAVEEXEC]], [[FLOW_SAVEEXEC_HI_LANE:[0-9]+]]
-
+; VGPR: v_writelane_b32 [[SPILL_VGPR]], s[[FLOW_AND_EXEC_LO]], [[FLOW_SAVEEXEC_LO_LANE:[0-9]+]]
+; VGPR: v_writelane_b32 [[SPILL_VGPR]], s[[FLOW_AND_EXEC_HI]], [[FLOW_SAVEEXEC_HI_LANE:[0-9]+]]
 
-; VMEM: v_writelane_b32 v[[FLOW_V_SAVEEXEC:[0-9]+]], s[[FLOW_S_RELOAD_SAVEEXEC_LO_SAVEEXEC]], 0
-; VMEM: v_writelane_b32 v[[FLOW_V_SAVEEXEC]], s[[FLOW_S_RELOAD_SAVEEXEC_HI_SAVEEXEC]], 1
+; VMEM: v_writelane_b32 v[[FLOW_V_SAVEEXEC:[0-9]+]], s[[FLOW_AND_EXEC_LO]], 0
+; VMEM: v_writelane_b32 v[[FLOW_V_SAVEEXEC]], s[[FLOW_AND_EXEC_HI]], 1
 ; VMEM: buffer_store_dword v[[FLOW_V_SAVEEXEC]], off, s[0:3], 0 offset:[[FLOW_SAVEEXEC_OFFSET:[0-9]+]] ; 4-byte Folded Spill
 
-; GCN: buffer_store_dword [[FLOW_VAL]], off, s[0:3], 0 offset:[[RESULT_OFFSET:[0-9]+]] ; 4-byte Folded Spill
-; GCN: s_xor_b64 exec, exec, s{{\[}}[[FLOW_S_RELOAD_SAVEEXEC_LO_SAVEEXEC]]:[[FLOW_S_RELOAD_SAVEEXEC_HI_SAVEEXEC]]{{\]}}
+; GCN: s_xor_b64 exec, exec, s{{\[}}[[FLOW_AND_EXEC_LO]]:[[FLOW_AND_EXEC_HI]]{{\]}}
 ; GCN-NEXT: s_cbranch_execz [[ENDIF:BB[0-9]+_[0-9]+]]
diff --git a/llvm/test/CodeGen/AMDGPU/lower-control-flow-other-terminators.mir b/llvm/test/CodeGen/AMDGPU/lower-control-flow-other-terminators.mir
--- a/llvm/test/CodeGen/AMDGPU/lower-control-flow-other-terminators.mir
+++ b/llvm/test/CodeGen/AMDGPU/lower-control-flow-other-terminators.mir
@@ -100,11 +100,12 @@
   ; CHECK: bb.0:
   ; CHECK: successors: %bb.2(0x40000000), %bb.1(0x40000000)
   ; CHECK: liveins: $vgpr0, $vgpr1, $sgpr4_sgpr5
-  ; CHECK: [[S_OR_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_OR_SAVEEXEC_B64 %2, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; CHECK: [[S_OR_SAVEEXEC_B64_:%[0-9]+]]:sreg_64 = S_OR_SAVEEXEC_B64 %2, implicit-def $exec, implicit-def $scc, implicit $exec
   ; CHECK: [[COPY:%[0-9]+]]:vgpr_32 = COPY killed $vgpr0
   ; CHECK: [[COPY1:%[0-9]+]]:sreg_64_xexec = COPY $sgpr4_sgpr5
   ; CHECK: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 0, [[COPY]], implicit $exec
-  ; CHECK: $exec = S_XOR_B64_term $exec, [[S_OR_SAVEEXEC_B64_]], implicit-def $scc
+  ; CHECK: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 $exec, [[S_OR_SAVEEXEC_B64_]], implicit-def $scc
+  ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_B64_]], implicit-def $scc
   ; CHECK: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term killed [[COPY1]], implicit $exec
   ; CHECK: S_CBRANCH_EXECZ %bb.1, implicit $exec
   ; CHECK: S_BRANCH %bb.2
@@ -120,7 +121,7 @@
     %0:vgpr_32 = COPY killed $vgpr0
     %1:sreg_64_xexec = COPY $sgpr4_sgpr5
     %2:sreg_64_xexec = V_CMP_EQ_U32_e64 0, %0, implicit $exec
-    %3:sreg_64_xexec = SI_ELSE %2, %bb.1, 0, implicit-def $exec, implicit-def dead $scc, implicit $exec
+    %3:sreg_64_xexec = SI_ELSE %2, %bb.1, implicit-def $exec, implicit-def dead $scc, implicit $exec
    %4:sreg_64_xexec = S_MOV_B64_term killed %1, implicit $exec
     S_BRANCH %bb.2
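
Note (not part of the patch): a minimal sketch, in the same MIR notation the new pass comment uses, of the wave64 sequence this change produces and then cleans up. SILowerControlFlow now always emits the S_AND after the S_OR_SAVEEXEC when lowering SI_ELSE, and SIOptimizeExecMaskingPreRA::optimizeElseBranch folds it away again when nothing between the two instructions writes exec. Register names below are illustrative only, not taken from any test above.

  ; Else block as emitted by SILowerControlFlow:
  ;   %saved:sreg_64 = S_OR_SAVEEXEC_B64 %src, implicit-def $exec, implicit-def $scc, implicit $exec
  ;   ...                                      ; no writes to $exec in between
  ;   %masked:sreg_64 = S_AND_B64 $exec, %saved, implicit-def $scc
  ;   $exec = S_XOR_B64_term $exec, %masked, implicit-def $scc
  ;
  ; After SIOptimizeExecMaskingPreRA::optimizeElseBranch (the S_AND is erased
  ; and the save-exec result is renamed to the S_AND's destination):
  ;   %masked:sreg_64 = S_OR_SAVEEXEC_B64 %src, implicit-def $exec, implicit-def $scc, implicit $exec
  ;   ...
  ;   $exec = S_XOR_B64_term $exec, %masked, implicit-def $scc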