Index: llvm/trunk/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp
===================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp
+++ llvm/trunk/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp
@@ -87,6 +87,30 @@
   return AMDGPU::NoRegister;
 }
 
+/// If \p MI is one of the S_*_B64 logical ops below with EXEC as a source,
+/// return its destination (operand 0); otherwise return AMDGPU::NoRegister.
+static unsigned isLogicalOpOnExec(const MachineInstr &MI) {
+  switch (MI.getOpcode()) {
+  case AMDGPU::S_AND_B64:
+  case AMDGPU::S_OR_B64:
+  case AMDGPU::S_XOR_B64:
+  case AMDGPU::S_ANDN2_B64:
+  case AMDGPU::S_ORN2_B64:
+  case AMDGPU::S_NAND_B64:
+  case AMDGPU::S_NOR_B64:
+  case AMDGPU::S_XNOR_B64: {
+    const MachineOperand &Src1 = MI.getOperand(1);
+    if (Src1.isReg() && Src1.getReg() == AMDGPU::EXEC)
+      return MI.getOperand(0).getReg();
+    const MachineOperand &Src2 = MI.getOperand(2);
+    if (Src2.isReg() && Src2.getReg() == AMDGPU::EXEC)
+      return MI.getOperand(0).getReg();
+  }
+  }
+
+  return AMDGPU::NoRegister;
+}
+
 static unsigned getSaveExecOp(unsigned Opc) {
   switch (Opc) {
   case AMDGPU::S_AND_B64:
@@ -209,8 +233,24 @@
     // Scan backwards to find the def.
     auto CopyToExecInst = &*I;
     auto CopyFromExecInst = findExecCopy(*TII, MBB, I, CopyToExec);
-    if (CopyFromExecInst == E)
+    if (CopyFromExecInst == E) {
+      auto PrepareExecInst = std::next(I);
+      if (PrepareExecInst == E)
+        continue;
+      // Fold exec = COPY (S_AND_B64 reg, exec) -> exec = S_AND_B64 reg, exec
+      if (CopyToExecInst->getOperand(1).isKill() &&
+          isLogicalOpOnExec(*PrepareExecInst) == CopyToExec) {
+        DEBUG(dbgs() << "Fold exec copy: " << *PrepareExecInst);
+        // Make the logical op define EXEC itself instead of the copied reg.
+        PrepareExecInst->getOperand(0).setReg(AMDGPU::EXEC);
+
+        DEBUG(dbgs() << "into: " << *PrepareExecInst << '\n');
+        // The copy into EXEC is redundant after the retargeting above.
+        CopyToExecInst->eraseFromParent();
+      }
+
       continue;
+    }
 
     if (isLiveOut(MBB, CopyToExec)) {
       // The copied register is live out and has a second use in another block.
Index: llvm/trunk/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp
===================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp
+++ llvm/trunk/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp
@@ -147,6 +147,37 @@
     }
 
     Changed = true;
+
+    // If the only use of saved exec in the removed instruction is S_AND_B64
+    // fold the copy now.
+    // NOTE(review): the comment above calls *Lead "the removed instruction".
+    // If Lead has already been erased before this point, dereferencing it
+    // here is a use-after-free; SaveExec would then have to be computed
+    // before the erase. Confirm against the code preceding this hunk.
+    auto SaveExec = getOrExecSource(*Lead, *TII, MRI);
+    if (!SaveExec || !SaveExec->isFullCopy())
+      continue;
+
+    // The saved-exec register is only replaced by EXEC when every non-debug
+    // use lives in the same block as the copy.
+    unsigned SavedExec = SaveExec->getOperand(0).getReg();
+    bool SafeToReplace = true;
+    for (auto& U : MRI.use_nodbg_instructions(SavedExec)) {
+      if (U.getParent() != SaveExec->getParent()) {
+        SafeToReplace = false;
+        break;
+      }
+    }
+
+    if (SafeToReplace) {
+      // Report once, and only when the copy is really going to be removed.
+      DEBUG(dbgs() << "Redundant EXEC COPY: " << *SaveExec << '\n');
+
+      LIS->RemoveMachineInstrFromMaps(*SaveExec);
+      SaveExec->eraseFromParent();
+      MRI.replaceRegWith(SavedExec, AMDGPU::EXEC);
+      LIS->removeInterval(SavedExec);
+    }
   }
 
   if (Changed) {
Index: llvm/trunk/test/CodeGen/AMDGPU/collapse-endcf.ll
===================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/collapse-endcf.ll
+++ llvm/trunk/test/CodeGen/AMDGPU/collapse-endcf.ll
@@ -4,7 +4,7 @@
 ; GCN: s_and_saveexec_b64 [[SAVEEXEC:s\[[0-9:]+\]]]
 ; GCN-NEXT: ; mask branch [[ENDIF:BB[0-9_]+]]
 ; GCN-NEXT: s_cbranch_execz [[ENDIF]]
-; GCN: s_and_saveexec_b64
+; GCN: s_and_b64 exec, exec, vcc
 ; GCN-NEXT: ; mask branch [[ENDIF]]
 ; GCN-NEXT: {{^BB[0-9_]+}}:
 ; GCN: store_dword
Index: llvm/trunk/test/CodeGen/AMDGPU/reduce-saveexec.mir
===================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/reduce-saveexec.mir
+++ llvm/trunk/test/CodeGen/AMDGPU/reduce-saveexec.mir
@@ -0,0 +1,149 @@
+# RUN: llc -march=amdgcn -verify-machineinstrs -run-pass si-optimize-exec-masking %s -o - | FileCheck -check-prefix=GCN %s
+
+---
+# GCN-LABEL: name: reduce_and_saveexec
+# GCN: %exec = S_AND_B64 %exec, killed %vcc
+# GCN-NEXT: S_ENDPGM
+name: reduce_and_saveexec
+tracksRegLiveness: true
+body: |
+  bb.0:
+    %vcc = IMPLICIT_DEF
+    %sgpr0_sgpr1 = S_AND_B64 %exec, killed %vcc, implicit-def %scc
+    %exec = COPY killed %sgpr0_sgpr1
+    S_ENDPGM
+...
+---
+# GCN-LABEL: name: reduce_and_saveexec_commuted
+# GCN: %exec = S_AND_B64 killed %vcc, %exec
+# GCN-NEXT: S_ENDPGM
+name: reduce_and_saveexec_commuted
+tracksRegLiveness: true
+body: |
+  bb.0:
+    %vcc = IMPLICIT_DEF
+    %sgpr0_sgpr1 = S_AND_B64 killed %vcc, %exec, implicit-def %scc
+    %exec = COPY killed %sgpr0_sgpr1
+    S_ENDPGM
+...
+---
+# The COPY source is not marked killed here, so the fold must not fire.
+# GCN-LABEL: name: reduce_and_saveexec_liveout
+# GCN: %sgpr0_sgpr1 = S_AND_B64 %exec, killed %vcc
+# GCN-NEXT: %exec = COPY
+name: reduce_and_saveexec_liveout
+tracksRegLiveness: true
+body: |
+  bb.0:
+    %vcc = IMPLICIT_DEF
+    %sgpr0_sgpr1 = S_AND_B64 %exec, killed %vcc, implicit-def %scc
+    %exec = COPY %sgpr0_sgpr1
+    S_ENDPGM
+...
+---
+# An explicit copy of exec is present; S_AND_SAVEEXEC_B64 is expected instead.
+# GCN-LABEL: name: and_saveexec
+# GCN: %sgpr0_sgpr1 = S_AND_SAVEEXEC_B64 %vcc
+# GCN-NEXT: S_ENDPGM
+name: and_saveexec
+tracksRegLiveness: true
+body: |
+  bb.0:
+    %vcc = IMPLICIT_DEF
+    %sgpr0_sgpr1 = COPY %exec
+    %sgpr2_sgpr3 = S_AND_B64 %sgpr0_sgpr1, killed %vcc, implicit-def %scc
+    %exec = S_MOV_B64_term %sgpr2_sgpr3
+    S_ENDPGM
+...
+---
+# GCN-LABEL: name: reduce_or_saveexec
+# GCN: %exec = S_OR_B64 %exec, killed %vcc
+# GCN-NEXT: S_ENDPGM
+name: reduce_or_saveexec
+tracksRegLiveness: true
+body: |
+  bb.0:
+    %vcc = IMPLICIT_DEF
+    %sgpr0_sgpr1 = S_OR_B64 %exec, killed %vcc, implicit-def %scc
+    %exec = COPY killed %sgpr0_sgpr1
+    S_ENDPGM
+...
+---
+# GCN-LABEL: name: reduce_xor_saveexec
+# GCN: %exec = S_XOR_B64 %exec, killed %vcc
+# GCN-NEXT: S_ENDPGM
+name: reduce_xor_saveexec
+tracksRegLiveness: true
+body: |
+  bb.0:
+    %vcc = IMPLICIT_DEF
+    %sgpr0_sgpr1 = S_XOR_B64 %exec, killed %vcc, implicit-def %scc
+    %exec = COPY killed %sgpr0_sgpr1
+    S_ENDPGM
+...
+---
+# GCN-LABEL: name: reduce_andn2_saveexec
+# GCN: %exec = S_ANDN2_B64 %exec, killed %vcc
+# GCN-NEXT: S_ENDPGM
+name: reduce_andn2_saveexec
+tracksRegLiveness: true
+body: |
+  bb.0:
+    %vcc = IMPLICIT_DEF
+    %sgpr0_sgpr1 = S_ANDN2_B64 %exec, killed %vcc, implicit-def %scc
+    %exec = COPY killed %sgpr0_sgpr1
+    S_ENDPGM
+...
+---
+# GCN-LABEL: name: reduce_orn2_saveexec
+# GCN: %exec = S_ORN2_B64 %exec, killed %vcc
+# GCN-NEXT: S_ENDPGM
+name: reduce_orn2_saveexec
+tracksRegLiveness: true
+body: |
+  bb.0:
+    %vcc = IMPLICIT_DEF
+    %sgpr0_sgpr1 = S_ORN2_B64 %exec, killed %vcc, implicit-def %scc
+    %exec = COPY killed %sgpr0_sgpr1
+    S_ENDPGM
+...
+---
+# GCN-LABEL: name: reduce_nand_saveexec
+# GCN: %exec = S_NAND_B64 %exec, killed %vcc
+# GCN-NEXT: S_ENDPGM
+name: reduce_nand_saveexec
+tracksRegLiveness: true
+body: |
+  bb.0:
+    %vcc = IMPLICIT_DEF
+    %sgpr0_sgpr1 = S_NAND_B64 %exec, killed %vcc, implicit-def %scc
+    %exec = COPY killed %sgpr0_sgpr1
+    S_ENDPGM
+...
+---
+# GCN-LABEL: name: reduce_nor_saveexec
+# GCN: %exec = S_NOR_B64 %exec, killed %vcc
+# GCN-NEXT: S_ENDPGM
+name: reduce_nor_saveexec
+tracksRegLiveness: true
+body: |
+  bb.0:
+    %vcc = IMPLICIT_DEF
+    %sgpr0_sgpr1 = S_NOR_B64 %exec, killed %vcc, implicit-def %scc
+    %exec = COPY killed %sgpr0_sgpr1
+    S_ENDPGM
+...
+---
+# GCN-LABEL: name: reduce_xnor_saveexec
+# GCN: %exec = S_XNOR_B64 %exec, killed %vcc
+# GCN-NEXT: S_ENDPGM
+name: reduce_xnor_saveexec
+tracksRegLiveness: true
+body: |
+  bb.0:
+    %vcc = IMPLICIT_DEF
+    %sgpr0_sgpr1 = S_XNOR_B64 %exec, killed %vcc, implicit-def %scc
+    %exec = COPY killed %sgpr0_sgpr1
+    S_ENDPGM
+...
+---