Index: llvm/trunk/lib/Target/AMDGPU/AMDGPU.h =================================================================== --- llvm/trunk/lib/Target/AMDGPU/AMDGPU.h +++ llvm/trunk/lib/Target/AMDGPU/AMDGPU.h @@ -60,6 +60,9 @@ void initializeSIFixSGPRCopiesPass(PassRegistry &); extern char &SIFixSGPRCopiesID; +void initializeSIFixVGPRCopiesPass(PassRegistry &); +extern char &SIFixVGPRCopiesID; + void initializeSILowerI1CopiesPass(PassRegistry &); extern char &SILowerI1CopiesID; Index: llvm/trunk/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp =================================================================== --- llvm/trunk/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ llvm/trunk/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -86,6 +86,7 @@ PassRegistry *PR = PassRegistry::getPassRegistry(); initializeSILowerI1CopiesPass(*PR); initializeSIFixSGPRCopiesPass(*PR); + initializeSIFixVGPRCopiesPass(*PR); initializeSIFoldOperandsPass(*PR); initializeSIShrinkInstructionsPass(*PR); initializeSIFixControlFlowLiveIntervalsPass(*PR); @@ -615,6 +616,7 @@ } void GCNPassConfig::addPostRegAlloc() { + addPass(&SIFixVGPRCopiesID); addPass(&SIOptimizeExecMaskingID); TargetPassConfig::addPostRegAlloc(); } Index: llvm/trunk/lib/Target/AMDGPU/CMakeLists.txt =================================================================== --- llvm/trunk/lib/Target/AMDGPU/CMakeLists.txt +++ llvm/trunk/lib/Target/AMDGPU/CMakeLists.txt @@ -68,6 +68,7 @@ SIDebuggerInsertNops.cpp SIFixControlFlowLiveIntervals.cpp SIFixSGPRCopies.cpp + SIFixVGPRCopies.cpp SIFoldOperands.cpp SIFrameLowering.cpp SIInsertSkips.cpp Index: llvm/trunk/lib/Target/AMDGPU/SIFixVGPRCopies.cpp =================================================================== --- llvm/trunk/lib/Target/AMDGPU/SIFixVGPRCopies.cpp +++ llvm/trunk/lib/Target/AMDGPU/SIFixVGPRCopies.cpp @@ -0,0 +1,72 @@ +//===-- SIFixVGPRCopies.cpp - Fix VGPR Copies after regalloc --------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief Add implicit use of exec to vector register copies. +/// +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "AMDGPUSubtarget.h" +#include "SIInstrInfo.h" +#include "llvm/CodeGen/MachineFunctionPass.h" + +using namespace llvm; + +#define DEBUG_TYPE "si-fix-vgpr-copies" + +namespace { + +class SIFixVGPRCopies : public MachineFunctionPass { +public: + static char ID; + +public: + SIFixVGPRCopies() : MachineFunctionPass(ID) { + initializeSIFixVGPRCopiesPass(*PassRegistry::getPassRegistry()); + } + + bool runOnMachineFunction(MachineFunction &MF) override; + + StringRef getPassName() const override { return "SI Fix VGPR copies"; } +}; + +} // End anonymous namespace. + +INITIALIZE_PASS(SIFixVGPRCopies, DEBUG_TYPE, "SI Fix VGPR copies", false, false) + +char SIFixVGPRCopies::ID = 0; + +char &llvm::SIFixVGPRCopiesID = SIFixVGPRCopies::ID; + +bool SIFixVGPRCopies::runOnMachineFunction(MachineFunction &MF) { + const SISubtarget &ST = MF.getSubtarget(); + const SIRegisterInfo *TRI = ST.getRegisterInfo(); + const SIInstrInfo *TII = ST.getInstrInfo(); + bool Changed = false; + + for (MachineBasicBlock &MBB : MF) { + for (MachineInstr &MI : MBB) { + switch (MI.getOpcode()) { + case AMDGPU::COPY: + if (TII->isVGPRCopy(MI) && !MI.readsRegister(AMDGPU::EXEC, TRI)) { + MI.addOperand(MF, + MachineOperand::CreateReg(AMDGPU::EXEC, false, true)); + DEBUG(dbgs() << "Add exec use to " << MI); + Changed = true; + } + break; + default: + break; + } + } + } + + return Changed; +} Index: llvm/trunk/test/CodeGen/AMDGPU/fix-vgpr-copies.mir =================================================================== --- llvm/trunk/test/CodeGen/AMDGPU/fix-vgpr-copies.mir +++ llvm/trunk/test/CodeGen/AMDGPU/fix-vgpr-copies.mir @@ -0,0 +1,44 @@ +# RUN: llc -march=amdgcn -start-after=greedy -stop-after=si-optimize-exec-masking -o - %s | FileCheck %s +# Check that we first do all vector instructions and only then change exec +# CHECK-DAG: COPY %vgpr10_vgpr11 +# CHECK-DAG: COPY %vgpr12_vgpr13 +# CHECK: %exec = COPY + +--- +name: main +alignment: 0 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +tracksRegLiveness: true +liveins: + - { reg: '%sgpr4_sgpr5' } + - { reg: '%sgpr6' } + - { reg: '%vgpr0' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 4 + adjustsStack: false + hasCalls: false + maxCallFrameSize: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false +body: | + bb.0.entry: + liveins: %vgpr3, %vgpr10_vgpr11, %vgpr12_vgpr13 + + %vcc = V_CMP_NE_U32_e64 0, killed %vgpr3, implicit %exec + %sgpr4_sgpr5 = COPY %exec, implicit-def %exec + %sgpr6_sgpr7 = S_AND_B64 %sgpr4_sgpr5, killed %vcc, implicit-def dead %scc + %sgpr4_sgpr5 = S_XOR_B64 %sgpr6_sgpr7, killed %sgpr4_sgpr5, implicit-def dead %scc + %vgpr61_vgpr62 = COPY %vgpr10_vgpr11 + %vgpr155_vgpr156 = COPY %vgpr12_vgpr13 + %exec = S_MOV_B64_term killed %sgpr6_sgpr7 +...