Index: lib/Target/NVPTX/CMakeLists.txt =================================================================== --- lib/Target/NVPTX/CMakeLists.txt +++ lib/Target/NVPTX/CMakeLists.txt @@ -22,6 +22,7 @@ NVPTXLowerAggrCopies.cpp NVPTXLowerKernelArgs.cpp NVPTXLowerAlloca.cpp + NVPTXPeephole.cpp NVPTXMCExpr.cpp NVPTXPrologEpilogPass.cpp NVPTXRegisterInfo.cpp Index: lib/Target/NVPTX/NVPTX.h =================================================================== --- lib/Target/NVPTX/NVPTX.h +++ lib/Target/NVPTX/NVPTX.h @@ -71,6 +71,7 @@ FunctionPass *createNVPTXImageOptimizerPass(); FunctionPass *createNVPTXLowerKernelArgsPass(const NVPTXTargetMachine *TM); BasicBlockPass *createNVPTXLowerAllocaPass(); +MachineFunctionPass *createNVPTXPeephole(); bool isImageOrSamplerVal(const Value *, const Module *); Index: lib/Target/NVPTX/NVPTXFrameLowering.cpp =================================================================== --- lib/Target/NVPTX/NVPTXFrameLowering.cpp +++ lib/Target/NVPTX/NVPTXFrameLowering.cpp @@ -41,28 +41,24 @@ // in the BB, so giving it no debug location. DebugLoc dl = DebugLoc(); - MachineRegisterInfo &MRI = MF.getRegInfo(); - // mov %SPL, %depot; // cvta.local %SP, %SPL; if (static_cast(MF.getTarget()).is64Bit()) { - unsigned LocalReg = MRI.createVirtualRegister(&NVPTX::Int64RegsRegClass); MachineInstr *MI = BuildMI(MBB, MBBI, dl, MF.getSubtarget().getInstrInfo()->get( NVPTX::cvta_local_yes_64), - NVPTX::VRFrame).addReg(LocalReg); + NVPTX::VRFrame).addReg(NVPTX::VRFrameLocal); BuildMI(MBB, MI, dl, MF.getSubtarget().getInstrInfo()->get(NVPTX::MOV_DEPOT_ADDR_64), - LocalReg).addImm(MF.getFunctionNumber()); + NVPTX::VRFrameLocal).addImm(MF.getFunctionNumber()); } else { - unsigned LocalReg = MRI.createVirtualRegister(&NVPTX::Int32RegsRegClass); MachineInstr *MI = BuildMI(MBB, MBBI, dl, MF.getSubtarget().getInstrInfo()->get(NVPTX::cvta_local_yes), - NVPTX::VRFrame).addReg(LocalReg); + NVPTX::VRFrame).addReg(NVPTX::VRFrameLocal); BuildMI(MBB, MI, dl, MF.getSubtarget().getInstrInfo()->get(NVPTX::MOV_DEPOT_ADDR), - LocalReg).addImm(MF.getFunctionNumber()); + NVPTX::VRFrameLocal).addImm(MF.getFunctionNumber()); } } } Index: lib/Target/NVPTX/NVPTXPeephole.cpp =================================================================== --- /dev/null +++ lib/Target/NVPTX/NVPTXPeephole.cpp @@ -0,0 +1,138 @@ +//===-- NVPTXPeephole.cpp - NVPTX Peephole Optimiztions ---------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +// In NVPTX, we always use a special frame register which holds local address +// of frame, NVPTXLowerAlloca may introduce a lot of cvta.to.local instructions, +// this peephole pass optimizes these case, for example +// +// It will transform the following pattern +// %vreg0 = LEA_ADDRi64 , 4 +// %vreg1 = cvta_to_local_yes_64 %vreg0 +// +// into +// %vreg1 = LEA_ADDRi64 %VRFrameLocal, 4 +// +//===----------------------------------------------------------------------===// + +#include "NVPTX.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/Target/TargetRegisterInfo.h" +#include "llvm/Target/TargetInstrInfo.h" +#include + +using namespace llvm; + +#define DEBUG_TYPE "nvptx-peephole" + +namespace llvm { + void initializeNVPTXPeepholePass(PassRegistry&); +} + +namespace { + struct NVPTXPeephole : public MachineFunctionPass { + + public: + static char ID; + NVPTXPeephole() : MachineFunctionPass(ID) { + initializeNVPTXPeepholePass(*PassRegistry::getPassRegistry()); + } + + bool runOnMachineFunction(MachineFunction &MF) override; + + const char *getPassName() const override { + return "NVPTX optimize redundant cvta.to.local instruction"; + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + MachineFunctionPass::getAnalysisUsage(AU); + } + }; +} + +char NVPTXPeephole::ID = 0; + +INITIALIZE_PASS(NVPTXPeephole, "nvptx-peephole", "NVPTX Peephole", + false, false) + +static bool isCVTAToLocalCombinationCandidate(MachineInstr& Root) +{ + auto &MBB = *Root.getParent(); + auto &MF = *MBB.getParent(); + // Check current instruction is cvta.to.local + if (Root.getOpcode() != NVPTX::cvta_to_local_yes_64 && + Root.getOpcode() != NVPTX::cvta_to_local_yes) + return false; + + auto &Op = Root.getOperand(1); + const auto &MRI = MF.getRegInfo(); + MachineInstr *GenericAddrDef = nullptr; + if (Op.isReg() && TargetRegisterInfo::isVirtualRegister(Op.getReg())) { + GenericAddrDef = MRI.getUniqueVRegDef(Op.getReg()); + } + + // Check the register operand is uniquely defined by LEA_ADDRi instruction + if (!GenericAddrDef + || GenericAddrDef->getParent() != &MBB + || (GenericAddrDef->getOpcode() != NVPTX::LEA_ADDRi64 && + GenericAddrDef->getOpcode() != NVPTX::LEA_ADDRi)) { + return false; + } + + // Check the LEA_ADDRi operand is Frame index + auto &BaseAddrOp = GenericAddrDef->getOperand(1); + if (BaseAddrOp.getType() == MachineOperand::MO_FrameIndex) { + return true; + } + + return false; +} + +static void CombineCVTAToLocal(MachineInstr& Root) +{ + auto &MBB = *Root.getParent(); + auto &MF = *MBB.getParent(); + const auto &MRI = MF.getRegInfo(); + const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo(); + auto &Prev = *MRI.getUniqueVRegDef(Root.getOperand(1).getReg()); + MachineInstrBuilder MIB = + BuildMI(MF, Root.getDebugLoc(), TII->get(Prev.getOpcode()), + Root.getOperand(0).getReg()) + .addReg(NVPTX::VRFrameLocal) + .addOperand(Prev.getOperand(2)); + + MBB.insert((MachineBasicBlock::iterator) &Root, (MachineInstr *) MIB); + + // Check if MRI has only one non dbg use, which is Root + if (MRI.hasOneNonDBGUse(Prev.getOperand(0).getReg())) { + Prev.eraseFromParentAndMarkDBGValuesForRemoval(); + } + Root.eraseFromParentAndMarkDBGValuesForRemoval(); +} + +bool NVPTXPeephole::runOnMachineFunction(MachineFunction &MF) { + bool Changed = false; + // Loop over all of the basic blocks. + for (auto &MBB : MF) { + // Traverse the basic block. + auto BlockIter = MBB.begin(); + + while (BlockIter != MBB.end()) { + auto &MI = *BlockIter++; + if (isCVTAToLocalCombinationCandidate(MI)) { + CombineCVTAToLocal(MI); + Changed = true; + } + } // Instruction + } // Basic Block + return Changed; +} + +MachineFunctionPass *llvm::createNVPTXPeephole() { + return new NVPTXPeephole(); +} Index: lib/Target/NVPTX/NVPTXTargetMachine.cpp =================================================================== --- lib/Target/NVPTX/NVPTXTargetMachine.cpp +++ lib/Target/NVPTX/NVPTXTargetMachine.cpp @@ -205,6 +205,8 @@ if (!ST.hasImageHandles()) addPass(createNVPTXReplaceImageHandlesPass()); + addPass(createNVPTXPeephole()); + return false; } Index: test/CodeGen/NVPTX/call-with-alloca-buffer.ll =================================================================== --- test/CodeGen/NVPTX/call-with-alloca-buffer.ll +++ test/CodeGen/NVPTX/call-with-alloca-buffer.ll @@ -20,8 +20,8 @@ %buf = alloca [16 x i8], align 4 ; CHECK: .local .align 4 .b8 __local_depot0[16] -; CHECK: mov.u64 %rd[[BUF_REG:[0-9]+]] -; CHECK: cvta.local.u64 %SP, %rd[[BUF_REG]] +; CHECK: mov.u64 %SPL +; CHECK: cvta.local.u64 %SP, %SPL ; CHECK: ld.param.u64 %rd[[A_REG:[0-9]+]], [kernel_func_param_0] ; CHECK: cvta.to.global.u64 %rd[[A1_REG:[0-9]+]], %rd[[A_REG]] Index: test/CodeGen/NVPTX/local-stack-frame.ll =================================================================== --- test/CodeGen/NVPTX/local-stack-frame.ll +++ test/CodeGen/NVPTX/local-stack-frame.ll @@ -3,12 +3,12 @@ ; Ensure we access the local stack properly -; PTX32: mov.u32 %r{{[0-9]+}}, __local_depot{{[0-9]+}}; -; PTX32: cvta.local.u32 %SP, %r{{[0-9]+}}; +; PTX32: mov.u32 %SPL, __local_depot{{[0-9]+}}; +; PTX32: cvta.local.u32 %SP, %SPL; ; PTX32: ld.param.u32 %r{{[0-9]+}}, [foo_param_0]; ; PTX32: st.volatile.u32 [%SP+0], %r{{[0-9]+}}; -; PTX64: mov.u64 %rd{{[0-9]+}}, __local_depot{{[0-9]+}}; -; PTX64: cvta.local.u64 %SP, %rd{{[0-9]+}}; +; PTX64: mov.u64 %SPL, __local_depot{{[0-9]+}}; +; PTX64: cvta.local.u64 %SP, %SPL; ; PTX64: ld.param.u32 %r{{[0-9]+}}, [foo_param_0]; ; PTX64: st.volatile.u32 [%SP+0], %r{{[0-9]+}}; define void @foo(i32 %a) { @@ -16,3 +16,25 @@ store volatile i32 %a, i32* %local ret void } + +; PTX32: mov.u32 %SPL, __local_depot{{[0-9]+}}; +; PTX32: cvta.local.u32 %SP, %SPL; +; PTX32: ld.param.u32 %r{{[0-9]+}}, [foo2_param_0]; +; PTX32: add.u32 %r[[SP_REG:[0-9]+]], %SPL, 0; +; PTX32: st.local.u32 [%r[[SP_REG]]], %r{{[0-9]+}}; +; PTX64: mov.u64 %SPL, __local_depot{{[0-9]+}}; +; PTX64: cvta.local.u64 %SP, %SPL; +; PTX64: ld.param.u32 %r{{[0-9]+}}, [foo2_param_0]; +; PTX64: add.u64 %rd[[SP_REG:[0-9]+]], %SPL, 0; +; PTX64: st.local.u32 [%rd[[SP_REG]]], %r{{[0-9]+}}; +define void @foo2(i32 %a) { + %local = alloca i32, align 4 + store i32 %a, i32* %local + call void @bar(i32* %local) + ret void +} + +declare void @bar(i32* %a) + +!nvvm.annotations = !{!0} +!0 = !{void (i32)* @foo2, !"kernel", i32 1}