Index: lib/Target/NVPTX/CMakeLists.txt
===================================================================
--- lib/Target/NVPTX/CMakeLists.txt
+++ lib/Target/NVPTX/CMakeLists.txt
@@ -22,6 +22,7 @@
   NVPTXLowerAggrCopies.cpp
   NVPTXLowerKernelArgs.cpp
   NVPTXLowerAlloca.cpp
+  NVPTXPeephole.cpp
   NVPTXMCExpr.cpp
   NVPTXPrologEpilogPass.cpp
   NVPTXRegisterInfo.cpp
Index: lib/Target/NVPTX/NVPTX.h
===================================================================
--- lib/Target/NVPTX/NVPTX.h
+++ lib/Target/NVPTX/NVPTX.h
@@ -71,6 +71,7 @@
 FunctionPass *createNVPTXImageOptimizerPass();
 FunctionPass *createNVPTXLowerKernelArgsPass(const NVPTXTargetMachine *TM);
 BasicBlockPass *createNVPTXLowerAllocaPass();
+MachineFunctionPass *createNVPTXPeephole();
 
 bool isImageOrSamplerVal(const Value *, const Module *);
 
Index: lib/Target/NVPTX/NVPTXFrameLowering.cpp
===================================================================
--- lib/Target/NVPTX/NVPTXFrameLowering.cpp
+++ lib/Target/NVPTX/NVPTXFrameLowering.cpp
@@ -36,33 +36,36 @@
   if (MF.getFrameInfo()->hasStackObjects()) {
     assert(&MF.front() == &MBB && "Shrink-wrapping not yet supported");
     // Insert "mov.u32 %SP, %Depot"
-    MachineBasicBlock::iterator MBBI = MBB.begin();
+    MachineInstr* MI = MBB.begin();
+    MachineRegisterInfo& MR = MF.getRegInfo();
+
     // This instruction really occurs before first instruction
     // in the BB, so giving it no debug location.
     DebugLoc dl = DebugLoc();
-    MachineRegisterInfo &MRI = MF.getRegInfo();
-
     // mov %SPL, %depot;
     // cvta.local %SP, %SPL;
     if (static_cast<const NVPTXTargetMachine &>(MF.getTarget()).is64Bit()) {
-      unsigned LocalReg = MRI.createVirtualRegister(&NVPTX::Int64RegsRegClass);
-      MachineInstr *MI =
-          BuildMI(MBB, MBBI, dl, MF.getSubtarget().getInstrInfo()->get(
-                      NVPTX::cvta_local_yes_64),
-                  NVPTX::VRFrame).addReg(LocalReg);
+      // Check if %SP is actually used
+      if (MR.hasOneNonDBGUse(NVPTX::VRFrame)) {
+        MI = BuildMI(MBB, MI, dl, MF.getSubtarget().getInstrInfo()->get(
+                         NVPTX::cvta_local_yes_64),
+                     NVPTX::VRFrame).addReg(NVPTX::VRFrameLocal);
+      }
+
       BuildMI(MBB, MI, dl,
               MF.getSubtarget().getInstrInfo()->get(NVPTX::MOV_DEPOT_ADDR_64),
-              LocalReg).addImm(MF.getFunctionNumber());
+              NVPTX::VRFrameLocal).addImm(MF.getFunctionNumber());
     } else {
-      unsigned LocalReg = MRI.createVirtualRegister(&NVPTX::Int32RegsRegClass);
-      MachineInstr *MI =
-          BuildMI(MBB, MBBI, dl,
-                  MF.getSubtarget().getInstrInfo()->get(NVPTX::cvta_local_yes),
-                  NVPTX::VRFrame).addReg(LocalReg);
+      // Check if %SP is actually used
+      if (MR.hasOneNonDBGUse(NVPTX::VRFrame)) {
+        MI = BuildMI(MBB, MI, dl,
+                     MF.getSubtarget().getInstrInfo()->get(NVPTX::cvta_local_yes),
+                     NVPTX::VRFrame).addReg(NVPTX::VRFrameLocal);
+      }
       BuildMI(MBB, MI, dl,
               MF.getSubtarget().getInstrInfo()->get(NVPTX::MOV_DEPOT_ADDR),
-              LocalReg).addImm(MF.getFunctionNumber());
+              NVPTX::VRFrameLocal).addImm(MF.getFunctionNumber());
     }
   }
 }
 
Index: lib/Target/NVPTX/NVPTXPeephole.cpp
===================================================================
--- /dev/null
+++ lib/Target/NVPTX/NVPTXPeephole.cpp
@@ -0,0 +1,149 @@
+//===-- NVPTXPeephole.cpp - NVPTX Peephole Optimizations --------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+// In NVPTX, NVPTXFrameLowering will emit following instruction at the beginning
+// of a MachineFunction.
+//
+//   mov %SPL, %depot
+//   cvta.local %SP, %SPL
+//
+// Because Frame Index is a generic address and alloca can only return generic
+// pointer, without this pass the instructions producing alloca'ed address will
+// be based on %SP. NVPTXLowerAlloca tends to help replace store and load on
+// this address with their .local versions, but this may introduce a lot of
+// cvta.to.local instructions. Performance can be improved if we avoid casting
+// address back and forth and directly calculate local address based on %SPL.
+// This peephole pass optimizes these cases, for example
+//
+// It will transform the following pattern
+//    %vreg0 = LEA_ADDRi64 <fi#0>, 4
+//    %vreg1 = cvta_to_local_yes_64 %vreg0
+//
+// into
+//    %vreg1 = LEA_ADDRi64 %VRFrameLocal, 4
+//
+// %VRFrameLocal is the virtual register name of %SPL
+//
+//===----------------------------------------------------------------------===//
+
+#include "NVPTX.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetInstrInfo.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "nvptx-peephole"
+
+namespace llvm {
+void initializeNVPTXPeepholePass(PassRegistry &);
+}
+
+namespace {
+struct NVPTXPeephole : public MachineFunctionPass {
+ public:
+  static char ID;
+  NVPTXPeephole() : MachineFunctionPass(ID) {
+    initializeNVPTXPeepholePass(*PassRegistry::getPassRegistry());
+  }
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+
+  const char *getPassName() const override {
+    return "NVPTX optimize redundant cvta.to.local instruction";
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+};
+}
+
+char NVPTXPeephole::ID = 0;
+
+INITIALIZE_PASS(NVPTXPeephole, "nvptx-peephole", "NVPTX Peephole", false, false)
+
+static bool
+isCVTAToLocalCombinationCandidate(MachineInstr &Root) {
+  auto &MBB = *Root.getParent();
+  auto &MF = *MBB.getParent();
+  // Check current instruction is cvta.to.local
+  if (Root.getOpcode() != NVPTX::cvta_to_local_yes_64 &&
+      Root.getOpcode() != NVPTX::cvta_to_local_yes)
+    return false;
+
+  auto &Op = Root.getOperand(1);
+  const auto &MRI = MF.getRegInfo();
+  MachineInstr *GenericAddrDef = nullptr;
+  if (Op.isReg() && TargetRegisterInfo::isVirtualRegister(Op.getReg())) {
+    GenericAddrDef = MRI.getUniqueVRegDef(Op.getReg());
+  }
+
+  // Check the register operand is uniquely defined by LEA_ADDRi instruction
+  if (!GenericAddrDef || GenericAddrDef->getParent() != &MBB ||
+      (GenericAddrDef->getOpcode() != NVPTX::LEA_ADDRi64 &&
+       GenericAddrDef->getOpcode() != NVPTX::LEA_ADDRi)) {
+    return false;
+  }
+
+  // Check the LEA_ADDRi operand is Frame index
+  auto &BaseAddrOp = GenericAddrDef->getOperand(1);
+  if (BaseAddrOp.getType() == MachineOperand::MO_FrameIndex) {
+    return true;
+  }
+
+  return false;
+}
+
+static void CombineCVTAToLocal(MachineInstr &Root) {
+  auto &MBB = *Root.getParent();
+  auto &MF = *MBB.getParent();
+  const auto &MRI = MF.getRegInfo();
+  const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
+  auto &Prev = *MRI.getUniqueVRegDef(Root.getOperand(1).getReg());
+
+  // Get the correct offset
+  int FrameIndex = Prev.getOperand(1).getIndex();
+  int Offset = MF.getFrameInfo()->getObjectOffset(FrameIndex) +
+               Prev.getOperand(2).getImm();
+
+  MachineInstrBuilder MIB =
+      BuildMI(MF, Root.getDebugLoc(), TII->get(Prev.getOpcode()),
+              Root.getOperand(0).getReg())
+          .addReg(NVPTX::VRFrameLocal)
+          .addOperand(MachineOperand::CreateImm(Offset));
+
+  MBB.insert((MachineBasicBlock::iterator)&Root, MIB);
+
+  // Check if MRI has only one non dbg use, which is Root
+  if (MRI.hasOneNonDBGUse(Prev.getOperand(0).getReg())) {
+    Prev.eraseFromParentAndMarkDBGValuesForRemoval();
+  }
+  Root.eraseFromParentAndMarkDBGValuesForRemoval();
+}
+
+bool
+NVPTXPeephole::runOnMachineFunction(MachineFunction &MF) {
+  bool Changed = false;
+  // Loop over all of the basic blocks.
+  for (auto &MBB : MF) {
+    // Traverse the basic block.
+    auto BlockIter = MBB.begin();
+
+    while (BlockIter != MBB.end()) {
+      auto &MI = *BlockIter++;
+      if (isCVTAToLocalCombinationCandidate(MI)) {
+        CombineCVTAToLocal(MI);
+        Changed = true;
+      }
+    }  // Instruction
+  }  // Basic Block
+  return Changed;
+}
+
+MachineFunctionPass *llvm::createNVPTXPeephole() { return new NVPTXPeephole(); }
Index: lib/Target/NVPTX/NVPTXRegisterInfo.td
===================================================================
--- lib/Target/NVPTX/NVPTXRegisterInfo.td
+++ lib/Target/NVPTX/NVPTXRegisterInfo.td
@@ -65,5 +65,5 @@
 def Float64ArgRegs : NVPTXRegClass<[f64], 64, (add (sequence "da%u", 0, 4))>;
 
 // Read NVPTXRegisterInfo.cpp to see how VRFrame and VRDepot are used.
-def SpecialRegs : NVPTXRegClass<[i32], 32, (add VRFrame, VRDepot,
+def SpecialRegs : NVPTXRegClass<[i32], 32, (add VRFrame, VRFrameLocal, VRDepot,
                                             (sequence "ENVREG%u", 0, 31))>;
Index: lib/Target/NVPTX/NVPTXTargetMachine.cpp
===================================================================
--- lib/Target/NVPTX/NVPTXTargetMachine.cpp
+++ lib/Target/NVPTX/NVPTXTargetMachine.cpp
@@ -205,6 +205,8 @@
   if (!ST.hasImageHandles())
     addPass(createNVPTXReplaceImageHandlesPass());
 
+  addPass(createNVPTXPeephole());
+
   return false;
 }
 
Index: test/CodeGen/NVPTX/call-with-alloca-buffer.ll
===================================================================
--- test/CodeGen/NVPTX/call-with-alloca-buffer.ll
+++ test/CodeGen/NVPTX/call-with-alloca-buffer.ll
@@ -20,8 +20,7 @@
   %buf = alloca [16 x i8], align 4
 
 ; CHECK: .local .align 4 .b8 __local_depot0[16]
-; CHECK: mov.u64 %rd[[BUF_REG:[0-9]+]]
-; CHECK: cvta.local.u64 %SP, %rd[[BUF_REG]]
+; CHECK: mov.u64 %SPL
 
 ; CHECK: ld.param.u64 %rd[[A_REG:[0-9]+]], [kernel_func_param_0]
 ; CHECK: cvta.to.global.u64 %rd[[A1_REG:[0-9]+]], %rd[[A_REG]]
Index: test/CodeGen/NVPTX/local-stack-frame.ll
===================================================================
--- test/CodeGen/NVPTX/local-stack-frame.ll
+++ test/CodeGen/NVPTX/local-stack-frame.ll
@@ -3,12 +3,12 @@
 
 ; Ensure we access the local stack properly
 
-; PTX32: mov.u32 %r{{[0-9]+}}, __local_depot{{[0-9]+}};
-; PTX32: cvta.local.u32 %SP, %r{{[0-9]+}};
+; PTX32: mov.u32 %SPL, __local_depot{{[0-9]+}};
+; PTX32: cvta.local.u32 %SP, %SPL;
 ; PTX32: ld.param.u32 %r{{[0-9]+}}, [foo_param_0];
 ; PTX32: st.volatile.u32 [%SP+0], %r{{[0-9]+}};
-; PTX64: mov.u64 %rd{{[0-9]+}}, __local_depot{{[0-9]+}};
-; PTX64: cvta.local.u64 %SP, %rd{{[0-9]+}};
+; PTX64: mov.u64 %SPL, __local_depot{{[0-9]+}};
+; PTX64: cvta.local.u64 %SP, %SPL;
 ; PTX64: ld.param.u32 %r{{[0-9]+}}, [foo_param_0];
 ; PTX64: st.volatile.u32 [%SP+0], %r{{[0-9]+}};
 define void @foo(i32 %a) {
@@ -16,3 +16,43 @@
   store volatile i32 %a, i32* %local
   ret void
 }
+
+; PTX32: mov.u32 %SPL, __local_depot{{[0-9]+}};
+; PTX32: cvta.local.u32 %SP, %SPL;
+; PTX32: ld.param.u32 %r{{[0-9]+}}, [foo2_param_0];
+; PTX32: add.u32 %r[[SP_REG:[0-9]+]], %SPL, 0;
+; PTX32: st.local.u32 [%r[[SP_REG]]], %r{{[0-9]+}};
+; PTX64: mov.u64 %SPL, __local_depot{{[0-9]+}};
+; PTX64: cvta.local.u64 %SP, %SPL;
+; PTX64: ld.param.u32 %r{{[0-9]+}}, [foo2_param_0];
+; PTX64: add.u64 %rd[[SP_REG:[0-9]+]], %SPL, 0;
+; PTX64: st.local.u32 [%rd[[SP_REG]]], %r{{[0-9]+}};
+define void @foo2(i32 %a) {
+  %local = alloca i32, align 4
+  store i32 %a, i32* %local
+  call void @bar(i32* %local)
+  ret void
+}
+
+declare void @bar(i32* %a)
+
+!nvvm.annotations = !{!0}
+!0 = !{void (i32)* @foo2, !"kernel", i32 1}
+
+; PTX32: mov.u32 %SPL, __local_depot{{[0-9]+}};
+; PTX32-NOT: cvta.local.u32 %SP, %SPL;
+; PTX32: ld.param.u32 %r{{[0-9]+}}, [foo3_param_0];
+; PTX32: add.u32 %r{{[0-9]+}}, %SPL, 0;
+; PTX32: st.local.u32 [%r{{[0-9]+}}], %r{{[0-9]+}};
+; PTX64: mov.u64 %SPL, __local_depot{{[0-9]+}};
+; PTX64-NOT: cvta.local.u64 %SP, %SPL;
+; PTX64: ld.param.u32 %r{{[0-9]+}}, [foo3_param_0];
+; PTX64: add.u64 %rd{{[0-9]+}}, %SPL, 0;
+; PTX64: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}};
+define void @foo3(i32 %a) {
+  %local = alloca [3 x i32], align 4
+  %1 = bitcast [3 x i32]* %local to i32*
+  %2 = getelementptr inbounds i32, i32* %1, i32 %a
+  store i32 %a, i32* %2
+  ret void
+}