Index: lib/Target/NVPTX/CMakeLists.txt
===================================================================
--- lib/Target/NVPTX/CMakeLists.txt
+++ lib/Target/NVPTX/CMakeLists.txt
@@ -32,6 +32,7 @@
   NVPTXTargetMachine.cpp
   NVPTXTargetTransformInfo.cpp
   NVPTXUtilities.cpp
+  NVVMIntrRange.cpp
   NVVMReflect.cpp
   )
 
Index: lib/Target/NVPTX/NVPTX.h
===================================================================
--- lib/Target/NVPTX/NVPTX.h
+++ lib/Target/NVPTX/NVPTX.h
@@ -47,6 +47,7 @@
 ModulePass *createGenericToNVVMPass();
 FunctionPass *createNVPTXFavorNonGenericAddrSpacesPass();
 FunctionPass *createNVPTXInferAddressSpacesPass();
+FunctionPass *createNVVMIntrRangePass(unsigned int SmVersion);
 FunctionPass *createNVVMReflectPass();
 FunctionPass *createNVVMReflectPass(const StringMap<int> &Mapping);
 MachineFunctionPass *createNVPTXPrologEpilogPass();
Index: lib/Target/NVPTX/NVPTXTargetMachine.cpp
===================================================================
--- lib/Target/NVPTX/NVPTXTargetMachine.cpp
+++ lib/Target/NVPTX/NVPTXTargetMachine.cpp
@@ -55,6 +55,7 @@
              "NVPTXFavorNonGenericAddrSpaces"));
 
 namespace llvm {
+void initializeNVVMIntrRangePass(PassRegistry&);
 void initializeNVVMReflectPass(PassRegistry&);
 void initializeGenericToNVVMPass(PassRegistry&);
 void initializeNVPTXAllocaHoistingPass(PassRegistry &);
@@ -75,6 +76,7 @@
   // but it's very NVPTX-specific.
   PassRegistry &PR = *PassRegistry::getPassRegistry();
   initializeNVVMReflectPass(PR);
+  initializeNVVMIntrRangePass(PR);
   initializeGenericToNVVMPass(PR);
   initializeNVPTXAllocaHoistingPass(PR);
   initializeNVPTXAssignValidGlobalNamesPass(PR);
@@ -176,6 +178,7 @@
 
 void NVPTXTargetMachine::addEarlyAsPossiblePasses(PassManagerBase &PM) {
   PM.add(createNVVMReflectPass());
+  PM.add(createNVVMIntrRangePass(Subtarget.getSmVersion()));
 }
 
 TargetIRAnalysis NVPTXTargetMachine::getTargetIRAnalysis() {
Index: lib/Target/NVPTX/NVVMIntrRange.cpp
===================================================================
--- /dev/null
+++ lib/Target/NVPTX/NVVMIntrRange.cpp
@@ -0,0 +1,174 @@
+//===- NVVMIntrRange.cpp - Set !range metadata for NVVM intrinsics --------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass adds appropriate !range metadata for calls to NVVM
+// intrinsics that return a limited range of values.
+//
+//===----------------------------------------------------------------------===//
+
+#include "NVPTX.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/Support/CommandLine.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "nvvm-intr-range"
+
+namespace llvm { void initializeNVVMIntrRangePass(PassRegistry &); }
+
+// SM variant whose limits are used when the pass is constructed without an
+// explicit SM version.
+static cl::opt<unsigned> NVVMIntrRangeSM("nvvm-intr-range-sm", cl::init(20),
+                                         cl::Hidden, cl::desc("SM variant"));
+
+namespace {
+/// Adds !range metadata to calls of the PTX/NVVM special-register read
+/// intrinsics handled in runOnFunction().
+class NVVMIntrRange : public FunctionPass {
+private:
+  // Per-dimension limits from which the ranges are derived.
+  struct {
+    unsigned x, y, z;
+  } MaxBlockSize, MaxGridSize;
+
+public:
+  static char ID;
+
+  // Uses the SM variant selected by -nvvm-intr-range-sm (default 20).
+  NVVMIntrRange() : NVVMIntrRange(NVVMIntrRangeSM) {}
+
+  // The x grid limit is 0x7fffffff for SmVersion >= 30 and 0xffff otherwise;
+  // all other limits do not depend on SmVersion.
+  explicit NVVMIntrRange(unsigned int SmVersion)
+      : FunctionPass(ID), MaxBlockSize{1024, 1024, 64},
+        MaxGridSize{SmVersion >= 30 ? 0x7fffffffu : 0xffffu, 0xffff, 0xffff} {
+    initializeNVVMIntrRangePass(*PassRegistry::getPassRegistry());
+  }
+
+  bool runOnFunction(Function &) override;
+};
+} // end anonymous namespace
+
+FunctionPass *llvm::createNVVMIntrRangePass(unsigned int SmVersion) {
+  return new NVVMIntrRange(SmVersion);
+}
+
+char NVVMIntrRange::ID = 0;
+INITIALIZE_PASS(NVVMIntrRange, "nvvm-intr-range",
+                "Add !range metadata to NVVM intrinsics.", false, false)
+
+// Attaches !range metadata for the half-open interval [Low, High) to the
+// call instruction C. Returns true if metadata was added, and false if C
+// already had !range metadata, which is left untouched.
+static bool addRangeMetadata(uint64_t Low, uint64_t High, CallInst *C) {
+  // Do not overwrite a range that is already attached to the call.
+  if (C->getMetadata(LLVMContext::MD_range))
+    return false;
+
+  LLVMContext &Context = C->getContext();
+  IntegerType *Int32Ty = Type::getInt32Ty(Context);
+  Metadata *LowAndHigh[] = {
+      ConstantAsMetadata::get(ConstantInt::get(Int32Ty, Low)),
+      ConstantAsMetadata::get(ConstantInt::get(Int32Ty, High))};
+  C->setMetadata(LLVMContext::MD_range, MDNode::get(Context, LowAndHigh));
+  return true;
+}
+
+bool NVVMIntrRange::runOnFunction(Function &F) {
+  // The warp size is a constant 32.
+  const unsigned WarpSize = 32;
+
+  // Go through the calls in this function.
+  bool Changed = false;
+  for (Instruction &I : instructions(F)) {
+    CallInst *Call = dyn_cast<CallInst>(&I);
+    if (!Call)
+      continue;
+
+    Function *Callee = Call->getCalledFunction();
+    if (!Callee)
+      continue;
+
+    switch (Callee->getIntrinsicID()) {
+    // Index within block
+    case Intrinsic::ptx_read_tid_x:
+    case Intrinsic::nvvm_read_ptx_sreg_tid_x:
+      Changed |= addRangeMetadata(0, MaxBlockSize.x, Call);
+      break;
+    case Intrinsic::ptx_read_tid_y:
+    case Intrinsic::nvvm_read_ptx_sreg_tid_y:
+      Changed |= addRangeMetadata(0, MaxBlockSize.y, Call);
+      break;
+    case Intrinsic::ptx_read_tid_z:
+    case Intrinsic::nvvm_read_ptx_sreg_tid_z:
+      Changed |= addRangeMetadata(0, MaxBlockSize.z, Call);
+      break;
+
+    // Block size
+    case Intrinsic::ptx_read_ntid_x:
+    case Intrinsic::nvvm_read_ptx_sreg_ntid_x:
+      Changed |= addRangeMetadata(1, MaxBlockSize.x + 1, Call);
+      break;
+    case Intrinsic::ptx_read_ntid_y:
+    case Intrinsic::nvvm_read_ptx_sreg_ntid_y:
+      Changed |= addRangeMetadata(1, MaxBlockSize.y + 1, Call);
+      break;
+    case Intrinsic::ptx_read_ntid_z:
+    case Intrinsic::nvvm_read_ptx_sreg_ntid_z:
+      Changed |= addRangeMetadata(1, MaxBlockSize.z + 1, Call);
+      break;
+
+    // Index within grid
+    case Intrinsic::ptx_read_ctaid_x:
+    case Intrinsic::nvvm_read_ptx_sreg_ctaid_x:
+      Changed |= addRangeMetadata(0, MaxGridSize.x, Call);
+      break;
+    case Intrinsic::ptx_read_ctaid_y:
+    case Intrinsic::nvvm_read_ptx_sreg_ctaid_y:
+      Changed |= addRangeMetadata(0, MaxGridSize.y, Call);
+      break;
+    case Intrinsic::ptx_read_ctaid_z:
+    case Intrinsic::nvvm_read_ptx_sreg_ctaid_z:
+      Changed |= addRangeMetadata(0, MaxGridSize.z, Call);
+      break;
+
+    // Grid size
+    case Intrinsic::ptx_read_nctaid_x:
+    case Intrinsic::nvvm_read_ptx_sreg_nctaid_x:
+      Changed |= addRangeMetadata(1, MaxGridSize.x + 1, Call);
+      break;
+    case Intrinsic::ptx_read_nctaid_y:
+    case Intrinsic::nvvm_read_ptx_sreg_nctaid_y:
+      Changed |= addRangeMetadata(1, MaxGridSize.y + 1, Call);
+      break;
+    case Intrinsic::ptx_read_nctaid_z:
+    case Intrinsic::nvvm_read_ptx_sreg_nctaid_z:
+      Changed |= addRangeMetadata(1, MaxGridSize.z + 1, Call);
+      break;
+
+    // Warp size is constant.
+    case Intrinsic::nvvm_read_ptx_sreg_warpsize:
+      Changed |= addRangeMetadata(WarpSize, WarpSize + 1, Call);
+      break;
+
+    // Lane ID is [0..warpsize)
+    case Intrinsic::ptx_read_laneid:
+      Changed |= addRangeMetadata(0, WarpSize, Call);
+      break;
+
+    default:
+      break;
+    }
+  }
+
+  return Changed;
+}
Index: test/CodeGen/NVPTX/intrinsic-old.ll
===================================================================
--- test/CodeGen/NVPTX/intrinsic-old.ll
+++ test/CodeGen/NVPTX/intrinsic-old.ll
@@ -1,8 +1,14 @@
 ; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
 ; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s
+; RUN: opt < %s -S -mtriple=nvptx-nvidia-cuda -nvvm-intr-range \
+; RUN:   | FileCheck --check-prefix=RANGE --check-prefix=RANGE_20 %s
+; RUN: opt < %s -S -mtriple=nvptx-nvidia-cuda \
+; RUN:    -nvvm-intr-range -nvvm-intr-range-sm=30 \
+; RUN:   | FileCheck --check-prefix=RANGE --check-prefix=RANGE_30 %s
 
 define ptx_device i32 @test_tid_x() {
 ; CHECK: mov.u32 %r{{[0-9]+}}, %tid.x;
+; RANGE: call i32 @llvm.ptx.read.tid.x(), !range ![[BLK_IDX_XY:[0-9]+]]
 ; CHECK: ret;
 	%x = call i32 @llvm.ptx.read.tid.x()
 	ret i32 %x
@@ -10,6 +16,7 @@
 
 define ptx_device i32 @test_tid_y() {
 ; CHECK: mov.u32 %r{{[0-9]+}}, %tid.y;
+; RANGE: call i32 @llvm.ptx.read.tid.y(), !range ![[BLK_IDX_XY]]
 ; CHECK: ret;
 	%x = call i32 @llvm.ptx.read.tid.y()
 	ret i32 %x
@@ -17,6 +24,7 @@
 
 define ptx_device i32 @test_tid_z() {
 ; CHECK: mov.u32 %r{{[0-9]+}}, %tid.z;
+; RANGE: call i32 @llvm.ptx.read.tid.z(), !range ![[BLK_IDX_Z:[0-9]+]]
 ; CHECK: ret;
 	%x = call i32 @llvm.ptx.read.tid.z()
 	ret i32 %x
@@ -31,6 +39,7 @@
 
 define ptx_device i32 @test_ntid_x() {
 ; CHECK: mov.u32 %r{{[0-9]+}}, %ntid.x;
+; RANGE: call i32 @llvm.ptx.read.ntid.x(), !range ![[BLK_SIZE_XY:[0-9]+]]
 ; CHECK: ret;
 	%x = call i32 @llvm.ptx.read.ntid.x()
 	ret i32 %x
@@ -38,6 +47,7 @@
 
 define ptx_device i32 @test_ntid_y() {
 ; CHECK: mov.u32 %r{{[0-9]+}}, %ntid.y;
+; RANGE: call i32 @llvm.ptx.read.ntid.y(), !range ![[BLK_SIZE_XY]]
 ; CHECK: ret;
 	%x = call i32 @llvm.ptx.read.ntid.y()
 	ret i32 %x
@@ -45,6 +55,7 @@
 
 define ptx_device i32 @test_ntid_z() {
 ; CHECK: mov.u32 %r{{[0-9]+}}, %ntid.z;
+; RANGE: call i32 @llvm.ptx.read.ntid.z(), !range ![[BLK_SIZE_Z:[0-9]+]]
 ; CHECK: ret;
 	%x = call i32 @llvm.ptx.read.ntid.z()
 	ret i32 %x
@@ -59,6 +70,7 @@
 
 define ptx_device i32 @test_laneid() {
 ; CHECK: mov.u32 %r{{[0-9]+}}, %laneid;
+; RANGE: call i32 @llvm.ptx.read.laneid(), !range ![[LANEID:[0-9]+]]
 ; CHECK: ret;
 	%x = call i32 @llvm.ptx.read.laneid()
 	ret i32 %x
@@ -78,15 +90,9 @@
 	ret i32 %x
 }
 
-define ptx_device i32 @test_ctaid_x() {
-; CHECK: mov.u32 %r{{[0-9]+}}, %ctaid.x;
-; CHECK: ret;
-	%x = call i32 @llvm.ptx.read.ctaid.x()
-	ret i32 %x
-}
-
 define ptx_device i32 @test_ctaid_y() {
 ; CHECK: mov.u32 %r{{[0-9]+}}, %ctaid.y;
+; RANGE: call i32 @llvm.ptx.read.ctaid.y(), !range ![[GRID_IDX_YZ:[0-9]+]]
 ; CHECK: ret;
 	%x = call i32 @llvm.ptx.read.ctaid.y()
 	ret i32 %x
@@ -94,11 +100,21 @@
 
 define ptx_device i32 @test_ctaid_z() {
 ; CHECK: mov.u32 %r{{[0-9]+}}, %ctaid.z;
+; RANGE: call i32 @llvm.ptx.read.ctaid.z(), !range ![[GRID_IDX_YZ]]
 ; CHECK: ret;
 	%x = call i32 @llvm.ptx.read.ctaid.z()
 	ret i32 %x
 }
 
+define ptx_device i32 @test_ctaid_x() {
+; CHECK: mov.u32 %r{{[0-9]+}}, %ctaid.x;
+; RANGE_30: call i32 @llvm.ptx.read.ctaid.x(), !range ![[GRID_IDX_X:[0-9]+]]
+; RANGE_20: call i32 @llvm.ptx.read.ctaid.x(), !range ![[GRID_IDX_YZ]]
+; CHECK: ret;
+	%x = call i32 @llvm.ptx.read.ctaid.x()
+	ret i32 %x
+}
+
 define ptx_device i32 @test_ctaid_w() {
 ; CHECK: mov.u32 %r{{[0-9]+}}, %ctaid.w;
 ; CHECK: ret;
@@ -106,15 +122,9 @@
 	ret i32 %x
 }
 
-define ptx_device i32 @test_nctaid_x() {
-; CHECK: mov.u32 %r{{[0-9]+}}, %nctaid.x;
-; CHECK: ret;
-	%x = call i32 @llvm.ptx.read.nctaid.x()
-	ret i32 %x
-}
-
 define ptx_device i32 @test_nctaid_y() {
 ; CHECK: mov.u32 %r{{[0-9]+}}, %nctaid.y;
+; RANGE: call i32 @llvm.ptx.read.nctaid.y(), !range ![[GRID_SIZE_YZ:[0-9]+]]
 ; CHECK: ret;
 	%x = call i32 @llvm.ptx.read.nctaid.y()
 	ret i32 %x
@@ -122,11 +132,22 @@
 
 define ptx_device i32 @test_nctaid_z() {
 ; CHECK: mov.u32 %r{{[0-9]+}}, %nctaid.z;
+; RANGE: call i32 @llvm.ptx.read.nctaid.z(), !range ![[GRID_SIZE_YZ]]
 ; CHECK: ret;
 	%x = call i32 @llvm.ptx.read.nctaid.z()
 	ret i32 %x
 }
 
+define ptx_device i32 @test_nctaid_x() {
+; CHECK: mov.u32 %r{{[0-9]+}}, %nctaid.x;
+; RANGE_30: call i32 @llvm.ptx.read.nctaid.x(), !range ![[GRID_SIZE_X:[0-9]+]]
+; RANGE_20: call i32 @llvm.ptx.read.nctaid.x(), !range ![[GRID_SIZE_YZ]]
+; CHECK: ret;
+	%x = call i32 @llvm.ptx.read.nctaid.x()
+	ret i32 %x
+}
+
+
 define ptx_device i32 @test_nctaid_w() {
 ; CHECK: mov.u32 %r{{[0-9]+}}, %nctaid.w;
 ; CHECK: ret;
@@ -280,3 +301,14 @@
 declare i32 @llvm.ptx.read.pm3()
 
 declare void @llvm.ptx.bar.sync(i32 %i)
+
+; RANGE-DAG: ![[BLK_IDX_XY]] = !{i32 0, i32 1024}
+; RANGE-DAG: ![[BLK_IDX_Z]] = !{i32 0, i32 64}
+; RANGE-DAG: ![[BLK_SIZE_XY]] = !{i32 1, i32 1025}
+; RANGE-DAG: ![[BLK_SIZE_Z]] = !{i32 1, i32 65}
+; RANGE-DAG: ![[LANEID]] = !{i32 0, i32 32}
+; RANGE_30-DAG: ![[GRID_IDX_X]] = !{i32 0, i32 2147483647}
+; RANGE-DAG: ![[GRID_IDX_YZ]] = !{i32 0, i32 65535}
+; RANGE_30-DAG: ![[GRID_SIZE_X]] = !{i32 1, i32 -2147483648}
+; RANGE-DAG: ![[GRID_SIZE_YZ]] = !{i32 1, i32 65536}
+