diff --git a/clang/lib/CodeGen/TargetInfo.cpp b/clang/lib/CodeGen/TargetInfo.cpp --- a/clang/lib/CodeGen/TargetInfo.cpp +++ b/clang/lib/CodeGen/TargetInfo.cpp @@ -6482,12 +6482,20 @@ if (auto *ASC = llvm::dyn_cast_or_null(C)) C = llvm::cast(ASC->getPointerOperand()); if (auto *GV = llvm::dyn_cast_or_null(C)) { + // According to [NVVM IR Spec][1], `nvvm.texsurf.handle` should be used + // to access texture/surface memory. The first argument to that intrinsic + // is a metadata holding the texture or surface variable. The second + // argument to that intrinsic is the texture or surface variable itself. + // --- + // [1]: https://docs.nvidia.com/cuda/nvvm-ir-spec/index.html + llvm::Value *MD = llvm::MetadataAsValue::get( + CGF.getLLVMContext(), llvm::ConstantAsMetadata::get(GV)); // Load the handle from the specific global variable using - // `nvvm.texsurf.handle.internal` intrinsic. + // `nvvm.texsurf.handle` intrinsic. Handle = CGF.EmitRuntimeCall( - CGF.CGM.getIntrinsic(llvm::Intrinsic::nvvm_texsurf_handle_internal, + CGF.CGM.getIntrinsic(llvm::Intrinsic::nvvm_texsurf_handle, {GV->getType()}), - {GV}, "texsurf_handle"); + {MD, GV}, "texsurf_handle"); } else Handle = CGF.EmitLoadOfScalar(Src, SourceLocation()); CGF.EmitStoreOfScalar(Handle, Dst); diff --git a/clang/test/CodeGenCUDA/surface.cu b/clang/test/CodeGenCUDA/surface.cu --- a/clang/test/CodeGenCUDA/surface.cu +++ b/clang/test/CodeGenCUDA/surface.cu @@ -28,7 +28,7 @@ __attribute__((device)) int suld_2d_zero(surface, int, int) asm("llvm.nvvm.suld.2d.i32.zero"); // DEVICE-LABEL: i32 @_Z3fooii(i32 %x, i32 %y) -// DEVICE: call i64 @llvm.nvvm.texsurf.handle.internal.p1i64(i64 addrspace(1)* @surf) +// DEVICE: call i64 @llvm.nvvm.texsurf.handle.p1i64(metadata [[SURF:.*]], [[SURF]]) // DEVICE: call i32 @llvm.nvvm.suld.2d.i32.zero(i64 %{{.*}}, i32 %{{.*}}, i32 %{{.*}}) __attribute__((device)) int foo(int x, int y) { return suld_2d_zero(surf, x, y); diff --git a/clang/test/CodeGenCUDA/texture.cu b/clang/test/CodeGenCUDA/texture.cu --- 
a/clang/test/CodeGenCUDA/texture.cu +++ b/clang/test/CodeGenCUDA/texture.cu @@ -37,9 +37,9 @@ __attribute__((device)) v4f tex2d_ld(texture, int, int) asm("llvm.nvvm.tex.unified.2d.v4f32.s32"); // DEVICE-LABEL: float @_Z3fooff(float %x, float %y) -// DEVICE: call i64 @llvm.nvvm.texsurf.handle.internal.p1i64(i64 addrspace(1)* @tex) +// DEVICE: call i64 @llvm.nvvm.texsurf.handle.p1i64(metadata [[TEX:.*]], [[TEX]]) // DEVICE: call %struct.v4f @llvm.nvvm.tex.unified.2d.v4f32.f32(i64 %{{.*}}, float %{{.*}}, float %{{.*}}) -// DEVICE: call i64 @llvm.nvvm.texsurf.handle.internal.p1i64(i64 addrspace(1)* @norm) +// DEVICE: call i64 @llvm.nvvm.texsurf.handle.p1i64(metadata [[NORM:.*]], [[NORM]]) // DEVICE: call %struct.v4f @llvm.nvvm.tex.unified.2d.v4f32.s32(i64 %{{.*}}, i32 %{{.*}}, i32 %{{.*}}) __attribute__((device)) float foo(float x, float y) { return tex2d_ld(tex, x, y).x + tex2d_ld(norm, int(x), int(y)).x; diff --git a/llvm/lib/Target/NVPTX/CMakeLists.txt b/llvm/lib/Target/NVPTX/CMakeLists.txt --- a/llvm/lib/Target/NVPTX/CMakeLists.txt +++ b/llvm/lib/Target/NVPTX/CMakeLists.txt @@ -19,20 +19,21 @@ NVPTXImageOptimizer.cpp NVPTXInstrInfo.cpp NVPTXLowerAggrCopies.cpp - NVPTXLowerArgs.cpp NVPTXLowerAlloca.cpp - NVPTXPeephole.cpp + NVPTXLowerArgs.cpp NVPTXMCExpr.cpp + NVPTXPeephole.cpp NVPTXPrologEpilogPass.cpp + NVPTXProxyRegErasure.cpp NVPTXRegisterInfo.cpp NVPTXReplaceImageHandles.cpp NVPTXSubtarget.cpp NVPTXTargetMachine.cpp NVPTXTargetTransformInfo.cpp + NVPTXTexSurfHandleInternalizer.cpp NVPTXUtilities.cpp NVVMIntrRange.cpp NVVMReflect.cpp - NVPTXProxyRegErasure.cpp ) add_llvm_target(NVPTXCodeGen ${NVPTXCodeGen_sources}) diff --git a/llvm/lib/Target/NVPTX/NVPTX.h b/llvm/lib/Target/NVPTX/NVPTX.h --- a/llvm/lib/Target/NVPTX/NVPTX.h +++ b/llvm/lib/Target/NVPTX/NVPTX.h @@ -47,6 +47,7 @@ FunctionPass *createNVPTXLowerAllocaPass(); MachineFunctionPass *createNVPTXPeephole(); MachineFunctionPass *createNVPTXProxyRegErasurePass(); +FunctionPass 
*createNVPTXTexSurfHandleInternalizerPass(); namespace NVPTX { enum DrvInterface { diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp b/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp --- a/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp @@ -161,6 +161,7 @@ } void addIRPasses() override; + bool addPreISel() override; bool addInstSelector() override; void addPreRegAlloc() override; void addPostRegAlloc() override; @@ -300,6 +301,11 @@ } } +bool NVPTXPassConfig::addPreISel() { + addPass(createNVPTXTexSurfHandleInternalizerPass()); + return false; +} + bool NVPTXPassConfig::addInstSelector() { const NVPTXSubtarget &ST = *getTM().getSubtargetImpl(); diff --git a/llvm/lib/Target/NVPTX/NVPTXTexSurfHandleInternalizer.cpp b/llvm/lib/Target/NVPTX/NVPTXTexSurfHandleInternalizer.cpp new file mode 100644 --- /dev/null +++ b/llvm/lib/Target/NVPTX/NVPTXTexSurfHandleInternalizer.cpp @@ -0,0 +1,91 @@ +//===- NVPTXTexSurfHandleInternalizer.cpp - --------------------*- C++ -*--===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// \file +// +// According to [NVVM IR Spec][1], `nvvm.texsurf.handle` should be used to +// access texture/surface memory. The first argument to that intrinsic is a +// metadata holding the texture or surface variable. The second argument to +// that intrinsic is the texture or surface variable itself. However, the first +// metadata argument cannot be handled directly by the NVPTX backend, which +// only handles its internal version, i.e., `nvvm.texsurf.handle.internal`. This +// pass, arranged just before the code selection, replaces +// `nvvm.texsurf.handle` intrinsics with their internal version, i.e., +// `nvvm.texsurf.handle.internal`. 
+// --- +// [1]: https://docs.nvidia.com/cuda/nvvm-ir-spec/index.html +// +//===----------------------------------------------------------------------===// + +#include "NVPTX.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/IntrinsicsNVPTX.h" +#include "llvm/Pass.h" + +using namespace llvm; + +#define DEBUG_TYPE "nvptx-texsurf-handle-internalizer" + +namespace llvm { +void initializeTexSurfHandleInternalizerPass(PassRegistry &); +} // namespace llvm + +namespace { + +class TexSurfHandleInternalizer : public FunctionPass { +public: + static char ID; + + TexSurfHandleInternalizer() : FunctionPass(ID) { + initializeTexSurfHandleInternalizerPass(*PassRegistry::getPassRegistry()); + } + + StringRef getPassName() const override { + return "Internalize `nvvm.texsurf.handle` intrinsics"; + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + } + + bool runOnFunction(Function &F) override { + bool Changed = false; + for (auto &BB : F) + for (auto BI = BB.begin(), BE = BB.end(); BI != BE; /*EMPTY*/) { + IntrinsicInst *II = dyn_cast<IntrinsicInst>(&*BI++); + if (!II || II->getIntrinsicID() != Intrinsic::nvvm_texsurf_handle) + continue; + assert(II->getArgOperand(1) == + cast<ConstantAsMetadata>( + cast<MetadataAsValue>(II->getArgOperand(0))->getMetadata()) + ->getValue()); + // Replace it with the internal version. 
+ IRBuilder<> Builder(II); + auto *NewII = Builder.CreateUnaryIntrinsic( + Intrinsic::nvvm_texsurf_handle_internal, II->getArgOperand(1)); + II->replaceAllUsesWith(NewII); + II->eraseFromParent(); + Changed = true; + } + return Changed; + } +}; + +} // end of anonymous namespace + +FunctionPass *llvm::createNVPTXTexSurfHandleInternalizerPass() { + return new TexSurfHandleInternalizer(); +} + +char TexSurfHandleInternalizer::ID = 0; + +INITIALIZE_PASS(TexSurfHandleInternalizer, "nvptx-texsurf-handle-internalizer", + "Internalize texsurf-handle intrinsic", false, false) diff --git a/llvm/test/CodeGen/NVPTX/tex-read-cuda.ll b/llvm/test/CodeGen/NVPTX/tex-read-cuda.ll --- a/llvm/test/CodeGen/NVPTX/tex-read-cuda.ll +++ b/llvm/test/CodeGen/NVPTX/tex-read-cuda.ll @@ -6,6 +6,7 @@ declare { float, float, float, float } @llvm.nvvm.tex.unified.1d.v4f32.s32(i64, i32) declare i64 @llvm.nvvm.texsurf.handle.internal.p1i64(i64 addrspace(1)*) +declare i64 @llvm.nvvm.texsurf.handle.p1i64(metadata, i64 addrspace(1)*) ; SM20-LABEL: .entry foo ; SM30-LABEL: .entry foo @@ -28,7 +29,7 @@ ; SM20-LABEL: .entry bar ; SM30-LABEL: .entry bar define void @bar(float* %red, i32 %idx) { -; SM30: mov.u64 %rd[[TEXHANDLE:[0-9]+]], tex0 +; SM30: mov.u64 %rd[[TEXHANDLE:[0-9]+]], tex0 %texHandle = tail call i64 @llvm.nvvm.texsurf.handle.internal.p1i64(i64 addrspace(1)* @tex0) ; SM20: tex.1d.v4.f32.s32 {%f[[RED:[0-9]+]], %f[[GREEN:[0-9]+]], %f[[BLUE:[0-9]+]], %f[[ALPHA:[0-9]+]]}, [tex0, {%r{{[0-9]+}}}] ; SM30: tex.1d.v4.f32.s32 {%f[[RED:[0-9]+]], %f[[GREEN:[0-9]+]], %f[[BLUE:[0-9]+]], %f[[ALPHA:[0-9]+]]}, [%rd[[TEXHANDLE]], {%r{{[0-9]+}}}] @@ -40,7 +41,24 @@ ret void } -!nvvm.annotations = !{!1, !2, !3} +; SM20-LABEL: .entry bax +; SM30-LABEL: .entry bax +define void @bax(float* %red, i32 %idx) { +; SM30: mov.u64 %rd[[TEXHANDLE:[0-9]+]], tex0 + %texHandle = tail call i64 @llvm.nvvm.texsurf.handle.p1i64(metadata !5, i64 addrspace(1)* @tex0) +; SM20: tex.1d.v4.f32.s32 {%f[[RED:[0-9]+]], %f[[GREEN:[0-9]+]], 
%f[[BLUE:[0-9]+]], %f[[ALPHA:[0-9]+]]}, [tex0, {%r{{[0-9]+}}}] +; SM30: tex.1d.v4.f32.s32 {%f[[RED:[0-9]+]], %f[[GREEN:[0-9]+]], %f[[BLUE:[0-9]+]], %f[[ALPHA:[0-9]+]]}, [%rd[[TEXHANDLE]], {%r{{[0-9]+}}}] + %val = tail call { float, float, float, float } @llvm.nvvm.tex.unified.1d.v4f32.s32(i64 %texHandle, i32 %idx) + %ret = extractvalue { float, float, float, float } %val, 0 +; SM20: st.global.f32 [%r{{[0-9]+}}], %f[[RED]] +; SM30: st.global.f32 [%r{{[0-9]+}}], %f[[RED]] + store float %ret, float* %red + ret void +} + +!nvvm.annotations = !{!1, !2, !3, !4} !1 = !{void (i64, float*, i32)* @foo, !"kernel", i32 1} !2 = !{void (float*, i32)* @bar, !"kernel", i32 1} -!3 = !{i64 addrspace(1)* @tex0, !"texture", i32 1} +!3 = !{void (float*, i32)* @bax, !"kernel", i32 1} +!4 = !{i64 addrspace(1)* @tex0, !"texture", i32 1} +!5 = !{i64 addrspace(1)* @tex0}