diff --git a/clang/include/clang/Basic/Builtins.def b/clang/include/clang/Basic/Builtins.def
--- a/clang/include/clang/Basic/Builtins.def
+++ b/clang/include/clang/Basic/Builtins.def
@@ -1648,6 +1648,8 @@
 // CUDA/HIP
 LANGBUILTIN(__builtin_get_device_side_mangled_name, "cC*.", "ncT", CUDA_LANG)
+// Builtin to convert a texture operation name into a numeric value.
+LANGBUILTIN(__nvvm_texture_op, "icC*", "nc", CUDA_LANG)

 // Builtins for XRay
 BUILTIN(__xray_customevent, "vcC*z", "")
diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h
--- a/clang/include/clang/Sema/Sema.h
+++ b/clang/include/clang/Sema/Sema.h
@@ -12601,6 +12601,7 @@
   bool CheckPPCBuiltinFunctionCall(const TargetInfo &TI, unsigned BuiltinID,
                                    CallExpr *TheCall);
   bool CheckAMDGCNBuiltinFunctionCall(unsigned BuiltinID, CallExpr *TheCall);
+  bool CheckNVPTXBuiltinFunctionCall(unsigned BuiltinID, CallExpr *TheCall);
   bool CheckRISCVLMUL(CallExpr *TheCall, unsigned ArgNum);
   bool CheckRISCVBuiltinFunctionCall(const TargetInfo &TI, unsigned BuiltinID,
                                      CallExpr *TheCall);
diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp
--- a/clang/lib/AST/ExprConstant.cpp
+++ b/clang/lib/AST/ExprConstant.cpp
@@ -50,6 +50,7 @@
 #include "clang/AST/StmtVisitor.h"
 #include "clang/AST/TypeLoc.h"
 #include "clang/Basic/Builtins.h"
+#include "clang/Basic/TargetBuiltins.h"
 #include "clang/Basic/TargetInfo.h"
 #include "llvm/ADT/APFixedPoint.h"
 #include "llvm/ADT/Optional.h"
@@ -59,6 +60,7 @@
 #include "llvm/Support/raw_ostream.h"
 #include <cstring>
 #include <functional>
+#include <iterator>

 #define DEBUG_TYPE "exprconstant"

@@ -9327,7 +9329,6 @@
       return false;
     }
   }
-
   default:
     break;
   }
@@ -11094,6 +11095,126 @@
   return EvaluateBuiltinClassifyType(E->getArg(0)->getType(), LangOpts);
 }

+static int EvaluateTextureOp(const CallExpr *E) {
+  // Sorted list of known operations supported by '__nv_tex_surf_handler'.
+  static constexpr StringRef TextureOps[] = {
+      "__isurf1DLayeredread",
+      "__isurf1DLayeredwrite_v2",
+      "__isurf1Dread",
+      "__isurf1Dwrite_v2",
+      "__isurf2DLayeredread",
+      "__isurf2DLayeredwrite_v2",
+      "__isurf2Dread",
+      "__isurf2Dwrite_v2",
+      "__isurf3Dread",
+      "__isurf3Dwrite_v2",
+      "__isurfCubemapLayeredread",
+      "__isurfCubemapLayeredwrite_v2",
+      "__isurfCubemapread",
+      "__isurfCubemapwrite_v2",
+      "__itex1D",
+      "__itex1DGrad",
+      "__itex1DLayered",
+      "__itex1DLayeredGrad",
+      "__itex1DLayeredLod",
+      "__itex1DLod",
+      "__itex1Dfetch",
+      "__itex2D",
+      "__itex2DGrad_sparse",
+      "__itex2DGrad_v2",
+      "__itex2DLayered",
+      "__itex2DLayeredGrad_sparse",
+      "__itex2DLayeredGrad_v2",
+      "__itex2DLayeredLod",
+      "__itex2DLayeredLod_sparse",
+      "__itex2DLayered_sparse",
+      "__itex2DLod",
+      "__itex2DLod_sparse",
+      "__itex2D_sparse",
+      "__itex2Dgather",
+      "__itex2Dgather_sparse",
+      "__itex3D",
+      "__itex3DGrad_sparse",
+      "__itex3DGrad_v2",
+      "__itex3DLod",
+      "__itex3DLod_sparse",
+      "__itex3D_sparse",
+      "__itexCubemap",
+      "__itexCubemapGrad_v2",
+      "__itexCubemapLayered",
+      "__itexCubemapLayeredGrad_v2",
+      "__itexCubemapLayeredLod",
+      "__itexCubemapLod",
+      "__surf1DLayeredread_v2",
+      "__surf1DLayeredwrite_v2",
+      "__surf1Dread_v2",
+      "__surf1Dwrite_v2",
+      "__surf2DLayeredread_v2",
+      "__surf2DLayeredwrite_v2",
+      "__surf2Dread_v2",
+      "__surf2Dwrite_v2",
+      "__surf3Dread_v2",
+      "__surf3Dwrite_v2",
+      "__surfCubemapLayeredread_v2",
+      "__surfCubemapLayeredwrite_v2",
+      "__surfCubemapread_v2",
+      "__surfCubemapwrite_v2",
+      "__tex1DGrad_rmnf_v2",
+      "__tex1DGrad_v2",
+      "__tex1DLayeredGrad_rmnf_v2",
+      "__tex1DLayeredGrad_v2",
+      "__tex1DLayeredLod_rmnf_v2",
+      "__tex1DLayeredLod_v2",
"__tex1DLayered_rmnf_v2", + "__tex1DLayered_v2", + "__tex1DLod_rmnf_v2", + "__tex1DLod_v2", + "__tex1D_rmnf_v2", + "__tex1D_v2", + "__tex1Dfetch_rmnf_v2", + "__tex1Dfetch_v2", + "__tex2DGrad_rmnf_v2", + "__tex2DGrad_v2", + "__tex2DLayeredGrad_rmnf_v2", + "__tex2DLayeredGrad_v2", + "__tex2DLayeredLod_rmnf_v2", + "__tex2DLayeredLod_v2", + "__tex2DLayered_rmnf_v2", + "__tex2DLayered_v2", + "__tex2DLod_rmnf_v2", + "__tex2DLod_v2", + "__tex2D_rmnf_v2", + "__tex2D_v2", + "__tex2Dgather_rmnf_v2", + "__tex2Dgather_v2", + "__tex3DGrad_rmnf_v2", + "__tex3DGrad_v2", + "__tex3DLod_rmnf_v2", + "__tex3DLod_v2", + "__tex3D_rmnf_v2", + "__tex3D_v2", + "__texCubemapGrad_rmnf_v2", + "__texCubemapGrad_v2", + "__texCubemapLayeredGrad_rmnf_v2", + "__texCubemapLayeredGrad_v2", + "__texCubemapLayeredLod_rmnf_v2", + "__texCubemapLayeredLod_v2", + "__texCubemapLayered_rmnf_v2", + "__texCubemapLayered_v2", + "__texCubemapLod_rmnf_v2", + "__texCubemapLod_v2", + "__texCubemap_rmnf_v2", + "__texCubemap_v2" + + }; + const StringLiteral *S = + dyn_cast(E->getArg(0)->IgnoreParenCasts()); + auto I = llvm::lower_bound(TextureOps, S->getString()); + if (I == std::end(TextureOps) || *I != S->getString()) { + return -1; + } + return I - std::begin(TextureOps); +} + /// EvaluateBuiltinConstantPForLValue - Determine the result of /// __builtin_constant_p when applied to the given pointer. /// @@ -12123,6 +12244,8 @@ return false; return Success(DidOverflow, E); } + case clang::Builtin::BI__nvvm_texture_op: + return Success((int)EvaluateTextureOp(E), E); } } diff --git a/clang/lib/Headers/CMakeLists.txt b/clang/lib/Headers/CMakeLists.txt --- a/clang/lib/Headers/CMakeLists.txt +++ b/clang/lib/Headers/CMakeLists.txt @@ -47,6 +47,7 @@ __clang_cuda_complex_builtins.h __clang_cuda_device_functions.h __clang_cuda_intrinsics.h + __clang_cuda_texture_intrinsics.h __clang_cuda_libdevice_declares.h __clang_cuda_math_forward_declares.h __clang_cuda_runtime_wrapper.h diff --git a/clang/lib/Headers/__clang_cuda_runtime_wrapper.h b/clang/lib/Headers/__clang_cuda_runtime_wrapper.h --- a/clang/lib/Headers/__clang_cuda_runtime_wrapper.h +++ b/clang/lib/Headers/__clang_cuda_runtime_wrapper.h @@ -64,9 +64,9 @@ #endif // Make largest subset of device functions available during host -// compilation -- SM_35 for the time being. +// compilation. #ifndef __CUDA_ARCH__ -#define __CUDA_ARCH__ 350 +#define __CUDA_ARCH__ 9999 #endif #include "__clang_cuda_builtin_vars.h" @@ -330,6 +330,8 @@ #pragma pop_macro("__host__") +#include <__clang_cuda_texture_intrinsics.h> +#include "texture_fetch_functions.h" #include "texture_indirect_functions.h" // Restore state of __CUDA_ARCH__ and __THROW we had on entry. diff --git a/clang/lib/Headers/__clang_cuda_texture_intrinsics.h b/clang/lib/Headers/__clang_cuda_texture_intrinsics.h new file mode 100644 --- /dev/null +++ b/clang/lib/Headers/__clang_cuda_texture_intrinsics.h @@ -0,0 +1,608 @@ +/*===--- __clang_cuda_texture_intrinsics.h - Device-side texture support ---=== + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===-----------------------------------------------------------------------=== + */ +#ifndef __CLANG_CUDA_TEXTURE_INTRINSICS_H__ +#define __CLANG_CUDA_TEXTURE_INTRINSICS_H__ +#ifndef __CUDA__ +#error "This file is for CUDA __compilation only." 
+#endif
+
+#pragma push_macro("__Args")
+#pragma push_macro("__ID")
+#pragma push_macro("__ID")
+#pragma push_macro("__IDV")
+#pragma push_macro("__IMPL_2DGATHER")
+#pragma push_macro("__IMPL_ALIAS")
+#pragma push_macro("__IMPL_ALIASI")
+#pragma push_macro("__IMPL_F1")
+#pragma push_macro("__IMPL_F3")
+#pragma push_macro("__IMPL_F3N")
+#pragma push_macro("__IMPL_F3S")
+#pragma push_macro("__IMPL_S")
+#pragma push_macro("__IMPL_S3")
+#pragma push_macro("__IMPL_S3I")
+#pragma push_macro("__IMPL_S3N")
+#pragma push_macro("__IMPL_S3NI")
+#pragma push_macro("__IMPL_S3S")
+#pragma push_macro("__IMPL_S3SI")
+#pragma push_macro("__IMPL_SI")
+#pragma push_macro("__L")
+#pragma push_macro("__STRIP_PARENS")
+#pragma push_macro("__V4")
+#pragma push_macro("__V4P")
+
+#include <type_traits>
+
+namespace {
+
+template <int> struct __Tag;
+#define __nv_tex_surf_handler(__op, __ptr, ...) \
+  __tex_fetch<__Tag<__nvvm_texture_op(__op)>>(__ptr, __VA_ARGS__)
+
+#define __ID(__op) __Tag<__nvvm_texture_op(__op)>
+// Tags for variants of particular operation. E.g. tex2Dgather can translate
+// into 4 different instructions.
+#define __IDV(__op, __variant) \
+  __Tag<10000 + __nvvm_texture_op(__op) * 100 + __variant>
+
+// Helper classes for figuring out the fetch type.
+template <class __T> struct __FT;
+// Fundamental types.
+template <> struct __FT<float> {
+  using __bt = float;
+  using __ft = float4;
+};
+template <> struct __FT<char> {
+  using __bt = char;
+  using __ft = int4;
+};
+template <> struct __FT<signed char> {
+  using __bt = signed char;
+  using __ft = int4;
+};
+template <> struct __FT<unsigned char> {
+  using __bt = unsigned char;
+  using __ft = uint4;
+};
+template <> struct __FT<short> {
+  using __bt = short;
+  using __ft = int4;
+};
+template <> struct __FT<ushort> {
+  using __bt = ushort;
+  using __ft = uint4;
+};
+template <> struct __FT<int> {
+  using __bt = int;
+  using __ft = int4;
+};
+template <> struct __FT<uint> {
+  using __bt = uint;
+  using __ft = uint4;
+};
+
+// Derived base/fetch types for N-element vectors.
+template <class __T> struct __FT {
+  using __bt = decltype(__T::x);
+  using __ft = typename __FT<__bt>::__ft;
+};
+
+// Classes that implement specific texture ops.
+template <class __op> struct __tex_fetch_v4;
+template <> struct __tex_fetch_v4<__Tag<-1>>; // Unknown op
+
+// Helper macros to strip parens from a macro argument.
+#define __Args(...) __VA_ARGS__
+#define __STRIP_PARENS(__X) __X
+#define __L(__X) __STRIP_PARENS(__Args __X)
+
+// Results are stored in a temp var __r.
+// isResident bool is pointed to by __ir.
+// Asm args for return values. It's a 4-element vector.
+#define __V4(__t) \
+  ("=" __t(__r.x), "=" __t(__r.y), "=" __t(__r.z), "=" __t(__r.w))
+// .. possibly combined with a predicate.
+#define __V4P(__t) (__L(__V4(__t)), "=h"(*__ir))
+
+#define __IMPL_F1(__rt, __dt, __args, __asm_op, __asm_outs, __asm_args) \
+  template <> \
+  __device__ __rt __run<__dt>(cudaTextureObject_t __obj, __L(__args)) { \
+    __rt __r; \
+    asm(__asm_op : __L(__asm_outs) : "l"(__obj), __L(__asm_args)); \
+    return __r; \
+  }
+
+#define __IMPL_F3(__args, __asm_op, __ctype, __asm_op_args, __asm_args) \
+  __IMPL_F1(int4, int4, __args, __asm_op ".s32." __ctype "\t" __asm_op_args, \
+            __V4("r"), __asm_args) \
+  __IMPL_F1(uint4, uint4, __args, __asm_op ".u32." __ctype "\t" __asm_op_args, \
+            __V4("r"), __asm_args) \
+  __IMPL_F1(float4, float4, __args, \
+            __asm_op ".f32." __ctype "\t" __asm_op_args, __V4("f"), \
+            __asm_args)
+
+#define __IMPL_F3S(__args, __asm_op, __ctype, __asm_op_args, __asm_args) \
+  __IMPL_F1(int4, int4, __args, __asm_op ".s32." __ctype "\t" __asm_op_args, \
__ctype "\t" __asm_op_args, \ + __V4P("r"), __asm_args) \ + __IMPL_F1(uint4, uint4, __args, __asm_op ".u32." __ctype "\t" __asm_op_args, \ + __V4P("r"), __asm_args) \ + __IMPL_F1(float4, float4, __args, \ + __asm_op ".f32." __ctype "\t" __asm_op_args, __V4P("f"), \ + __asm_args) + +#define __IMPL_F3N(__args, __asm_op, __ctype, __asm_op_args, __asm_args) \ + __IMPL_F1(float4, int4, __args, __asm_op ".s32." __ctype "\t" __asm_op_args, \ + __V4("r"), __asm_args) \ + __IMPL_F1(float4, uint4, __args, \ + __asm_op ".u32." __ctype "\t" __asm_op_args, __V4("r"), \ + __asm_args) + +#define __IMPL_S3I(__op, __args, __asm_op, __ctype, __asm_op_args, __asm_args) \ + template <> struct __tex_fetch_v4<__op> { \ + template \ + __device__ static T __run(cudaTextureObject_t __obj, __L(__args)); \ + __IMPL_F3(__args, __asm_op, __ctype, __asm_op_args, __asm_args) \ + } + +#define __IMPL_S3(__op, __args, __asm_op, __ctype, __asm_op_args, __asm_args) \ + __IMPL_S3I(__ID(__op), __args, __asm_op, __ctype, __asm_op_args, __asm_args) + +// Same, but for sparse ops. +#define __IMPL_S3SI(__op, __args, __asm_op, __ctype, __asm_op_args, \ + __asm_args) \ + template <> struct __tex_fetch_v4<__op> { \ + template \ + __device__ static T __run(cudaTextureObject_t __obj, __L(__args)); \ + __IMPL_F3S(__args, __asm_op, __ctype, __asm_op_args, __asm_args) \ + } + +#define __IMPL_S3S(__op, __args, __asm_op, __ctype, __asm_op_args, __asm_args) \ + __IMPL_S3SI(__ID(__op), __args, __asm_op, __ctype, __asm_op_args, __asm_args) + +#define __IMPL_S3NI(__op, __args, __asm_op, __ctype, __asm_op_args, \ + __asm_args) \ + template <> struct __tex_fetch_v4<__op> { \ + template \ + __device__ static float4 __run(cudaTextureObject_t __obj, __L(__args)); \ + __IMPL_F3N(__args, __asm_op, __ctype, __asm_op_args, __asm_args) \ + } + +#define __IMPL_S3N(__op, __args, __asm_op, __ctype, __asm_op_args, __asm_args) \ + __IMPL_S3NI(__ID(__op), __args, __asm_op, __ctype, __asm_op_args, __asm_args) + +#define __IMPL_SI(__op, __opn, __args, __asm_op, __ctype, __asm_op_args, \ + __asm_args) \ + __IMPL_S3I(__op, __args, __asm_op, __ctype, __asm_op_args, __asm_args); \ + __IMPL_S3NI(__opn, __args, __asm_op, __ctype, __asm_op_args, __asm_args) + +#define __IMPL_S(__op, __opn, __args, __asm_op, __ctype, __asm_op_args, \ + __asm_args) \ + __IMPL_SI(__ID(__op), __ID(__opn), __args, __asm_op, __ctype, __asm_op_args, \ + __asm_args) + +#define __IMPL_ALIASI(__op, __opn) \ + template <> struct __tex_fetch_v4<__op> : __tex_fetch_v4<__opn> {} +#define __IMPL_ALIAS(__op, __opn) __IMPL_ALIASI(__ID(__op), __ID(__opn)) + +__IMPL_S("__tex1D_v2", "__tex1D_rmnf_v2", (float __x), "tex.1d.v4", "f32", + "{%0, %1, %2, %3}, [%4, {%5}];", ("f"(__x))); +__IMPL_S("__tex1Dfetch_v2", "__tex1Dfetch_rmnf_v2", (int __x), "tex.1d.v4", + "s32", "{%0, %1, %2, %3}, [%4, {%5}];", ("r"(__x))); + +__IMPL_ALIAS("__itex1D", "__tex1D_v2"); +__IMPL_ALIAS("__itex1Dfetch", "__tex1Dfetch_v2"); + +__IMPL_S("__tex1DGrad_v2", "__tex1DGrad_rmnf_v2", + (float __x, float __dPdx, float __dPdy), "tex.grad.1d.v4", "f32", + "{%0, %1, %2, %3}, [%4, {%5}], {%6}, {%7};", + ("f"(__x), "f"(__dPdx), "f"(__dPdy))); +__IMPL_ALIAS("__itex1DGrad", "__tex1DGrad_v2"); + +__IMPL_S("__tex1DLayered_v2", "__tex1DLayered_rmnf_v2", + (float __x, int __layer), "tex.a1d.v4", "f32", + "{%0, %1, %2, %3}, [%4, {%5, %6}];", ("r"(__layer), "f"(__x))); +__IMPL_ALIAS("__itex1DLayered", "__tex1DLayered_v2"); + +__IMPL_S("__tex1DLayeredGrad_v2", "__tex1DLayeredGrad_rmnf_v2", + (float __x, int __layer, float __dPdx, float __dPdy), + 
"tex.grad.a1d.v4", "f32", + "{%0, %1, %2, %3}, [%4, {%5, %6}], {%7}, {%8};", + ("r"(__layer), "f"(__x), "f"(__dPdx), "f"(__dPdy))); +__IMPL_ALIAS("__itex1DLayeredGrad", "__tex1DLayeredGrad_v2"); + +__IMPL_S("__tex1DLayeredLod_v2", "__tex1DLayeredLod_rmnf_v2", + (float __x, int __layer, float __level), "tex.level.a1d.v4", "f32", + "{%0, %1, %2, %3}, [%4, {%5, %6}], %7;", + ("r"(__layer), "f"(__x), "f"(__level))); +__IMPL_ALIAS("__itex1DLayeredLod", "__tex1DLayeredLod_v2"); + +__IMPL_S("__tex1DLod_v2", "__tex1DLod_rmnf_v2", (float __x, float __level), + "tex.level.1d.v4", "f32", "{%0, %1, %2, %3}, [%4, {%5}], %6;", + ("f"(__x), "f"(__level))); +__IMPL_ALIAS("__itex1DLod", "__tex1DLod_v2"); + +// 2D +__IMPL_S("__tex2D_v2", "__tex2D_rmnf_v2", (float __x, float __y), "tex.2d.v4", + "f32", "{%0, %1, %2, %3}, [%4, {%5, %6}];", ("f"(__x), "f"(__y))); +__IMPL_ALIAS("__itex2D", "__tex2D_v2"); +__IMPL_S3S("__itex2D_sparse", (float __x, float __y, unsigned char *__ir), + "{.reg .pred %%p0;\n\t" + "tex.2d.v4", + "f32", + "{%0, %1, %2, %3}|%%p0, [%5, {%6, %7}];\n\t" + " selp.u16 %4, 1, 0, %%p0; }", + ("f"(__x), "f"(__y))); + +__IMPL_S("__tex2DGrad_v2", "__tex2DGrad_rmnf_v2", + (float __x, float __y, const float2 *__dPdx, const float2 *__dPdy), + "tex.grad.2d.v4", "f32", + "{%0, %1, %2, %3}, [%4, {%5, %6}], {%7, %8}, {%9, %10};", + ("f"(__x), "f"(__y), "f"(__dPdx->x), "f"(__dPdx->y), "f"(__dPdy->x), + "f"(__dPdy->y))); +__IMPL_ALIAS("__itex2DGrad_v2", "__tex2DGrad_v2"); +__IMPL_S3S("__itex2DGrad_sparse", + (float __x, float __y, const float2 *__dPdx, const float2 *__dPdy, + unsigned char *__ir), + "{.reg .pred %%p0;\n\t" + "tex.grad.2d.v4", + "f32", + "{%0, %1, %2, %3}|%%p0, [%5, {%6, %7}], {%8, %9}, {%10, %11};\n\t" + "selp.u16 %4, 1, 0, %%p0; }", + ("f"(__x), "f"(__y), "f"(__dPdx->x), "f"(__dPdx->y), "f"(__dPdy->x), + "f"(__dPdy->y))); + +__IMPL_S("__tex2DLayered_v2", "__tex2DLayered_rmnf_v2", + (float __x, float __y, int __layer), "tex.a2d.v4", "f32", + "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %7}];", + ("r"(__layer), "f"(__x), "f"(__y))); +__IMPL_ALIAS("__itex2DLayered", "__tex2DLayered_v2"); +__IMPL_S3S("__itex2DLayered_sparse", + (float __x, float __y, int __layer, unsigned char *__ir), + "{.reg .pred %%p0;\n\t" + "tex.a2d.v4", + "f32", + "{%0, %1, %2, %3}|%%p0, [%5, {%6, %7, %8, %8}];\n\t" + "selp.u16 %4, 1, 0, %%p0; }", + ("r"(__layer), "f"(__x), "f"(__y))); + +__IMPL_S("__tex2DLayeredGrad_v2", "__tex2DLayeredGrad_rmnf_v2", + (float __x, float __y, int __layer, const float2 *__dPdx, + const float2 *__dPdy), + "tex.grad.a2d.v4", "f32", + "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %7}], {%8, %9}, {%10, %11};", + ("r"(__layer), "f"(__x), "f"(__y), "f"(__dPdx->x), "f"(__dPdx->y), + "f"(__dPdy->x), "f"(__dPdy->y))); +__IMPL_ALIAS("__itex2DLayeredGrad_v2", "__tex2DLayeredGrad_v2"); +__IMPL_S3S( + "__itex2DLayeredGrad_sparse", + (float __x, float __y, int __layer, const float2 *__dPdx, + const float2 *__dPdy, unsigned char *__ir), + "{.reg .pred %%p0;\n\t" + "tex.grad.a2d.v4", + "f32", + "{%0, %1, %2, %3}|%%p0, [%5, {%6, %7, %8, %8}], {%9, %10}, {%11, %12};\n\t" + "selp.u16 %4, 1, 0, %%p0; }", + ("r"(__layer), "f"(__x), "f"(__y), "f"(__dPdx->x), "f"(__dPdx->y), + "f"(__dPdy->x), "f"(__dPdy->y))); + +__IMPL_S("__tex2DLayeredLod_v2", "__tex2DLayeredLod_rmnf_v2", + (float __x, float __y, int __layer, float __level), "tex.level.a2d.v4", + "f32", "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %7}], %8;", + ("r"(__layer), "f"(__x), "f"(__y), "f"(__level))); +__IMPL_ALIAS("__itex2DLayeredLod", "__tex2DLayeredLod_v2"); 
+__IMPL_S3S("__itex2DLayeredLod_sparse", + (float __x, float __y, int __layer, float __level, + unsigned char *__ir), + "{.reg .pred %%p0;\n\t" + "tex.level.a2d.v4", + "f32", + "{%0, %1, %2, %3}|%%p0, [%5, {%6, %7, %8, %8}], %9;\n\t" + "selp.u16 %4, 1, 0, %%p0; }", + ("r"(__layer), "f"(__x), "f"(__y), "f"(__level))); + +__IMPL_S("__tex2DLod_v2", "__tex2DLod_rmnf_v2", + (float __x, float __y, float __level), "tex.level.2d.v4", "f32", + "{%0, %1, %2, %3}, [%4, {%5, %6}], %7;", + ("f"(__x), "f"(__y), "f"(__level))); +__IMPL_ALIAS("__itex2DLod", "__tex2DLod_v2"); +__IMPL_S3S("__itex2DLod_sparse", + (float __x, float __y, float __level, unsigned char *__ir), + "{.reg .pred %%p0;\n\t" + "tex.level.2d.v4", + "f32", + "{%0, %1, %2, %3}|%%p0, [%5, {%6, %7}], %8;\n\t" + "selp.u16 %4, 1, 0, %%p0; }", + ("f"(__x), "f"(__y), "f"(__level))); + +// 2D gather uses one of the four different instructions selected by __comp. We +// implement eash variant separately plus one umbrella call to pick one of the +// variants. + +#define __IMPL_2DGATHER(variant, instr) \ + __IMPL_SI(__IDV("__tex2Dgather_v2", variant), \ + __IDV("__tex2Dgather_rmnf_v2", variant), \ + (float __x, float __y, int __comp), instr, "f32", \ + "{%0, %1, %2, %3}, [%4, {%5, %6}];", ("f"(__x), "f"(__y))); \ + __IMPL_ALIASI(__IDV("__itex2Dgather", variant), \ + __IDV("__tex2Dgather_v2", variant)); \ + __IMPL_S3SI(__IDV("__itex2Dgather_sparse", variant), \ + (float __x, float __y, unsigned char *__ir, int __comp), \ + "{.reg .pred %%p0;\n\t" instr, "f32", \ + "{%0, %1, %2, %3}|%%p0, [%5, {%6, %7}];\n\t" \ + "selp.u16 %4, 1, 0, %%p0; }", \ + ("f"(__x), "f"(__y))); +__IMPL_2DGATHER(0, "tld4.r.2d.v4"); +__IMPL_2DGATHER(1, "tld4.g.2d.v4"); +__IMPL_2DGATHER(2, "tld4.b.2d.v4"); +__IMPL_2DGATHER(3, "tld4.a.2d.v4"); + +template <> struct __tex_fetch_v4<__ID("__tex2Dgather_v2")> { + template + __device__ static __T __run(cudaTextureObject_t __obj, float __x, float __y, + int __comp) { + switch (__comp) { + case 0: + return __tex_fetch_v4<__IDV("__tex2Dgather_v2", 0)>::__run<__T>( + __obj, __x, __y, __comp); + case 1: + return __tex_fetch_v4<__IDV("__tex2Dgather_v2", 1)>::__run<__T>( + __obj, __x, __y, __comp); + case 2: + return __tex_fetch_v4<__IDV("__tex2Dgather_v2", 2)>::__run<__T>( + __obj, __x, __y, __comp); + case 3: + return __tex_fetch_v4<__IDV("__tex2Dgather_v2", 3)>::__run<__T>( + __obj, __x, __y, __comp); + } + } +}; +__IMPL_ALIAS("__itex2Dgather", "__tex2Dgather_v2"); +template <> struct __tex_fetch_v4<__ID("__tex2Dgather_rmnf_v2")> { + template + __device__ static float4 __run(cudaTextureObject_t __obj, float __x, + float __y, int __comp) { + switch (__comp) { + case 0: + return __tex_fetch_v4<__IDV("__tex2Dgather_rmnf_v2", 0)>::__run<__T>( + __obj, __x, __y, __comp); + case 1: + return __tex_fetch_v4<__IDV("__tex2Dgather_rmnf_v2", 1)>::__run<__T>( + __obj, __x, __y, __comp); + case 2: + return __tex_fetch_v4<__IDV("__tex2Dgather_rmnf_v2", 2)>::__run<__T>( + __obj, __x, __y, __comp); + case 3: + return __tex_fetch_v4<__IDV("__tex2Dgather_rmnf_v2", 3)>::__run<__T>( + __obj, __x, __y, __comp); + } + } +}; +template <> struct __tex_fetch_v4<__ID("__itex2Dgather_sparse")> { + template + __device__ static __T __run(cudaTextureObject_t __obj, float __x, float __y, + unsigned char *__ir, int __comp) { + switch (__comp) { + case 0: + return __tex_fetch_v4<__IDV("__itex2Dgather_sparse", 0)>::__run<__T>( + __obj, __x, __y, __ir, __comp); + case 1: + return __tex_fetch_v4<__IDV("__itex2Dgather_sparse", 1)>::__run<__T>( + __obj, __x, __y, __ir, 
+    case 2:
+      return __tex_fetch_v4<__IDV("__itex2Dgather_sparse", 2)>::__run<__T>(
+          __obj, __x, __y, __ir, __comp);
+    case 3:
+      return __tex_fetch_v4<__IDV("__itex2Dgather_sparse", 3)>::__run<__T>(
+          __obj, __x, __y, __ir, __comp);
+    }
+  }
+};
+
+// 3D
+__IMPL_S("__tex3D_v2", "__tex3D_rmnf_v2", (float __x, float __y, float __z),
+         "tex.3d.v4", "f32", "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %7}];",
+         ("f"(__x), "f"(__y), "f"(__z)));
+__IMPL_ALIAS("__itex3D", "__tex3D_v2");
+__IMPL_S3S("__itex3D_sparse",
+           (float __x, float __y, float __z, unsigned char *__ir),
+           "{.reg .pred %%p0;\n\t"
+           "tex.3d.v4",
+           "f32",
+           "{%0, %1, %2, %3}|%%p0, [%5, {%6, %7, %8, %8}];\n\t"
+           "selp.u16 %4, 1, 0, %%p0; }",
+           ("f"(__x), "f"(__y), "f"(__z)));
+
+__IMPL_S("__tex3DGrad_v2", "__tex3DGrad_rmnf_v2",
+         (float __x, float __y, float __z, const float4 *__dPdx,
+          const float4 *__dPdy),
+         "tex.grad.3d.v4", "f32",
+         "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %7}], "
+         "{%8, %9, %10, %10}, {%11, %12, %13, %13};",
+         ("f"(__x), "f"(__y), "f"(__z), "f"(__dPdx->x), "f"(__dPdx->y),
+          "f"(__dPdx->z), "f"(__dPdy->x), "f"(__dPdy->y), "f"(__dPdy->z)));
+__IMPL_ALIAS("__itex3DGrad_v2", "__tex3DGrad_v2");
+__IMPL_S3S("__itex3DGrad_sparse",
+           (float __x, float __y, float __z, const float4 *__dPdx,
+            const float4 *__dPdy, unsigned char *__ir),
+           "{.reg .pred %%p0;\n\t"
+           "tex.grad.3d.v4",
+           "f32",
+           "{%0, %1, %2, %3}|%%p0, [%5, {%6, %7, %8, %8}], "
+           "{%9, %10, %11, %11}, {%12, %13, %14, %14};\n\t"
+           "selp.u16 %4, 1, 0, %%p0; }",
+           ("f"(__x), "f"(__y), "f"(__z), "f"(__dPdx->x), "f"(__dPdx->y),
+            "f"(__dPdx->z), "f"(__dPdy->x), "f"(__dPdy->y), "f"(__dPdy->z)));
+
+__IMPL_S("__tex3DLod_v2", "__tex3DLod_rmnf_v2",
+         (float __x, float __y, float __z, float __level), "tex.level.3d.v4",
+         "f32", "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %7}], %8;",
+         ("f"(__x), "f"(__y), "f"(__z), "f"(__level)));
+__IMPL_ALIAS("__itex3DLod", "__tex3DLod_v2");
+__IMPL_S3S("__itex3DLod_sparse",
+           (float __x, float __y, float __z, float __level,
+            unsigned char *__ir),
+           "{.reg .pred %%p0;\n\t"
+           "tex.level.3d.v4",
+           "f32",
+           "{%0, %1, %2, %3}|%%p0, [%5, {%6, %7, %8, %8}], %9;\n\t"
+           "selp.u16 %4, 1, 0, %%p0; }",
+           ("f"(__x), "f"(__y), "f"(__z), "f"(__level)));
+
+// Cubemap
+__IMPL_S("__texCubemap_v2", "__texCubemap_rmnf_v2",
+         (float __x, float __y, float __z), "tex.cube.v4", "f32",
+         "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %7}];",
+         ("f"(__x), "f"(__y), "f"(__z)));
+__IMPL_ALIAS("__itexCubemap", "__texCubemap_v2");
+__IMPL_S3S("__itexCubemap_sparse",
+           (float __x, float __y, float __z, unsigned char *__ir),
+           "{.reg .pred %%p0;\n\t"
+           "tex.cube.v4",
+           "f32",
+           "{%0, %1, %2, %3}|%%p0, [%5, {%6, %7, %8, %8}];\n\t"
+           "selp.u16 %4, 1, 0, %%p0; }",
+           ("f"(__x), "f"(__y), "f"(__z)));
+__IMPL_S("__texCubemapGrad_v2", "__texCubemapGrad_rmnf_v2",
+         (float __x, float __y, float __z, const float4 *__dPdx,
+          const float4 *__dPdy),
+         "tex.grad.cube.v4", "f32",
+         "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %7}], "
+         "{%8, %9, %10, %10}, {%11, %12, %13, %13};",
+         ("f"(__x), "f"(__y), "f"(__z), "f"(__dPdx->x), "f"(__dPdx->y),
+          "f"(__dPdx->z), "f"(__dPdy->x), "f"(__dPdy->y), "f"(__dPdy->z)));
+__IMPL_ALIAS("__itexCubemapGrad_v2", "__texCubemapGrad_v2");
+__IMPL_S("__texCubemapLayered_v2", "__texCubemapLayered_rmnf_v2",
+         (float __x, float __y, float __z, int __layer), "tex.acube.v4", "f32",
+         "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %8}];",
+         ("r"(__layer), "f"(__x), "f"(__y), "f"(__z)));
+__IMPL_ALIAS("__itexCubemapLayered", "__texCubemapLayered_v2");
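
Editorial note on the gather tags used above (not part of the patch): `__ID(op)` produces `__Tag<N>`, where N is the op's zero-based index in the sorted `TextureOps` table, while `__IDV(op, variant)` maps the four per-component gather flavors into a disjoint range so they cannot collide with plain op indices. Roughly, assuming N is the value `__nvvm_texture_op("__tex2Dgather_v2")` folds to:

    // Sketch of the tag arithmetic.
    // __ID("__tex2Dgather_v2")      -> __Tag<N>                    (small table index)
    // __IDV("__tex2Dgather_v2", 2)  -> __Tag<10000 + 100 * N + 2>  (variant 2 -> tld4.b.2d.v4)

The umbrella `__run` specializations then pick one of these variants from the run-time `__comp` argument.
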
+__IMPL_S("__texCubemapLayeredGrad_v2", "__texCubemapLayeredGrad_rmnf_v2", + (float __x, float __y, float __z, int __layer, const float4 *__dPdx, + const float4 *__dPdy), + "tex.grad.acube.v4", "f32", + "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %8}], " + "{%9, %10, %11, %11}, {%12, %13, %14, %14};", + ("r"(__layer), "f"(__x), "f"(__y), "f"(__z), "f"(__dPdx->x), + "f"(__dPdx->y), "f"(__dPdx->z), "f"(__dPdy->x), "f"(__dPdy->y), + "f"(__dPdy->z))); +__IMPL_ALIAS("__itexCubemapLayeredGrad_v2", "__texCubemapLayeredGrad_v2"); +__IMPL_S("__texCubemapLayeredLod_v2", "__texCubemapLayeredLod_rmnf_v2", + (float __x, float __y, float __z, int __layer, float __level), + "tex.level.acube.v4", "f32", + "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %8}], %9;", + ("r"(__layer), "f"(__x), "f"(__y), "f"(__z), "f"(__level))); +__IMPL_ALIAS("__itexCubemapLayeredLod", "__texCubemapLayeredLod_v2"); +__IMPL_S("__texCubemapLod_v2", "__texCubemapLod_rmnf_v2", + (float __x, float __y, float __z, float __level), "tex.level.cube.v4", + "f32", "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %7}], %8;", + ("f"(__x), "f"(__y), "f"(__z), "f"(__level))); +__IMPL_ALIAS("__itexCubemapLod", "__texCubemapLod_v2"); + +// Helper class for extracting slice of data from V4 fetch results. +template struct __convert { + template ::value, + int __N = sizeof(__DestT) / sizeof(typename __FT<__DestT>::__bt)> + __device__ static __DestT __run(__SrcT __v) { + return __v; + } + template <> __device__ static __DestT __run(__SrcT __v) { + return {__v.x}; + } + template <> __device__ static __DestT __run(__SrcT __v) { + return {__v.x, __v.y}; + } + template <> __device__ static __DestT __run(__SrcT __v) { + return {__v.x, __v.y, __v.z}; + } + template <> __device__ static __DestT __run(__SrcT __v) { + return {__v.x, __v.y, __v.z, __v.w}; + } +}; + +// __nv_tex_surf_handler("__tex...", &ret, cudaTextureObject_t handle, args...); +// Data type and return type are based on ret. +template +__device__ static void __tex_fetch(__T *__ptr, cudaTextureObject_t __handle, + __Args... __args) { + using __FT = typename __FT<__T>::__ft; + *__ptr = __convert<__T, __FT>::__run( + __tex_fetch_v4<__op>::template __run<__FT>(__handle, __args...)); +} + +// texture<> objects get magically converted into a texture reference. However, +// there's no way to convert them to cudaTextureObject_t on C++ level. So, we +// cheat a bit and use inline assembly to do it. It costs us an extra register +// and a move, but that is easy for ptxas to optimize away. +template +__device__ cudaTextureObject_t __tex_handle_to_obj(__T __handle) { + cudaTextureObject_t __obj; + asm("mov.b64 %0, %1; " : "=l"(__obj) : "l"(__handle)); + return __obj; +} + +// __nv_tex_surf_handler ("__tex...", &ret, textureReference, x); +// Data type and return type is based on ret. +template +__device__ static void __tex_fetch(__T *__ptr, __HandleT __handle, + __Args... __args) { + using __FT = typename __FT<__T>::__ft; + *__ptr = + __convert<__T, __FT>::__run(__tex_fetch_v4<__op>::template __run<__FT>( + __tex_handle_to_obj(__handle), __args...)); +} + +// __nv_tex_surf_handler ("__tex...", &type_dummy, &ret, texture<...>, x); +// cudaReadModeNormalizedFloat fetches always return float4. +template +__device__ static void +__tex_fetch(__DataT *, __RetT *__ptr, + texture<__DataT, __TexT, cudaReadModeNormalizedFloat> __handle, + __Args... 
+  using __FT = typename __FT<__DataT>::__ft;
+  *__ptr = __convert<__RetT, float4>::__run(
+      __tex_fetch_v4<__op>::template __run<__FT>(__tex_handle_to_obj(__handle),
+                                                 __args...));
+}
+
+// __nv_tex_surf_handler ("__tex...", &type_dummy, &ret, texture<...>, x);
+// For cudaReadModeElementType the fetch return type is based on type_dummy.
+template <class __op, class __DataT, class __RetT, int __TexT, class... __Args>
+__device__ static void
+__tex_fetch(__DataT *, __RetT *__ptr,
+            texture<__DataT, __TexT, cudaReadModeElementType> __handle,
+            __Args... __args) {
+  using __FT = typename __FT<__DataT>::__ft;
+  *__ptr =
+      __convert<__RetT, __FT>::__run(__tex_fetch_v4<__op>::template __run<__FT>(
+          __tex_handle_to_obj(__handle), __args...));
+}
+} // namespace
+#pragma pop_macro("__Args")
+#pragma pop_macro("__ID")
+#pragma pop_macro("__ID")
+#pragma pop_macro("__IDV")
+#pragma pop_macro("__IMPL_2DGATHER")
+#pragma pop_macro("__IMPL_ALIAS")
+#pragma pop_macro("__IMPL_ALIASI")
+#pragma pop_macro("__IMPL_F1")
+#pragma pop_macro("__IMPL_F3")
+#pragma pop_macro("__IMPL_F3N")
+#pragma pop_macro("__IMPL_F3S")
+#pragma pop_macro("__IMPL_S")
+#pragma pop_macro("__IMPL_S3")
+#pragma pop_macro("__IMPL_S3I")
+#pragma pop_macro("__IMPL_S3N")
+#pragma pop_macro("__IMPL_S3NI")
+#pragma pop_macro("__IMPL_S3S")
+#pragma pop_macro("__IMPL_S3SI")
+#pragma pop_macro("__IMPL_SI")
+#pragma pop_macro("__L")
+#pragma pop_macro("__STRIP_PARENS")
+#pragma pop_macro("__V4")
+#pragma pop_macro("__V4P")
+#endif // __CLANG_CUDA_TEXTURE_INTRINSICS_H__
diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp
--- a/clang/lib/Sema/SemaChecking.cpp
+++ b/clang/lib/Sema/SemaChecking.cpp
@@ -1447,6 +1447,9 @@
   case llvm::Triple::riscv32:
   case llvm::Triple::riscv64:
     return CheckRISCVBuiltinFunctionCall(TI, BuiltinID, TheCall);
+  case llvm::Triple::nvptx:
+  case llvm::Triple::nvptx64:
+    return CheckNVPTXBuiltinFunctionCall(BuiltinID, TheCall);
   }
 }

@@ -3559,6 +3562,28 @@
   return false;
 }

+static bool CheckNVVMTextureOp(Sema &S, unsigned BuiltinID, CallExpr *TheCall) {
+  // The first argument of __nvvm_texture_op must be a string literal that can
+  // be evaluated at compile time.
+  Expr *ArgExpr = TheCall->getArg(0);
+  Expr::EvalResult ArgResult;
+  if (!ArgExpr->EvaluateAsConstantExpr(ArgResult, S.Context))
+    return S.Diag(ArgExpr->getExprLoc(), diag::err_expr_not_string_literal)
+           << ArgExpr->getType();
+  return false;
+}
+
+bool Sema::CheckNVPTXBuiltinFunctionCall(unsigned BuiltinID,
+                                         CallExpr *TheCall) {
+  switch (BuiltinID) {
+  case Builtin::BI__nvvm_texture_op:
+    return CheckNVVMTextureOp(*this, BuiltinID, TheCall);
+  default:
+    return false;
+  }
+}
+
 bool Sema::CheckRISCVLMUL(CallExpr *TheCall, unsigned ArgNum) {
   llvm::APSInt Result;
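
Editorial note, an end-to-end sketch of how the pieces are intended to fit together (the kernel and the names `obj`, `x`, `y`, `out` are illustrative only, not part of the patch): the header's `__nv_tex_surf_handler` macro rewrites a named texture operation into a tag-dispatched `__tex_fetch` call, `__nvvm_texture_op` folds the operation name to its index in `TextureOps` during constant evaluation, and `CheckNVVMTextureOp` rejects name arguments that are not compile-time constants.

    // Illustrative device code that would be lowered through the new header.
    __global__ void read_texel(cudaTextureObject_t obj, float x, float y,
                               float4 *out) {
      float4 v;
      // Expands to: __tex_fetch<__Tag<__nvvm_texture_op("__tex2D_v2")>>(&v, obj, x, y);
      // The operation name must be a string literal; otherwise Sema emits
      // err_expr_not_string_literal via CheckNVVMTextureOp.
      __nv_tex_surf_handler("__tex2D_v2", &v, obj, x, y);
      *out = v;
    }

This is the same entry point CUDA's texture_fetch_functions.h / texture_indirect_functions.h headers call into, which is why the runtime wrapper now includes the new header before them.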