Index: cfe/trunk/include/clang/Basic/BuiltinsNVPTX.def
===================================================================
--- cfe/trunk/include/clang/Basic/BuiltinsNVPTX.def
+++ cfe/trunk/include/clang/Basic/BuiltinsNVPTX.def
@@ -566,4 +566,40 @@
 BUILTIN(__nvvm_compiler_error, "vcC*4", "n")
 BUILTIN(__nvvm_compiler_warn, "vcC*4", "n")

+// __ldg. This is not implemented as a builtin by nvcc.
+BUILTIN(__nvvm_ldg_c, "ccC*", "")
+BUILTIN(__nvvm_ldg_s, "ssC*", "")
+BUILTIN(__nvvm_ldg_i, "iiC*", "")
+BUILTIN(__nvvm_ldg_l, "LiLiC*", "")
+BUILTIN(__nvvm_ldg_ll, "LLiLLiC*", "")
+
+BUILTIN(__nvvm_ldg_uc, "UcUcC*", "")
+BUILTIN(__nvvm_ldg_us, "UsUsC*", "")
+BUILTIN(__nvvm_ldg_ui, "UiUiC*", "")
+BUILTIN(__nvvm_ldg_ul, "ULiULiC*", "")
+BUILTIN(__nvvm_ldg_ull, "ULLiULLiC*", "")
+
+BUILTIN(__nvvm_ldg_f, "ffC*", "")
+BUILTIN(__nvvm_ldg_d, "ddC*", "")
+
+BUILTIN(__nvvm_ldg_c2, "E2cE2cC*", "")
+BUILTIN(__nvvm_ldg_c4, "E4cE4cC*", "")
+BUILTIN(__nvvm_ldg_s2, "E2sE2sC*", "")
+BUILTIN(__nvvm_ldg_s4, "E4sE4sC*", "")
+BUILTIN(__nvvm_ldg_i2, "E2iE2iC*", "")
+BUILTIN(__nvvm_ldg_i4, "E4iE4iC*", "")
+BUILTIN(__nvvm_ldg_ll2, "E2LLiE2LLiC*", "")
+
+BUILTIN(__nvvm_ldg_uc2, "E2UcE2UcC*", "")
+BUILTIN(__nvvm_ldg_uc4, "E4UcE4UcC*", "")
+BUILTIN(__nvvm_ldg_us2, "E2UsE2UsC*", "")
+BUILTIN(__nvvm_ldg_us4, "E4UsE4UsC*", "")
+BUILTIN(__nvvm_ldg_ui2, "E2UiE2UiC*", "")
+BUILTIN(__nvvm_ldg_ui4, "E4UiE4UiC*", "")
+BUILTIN(__nvvm_ldg_ull2, "E2ULLiE2ULLiC*", "")
+
+BUILTIN(__nvvm_ldg_f2, "E2fE2fC*", "")
+BUILTIN(__nvvm_ldg_f4, "E4fE4fC*", "")
+BUILTIN(__nvvm_ldg_d2, "E2dE2dC*", "")
+
 #undef BUILTIN
Index: cfe/trunk/lib/CodeGen/CGBuiltin.cpp
===================================================================
--- cfe/trunk/lib/CodeGen/CGBuiltin.cpp
+++ cfe/trunk/lib/CodeGen/CGBuiltin.cpp
@@ -7349,6 +7349,17 @@

 Value *CodeGenFunction::EmitNVPTXBuiltinExpr(unsigned BuiltinID,
                                              const CallExpr *E) {
+  auto MakeLdg = [&](unsigned IntrinsicID) {
+    Value *Ptr = EmitScalarExpr(E->getArg(0));
+    AlignmentSource AlignSource;
+    clang::CharUnits Align =
+        getNaturalPointeeTypeAlignment(E->getArg(0)->getType(), &AlignSource);
+    return Builder.CreateCall(
+        CGM.getIntrinsic(IntrinsicID, {Ptr->getType()->getPointerElementType(),
+                                       Ptr->getType()}),
+        {Ptr, ConstantInt::get(Builder.getInt32Ty(), Align.getQuantity())});
+  };
+
   switch (BuiltinID) {
   case NVPTX::BI__nvvm_atom_add_gen_i:
   case NVPTX::BI__nvvm_atom_add_gen_l:
@@ -7433,6 +7444,40 @@
     return Builder.CreateCall(FnALD32, {Ptr, Val});
   }

+  case NVPTX::BI__nvvm_ldg_c:
+  case NVPTX::BI__nvvm_ldg_c2:
+  case NVPTX::BI__nvvm_ldg_c4:
+  case NVPTX::BI__nvvm_ldg_s:
+  case NVPTX::BI__nvvm_ldg_s2:
+  case NVPTX::BI__nvvm_ldg_s4:
+  case NVPTX::BI__nvvm_ldg_i:
+  case NVPTX::BI__nvvm_ldg_i2:
+  case NVPTX::BI__nvvm_ldg_i4:
+  case NVPTX::BI__nvvm_ldg_l:
+  case NVPTX::BI__nvvm_ldg_ll:
+  case NVPTX::BI__nvvm_ldg_ll2:
+  case NVPTX::BI__nvvm_ldg_uc:
+  case NVPTX::BI__nvvm_ldg_uc2:
+  case NVPTX::BI__nvvm_ldg_uc4:
+  case NVPTX::BI__nvvm_ldg_us:
+  case NVPTX::BI__nvvm_ldg_us2:
+  case NVPTX::BI__nvvm_ldg_us4:
+  case NVPTX::BI__nvvm_ldg_ui:
+  case NVPTX::BI__nvvm_ldg_ui2:
+  case NVPTX::BI__nvvm_ldg_ui4:
+  case NVPTX::BI__nvvm_ldg_ul:
+  case NVPTX::BI__nvvm_ldg_ull:
+  case NVPTX::BI__nvvm_ldg_ull2:
+    // PTX Interoperability section 2.2: "For a vector with an even number of
+    // elements, its alignment is set to number of elements times the alignment
+    // of its member: n*alignof(t)."
+    return MakeLdg(Intrinsic::nvvm_ldg_global_i);
+  case NVPTX::BI__nvvm_ldg_f:
+  case NVPTX::BI__nvvm_ldg_f2:
+  case NVPTX::BI__nvvm_ldg_f4:
+  case NVPTX::BI__nvvm_ldg_d:
+  case NVPTX::BI__nvvm_ldg_d2:
+    return MakeLdg(Intrinsic::nvvm_ldg_global_f);
   default:
     return nullptr;
   }
Index: cfe/trunk/lib/Headers/CMakeLists.txt
===================================================================
--- cfe/trunk/lib/Headers/CMakeLists.txt
+++ cfe/trunk/lib/Headers/CMakeLists.txt
@@ -21,6 +21,7 @@
   bmi2intrin.h
   bmiintrin.h
   __clang_cuda_cmath.h
+  __clang_cuda_intrinsics.h
   __clang_cuda_math_forward_declares.h
   __clang_cuda_runtime_wrapper.h
   cpuid.h
Index: cfe/trunk/lib/Headers/__clang_cuda_intrinsics.h
===================================================================
--- cfe/trunk/lib/Headers/__clang_cuda_intrinsics.h
+++ cfe/trunk/lib/Headers/__clang_cuda_intrinsics.h
@@ -0,0 +1,256 @@
+/*===--- __clang_cuda_intrinsics.h - Device-side CUDA intrinsic wrappers ---===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+#ifndef __CLANG_CUDA_INTRINSICS_H__
+#define __CLANG_CUDA_INTRINSICS_H__
+#ifndef __CUDA__
+#error "This file is for CUDA compilation only."
+#endif
+
+// sm_32 intrinsics: __ldg and __funnelshift_{l,lc,r,rc}.
+
+// Prevent the vanilla sm_32 intrinsics header from being included.
+#define __SM_32_INTRINSICS_H__
+#define __SM_32_INTRINSICS_HPP__
+
+#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 320
+
+inline __device__ char __ldg(const char *ptr) { return __nvvm_ldg_c(ptr); }
+inline __device__ short __ldg(const short *ptr) { return __nvvm_ldg_s(ptr); }
+inline __device__ int __ldg(const int *ptr) { return __nvvm_ldg_i(ptr); }
+inline __device__ long __ldg(const long *ptr) { return __nvvm_ldg_l(ptr); }
+inline __device__ long long __ldg(const long long *ptr) {
+  return __nvvm_ldg_ll(ptr);
+}
+inline __device__ unsigned char __ldg(const unsigned char *ptr) {
+  return __nvvm_ldg_uc(ptr);
+}
+inline __device__ unsigned short __ldg(const unsigned short *ptr) {
+  return __nvvm_ldg_us(ptr);
+}
+inline __device__ unsigned int __ldg(const unsigned int *ptr) {
+  return __nvvm_ldg_ui(ptr);
+}
+inline __device__ unsigned long __ldg(const unsigned long *ptr) {
+  return __nvvm_ldg_ul(ptr);
+}
+inline __device__ unsigned long long __ldg(const unsigned long long *ptr) {
+  return __nvvm_ldg_ull(ptr);
+}
+inline __device__ float __ldg(const float *ptr) { return __nvvm_ldg_f(ptr); }
+inline __device__ double __ldg(const double *ptr) { return __nvvm_ldg_d(ptr); }
+
+inline __device__ char2 __ldg(const char2 *ptr) {
+  typedef char c2 __attribute__((ext_vector_type(2)));
+  // We can assume that ptr is aligned at least to char2's alignment, but the
+  // load will assume that ptr is aligned to c2's alignment. This is only
+  // safe if alignof(c2) <= alignof(char2).
+  c2 rv = __nvvm_ldg_c2(reinterpret_cast<const c2 *>(ptr));
+  char2 ret;
+  ret.x = rv[0];
+  ret.y = rv[1];
+  return ret;
+}
+inline __device__ char4 __ldg(const char4 *ptr) {
+  typedef char c4 __attribute__((ext_vector_type(4)));
+  c4 rv = __nvvm_ldg_c4(reinterpret_cast<const c4 *>(ptr));
+  char4 ret;
+  ret.w = rv[0];
+  ret.x = rv[1];
+  ret.y = rv[2];
+  ret.z = rv[3];
+  return ret;
+}
+inline __device__ short2 __ldg(const short2 *ptr) {
+  typedef short s2 __attribute__((ext_vector_type(2)));
+  s2 rv = __nvvm_ldg_s2(reinterpret_cast<const s2 *>(ptr));
+  short2 ret;
+  ret.x = rv[0];
+  ret.y = rv[1];
+  return ret;
+}
+inline __device__ short4 __ldg(const short4 *ptr) {
+  typedef short s4 __attribute__((ext_vector_type(4)));
+  s4 rv = __nvvm_ldg_s4(reinterpret_cast<const s4 *>(ptr));
+  short4 ret;
+  ret.w = rv[0];
+  ret.x = rv[1];
+  ret.y = rv[2];
+  ret.z = rv[3];
+  return ret;
+}
+inline __device__ int2 __ldg(const int2 *ptr) {
+  typedef int i2 __attribute__((ext_vector_type(2)));
+  i2 rv = __nvvm_ldg_i2(reinterpret_cast<const i2 *>(ptr));
+  int2 ret;
+  ret.x = rv[0];
+  ret.y = rv[1];
+  return ret;
+}
+inline __device__ int4 __ldg(const int4 *ptr) {
+  typedef int i4 __attribute__((ext_vector_type(4)));
+  i4 rv = __nvvm_ldg_i4(reinterpret_cast<const i4 *>(ptr));
+  int4 ret;
+  ret.w = rv[0];
+  ret.x = rv[1];
+  ret.y = rv[2];
+  ret.z = rv[3];
+  return ret;
+}
+inline __device__ longlong2 __ldg(const longlong2 *ptr) {
+  typedef long long ll2 __attribute__((ext_vector_type(2)));
+  ll2 rv = __nvvm_ldg_ll2(reinterpret_cast<const ll2 *>(ptr));
+  longlong2 ret;
+  ret.x = rv[0];
+  ret.y = rv[1];
+  return ret;
+}
+
+inline __device__ uchar2 __ldg(const uchar2 *ptr) {
+  typedef unsigned char uc2 __attribute__((ext_vector_type(2)));
+  uc2 rv = __nvvm_ldg_uc2(reinterpret_cast<const uc2 *>(ptr));
+  uchar2 ret;
+  ret.x = rv[0];
+  ret.y = rv[1];
+  return ret;
+}
+inline __device__ uchar4 __ldg(const uchar4 *ptr) {
+  typedef unsigned char uc4 __attribute__((ext_vector_type(4)));
+  uc4 rv = __nvvm_ldg_uc4(reinterpret_cast<const uc4 *>(ptr));
+  uchar4 ret;
+  ret.w = rv[0];
+  ret.x = rv[1];
+  ret.y = rv[2];
+  ret.z = rv[3];
+  return ret;
+}
+inline __device__ ushort2 __ldg(const ushort2 *ptr) {
+  typedef unsigned short us2 __attribute__((ext_vector_type(2)));
+  us2 rv = __nvvm_ldg_us2(reinterpret_cast<const us2 *>(ptr));
+  ushort2 ret;
+  ret.x = rv[0];
+  ret.y = rv[1];
+  return ret;
+}
+inline __device__ ushort4 __ldg(const ushort4 *ptr) {
+  typedef unsigned short us4 __attribute__((ext_vector_type(4)));
+  us4 rv = __nvvm_ldg_us4(reinterpret_cast<const us4 *>(ptr));
+  ushort4 ret;
+  ret.w = rv[0];
+  ret.x = rv[1];
+  ret.y = rv[2];
+  ret.z = rv[3];
+  return ret;
+}
+inline __device__ uint2 __ldg(const uint2 *ptr) {
+  typedef unsigned int ui2 __attribute__((ext_vector_type(2)));
+  ui2 rv = __nvvm_ldg_ui2(reinterpret_cast<const ui2 *>(ptr));
+  uint2 ret;
+  ret.x = rv[0];
+  ret.y = rv[1];
+  return ret;
+}
+inline __device__ uint4 __ldg(const uint4 *ptr) {
+  typedef unsigned int ui4 __attribute__((ext_vector_type(4)));
+  ui4 rv = __nvvm_ldg_ui4(reinterpret_cast<const ui4 *>(ptr));
+  uint4 ret;
+  ret.w = rv[0];
+  ret.x = rv[1];
+  ret.y = rv[2];
+  ret.z = rv[3];
+  return ret;
+}
+inline __device__ ulonglong2 __ldg(const ulonglong2 *ptr) {
+  typedef unsigned long long ull2 __attribute__((ext_vector_type(2)));
+  ull2 rv = __nvvm_ldg_ull2(reinterpret_cast<const ull2 *>(ptr));
+  ulonglong2 ret;
+  ret.x = rv[0];
+  ret.y = rv[1];
+  return ret;
+}
+
+inline __device__ float2 __ldg(const float2 *ptr) {
+  typedef float f2 __attribute__((ext_vector_type(2)));
+  f2 rv = __nvvm_ldg_f2(reinterpret_cast<const f2 *>(ptr));
+  float2 ret;
+  ret.x = rv[0];
+  ret.y = rv[1];
+  return ret;
+}
+inline __device__ float4 __ldg(const float4 *ptr) {
+  typedef float f4 __attribute__((ext_vector_type(4)));
+  f4 rv = __nvvm_ldg_f4(reinterpret_cast<const f4 *>(ptr));
+  float4 ret;
+  ret.w = rv[0];
+  ret.x = rv[1];
+  ret.y = rv[2];
+  ret.z = rv[3];
+  return ret;
+}
+inline __device__ double2 __ldg(const double2 *ptr) {
+  typedef double d2 __attribute__((ext_vector_type(2)));
+  d2 rv = __nvvm_ldg_d2(reinterpret_cast<const d2 *>(ptr));
+  double2 ret;
+  ret.x = rv[0];
+  ret.y = rv[1];
+  return ret;
+}
+
+// TODO: Implement these as intrinsics, so the backend can work its magic on
+// these. Alternatively, we could implement these as plain C and try to get
+// llvm to recognize the relevant patterns.
+inline __device__ unsigned __funnelshift_l(unsigned low32, unsigned high32,
+                                           unsigned shiftWidth) {
+  unsigned result;
+  asm("shf.l.wrap.b32 %0, %1, %2, %3;"
+      : "=r"(result)
+      : "r"(low32), "r"(high32), "r"(shiftWidth));
+  return result;
+}
+inline __device__ unsigned __funnelshift_lc(unsigned low32, unsigned high32,
+                                            unsigned shiftWidth) {
+  unsigned result;
+  asm("shf.l.clamp.b32 %0, %1, %2, %3;"
+      : "=r"(result)
+      : "r"(low32), "r"(high32), "r"(shiftWidth));
+  return result;
+}
+inline __device__ unsigned __funnelshift_r(unsigned low32, unsigned high32,
+                                           unsigned shiftWidth) {
+  unsigned result;
+  asm("shf.r.wrap.b32 %0, %1, %2, %3;"
+      : "=r"(result)
+      : "r"(low32), "r"(high32), "r"(shiftWidth));
+  return result;
+}
+inline __device__ unsigned __funnelshift_rc(unsigned low32, unsigned high32,
+                                            unsigned shiftWidth) {
+  unsigned ret;
+  asm("shf.r.clamp.b32 %0, %1, %2, %3;"
+      : "=r"(ret)
+      : "r"(low32), "r"(high32), "r"(shiftWidth));
+  return ret;
+}
+
+#endif // !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 320
+
+#endif // defined(__CLANG_CUDA_INTRINSICS_H__)
Index: cfe/trunk/lib/Headers/__clang_cuda_runtime_wrapper.h
===================================================================
--- cfe/trunk/lib/Headers/__clang_cuda_runtime_wrapper.h
+++ cfe/trunk/lib/Headers/__clang_cuda_runtime_wrapper.h
@@ -188,7 +188,10 @@
 // sm_30_intrinsics.h has declarations that use default argument, so
 // we have to include it and it will in turn include .hpp
 #include "sm_30_intrinsics.h"
-#include "sm_32_intrinsics.hpp"
+
+// Don't include sm_32_intrinsics.h. That header defines __ldg using inline
+// asm, but we want to define it using builtins, because we can't use the
+// [addr+imm] addressing mode if we use the inline asm in the header.

 #undef __MATH_FUNCTIONS_HPP__

@@ -278,6 +281,7 @@
 }

 #include <__clang_cuda_cmath.h>
+#include <__clang_cuda_intrinsics.h>

 // curand_mtgp32_kernel helpfully redeclares blockDim and threadIdx in host
 // mode, giving them their "proper" types of dim3 and uint3.  This is
Index: cfe/trunk/test/CodeGen/builtins-nvptx.c
===================================================================
--- cfe/trunk/test/CodeGen/builtins-nvptx.c
+++ cfe/trunk/test/CodeGen/builtins-nvptx.c
@@ -1,6 +1,8 @@
 // REQUIRES: nvptx-registered-target
-// RUN: %clang_cc1 -triple nvptx-unknown-unknown -fcuda-is-device -S -emit-llvm -o - -x cuda %s | FileCheck %s
-// RUN: %clang_cc1 -triple nvptx64-unknown-unknown -fcuda-is-device -S -emit-llvm -o - -x cuda %s | FileCheck %s
+// RUN: %clang_cc1 -triple nvptx-unknown-unknown -fcuda-is-device -S -emit-llvm -o - -x cuda %s | \
+// RUN:   FileCheck -check-prefix=CHECK -check-prefix=LP32 %s
+// RUN: %clang_cc1 -triple nvptx64-unknown-unknown -fcuda-is-device -S -emit-llvm -o - -x cuda %s | \
+// RUN:   FileCheck -check-prefix=CHECK -check-prefix=LP64 %s

 #define __device__ __attribute__((device))
 #define __global__ __attribute__((global))
@@ -280,3 +282,103 @@

 // CHECK: ret
 }
+
+// CHECK-LABEL: nvvm_ldg
+__device__ void nvvm_ldg(const void *p) {
+  // CHECK: call i8 @llvm.nvvm.ldg.global.i.i8.p0i8(i8* {{%[0-9]+}}, i32 1)
+  // CHECK: call i8 @llvm.nvvm.ldg.global.i.i8.p0i8(i8* {{%[0-9]+}}, i32 1)
+  __nvvm_ldg_c((const char *)p);
+  __nvvm_ldg_uc((const unsigned char *)p);
+
+  // CHECK: call i16 @llvm.nvvm.ldg.global.i.i16.p0i16(i16* {{%[0-9]+}}, i32 2)
+  // CHECK: call i16 @llvm.nvvm.ldg.global.i.i16.p0i16(i16* {{%[0-9]+}}, i32 2)
+  __nvvm_ldg_s((const short *)p);
+  __nvvm_ldg_us((const unsigned short *)p);
+
+  // CHECK: call i32 @llvm.nvvm.ldg.global.i.i32.p0i32(i32* {{%[0-9]+}}, i32 4)
+  // CHECK: call i32 @llvm.nvvm.ldg.global.i.i32.p0i32(i32* {{%[0-9]+}}, i32 4)
+  __nvvm_ldg_i((const int *)p);
+  __nvvm_ldg_ui((const unsigned int *)p);
+
+  // LP32: call i32 @llvm.nvvm.ldg.global.i.i32.p0i32(i32* {{%[0-9]+}}, i32 4)
+  // LP32: call i32 @llvm.nvvm.ldg.global.i.i32.p0i32(i32* {{%[0-9]+}}, i32 4)
+  // LP64: call i64 @llvm.nvvm.ldg.global.i.i64.p0i64(i64* {{%[0-9]+}}, i32 8)
+  // LP64: call i64 @llvm.nvvm.ldg.global.i.i64.p0i64(i64* {{%[0-9]+}}, i32 8)
+  __nvvm_ldg_l((const long *)p);
+  __nvvm_ldg_ul((const unsigned long *)p);
+
+  // CHECK: call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* {{%[0-9]+}}, i32 4)
+  __nvvm_ldg_f((const float *)p);
+  // CHECK: call double @llvm.nvvm.ldg.global.f.f64.p0f64(double* {{%[0-9]+}}, i32 8)
+  __nvvm_ldg_d((const double *)p);
+
+  // In practice, the pointers we pass to __ldg will be aligned as appropriate
+  // for the CUDA <type>N vector types (e.g. short4), which are not the same as
+  // the LLVM vector types. However, each LLVM vector type has an alignment
+  // less than or equal to its corresponding CUDA type, so we're OK.
+  //
+  // PTX Interoperability section 2.2: "For a vector with an even number of
+  // elements, its alignment is set to number of elements times the alignment of
+  // its member: n*alignof(t)."
+
+  // CHECK: call <2 x i8> @llvm.nvvm.ldg.global.i.v2i8.p0v2i8(<2 x i8>* {{%[0-9]+}}, i32 2)
+  // CHECK: call <2 x i8> @llvm.nvvm.ldg.global.i.v2i8.p0v2i8(<2 x i8>* {{%[0-9]+}}, i32 2)
+  typedef char char2 __attribute__((ext_vector_type(2)));
+  typedef unsigned char uchar2 __attribute__((ext_vector_type(2)));
+  __nvvm_ldg_c2((const char2 *)p);
+  __nvvm_ldg_uc2((const uchar2 *)p);
+
+  // CHECK: call <4 x i8> @llvm.nvvm.ldg.global.i.v4i8.p0v4i8(<4 x i8>* {{%[0-9]+}}, i32 4)
+  // CHECK: call <4 x i8> @llvm.nvvm.ldg.global.i.v4i8.p0v4i8(<4 x i8>* {{%[0-9]+}}, i32 4)
+  typedef char char4 __attribute__((ext_vector_type(4)));
+  typedef unsigned char uchar4 __attribute__((ext_vector_type(4)));
+  __nvvm_ldg_c4((const char4 *)p);
+  __nvvm_ldg_uc4((const uchar4 *)p);
+
+  // CHECK: call <2 x i16> @llvm.nvvm.ldg.global.i.v2i16.p0v2i16(<2 x i16>* {{%[0-9]+}}, i32 4)
+  // CHECK: call <2 x i16> @llvm.nvvm.ldg.global.i.v2i16.p0v2i16(<2 x i16>* {{%[0-9]+}}, i32 4)
+  typedef short short2 __attribute__((ext_vector_type(2)));
+  typedef unsigned short ushort2 __attribute__((ext_vector_type(2)));
+  __nvvm_ldg_s2((const short2 *)p);
+  __nvvm_ldg_us2((const ushort2 *)p);
+
+  // CHECK: call <4 x i16> @llvm.nvvm.ldg.global.i.v4i16.p0v4i16(<4 x i16>* {{%[0-9]+}}, i32 8)
+  // CHECK: call <4 x i16> @llvm.nvvm.ldg.global.i.v4i16.p0v4i16(<4 x i16>* {{%[0-9]+}}, i32 8)
+  typedef short short4 __attribute__((ext_vector_type(4)));
+  typedef unsigned short ushort4 __attribute__((ext_vector_type(4)));
+  __nvvm_ldg_s4((const short4 *)p);
+  __nvvm_ldg_us4((const ushort4 *)p);
+
+  // CHECK: call <2 x i32> @llvm.nvvm.ldg.global.i.v2i32.p0v2i32(<2 x i32>* {{%[0-9]+}}, i32 8)
+  // CHECK: call <2 x i32> @llvm.nvvm.ldg.global.i.v2i32.p0v2i32(<2 x i32>* {{%[0-9]+}}, i32 8)
+  typedef int int2 __attribute__((ext_vector_type(2)));
+  typedef unsigned int uint2 __attribute__((ext_vector_type(2)));
+  __nvvm_ldg_i2((const int2 *)p);
+  __nvvm_ldg_ui2((const uint2 *)p);
+
+  // CHECK: call <4 x i32> @llvm.nvvm.ldg.global.i.v4i32.p0v4i32(<4 x i32>* {{%[0-9]+}}, i32 16)
+  // CHECK: call <4 x i32> @llvm.nvvm.ldg.global.i.v4i32.p0v4i32(<4 x i32>* {{%[0-9]+}}, i32 16)
+  typedef int int4 __attribute__((ext_vector_type(4)));
+  typedef unsigned int uint4 __attribute__((ext_vector_type(4)));
+  __nvvm_ldg_i4((const int4 *)p);
+  __nvvm_ldg_ui4((const uint4 *)p);
+
+  // CHECK: call <2 x i64> @llvm.nvvm.ldg.global.i.v2i64.p0v2i64(<2 x i64>* {{%[0-9]+}}, i32 16)
+  // CHECK: call <2 x i64> @llvm.nvvm.ldg.global.i.v2i64.p0v2i64(<2 x i64>* {{%[0-9]+}}, i32 16)
+  typedef long long longlong2 __attribute__((ext_vector_type(2)));
+  typedef unsigned long long ulonglong2 __attribute__((ext_vector_type(2)));
+  __nvvm_ldg_ll2((const longlong2 *)p);
+  __nvvm_ldg_ull2((const ulonglong2 *)p);
+
+  // CHECK: call <2 x float> @llvm.nvvm.ldg.global.f.v2f32.p0v2f32(<2 x float>* {{%[0-9]+}}, i32 8)
+  typedef float float2 __attribute__((ext_vector_type(2)));
+  __nvvm_ldg_f2((const float2 *)p);
+
+  // CHECK: call <4 x float> @llvm.nvvm.ldg.global.f.v4f32.p0v4f32(<4 x float>* {{%[0-9]+}}, i32 16)
+  typedef float float4 __attribute__((ext_vector_type(4)));
+  __nvvm_ldg_f4((const float4 *)p);
+
+  // CHECK: call <2 x double> @llvm.nvvm.ldg.global.f.v2f64.p0v2f64(<2 x double>* {{%[0-9]+}}, i32 16)
+  typedef double double2 __attribute__((ext_vector_type(2)));
+  __nvvm_ldg_d2((const double2 *)p);
+}
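
Usage sketch (editor's note, not part of the patch): the kernel below only illustrates how device code is expected to call the overloaded __ldg wrappers and one of the __funnelshift helpers once __clang_cuda_intrinsics.h is pulled in by __clang_cuda_runtime_wrapper.h. It assumes compilation for sm_32 or newer; the kernel name and parameters are invented for the example.

__global__ void scale_by_two(const float4 *__restrict__ in, float4 *out, int n,
                             unsigned lo, unsigned hi) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) {
    // Uses the float4 overload added above; per the test expectations, clang
    // lowers it to @llvm.nvvm.ldg.global.f.v4f32 with alignment 16.
    float4 v = __ldg(&in[i]);
    v.x *= 2.0f;
    v.y *= 2.0f;
    v.z *= 2.0f;
    v.w *= 2.0f;
    out[i] = v;
  }
  // __funnelshift_l concatenates hi:lo, shifts left by (3 & 31) bits, and
  // returns the most-significant 32 bits (shf.l.wrap.b32).
  unsigned shifted = __funnelshift_l(lo, hi, 3);
  (void)shifted;
}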