diff --git a/clang/include/clang/Basic/BuiltinsNVPTX.def b/clang/include/clang/Basic/BuiltinsNVPTX.def --- a/clang/include/clang/Basic/BuiltinsNVPTX.def +++ b/clang/include/clang/Basic/BuiltinsNVPTX.def @@ -795,6 +795,7 @@ BUILTIN(__nvvm_ldg_ul, "ULiULiC*", "") BUILTIN(__nvvm_ldg_ull, "ULLiULLiC*", "") +BUILTIN(__nvvm_ldg_h, "hhC*", "") BUILTIN(__nvvm_ldg_f, "ffC*", "") BUILTIN(__nvvm_ldg_d, "ddC*", "") @@ -814,6 +815,7 @@ BUILTIN(__nvvm_ldg_ui4, "E4UiE4UiC*", "") BUILTIN(__nvvm_ldg_ull2, "E2ULLiE2ULLiC*", "") +BUILTIN(__nvvm_ldg_h2, "E2hE2hC*", "") BUILTIN(__nvvm_ldg_f2, "E2fE2fC*", "") BUILTIN(__nvvm_ldg_f4, "E4fE4fC*", "") BUILTIN(__nvvm_ldg_d2, "E2dE2dC*", "") diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -18228,7 +18228,9 @@ // elements, its alignment is set to number of elements times the alignment // of its member: n*alignof(t)." return MakeLdg(Intrinsic::nvvm_ldg_global_i); + case NVPTX::BI__nvvm_ldg_h: case NVPTX::BI__nvvm_ldg_f: + case NVPTX::BI__nvvm_ldg_h2: case NVPTX::BI__nvvm_ldg_f2: case NVPTX::BI__nvvm_ldg_f4: case NVPTX::BI__nvvm_ldg_d: diff --git a/clang/test/CodeGen/builtins-nvptx-native-half-type.c b/clang/test/CodeGen/builtins-nvptx-native-half-type.c --- a/clang/test/CodeGen/builtins-nvptx-native-half-type.c +++ b/clang/test/CodeGen/builtins-nvptx-native-half-type.c @@ -172,3 +172,12 @@ #endif // CHECK: ret void } + +// CHECK-LABEL: nvvm_ldg_native_half_types +__device__ void nvvm_ldg_native_half_types(const void *p) { + // CHECK: call half @llvm.nvvm.ldg.global.f.f16.p0 + __nvvm_ldg_h((const __fp16 *)p); + typedef __fp16 __fp16v2 __attribute__((ext_vector_type(2))); + // CHECK: call <2 x half> @llvm.nvvm.ldg.global.f.v2f16.p0 + __nvvm_ldg_h2((const __fp16v2 *)p); +} diff --git a/llvm/test/CodeGen/NVPTX/ldu-ldg.ll b/llvm/test/CodeGen/NVPTX/ldu-ldg.ll --- a/llvm/test/CodeGen/NVPTX/ldu-ldg.ll +++ b/llvm/test/CodeGen/NVPTX/ldu-ldg.ll @@ -4,34 +4,82 @@ declare i8 @llvm.nvvm.ldu.global.i.i8.p1(ptr addrspace(1) %ptr, i32 %align) declare i32 @llvm.nvvm.ldu.global.i.i32.p1(ptr addrspace(1) %ptr, i32 %align) + declare i8 @llvm.nvvm.ldg.global.i.i8.p1(ptr addrspace(1) %ptr, i32 %align) +declare i16 @llvm.nvvm.ldg.global.i.i16.p1(ptr addrspace(1) %ptr, i32 %align) declare i32 @llvm.nvvm.ldg.global.i.i32.p1(ptr addrspace(1) %ptr, i32 %align) +declare i64 @llvm.nvvm.ldg.global.i.i64.p1(ptr addrspace(1) %ptr, i32 %align) +declare float @llvm.nvvm.ldg.global.f.f32.p1(ptr addrspace(1) %ptr, i32 %align) +declare double @llvm.nvvm.ldg.global.f.f64.p1(ptr addrspace(1) %ptr, i32 %align) +declare half @llvm.nvvm.ldg.global.f.f16.p1(ptr addrspace(1) %ptr, i32 %align) +declare <2 x half> @llvm.nvvm.ldg.global.f.v2f16.p1(ptr addrspace(1) %ptr, i32 %align) - -; CHECK: func0 -define i8 @func0(ptr addrspace(1) %ptr) { +; CHECK: test_ldu_i8 +define i8 @test_ldu_i8(ptr addrspace(1) %ptr) { ; ldu.global.u8 %val = tail call i8 @llvm.nvvm.ldu.global.i.i8.p1(ptr addrspace(1) %ptr, i32 4) ret i8 %val } -; CHECK: func1 -define i32 @func1(ptr addrspace(1) %ptr) { +; CHECK: test_ldu_i32 +define i32 @test_ldu_i32(ptr addrspace(1) %ptr) { ; ldu.global.u32 %val = tail call i32 @llvm.nvvm.ldu.global.i.i32.p1(ptr addrspace(1) %ptr, i32 4) ret i32 %val } -; CHECK: func2 -define i8 @func2(ptr addrspace(1) %ptr) { +; CHECK: test_ldg_i8 +define i8 @test_ldg_i8(ptr addrspace(1) %ptr) { ; ld.global.nc.u8 %val = tail call i8 @llvm.nvvm.ldg.global.i.i8.p1(ptr addrspace(1) %ptr, i32 4) ret i8 %val } -; 
CHECK: func3
-define i32 @func3(ptr addrspace(1) %ptr) {
+; CHECK: test_ldg_i16
+define i16 @test_ldg_i16(ptr addrspace(1) %ptr) {
+; ld.global.nc.u16
+  %val = tail call i16 @llvm.nvvm.ldg.global.i.i16.p1(ptr addrspace(1) %ptr, i32 4)
+  ret i16 %val
+}
+
+; CHECK: test_ldg_i32
+define i32 @test_ldg_i32(ptr addrspace(1) %ptr) {
 ; ld.global.nc.u32
   %val = tail call i32 @llvm.nvvm.ldg.global.i.i32.p1(ptr addrspace(1) %ptr, i32 4)
   ret i32 %val
 }
+
+; CHECK: test_ldg_i64
+define i64 @test_ldg_i64(ptr addrspace(1) %ptr) {
+; ld.global.nc.u64
+  %val = tail call i64 @llvm.nvvm.ldg.global.i.i64.p1(ptr addrspace(1) %ptr, i32 8)
+  ret i64 %val
+}
+
+; CHECK: test_ldg_f32
+define float @test_ldg_f32(ptr addrspace(1) %ptr) {
+; ld.global.nc.f32
+  %val = tail call float @llvm.nvvm.ldg.global.f.f32.p1(ptr addrspace(1) %ptr, i32 4)
+  ret float %val
+}
+
+; CHECK: test_ldg_f64
+define double @test_ldg_f64(ptr addrspace(1) %ptr) {
+; ld.global.nc.f64
+  %val = tail call double @llvm.nvvm.ldg.global.f.f64.p1(ptr addrspace(1) %ptr, i32 8)
+  ret double %val
+}
+
+; CHECK: test_ldg_f16
+define half @test_ldg_f16(ptr addrspace(1) %ptr) {
+; ld.global.nc.b16
+  %val = tail call half @llvm.nvvm.ldg.global.f.f16.p1(ptr addrspace(1) %ptr, i32 4)
+  ret half %val
+}
+
+; CHECK: test_ldg_v2f16
+define <2 x half> @test_ldg_v2f16(ptr addrspace(1) %ptr) {
+; ld.global.nc.b32
+  %val = tail call <2 x half> @llvm.nvvm.ldg.global.f.v2f16.p1(ptr addrspace(1) %ptr, i32 4)
+  ret <2 x half> %val
+}
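
Usage note (not part of the patch): a minimal CUDA sketch of how the new __nvvm_ldg_h / __nvvm_ldg_h2 builtins could be exercised from device code. The kernel names and the __fp16v2 typedef are illustrative (the typedef mirrors the one in the clang test above), and the sketch assumes clang is the CUDA compiler with native half-type support enabled, e.g. -fnative-half-type as exercised by builtins-nvptx-native-half-type.c.

// Illustrative only; assumes clang CUDA with native half-type support.
typedef __fp16 __fp16v2 __attribute__((ext_vector_type(2)));

// Each thread does a read-only, non-coherent (ld.global.nc) load of one
// __fp16 value via the new scalar builtin and doubles it.
__global__ void scale_fp16(const __fp16 *in, __fp16 *out, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) {
    __fp16 v = __nvvm_ldg_h(&in[i]);
    out[i] = v + v;
  }
}

// Same idea with the two-element vector builtin; per the IR test above,
// the load lowers to a single 32-bit non-coherent load (ld.global.nc.b32).
__global__ void copy_fp16x2(const __fp16v2 *in, __fp16v2 *out, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) {
    out[i] = __nvvm_ldg_h2(&in[i]);
  }
}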