diff --git a/clang/include/clang/Basic/BuiltinsNVPTX.def b/clang/include/clang/Basic/BuiltinsNVPTX.def --- a/clang/include/clang/Basic/BuiltinsNVPTX.def +++ b/clang/include/clang/Basic/BuiltinsNVPTX.def @@ -782,7 +782,43 @@ BUILTIN(__nvvm_compiler_error, "vcC*4", "n") BUILTIN(__nvvm_compiler_warn, "vcC*4", "n") -// __ldg. This is not implemented as a builtin by nvcc. +BUILTIN(__nvvm_ldu_c, "ccC*", "") +BUILTIN(__nvvm_ldu_s, "ssC*", "") +BUILTIN(__nvvm_ldu_i, "iiC*", "") +BUILTIN(__nvvm_ldu_l, "LiLiC*", "") +BUILTIN(__nvvm_ldu_ll, "LLiLLiC*", "") + +BUILTIN(__nvvm_ldu_uc, "UcUcC*", "") +BUILTIN(__nvvm_ldu_us, "UsUsC*", "") +BUILTIN(__nvvm_ldu_ui, "UiUiC*", "") +BUILTIN(__nvvm_ldu_ul, "ULiULiC*", "") +BUILTIN(__nvvm_ldu_ull, "ULLiULLiC*", "") + +BUILTIN(__nvvm_ldu_h, "hhC*", "") +BUILTIN(__nvvm_ldu_f, "ffC*", "") +BUILTIN(__nvvm_ldu_d, "ddC*", "") + +BUILTIN(__nvvm_ldu_c2, "E2cE2cC*", "") +BUILTIN(__nvvm_ldu_c4, "E4cE4cC*", "") +BUILTIN(__nvvm_ldu_s2, "E2sE2sC*", "") +BUILTIN(__nvvm_ldu_s4, "E4sE4sC*", "") +BUILTIN(__nvvm_ldu_i2, "E2iE2iC*", "") +BUILTIN(__nvvm_ldu_i4, "E4iE4iC*", "") +BUILTIN(__nvvm_ldu_ll2, "E2LLiE2LLiC*", "") + +BUILTIN(__nvvm_ldu_uc2, "E2UcE2UcC*", "") +BUILTIN(__nvvm_ldu_uc4, "E4UcE4UcC*", "") +BUILTIN(__nvvm_ldu_us2, "E2UsE2UsC*", "") +BUILTIN(__nvvm_ldu_us4, "E4UsE4UsC*", "") +BUILTIN(__nvvm_ldu_ui2, "E2UiE2UiC*", "") +BUILTIN(__nvvm_ldu_ui4, "E4UiE4UiC*", "") +BUILTIN(__nvvm_ldu_ull2, "E2ULLiE2ULLiC*", "") + +BUILTIN(__nvvm_ldu_h2, "E2hE2hC*", "") +BUILTIN(__nvvm_ldu_f2, "E2fE2fC*", "") +BUILTIN(__nvvm_ldu_f4, "E4fE4fC*", "") +BUILTIN(__nvvm_ldu_d2, "E2dE2dC*", "") + BUILTIN(__nvvm_ldg_c, "ccC*", "") BUILTIN(__nvvm_ldg_s, "ssC*", "") BUILTIN(__nvvm_ldg_i, "iiC*", "") diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -18113,7 +18113,12 @@ Value * CodeGenFunction::EmitNVPTXBuiltinExpr(unsigned BuiltinID, const CallExpr *E) { - auto MakeLdg = [&](unsigned IntrinsicID) { + auto HasHalfSupport = [&](unsigned BuiltinID) { + auto &Context = getContext(); + return Context.getLangOpts().NativeHalfType || + !Context.getTargetInfo().useFP16ConversionIntrinsics(); + }; + auto MakeLdgLdu = [&](unsigned IntrinsicID) { Value *Ptr = EmitScalarExpr(E->getArg(0)); QualType ArgType = E->getArg(0)->getType(); clang::CharUnits Align = CGM.getNaturalPointeeTypeAlignment(ArgType); @@ -18239,15 +18244,63 @@ // PTX Interoperability section 2.2: "For a vector with an even number of // elements, its alignment is set to number of elements times the alignment // of its member: n*alignof(t)." - return MakeLdg(Intrinsic::nvvm_ldg_global_i); + return MakeLdgLdu(Intrinsic::nvvm_ldg_global_i); case NVPTX::BI__nvvm_ldg_h: - case NVPTX::BI__nvvm_ldg_f: case NVPTX::BI__nvvm_ldg_h2: + if (!HasHalfSupport(BuiltinID)) { + CGM.Error(E->getExprLoc(), + getContext().BuiltinInfo.getName(BuiltinID).str().append( + " requires native half type support.")); + return nullptr; + } + [[fallthrough]]; + case NVPTX::BI__nvvm_ldg_f: case NVPTX::BI__nvvm_ldg_f2: case NVPTX::BI__nvvm_ldg_f4: case NVPTX::BI__nvvm_ldg_d: case NVPTX::BI__nvvm_ldg_d2: - return MakeLdg(Intrinsic::nvvm_ldg_global_f); + return MakeLdgLdu(Intrinsic::nvvm_ldg_global_f); + + case NVPTX::BI__nvvm_ldu_c: + case NVPTX::BI__nvvm_ldu_c2: + case NVPTX::BI__nvvm_ldu_c4: + case NVPTX::BI__nvvm_ldu_s: + case NVPTX::BI__nvvm_ldu_s2: + case NVPTX::BI__nvvm_ldu_s4: + case NVPTX::BI__nvvm_ldu_i: + case NVPTX::BI__nvvm_ldu_i2: + case NVPTX::BI__nvvm_ldu_i4: + case NVPTX::BI__nvvm_ldu_l: + case NVPTX::BI__nvvm_ldu_ll: + case NVPTX::BI__nvvm_ldu_ll2: + case NVPTX::BI__nvvm_ldu_uc: + case NVPTX::BI__nvvm_ldu_uc2: + case NVPTX::BI__nvvm_ldu_uc4: + case NVPTX::BI__nvvm_ldu_us: + case NVPTX::BI__nvvm_ldu_us2: + case NVPTX::BI__nvvm_ldu_us4: + case NVPTX::BI__nvvm_ldu_ui: + case NVPTX::BI__nvvm_ldu_ui2: + case NVPTX::BI__nvvm_ldu_ui4: + case NVPTX::BI__nvvm_ldu_ul: + case NVPTX::BI__nvvm_ldu_ull: + case NVPTX::BI__nvvm_ldu_ull2: + return MakeLdgLdu(Intrinsic::nvvm_ldu_global_i); + case NVPTX::BI__nvvm_ldu_h: + case NVPTX::BI__nvvm_ldu_h2: + if (!HasHalfSupport(BuiltinID)) { + CGM.Error(E->getExprLoc(), + getContext().BuiltinInfo.getName(BuiltinID).str().append( + " requires native half type support.")); + return nullptr; + } + [[fallthrough]]; + case NVPTX::BI__nvvm_ldu_f: + case NVPTX::BI__nvvm_ldu_f2: + case NVPTX::BI__nvvm_ldu_f4: + case NVPTX::BI__nvvm_ldu_d: + case NVPTX::BI__nvvm_ldu_d2: + return MakeLdgLdu(Intrinsic::nvvm_ldu_global_f); case NVPTX::BI__nvvm_atom_cta_add_gen_i: case NVPTX::BI__nvvm_atom_cta_add_gen_l: diff --git a/clang/test/CodeGen/builtins-nvptx-native-half-type-err.c b/clang/test/CodeGen/builtins-nvptx-native-half-type-err.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/builtins-nvptx-native-half-type-err.c @@ -0,0 +1,21 @@ +// REQUIRES: nvptx-registered-target +// +// RUN: not %clang_cc1 -fsyntax-only -ffp-contract=off -triple nvptx-unknown-unknown -target-cpu \ +// RUN: sm_75 -target-feature +ptx70 -fcuda-is-device -x cuda -emit-llvm -o - %s 2>&1 \ +// RUN: | FileCheck -check-prefix=CHECK-ERROR %s + +#define __device__ __attribute__((device)) +typedef __fp16 __fp16v2 __attribute__((ext_vector_type(2))); + +__device__ void nvvm_ldg_ldu_native_half_types(const void *p) { + __nvvm_ldg_h((const __fp16 *)p); + __nvvm_ldg_h2((const __fp16v2 *)p); + + __nvvm_ldu_h((const __fp16 *)p); + __nvvm_ldu_h2((const __fp16v2 *)p); +} + +// CHECK-ERROR: error: __nvvm_ldg_h requires native half type support. +// CHECK-ERROR: error: __nvvm_ldg_h2 requires native half type support. +// CHECK-ERROR: error: __nvvm_ldu_h requires native half type support. +// CHECK-ERROR: error: __nvvm_ldu_h2 requires native half type support. diff --git a/clang/test/CodeGen/builtins-nvptx-native-half-type.c b/clang/test/CodeGen/builtins-nvptx-native-half-type.c --- a/clang/test/CodeGen/builtins-nvptx-native-half-type.c +++ b/clang/test/CodeGen/builtins-nvptx-native-half-type.c @@ -173,11 +173,20 @@ // CHECK: ret void } +typedef __fp16 __fp16v2 __attribute__((ext_vector_type(2))); + // CHECK-LABEL: nvvm_ldg_native_half_types __device__ void nvvm_ldg_native_half_types(const void *p) { // CHECK: call half @llvm.nvvm.ldg.global.f.f16.p0 __nvvm_ldg_h((const __fp16 *)p); - typedef __fp16 __fp16v2 __attribute__((ext_vector_type(2))); // CHECK: call <2 x half> @llvm.nvvm.ldg.global.f.v2f16.p0 __nvvm_ldg_h2((const __fp16v2 *)p); } + +// CHECK-LABEL: nvvm_ldu_native_half_types +__device__ void nvvm_ldu_native_half_types(const void *p) { + // CHECK: call half @llvm.nvvm.ldu.global.f.f16.p0 + __nvvm_ldu_h((const __fp16 *)p); + // CHECK: call <2 x half> @llvm.nvvm.ldu.global.f.v2f16.p0 + __nvvm_ldu_h2((const __fp16v2 *)p); +} diff --git a/clang/test/CodeGen/builtins-nvptx.c b/clang/test/CodeGen/builtins-nvptx.c --- a/clang/test/CodeGen/builtins-nvptx.c +++ b/clang/test/CodeGen/builtins-nvptx.c @@ -652,6 +652,97 @@ __nvvm_ldg_d2((const double2 *)p); } +// CHECK-LABEL: nvvm_ldu +__device__ void nvvm_ldu(const void *p) { + // CHECK: call i8 @llvm.nvvm.ldu.global.i.i8.p0(ptr {{%[0-9]+}}, i32 1) + // CHECK: call i8 @llvm.nvvm.ldu.global.i.i8.p0(ptr {{%[0-9]+}}, i32 1) + __nvvm_ldu_c((const char *)p); + __nvvm_ldu_uc((const unsigned char *)p); + + // CHECK: call i16 @llvm.nvvm.ldu.global.i.i16.p0(ptr {{%[0-9]+}}, i32 2) + // CHECK: call i16 @llvm.nvvm.ldu.global.i.i16.p0(ptr {{%[0-9]+}}, i32 2) + __nvvm_ldu_s((const short *)p); + __nvvm_ldu_us((const unsigned short *)p); + + // CHECK: call i32 @llvm.nvvm.ldu.global.i.i32.p0(ptr {{%[0-9]+}}, i32 4) + // CHECK: call i32 @llvm.nvvm.ldu.global.i.i32.p0(ptr {{%[0-9]+}}, i32 4) + __nvvm_ldu_i((const int *)p); + __nvvm_ldu_ui((const unsigned int *)p); + + // LP32: call i32 @llvm.nvvm.ldu.global.i.i32.p0(ptr {{%[0-9]+}}, i32 4) + // LP32: call i32 @llvm.nvvm.ldu.global.i.i32.p0(ptr {{%[0-9]+}}, i32 4) + // LP64: call i64 @llvm.nvvm.ldu.global.i.i64.p0(ptr {{%[0-9]+}}, i32 8) + // LP64: call i64 @llvm.nvvm.ldu.global.i.i64.p0(ptr {{%[0-9]+}}, i32 8) + __nvvm_ldu_l((const long *)p); + __nvvm_ldu_ul((const unsigned long *)p); + + // CHECK: call float @llvm.nvvm.ldu.global.f.f32.p0(ptr {{%[0-9]+}}, i32 4) + __nvvm_ldu_f((const float *)p); + // CHECK: call double @llvm.nvvm.ldu.global.f.f64.p0(ptr {{%[0-9]+}}, i32 8) + __nvvm_ldu_d((const double *)p); + + // CHECK: call <2 x i8> @llvm.nvvm.ldu.global.i.v2i8.p0(ptr {{%[0-9]+}}, i32 2) + // CHECK: call <2 x i8> @llvm.nvvm.ldu.global.i.v2i8.p0(ptr {{%[0-9]+}}, i32 2) + typedef char char2 __attribute__((ext_vector_type(2))); + typedef unsigned char uchar2 __attribute__((ext_vector_type(2))); + __nvvm_ldu_c2((const char2 *)p); + __nvvm_ldu_uc2((const uchar2 *)p); + + // CHECK: call <4 x i8> @llvm.nvvm.ldu.global.i.v4i8.p0(ptr {{%[0-9]+}}, i32 4) + // CHECK: call <4 x i8> @llvm.nvvm.ldu.global.i.v4i8.p0(ptr {{%[0-9]+}}, i32 4) + typedef char char4 __attribute__((ext_vector_type(4))); + typedef unsigned char uchar4 __attribute__((ext_vector_type(4))); + __nvvm_ldu_c4((const char4 *)p); + __nvvm_ldu_uc4((const uchar4 *)p); + + // CHECK: call <2 x i16> @llvm.nvvm.ldu.global.i.v2i16.p0(ptr {{%[0-9]+}}, i32 4) + // CHECK: call <2 x i16> @llvm.nvvm.ldu.global.i.v2i16.p0(ptr {{%[0-9]+}}, i32 4) + typedef short short2 __attribute__((ext_vector_type(2))); + typedef unsigned short ushort2 __attribute__((ext_vector_type(2))); + __nvvm_ldu_s2((const short2 *)p); + __nvvm_ldu_us2((const ushort2 *)p); + + // CHECK: call <4 x i16> @llvm.nvvm.ldu.global.i.v4i16.p0(ptr {{%[0-9]+}}, i32 8) + // CHECK: call <4 x i16> @llvm.nvvm.ldu.global.i.v4i16.p0(ptr {{%[0-9]+}}, i32 8) + typedef short short4 __attribute__((ext_vector_type(4))); + typedef unsigned short ushort4 __attribute__((ext_vector_type(4))); + __nvvm_ldu_s4((const short4 *)p); + __nvvm_ldu_us4((const ushort4 *)p); + + // CHECK: call <2 x i32> @llvm.nvvm.ldu.global.i.v2i32.p0(ptr {{%[0-9]+}}, i32 8) + // CHECK: call <2 x i32> @llvm.nvvm.ldu.global.i.v2i32.p0(ptr {{%[0-9]+}}, i32 8) + typedef int int2 __attribute__((ext_vector_type(2))); + typedef unsigned int uint2 __attribute__((ext_vector_type(2))); + __nvvm_ldu_i2((const int2 *)p); + __nvvm_ldu_ui2((const uint2 *)p); + + // CHECK: call <4 x i32> @llvm.nvvm.ldu.global.i.v4i32.p0(ptr {{%[0-9]+}}, i32 16) + // CHECK: call <4 x i32> @llvm.nvvm.ldu.global.i.v4i32.p0(ptr {{%[0-9]+}}, i32 16) + typedef int int4 __attribute__((ext_vector_type(4))); + typedef unsigned int uint4 __attribute__((ext_vector_type(4))); + __nvvm_ldu_i4((const int4 *)p); + __nvvm_ldu_ui4((const uint4 *)p); + + // CHECK: call <2 x i64> @llvm.nvvm.ldu.global.i.v2i64.p0(ptr {{%[0-9]+}}, i32 16) + // CHECK: call <2 x i64> @llvm.nvvm.ldu.global.i.v2i64.p0(ptr {{%[0-9]+}}, i32 16) + typedef long long longlong2 __attribute__((ext_vector_type(2))); + typedef unsigned long long ulonglong2 __attribute__((ext_vector_type(2))); + __nvvm_ldu_ll2((const longlong2 *)p); + __nvvm_ldu_ull2((const ulonglong2 *)p); + + // CHECK: call <2 x float> @llvm.nvvm.ldu.global.f.v2f32.p0(ptr {{%[0-9]+}}, i32 8) + typedef float float2 __attribute__((ext_vector_type(2))); + __nvvm_ldu_f2((const float2 *)p); + + // CHECK: call <4 x float> @llvm.nvvm.ldu.global.f.v4f32.p0(ptr {{%[0-9]+}}, i32 16) + typedef float float4 __attribute__((ext_vector_type(4))); + __nvvm_ldu_f4((const float4 *)p); + + // CHECK: call <2 x double> @llvm.nvvm.ldu.global.f.v2f64.p0(ptr {{%[0-9]+}}, i32 16) + typedef double double2 __attribute__((ext_vector_type(2))); + __nvvm_ldu_d2((const double2 *)p); +} + // CHECK-LABEL: nvvm_shfl __device__ void nvvm_shfl(int i, float f, int a, int b) { // CHECK: call i32 @llvm.nvvm.shfl.down.i32(i32 diff --git a/llvm/test/CodeGen/NVPTX/ldu-ldg.ll b/llvm/test/CodeGen/NVPTX/ldu-ldg.ll --- a/llvm/test/CodeGen/NVPTX/ldu-ldg.ll +++ b/llvm/test/CodeGen/NVPTX/ldu-ldg.ll @@ -3,7 +3,13 @@ declare i8 @llvm.nvvm.ldu.global.i.i8.p1(ptr addrspace(1) %ptr, i32 %align) +declare i16 @llvm.nvvm.ldu.global.i.i16.p1(ptr addrspace(1) %ptr, i32 %align) declare i32 @llvm.nvvm.ldu.global.i.i32.p1(ptr addrspace(1) %ptr, i32 %align) +declare i64 @llvm.nvvm.ldu.global.i.i64.p1(ptr addrspace(1) %ptr, i32 %align) +declare float @llvm.nvvm.ldu.global.f.f32.p1(ptr addrspace(1) %ptr, i32 %align) +declare double @llvm.nvvm.ldu.global.f.f64.p1(ptr addrspace(1) %ptr, i32 %align) +declare half @llvm.nvvm.ldu.global.f.f16.p1(ptr addrspace(1) %ptr, i32 %align) +declare <2 x half> @llvm.nvvm.ldu.global.f.v2f16.p1(ptr addrspace(1) %ptr, i32 %align) declare i8 @llvm.nvvm.ldg.global.i.i8.p1(ptr addrspace(1) %ptr, i32 %align) declare i16 @llvm.nvvm.ldg.global.i.i16.p1(ptr addrspace(1) %ptr, i32 %align) @@ -16,70 +22,112 @@ ; CHECK: test_ldu_i8 define i8 @test_ldu_i8(ptr addrspace(1) %ptr) { -; ldu.global.u8 + ; CHECK: ldu.global.u8 %val = tail call i8 @llvm.nvvm.ldu.global.i.i8.p1(ptr addrspace(1) %ptr, i32 4) ret i8 %val } +; CHECK: test_ldu_i16 +define i16 @test_ldu_i16(ptr addrspace(1) %ptr) { + ; CHECK: ldu.global.u16 + %val = tail call i16 @llvm.nvvm.ldu.global.i.i16.p1(ptr addrspace(1) %ptr, i32 2) + ret i16 %val +} + ; CHECK: test_ldu_i32 define i32 @test_ldu_i32(ptr addrspace(1) %ptr) { -; ldu.global.u32 + ; CHECK: ldu.global.u32 %val = tail call i32 @llvm.nvvm.ldu.global.i.i32.p1(ptr addrspace(1) %ptr, i32 4) ret i32 %val } +; CHECK: test_ldu_i64 +define i64 @test_ldu_i64(ptr addrspace(1) %ptr) { + ; CHECK: ldu.global.u64 + %val = tail call i64 @llvm.nvvm.ldu.global.i.i64.p1(ptr addrspace(1) %ptr, i32 8) + ret i64 %val +} + +; CHECK: test_ldu_f32 +define float @test_ldu_f32(ptr addrspace(1) %ptr) { + ; CHECK: ldu.global.f32 + %val = tail call float @llvm.nvvm.ldu.global.f.f32.p1(ptr addrspace(1) %ptr, i32 4) + ret float %val +} + +; CHECK: test_ldu_f64 +define double @test_ldu_f64(ptr addrspace(1) %ptr) { + ; CHECK: ldu.global.f64 + %val = tail call double @llvm.nvvm.ldu.global.f.f64.p1(ptr addrspace(1) %ptr, i32 8) + ret double %val +} + +; CHECK: test_ldu_f16 +define half @test_ldu_f16(ptr addrspace(1) %ptr) { + ; CHECK: ldu.global.b16 + %val = tail call half @llvm.nvvm.ldu.global.f.f16.p1(ptr addrspace(1) %ptr, i32 2) + ret half %val +} + +; CHECK: test_ldu_v2f16 +define <2 x half> @test_ldu_v2f16(ptr addrspace(1) %ptr) { + ; CHECK: ldu.global.b32 + %val = tail call <2 x half> @llvm.nvvm.ldu.global.f.v2f16.p1(ptr addrspace(1) %ptr, i32 4) + ret <2 x half> %val +} + ; CHECK: test_ldg_i8 define i8 @test_ldg_i8(ptr addrspace(1) %ptr) { -; ld.global.nc.u8 + ; CHECK: ld.global.nc.u8 %val = tail call i8 @llvm.nvvm.ldg.global.i.i8.p1(ptr addrspace(1) %ptr, i32 4) ret i8 %val } ; CHECK: test_ldg_i16 define i16 @test_ldg_i16(ptr addrspace(1) %ptr) { -; ld.global.nc.u16 - %val = tail call i16 @llvm.nvvm.ldg.global.i.i16.p1(ptr addrspace(1) %ptr, i32 4) + ; CHECK: ld.global.nc.u16 + %val = tail call i16 @llvm.nvvm.ldg.global.i.i16.p1(ptr addrspace(1) %ptr, i32 2) ret i16 %val } ; CHECK: test_ldg_i32 define i32 @test_ldg_i32(ptr addrspace(1) %ptr) { -; ld.global.nc.u32 + ; CHECK: ld.global.nc.u32 %val = tail call i32 @llvm.nvvm.ldg.global.i.i32.p1(ptr addrspace(1) %ptr, i32 4) ret i32 %val } ; CHECK: test_ldg_i64 define i64 @test_ldg_i64(ptr addrspace(1) %ptr) { -; ld.global.nc.u64 + ; CHECK: ld.global.nc.u64 %val = tail call i64 @llvm.nvvm.ldg.global.i.i64.p1(ptr addrspace(1) %ptr, i32 8) ret i64 %val } ; CHECK: test_ldg_f32 define float @test_ldg_f32(ptr addrspace(1) %ptr) { -; ld.global.nc.u64 + ; CHECK: ld.global.nc.f32 %val = tail call float @llvm.nvvm.ldg.global.f.f32.p1(ptr addrspace(1) %ptr, i32 4) ret float %val } ; CHECK: test_ldg_f64 define double @test_ldg_f64(ptr addrspace(1) %ptr) { -; ld.global.nc.u64 + ; CHECK: ld.global.nc.f64 %val = tail call double @llvm.nvvm.ldg.global.f.f64.p1(ptr addrspace(1) %ptr, i32 8) ret double %val } ; CHECK: test_ldg_f16 define half @test_ldg_f16(ptr addrspace(1) %ptr) { -; ld.global.nc.b16 - %val = tail call half @llvm.nvvm.ldg.global.f.f16.p1(ptr addrspace(1) %ptr, i32 4) + ; CHECK: ld.global.nc.b16 + %val = tail call half @llvm.nvvm.ldg.global.f.f16.p1(ptr addrspace(1) %ptr, i32 2) ret half %val } ; CHECK: test_ldg_v2f16 define <2 x half> @test_ldg_v2f16(ptr addrspace(1) %ptr) { -; ld.global.nc.b32 + ; CHECK: ld.global.nc.b32 %val = tail call <2 x half> @llvm.nvvm.ldg.global.f.v2f16.p1(ptr addrspace(1) %ptr, i32 4) ret <2 x half> %val }