Index: clang/include/clang/Basic/BuiltinsAMDGPU.def =================================================================== --- clang/include/clang/Basic/BuiltinsAMDGPU.def +++ clang/include/clang/Basic/BuiltinsAMDGPU.def @@ -9,6 +9,11 @@ // This file defines the AMDGPU-specific builtin function database. Users of // this file must define the BUILTIN macro to make use of this information. // +// Note: (unsigned) long int type should be avoided in builtin definitions +// since it has different size on Linux (64 bit) and Windows (32 bit). +// (unsigned) long long int type should also be avoided, which is 64 bit for +// C/C++/HIP but is 128 bit for OpenCL. Use `W` as width modifier in builtin +// definitions since it is fixed for 64 bit. //===----------------------------------------------------------------------===// // The format of this database matches clang/Basic/Builtins.def. @@ -44,14 +49,14 @@ BUILTIN(__builtin_amdgcn_mbcnt_hi, "UiUiUi", "nc") BUILTIN(__builtin_amdgcn_mbcnt_lo, "UiUiUi", "nc") -TARGET_BUILTIN(__builtin_amdgcn_s_memtime, "LUi", "n", "s-memtime-inst") +TARGET_BUILTIN(__builtin_amdgcn_s_memtime, "WUi", "n", "s-memtime-inst") //===----------------------------------------------------------------------===// // Instruction builtins. //===----------------------------------------------------------------------===// BUILTIN(__builtin_amdgcn_s_getreg, "UiIi", "n") BUILTIN(__builtin_amdgcn_s_setreg, "vIiUi", "n") -BUILTIN(__builtin_amdgcn_s_getpc, "LUi", "n") +BUILTIN(__builtin_amdgcn_s_getpc, "WUi", "n") BUILTIN(__builtin_amdgcn_s_waitcnt, "vIi", "n") BUILTIN(__builtin_amdgcn_s_sendmsg, "vIiUi", "n") BUILTIN(__builtin_amdgcn_s_sendmsghalt, "vIiUi", "n") @@ -111,12 +116,12 @@ BUILTIN(__builtin_amdgcn_s_sleep, "vIi", "n") BUILTIN(__builtin_amdgcn_s_incperflevel, "vIi", "n") BUILTIN(__builtin_amdgcn_s_decperflevel, "vIi", "n") -BUILTIN(__builtin_amdgcn_uicmp, "LUiUiUiIi", "nc") -BUILTIN(__builtin_amdgcn_uicmpl, "LUiLUiLUiIi", "nc") -BUILTIN(__builtin_amdgcn_sicmp, "LUiiiIi", "nc") -BUILTIN(__builtin_amdgcn_sicmpl, "LUiLiLiIi", "nc") -BUILTIN(__builtin_amdgcn_fcmp, "LUiddIi", "nc") -BUILTIN(__builtin_amdgcn_fcmpf, "LUiffIi", "nc") +BUILTIN(__builtin_amdgcn_uicmp, "WUiUiUiIi", "nc") +BUILTIN(__builtin_amdgcn_uicmpl, "WUiWUiWUiIi", "nc") +BUILTIN(__builtin_amdgcn_sicmp, "WUiiiIi", "nc") +BUILTIN(__builtin_amdgcn_sicmpl, "WUiWiWiIi", "nc") +BUILTIN(__builtin_amdgcn_fcmp, "WUiddIi", "nc") +BUILTIN(__builtin_amdgcn_fcmpf, "WUiffIi", "nc") BUILTIN(__builtin_amdgcn_ds_swizzle, "iiIi", "nc") BUILTIN(__builtin_amdgcn_ds_permute, "iii", "nc") BUILTIN(__builtin_amdgcn_ds_bpermute, "iii", "nc") @@ -142,9 +147,9 @@ BUILTIN(__builtin_amdgcn_msad_u8, "UiUiUiUi", "nc") BUILTIN(__builtin_amdgcn_sad_hi_u8, "UiUiUiUi", "nc") BUILTIN(__builtin_amdgcn_sad_u16, "UiUiUiUi", "nc") -BUILTIN(__builtin_amdgcn_qsad_pk_u16_u8, "LUiLUiUiLUi", "nc") -BUILTIN(__builtin_amdgcn_mqsad_pk_u16_u8, "LUiLUiUiLUi", "nc") -BUILTIN(__builtin_amdgcn_mqsad_u32_u8, "V4UiLUiUiV4Ui", "nc") +BUILTIN(__builtin_amdgcn_qsad_pk_u16_u8, "WUiWUiUiWUi", "nc") +BUILTIN(__builtin_amdgcn_mqsad_pk_u16_u8, "WUiWUiUiWUi", "nc") +BUILTIN(__builtin_amdgcn_mqsad_u32_u8, "V4UiWUiUiV4Ui", "nc") //===----------------------------------------------------------------------===// // CI+ only builtins. @@ -179,7 +184,7 @@ TARGET_BUILTIN(__builtin_amdgcn_frexp_exph, "sh", "nc", "16-bit-insts") TARGET_BUILTIN(__builtin_amdgcn_fracth, "hh", "nc", "16-bit-insts") TARGET_BUILTIN(__builtin_amdgcn_classh, "bhi", "nc", "16-bit-insts") -TARGET_BUILTIN(__builtin_amdgcn_s_memrealtime, "LUi", "n", "s-memrealtime") +TARGET_BUILTIN(__builtin_amdgcn_s_memrealtime, "WUi", "n", "s-memrealtime") TARGET_BUILTIN(__builtin_amdgcn_mov_dpp, "iiIiIiIiIb", "nc", "dpp") TARGET_BUILTIN(__builtin_amdgcn_update_dpp, "iiiIiIiIiIb", "nc", "dpp") TARGET_BUILTIN(__builtin_amdgcn_s_dcache_wb, "v", "n", "gfx8-insts") @@ -213,7 +218,7 @@ //===----------------------------------------------------------------------===// // Special builtins. //===----------------------------------------------------------------------===// -BUILTIN(__builtin_amdgcn_read_exec, "LUi", "nc") +BUILTIN(__builtin_amdgcn_read_exec, "WUi", "nc") BUILTIN(__builtin_amdgcn_read_exec_lo, "Ui", "nc") BUILTIN(__builtin_amdgcn_read_exec_hi, "Ui", "nc") Index: clang/test/CodeGenCUDA/builtins-amdgcn.cu =================================================================== --- clang/test/CodeGenCUDA/builtins-amdgcn.cu +++ clang/test/CodeGenCUDA/builtins-amdgcn.cu @@ -1,4 +1,11 @@ -// RUN: %clang_cc1 -triple amdgcn -fcuda-is-device -emit-llvm %s -o - | FileCheck %s +// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -target-cpu gfx906 \ +// RUN: -aux-triple x86_64-unknown-linux-gnu -fcuda-is-device -emit-llvm %s \ +// RUN: -o - | FileCheck %s + +// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -target-cpu gfx906 \ +// RUN: -aux-triple x86_64-pc-windows-msvc -fcuda-is-device -emit-llvm %s \ +// RUN: -o - | FileCheck %s + #include "Inputs/cuda.h" // CHECK-LABEL: @_Z16use_dispatch_ptrPi( @@ -22,3 +29,32 @@ __global__ void endpgm() { __builtin_amdgcn_endpgm(); } + +// Check the 64 bit argument is correctly passed to the intrinsic without truncation or assertion. + +// CHECK-LABEL: @_Z14test_uicmp_i64 +// CHECK: store i64* %out, i64** %out.addr.ascast +// CHECK-NEXT: store i64 %a, i64* %a.addr.ascast +// CHECK-NEXT: store i64 %b, i64* %b.addr.ascast +// CHECK-NEXT: %[[V0:.*]] = load i64, i64* %a.addr.ascast +// CHECK-NEXT: %[[V1:.*]] = load i64, i64* %b.addr.ascast +// CHECK-NEXT: %[[V2:.*]] = call i64 @llvm.amdgcn.icmp.i64.i64(i64 %0, i64 %1, i32 35) +// CHECK-NEXT: %[[V3:.*]] = load i64*, i64** %out.addr.ascast +// CHECK-NEXT: store i64 %[[V2]], i64* %[[V3]] +// CHECK-NEXT: ret void +__global__ void test_uicmp_i64(unsigned long long *out, unsigned long long a, unsigned long long b) +{ + *out = __builtin_amdgcn_uicmpl(a, b, 30+5); +} + +// Check the 64 bit return value is correctly returned without truncation or assertion. + +// CHECK-LABEL: @_Z14test_s_memtime +// CHECK: %[[V1:.*]] = call i64 @llvm.amdgcn.s.memtime() +// CHECK-NEXT: %[[PTR:.*]] = load i64*, i64** %out.addr.ascast +// CHECK-NEXT: store i64 %[[V1]], i64* %[[PTR]] +// CHECK-NEXT: ret void +__global__ void test_s_memtime(unsigned long long* out) +{ + *out = __builtin_amdgcn_s_memtime(); +}