Index: clang/lib/CodeGen/CGCall.cpp =================================================================== --- clang/lib/CodeGen/CGCall.cpp +++ clang/lib/CodeGen/CGCall.cpp @@ -2303,8 +2303,13 @@ getLangOpts().Sanitize.has(SanitizerKind::Memory) || getLangOpts().Sanitize.has(SanitizerKind::Return); + // Enable noundef attribute based on codegen options and + // skip adding the attribute to HIP device functions. + bool EnableNoundefAttrs = CodeGenOpts.EnableNoundefAttrs && + !(getLangOpts().HIP && getLangOpts().CUDAIsDevice); + // Determine if the return type could be partially undef - if (CodeGenOpts.EnableNoundefAttrs && HasStrictReturn) { + if (EnableNoundefAttrs && HasStrictReturn) { if (!RetTy->isVoidType() && RetAI.getKind() != ABIArgInfo::Indirect && DetermineNoUndef(RetTy, getTypes(), DL, RetAI)) RetAttrs.addAttribute(llvm::Attribute::NoUndef); @@ -2438,8 +2443,7 @@ } // Decide whether the argument we're handling could be partially undef - if (CodeGenOpts.EnableNoundefAttrs && - DetermineNoUndef(ParamType, getTypes(), DL, AI)) { + if (EnableNoundefAttrs && DetermineNoUndef(ParamType, getTypes(), DL, AI)) { Attrs.addAttribute(llvm::Attribute::NoUndef); } Index: clang/test/CodeGenCUDA/builtins-amdgcn.cu =================================================================== --- clang/test/CodeGenCUDA/builtins-amdgcn.cu +++ clang/test/CodeGenCUDA/builtins-amdgcn.cu @@ -195,7 +195,7 @@ // CHECK-NEXT: [[TMP4:%.*]] = call contract float @llvm.amdgcn.ds.fmin.f32(float addrspace(3)* [[TMP2]], float [[TMP3]], i32 0, i32 0, i1 false) // CHECK-NEXT: store volatile float [[TMP4]], float* [[X_ASCAST]], align 4 // CHECK-NEXT: [[TMP5:%.*]] = load float*, float** [[SHARED_ADDR_ASCAST]], align 8 -// CHECK-NEXT: call void @_Z4funcPf(float* noundef [[TMP5]]) #[[ATTR8:[0-9]+]] +// CHECK-NEXT: call void @_Z4funcPf(float* [[TMP5]]) #[[ATTR8:[0-9]+]] // CHECK-NEXT: ret void // __global__ void test_ds_fmin_func(float src, float *__restrict shared) { Index: clang/test/CodeGenCUDA/lambda.cu 
=================================================================== --- clang/test/CodeGenCUDA/lambda.cu +++ clang/test/CodeGenCUDA/lambda.cu @@ -51,8 +51,8 @@ // DEV-LABEL: define{{.*}} amdgpu_kernel void @_Z1gIZ12test_resolvevEUlvE_EvT_ // DEV: call void @_ZZ12test_resolvevENKUlvE_clEv // DEV-LABEL: define internal void @_ZZ12test_resolvevENKUlvE_clEv -// DEV: call noundef i32 @_Z10overloadedIiET_v -// DEV-LABEL: define linkonce_odr noundef i32 @_Z10overloadedIiET_v +// DEV: call i32 @_Z10overloadedIiET_v +// DEV-LABEL: define linkonce_odr i32 @_Z10overloadedIiET_v // DEV: ret i32 1 __device__ int a; Index: clang/test/CodeGenCUDA/unnamed-types.cu =================================================================== --- clang/test/CodeGenCUDA/unnamed-types.cu +++ clang/test/CodeGenCUDA/unnamed-types.cu @@ -19,16 +19,16 @@ } // DEVICE: amdgpu_kernel void @_Z2k0IZZ2f1PfENKUlS0_E_clES0_EUlfE_EvS0_T_( -// DEVICE: define internal noundef float @_ZZZ2f1PfENKUlS_E_clES_ENKUlfE_clEf( +// DEVICE: define internal float @_ZZZ2f1PfENKUlS_E_clES_ENKUlfE_clEf( template <typename F> __global__ void k0(float *p, F f) { p[0] = f(p[0]) + d0(p[1]) + d1(p[2]); } // DEVICE: amdgpu_kernel void @_Z2k1IZ2f1PfEUlfE_Z2f1S0_EUlffE_Z2f1S0_EUlfE0_EvS0_T_T0_T1_( -// DEVICE: define internal noundef float @_ZZ2f1PfENKUlfE_clEf( -// DEVICE: define internal noundef float @_ZZ2f1PfENKUlffE_clEff( -// DEVICE: define internal noundef float @_ZZ2f1PfENKUlfE0_clEf( +// DEVICE: define internal float @_ZZ2f1PfENKUlfE_clEf( +// DEVICE: define internal float @_ZZ2f1PfENKUlffE_clEff( +// DEVICE: define internal float @_ZZ2f1PfENKUlfE0_clEf( template <typename F0, typename F1, typename F2> __global__ void k1(float *p, F0 f0, F1 f1, F2 f2) { p[0] = f0(p[0]) + f1(p[1], p[2]) + f2(p[3]); Index: clang/test/CodeGenHIP/hipspv-addr-spaces.cpp =================================================================== --- clang/test/CodeGenHIP/hipspv-addr-spaces.cpp +++ clang/test/CodeGenHIP/hipspv-addr-spaces.cpp @@ -25,30 +25,30 @@ // Check literals are placed in address
space 1 (CrossWorkGroup/__global). // CHECK: @.str ={{.*}} unnamed_addr addrspace(1) constant -// CHECK: define{{.*}} spir_func noundef i32 addrspace(4)* @_Z3barPi(i32 addrspace(4)* +// CHECK: define{{.*}} spir_func i32 addrspace(4)* @_Z3barPi(i32 addrspace(4)* __device__ int* bar(int *x) { return x; } -// CHECK: define{{.*}} spir_func noundef i32 addrspace(4)* @_Z5baz_dv() +// CHECK: define{{.*}} spir_func i32 addrspace(4)* @_Z5baz_dv() __device__ int* baz_d() { // CHECK: ret i32 addrspace(4)* addrspacecast (i32 addrspace(1)* @d to i32 addrspace(4)* return &d; } -// CHECK: define{{.*}} spir_func noundef i32 addrspace(4)* @_Z5baz_cv() +// CHECK: define{{.*}} spir_func i32 addrspace(4)* @_Z5baz_cv() __device__ int* baz_c() { // CHECK: ret i32 addrspace(4)* addrspacecast (i32 addrspace(1)* @c to i32 addrspace(4)* return &c; } -// CHECK: define{{.*}} spir_func noundef i32 addrspace(4)* @_Z5baz_sv() +// CHECK: define{{.*}} spir_func i32 addrspace(4)* @_Z5baz_sv() __device__ int* baz_s() { // CHECK: ret i32 addrspace(4)* addrspacecast (i32 addrspace(3)* @s to i32 addrspace(4)* return &s; } -// CHECK: define{{.*}} spir_func noundef i8 addrspace(4)* @_Z3quzv() +// CHECK: define{{.*}} spir_func i8 addrspace(4)* @_Z3quzv() __device__ const char* quz() { return "abc"; } Index: clang/test/CodeGenHIP/noundef-attribute-hip-device-verify.hip =================================================================== --- /dev/null +++ clang/test/CodeGenHIP/noundef-attribute-hip-device-verify.hip @@ -0,0 +1,38 @@ +// RUN: %clang_cc1 -no-opaque-pointers -triple amdgcn-amd-amdhsa -target-cpu gfx906 -x hip -fcuda-is-device -emit-llvm %s \ +// RUN: -o - | FileCheck %s + +#define __global__ __attribute__((global)) +#define __device__ __attribute__((device)) +#define WARP_SIZE 64 + +static constexpr int warpSize = __AMDGCN_WAVEFRONT_SIZE; + +__device__ static inline unsigned int __lane_id() { + return __builtin_amdgcn_mbcnt_hi( + -1, __builtin_amdgcn_mbcnt_lo(-1, 0)); +} + +__device__ +inline 
+int __shfl(int var, int src_lane, int width = warpSize) { int self = __lane_id(); int index = src_lane + (self & ~(width-1)); return __builtin_amdgcn_ds_bpermute(index<<2, var); } + +template <class T> +static __device__ +T __shfl_sync(unsigned mask, T val, int src_line, int width=WARP_SIZE) +{ + return __shfl(val, src_line, width); +} + +// CHECK-LABEL: @_Z13shufflekernelv( +// CHECK: call i32 @_ZL11__shfl_syncIiET_jS0_ii(i32 64, i32 %0, i32 0, i32 64) + +__global__ void +shufflekernel() +{ + int res, t; + res = __shfl_sync(WARP_SIZE, t, 0); +} Index: llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa-memcpy.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa-memcpy.ll +++ llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa-memcpy.ll @@ -18,19 +18,19 @@ ; CHECK-LABEL: @test ; CHECK: store i8 3, i8 addrspace(3)* %0, align 4, !alias.scope !0, !noalias !3 -; CHECK: tail call void @llvm.memcpy.p3i8.p3i8.i64(i8 addrspace(3)* noundef align 1 dereferenceable(3) %2, i8 addrspace(3)* noundef align 1 dereferenceable(3) %1, i64 3, i1 false), !alias.scope !6, !noalias !7 +; CHECK: tail call void @llvm.memcpy.p3i8.p3i8.i64(i8 addrspace(3)* align 1 dereferenceable(3) %2, i8 addrspace(3)* align 1 dereferenceable(3) %1, i64 3, i1 false), !alias.scope !6, !noalias !7 ; CHECK: %4 = load i8, i8 addrspace(3)* %3, align 4, !alias.scope !8, !noalias !9 -; CHECK: tail call void @llvm.memcpy.p3i8.p3i8.i64(i8 addrspace(3)* noundef align 1 dereferenceable(3) %7, i8 addrspace(3)* noundef align 1 dereferenceable(3) %6, i64 3, i1 false), !alias.scope !6, !noalias !7 +; CHECK: tail call void @llvm.memcpy.p3i8.p3i8.i64(i8 addrspace(3)* align 1 dereferenceable(3) %7, i8 addrspace(3)* align 1 dereferenceable(3) %6, i64 3, i1 false), !alias.scope !6, !noalias !7 ; CHECK: %9 = load i8, i8 addrspace(3)* %8, align 4, !alias.scope !8, !noalias !9 define protected amdgpu_kernel void @test(i8 addrspace(1)* nocapture %ptr.coerce) local_unnamed_addr #0
{ entry: store i8 3, i8 addrspace(3)* getelementptr inbounds (%vec_type, %vec_type addrspace(3)* @_f1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0), align 1 - tail call void @llvm.memcpy.p3i8.p3i8.i64(i8 addrspace(3)* noundef align 1 dereferenceable(3) getelementptr inbounds (%vec_type, %vec_type addrspace(3)* @_f2, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0), i8 addrspace(3)* noundef align 1 dereferenceable(3) getelementptr inbounds (%vec_type, %vec_type addrspace(3)* @_f1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0), i64 3, i1 false) + tail call void @llvm.memcpy.p3i8.p3i8.i64(i8 addrspace(3)* align 1 dereferenceable(3) getelementptr inbounds (%vec_type, %vec_type addrspace(3)* @_f2, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0), i8 addrspace(3)* align 1 dereferenceable(3) getelementptr inbounds (%vec_type, %vec_type addrspace(3)* @_f1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0), i64 3, i1 false) %0 = load i8, i8 addrspace(3)* getelementptr inbounds (%vec_type, %vec_type addrspace(3)* @_f2, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0), align 1 %cmp.i.i = icmp eq i8 %0, 3 store i8 2, i8 addrspace(3)* getelementptr inbounds (%vec_type, %vec_type addrspace(3)* @_f1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0), align 1 - tail call void @llvm.memcpy.p3i8.p3i8.i64(i8 addrspace(3)* noundef align 1 dereferenceable(3) getelementptr inbounds (%vec_type, %vec_type addrspace(3)* @_f2, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0), i8 addrspace(3)* noundef align 1 dereferenceable(3) getelementptr inbounds (%vec_type, %vec_type addrspace(3)* @_f1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0), i64 3, i1 false) + tail call void @llvm.memcpy.p3i8.p3i8.i64(i8 addrspace(3)* align 1 dereferenceable(3) getelementptr inbounds (%vec_type, %vec_type addrspace(3)* @_f2, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0), i8 addrspace(3)* align 1 dereferenceable(3) getelementptr inbounds (%vec_type, %vec_type addrspace(3)* @_f1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0), i64 3, i1 false) %1 = load i8, i8 addrspace(3)* 
getelementptr inbounds (%vec_type, %vec_type addrspace(3)* @_f2, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0), align 1 %cmp.i.i19 = icmp eq i8 %1, 2 %2 = and i1 %cmp.i.i19, %cmp.i.i