Index: clang/include/clang/Basic/Attr.td =================================================================== --- clang/include/clang/Basic/Attr.td +++ clang/include/clang/Basic/Attr.td @@ -1774,6 +1774,13 @@ let SimpleHandler = 1; } +def Shuffle : InheritableAttr { + let Spellings = [Clang<"shuffle">]; + let Subjects = SubjectList<[Function]>; + let Documentation = [ShuffleDocs]; + let SimpleHandler = 1; +} + def NoInline : DeclOrStmtAttr { let Spellings = [GCC<"noinline">, CXX11<"clang", "noinline">, C2x<"clang", "noinline">, Declspec<"noinline">]; Index: clang/include/clang/Basic/AttrDocs.td =================================================================== --- clang/include/clang/Basic/AttrDocs.td +++ clang/include/clang/Basic/AttrDocs.td @@ -1316,6 +1316,30 @@ }]; } +def ShuffleDocs : Documentation { + let Category = DocCatFunction; + let Content = [{ +The ``shuffle`` attribute can be placed on a function declaration. It indicates +that the call instructions of a function with this attribute can take undef +arguments and is still valid. + +In languages HIP or CUDA, there are APIs like +T __shfl_sync(unsigned mask,T var, int srcLane, int width=warpSize); +etc which permit exchanging of a variable between threads within a warp without +use of shared memory. These APIs allow variable var to be uninitialised in the program. +Noundef analysis on such APIs can lead to ambiguous kernel execution. +So shuffle attribute on a function is used to skip adding noundef attribute to such APIs. + +Sample usage: +.. code-block:: c + + void shufflefunc(void) __attribute__((shuffle)); + // Setting it as a C++11 attribute is also valid in a C++ program. + // void shufflefunc(void) [[clang::shuffle]]; + + }]; +} + def NoSplitStackDocs : Documentation { let Category = DocCatFunction; let Content = [{ Index: clang/lib/CodeGen/CGCall.cpp =================================================================== --- clang/lib/CodeGen/CGCall.cpp +++ clang/lib/CodeGen/CGCall.cpp @@ -2035,6 +2035,19 @@ return false; } +static bool DetermineNoUndefForShuffle(const Decl *TargetDecl) { + if (!TargetDecl) + return true; + + // Function has shuffle attribute. + // Skip adding noundef in this case. + if (TargetDecl->hasAttr()) { + return false; + } + + return true; +} + /// Construct the IR attribute list of a function or call. /// /// When adding an attribute, please consider where it should be handled: @@ -2101,6 +2114,8 @@ FuncAttrs.addAttribute(llvm::Attribute::NoDuplicate); if (TargetDecl->hasAttr()) FuncAttrs.addAttribute(llvm::Attribute::Convergent); + if (TargetDecl->hasAttr()) + FuncAttrs.addAttribute(llvm::Attribute::Shuffle); if (const FunctionDecl *Fn = dyn_cast(TargetDecl)) { AddAttributesFromFunctionProtoType( @@ -2298,8 +2313,10 @@ // Determine if the return type could be partially undef if (CodeGenOpts.EnableNoundefAttrs && HasStrictReturn) { if (!RetTy->isVoidType() && RetAI.getKind() != ABIArgInfo::Indirect && - DetermineNoUndef(RetTy, getTypes(), DL, RetAI)) + DetermineNoUndef(RetTy, getTypes(), DL, RetAI) && + DetermineNoUndefForShuffle(TargetDecl)) { RetAttrs.addAttribute(llvm::Attribute::NoUndef); + } } switch (RetAI.getKind()) { @@ -2431,7 +2448,8 @@ // Decide whether the argument we're handling could be partially undef if (CodeGenOpts.EnableNoundefAttrs && - DetermineNoUndef(ParamType, getTypes(), DL, AI)) { + DetermineNoUndef(ParamType, getTypes(), DL, AI) && + DetermineNoUndefForShuffle(TargetDecl)) { Attrs.addAttribute(llvm::Attribute::NoUndef); } Index: clang/lib/Headers/__clang_cuda_intrinsics.h =================================================================== --- clang/lib/Headers/__clang_cuda_intrinsics.h +++ clang/lib/Headers/__clang_cuda_intrinsics.h @@ -45,7 +45,7 @@ _Static_assert(sizeof(__val) == sizeof(__Bits)); \ _Static_assert(sizeof(__Bits) == 2 * sizeof(int)); \ __Bits __tmp; \ - memcpy(&__tmp, &__val, sizeof(__val)); \ + memcpy(&__tmp, &__val, sizeof(__val)); \ __tmp.__a = ::__FnName(__tmp.__a, __offset, __width); \ __tmp.__b = ::__FnName(__tmp.__b, __offset, __width); \ long long __ret; \ @@ -100,29 +100,32 @@ #if CUDA_VERSION >= 9000 #if (!defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 300) +#define __shuffle __attribute__((shuffle)) // __shfl_sync_* variants available in CUDA-9 #pragma push_macro("__MAKE_SYNC_SHUFFLES") #define __MAKE_SYNC_SHUFFLES(__FnName, __IntIntrinsic, __FloatIntrinsic, \ __Mask, __Type) \ - inline __device__ int __FnName(unsigned int __mask, int __val, \ - __Type __offset, int __width = warpSize) { \ + inline __device__ __shuffle int __FnName(unsigned int __mask, int __val, \ + __Type __offset, \ + int __width = warpSize) { \ return __IntIntrinsic(__mask, __val, __offset, \ ((warpSize - __width) << 8) | (__Mask)); \ } \ - inline __device__ float __FnName(unsigned int __mask, float __val, \ - __Type __offset, int __width = warpSize) { \ + inline __device__ __shuffle float __FnName(unsigned int __mask, float __val, \ + __Type __offset, \ + int __width = warpSize) { \ return __FloatIntrinsic(__mask, __val, __offset, \ ((warpSize - __width) << 8) | (__Mask)); \ } \ - inline __device__ unsigned int __FnName(unsigned int __mask, \ - unsigned int __val, __Type __offset, \ - int __width = warpSize) { \ + inline __device__ __shuffle unsigned int __FnName( \ + unsigned int __mask, unsigned int __val, __Type __offset, \ + int __width = warpSize) { \ return static_cast( \ ::__FnName(__mask, static_cast(__val), __offset, __width)); \ } \ - inline __device__ long long __FnName(unsigned int __mask, long long __val, \ - __Type __offset, \ - int __width = warpSize) { \ + inline __device__ __shuffle long long __FnName( \ + unsigned int __mask, long long __val, __Type __offset, \ + int __width = warpSize) { \ struct __Bits { \ int __a, __b; \ }; \ @@ -136,14 +139,15 @@ memcpy(&__ret, &__tmp, sizeof(__tmp)); \ return __ret; \ } \ - inline __device__ unsigned long long __FnName( \ + inline __device__ __shuffle unsigned long long __FnName( \ unsigned int __mask, unsigned long long __val, __Type __offset, \ int __width = warpSize) { \ return static_cast(::__FnName( \ __mask, static_cast(__val), __offset, __width)); \ } \ - inline __device__ long __FnName(unsigned int __mask, long __val, \ - __Type __offset, int __width = warpSize) { \ + inline __device__ __shuffle long __FnName(unsigned int __mask, long __val, \ + __Type __offset, \ + int __width = warpSize) { \ _Static_assert(sizeof(long) == sizeof(long long) || \ sizeof(long) == sizeof(int)); \ if (sizeof(long) == sizeof(long long)) { \ @@ -154,14 +158,15 @@ ::__FnName(__mask, static_cast(__val), __offset, __width)); \ } \ } \ - inline __device__ unsigned long __FnName( \ + inline __device__ __shuffle unsigned long __FnName( \ unsigned int __mask, unsigned long __val, __Type __offset, \ int __width = warpSize) { \ return static_cast( \ ::__FnName(__mask, static_cast(__val), __offset, __width)); \ } \ - inline __device__ double __FnName(unsigned int __mask, double __val, \ - __Type __offset, int __width = warpSize) { \ + inline __device__ __shuffle double __FnName(unsigned int __mask, \ + double __val, __Type __offset, \ + int __width = warpSize) { \ long long __tmp; \ _Static_assert(sizeof(__tmp) == sizeof(__val)); \ memcpy(&__tmp, &__val, sizeof(__val)); \ Index: clang/lib/Sema/SemaDeclAttr.cpp =================================================================== --- clang/lib/Sema/SemaDeclAttr.cpp +++ clang/lib/Sema/SemaDeclAttr.cpp @@ -8423,6 +8423,9 @@ case ParsedAttr::AT_AMDGPUNumVGPR: handleAMDGPUNumVGPRAttr(S, D, AL); break; + case ParsedAttr::AT_Shuffle: + handleSimpleAttribute(S, D, AL); + break; case ParsedAttr::AT_AVRSignal: handleAVRSignalAttr(S, D, AL); break; Index: clang/test/CodeGenHIP/shuffle-attr-verify.hip =================================================================== --- /dev/null +++ clang/test/CodeGenHIP/shuffle-attr-verify.hip @@ -0,0 +1,33 @@ +// RUN: %clang_cc1 -no-opaque-pointers -triple amdgcn-amd-amdhsa -target-cpu gfx906 -x hip -fcuda-is-device -emit-llvm %s \ +// RUN: -o - | FileCheck %s + +#define __global__ __attribute__((global)) +#define __device__ __attribute__((device)) +#define __shuffle __attribute__((shuffle)) +#define HYPRE_WARP_SIZE 64 + +static constexpr int warpSize = __AMDGCN_WAVEFRONT_SIZE; + +__device__ static inline unsigned int __lane_id() { + return __builtin_amdgcn_mbcnt_hi( + -1, __builtin_amdgcn_mbcnt_lo(-1, 0)); +} + +// CHECK: define linkonce_odr i32 @_Z11__shfl_synciii(i32 %var, i32 %src_lane, i32 %width) #[[attr1:[0-9]+]] +__device__ +inline +int __shuffle __shfl_sync(int var, int src_lane, int width = warpSize) { + int self = __lane_id(); + int index = src_lane + (self & ~(width-1)); + return __builtin_amdgcn_ds_bpermute(index<<2, var); +} + +__global__ void +shufflekernel() +{ + int t; + int res, res1; + res = __shfl_sync(HYPRE_WARP_SIZE, t, 0); +} + +// CHECK-DAG: attributes #[[attr1]] = { {{[^}]*}}shuffle{{[^}]*}} } \ No newline at end of file Index: clang/test/CodeGenHIP/shuffle-noundef-attr.hip =================================================================== --- /dev/null +++ clang/test/CodeGenHIP/shuffle-noundef-attr.hip @@ -0,0 +1,48 @@ +// RUN: %clang_cc1 -no-opaque-pointers -triple amdgcn-amd-amdhsa -target-cpu gfx906 -x hip -fcuda-is-device -emit-llvm %s \ +// RUN: -o - | FileCheck %s + +#define __global__ __attribute__((global)) +#define __device__ __attribute__((device)) +#define __shuffle __attribute__((shuffle)) +#define HYPRE_WARP_SIZE 64 + +static constexpr int warpSize = __AMDGCN_WAVEFRONT_SIZE; + +__device__ static inline unsigned int __lane_id() { + return __builtin_amdgcn_mbcnt_hi( + -1, __builtin_amdgcn_mbcnt_lo(-1, 0)); +} + +__device__ +inline +int __shfl(int var, int src_lane, int width = warpSize) { + int self = __lane_id(); + int index = src_lane + (self & ~(width-1)); + return __builtin_amdgcn_ds_bpermute(index<<2, var); +} + +template +static __device__ +T __shuffle __shfl_sync(unsigned mask, T val, int src_line, int width=HYPRE_WARP_SIZE) +{ + return __shfl(val, src_line, width); +} + +template +static __device__ +T __shfl_sync_1(unsigned mask, T val, int src_line, int width=HYPRE_WARP_SIZE) +{ + return __shfl(val, src_line, width); +} + +// CHECK-LABEL: @_Z13shufflekernelv( +// CHECK: call i32 @_ZL11__shfl_syncIiET_jS0_ii(i32 64, i32 %0, i32 0, i32 64) +// CHECK: call noundef i32 @_ZL13__shfl_sync_1IiET_jS0_ii(i32 noundef 64, i32 noundef %1, i32 noundef 0, i32 noundef 64) +__global__ void +shufflekernel() +{ + int t; + int res, res1; + res = __shfl_sync(HYPRE_WARP_SIZE, t, 0); + res1 = __shfl_sync_1(HYPRE_WARP_SIZE, t, 0); +} \ No newline at end of file Index: clang/test/Misc/pragma-attribute-supported-attributes-list.test =================================================================== --- clang/test/Misc/pragma-attribute-supported-attributes-list.test +++ clang/test/Misc/pragma-attribute-supported-attributes-list.test @@ -161,6 +161,7 @@ // CHECK-NEXT: ScopedLockable (SubjectMatchRule_record) // CHECK-NEXT: Section (SubjectMatchRule_function, SubjectMatchRule_variable_is_global, SubjectMatchRule_objc_method, SubjectMatchRule_objc_property) // CHECK-NEXT: SetTypestate (SubjectMatchRule_function_is_member) +// CHECK-NEXT: Shuffle (SubjectMatchRule_function) // CHECK-NEXT: SpeculativeLoadHardening (SubjectMatchRule_function, SubjectMatchRule_objc_method) // CHECK-NEXT: StandaloneDebug (SubjectMatchRule_record) // CHECK-NEXT: SwiftAsync (SubjectMatchRule_function, SubjectMatchRule_objc_method) Index: llvm/include/llvm/Bitcode/LLVMBitCodes.h =================================================================== --- llvm/include/llvm/Bitcode/LLVMBitCodes.h +++ llvm/include/llvm/Bitcode/LLVMBitCodes.h @@ -684,6 +684,7 @@ ATTR_KIND_NO_SANITIZE_BOUNDS = 79, ATTR_KIND_ALLOC_ALIGN = 80, ATTR_KIND_ALLOCATED_POINTER = 81, + ATTR_KIND_SHUFFLE = 82, }; enum ComdatSelectionKindCodes { Index: llvm/include/llvm/IR/Attributes.td =================================================================== --- llvm/include/llvm/IR/Attributes.td +++ llvm/include/llvm/IR/Attributes.td @@ -300,6 +300,9 @@ /// Function is required to make Forward Progress. def MustProgress : EnumAttr<"mustprogress", [FnAttr]>; +/// Function is a __shfl_sync like API. +def Shuffle : EnumAttr<"shuffle", [FnAttr]>; + /// Target-independent string attributes. def LessPreciseFPMAD : StrBoolAttr<"less-precise-fpmad">; def NoInfsFPMath : StrBoolAttr<"no-infs-fp-math">; Index: llvm/include/llvm/IR/Function.h =================================================================== --- llvm/include/llvm/IR/Function.h +++ llvm/include/llvm/IR/Function.h @@ -626,6 +626,10 @@ return AttributeSets.getUWTableKind(); } + /// Determine if the function is __shfl_sync like. + bool isShuffle() const { return hasFnAttribute(Attribute::Shuffle); } + void setShuffle() { addFnAttr(Attribute::Shuffle); } + /// True if the ABI mandates (or the user requested) that this /// function be in a unwind table. bool hasUWTable() const { Index: llvm/lib/Bitcode/Reader/BitcodeReader.cpp =================================================================== --- llvm/lib/Bitcode/Reader/BitcodeReader.cpp +++ llvm/lib/Bitcode/Reader/BitcodeReader.cpp @@ -1632,6 +1632,8 @@ return Attribute::MustProgress; case bitc::ATTR_KIND_HOT: return Attribute::Hot; + case bitc::ATTR_KIND_SHUFFLE: + return Attribute::Shuffle; } } Index: llvm/lib/Bitcode/Writer/BitcodeWriter.cpp =================================================================== --- llvm/lib/Bitcode/Writer/BitcodeWriter.cpp +++ llvm/lib/Bitcode/Writer/BitcodeWriter.cpp @@ -778,6 +778,8 @@ case Attribute::EmptyKey: case Attribute::TombstoneKey: llvm_unreachable("Trying to encode EmptyKey/TombstoneKey"); + case Attribute::Shuffle: + return bitc::ATTR_KIND_SHUFFLE; } llvm_unreachable("Trying to encode unknown attribute"); Index: llvm/lib/IR/Attributes.cpp =================================================================== --- llvm/lib/IR/Attributes.cpp +++ llvm/lib/IR/Attributes.cpp @@ -1804,7 +1804,8 @@ .addAttribute(Attribute::StructRet) .addAttribute(Attribute::ByRef) .addAttribute(Attribute::ElementType) - .addAttribute(Attribute::AllocatedPointer); + .addAttribute(Attribute::AllocatedPointer) + .addAttribute(Attribute::Shuffle); } // Attributes that only apply to pointers or vectors of pointers. Index: llvm/lib/Transforms/Utils/CodeExtractor.cpp =================================================================== --- llvm/lib/Transforms/Utils/CodeExtractor.cpp +++ llvm/lib/Transforms/Utils/CodeExtractor.cpp @@ -960,6 +960,7 @@ case Attribute::NoCfCheck: case Attribute::MustProgress: case Attribute::NoProfile: + case Attribute::Shuffle: break; // These attributes cannot be applied to functions. case Attribute::Alignment: