Index: lib/Target/AMDGPU/AMDGPU.td =================================================================== --- lib/Target/AMDGPU/AMDGPU.td +++ lib/Target/AMDGPU/AMDGPU.td @@ -175,6 +175,18 @@ "Enable floating point exceptions" >; +class FeatureMaxPrivateElementSize<int size> : SubtargetFeature< + "max-private-element-size-"#size, + "MaxPrivateElementSize", + !cast<string>(size), + "Maximum private access size may be "#size +>; + +def FeatureMaxPrivateElementSize4 : FeatureMaxPrivateElementSize<4>; +def FeatureMaxPrivateElementSize8 : FeatureMaxPrivateElementSize<8>; +def FeatureMaxPrivateElementSize16 : FeatureMaxPrivateElementSize<16>; + + def FeatureEnableHugeScratchBuffer : SubtargetFeature< "huge-scratch-buffer", "EnableHugeScratchBuffer", Index: lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp =================================================================== --- lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp +++ lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp @@ -593,6 +593,20 @@ } } +// Map a byte size to its amd_element_byte_size_t value (presumably log2(Size) - 1; confirm against the enum). +static amd_element_byte_size_t getElementByteSizeValue(unsigned Size) { + switch (Size) { + case 4: + return AMD_ELEMENT_4_BYTES; + case 8: + return AMD_ELEMENT_8_BYTES; + case 16: + return AMD_ELEMENT_16_BYTES; + default: + llvm_unreachable("invalid private_element_size"); + } +} + void AMDGPUAsmPrinter::EmitAmdKernelCodeT(const MachineFunction &MF, const SIProgramInfo &KernelInfo) const { const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); @@ -606,6 +620,11 @@ (KernelInfo.ComputePGMRSrc2 << 32); header.code_properties = AMD_CODE_PROPERTY_IS_PTR64; + + AMD_HSA_BITS_SET(header.code_properties, + AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE, + getElementByteSizeValue(STM.getMaxPrivateElementSize())); + if (MFI->hasPrivateSegmentBuffer()) { header.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER; Index: lib/Target/AMDGPU/AMDGPUSubtarget.h =================================================================== --- lib/Target/AMDGPU/AMDGPUSubtarget.h +++ 
lib/Target/AMDGPU/AMDGPUSubtarget.h @@ -81,6 +81,7 @@ unsigned WavefrontSize; bool CFALUBug; int LocalMemorySize; + unsigned MaxPrivateElementSize; bool EnableVGPRSpilling; bool SGPRInitBug; bool IsGCN; @@ -253,6 +254,10 @@ return LocalMemorySize; } + unsigned getMaxPrivateElementSize() const { + return MaxPrivateElementSize; + } + bool hasSGPRInitBug() const { return SGPRInitBug; } Index: lib/Target/AMDGPU/AMDGPUSubtarget.cpp =================================================================== --- lib/Target/AMDGPU/AMDGPUSubtarget.cpp +++ lib/Target/AMDGPU/AMDGPUSubtarget.cpp @@ -58,6 +58,11 @@ FP32Denormals = false; FP64Denormals = false; } + + // Set defaults if needed. + if (MaxPrivateElementSize == 0) + MaxPrivateElementSize = 16; + return *this; } @@ -74,7 +79,7 @@ EnableUnsafeDSOffsetFolding(false), EnableXNACK(false), WavefrontSize(0), CFALUBug(false), - LocalMemorySize(0), + LocalMemorySize(0), MaxPrivateElementSize(0), EnableVGPRSpilling(false), SGPRInitBug(false), IsGCN(false), GCN1Encoding(false), GCN3Encoding(false), CIInsts(false), LDSBankCount(0), IsaVersion(ISAVersion0_0_0), EnableHugeScratchBuffer(false), Index: lib/Target/AMDGPU/AMDKernelCodeT.h =================================================================== --- lib/Target/AMDGPU/AMDKernelCodeT.h +++ lib/Target/AMDGPU/AMDKernelCodeT.h @@ -44,6 +44,15 @@ AMD_CODE_VERSION_MINOR = 1 }; +// Sets val bits for specified mask in specified dst packed instance. +#define AMD_HSA_BITS_SET(dst, mask, val) \ + dst &= (~(1 << mask ## _SHIFT) & ~mask); \ + dst |= (((val) << mask ## _SHIFT) & mask) + +// Gets bits for specified mask from specified src packed instance. +#define AMD_HSA_BITS_GET(src, mask) \ + ((src & mask) >> mask ## _SHIFT) \ + /// The values used to define the number of bytes to use for the /// swizzle element size. 
enum amd_element_byte_size_t { Index: lib/Target/AMDGPU/SIInstrInfo.h =================================================================== --- lib/Target/AMDGPU/SIInstrInfo.h +++ lib/Target/AMDGPU/SIInstrInfo.h @@ -482,7 +482,7 @@ const uint64_t RSRC_DATA_FORMAT = 0xf00000000000LL; const uint64_t RSRC_TID_ENABLE = 1LL << 55; - + const uint64_t RSRC_ELEMENT_SIZE_SHIFT = 51; } // End namespace AMDGPU namespace SI { Index: lib/Target/AMDGPU/SIInstrInfo.cpp =================================================================== --- lib/Target/AMDGPU/SIInstrInfo.cpp +++ lib/Target/AMDGPU/SIInstrInfo.cpp @@ -3011,6 +3011,10 @@ AMDGPU::RSRC_TID_ENABLE | 0xffffffff; // Size; + uint64_t EltSizeValue = Log2_32(ST.getMaxPrivateElementSize()) - 1; + + Rsrc23 |= (EltSizeValue << AMDGPU::RSRC_ELEMENT_SIZE_SHIFT); + // If TID_ENABLE is set, DATA_FORMAT specifies stride bits [14:17]. // Clear them unless we want a huge stride. if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) Index: test/CodeGen/AMDGPU/large-alloca-compute.ll =================================================================== --- test/CodeGen/AMDGPU/large-alloca-compute.ll +++ test/CodeGen/AMDGPU/large-alloca-compute.ll @@ -10,8 +10,8 @@ ; GCN: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 ; GCN: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 ; GCN: s_mov_b32 s10, -1 -; CI: s_mov_b32 s11, 0x80f000 -; VI: s_mov_b32 s11, 0x800000 +; CI: s_mov_b32 s11, 0x98f000 +; VI: s_mov_b32 s11, 0x980000 ; GCNHSA: .amd_kernel_code_t Index: test/CodeGen/AMDGPU/large-alloca-graphics.ll =================================================================== --- test/CodeGen/AMDGPU/large-alloca-graphics.ll +++ test/CodeGen/AMDGPU/large-alloca-graphics.ll @@ -5,8 +5,8 @@ ; GCN: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 ; GCN: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 ; GCN: s_mov_b32 s10, -1 -; CI: s_mov_b32 s11, 0x80f000 -; VI: s_mov_b32 s11, 0x800000 +; CI: s_mov_b32 s11, 0x98f000 +; VI: s_mov_b32 s11, 0x980000 ; GCN: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, s[8:11], 
s1 offen ; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, s[8:11], s1 offen @@ -26,8 +26,8 @@ ; GCN: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 ; GCN: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 ; GCN: s_mov_b32 s10, -1 -; CI: s_mov_b32 s11, 0x80f000 -; VI: s_mov_b32 s11, 0x800000 +; CI: s_mov_b32 s11, 0x98f000 +; VI: s_mov_b32 s11, 0x980000 ; GCN: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, s[8:11], s1 offen ; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, s[8:11], s1 offen Index: test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot-compute.ll =================================================================== --- test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot-compute.ll +++ test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot-compute.ll @@ -21,8 +21,8 @@ ; GCN: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GCN-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GCN-NEXT: s_mov_b32 s14, -1 -; SI-NEXT: s_mov_b32 s15, 0x80f000 -; VI-NEXT: s_mov_b32 s15, 0x800000 +; SI-NEXT: s_mov_b32 s15, 0x98f000 +; VI-NEXT: s_mov_b32 s15, 0x980000 ; GCN: buffer_store_dword {{v[0-9]+}}, s[12:15], s16 offset:{{[0-9]+}} ; 4-byte Folded Spill Index: test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll =================================================================== --- test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll +++ test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll @@ -14,8 +14,8 @@ ; GCN: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GCN-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GCN-NEXT: s_mov_b32 s14, -1 -; SI-NEXT: s_mov_b32 s15, 0x80f000 -; VI-NEXT: s_mov_b32 s15, 0x800000 +; SI-NEXT: s_mov_b32 s15, 0x98f000 +; VI-NEXT: s_mov_b32 s15, 0x980000 ; s12 is offset user SGPR ; GCN: buffer_store_dword {{v[0-9]+}}, s[12:15], s11 offset:{{[0-9]+}} ; 4-byte Folded Spill