diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -365,53 +365,53 @@
 /// a separate piece of memory that is unique from other
 /// memory locations.
 namespace AMDGPUAS {
-  enum : unsigned {
-    // The maximum value for flat, generic, local, private, constant and region.
-    MAX_AMDGPU_ADDRESS = 7,
-
-    FLAT_ADDRESS = 0,     ///< Address space for flat memory.
-    GLOBAL_ADDRESS = 1,   ///< Address space for global memory (RAT0, VTX0).
-    REGION_ADDRESS = 2,   ///< Address space for region memory. (GDS)
-
-    CONSTANT_ADDRESS = 4, ///< Address space for constant memory (VTX2).
-    LOCAL_ADDRESS = 3,    ///< Address space for local memory.
-    PRIVATE_ADDRESS = 5,  ///< Address space for private memory.
-
-    CONSTANT_ADDRESS_32BIT = 6, ///< Address space for 32-bit constant memory.
-
-    BUFFER_FAT_POINTER = 7, ///< Address space for 160-bit buffer fat pointers.
-
-    /// Address space for direct addressable parameter memory (CONST0).
-    PARAM_D_ADDRESS = 6,
-    /// Address space for indirect addressable parameter memory (VTX1).
-    PARAM_I_ADDRESS = 7,
-
-    // Do not re-order the CONSTANT_BUFFER_* enums. Several places depend on
-    // this order to be able to dynamically index a constant buffer, for
-    // example:
-    //
-    // ConstantBufferAS = CONSTANT_BUFFER_0 + CBIdx
-
-    CONSTANT_BUFFER_0 = 8,
-    CONSTANT_BUFFER_1 = 9,
-    CONSTANT_BUFFER_2 = 10,
-    CONSTANT_BUFFER_3 = 11,
-    CONSTANT_BUFFER_4 = 12,
-    CONSTANT_BUFFER_5 = 13,
-    CONSTANT_BUFFER_6 = 14,
-    CONSTANT_BUFFER_7 = 15,
-    CONSTANT_BUFFER_8 = 16,
-    CONSTANT_BUFFER_9 = 17,
-    CONSTANT_BUFFER_10 = 18,
-    CONSTANT_BUFFER_11 = 19,
-    CONSTANT_BUFFER_12 = 20,
-    CONSTANT_BUFFER_13 = 21,
-    CONSTANT_BUFFER_14 = 22,
-    CONSTANT_BUFFER_15 = 23,
-
-    // Some places use this if the address space can't be determined.
-    UNKNOWN_ADDRESS_SPACE = ~0u,
-  };
+enum : unsigned {
+  // The maximum value for flat, generic, local, private, constant and region.
+  MAX_AMDGPU_ADDRESS = 7,
+
+  FLAT_ADDRESS = 0,     ///< Address space for flat memory.
+  GLOBAL_ADDRESS = 1,   ///< Address space for global memory (RAT0, VTX0).
+  REGION_ADDRESS = 2,   ///< Address space for region memory. (GDS)
+
+  CONSTANT_ADDRESS = 4, ///< Address space for constant memory (VTX2).
+  LOCAL_ADDRESS = 3,    ///< Address space for local memory.
+  PRIVATE_ADDRESS = 5,  ///< Address space for private memory.
+
+  CONSTANT_ADDRESS_32BIT = 6, ///< Address space for 32-bit constant memory.
+
+  BUFFER_FAT_POINTER = 7, ///< Address space for 128-bit buffer fat pointers.
+
+  /// Address space for direct addressable parameter memory (CONST0).
+  PARAM_D_ADDRESS = 6,
+  /// Address space for indirect addressable parameter memory (VTX1).
+  PARAM_I_ADDRESS = 7,
+
+  // Do not re-order the CONSTANT_BUFFER_* enums. Several places depend on
+  // this order to be able to dynamically index a constant buffer, for
+  // example:
+  //
+  // ConstantBufferAS = CONSTANT_BUFFER_0 + CBIdx
+
+  CONSTANT_BUFFER_0 = 8,
+  CONSTANT_BUFFER_1 = 9,
+  CONSTANT_BUFFER_2 = 10,
+  CONSTANT_BUFFER_3 = 11,
+  CONSTANT_BUFFER_4 = 12,
+  CONSTANT_BUFFER_5 = 13,
+  CONSTANT_BUFFER_6 = 14,
+  CONSTANT_BUFFER_7 = 15,
+  CONSTANT_BUFFER_8 = 16,
+  CONSTANT_BUFFER_9 = 17,
+  CONSTANT_BUFFER_10 = 18,
+  CONSTANT_BUFFER_11 = 19,
+  CONSTANT_BUFFER_12 = 20,
+  CONSTANT_BUFFER_13 = 21,
+  CONSTANT_BUFFER_14 = 22,
+  CONSTANT_BUFFER_15 = 23,
+
+  // Some places use this if the address space can't be determined.
+  UNKNOWN_ADDRESS_SPACE = ~0u,
+};
 }
 
 namespace AMDGPU {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
@@ -19,6 +19,7 @@
   int Constant = 4;
   int Private = 5;
   int Constant32Bit = 6;
+  int Buffer = 7;
 }
 
 def AddrSpaces : AddressSpacesImpl;
@@ -444,9 +445,12 @@
 def LoadAddress_region : AddressSpaceList<[ AddrSpaces.Region ]>;
 def StoreAddress_region : AddressSpaceList<[ AddrSpaces.Region ]>;
+def LoadAddress_buffer : AddressSpaceList<[ AddrSpaces.Buffer ]>;
+def StoreAddress_buffer : AddressSpaceList<[ AddrSpaces.Buffer ]>;
 
-foreach as = [ "global", "flat", "constant", "local", "private", "region" ] in {
+
+foreach as = [ "global", "flat", "constant", "local", "private", "region", "buffer" ] in {
 let AddressSpaces = !cast<AddressSpaceList>("LoadAddress_"#as).AddrSpaces in {
 
 def load_#as : PatFrag<(ops node:$ptr), (unindexedload node:$ptr)> {
   let IsLoad = 1;
   let IsNonExtLoad = 1;
@@ -501,7 +505,7 @@
 } // End foreach as
 
-foreach as = [ "global", "flat", "local", "private", "region" ] in {
+foreach as = [ "global", "flat", "local", "private", "region", "buffer" ] in {
 let IsStore = 1, AddressSpaces = !cast<AddressSpaceList>("StoreAddress_"#as).AddrSpaces in {
 def store_#as : PatFrag<(ops node:$val, node:$ptr),
                         (unindexedstore node:$val, node:$ptr)> {
@@ -618,7 +622,8 @@
 }
 
 multiclass binary_atomic_op_all_as<SDNode atomic_op> {
-  foreach as = [ "global", "flat", "constant", "local", "private", "region" ] in {
+  foreach as = [ "global", "flat", "constant", "local", "private",
+                 "region", "buffer" ] in {
     let AddressSpaces = !cast<AddressSpaceList>("LoadAddress_"#as).AddrSpaces in {
       defm "_"#as : binary_atomic_op<atomic_op>;
       defm "_"#as : noret_binary_atomic_op<atomic_op>;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -22,6 +22,7 @@
 #include "llvm/ADT/ScopeExit.h"
 #include "llvm/BinaryFormat/ELF.h"
 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
+#include "llvm/CodeGen/GlobalISel/LegalizerInfo.h"
 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
 #include "llvm/IR/DiagnosticInfo.h"
@@ -247,6 +248,7 @@
   case AMDGPUAS::LOCAL_ADDRESS:
     return ST.useDS128() ? 128 : 64;
   case AMDGPUAS::GLOBAL_ADDRESS:
+  case AMDGPUAS::BUFFER_FAT_POINTER:
   case AMDGPUAS::CONSTANT_ADDRESS:
   case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
     // Treat constant and global as identical. SMRD loads are sometimes usable for
@@ -487,6 +489,7 @@
   const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
   const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
   const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
+  const LLT BufferDesc = GetAddrSpacePtr(AMDGPUAS::BUFFER_FAT_POINTER);
 
   const LLT CodePtr = FlatPtr;
 
@@ -498,6 +501,8 @@
       LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr };
 
+  const std::initializer_list<LLT> AddrSpaces128 = {BufferDesc};
+
   const std::initializer_list<LLT> FPTypesBase = {
     S32, S64
   };
 
@@ -518,17 +523,18 @@
   // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
   // elements for v3s16
   getActionDefinitionsBuilder(G_PHI)
-    .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256})
-    .legalFor(AllS32Vectors)
-    .legalFor(AllS64Vectors)
-    .legalFor(AddrSpaces64)
-    .legalFor(AddrSpaces32)
-    .legalIf(isPointer(0))
-    .clampScalar(0, S16, S256)
-    .widenScalarToNextPow2(0, 32)
-    .clampMaxNumElements(0, S32, 16)
-    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
-    .scalarize(0);
+      .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256})
+      .legalFor(AllS32Vectors)
+      .legalFor(AllS64Vectors)
+      .legalFor(AddrSpaces64)
+      .legalFor(AddrSpaces32)
+      .legalFor(AddrSpaces128)
+      .legalIf(isPointer(0))
+      .clampScalar(0, S16, S256)
+      .widenScalarToNextPow2(0, 32)
+      .clampMaxNumElements(0, S32, 16)
+      .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
+      .scalarize(0);
 
   if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) {
     // Full set of gfx9 features.
@@ -909,9 +915,12 @@
   }
 
   getActionDefinitionsBuilder(G_PTR_ADD)
-    .legalIf(all(isPointer(0), sameSize(0, 1)))
-    .scalarize(0)
-    .scalarSameSizeAs(1, 0);
+      .legalIf(all(isPointer(0), sameSize(0, 1)))
+      .legalIf(all(isPointer(0, AMDGPUAS::BUFFER_FAT_POINTER), sizeIs(1, 32)))
+      .scalarize(0)
+      .minScalarIf(isPointer(0, AMDGPUAS::BUFFER_FAT_POINTER), 1, S32)
+      .maxScalarIf(isPointer(0, AMDGPUAS::BUFFER_FAT_POINTER), 1, S32)
+      .scalarSameSizeAs(1, 0);
 
   getActionDefinitionsBuilder(G_PTRMASK)
     .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32})))
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -667,7 +667,8 @@
   let IsLoad = 1;
 }
 
-foreach as = [ "global", "flat", "constant", "local", "private", "region" ] in {
+foreach as = [ "global", "flat", "constant", "local", "private",
+               "region", "buffer" ] in {
 let AddressSpaces = !cast<AddressSpaceList>("LoadAddress_"#as).AddrSpaces in {
 
 def load_d16_hi_#as : LoadD16Frag <AMDGPUload_d16_hi>;
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ptr-add.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ptr-add.mir
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ptr-add.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ptr-add.mir
@@ -412,3 +412,117 @@
     %2:_(<2 x p3>) = G_PTR_ADD %0, %1
     $vgpr0_vgpr1 = COPY %2
 ...
+
+---
+name: test_gep_buffer_i32_idx
+body: |
+  bb.0:
+    liveins: $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4
+
+    ; CHECK-LABEL: name: test_gep_buffer_i32_idx
+    ; CHECK: liveins: $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p7) = COPY $vgpr0_vgpr1_vgpr2_vgpr3
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr4
+    ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p7) = G_PTR_ADD [[COPY]], [[COPY1]](s32)
+    ; CHECK-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[PTR_ADD]](p7)
+    %0:_(p7) = COPY $vgpr0_vgpr1_vgpr2_vgpr3
+    %1:_(s32) = COPY $vgpr4
+    %2:_(p7) = G_PTR_ADD %0, %1
+
+    $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %2
+...
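+
+# The two vector tests below exercise the scalarize(0) rule added for p7:
+# a G_PTR_ADD on <2 x p7> is split into scalar G_PTR_ADDs and recombined with
+# G_BUILD_VECTOR; in the v2i64 variant each 64-bit offset is additionally
+# narrowed to s32 by the maxScalarIf rule.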
+
+---
+name: test_gep_buffer_v2p7_v2i32
+body: |
+  bb.0:
+    liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, $vgpr8_vgpr9
+
+    ; CHECK-LABEL: name: test_gep_buffer_v2p7_v2i32
+    ; CHECK: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, $vgpr8_vgpr9
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<2 x p7>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s32>) = COPY $vgpr8_vgpr9
+    ; CHECK-NEXT: [[UV:%[0-9]+]]:_(p7), [[UV1:%[0-9]+]]:_(p7) = G_UNMERGE_VALUES [[COPY]](<2 x p7>)
+    ; CHECK-NEXT: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](<2 x s32>)
+    ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p7) = G_PTR_ADD [[UV]], [[UV2]](s32)
+    ; CHECK-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p7) = G_PTR_ADD [[UV1]], [[UV3]](s32)
+    ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x p7>) = G_BUILD_VECTOR [[PTR_ADD]](p7), [[PTR_ADD1]](p7)
+    ; CHECK-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = COPY [[BUILD_VECTOR]](<2 x p7>)
+    %0:_(<2 x p7>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
+    %1:_(<2 x s32>) = COPY $vgpr8_vgpr9
+    %2:_(<2 x p7>) = G_PTR_ADD %0, %1
+
+    $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = COPY %2
+...
+
+---
+name: test_gep_buffer_v2p7_v2i64
+body: |
+  bb.0:
+    liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, $vgpr8_vgpr9_vgpr10_vgpr11
+
+    ; CHECK-LABEL: name: test_gep_buffer_v2p7_v2i64
+    ; CHECK: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, $vgpr8_vgpr9_vgpr10_vgpr11
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<2 x p7>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s64>) = COPY $vgpr8_vgpr9_vgpr10_vgpr11
+    ; CHECK-NEXT: [[UV:%[0-9]+]]:_(p7), [[UV1:%[0-9]+]]:_(p7) = G_UNMERGE_VALUES [[COPY]](<2 x p7>)
+    ; CHECK-NEXT: [[UV2:%[0-9]+]]:_(s64), [[UV3:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[COPY1]](<2 x s64>)
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[UV2]](s64)
+    ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p7) = G_PTR_ADD [[UV]], [[TRUNC]](s32)
+    ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC [[UV3]](s64)
+    ; CHECK-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p7) = G_PTR_ADD [[UV1]], [[TRUNC1]](s32)
+    ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x p7>) = G_BUILD_VECTOR [[PTR_ADD]](p7), [[PTR_ADD1]](p7)
+    ; CHECK-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = COPY [[BUILD_VECTOR]](<2 x p7>)
+    %0:_(<2 x p7>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
+    %1:_(<2 x s64>) = COPY $vgpr8_vgpr9_vgpr10_vgpr11
+    %2:_(<2 x p7>) = G_PTR_ADD %0, %1
+
+    $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = COPY %2
+...
+
+---
+name: test_gep_buffer_i64_index
+body: |
+  bb.0:
+    liveins: $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4_vgpr5
+
+    ; CHECK-LABEL: name: test_gep_buffer_i64_index
+    ; CHECK: liveins: $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4_vgpr5
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p7) = COPY $vgpr0_vgpr1_vgpr2_vgpr3
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $vgpr4_vgpr5
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY1]](s64)
+    ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p7) = G_PTR_ADD [[COPY]], [[TRUNC]](s32)
+    ; CHECK-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[PTR_ADD]](p7)
+    %0:_(p7) = COPY $vgpr0_vgpr1_vgpr2_vgpr3
+    %1:_(s64) = COPY $vgpr4_vgpr5
+    %2:_(p7) = G_PTR_ADD %0, %1
+
+    $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %2
+...
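+
+# The final test exercises the widening direction: a sub-32-bit offset is
+# widened to s32 by the minScalarIf rule, and the widening is a sign
+# extension (folded to G_SEXT_INREG) since G_PTR_ADD offsets are signed.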
+
+---
+name: test_gep_buffer_i16_index
+body: |
+  bb.0:
+    liveins: $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4
+
+    ; CHECK-LABEL: name: test_gep_buffer_i16_index
+    ; CHECK: liveins: $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p7) = COPY $vgpr0_vgpr1_vgpr2_vgpr3
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr4
+    ; CHECK-NEXT: [[SEXT_INREG:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY1]], 16
+    ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p7) = G_PTR_ADD [[COPY]], [[SEXT_INREG]](s32)
+    ; CHECK-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[PTR_ADD]](p7)
+    %0:_(p7) = COPY $vgpr0_vgpr1_vgpr2_vgpr3
+    %1:_(s32) = COPY $vgpr4
+    %2:_(s16) = G_TRUNC %1
+    %3:_(p7) = G_PTR_ADD %0, %2
+
+    $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %3
+...
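
Illustrative note, not part of the patch: the MIR tests above feed G_PTR_ADD
directly, but the same (p7, s32) form is what GlobalISel produces for a GEP on
a buffer pointer. A minimal sketch, assuming addrspace(7) pointers are declared
as 128-bit in the target data layout; the function and value names here are
hypothetical:

define float @buffer_load_elt(float addrspace(7)* %buf, i32 %idx) {
  ; The IRTranslator lowers this GEP to a G_PTR_ADD with a p7 base, which the
  ; new legalIf rule accepts when the offset is s32; a 64-bit index would
  ; instead be narrowed to s32 first, as in test_gep_buffer_i64_index.
  %ptr = getelementptr float, float addrspace(7)* %buf, i32 %idx
  %val = load float, float addrspace(7)* %ptr, align 4
  ret float %val
}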