Index: llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
+++ llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
@@ -134,6 +134,10 @@
       GISelChangeObserver &Observer,
       const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr) const;
 
+  bool legalizeSBufferLoad(
+    MachineInstr &MI, MachineIRBuilder &B,
+    GISelChangeObserver &Observer) const;
+
   bool legalizeAtomicIncDec(MachineInstr &MI, MachineIRBuilder &B,
                             bool IsInc) const;
 
Index: llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -42,6 +42,20 @@
 using namespace LegalityPredicates;
 using namespace MIPatternMatch;
 
+// Round the number of vector elements up to the next power of two.
+static LLT getPow2VectorType(LLT Ty) {
+  unsigned NElts = Ty.getNumElements();
+  unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
+  return Ty.changeNumElements(Pow2NElts);
+}
+
+// Return a scalar with Ty's size in bits rounded up to the next power of two.
+static LLT getPow2ScalarType(LLT Ty) {
+  unsigned Bits = Ty.getSizeInBits();
+  unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
+  return LLT::scalar(Pow2Bits);
+}
+
 static LegalityPredicate isMultiple32(unsigned TypeIdx,
                                       unsigned MaxSize = 1024) {
   return [=](const LegalityQuery &Query) {
@@ -2899,6 +2913,33 @@
   return true;
 }
 
+bool AMDGPULegalizerInfo::legalizeSBufferLoad(
+  MachineInstr &MI, MachineIRBuilder &B,
+  GISelChangeObserver &Observer) const {
+  Register Dst = MI.getOperand(0).getReg();
+  LLT Ty = B.getMRI()->getType(Dst);
+  unsigned Size = Ty.getSizeInBits();
+
+  // There are no 96-bit result scalar loads, but widening to 128-bit should
+  // always be legal. We may need to restore this to a 96-bit result if it turns
+  // out this needs to be converted to a vector load during RegBankSelect.
+  if (isPowerOf2_32(Size))
+    return true; // Result size is already a power of two; MI is unchanged.
+
+  LegalizerHelper Helper(B.getMF(), *this, Observer, B);
+  B.setInstr(MI);
+
+  Observer.changingInstr(MI);
+  // Vectors get a power-of-2 element count; scalars a power-of-2 bit width.
+  if (Ty.isVector())
+    Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
+  else
+    Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
+
+  Observer.changedInstr(MI);
+  return true;
+}
+
 bool AMDGPULegalizerInfo::legalizeIntrinsic(MachineInstr &MI,
                                             MachineIRBuilder &B,
                                             GISelChangeObserver &Observer) const {
@@ -3015,6 +3056,8 @@
     MI.eraseFromParent();
     return true;
   }
+  case Intrinsic::amdgcn_s_buffer_load:
+    return legalizeSBufferLoad(MI, B, Observer);
   case Intrinsic::amdgcn_raw_buffer_store:
   case Intrinsic::amdgcn_struct_buffer_store:
     return legalizeBufferStore(MI, MRI, B, false, false);
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.s.buffer.load.mir
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.s.buffer.load.mir
@@ -0,0 +1,136 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -run-pass=legalizer %s -o - | FileCheck -check-prefix=GCN %s
+# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=hawaii -run-pass=legalizer %s -o - | FileCheck -check-prefix=GCN %s
+# <3 x s32>: widened to a <4 x s32> load; G_EXTRACT recovers the result.
+---
+name: s_buffer_load_v3s32
+body: |
+  bb.0:
+    liveins: $sgpr0_sgpr1_sgpr2_sgpr3
+
+    ; GCN-LABEL: name: s_buffer_load_v3s32
+    ; GCN: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+    ; GCN: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; GCN: [[INT:%[0-9]+]]:_(<4 x s32>) = G_INTRINSIC intrinsic(@llvm.amdgcn.s.buffer.load), [[COPY]](<4 x s32>), [[C]](s32), 0
+    ; GCN: [[EXTRACT:%[0-9]+]]:_(<3 x s32>) = G_EXTRACT [[INT]](<4 x s32>), 0
+    ; GCN: S_ENDPGM 0, implicit [[EXTRACT]](<3 x s32>)
+    %0:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+    %1:_(s32) = G_CONSTANT i32 0
+    %2:_(<3 x s32>) = G_INTRINSIC intrinsic(@llvm.amdgcn.s.buffer.load), %0, %1, 0
+    S_ENDPGM 0, implicit %2
+
+...
+# <3 x p3>: pointer vectors are padded the same way, to <4 x p3>.
+---
+name: s_buffer_load_v3p3
+body: |
+  bb.0:
+    liveins: $sgpr0_sgpr1_sgpr2_sgpr3
+
+    ; GCN-LABEL: name: s_buffer_load_v3p3
+    ; GCN: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+    ; GCN: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; GCN: [[INT:%[0-9]+]]:_(<4 x p3>) = G_INTRINSIC intrinsic(@llvm.amdgcn.s.buffer.load), [[COPY]](<4 x s32>), [[C]](s32), 0
+    ; GCN: [[EXTRACT:%[0-9]+]]:_(<3 x p3>) = G_EXTRACT [[INT]](<4 x p3>), 0
+    ; GCN: S_ENDPGM 0, implicit [[EXTRACT]](<3 x p3>)
+    %0:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+    %1:_(s32) = G_CONSTANT i32 0
+    %2:_(<3 x p3>) = G_INTRINSIC intrinsic(@llvm.amdgcn.s.buffer.load), %0, %1, 0
+    S_ENDPGM 0, implicit %2
+
+...
+# <6 x s16> (96 bits): padded to <8 x s16>.
+---
+name: s_buffer_load_v6s16
+body: |
+  bb.0:
+    liveins: $sgpr0_sgpr1_sgpr2_sgpr3
+
+    ; GCN-LABEL: name: s_buffer_load_v6s16
+    ; GCN: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+    ; GCN: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; GCN: [[INT:%[0-9]+]]:_(<8 x s16>) = G_INTRINSIC intrinsic(@llvm.amdgcn.s.buffer.load), [[COPY]](<4 x s32>), [[C]](s32), 0
+    ; GCN: [[EXTRACT:%[0-9]+]]:_(<6 x s16>) = G_EXTRACT [[INT]](<8 x s16>), 0
+    ; GCN: S_ENDPGM 0, implicit [[EXTRACT]](<6 x s16>)
+    %0:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+    %1:_(s32) = G_CONSTANT i32 0
+    %2:_(<6 x s16>) = G_INTRINSIC intrinsic(@llvm.amdgcn.s.buffer.load), %0, %1, 0
+    S_ENDPGM 0, implicit %2
+
+...
+# <6 x s32> (192 bits): padded to <8 x s32>.
+---
+name: s_buffer_load_v6s32
+body: |
+  bb.0:
+    liveins: $sgpr0_sgpr1_sgpr2_sgpr3
+
+    ; GCN-LABEL: name: s_buffer_load_v6s32
+    ; GCN: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+    ; GCN: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; GCN: [[INT:%[0-9]+]]:_(<8 x s32>) = G_INTRINSIC intrinsic(@llvm.amdgcn.s.buffer.load), [[COPY]](<4 x s32>), [[C]](s32), 0
+    ; GCN: [[EXTRACT:%[0-9]+]]:_(<6 x s32>) = G_EXTRACT [[INT]](<8 x s32>), 0
+    ; GCN: S_ENDPGM 0, implicit [[EXTRACT]](<6 x s32>)
+    %0:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+    %1:_(s32) = G_CONSTANT i32 0
+    %2:_(<6 x s32>) = G_INTRINSIC intrinsic(@llvm.amdgcn.s.buffer.load), %0, %1, 0
+    S_ENDPGM 0, implicit %2
+
+...
+# <3 x s64> (192 bits): padded to <4 x s64>.
+---
+name: s_buffer_load_v3s64
+body: |
+  bb.0:
+    liveins: $sgpr0_sgpr1_sgpr2_sgpr3
+
+    ; GCN-LABEL: name: s_buffer_load_v3s64
+    ; GCN: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+    ; GCN: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; GCN: [[INT:%[0-9]+]]:_(<4 x s64>) = G_INTRINSIC intrinsic(@llvm.amdgcn.s.buffer.load), [[COPY]](<4 x s32>), [[C]](s32), 0
+    ; GCN: [[EXTRACT:%[0-9]+]]:_(<3 x s64>) = G_EXTRACT [[INT]](<4 x s64>), 0
+    ; GCN: S_ENDPGM 0, implicit [[EXTRACT]](<3 x s64>)
+    %0:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+    %1:_(s32) = G_CONSTANT i32 0
+    %2:_(<3 x s64>) = G_INTRINSIC intrinsic(@llvm.amdgcn.s.buffer.load), %0, %1, 0
+    S_ENDPGM 0, implicit %2
+
+...
+# <12 x s8> (96 bits): padded to <16 x s8>.
+---
+name: s_buffer_load_v12s8
+body: |
+  bb.0:
+    liveins: $sgpr0_sgpr1_sgpr2_sgpr3
+
+    ; GCN-LABEL: name: s_buffer_load_v12s8
+    ; GCN: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+    ; GCN: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; GCN: [[INT:%[0-9]+]]:_(<16 x s8>) = G_INTRINSIC intrinsic(@llvm.amdgcn.s.buffer.load), [[COPY]](<4 x s32>), [[C]](s32), 0
+    ; GCN: [[EXTRACT:%[0-9]+]]:_(<12 x s8>) = G_EXTRACT [[INT]](<16 x s8>), 0
+    ; GCN: S_ENDPGM 0, implicit [[EXTRACT]](<12 x s8>)
+    %0:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+    %1:_(s32) = G_CONSTANT i32 0
+    %2:_(<12 x s8>) = G_INTRINSIC intrinsic(@llvm.amdgcn.s.buffer.load), %0, %1, 0
+    S_ENDPGM 0, implicit %2
+
+...
+# Scalar s96: widened to an s128 load; G_TRUNC recovers the s96 result.
+---
+name: s_buffer_load_s96
+body: |
+  bb.0:
+    liveins: $sgpr0_sgpr1_sgpr2_sgpr3
+
+    ; GCN-LABEL: name: s_buffer_load_s96
+    ; GCN: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+    ; GCN: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; GCN: [[INT:%[0-9]+]]:_(s128) = G_INTRINSIC intrinsic(@llvm.amdgcn.s.buffer.load), [[COPY]](<4 x s32>), [[C]](s32), 0
+    ; GCN: [[TRUNC:%[0-9]+]]:_(s96) = G_TRUNC [[INT]](s128)
+    ; GCN: S_ENDPGM 0, implicit [[TRUNC]](s96)
+    %0:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+    %1:_(s32) = G_CONSTANT i32 0
+    %2:_(s96) = G_INTRINSIC intrinsic(@llvm.amdgcn.s.buffer.load), %0, %1, 0
+    S_ENDPGM 0, implicit %2
+
+...