diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -1773,16 +1773,31 @@ return false; SDLoc SL(ByteOffsetNode); - GCNSubtarget::Generation Gen = Subtarget->getGeneration(); + // GFX9 and GFX10 have signed byte immediate offsets. + if (Subtarget->hasSMEMImmSignedOffset()) { + int64_t ByteOffset = C->getSExtValue(); + Optional EncodedOffset = + AMDGPU::getSMRDEncodedSignedOffset(*Subtarget, ByteOffset); + if (EncodedOffset) { + Offset = CurDAG->getTargetConstant(*EncodedOffset, SL, MVT::i32); + Imm = true; + return true; + } + } + + if (!CurDAG->SignBitIsZero(ByteOffsetNode)) + return false; + uint64_t ByteOffset = C->getZExtValue(); Optional EncodedOffset = - AMDGPU::getSMRDEncodedOffset(*Subtarget, ByteOffset); + AMDGPU::getSMRDEncodedUnsignedOffset(*Subtarget, ByteOffset); if (EncodedOffset) { Offset = CurDAG->getTargetConstant(*EncodedOffset, SL, MVT::i32); Imm = true; return true; } + GCNSubtarget::Generation Gen = Subtarget->getGeneration(); if (Gen == AMDGPUSubtarget::SEA_ISLANDS) { EncodedOffset = AMDGPU::getSMRDEncodedLiteralOffset32(*Subtarget, ByteOffset); @@ -1878,8 +1893,9 @@ bool AMDGPUDAGToDAGISel::SelectSMRDBufferImm(SDValue Addr, SDValue &Offset) const { if (ConstantSDNode *C = dyn_cast(Addr)) { - if (auto Imm = AMDGPU::getSMRDEncodedOffset(*Subtarget, - C->getZExtValue())) { + // The immediate offset for S_BUFFER instructions is unsigned. 
+ if (auto Imm = AMDGPU::getSMRDEncodedUnsignedOffset(*Subtarget, + C->getZExtValue())) { Offset = CurDAG->getTargetConstant(*Imm, SDLoc(Addr), MVT::i32); return true; } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -2547,7 +2547,10 @@ return None; const GEPInfo &GEPInfo = AddrInfo[0]; - Optional EncodedImm = AMDGPU::getSMRDEncodedOffset(STI, GEPInfo.Imm); + Optional EncodedImm = + STI.hasSMEMImmSignedOffset() + ? AMDGPU::getSMRDEncodedSignedOffset(STI, GEPInfo.Imm) + : AMDGPU::getSMRDEncodedUnsignedOffset(STI, GEPInfo.Imm); if (!EncodedImm) return None; @@ -2593,7 +2596,7 @@ return None; const GEPInfo &GEPInfo = AddrInfo[0]; - if (!GEPInfo.Imm || !isUInt<32>(GEPInfo.Imm)) + if (!GEPInfo.Imm || GEPInfo.Imm < 0 || !isUInt<32>(GEPInfo.Imm)) return None; // If we make it this far we have a load with an 32-bit immediate offset. @@ -3233,7 +3236,7 @@ if (!OffsetVal) return {}; - Optional EncodedImm = AMDGPU::getSMRDEncodedOffset(STI, *OffsetVal); + Optional EncodedImm = AMDGPU::getSMRDEncodedUnsignedOffset(STI, *OffsetVal); if (!EncodedImm) return {}; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h --- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h @@ -1017,6 +1017,10 @@ return CIInsts; } + bool hasSMEMImmSignedOffset() const { + return getGeneration() >= AMDGPUSubtarget::GFX9; + } + bool hasSMovFedHazard() const { return getGeneration() == AMDGPUSubtarget::GFX9; } diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h @@ -644,10 +644,17 @@ /// offsets. 
uint64_t convertSMRDOffsetUnits(const MCSubtargetInfo &ST, uint64_t ByteOffset); -/// \returns The encoding that will be used for \p ByteOffset in the SMRD offset -/// field, or None if it won't fit. This is useful on all subtargets. -Optional getSMRDEncodedOffset(const MCSubtargetInfo &ST, - int64_t ByteOffset); +/// \returns The encoding that will be used for an unsigned \p ByteOffset in the +/// SMRD offset field, or None if it won't fit. This is useful on all +/// subtargets. +Optional getSMRDEncodedUnsignedOffset(const MCSubtargetInfo &ST, + int64_t ByteOffset); + +/// \returns The encoding that will be used for a signed \p ByteOffset in the +/// SMRD offset field, or None if it won't fit. This is useful on GFX9 and GFX10 +/// for S_LOAD instructions. S_BUFFER has an unsigned offset on all subtargets. +Optional getSMRDEncodedSignedOffset(const MCSubtargetInfo &ST, + int64_t ByteOffset); /// \return The encoding that can be used for a 32-bit literal offset in an SMRD /// instruction. This is only useful on CI.s diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -1248,8 +1248,8 @@ return isGCN3Encoding(ST) || isGFX10(ST); } -static bool isLegalSMRDEncodedImmOffset(const MCSubtargetInfo &ST, - int64_t EncodedOffset) { +static bool isLegalSMRDEncodedUnsignedOffset(const MCSubtargetInfo &ST, + int64_t EncodedOffset) { return hasSMEMByteOffset(ST) ? 
isUInt<20>(EncodedOffset) : isUInt<8>(EncodedOffset); } @@ -1267,16 +1267,21 @@ return ByteOffset >> 2; } -Optional getSMRDEncodedOffset(const MCSubtargetInfo &ST, - int64_t ByteOffset) { +Optional getSMRDEncodedUnsignedOffset(const MCSubtargetInfo &ST, + int64_t ByteOffset) { if (!isDwordAligned(ByteOffset) && !hasSMEMByteOffset(ST)) return None; int64_t EncodedOffset = convertSMRDOffsetUnits(ST, ByteOffset); - return isLegalSMRDEncodedImmOffset(ST, EncodedOffset) ? + return isLegalSMRDEncodedUnsignedOffset(ST, EncodedOffset) ? Optional(EncodedOffset) : None; } +Optional getSMRDEncodedSignedOffset(const MCSubtargetInfo &ST, + int64_t ByteOffset) { + return isInt<20>(ByteOffset) ? Optional(ByteOffset) : None; +} + Optional getSMRDEncodedLiteralOffset32(const MCSubtargetInfo &ST, int64_t ByteOffset) { if (!isDwordAligned(ByteOffset) && !hasSMEMByteOffset(ST)) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-constant.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-constant.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-constant.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-constant.mir @@ -800,8 +800,9 @@ ; GFX10: liveins: $sgpr0_sgpr1 ; GFX10: $vcc_hi = IMPLICIT_DEF ; GFX10: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1 - ; GFX10: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]], 1048575, 0, 0 :: (load 4, addrspace 4) - ; GFX10: $sgpr0 = COPY [[S_LOAD_DWORD_IMM]] + ; GFX10: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1048575 + ; GFX10: [[S_LOAD_DWORD_SGPR:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_SGPR [[COPY]], [[S_MOV_B32_]], 0, 0 :: (load 4, addrspace 4) + ; GFX10: $sgpr0 = COPY [[S_LOAD_DWORD_SGPR]] %0:sgpr(p4) = COPY $sgpr0_sgpr1 %1:sgpr(s64) = G_CONSTANT i64 1048575 %2:sgpr(p4) = G_PTR_ADD %0, %1 @@ -896,3 +897,137 @@ $sgpr0 = COPY %3 ... 
+ +--- + +name: load_constant_s32_from_4_gep_negative_1 +legalized: true +regBankSelected: true +tracksRegLiveness: true + +body: | + bb.0: + liveins: $sgpr0_sgpr1 + + ; GFX6-LABEL: name: load_constant_s32_from_4_gep_negative_1 + ; GFX6: liveins: $sgpr0_sgpr1 + ; GFX6: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1 + ; GFX6: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 -1 + ; GFX6: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub0 + ; GFX6: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B64_]].sub0 + ; GFX6: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub1 + ; GFX6: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B64_]].sub1 + ; GFX6: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY1]], [[COPY2]], implicit-def $scc + ; GFX6: [[S_ADDC_U32_:%[0-9]+]]:sreg_32 = S_ADDC_U32 [[COPY3]], [[COPY4]], implicit-def $scc, implicit $scc + ; GFX6: [[REG_SEQUENCE:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[S_ADD_U32_]], %subreg.sub0, [[S_ADDC_U32_]], %subreg.sub1 + ; GFX6: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[REG_SEQUENCE]], 0, 0, 0 :: (load 4, addrspace 4) + ; GFX6: $sgpr0 = COPY [[S_LOAD_DWORD_IMM]] + ; GFX7-LABEL: name: load_constant_s32_from_4_gep_negative_1 + ; GFX7: liveins: $sgpr0_sgpr1 + ; GFX7: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1 + ; GFX7: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 -1 + ; GFX7: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub0 + ; GFX7: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B64_]].sub0 + ; GFX7: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub1 + ; GFX7: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B64_]].sub1 + ; GFX7: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY1]], [[COPY2]], implicit-def $scc + ; GFX7: [[S_ADDC_U32_:%[0-9]+]]:sreg_32 = S_ADDC_U32 [[COPY3]], [[COPY4]], implicit-def $scc, implicit $scc + ; GFX7: [[REG_SEQUENCE:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[S_ADD_U32_]], %subreg.sub0, [[S_ADDC_U32_]], %subreg.sub1 + ; GFX7: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[REG_SEQUENCE]], 0, 0, 0 :: (load 4, 
addrspace 4) + ; GFX7: $sgpr0 = COPY [[S_LOAD_DWORD_IMM]] + ; GFX8-LABEL: name: load_constant_s32_from_4_gep_negative_1 + ; GFX8: liveins: $sgpr0_sgpr1 + ; GFX8: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1 + ; GFX8: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 -1 + ; GFX8: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub0 + ; GFX8: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B64_]].sub0 + ; GFX8: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub1 + ; GFX8: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B64_]].sub1 + ; GFX8: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY1]], [[COPY2]], implicit-def $scc + ; GFX8: [[S_ADDC_U32_:%[0-9]+]]:sreg_32 = S_ADDC_U32 [[COPY3]], [[COPY4]], implicit-def $scc, implicit $scc + ; GFX8: [[REG_SEQUENCE:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[S_ADD_U32_]], %subreg.sub0, [[S_ADDC_U32_]], %subreg.sub1 + ; GFX8: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[REG_SEQUENCE]], 0, 0, 0 :: (load 4, addrspace 4) + ; GFX8: $sgpr0 = COPY [[S_LOAD_DWORD_IMM]] + ; GFX10-LABEL: name: load_constant_s32_from_4_gep_negative_1 + ; GFX10: liveins: $sgpr0_sgpr1 + ; GFX10: $vcc_hi = IMPLICIT_DEF + ; GFX10: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1 + ; GFX10: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]], -1, 0, 0 :: (load 4, addrspace 4) + ; GFX10: $sgpr0 = COPY [[S_LOAD_DWORD_IMM]] + %0:sgpr(p4) = COPY $sgpr0_sgpr1 + %1:sgpr(s64) = G_CONSTANT i64 -1 + %2:sgpr(p4) = G_PTR_ADD %0, %1 + %3:sgpr(s32) = G_LOAD %2 :: (load 4, align 4, addrspace 4) + $sgpr0 = COPY %3 + +... 
+ +--- + +name: load_constant_s32_from_4_gep_negative_524288 +legalized: true +regBankSelected: true +tracksRegLiveness: true + +body: | + bb.0: + liveins: $sgpr0_sgpr1 + + ; GFX6-LABEL: name: load_constant_s32_from_4_gep_negative_524288 + ; GFX6: liveins: $sgpr0_sgpr1 + ; GFX6: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1 + ; GFX6: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4294443008 + ; GFX6: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -1 + ; GFX6: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 + ; GFX6: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub0 + ; GFX6: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX6: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub1 + ; GFX6: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX6: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY1]], [[COPY2]], implicit-def $scc + ; GFX6: [[S_ADDC_U32_:%[0-9]+]]:sreg_32 = S_ADDC_U32 [[COPY3]], [[COPY4]], implicit-def $scc, implicit $scc + ; GFX6: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[S_ADD_U32_]], %subreg.sub0, [[S_ADDC_U32_]], %subreg.sub1 + ; GFX6: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[REG_SEQUENCE1]], 0, 0, 0 :: (load 4, addrspace 4) + ; GFX6: $sgpr0 = COPY [[S_LOAD_DWORD_IMM]] + ; GFX7-LABEL: name: load_constant_s32_from_4_gep_negative_524288 + ; GFX7: liveins: $sgpr0_sgpr1 + ; GFX7: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1 + ; GFX7: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4294443008 + ; GFX7: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -1 + ; GFX7: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 + ; GFX7: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub0 + ; GFX7: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX7: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub1 + ; GFX7: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX7: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 
[[COPY1]], [[COPY2]], implicit-def $scc + ; GFX7: [[S_ADDC_U32_:%[0-9]+]]:sreg_32 = S_ADDC_U32 [[COPY3]], [[COPY4]], implicit-def $scc, implicit $scc + ; GFX7: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[S_ADD_U32_]], %subreg.sub0, [[S_ADDC_U32_]], %subreg.sub1 + ; GFX7: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[REG_SEQUENCE1]], 0, 0, 0 :: (load 4, addrspace 4) + ; GFX7: $sgpr0 = COPY [[S_LOAD_DWORD_IMM]] + ; GFX8-LABEL: name: load_constant_s32_from_4_gep_negative_524288 + ; GFX8: liveins: $sgpr0_sgpr1 + ; GFX8: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1 + ; GFX8: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4294443008 + ; GFX8: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -1 + ; GFX8: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 + ; GFX8: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub0 + ; GFX8: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX8: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub1 + ; GFX8: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX8: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY1]], [[COPY2]], implicit-def $scc + ; GFX8: [[S_ADDC_U32_:%[0-9]+]]:sreg_32 = S_ADDC_U32 [[COPY3]], [[COPY4]], implicit-def $scc, implicit $scc + ; GFX8: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[S_ADD_U32_]], %subreg.sub0, [[S_ADDC_U32_]], %subreg.sub1 + ; GFX8: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[REG_SEQUENCE1]], 0, 0, 0 :: (load 4, addrspace 4) + ; GFX8: $sgpr0 = COPY [[S_LOAD_DWORD_IMM]] + ; GFX10-LABEL: name: load_constant_s32_from_4_gep_negative_524288 + ; GFX10: liveins: $sgpr0_sgpr1 + ; GFX10: $vcc_hi = IMPLICIT_DEF + ; GFX10: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1 + ; GFX10: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]], -524288, 0, 0 :: (load 4, addrspace 4) + ; GFX10: $sgpr0 = COPY [[S_LOAD_DWORD_IMM]] + %0:sgpr(p4) = COPY $sgpr0_sgpr1 + %1:sgpr(s64) = 
G_CONSTANT i64 -524288 + %2:sgpr(p4) = G_PTR_ADD %0, %1 + %3:sgpr(s32) = G_LOAD %2 :: (load 4, align 4, addrspace 4) + $sgpr0 = COPY %3 + +... diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/smrd.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/smrd.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/smrd.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/smrd.ll @@ -1,12 +1,14 @@ ; FIXME: Need to add support for mubuf stores to enable this on SI. ; XUN: llc < %s -march=amdgcn -mcpu=tahiti -show-mc-encoding -verify-machineinstrs -global-isel | FileCheck --check-prefix=SI --check-prefix=GCN --check-prefix=SIVI %s -; RUN: llc < %s -march=amdgcn -mcpu=bonaire -show-mc-encoding -verify-machineinstrs -global-isel | FileCheck --check-prefix=CI --check-prefix=GCN %s -; RUN: llc < %s -march=amdgcn -mcpu=tonga -show-mc-encoding -verify-machineinstrs -global-isel | FileCheck --check-prefix=VI --check-prefix=GCN --check-prefix=SIVI %s +; RUN: llc < %s -march=amdgcn -mcpu=bonaire -show-mc-encoding -verify-machineinstrs -global-isel | FileCheck --check-prefix=CI --check-prefix=GCN --check-prefix=SICIVI %s +; RUN: llc < %s -march=amdgcn -mcpu=tonga -show-mc-encoding -verify-machineinstrs -global-isel | FileCheck --check-prefix=VI --check-prefix=GCN --check-prefix=SIVI --check-prefix=SICIVI %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -show-mc-encoding -verify-machineinstrs -global-isel < %s | FileCheck -check-prefix=GFX9 -check-prefix=GFX9_10 -check-prefix=GCN -check-prefix=VIGFX9_10 -check-prefix=SIVIGFX9_10 %s +; RUN: llc -march=amdgcn -mcpu=gfx1010 -show-mc-encoding -verify-machineinstrs -global-isel < %s | FileCheck -check-prefix=GFX10 -check-prefix=GFX9_10 -check-prefix=GCN -check-prefix=VIGFX9_10 -check-prefix=SIVIGFX9_10 %s ; SMRD load with an immediate offset. 
; GCN-LABEL: {{^}}smrd0: ; SICI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x1 ; encoding: [0x01 -; VI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x4 +; VIGFX9_10: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x4 define amdgpu_kernel void @smrd0(i32 addrspace(4)* %ptr) { entry: %0 = getelementptr i32, i32 addrspace(4)* %ptr, i64 1 @@ -18,7 +20,7 @@ ; SMRD load with the largest possible immediate offset. ; GCN-LABEL: {{^}}smrd1: ; SICI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0xff ; encoding: [0xff,0x{{[0-9]+[137]}} -; VI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x3fc +; VIGFX9_10: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x3fc define amdgpu_kernel void @smrd1(i32 addrspace(4)* %ptr) { entry: %0 = getelementptr i32, i32 addrspace(4)* %ptr, i64 255 @@ -32,7 +34,7 @@ ; SI: s_movk_i32 s[[OFFSET:[0-9]]], 0x400 ; SI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], s[[OFFSET]] ; encoding: [0x0[[OFFSET]] ; CI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x100 -; VI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x400 +; VIGFX9_10: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x400 ; GCN: s_endpgm define amdgpu_kernel void @smrd2(i32 addrspace(4)* %ptr) { entry: @@ -63,6 +65,8 @@ ; SI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], [[OFFSET]] ; CI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x3ffff ; VI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0xffffc +; GFX9_10: s_mov_b32 [[OFFSET:s[0-9]+]], 0xffffc +; GFX9_10: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], [[OFFSET]] define amdgpu_kernel void @smrd4(i32 addrspace(4)* %ptr) { entry: %0 = getelementptr i32, i32 addrspace(4)* %ptr, i64 262143 @@ -73,8 +77,8 @@ ; SMRD load with an offset greater than the largest possible immediate on VI ; GCN-LABEL: {{^}}smrd5: -; SIVI: s_mov_b32 [[OFFSET:s[0-9]+]], 0x100000 -; SIVI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], [[OFFSET]] +; SIVIGFX9_10: s_mov_b32 [[OFFSET:s[0-9]+]], 0x100000 +; SIVIGFX9_10: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], [[OFFSET]] ; CI: 
s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x40000 ; GCN: s_endpgm define amdgpu_kernel void @smrd5(i32 addrspace(4)* %ptr) { @@ -85,3 +89,28 @@ ret void } +; GFX9_10 can use a signed immediate byte offset +; GCN-LABEL: {{^}}smrd6: +; SICIVI: s_add_u32 s{{[0-9]}}, s{{[0-9]}}, -4 +; SICIVI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x0 +; GFX9_10: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0xfffffffc +define amdgpu_kernel void @smrd6(i32 addrspace(1)* %out, i32 addrspace(4)* %ptr) #0 { +entry: + %tmp = getelementptr i32, i32 addrspace(4)* %ptr, i64 -1 + %tmp1 = load i32, i32 addrspace(4)* %tmp + store i32 %tmp1, i32 addrspace(1)* %out + ret void +} + +; Don't use a negative SGPR offset +; GCN-LABEL: {{^}}smrd7: +; GCN: s_add_u32 s{{[0-9]}}, s{{[0-9]}}, 0xffe00000 +; SICIVI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x0 +; GFX9_10: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x0 +define amdgpu_kernel void @smrd7(i32 addrspace(1)* %out, i32 addrspace(4)* %ptr) #0 { +entry: + %tmp = getelementptr i32, i32 addrspace(4)* %ptr, i64 -524288 + %tmp1 = load i32, i32 addrspace(4)* %tmp + store i32 %tmp1, i32 addrspace(1)* %out + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/smrd.ll b/llvm/test/CodeGen/AMDGPU/smrd.ll --- a/llvm/test/CodeGen/AMDGPU/smrd.ll +++ b/llvm/test/CodeGen/AMDGPU/smrd.ll @@ -1,8 +1,8 @@ ; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=SICIVI -check-prefix=SICI -check-prefix=SIVIGFX9_10 %s ; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefix=CI -check-prefix=GCN -check-prefix=SICIVI -check-prefix=SICI %s ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefix=VI -check-prefix=GCN -check-prefix=SICIVI -check-prefix=VIGFX9_10 -check-prefix=SIVIGFX9_10 %s -; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -show-mc-encoding < %s | 
FileCheck -check-prefix=GFX9 -check-prefix=GCN -check-prefix=VIGFX9_10 -check-prefix=SIVIGFX9_10 %s -; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefix=GFX10 -check-prefix=GCN -check-prefix=VIGFX9_10 -check-prefix=SIVIGFX9_10 %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefix=GFX9 -check-prefix=GFX9_10 -check-prefix=GCN -check-prefix=VIGFX9_10 -check-prefix=SIVIGFX9_10 %s +; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefix=GFX10 -check-prefix=GFX9_10 -check-prefix=GCN -check-prefix=VIGFX9_10 -check-prefix=SIVIGFX9_10 %s ; SMRD load with an immediate offset. ; GCN-LABEL: {{^}}smrd0: @@ -86,6 +86,32 @@ ret void } +; GFX9_10 can use a signed immediate byte offset +; GCN-LABEL: {{^}}smrd6: +; SICIVI: s_add_u32 s{{[0-9]}}, s{{[0-9]}}, -4 +; SICIVI: s_load_dword s{{[0-9]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x0 +; GFX9_10: s_load_dword s{{[0-9]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xfffffffc +define amdgpu_kernel void @smrd6(i32 addrspace(1)* %out, i32 addrspace(4)* %ptr) #0 { +entry: + %tmp = getelementptr i32, i32 addrspace(4)* %ptr, i64 -1 + %tmp1 = load i32, i32 addrspace(4)* %tmp + store i32 %tmp1, i32 addrspace(1)* %out + ret void +} + +; Don't use a negative SGPR offset +; GCN-LABEL: {{^}}smrd7: +; GCN: s_add_u32 s{{[0-9]}}, s{{[0-9]}}, 0xffe00000 +; SICIVI: s_load_dword s{{[0-9]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x0 +; GFX9_10: s_load_dword s{{[0-9]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x0 +define amdgpu_kernel void @smrd7(i32 addrspace(1)* %out, i32 addrspace(4)* %ptr) #0 { +entry: + %tmp = getelementptr i32, i32 addrspace(4)* %ptr, i64 -524288 + %tmp1 = load i32, i32 addrspace(4)* %tmp + store i32 %tmp1, i32 addrspace(1)* %out + ret void +} + ; GCN-LABEL: {{^}}smrd_hazard: ; GCN-DAG: s_mov_b32 s3, 3 ; GCN-DAG: s_mov_b32 s2, 2