diff --git a/llvm/include/llvm/Support/AMDHSAKernelDescriptor.h b/llvm/include/llvm/Support/AMDHSAKernelDescriptor.h --- a/llvm/include/llvm/Support/AMDHSAKernelDescriptor.h +++ b/llvm/include/llvm/Support/AMDHSAKernelDescriptor.h @@ -136,13 +136,17 @@ // Compute program resource register 3 for GFX10+. Must match hardware // definition. -#define COMPUTE_PGM_RSRC3_GFX10(NAME, SHIFT, WIDTH) \ - AMDHSA_BITS_ENUM_ENTRY(COMPUTE_PGM_RSRC3_GFX10_ ## NAME, SHIFT, WIDTH) +#define COMPUTE_PGM_RSRC3_GFX10_PLUS(NAME, SHIFT, WIDTH) \ + AMDHSA_BITS_ENUM_ENTRY(COMPUTE_PGM_RSRC3_GFX10_PLUS_ ## NAME, SHIFT, WIDTH) enum : int32_t { - COMPUTE_PGM_RSRC3_GFX10(SHARED_VGPR_COUNT, 0, 4), // GFX10+ - COMPUTE_PGM_RSRC3_GFX10(RESERVED0, 4, 28), + COMPUTE_PGM_RSRC3_GFX10_PLUS(SHARED_VGPR_COUNT, 0, 4), // GFX10+ + COMPUTE_PGM_RSRC3_GFX10_PLUS(INST_PREF_SIZE, 4, 6), // GFX11+ + COMPUTE_PGM_RSRC3_GFX10_PLUS(TRAP_ON_START, 10, 1), // GFX11+ + COMPUTE_PGM_RSRC3_GFX10_PLUS(TRAP_ON_END, 11, 1), // GFX11+ + COMPUTE_PGM_RSRC3_GFX10_PLUS(RESERVED0, 12, 19), + COMPUTE_PGM_RSRC3_GFX10_PLUS(IMAGE_OP, 31, 1), // GFX11+ }; -#undef COMPUTE_PGM_RSRC3_GFX10 +#undef COMPUTE_PGM_RSRC3_GFX10_PLUS // Kernel code properties. Must be kept backwards compatible. 
#define KERNEL_CODE_PROPERTY(NAME, SHIFT, WIDTH) \ diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp @@ -695,7 +695,7 @@ ProgInfo.DynamicCallStack = Info.HasDynamicallySizedStack || Info.HasRecursion; const uint64_t MaxScratchPerWorkitem = - GCNSubtarget::MaxWaveScratchSize / STM.getWavefrontSize(); + STM.getMaxWaveScratchSize() / STM.getWavefrontSize(); if (ProgInfo.ScratchSize > MaxScratchPerWorkitem) { DiagnosticInfoStackSize DiagStackSize(MF.getFunction(), ProgInfo.ScratchSize, @@ -879,15 +879,14 @@ ProgInfo.LDSBlocks = alignTo(ProgInfo.LDSSize, 1ULL << LDSAlignShift) >> LDSAlignShift; - // Scratch is allocated in 256 dword blocks. - unsigned ScratchAlignShift = 10; + // Scratch is allocated in 64-dword or 256-dword blocks. + unsigned ScratchAlignShift = + STM.getGeneration() >= AMDGPUSubtarget::GFX11 ? 8 : 10; // We need to program the hardware with the amount of scratch memory that // is used by the entire wave. ProgInfo.ScratchSize is the amount of // scratch memory used per thread. - ProgInfo.ScratchBlocks = - alignTo(ProgInfo.ScratchSize * STM.getWavefrontSize(), - 1ULL << ScratchAlignShift) >> - ScratchAlignShift; + ProgInfo.ScratchBlocks = divideCeil( + ProgInfo.ScratchSize * STM.getWavefrontSize(), 1ULL << ScratchAlignShift); if (getIsaVersion(getGlobalSTI()->getCPU()).Major >= 10) { ProgInfo.WgpMode = STM.isCuModeEnabled() ? 
0 : 1; @@ -946,6 +945,7 @@ void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF, const SIProgramInfo &CurrentProgramInfo) { const SIMachineFunctionInfo *MFI = MF.getInfo(); + const GCNSubtarget &STM = MF.getSubtarget(); unsigned RsrcReg = getRsrcReg(MF.getFunction().getCallingConv()); if (AMDGPU::isCompute(MF.getFunction().getCallingConv())) { @@ -957,7 +957,10 @@ OutStreamer->emitInt32(CurrentProgramInfo.ComputePGMRSrc2); OutStreamer->emitInt32(R_00B860_COMPUTE_TMPRING_SIZE); - OutStreamer->emitInt32(S_00B860_WAVESIZE(CurrentProgramInfo.ScratchBlocks)); + OutStreamer->emitInt32( + STM.getGeneration() >= AMDGPUSubtarget::GFX11 + ? S_00B860_WAVESIZE_GFX11Plus(CurrentProgramInfo.ScratchBlocks) + : S_00B860_WAVESIZE_PreGFX11(CurrentProgramInfo.ScratchBlocks)); // TODO: Should probably note flat usage somewhere. SC emits a "FlatPtr32 = // 0" comment but I don't see a corresponding field in the register spec. @@ -966,14 +969,18 @@ OutStreamer->emitIntValue(S_00B028_VGPRS(CurrentProgramInfo.VGPRBlocks) | S_00B028_SGPRS(CurrentProgramInfo.SGPRBlocks), 4); OutStreamer->emitInt32(R_0286E8_SPI_TMPRING_SIZE); - OutStreamer->emitIntValue( - S_0286E8_WAVESIZE(CurrentProgramInfo.ScratchBlocks), 4); + OutStreamer->emitInt32( + STM.getGeneration() >= AMDGPUSubtarget::GFX11 + ? S_0286E8_WAVESIZE_GFX11Plus(CurrentProgramInfo.ScratchBlocks) + : S_0286E8_WAVESIZE_PreGFX11(CurrentProgramInfo.ScratchBlocks)); } if (MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS) { OutStreamer->emitInt32(R_00B02C_SPI_SHADER_PGM_RSRC2_PS); - OutStreamer->emitInt32( - S_00B02C_EXTRA_LDS_SIZE(CurrentProgramInfo.LDSBlocks)); + unsigned ExtraLDSSize = STM.getGeneration() >= AMDGPUSubtarget::GFX11 + ? 
divideCeil(CurrentProgramInfo.LDSBlocks, 2) + : CurrentProgramInfo.LDSBlocks; + OutStreamer->emitInt32(S_00B02C_EXTRA_LDS_SIZE(ExtraLDSSize)); OutStreamer->emitInt32(R_0286CC_SPI_PS_INPUT_ENA); OutStreamer->emitInt32(MFI->getPSInputEnable()); OutStreamer->emitInt32(R_0286D0_SPI_PS_INPUT_ADDR); @@ -1017,7 +1024,10 @@ // ScratchSize is in bytes, 16 aligned. MD->setScratchSize(CC, alignTo(CurrentProgramInfo.ScratchSize, 16)); if (MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS) { - MD->setRsrc2(CC, S_00B02C_EXTRA_LDS_SIZE(CurrentProgramInfo.LDSBlocks)); + unsigned ExtraLDSSize = STM.getGeneration() >= AMDGPUSubtarget::GFX11 + ? divideCeil(CurrentProgramInfo.LDSBlocks, 2) + : CurrentProgramInfo.LDSBlocks; + MD->setRsrc2(CC, S_00B02C_EXTRA_LDS_SIZE(ExtraLDSSize)); MD->setSpiPsInputEna(MFI->getPSInputEnable()); MD->setSpiPsInputAddr(MFI->getPSInputAddr()); } diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp --- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -1650,6 +1650,7 @@ const SMLoc &IDLoc); bool validateFlatLdsDMA(const MCInst &Inst, const OperandVector &Operands, const SMLoc &IDLoc); + bool validateExeczVcczOperands(const OperandVector &Operands); Optional validateLdsDirect(const MCInst &Inst); unsigned getConstantBusLimit(unsigned Opcode) const; bool usesConstantBus(const MCInst &Inst, unsigned OpIdx); @@ -4495,6 +4496,22 @@ return true; } +bool AMDGPUAsmParser::validateExeczVcczOperands(const OperandVector &Operands) { + if (!isGFX11Plus()) + return true; + for (auto& Operand: Operands) { + if (!Operand->isReg()) + continue; + unsigned Reg = Operand->getReg(); + if (Reg == SRC_EXECZ || Reg == SRC_VCCZ) { + Error(getRegLoc(Reg, Operands), + "execz and vccz are not supported on this GPU"); + return false; + } + } + return true; +} + bool AMDGPUAsmParser::validateInstruction(const MCInst &Inst, const SMLoc &IDLoc, 
const OperandVector &Operands) { @@ -4609,6 +4626,9 @@ if (!validateCoherencyBits(Inst, Operands, IDLoc)) { return false; } + if (!validateExeczVcczOperands(Operands)) { + return false; + } if (!validateFlatLdsDMA(Inst, Operands, IDLoc)) { return false; @@ -5088,7 +5108,7 @@ return Error(IDRange.Start, "directive requires gfx10+", IDRange); SharedVGPRCount = Val; PARSE_BITS_ENTRY(KD.compute_pgm_rsrc3, - COMPUTE_PGM_RSRC3_GFX10_SHARED_VGPR_COUNT, Val, + COMPUTE_PGM_RSRC3_GFX10_PLUS_SHARED_VGPR_COUNT, Val, ValRange); } else if (ID == ".amdhsa_exception_fp_ieee_invalid_op") { PARSE_BITS_ENTRY( @@ -5586,7 +5606,7 @@ if (MRI.regsOverlap(AMDGPU::TTMP12_TTMP13_TTMP14_TTMP15, RegNo)) return isGFX9Plus(); - // GFX10 has 2 more SGPRs 104 and 105. + // GFX10+ has 2 more SGPRs 104 and 105. if (MRI.regsOverlap(AMDGPU::SGPR104_SGPR105, RegNo)) return hasSGPR104_SGPR105(); @@ -5595,8 +5615,9 @@ case AMDGPU::SRC_SHARED_LIMIT: case AMDGPU::SRC_PRIVATE_BASE: case AMDGPU::SRC_PRIVATE_LIMIT: - case AMDGPU::SRC_POPS_EXITING_WAVE_ID: return isGFX9Plus(); + case AMDGPU::SRC_POPS_EXITING_WAVE_ID: + return isGFX9Plus() && !isGFX11Plus(); case AMDGPU::TBA: case AMDGPU::TBA_LO: case AMDGPU::TBA_HI: @@ -5619,7 +5640,7 @@ if (isSI() || isGFX10Plus()) { // No flat_scr on SI. - // On GFX10 flat scratch is not a valid register operand and can only be + // On GFX10Plus flat scratch is not a valid register operand and can only be // accessed with s_setreg/s_getreg. switch (RegNo) { case AMDGPU::FLAT_SCR: diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h @@ -201,9 +201,6 @@ SIFrameLowering FrameLowering; public: - // See COMPUTE_TMPRING_SIZE.WAVESIZE, 13-bit field in units of 256-dword. 
- static const unsigned MaxWaveScratchSize = (256 * 4) * ((1 << 13) - 1); - GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS, const GCNTargetMachine &TM); ~GCNSubtarget() override; @@ -266,9 +263,19 @@ return (Generation)Gen; } + unsigned getMaxWaveScratchSize() const { + // See COMPUTE_TMPRING_SIZE.WAVESIZE. + if (getGeneration() < GFX11) { + // 13-bit field in units of 256-dword. + return (256 * 4) * ((1 << 13) - 1); + } + // 15-bit field in units of 64-dword. + return (64 * 4) * ((1 << 15) - 1); + } + /// Return the number of high bits known to be zero for a frame index. unsigned getKnownHighZeroBitsForFrameIndex() const { - return countLeadingZeros(MaxWaveScratchSize) + getWavefrontSizeLog2(); + return countLeadingZeros(getMaxWaveScratchSize()) + getWavefrontSizeLog2(); } int getLDSBankCount() const { diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp @@ -301,7 +301,7 @@ uint32_t Encoded_pad = Encoded_s_code_end; // Instruction cache line size in bytes. - const unsigned Log2CacheLineSize = 6; + const unsigned Log2CacheLineSize = AMDGPU::isGFX11Plus(STI) ? 7 : 6; const unsigned CacheLineSize = 1u << Log2CacheLineSize; // Extra padding amount in bytes to support prefetch mode 3. @@ -456,7 +456,7 @@ compute_pgm_rsrc1, amdhsa::COMPUTE_PGM_RSRC1_FWD_PROGRESS); PRINT_FIELD(OS, ".amdhsa_shared_vgpr_count", KD, compute_pgm_rsrc3, - amdhsa::COMPUTE_PGM_RSRC3_GFX10_SHARED_VGPR_COUNT); + amdhsa::COMPUTE_PGM_RSRC3_GFX10_PLUS_SHARED_VGPR_COUNT); } PRINT_FIELD( OS, ".amdhsa_exception_fp_ieee_invalid_op", KD, @@ -824,7 +824,7 @@ uint32_t Encoded_pad = Encoded_s_code_end; // Instruction cache line size in bytes. - const unsigned Log2CacheLineSize = 6; + const unsigned Log2CacheLineSize = AMDGPU::isGFX11Plus(STI) ? 
7 : 6; const unsigned CacheLineSize = 1u << Log2CacheLineSize; // Extra padding amount in bytes to support prefetch mode 3. diff --git a/llvm/lib/Target/AMDGPU/SIDefines.h b/llvm/lib/Target/AMDGPU/SIDefines.h --- a/llvm/lib/Target/AMDGPU/SIDefines.h +++ b/llvm/lib/Target/AMDGPU/SIDefines.h @@ -1036,10 +1036,12 @@ #define FP_DENORM_MODE_DP(x) (((x) & 0x3) << 6) #define R_00B860_COMPUTE_TMPRING_SIZE 0x00B860 -#define S_00B860_WAVESIZE(x) (((x) & 0x1FFF) << 12) +#define S_00B860_WAVESIZE_PreGFX11(x) (((x) & 0x1FFF) << 12) +#define S_00B860_WAVESIZE_GFX11Plus(x) (((x) & 0x7FFF) << 12) #define R_0286E8_SPI_TMPRING_SIZE 0x0286E8 -#define S_0286E8_WAVESIZE(x) (((x) & 0x1FFF) << 12) +#define S_0286E8_WAVESIZE_PreGFX11(x) (((x) & 0x1FFF) << 12) +#define S_0286E8_WAVESIZE_GFX11Plus(x) (((x) & 0x7FFF) << 12) #define R_028B54_VGT_SHADER_STAGES_EN 0x028B54 #define S_028B54_HS_W32_EN(x) (((x) & 0x1) << 21) diff --git a/llvm/test/MC/AMDGPU/gfx10-constant-bus.s b/llvm/test/MC/AMDGPU/gfx10-constant-bus.s --- a/llvm/test/MC/AMDGPU/gfx10-constant-bus.s +++ b/llvm/test/MC/AMDGPU/gfx10-constant-bus.s @@ -1,63 +1,72 @@ -// RUN: not llvm-mc -arch=amdgcn -mcpu=gfx1010 -show-encoding %s | FileCheck -check-prefix=GFX10 %s -// RUN: not llvm-mc -arch=amdgcn -mcpu=gfx1010 %s 2>&1 | FileCheck -check-prefix=GFX10-ERR --implicit-check-not=error: %s +// RUN: not llvm-mc -arch=amdgcn -mcpu=gfx1010 -show-encoding %s | FileCheck -check-prefixes=GCN,GFX10 %s +// RUN: not llvm-mc -arch=amdgcn -mcpu=gfx1010 %s 2>&1 | FileCheck -check-prefixes=GCN-ERR --implicit-check-not=error: %s +// RUN: not llvm-mc -arch=amdgcn -mcpu=gfx1100 -show-encoding %s | FileCheck -check-prefixes=GCN,GFX11 %s +// RUN: not llvm-mc -arch=amdgcn -mcpu=gfx1100 %s 2>&1 | FileCheck -check-prefixes=GCN-ERR,GFX11-ERR --implicit-check-not=error: %s //----------------------------------------------------------------------------------------- // On GFX10 we can use two scalar operands (except for 64-bit shift instructions) v_add_f32 v0, 
s0, s1 -// GFX10: v_add_f32_e64 v0, s0, s1 ; encoding: [0x00,0x00,0x03,0xd5,0x00,0x02,0x00,0x00] +// GCN: v_add_f32_e64 v0, s0, s1 ; encoding: [0x00,0x00,0x03,0xd5,0x00,0x02,0x00,0x00] v_madak_f32 v0, s0, v1, 42.42 // GFX10: v_madak_f32 v0, s0, v1, 0x4229ae14 ; encoding: [0x00,0x02,0x00,0x42,0x14,0xae,0x29,0x42] +// GFX11-ERR: error: instruction not supported on this GPU v_med3_f32 v0, s0, s0, s1 // GFX10: v_med3_f32 v0, s0, s0, s1 ; encoding: [0x00,0x00,0x57,0xd5,0x00,0x00,0x04,0x00] +// GFX11: v_med3_f32 v0, s0, s0, s1 ; encoding: [0x00,0x00,0x1f,0xd6,0x00,0x00,0x04,0x00] //----------------------------------------------------------------------------------------- // 64-bit shift instructions can use only one scalar value input v_ashrrev_i64 v[0:1], 0x100, s[0:1] -// GFX10-ERR: error: invalid operand (violates constant bus restrictions) +// GCN-ERR: error: invalid operand (violates constant bus restrictions) v_ashrrev_i64 v[0:1], s2, s[0:1] -// GFX10-ERR: error: invalid operand (violates constant bus restrictions) +// GCN-ERR: error: invalid operand (violates constant bus restrictions) //----------------------------------------------------------------------------------------- // v_div_fmas implicitly reads VCC, so only one scalar operand is possible v_div_fmas_f32 v5, s3, s3, s3 // GFX10: v_div_fmas_f32 v5, s3, s3, s3 ; encoding: [0x05,0x00,0x6f,0xd5,0x03,0x06,0x0c,0x00] +// GFX11: v_div_fmas_f32 v5, s3, s3, s3 ; encoding: [0x05,0x00,0x37,0xd6,0x03,0x06,0x0c,0x00] v_div_fmas_f32 v5, s3, s3, s2 -// GFX10-ERR: error: invalid operand (violates constant bus restrictions) +// GCN-ERR: error: invalid operand (violates constant bus restrictions) v_div_fmas_f32 v5, s3, 0x123, v3 -// GFX10-ERR: error: invalid operand (violates constant bus restrictions) +// GCN-ERR: error: invalid operand (violates constant bus restrictions) v_div_fmas_f64 v[5:6], 0x12345678, 0x12345678, 0x12345678 // GFX10: v_div_fmas_f64 v[5:6], 0x12345678, 0x12345678, 0x12345678 ; encoding: 
[0x05,0x00,0x70,0xd5,0xff,0xfe,0xfd,0x03,0x78,0x56,0x34,0x12] +// GFX11: v_div_fmas_f64 v[5:6], 0x12345678, 0x12345678, 0x12345678 ; encoding: [0x05,0x00,0x38,0xd6,0xff,0xfe,0xfd,0x03,0x78,0x56,0x34,0x12] v_div_fmas_f64 v[5:6], v[1:2], s[2:3], v[3:4] // GFX10: v_div_fmas_f64 v[5:6], v[1:2], s[2:3], v[3:4] ; encoding: [0x05,0x00,0x70,0xd5,0x01,0x05,0x0c,0x04] +// GFX11: v_div_fmas_f64 v[5:6], v[1:2], s[2:3], v[3:4] ; encoding: [0x05,0x00,0x38,0xd6,0x01,0x05,0x0c,0x04] v_div_fmas_f64 v[5:6], v[1:2], s[2:3], 0x123456 -// GFX10-ERR: error: invalid operand (violates constant bus restrictions) +// GCN-ERR: error: invalid operand (violates constant bus restrictions) //----------------------------------------------------------------------------------------- // v_mad_u64_u32 has operands of different sizes. // When these operands are literals, they are counted as 2 scalar values even if literals are identical. v_lshlrev_b64 v[5:6], 0x3f717273, 0x3f717273 -// GFX10-ERR: error: invalid operand (violates constant bus restrictions) +// GCN-ERR: error: invalid operand (violates constant bus restrictions) v_mad_u64_u32 v[5:6], s12, v1, 0x12345678, 0x12345678 // GFX10: v_mad_u64_u32 v[5:6], s12, v1, 0x12345678, 0x12345678 ; encoding: [0x05,0x0c,0x76,0xd5,0x01,0xff,0xfd,0x03,0x78,0x56,0x34,0x12] +// GFX11: v_mad_u64_u32 v[5:6], s12, v1, 0x12345678, 0x12345678 ; encoding: [0x05,0x0c,0xfe,0xd6,0x01,0xff,0xfd,0x03,0x78,0x56,0x34,0x12] v_mad_u64_u32 v[5:6], s12, s1, 0x12345678, 0x12345678 -// GFX10-ERR: error: invalid operand (violates constant bus restrictions) +// GCN-ERR: error: invalid operand (violates constant bus restrictions) //----------------------------------------------------------------------------------------- // null is free v_bfe_u32 v5, s1, s2, null // GFX10: v_bfe_u32 v5, s1, s2, null ; encoding: [0x05,0x00,0x48,0xd5,0x01,0x04,0xf4,0x01] +// GFX11: v_bfe_u32 v5, s1, s2, null ; encoding: [0x05,0x00,0x10,0xd6,0x01,0x04,0xf0,0x01] diff --git 
a/llvm/test/MC/AMDGPU/gfx11_asm_operands.s b/llvm/test/MC/AMDGPU/gfx11_asm_operands.s new file mode 100644 --- /dev/null +++ b/llvm/test/MC/AMDGPU/gfx11_asm_operands.s @@ -0,0 +1,144 @@ +// RUN: llvm-mc -arch=amdgcn -mcpu=gfx1030 -show-encoding %s | FileCheck --check-prefix=GFX10 %s +// RUN: not llvm-mc -arch=amdgcn -mcpu=gfx1100 -show-encoding %s | FileCheck --check-prefix=GFX11 %s +// RUN: not llvm-mc -arch=amdgcn -mcpu=gfx1100 -show-encoding %s 2>&1 | FileCheck --check-prefixes=GFX11-ERR %s + +// On GFX11+, EXECZ and VCCZ are no longer allowed to be used as sources to SALU and VALU instructions. +// The inline constants are removed. VCCZ and EXECZ still exist and can be used for conditional branches. +// LDS_DIRECT and POPS_EXITING_WAVE_ID are also no longer allowed. + +//---------------------------------------------------------------------------// +// EXECZ +//---------------------------------------------------------------------------// + +s_cbranch_execz 0x100 +// GFX10: encoding: [0x00,0x01,0x88,0xbf] +// GFX11: encoding: [0x00,0x01,0xa5,0xbf] + +s_add_i32 s0, execz, s2 +// GFX10: encoding: [0xfc,0x02,0x00,0x81] +// GFX11-ERR: error: execz and vccz are not supported on this GPU + +s_add_i32 s0, src_execz, s2 +// GFX10: encoding: [0xfc,0x02,0x00,0x81] +// GFX11-ERR: error: execz and vccz are not supported on this GPU + +s_add_i32 s0, s1, execz +// GFX10: encoding: [0x01,0xfc,0x00,0x81] +// GFX11-ERR: error: execz and vccz are not supported on this GPU + +s_add_i32 s0, s1, src_execz +// GFX10: encoding: [0x01,0xfc,0x00,0x81] +// GFX11-ERR: error: execz and vccz are not supported on this GPU + +v_add_f64 v[0:1], execz, v[2:3] +// GFX10: encoding: [0x00,0x00,0x64,0xd5,0xfc,0x04,0x02,0x00] +// GFX11-ERR: error: execz and vccz are not supported on this GPU + +v_add_f64 v[0:1], src_execz, v[2:3] +// GFX10: encoding: [0x00,0x00,0x64,0xd5,0xfc,0x04,0x02,0x00] +// GFX11-ERR: error: execz and vccz are not supported on this GPU + +v_add_f64 v[0:1], v[1:2], execz +//
GFX10: encoding: [0x00,0x00,0x64,0xd5,0x01,0xf9,0x01,0x00] +// GFX11-ERR: error: execz and vccz are not supported on this GPU + +v_add_f64 v[0:1], v[1:2], src_execz +// GFX10: encoding: [0x00,0x00,0x64,0xd5,0x01,0xf9,0x01,0x00] +// GFX11-ERR: error: execz and vccz are not supported on this GPU + +//---------------------------------------------------------------------------// +// VCCZ +//---------------------------------------------------------------------------// + +s_cbranch_vccz 0x100 +// GFX10: encoding: [0x00,0x01,0x86,0xbf] +// GFX11: encoding: [0x00,0x01,0xa3,0xbf] + +s_add_i32 s0, vccz, s2 +// GFX10: encoding: [0xfb,0x02,0x00,0x81] +// GFX11-ERR: error: execz and vccz are not supported on this GPU + +s_add_i32 s0, src_vccz, s2 +// GFX10: encoding: [0xfb,0x02,0x00,0x81] +// GFX11-ERR: error: execz and vccz are not supported on this GPU + +s_add_i32 s0, s1, vccz +// GFX10: encoding: [0x01,0xfb,0x00,0x81] +// GFX11-ERR: error: execz and vccz are not supported on this GPU + +s_add_i32 s0, s1, src_vccz +// GFX10: encoding: [0x01,0xfb,0x00,0x81] +// GFX11-ERR: error: execz and vccz are not supported on this GPU + +v_add_f64 v[0:1], vccz, v[2:3] +// GFX10: encoding: [0x00,0x00,0x64,0xd5,0xfb,0x04,0x02,0x00] +// GFX11-ERR: error: execz and vccz are not supported on this GPU + +v_add_f64 v[0:1], src_vccz, v[2:3] +// GFX10: encoding: [0x00,0x00,0x64,0xd5,0xfb,0x04,0x02,0x00] +// GFX11-ERR: error: execz and vccz are not supported on this GPU + +v_add_f64 v[0:1], v[1:2], vccz +// GFX10: encoding: [0x00,0x00,0x64,0xd5,0x01,0xf7,0x01,0x00] +// GFX11-ERR: error: execz and vccz are not supported on this GPU + +v_add_f64 v[0:1], v[1:2], src_vccz +// GFX10: encoding: [0x00,0x00,0x64,0xd5,0x01,0xf7,0x01,0x00] +// GFX11-ERR: error: execz and vccz are not supported on this GPU + +//---------------------------------------------------------------------------// +// LDS_DIRECT +//---------------------------------------------------------------------------// + +v_readfirstlane_b32 s0, 
lds_direct +// GFX10: encoding: [0xfe,0x04,0x00,0x7e] +// GFX11-ERR: error: lds_direct is not supported on this GPU + +v_readfirstlane_b32 s0, src_lds_direct +// GFX10: encoding: [0xfe,0x04,0x00,0x7e] +// GFX11-ERR: error: lds_direct is not supported on this GPU + +v_mov_b32 v0, lds_direct +// GFX10: encoding: [0xfe,0x02,0x00,0x7e] +// GFX11-ERR: error: lds_direct is not supported on this GPU + +v_mov_b32 v0, src_lds_direct +// GFX10: encoding: [0xfe,0x02,0x00,0x7e] +// GFX11-ERR: error: lds_direct is not supported on this GPU + +//---------------------------------------------------------------------------// +// POPS_EXITING_WAVE_ID +//---------------------------------------------------------------------------// + +s_add_i32 s0, src_pops_exiting_wave_id, s1 +// GFX10: encoding: [0xef,0x01,0x00,0x81] +// GFX11-ERR: error: register not available on this GPU + +s_add_i32 s0, s1, src_pops_exiting_wave_id +// GFX10: encoding: [0x01,0xef,0x00,0x81] +// GFX11-ERR: error: register not available on this GPU + +s_add_i32 s0, pops_exiting_wave_id, s1 +// GFX10: encoding: [0xef,0x01,0x00,0x81] +// GFX11-ERR: error: register not available on this GPU + +s_add_i32 s0, s1, pops_exiting_wave_id +// GFX10: encoding: [0x01,0xef,0x00,0x81] +// GFX11-ERR: error: register not available on this GPU + +v_add_co_u32 v0, s0, pops_exiting_wave_id, v1 +// GFX10: encoding: [0x00,0x00,0x0f,0xd7,0xef,0x02,0x02,0x00] +// GFX11-ERR: error: register not available on this GPU + +v_add_co_u32 v0, s0, src_pops_exiting_wave_id, v1 +// GFX10: encoding: [0x00,0x00,0x0f,0xd7,0xef,0x02,0x02,0x00] +// GFX11-ERR: error: register not available on this GPU + +v_add_co_u32 v0, s0, v1, pops_exiting_wave_id +// GFX10: encoding: [0x00,0x00,0x0f,0xd7,0x01,0xdf,0x01,0x00] +// GFX11-ERR: error: register not available on this GPU + +v_add_co_u32 v0, s0, v1, src_pops_exiting_wave_id +// GFX10: encoding: [0x00,0x00,0x0f,0xd7,0x01,0xdf,0x01,0x00] +// GFX11-ERR: error: register not available on this GPU + diff --git 
a/llvm/test/MC/AMDGPU/hsa-gfx11-v3.s b/llvm/test/MC/AMDGPU/hsa-gfx11-v3.s new file mode 100644 --- /dev/null +++ b/llvm/test/MC/AMDGPU/hsa-gfx11-v3.s @@ -0,0 +1,213 @@ +// RUN: llvm-mc -triple amdgcn-amd-amdhsa -mcpu=gfx1100 --amdhsa-code-object-version=3 < %s | FileCheck --check-prefix=ASM %s +// RUN: llvm-mc -triple amdgcn-amd-amdhsa -mcpu=gfx1100 --amdhsa-code-object-version=3 -filetype=obj < %s > %t +// RUN: llvm-readelf -S -r -s %t | FileCheck --check-prefix=READOBJ %s +// RUN: llvm-objdump -s -j .rodata %t | FileCheck --check-prefix=OBJDUMP %s + +// READOBJ: Section Headers +// READOBJ: .text PROGBITS {{[0-9a-f]+}} {{[0-9a-f]+}} {{[0-9a-f]+}} {{[0-9]+}} AX {{[0-9]+}} {{[0-9]+}} 256 +// READOBJ: .rodata PROGBITS {{[0-9a-f]+}} {{[0-9a-f]+}} 0000c0 {{[0-9]+}} A {{[0-9]+}} {{[0-9]+}} 64 + +// READOBJ: Relocation section '.rela.rodata' at offset +// READOBJ: 0000000000000010 {{[0-9a-f]+}}00000005 R_AMDGPU_REL64 0000000000000000 .text + 10 +// READOBJ: 0000000000000050 {{[0-9a-f]+}}00000005 R_AMDGPU_REL64 0000000000000000 .text + 110 +// READOBJ: 0000000000000090 {{[0-9a-f]+}}00000005 R_AMDGPU_REL64 0000000000000000 .text + 210 + +// READOBJ: Symbol table '.symtab' contains {{[0-9]+}} entries: +// READOBJ: 0000000000000000 0 FUNC LOCAL PROTECTED 2 minimal +// READOBJ-NEXT: 0000000000000100 0 FUNC LOCAL PROTECTED 2 complete +// READOBJ-NEXT: 0000000000000200 0 FUNC LOCAL PROTECTED 2 special_sgpr +// READOBJ-NEXT: 0000000000000000 64 OBJECT LOCAL DEFAULT 3 minimal.kd +// READOBJ-NEXT: 0000000000000040 64 OBJECT LOCAL DEFAULT 3 complete.kd +// READOBJ-NEXT: 0000000000000080 64 OBJECT LOCAL DEFAULT 3 special_sgpr.kd + +// OBJDUMP: Contents of section .rodata +// Note, relocation for KERNEL_CODE_ENTRY_BYTE_OFFSET is not resolved here. 
+// minimal +// OBJDUMP-NEXT: 0000 00000000 00000000 00000000 00000000 +// OBJDUMP-NEXT: 0010 00000000 00000000 00000000 00000000 +// OBJDUMP-NEXT: 0020 00000000 00000000 00000000 00000000 +// OBJDUMP-NEXT: 0030 0000ac60 80000000 00000000 00000000 +// complete +// OBJDUMP-NEXT: 0040 01000000 01000000 08000000 00000000 +// OBJDUMP-NEXT: 0050 00000000 00000000 00000000 00000000 +// OBJDUMP-NEXT: 0060 00000000 00000000 00000000 00000000 +// OBJDUMP-NEXT: 0070 015001e4 130f007f 5e040000 00000000 +// special_sgpr +// OBJDUMP-NEXT: 0080 00000000 00000000 00000000 00000000 +// OBJDUMP-NEXT: 0090 00000000 00000000 00000000 00000000 +// OBJDUMP-NEXT: 00a0 00000000 00000000 00000000 00000000 +// OBJDUMP-NEXT: 00b0 00000060 80000000 00000000 00000000 + +.text +// ASM: .text + +.amdgcn_target "amdgcn-amd-amdhsa--gfx1100" +// ASM: .amdgcn_target "amdgcn-amd-amdhsa--gfx1100" + +.p2align 8 +.type minimal,@function +minimal: + s_endpgm + +.p2align 8 +.type complete,@function +complete: + s_endpgm + +.p2align 8 +.type special_sgpr,@function +special_sgpr: + s_endpgm + +.rodata +// ASM: .rodata + +// Test that only specifying required directives is allowed, and that defaulted +// values are omitted. +.p2align 6 +.amdhsa_kernel minimal + .amdhsa_next_free_vgpr 0 + .amdhsa_next_free_sgpr 0 +.end_amdhsa_kernel + +// ASM: .amdhsa_kernel minimal +// ASM: .amdhsa_next_free_vgpr 0 +// ASM-NEXT: .amdhsa_next_free_sgpr 0 +// ASM: .end_amdhsa_kernel + +// Test that we can specify all available directives with non-default values. 
+.p2align 6 +.amdhsa_kernel complete + .amdhsa_group_segment_fixed_size 1 + .amdhsa_private_segment_fixed_size 1 + .amdhsa_kernarg_size 8 + .amdhsa_user_sgpr_dispatch_ptr 1 + .amdhsa_user_sgpr_queue_ptr 1 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_user_sgpr_dispatch_id 1 + .amdhsa_user_sgpr_private_segment_size 1 + .amdhsa_wavefront_size32 1 + .amdhsa_enable_private_segment 1 + .amdhsa_system_sgpr_workgroup_id_x 0 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_sgpr_workgroup_id_z 1 + .amdhsa_system_sgpr_workgroup_info 1 + .amdhsa_system_vgpr_workitem_id 1 + .amdhsa_next_free_vgpr 9 + .amdhsa_next_free_sgpr 27 + .amdhsa_reserve_vcc 0 + .amdhsa_float_round_mode_32 1 + .amdhsa_float_round_mode_16_64 1 + .amdhsa_float_denorm_mode_32 1 + .amdhsa_float_denorm_mode_16_64 0 + .amdhsa_dx10_clamp 0 + .amdhsa_ieee_mode 0 + .amdhsa_fp16_overflow 1 + .amdhsa_workgroup_processor_mode 1 + .amdhsa_memory_ordered 1 + .amdhsa_forward_progress 1 + .amdhsa_exception_fp_ieee_invalid_op 1 + .amdhsa_exception_fp_denorm_src 1 + .amdhsa_exception_fp_ieee_div_zero 1 + .amdhsa_exception_fp_ieee_overflow 1 + .amdhsa_exception_fp_ieee_underflow 1 + .amdhsa_exception_fp_ieee_inexact 1 + .amdhsa_exception_int_div_zero 1 +.end_amdhsa_kernel + +// ASM: .amdhsa_kernel complete +// ASM-NEXT: .amdhsa_group_segment_fixed_size 1 +// ASM-NEXT: .amdhsa_private_segment_fixed_size 1 +// ASM-NEXT: .amdhsa_kernarg_size 8 +// ASM-NEXT: .amdhsa_user_sgpr_count 9 +// ASM-NEXT: .amdhsa_user_sgpr_dispatch_ptr 1 +// ASM-NEXT: .amdhsa_user_sgpr_queue_ptr 1 +// ASM-NEXT: .amdhsa_user_sgpr_kernarg_segment_ptr 1 +// ASM-NEXT: .amdhsa_user_sgpr_dispatch_id 1 +// ASM-NEXT: .amdhsa_user_sgpr_private_segment_size 1 +// ASM-NEXT: .amdhsa_wavefront_size32 1 +// ASM-NEXT: .amdhsa_enable_private_segment 1 +// ASM-NEXT: .amdhsa_system_sgpr_workgroup_id_x 0 +// ASM-NEXT: .amdhsa_system_sgpr_workgroup_id_y 1 +// ASM-NEXT: .amdhsa_system_sgpr_workgroup_id_z 1 +// ASM-NEXT: .amdhsa_system_sgpr_workgroup_info 1 
+// ASM-NEXT: .amdhsa_system_vgpr_workitem_id 1 +// ASM-NEXT: .amdhsa_next_free_vgpr 9 +// ASM-NEXT: .amdhsa_next_free_sgpr 27 +// ASM-NEXT: .amdhsa_reserve_vcc 0 +// ASM-NEXT: .amdhsa_float_round_mode_32 1 +// ASM-NEXT: .amdhsa_float_round_mode_16_64 1 +// ASM-NEXT: .amdhsa_float_denorm_mode_32 1 +// ASM-NEXT: .amdhsa_float_denorm_mode_16_64 0 +// ASM-NEXT: .amdhsa_dx10_clamp 0 +// ASM-NEXT: .amdhsa_ieee_mode 0 +// ASM-NEXT: .amdhsa_fp16_overflow 1 +// ASM-NEXT: .amdhsa_workgroup_processor_mode 1 +// ASM-NEXT: .amdhsa_memory_ordered 1 +// ASM-NEXT: .amdhsa_forward_progress 1 +// ASM-NEXT: .amdhsa_shared_vgpr_count 0 +// ASM-NEXT: .amdhsa_exception_fp_ieee_invalid_op 1 +// ASM-NEXT: .amdhsa_exception_fp_denorm_src 1 +// ASM-NEXT: .amdhsa_exception_fp_ieee_div_zero 1 +// ASM-NEXT: .amdhsa_exception_fp_ieee_overflow 1 +// ASM-NEXT: .amdhsa_exception_fp_ieee_underflow 1 +// ASM-NEXT: .amdhsa_exception_fp_ieee_inexact 1 +// ASM-NEXT: .amdhsa_exception_int_div_zero 1 +// ASM-NEXT: .end_amdhsa_kernel + +// Test that we are including special SGPR usage in the granulated count. +.p2align 6 +.amdhsa_kernel special_sgpr + // Same next_free_sgpr as "complete", but... 
+ .amdhsa_next_free_sgpr 27 + // ...on GFX10+ this should require an additional 6 SGPRs, pushing us from + // 3 granules to 4 + + .amdhsa_reserve_vcc 0 + + .amdhsa_float_denorm_mode_16_64 0 + .amdhsa_dx10_clamp 0 + .amdhsa_ieee_mode 0 + .amdhsa_next_free_vgpr 0 +.end_amdhsa_kernel + +// ASM: .amdhsa_kernel special_sgpr +// ASM: .amdhsa_next_free_vgpr 0 +// ASM-NEXT: .amdhsa_next_free_sgpr 27 +// ASM-NEXT: .amdhsa_reserve_vcc 0 +// ASM: .amdhsa_float_denorm_mode_16_64 0 +// ASM-NEXT: .amdhsa_dx10_clamp 0 +// ASM-NEXT: .amdhsa_ieee_mode 0 +// ASM: .end_amdhsa_kernel + +.section .foo + +.byte .amdgcn.gfx_generation_number +// ASM: .byte 11 + +.byte .amdgcn.next_free_vgpr +// ASM: .byte 0 +.byte .amdgcn.next_free_sgpr +// ASM: .byte 0 + +v_mov_b32_e32 v7, s10 + +.byte .amdgcn.next_free_vgpr +// ASM: .byte 8 +.byte .amdgcn.next_free_sgpr +// ASM: .byte 11 + +.set .amdgcn.next_free_vgpr, 0 +.set .amdgcn.next_free_sgpr, 0 + +.byte .amdgcn.next_free_vgpr +// ASM: .byte 0 +.byte .amdgcn.next_free_sgpr +// ASM: .byte 0 + +v_mov_b32_e32 v16, s3 + +.byte .amdgcn.next_free_vgpr +// ASM: .byte 17 +.byte .amdgcn.next_free_sgpr +// ASM: .byte 4 diff --git a/llvm/test/MC/Disassembler/AMDGPU/decode-err.txt b/llvm/test/MC/Disassembler/AMDGPU/decode-err.txt --- a/llvm/test/MC/Disassembler/AMDGPU/decode-err.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/decode-err.txt @@ -1,4 +1,9 @@ # RUN: llvm-mc -arch=amdgcn -mcpu=gfx900 -disassemble -show-encoding < %s 2>&1 | FileCheck -check-prefix=GCN %s +# RUN: llvm-mc -arch=amdgcn -mcpu=gfx1100 -disassemble -show-encoding < %s 2>&1 | FileCheck -check-prefix=GFX11 %s # GCN: warning: invalid instruction encoding 0xdf,0x00,0x00,0x02 + +# this is buffer_atomic_csub_u32 v5, off, s[8:11], s3 offset:4095. Invalid without glc +# GFX11: warning: invalid instruction encoding +0xff,0x0f,0xdc,0xe0,0x00,0x05,0x02,0x03