Index: llvm/trunk/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp =================================================================== --- llvm/trunk/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp +++ llvm/trunk/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp @@ -686,6 +686,9 @@ case AMDGPU::XNACK_MASK_HI: llvm_unreachable("xnack_mask registers should not be used"); + case AMDGPU::LDS_DIRECT: + llvm_unreachable("lds_direct register should not be used"); + case AMDGPU::TBA: case AMDGPU::TBA_LO: case AMDGPU::TBA_HI: Index: llvm/trunk/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp =================================================================== --- llvm/trunk/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ llvm/trunk/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -1095,6 +1095,7 @@ bool validateMIMGGatherDMask(const MCInst &Inst); bool validateMIMGDataSize(const MCInst &Inst); bool validateMIMGD16(const MCInst &Inst); + bool validateLdsDirect(const MCInst &Inst); bool usesConstantBus(const MCInst &Inst, unsigned OpIdx); bool isInlineConstant(const MCInst &Inst, unsigned OpIdx) const; unsigned findImplicitSGPRReadInVOP(const MCInst &Inst) const; @@ -1599,6 +1600,8 @@ .Case("vcc", AMDGPU::VCC) .Case("flat_scratch", AMDGPU::FLAT_SCR) .Case("xnack_mask", AMDGPU::XNACK_MASK) + .Case("lds_direct", AMDGPU::LDS_DIRECT) + .Case("src_lds_direct", AMDGPU::LDS_DIRECT) .Case("m0", AMDGPU::M0) .Case("scc", AMDGPU::SCC) .Case("tba", AMDGPU::TBA) @@ -2465,6 +2468,86 @@ return true; } +bool AMDGPUAsmParser::validateLdsDirect(const MCInst &Inst) { + + using namespace SIInstrFlags; + const unsigned Opcode = Inst.getOpcode(); + const MCInstrDesc &Desc = MII.get(Opcode); + + // lds_direct register is defined so that it can be used + // with 9-bit operands only. Ignore encodings which do not accept these. 
+ if ((Desc.TSFlags & (VOP1 | VOP2 | VOP3 | VOPC | VOP3P | SIInstrFlags::SDWA)) == 0) + return true; + + const int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0); + const int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1); + const int Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2); + + const int SrcIndices[] = { Src1Idx, Src2Idx }; + + // lds_direct cannot be specified as either src1 or src2. + for (int SrcIdx : SrcIndices) { + if (SrcIdx == -1) break; + const MCOperand &Src = Inst.getOperand(SrcIdx); + if (Src.isReg() && Src.getReg() == LDS_DIRECT) { + return false; + } + } + + if (Src0Idx == -1) + return true; + + const MCOperand &Src = Inst.getOperand(Src0Idx); + if (!Src.isReg() || Src.getReg() != LDS_DIRECT) + return true; + + // lds_direct is specified as src0. Check additional limitations. + + // FIXME: This is a workaround for bug 37943 + // which allows 64-bit VOP3 opcodes to use 32-bit operands. + if (AMDGPU::getRegOperandSize(getMRI(), Desc, Src0Idx) != 4) + return false; + + // Documentation does not disable lds_direct for SDWA, but SP3 assembler does. + // FIXME: This inconsistency needs to be investigated further. + if (Desc.TSFlags & SIInstrFlags::SDWA) + return false; + + // The following opcodes do not accept lds_direct which is explicitly stated + // in AMD documentation. However SP3 disables lds_direct for most other 'rev' + // opcodes as well (e.g. for v_subrev_u32 but not for v_subrev_f32). + // FIXME: This inconsistency needs to be investigated further. 
+ switch (Opcode) { + case AMDGPU::V_LSHLREV_B32_e32_si: + case AMDGPU::V_LSHLREV_B32_e64_si: + case AMDGPU::V_LSHLREV_B16_e32_vi: + case AMDGPU::V_LSHLREV_B16_e64_vi: + case AMDGPU::V_LSHLREV_B32_e32_vi: + case AMDGPU::V_LSHLREV_B32_e64_vi: + case AMDGPU::V_LSHLREV_B64_vi: + case AMDGPU::V_LSHRREV_B32_e32_si: + case AMDGPU::V_LSHRREV_B32_e64_si: + case AMDGPU::V_LSHRREV_B16_e32_vi: + case AMDGPU::V_LSHRREV_B16_e64_vi: + case AMDGPU::V_LSHRREV_B32_e32_vi: + case AMDGPU::V_LSHRREV_B32_e64_vi: + case AMDGPU::V_LSHRREV_B64_vi: + case AMDGPU::V_ASHRREV_I32_e64_si: + case AMDGPU::V_ASHRREV_I32_e32_si: + case AMDGPU::V_ASHRREV_I16_e32_vi: + case AMDGPU::V_ASHRREV_I16_e64_vi: + case AMDGPU::V_ASHRREV_I32_e32_vi: + case AMDGPU::V_ASHRREV_I32_e64_vi: + case AMDGPU::V_ASHRREV_I64_vi: + case AMDGPU::V_PK_LSHLREV_B16_vi: + case AMDGPU::V_PK_LSHRREV_B16_vi: + case AMDGPU::V_PK_ASHRREV_I16_vi: + return false; + default: + return true; + } +} + bool AMDGPUAsmParser::validateSOPLiteral(const MCInst &Inst) const { unsigned Opcode = Inst.getOpcode(); const MCInstrDesc &Desc = MII.get(Opcode); @@ -2500,6 +2583,11 @@ bool AMDGPUAsmParser::validateInstruction(const MCInst &Inst, const SMLoc &IDLoc) { + if (!validateLdsDirect(Inst)) { + Error(IDLoc, + "invalid use of lds_direct"); + return false; + } if (!validateSOPLiteral(Inst)) { Error(IDLoc, "only one literal operand is allowed"); Index: llvm/trunk/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp =================================================================== --- llvm/trunk/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp +++ llvm/trunk/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp @@ -781,6 +781,7 @@ // ToDo: no support for execz register case 252: break; case 253: return createRegOperand(SCC); + case 254: return createRegOperand(LDS_DIRECT); default: break; } return errOperand(Val, "unknown operand encoding " + Twine(Val)); Index: llvm/trunk/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp 
=================================================================== --- llvm/trunk/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp +++ llvm/trunk/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp @@ -268,6 +268,9 @@ case AMDGPU::XNACK_MASK: O << "xnack_mask"; return; + case AMDGPU::LDS_DIRECT: + O << "src_lds_direct"; + return; case AMDGPU::VCC_LO: O << "vcc_lo"; return; Index: llvm/trunk/lib/Target/AMDGPU/SIRegisterInfo.cpp =================================================================== --- llvm/trunk/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ llvm/trunk/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -163,6 +163,9 @@ // Reserve xnack_mask registers - support is not implemented in Codegen. reserveRegisterTuples(Reserved, AMDGPU::XNACK_MASK); + // Reserve lds_direct register - support is not implemented in Codegen. + reserveRegisterTuples(Reserved, AMDGPU::LDS_DIRECT); + // Reserve Trap Handler registers - support is not implemented in Codegen. reserveRegisterTuples(Reserved, AMDGPU::TBA); reserveRegisterTuples(Reserved, AMDGPU::TMA); Index: llvm/trunk/lib/Target/AMDGPU/SIRegisterInfo.td =================================================================== --- llvm/trunk/lib/Target/AMDGPU/SIRegisterInfo.td +++ llvm/trunk/lib/Target/AMDGPU/SIRegisterInfo.td @@ -75,6 +75,8 @@ def SRC_PRIVATE_BASE : SIReg<"src_private_base", 237>; def SRC_PRIVATE_LIMIT : SIReg<"src_private_limit", 238>; +def LDS_DIRECT : SIReg <"lds_direct", 254>; + def XNACK_MASK_LO : SIReg<"xnack_mask_lo", 104>; def XNACK_MASK_HI : SIReg<"xnack_mask_hi", 105>; @@ -409,6 +411,12 @@ let CopyCost = -1; } +def LDS_DIRECT_CLASS : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32, + (add LDS_DIRECT)> { + let isAllocatable = 0; + let CopyCost = -1; +} + // Subset of SReg_32 without M0 for SMRD instructions and alike. // See comments in SIInstructions.td for more info. 
def SReg_32_XM0_XEXEC : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32, @@ -545,7 +553,7 @@ } def VS_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32, - (add VGPR_32, SReg_32)> { + (add VGPR_32, SReg_32, LDS_DIRECT_CLASS)> { let isAllocatable = 0; } Index: llvm/trunk/test/MC/AMDGPU/lds_direct-err.s =================================================================== --- llvm/trunk/test/MC/AMDGPU/lds_direct-err.s +++ llvm/trunk/test/MC/AMDGPU/lds_direct-err.s @@ -0,0 +1,59 @@ +// RUN: not llvm-mc -arch=amdgcn -mcpu=gfx900 %s 2>&1 | FileCheck %s --check-prefix=NOGFX9 + +//---------------------------------------------------------------------------// +// lds_direct may be used only with vector ALU instructions +//---------------------------------------------------------------------------// + +s_and_b32 s2, lds_direct, s1 +// NOGFX9: error + +//---------------------------------------------------------------------------// +// lds_direct may not be used with V_{LSHL,LSHR,ASHR}REV opcodes +//---------------------------------------------------------------------------// + +v_ashrrev_i16 v0, lds_direct, v0 +// NOGFX9: error + +v_ashrrev_i32 v0, lds_direct, v0 +// NOGFX9: error + +v_lshlrev_b16 v0, lds_direct, v0 +// NOGFX9: error + +v_lshlrev_b32 v0, lds_direct, v0 +// NOGFX9: error + +v_lshrrev_b16 v0, lds_direct, v0 +// NOGFX9: error + +v_lshrrev_b32 v0, lds_direct, v0 +// NOGFX9: error + +v_pk_ashrrev_i16 v0, lds_direct, v0 +// NOGFX9: error + +v_pk_lshlrev_b16 v0, lds_direct, v0 +// NOGFX9: error + +v_pk_lshrrev_b16 v0, lds_direct, v0 +// NOGFX9: error + +//---------------------------------------------------------------------------// +// lds_direct cannot be used with 64-bit and larger operands +//---------------------------------------------------------------------------// + +v_add_f64 v[0:1], lds_direct, v[0:1] +// NOGFX9: error + +//---------------------------------------------------------------------------// +// Only SRC0 may specify 
lds_direct +//---------------------------------------------------------------------------// + +v_add_i32 v0, v0, lds_direct +// NOGFX9: error + +v_add_i32 lds_direct, v0, v0 +// NOGFX9: error + +v_fma_f32 v0, v0, v0, lds_direct +// NOGFX9: error Index: llvm/trunk/test/MC/AMDGPU/lds_direct.s =================================================================== --- llvm/trunk/test/MC/AMDGPU/lds_direct.s +++ llvm/trunk/test/MC/AMDGPU/lds_direct.s @@ -0,0 +1,116 @@ +// RUN: llvm-mc -arch=amdgcn -mcpu=gfx900 -show-encoding %s | FileCheck %s --check-prefix=GFX9 + +//---------------------------------------------------------------------------// +// VOP1/3 +//---------------------------------------------------------------------------// + +v_mov_b32 v0, src_lds_direct +// GFX9: v_mov_b32_e32 v0, src_lds_direct ; encoding: [0xfe,0x02,0x00,0x7e] + +v_mov_b32_e64 v0, src_lds_direct +// GFX9: v_mov_b32_e64 v0, src_lds_direct ; encoding: [0x00,0x00,0x41,0xd1,0xfe,0x00,0x00,0x00] + +v_cvt_f64_i32 v[0:1], src_lds_direct +// GFX9: v_cvt_f64_i32_e32 v[0:1], src_lds_direct ; encoding: [0xfe,0x08,0x00,0x7e] + +v_cvt_f64_i32_e64 v[0:1], src_lds_direct +// GFX9: v_cvt_f64_i32_e64 v[0:1], src_lds_direct ; encoding: [0x00,0x00,0x44,0xd1,0xfe,0x00,0x00,0x00] + +v_mov_fed_b32 v0, src_lds_direct +// GFX9: v_mov_fed_b32_e32 v0, src_lds_direct ; encoding: [0xfe,0x12,0x00,0x7e] + +v_mov_fed_b32_e64 v0, src_lds_direct +// GFX9: v_mov_fed_b32_e64 v0, src_lds_direct ; encoding: [0x00,0x00,0x49,0xd1,0xfe,0x00,0x00,0x00] + +v_fract_f32 v0, src_lds_direct +// GFX9: v_fract_f32_e32 v0, src_lds_direct ; encoding: [0xfe,0x36,0x00,0x7e] + +v_fract_f32_e64 v0, src_lds_direct +// GFX9: v_fract_f32_e64 v0, src_lds_direct ; encoding: [0x00,0x00,0x5b,0xd1,0xfe,0x00,0x00,0x00] + +v_cvt_f16_u16 v0, src_lds_direct +// GFX9: v_cvt_f16_u16_e32 v0, src_lds_direct ; encoding: [0xfe,0x72,0x00,0x7e] + +//---------------------------------------------------------------------------// +// VOP2/3 
+//---------------------------------------------------------------------------// + +v_cndmask_b32 v0, src_lds_direct, v0, vcc +// GFX9: v_cndmask_b32_e32 v0, src_lds_direct, v0, vcc ; encoding: [0xfe,0x00,0x00,0x00] + +v_cndmask_b32_e64 v0, src_lds_direct, v0, s[0:1] +// GFX9: v_cndmask_b32_e64 v0, src_lds_direct, v0, s[0:1] ; encoding: [0x00,0x00,0x00,0xd1,0xfe,0x00,0x02,0x00] + +v_add_f32 v0, src_lds_direct, v0 +// GFX9: v_add_f32_e32 v0, src_lds_direct, v0 ; encoding: [0xfe,0x00,0x00,0x02] + +v_add_f32_e64 v0, src_lds_direct, v0 +// GFX9: v_add_f32_e64 v0, src_lds_direct, v0 ; encoding: [0x00,0x00,0x01,0xd1,0xfe,0x00,0x02,0x00] + +v_mul_i32_i24 v0, src_lds_direct, v0 +// GFX9: v_mul_i32_i24_e32 v0, src_lds_direct, v0 ; encoding: [0xfe,0x00,0x00,0x0c] + +v_add_co_u32 v0, vcc, src_lds_direct, v0 +// GFX9: v_add_co_u32_e32 v0, vcc, src_lds_direct, v0 ; encoding: [0xfe,0x00,0x00,0x32] + +//---------------------------------------------------------------------------// +// VOP3 +//---------------------------------------------------------------------------// + +v_add_co_u32_e64 v0, s[0:1], src_lds_direct, v0 +// GFX9: v_add_co_u32_e64 v0, s[0:1], src_lds_direct, v0 ; encoding: [0x00,0x00,0x19,0xd1,0xfe,0x00,0x02,0x00] + +v_madmk_f16 v0, src_lds_direct, 0x1121, v0 +// GFX9: v_madmk_f16 v0, src_lds_direct, 0x1121, v0 ; encoding: [0xfe,0x00,0x00,0x48,0x21,0x11,0x00,0x00] + +v_madak_f16 v0, src_lds_direct, v0, 0x1121 +// GFX9: v_madak_f16 v0, src_lds_direct, v0, 0x1121 ; encoding: [0xfe,0x00,0x00,0x4a,0x21,0x11,0x00,0x00] + +v_mad_f32 v0, src_lds_direct, v0, v0 +// GFX9: v_mad_f32 v0, src_lds_direct, v0, v0 ; encoding: [0x00,0x00,0xc1,0xd1,0xfe,0x00,0x02,0x04] + +v_fma_f32 v0, src_lds_direct, v0, v0 +// GFX9: v_fma_f32 v0, src_lds_direct, v0, v0 ; encoding: [0x00,0x00,0xcb,0xd1,0xfe,0x00,0x02,0x04] + +v_min3_i16 v0, src_lds_direct, v0, v0 +// GFX9: v_min3_i16 v0, src_lds_direct, v0, v0 ; encoding: [0x00,0x00,0xf5,0xd1,0xfe,0x00,0x02,0x04] + +v_max3_f16 v0, src_lds_direct, 
v0, v0 +// GFX9: v_max3_f16 v0, src_lds_direct, v0, v0 ; encoding: [0x00,0x00,0xf7,0xd1,0xfe,0x00,0x02,0x04] + +//---------------------------------------------------------------------------// +// VOP3P +//---------------------------------------------------------------------------// + +v_pk_mad_i16 v0, src_lds_direct, v0, v0 +// GFX9: v_pk_mad_i16 v0, src_lds_direct, v0, v0 ; encoding: [0x00,0x40,0x80,0xd3,0xfe,0x00,0x02,0x1c] + +v_pk_add_i16 v0, src_lds_direct, v0 +// GFX9: v_pk_add_i16 v0, src_lds_direct, v0 ; encoding: [0x00,0x00,0x82,0xd3,0xfe,0x00,0x02,0x18] + +//---------------------------------------------------------------------------// +// VOPC +//---------------------------------------------------------------------------// + +v_cmp_lt_f16 vcc, src_lds_direct, v0 +// GFX9: v_cmp_lt_f16_e32 vcc, src_lds_direct, v0 ; encoding: [0xfe,0x00,0x42,0x7c] + +v_cmp_eq_f32 vcc, src_lds_direct, v0 +// GFX9: v_cmp_eq_f32_e32 vcc, src_lds_direct, v0 ; encoding: [0xfe,0x00,0x84,0x7c] + +v_cmpx_neq_f32 vcc, src_lds_direct, v0 +// GFX9: v_cmpx_neq_f32_e32 vcc, src_lds_direct, v0 ; encoding: [0xfe,0x00,0xba,0x7c] + +//---------------------------------------------------------------------------// +// lds_direct alias +//---------------------------------------------------------------------------// + +v_cmp_lt_f16 vcc, lds_direct, v0 +// GFX9: v_cmp_lt_f16_e32 vcc, src_lds_direct, v0 ; encoding: [0xfe,0x00,0x42,0x7c] + +//---------------------------------------------------------------------------// +// FIXME: enable lds_direct for the following opcodes and add tests +//---------------------------------------------------------------------------// + +//v_readfirstlane_b32 s0, src_lds_direct +//v_readlane_b32 s0, src_lds_direct, s0 Index: llvm/trunk/test/MC/Disassembler/AMDGPU/lds_direct_gfx9.txt =================================================================== --- llvm/trunk/test/MC/Disassembler/AMDGPU/lds_direct_gfx9.txt +++ 
llvm/trunk/test/MC/Disassembler/AMDGPU/lds_direct_gfx9.txt @@ -0,0 +1,19 @@ +# RUN: llvm-mc -arch=amdgcn -mcpu=gfx900 -disassemble -show-encoding < %s | FileCheck %s --check-prefix=GFX9 + +# GFX9: v_mov_b32_e32 v0, src_lds_direct ; encoding: [0xfe,0x02,0x00,0x7e] +0xfe,0x02,0x00,0x7e + +# GFX9: v_mov_b32_e64 v0, src_lds_direct ; encoding: [0x00,0x00,0x41,0xd1,0xfe,0x00,0x00,0x00] +0x00,0x00,0x41,0xd1,0xfe,0x00,0x00,0x00 + +# GFX9: v_add_f32_e32 v0, src_lds_direct, v0 ; encoding: [0xfe,0x00,0x00,0x02] +0xfe,0x00,0x00,0x02 + +# GFX9: v_pk_mad_i16 v0, src_lds_direct, v0, v0 ; encoding: [0x00,0x40,0x80,0xd3,0xfe,0x00,0x02,0x1c] +0x00,0x40,0x80,0xd3,0xfe,0x00,0x02,0x1c + +# GFX9: v_pk_mul_lo_u16 v0, src_lds_direct, v0 ; encoding: [0x00,0x00,0x81,0xd3,0xfe,0x00,0x02,0x18] +0x00,0x00,0x81,0xd3,0xfe,0x00,0x02,0x18 + +# GFX9: v_cmpx_le_i32_e32 vcc, src_lds_direct, v0 ; encoding: [0xfe,0x00,0xa6,0x7d] +0xfe,0x00,0xa6,0x7d