This is an archive of the discontinued LLVM Phabricator instance.

[AMDGPU] Set more flags on Real instructions
ClosedPublic

Authored by foad on Jun 15 2021, 5:21 AM.

Details

Summary

This does not affect codegen, which only tests these flags on Pseudo
instructions, but might help llvm-mca which has to work with Real
instructions. In particular setting LGKM_CNT on DS instructions helps
with the problem identified in D104149.

Change-Id: Ia0fe0ebf26171144443f7580cda6249bf4dfaaa0

Diff Detail

Event Timeline

foad created this revision.Jun 15 2021, 5:21 AM
foad requested review of this revision.Jun 15 2021, 5:21 AM
Herald added a project: Restricted Project. · View Herald TranscriptJun 15 2021, 5:21 AM
rampitec accepted this revision.Jun 15 2021, 10:47 AM
This revision is now accepted and ready to land.Jun 15 2021, 10:47 AM

You may be completely on top of it already, but here is the function that I would like to be able to more or less copy within the AMDGPUCustomBehaviour class. (You don't have to read through the function, I summarize all the flags that get checked below.)

void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
                                               WaitcntBrackets *ScoreBrackets) {
  // Now look at the instruction opcode. If it is a memory access
  // instruction, update the upper-bound of the appropriate counter's
  // bracket and the destination operand scores.
  // TODO: Use the (TSFlags & SIInstrFlags::LGKM_CNT) property everywhere.
  if (TII->isDS(Inst) && TII->usesLGKM_CNT(Inst)) {
    if (TII->isAlwaysGDS(Inst.getOpcode()) ||
        TII->hasModifiersSet(Inst, AMDGPU::OpName::gds)) {
      ScoreBrackets->updateByEvent(TII, TRI, MRI, GDS_ACCESS, Inst);
      ScoreBrackets->updateByEvent(TII, TRI, MRI, GDS_GPR_LOCK, Inst);
    } else {
      ScoreBrackets->updateByEvent(TII, TRI, MRI, LDS_ACCESS, Inst);
    }
  } else if (TII->isFLAT(Inst)) {
    assert(Inst.mayLoadOrStore());

    int FlatASCount = 0;

    if (mayAccessVMEMThroughFlat(Inst)) {
      ++FlatASCount;
      if (!ST->hasVscnt())
        ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_ACCESS, Inst);
      else if (Inst.mayLoad() && !SIInstrInfo::isAtomicNoRet(Inst))
        ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_READ_ACCESS, Inst);
      else
        ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_WRITE_ACCESS, Inst);
    }

    if (mayAccessLDSThroughFlat(Inst)) {
      ++FlatASCount;
      ScoreBrackets->updateByEvent(TII, TRI, MRI, LDS_ACCESS, Inst);
    }

    // A Flat memory operation must access at least one address space.
    assert(FlatASCount);

    // This is a flat memory operation that access both VMEM and LDS, so note it
    // - it will require that both the VM and LGKM be flushed to zero if it is
    // pending when a VM or LGKM dependency occurs.
    if (FlatASCount > 1)
      ScoreBrackets->setPendingFlat();
  } else if (SIInstrInfo::isVMEM(Inst) &&
             // TODO: get a better carve out.
             Inst.getOpcode() != AMDGPU::BUFFER_WBINVL1 &&
             Inst.getOpcode() != AMDGPU::BUFFER_WBINVL1_SC &&
             Inst.getOpcode() != AMDGPU::BUFFER_WBINVL1_VOL &&
             Inst.getOpcode() != AMDGPU::BUFFER_GL0_INV &&
             Inst.getOpcode() != AMDGPU::BUFFER_GL1_INV) {
    if (!ST->hasVscnt())
      ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_ACCESS, Inst);
    else if ((Inst.mayLoad() && !SIInstrInfo::isAtomicNoRet(Inst)) ||
             /* IMAGE_GET_RESINFO / IMAGE_GET_LOD */
             (TII->isMIMG(Inst) && !Inst.mayLoad() && !Inst.mayStore()))
      ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_READ_ACCESS, Inst);
    else if (Inst.mayStore())
      ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_WRITE_ACCESS, Inst);

    if (ST->vmemWriteNeedsExpWaitcnt() &&
        (Inst.mayStore() || SIInstrInfo::isAtomicRet(Inst))) {
      ScoreBrackets->updateByEvent(TII, TRI, MRI, VMW_GPR_LOCK, Inst);
    }
  } else if (TII->isSMRD(Inst)) {
    ScoreBrackets->updateByEvent(TII, TRI, MRI, SMEM_ACCESS, Inst);
  } else if (Inst.isCall()) {
    if (callWaitsOnFunctionReturn(Inst)) {
      // Act as a wait on everything
      ScoreBrackets->applyWaitcnt(AMDGPU::Waitcnt::allZero(ST->hasVscnt()));
    } else {
      // May need to way wait for anything.
      ScoreBrackets->applyWaitcnt(AMDGPU::Waitcnt());
    }
  } else if (SIInstrInfo::isEXP(Inst)) {
    unsigned Imm = TII->getNamedOperand(Inst, AMDGPU::OpName::tgt)->getImm();
    if (Imm >= AMDGPU::Exp::ET_PARAM0 && Imm <= AMDGPU::Exp::ET_PARAM31)
      ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_PARAM_ACCESS, Inst);
    else if (Imm >= AMDGPU::Exp::ET_POS0 && Imm <= AMDGPU::Exp::ET_POS_LAST)
      ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_POS_ACCESS, Inst);
    else
      ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_GPR_LOCK, Inst);
  } else {
    switch (Inst.getOpcode()) {
    case AMDGPU::S_SENDMSG:
    case AMDGPU::S_SENDMSGHALT:
      ScoreBrackets->updateByEvent(TII, TRI, MRI, SQ_MESSAGE, Inst);
      break;
    case AMDGPU::S_MEMTIME:
    case AMDGPU::S_MEMREALTIME:
      ScoreBrackets->updateByEvent(TII, TRI, MRI, SMEM_ACCESS, Inst);
      break;
    }
  }
}

So the list of flags:

mayLoad
mayStore
MI.getDesc().TSFlags & SIInstrFlags::DS
MI.getDesc().TSFlags & SIInstrFlags::LGKM_CNT;
MI.getDesc().TSFlags & SIInstrFlags::FLAT;
MI.getDesc().TSFlags & SIInstrFlags::IsAtomicNoRet
MI.getDesc().TSFlags & SIInstrFlags::IsAtomicRet
MI.getDesc().TSFlags & SIInstrFlags::MUBUF
MI.getDesc().TSFlags & SIInstrFlags::MTBUF
MI.getDesc().TSFlags & SIInstrFlags::MIMG
MI.getDesc().TSFlags & SIInstrFlags::SMRD;
MI.getDesc().TSFlags & SIInstrFlags::EXP

So basically just the TSFlags (assuming it's easier to copy them all at once rather than just the individual ones needed), mayLoad, and mayStore.

It seems as though the TSFlags were already being copied properly, but I'm fairly certain that the ds_read instruction was failing the following check within CB last week. So there may have been a class of instructions that weren't having their TSFlags copied? (But the copy is happening within InstSI so I'm not sure how that would happen. Maybe the ds_read instruction isn't having the flags set properly from within the pseudo instruction itself?)

if ((MCID.TSFlags & SIInstrFlags::DS) &&
    (MCID.TSFlags & SIInstrFlags::LGKM_CNT)) {

If there's anything you want me to do or test, let me know.

One function that gets used within the logic is:

// This is a flat memory operation. Check to see if it has memory tokens other
// than LDS. Other address spaces supported by flat memory operations involve
// global memory.
bool SIInsertWaitcnts::mayAccessVMEMThroughFlat(const MachineInstr &MI) const {
  assert(TII->isFLAT(MI));

  // All flat instructions use the VMEM counter.
  assert(TII->usesVM_CNT(MI));

  // If there are no memory operands then conservatively assume the flat
  // operation may access VMEM.
  if (MI.memoperands_empty())
    return true;

  // See if any memory operand specifies an address space that involves VMEM.
  // Flat operations only supported FLAT, LOCAL (LDS), or address spaces
  // involving VMEM such as GLOBAL, CONSTANT, PRIVATE (SCRATCH), etc. The REGION
  // (GDS) address space is not supported by flat operations. Therefore, simply
  // return true unless only the LDS address space is found.
  for (const MachineMemOperand *Memop : MI.memoperands()) {
    unsigned AS = Memop->getAddrSpace();
    assert(AS != AMDGPUAS::REGION_ADDRESS);
    if (AS != AMDGPUAS::LOCAL_ADDRESS)
      return true;
  }

  return false;
}

(Actually SIInsertWaitcnts::mayAccessLDSThroughFlat is another function that is also in the same boat as this one.) I'm not positive if I'll be able to recreate this from within llvm-mca because I'm not sure whether the MCInst can be queried for memoperands. (It is a MachineInstr that is being used in this function, not an MCInst.) I don't think there's anything that you can do to correct this though. Maybe the MCInst can be queried for memoperands and if not, maybe I'll have to make a conservative assumption, but this can be something I explore and discuss further when I submit the patch for AMDGPUCustomBehaviour.

If you're curious about this memoperand issue, I just looked into it a bit more. The MCInst class contains a list of MCOperandInfo. This class has a member OperandType which refers to an enum where one of the enum elements is OPERAND_MEMORY. I'm not sure if this is comparable to the MachineInstr::memoperands() function though since that one does things a bit strange:

/// Access to memory operands of the instruction. If there are none, that does
/// not imply anything about whether the function accesses memory. Instead,
/// the caller must behave conservatively.
ArrayRef<MachineMemOperand *> memoperands() const {
  if (!Info)
    return {};

  if (Info.is<EIIK_MMO>())
    return makeArrayRef(Info.getAddrOfZeroTagPointer(), 1);

  if (ExtraInfo *EI = Info.get<EIIK_OutOfLine>())
    return EI->getMMOs();

  return {};
}

Anyways, if this isn't something that you have any familiarity or insight with, then I can bring it back up when I submit the patch for AMDGPUCB.

This revision was automatically updated to reflect the committed changes.
foad added a comment.Jun 16 2021, 2:41 AM

So basically just the TSFlags (assuming it's easier to copy them all at once rather than just the individual ones needed), mayLoad, and mayStore.

Hmm. These are not all being copied correctly at the moment. Unfortunately there is no easy way to check for discrepancies. I am knocking up a python script to do it based on the output of llvm-tblgen -dump-json.

It seems as though the TSFlags were already being copied properly, but I'm fairly certain that the ds_read instruction was failing the following check within CB last week. So there may have been a class of instructions that weren't having their TSFlags copied?

I don't think you can copy the TSFlags all at once. The ds_read problem should be fixed by the current patch because it sets LGKM_CNT on the Real instruction.

One function that gets used within the logic is:

// This is a flat memory operation. Check to see if it has memory tokens other
// than LDS. Other address spaces supported by flat memory operations involve
// global memory.
bool SIInsertWaitcnts::mayAccessVMEMThroughFlat(const MachineInstr &MI) const {
  assert(TII->isFLAT(MI));

  // All flat instructions use the VMEM counter.
  assert(TII->usesVM_CNT(MI));

  // If there are no memory operands then conservatively assume the flat
  // operation may access VMEM.
  if (MI.memoperands_empty())
    return true;

  // See if any memory operand specifies an address space that involves VMEM.
  // Flat operations only supported FLAT, LOCAL (LDS), or address spaces
  // involving VMEM such as GLOBAL, CONSTANT, PRIVATE (SCRATCH), etc. The REGION
  // (GDS) address space is not supported by flat operations. Therefore, simply
  // return true unless only the LDS address space is found.
  for (const MachineMemOperand *Memop : MI.memoperands()) {
    unsigned AS = Memop->getAddrSpace();
    assert(AS != AMDGPUAS::REGION_ADDRESS);
    if (AS != AMDGPUAS::LOCAL_ADDRESS)
      return true;
  }

  return false;
}

(Actually SIInsertWaitcnts::mayAccessLDSThroughFlat is another function that is also in the same boat as this one.) I'm not positive if I'll be able to recreate this from within llvm-mca because I'm not sure whether the MCInst can be queried for memoperands. (It is a MachineInstr that is being used in this function, not an MCInst.) I don't think there's anything that you can do to correct this though. Maybe the MCInst can be queried for memoperands and if not, maybe I'll have to make a conservative assumption, but this can be something I explore and discuss further when I submit the patch for AMDGPUCustomBehaviour.

The code above is examining the pointer being loaded from, to see if the compiler statically knows something about which area of memory it points into. There's no way you can do this in MCA for a single load/store instruction in isolation, so I think you just have to "conservatively assume the flat operation may access VMEM".

foad added a comment.Jun 16 2021, 5:39 AM

So basically just the TSFlags (assuming it's easier to copy them all at once rather than just the individual ones needed), mayLoad, and mayStore.

Hmm. These are not all being copied correctly at the moment. Unfortunately there is no easy way to check for discrepancies. I am knocking up a python script to do it based on the output of llvm-tblgen -dump-json.

I've pushed some patches which should fix all of this: rG323b3e645dd3, rG24ffc343f9da, rG7f3ac6714a56

holland11 added a comment.EditedJun 16 2021, 10:23 AM

I've pushed some patches which should fix all of this: rG323b3e645dd3, rG24ffc343f9da, rG7f3ac6714a56

Awesome, thank you very much for this. I will update the AMDGPUCustomBehaviour implementation and then post it for review again.

I don't think you can copy the TSFlags all at once.

Out of curiosity, why doesn't the DS_Real class have the let TSFlags = ps.TSFlags line that all of the other classes in this patch have (all of the classes that inherit from InstSI anyways: SM_Real, FLAT_Real, MUBUF_Real, and MTBUF_Real).

I'm newer to tablegen and it's a really quirky 'language' so I may be misunderstanding something, but it seems like it is possible to copy all of the TSFlags at once and most classes were already doing. (Many of them were still missing their mayLoad and mayStore flags before this patch though.)

I just did a quick search for all classes that inherit from InstSI (and have a psuedo instruction as one of the input operands) to find which ones don't have the let TSFlags = ps.TSFlags line.

EXPInstructions.td:

class EXPCommon<bit done, string asm = ""> : InstSI<
  (outs),
  (ins exp_tgt:$tgt,
       ExpSrc0:$src0, ExpSrc1:$src1, ExpSrc2:$src2, ExpSrc3:$src3,
       exp_vm:$vm, exp_compr:$compr, i32imm:$en),
  asm> {
  let EXP = 1;
  let EXP_CNT = 1;
  let mayLoad = done;
  let mayStore = 1;
  let UseNamedOperandTable = 1;
  let Uses = [EXEC];
  let SchedRW = [WriteExport];
  let DisableWQM = 1;
}

class EXP_Real<bit done, string pseudo, int subtarget>
  : EXPCommon<done, "exp$tgt $src0, $src1, $src2, $src3"#!if(done, " done", "")
                    #"$compr$vm">,
    SIMCInstr <pseudo, subtarget> {
  let AsmMatchConverter = "cvtExp";
}

SOPInstructions.td:

class SOP1_Real<bits<8> op, SOP1_Pseudo ps, string real_name = ps.Mnemonic> :
  InstSI <ps.OutOperandList, ps.InOperandList,
          real_name # " " # ps.AsmOperands, []>,
  Enc32 {

  let isPseudo = 0;
  let isCodeGenOnly = 0;
  let Size = 4;

  // copy relevant pseudo op flags
  let SubtargetPredicate = ps.SubtargetPredicate;
  let AsmMatchConverter  = ps.AsmMatchConverter;
  let SchedRW            = ps.SchedRW;

  // encoding
  bits<7> sdst;
  bits<8> src0;

  let Inst{7-0} = !if(ps.has_src0, src0, ?);
  let Inst{15-8} = op;
  let Inst{22-16} = !if(ps.has_sdst, sdst, ?);
  let Inst{31-23} = 0x17d; //encoding;
}
class SOPK_Real<bits<5> op, SOPK_Pseudo ps> :
  InstSI <ps.OutOperandList, ps.InOperandList,
          ps.Mnemonic # " " # ps.AsmOperands, []> {
  let isPseudo = 0;
  let isCodeGenOnly = 0;

  // copy relevant pseudo op flags
  let SubtargetPredicate = ps.SubtargetPredicate;
  let AsmMatchConverter  = ps.AsmMatchConverter;
  let DisableEncoding    = ps.DisableEncoding;
  let Constraints        = ps.Constraints;
  let SchedRW            = ps.SchedRW;

  // encoding
  bits<7>  sdst;
  bits<16> simm16;
  bits<32> imm;
}

DSInstructions.td:

class DS_Real <DS_Pseudo ds> :
  InstSI <ds.OutOperandList, ds.InOperandList, ds.Mnemonic # ds.AsmOperands, []>,
  Enc64 {

  let isPseudo = 0;
  let isCodeGenOnly = 0;
  let DS = 1;
  let UseNamedOperandTable = 1;

  // copy relevant pseudo op flags
  let SubtargetPredicate = ds.SubtargetPredicate;
  let OtherPredicates    = ds.OtherPredicates;
  let AsmMatchConverter  = ds.AsmMatchConverter;
  let SchedRW            = ds.SchedRW;

  // encoding fields
  bits<10> vdst;
  bits<1> gds;
  bits<8> addr;
  bits<10> data0;
  bits<10> data1;
  bits<8> offset0;
  bits<8> offset1;

  bits<16> offset;
  let offset0 = !if(ds.has_offset, offset{7-0}, ?);
  let offset1 = !if(ds.has_offset, offset{15-8}, ?);

  bits<1> acc = !if(ds.has_vdst, vdst{9},
                    !if(!or(ds.has_data0, ds.has_gws_data0), data0{9}, 0));
}

The majority of classes that inherit from InstSI and have a Psuedo instruction as an input operand do have let TSFlags = ps.TSFlags, but these ones are the exceptions.

You'd know better than I, but maybe adding that line to the above classes (combined with how you've already added the mayLoad and mayStore flags in one of the other patches) would do the job?