Index: lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp =================================================================== --- lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp +++ lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp @@ -235,6 +235,11 @@ } MCOperand AMDGPUDisassembler::decodeOperand_VGPR_32(unsigned Val) const { + // Some instructions have operand restrictions beyond what the encoding + // allows. Some ordinarily VSrc_32 operands are VGPR_32, so clear the extra + // high bit. + Val &= 255; + return createRegOperand(AMDGPU::VGPR_32RegClassID, Val); } Index: lib/Target/AMDGPU/SIISelLowering.cpp =================================================================== --- lib/Target/AMDGPU/SIISelLowering.cpp +++ lib/Target/AMDGPU/SIISelLowering.cpp @@ -159,6 +159,9 @@ } } + // TODO: For dynamic 64-bit vector inserts/extracts, should emit a pseudo that + // is expanded to avoid having two separate loops in case the index is a VGPR. + // Most operations are naturally 32-bit vector operations. We only support // load and store of i64 vectors, so promote v2i64 vector operations to v4i32. for (MVT Vec64 : { MVT::v2i64, MVT::v2f64 }) { @@ -1126,6 +1129,315 @@ return SplitBB; } +// Do a v_movrels_b32 or v_movreld_b32 for each unique value of \p IdxReg in the +// wavefront. If the value is uniform and just happens to be in a VGPR, this +// will only do one iteration. In the worst case, this will loop 64 times. +// +// TODO: Just use v_readlane_b32 if we know the VGPR has a uniform value. +static void emitLoadM0FromVGPRLoop(const SIInstrInfo *TII, + MachineRegisterInfo &MRI, + MachineBasicBlock &OrigBB, + MachineBasicBlock &LoopBB, + const DebugLoc &DL, + MachineInstr *MovRel, + const MachineOperand &IdxReg, + unsigned InitReg, + unsigned ResultReg, + unsigned PhiReg, + unsigned InitSaveExecReg, + int Offset) { + MachineBasicBlock::iterator I = LoopBB.begin(); + + unsigned PhiExec = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); + unsigned NewExec = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); + unsigned CurrentIdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); + unsigned CondReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); + + BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiReg) + .addReg(InitReg) + .addMBB(&OrigBB) + .addReg(ResultReg) + .addMBB(&LoopBB); + + BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiExec) + .addReg(InitSaveExecReg) + .addMBB(&OrigBB) + .addReg(NewExec) + .addMBB(&LoopBB); + + // Read the next variant <- also loop target. + BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), CurrentIdxReg) + .addReg(IdxReg.getReg(), getUndefRegState(IdxReg.isUndef())); + + // Compare the just read M0 value to all possible Idx values. + BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e64), CondReg) + .addReg(CurrentIdxReg) + .addOperand(IdxReg); + + // Move index from VCC into M0 + if (Offset == 0) { + BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0) + .addReg(CurrentIdxReg, RegState::Kill); + } else { + BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0) + .addReg(CurrentIdxReg, RegState::Kill) + .addImm(Offset); + } + + // Update EXEC, save the original EXEC value to VCC. + BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_AND_SAVEEXEC_B64), NewExec) + .addReg(CondReg, RegState::Kill); + + MRI.setSimpleHint(NewExec, CondReg); + + // Do the actual move. + LoopBB.insert(I, MovRel); + + // Update EXEC, switch all done bits to 0 and all todo bits to 1. 
+ BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_XOR_B64), AMDGPU::EXEC) + .addReg(AMDGPU::EXEC) + .addReg(NewExec); + + // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use + // s_cbranch_scc0? + + // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover. + BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ)) + .addMBB(&LoopBB); +} + +// This has slightly sub-optimal regalloc when the source vector is killed by +// the read. The register allocator does not understand that the kill is +// per-workitem, so is kept alive for the whole loop so we end up not re-using a +// subregister from it, using 1 more VGPR than necessary. This was saved when +// this was expanded after register allocation. +static MachineBasicBlock *loadM0FromVGPR(const SIInstrInfo *TII, + MachineBasicBlock &MBB, + MachineInstr &MI, + MachineInstr *MovRel, + unsigned InitResultReg, + unsigned PhiReg, + int Offset) { + MachineFunction *MF = MBB.getParent(); + MachineRegisterInfo &MRI = MF->getRegInfo(); + const DebugLoc &DL = MI.getDebugLoc(); + MachineBasicBlock::iterator I(&MI); + + unsigned DstReg = MI.getOperand(0).getReg(); + unsigned SaveExec = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); + unsigned TmpExec = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); + + BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), TmpExec); + + // Save the EXEC mask + BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B64), SaveExec) + .addReg(AMDGPU::EXEC); + + // To insert the loop we need to split the block. Move everything after this + // point to a new block, and insert a new empty block between the two. + MachineBasicBlock *LoopBB = MF->CreateMachineBasicBlock(); + MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock(); + MachineFunction::iterator MBBI(MBB); + ++MBBI; + + MF->insert(MBBI, LoopBB); + MF->insert(MBBI, RemainderBB); + + LoopBB->addSuccessor(LoopBB); + LoopBB->addSuccessor(RemainderBB); + + // Move the rest of the block into a new block. + RemainderBB->transferSuccessors(&MBB); + RemainderBB->splice(RemainderBB->begin(), &MBB, I, MBB.end()); + + MBB.addSuccessor(LoopBB); + + const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx); + + emitLoadM0FromVGPRLoop(TII, MRI, MBB, *LoopBB, DL, MovRel, *Idx, + InitResultReg, DstReg, PhiReg, TmpExec, Offset); + + MachineBasicBlock::iterator First = RemainderBB->begin(); + BuildMI(*RemainderBB, First, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC) + .addReg(SaveExec); + + MI.eraseFromParent(); + + return RemainderBB; +} + +// Returns subreg index, offset +static std::pair +computeIndirectRegAndOffset(const SIRegisterInfo &TRI, + const TargetRegisterClass *SuperRC, + unsigned VecReg, + int Offset) { + int NumElts = SuperRC->getSize() / 4; + + // Skip out of bounds offsets, or else we would end up using an undefined + // register. + if (Offset >= NumElts || Offset < 0) + return std::make_pair(AMDGPU::sub0, Offset); + + return std::make_pair(AMDGPU::sub0 + Offset, 0); +} + +// Return true if the index is an SGPR and was set. 
+static bool setM0ToIndexFromSGPR(const SIInstrInfo *TII, + MachineRegisterInfo &MRI, + MachineInstr &MI, + int Offset) { + MachineBasicBlock *MBB = MI.getParent(); + const DebugLoc &DL = MI.getDebugLoc(); + MachineBasicBlock::iterator I(&MI); + + const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx); + const TargetRegisterClass *IdxRC = MRI.getRegClass(Idx->getReg()); + + assert(Idx->getReg() != AMDGPU::NoRegister); + + if (!TII->getRegisterInfo().isSGPRClass(IdxRC)) + return false; + + if (Offset == 0) { + BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0) + .addOperand(*Idx); + } else { + BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0) + .addOperand(*Idx) + .addImm(Offset); + } + + return true; +} + +// Control flow needs to be inserted if indexing with a VGPR. +static MachineBasicBlock *emitIndirectSrc(MachineInstr &MI, + MachineBasicBlock &MBB, + const SIInstrInfo *TII) { + const SIRegisterInfo &TRI = TII->getRegisterInfo(); + MachineFunction *MF = MBB.getParent(); + MachineRegisterInfo &MRI = MF->getRegInfo(); + + unsigned Dst = MI.getOperand(0).getReg(); + const MachineOperand *SrcVec = TII->getNamedOperand(MI, AMDGPU::OpName::src); + int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm(); + + const TargetRegisterClass *VecRC = MRI.getRegClass(SrcVec->getReg()); + + unsigned SubReg; + std::tie(SubReg, Offset) + = computeIndirectRegAndOffset(TRI, VecRC, SrcVec->getReg(), Offset); + + if (setM0ToIndexFromSGPR(TII, MRI, MI, Offset)) { + MachineBasicBlock::iterator I(&MI); + const DebugLoc &DL = MI.getDebugLoc(); + + BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst) + .addReg(SrcVec->getReg(), RegState::Undef, SubReg) + .addReg(SrcVec->getReg(), RegState::Implicit); + MI.eraseFromParent(); + + return &MBB; + } + + const DebugLoc &DL = MI.getDebugLoc(); + MachineBasicBlock::iterator I(&MI); + + unsigned PhiReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + unsigned InitReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + + BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), InitReg); + + MachineInstr *MovRel = + BuildMI(*MF, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst) + .addReg(SrcVec->getReg(), RegState::Undef, SubReg) + .addReg(SrcVec->getReg(), RegState::Implicit); + + return loadM0FromVGPR(TII, MBB, MI, MovRel, InitReg, PhiReg, Offset); +} + +static MachineBasicBlock *emitIndirectDst(MachineInstr &MI, + MachineBasicBlock &MBB, + const SIInstrInfo *TII) { + const SIRegisterInfo &TRI = TII->getRegisterInfo(); + MachineFunction *MF = MBB.getParent(); + MachineRegisterInfo &MRI = MF->getRegInfo(); + + unsigned Dst = MI.getOperand(0).getReg(); + const MachineOperand *SrcVec = TII->getNamedOperand(MI, AMDGPU::OpName::src); + const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx); + const MachineOperand *Val = TII->getNamedOperand(MI, AMDGPU::OpName::val); + int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm(); + const TargetRegisterClass *VecRC = MRI.getRegClass(SrcVec->getReg()); + + // This can be an immediate, but will be folded later. 
+ assert(Val->getReg()); + + unsigned SubReg; + std::tie(SubReg, Offset) = computeIndirectRegAndOffset(TRI, VecRC, + SrcVec->getReg(), + Offset); + if (Idx->getReg() == AMDGPU::NoRegister) { + MachineBasicBlock::iterator I(&MI); + const DebugLoc &DL = MI.getDebugLoc(); + + assert(Offset == 0); + + BuildMI(MBB, I, DL, TII->get(TargetOpcode::INSERT_SUBREG), Dst) + .addOperand(*SrcVec) + .addOperand(*Val) + .addImm(SubReg); + + MI.eraseFromParent(); + return &MBB; + } + + const MCInstrDesc &MovRelDesc = TII->get(AMDGPU::V_MOVRELD_B32_e32); + if (setM0ToIndexFromSGPR(TII, MRI, MI, Offset)) { + MachineBasicBlock::iterator I(&MI); + const DebugLoc &DL = MI.getDebugLoc(); + + MachineInstr *MovRel = + BuildMI(MBB, I, DL, MovRelDesc) + .addReg(SrcVec->getReg(), RegState::Undef, SubReg) // vdst + .addOperand(*Val) + .addReg(Dst, RegState::ImplicitDefine) + .addReg(SrcVec->getReg(), RegState::Implicit); + + const int ImpDefIdx = MovRelDesc.getNumOperands() + + MovRelDesc.getNumImplicitUses(); + const int ImpUseIdx = ImpDefIdx + 1; + + MovRel->tieOperands(ImpDefIdx, ImpUseIdx); + MI.eraseFromParent(); + return &MBB; + } + + if (Val->isReg()) + MRI.clearKillFlags(Val->getReg()); + + const DebugLoc &DL = MI.getDebugLoc(); + unsigned PhiReg = MRI.createVirtualRegister(VecRC); + + // vdst is not actually read and just provides the base register index. + MachineInstr *MovRel = + BuildMI(*MF, DL, MovRelDesc) + .addReg(PhiReg, RegState::Undef, SubReg) // vdst + .addOperand(*Val) + .addReg(Dst, RegState::ImplicitDefine) + .addReg(PhiReg, RegState::Implicit); + + const int ImpDefIdx = MovRelDesc.getNumOperands() + + MovRelDesc.getNumImplicitUses(); + const int ImpUseIdx = ImpDefIdx + 1; + + MovRel->tieOperands(ImpDefIdx, ImpUseIdx); + + return loadM0FromVGPR(TII, MBB, MI, MovRel, + SrcVec->getReg(), PhiReg, Offset); +} + MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter( MachineInstr &MI, MachineBasicBlock *BB) const { switch (MI.getOpcode()) { @@ -1133,12 +1445,10 @@ const SIInstrInfo *TII = getSubtarget()->getInstrInfo(); BuildMI(*BB, MI.getIterator(), MI.getDebugLoc(), TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0) - .addOperand(MI.getOperand(0)); + .addOperand(MI.getOperand(0)); MI.eraseFromParent(); - break; - } - case AMDGPU::BRANCH: return BB; + } case AMDGPU::GET_GROUPSTATICSIZE: { const SIInstrInfo *TII = getSubtarget()->getInstrInfo(); @@ -1151,12 +1461,23 @@ MI.eraseFromParent(); return BB; } + case AMDGPU::SI_INDIRECT_SRC_V1: + case AMDGPU::SI_INDIRECT_SRC_V2: + case AMDGPU::SI_INDIRECT_SRC_V4: + case AMDGPU::SI_INDIRECT_SRC_V8: + case AMDGPU::SI_INDIRECT_SRC_V16: + return emitIndirectSrc(MI, *BB, getSubtarget()->getInstrInfo()); + case AMDGPU::SI_INDIRECT_DST_V1: + case AMDGPU::SI_INDIRECT_DST_V2: + case AMDGPU::SI_INDIRECT_DST_V4: + case AMDGPU::SI_INDIRECT_DST_V8: + case AMDGPU::SI_INDIRECT_DST_V16: + return emitIndirectDst(MI, *BB, getSubtarget()->getInstrInfo()); case AMDGPU::SI_KILL: return splitKillBlock(MI, BB); default: return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB); } - return BB; } bool SITargetLowering::enableAggressiveFMAFusion(EVT VT) const { Index: lib/Target/AMDGPU/SIInstrInfo.cpp =================================================================== --- lib/Target/AMDGPU/SIInstrInfo.cpp +++ lib/Target/AMDGPU/SIInstrInfo.cpp @@ -1646,6 +1646,16 @@ return true; } +static bool isSubRegOf(const SIRegisterInfo &TRI, + const MachineOperand &SuperVec, + const MachineOperand &SubReg) { + if (TargetRegisterInfo::isPhysicalRegister(SubReg.getReg())) + return 
TRI.isSubRegister(SuperVec.getReg(), SubReg.getReg()); + + return SubReg.getSubReg() != AMDGPU::NoSubRegister && + SubReg.getReg() == SuperVec.getReg(); +} + bool SIInstrInfo::verifyInstruction(const MachineInstr &MI, StringRef &ErrInfo) const { uint16_t Opcode = MI.getOpcode(); @@ -1770,6 +1780,47 @@ } } + if (Desc.getOpcode() == AMDGPU::V_MOVRELS_B32_e32 || + Desc.getOpcode() == AMDGPU::V_MOVRELS_B32_e64 || + Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e32 || + Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e64) { + const bool IsDst = Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e32 || + Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e64; + + const unsigned StaticNumOps = Desc.getNumOperands() + + Desc.getNumImplicitUses(); + const unsigned NumImplicitOps = IsDst ? 2 : 1; + + if (MI.getNumOperands() != StaticNumOps + NumImplicitOps) { + ErrInfo = "missing implicit register operands"; + return false; + } + + const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst); + if (IsDst) { + if (!Dst->isUse()) { + ErrInfo = "v_movreld_b32 vdst should be a use operand"; + return false; + } + + unsigned UseOpIdx; + if (!MI.isRegTiedToUseOperand(StaticNumOps, &UseOpIdx) || + UseOpIdx != StaticNumOps + 1) { + ErrInfo = "movrel implicit operands should be tied"; + return false; + } + } + + const MachineOperand &Src0 = MI.getOperand(Src0Idx); + const MachineOperand &ImpUse + = MI.getOperand(StaticNumOps + NumImplicitOps - 1); + if (!ImpUse.isReg() || !ImpUse.isUse() || + !isSubRegOf(RI, ImpUse, IsDst ? *Dst : Src0)) { + ErrInfo = "src0 should be subreg of implicit vector use"; + return false; + } + } + // Make sure we aren't losing exec uses in the td files. This mostly requires // being careful when using let Uses to try to add other use registers. if (shouldReadExec(MI)) { Index: lib/Target/AMDGPU/SIInstrInfo.td =================================================================== --- lib/Target/AMDGPU/SIInstrInfo.td +++ lib/Target/AMDGPU/SIInstrInfo.td @@ -1500,6 +1500,41 @@ def VOP_I32_F32_I32 : VOPProfile <[i32, f32, i32, untyped]>; def VOP_I32_I32_I32 : VOPProfile <[i32, i32, i32, untyped]>; +// Restrict src0 to be VGPR +def VOP_I32_VI32_NO_EXT : VOPProfile<[i32, i32, untyped, untyped]> { + let Src0RC32 = VRegSrc_32; + let Src0RC64 = VRegSrc_32; + + let HasExt = 0; +} + +// Special case because there are no true output operands. Hack vdst +// to be a src operand. The custom inserter must add a tied implicit +// def and use of the super register since there seems to be no way to +// add an implicit def of a virtual register in tablegen. +def VOP_MOVRELD : VOPProfile<[untyped, i32, untyped, untyped]> { + let Src0RC32 = VOPDstOperand; + let Src0RC64 = VOPDstOperand; + + let Outs = (outs); + let Ins32 = (ins Src0RC32:$vdst, VSrc_32:$src0); + let Ins64 = (ins Src0RC64:$vdst, VSrc_32:$src0); + + let InsDPP = (ins Src0RC32:$vdst, Src0RC32:$src0, dpp_ctrl:$dpp_ctrl, row_mask:$row_mask, + bank_mask:$bank_mask, bound_ctrl:$bound_ctrl); + let InsSDWA = (ins Src0RC32:$vdst, IntInputMods:$src0_imodifiers, VCSrc_32:$src0, + clampmod:$clamp, dst_sel:$dst_sel, dst_unused:$dst_unused, + src0_sel:$src0_sel); + + let Asm32 = getAsm32<1, 1>.ret; + let Asm64 = getAsm64<1, 1, 0>.ret; + let AsmDPP = getAsmDPP<1, 1, 0>.ret; + let AsmSDWA = getAsmSDWA<1, 1, 0>.ret; + + let HasExt = 0; + let HasDst = 0; +} + // Write out to vcc or arbitrary SGPR. 
def VOP2b_I32_I1_I32_I32 : VOPProfile<[i32, i32, i32, untyped]> { let Asm32 = "$vdst, vcc, $src0, $src1"; Index: lib/Target/AMDGPU/SIInstructions.td =================================================================== --- lib/Target/AMDGPU/SIInstructions.td +++ lib/Target/AMDGPU/SIInstructions.td @@ -1388,9 +1388,16 @@ } let Uses = [M0, EXEC] in { -defm V_MOVRELD_B32 : VOP1Inst , "v_movreld_b32", VOP_NO_EXT>; -defm V_MOVRELS_B32 : VOP1Inst , "v_movrels_b32", VOP_NO_EXT>; +// v_movreld_b32 is a special case because the destination output + // register is really a source. It isn't actually read (but may be + // written), and is only to provide the base register to start + // indexing from. Tablegen seems to not let you define an implicit + // virtual register output for the super register being written into, + // so this must have an implicit def of the register added to it. +defm V_MOVRELD_B32 : VOP1Inst , "v_movreld_b32", VOP_MOVRELD>; +defm V_MOVRELS_B32 : VOP1Inst , "v_movrels_b32", VOP_I32_VI32_NO_EXT>; defm V_MOVRELSD_B32 : VOP1Inst , "v_movrelsd_b32", VOP_NO_EXT>; + } // End Uses = [M0, EXEC] // These instruction only exist on SI and CI @@ -2031,17 +2038,20 @@ let hasNoSchedulingInfo = 1; } -let Uses = [EXEC], Defs = [EXEC, VCC, M0], +let Uses = [EXEC], Defs = [M0, EXEC], UseNamedOperandTable = 1 in { class SI_INDIRECT_SRC : PseudoInstSI < - (outs VGPR_32:$vdst, SReg_64:$sdst), - (ins rc:$src, VS_32:$idx, i32imm:$offset)>; + (outs VGPR_32:$vdst), + (ins rc:$src, VS_32:$idx, i32imm:$offset)> { + let usesCustomInserter = 1; +} class SI_INDIRECT_DST : PseudoInstSI < - (outs rc:$vdst, SReg_64:$sdst), - (ins unknown:$src, VS_32:$idx, i32imm:$offset, VGPR_32:$val)> { + (outs rc:$vdst), + (ins rc:$src, VS_32:$idx, i32imm:$offset, VGPR_32:$val)> { let Constraints = "$src = $vdst"; + let usesCustomInserter = 1; } // TODO: We can support indirect SGPR access. @@ -2057,7 +2067,7 @@ def SI_INDIRECT_DST_V8 : SI_INDIRECT_DST; def SI_INDIRECT_DST_V16 : SI_INDIRECT_DST; -} // End Uses = [EXEC], Defs = [EXEC,VCC,M0] +} // End Uses = [EXEC], Defs = [M0, EXEC] multiclass SI_SPILL_SGPR { let UseNamedOperandTable = 1, Uses = [EXEC] in { Index: lib/Target/AMDGPU/SILowerControlFlow.cpp =================================================================== --- lib/Target/AMDGPU/SILowerControlFlow.cpp +++ lib/Target/AMDGPU/SILowerControlFlow.cpp @@ -95,25 +95,6 @@ std::pair splitBlock(MachineBasicBlock &MBB, MachineBasicBlock::iterator I); - void splitLoadM0BlockLiveIns(LivePhysRegs &RemainderLiveRegs, - const MachineRegisterInfo &MRI, - const MachineInstr &MI, - MachineBasicBlock &LoopBB, - MachineBasicBlock &RemainderBB, - unsigned SaveReg, - const MachineOperand &IdxReg); - - void emitLoadM0FromVGPRLoop(MachineBasicBlock &LoopBB, DebugLoc DL, - MachineInstr *MovRel, - const MachineOperand &IdxReg, - int Offset); - - bool loadM0(MachineInstr &MI, MachineInstr *MovRel, int Offset = 0); - std::pair computeIndirectRegAndOffset(unsigned VecReg, - int Offset) const; - bool indirectSrc(MachineInstr &MI); - bool indirectDst(MachineInstr &MI); - public: static char ID; @@ -409,81 +390,6 @@ MI.eraseFromParent(); } -// All currently live registers must remain so in the remainder block. -void SILowerControlFlow::splitLoadM0BlockLiveIns(LivePhysRegs &RemainderLiveRegs, - const MachineRegisterInfo &MRI, - const MachineInstr &MI, - MachineBasicBlock &LoopBB, - MachineBasicBlock &RemainderBB, - unsigned SaveReg, - const MachineOperand &IdxReg) { - // Add reg defined in loop body. 
- RemainderLiveRegs.addReg(SaveReg); - - if (const MachineOperand *Val = TII->getNamedOperand(MI, AMDGPU::OpName::val)) { - if (!Val->isUndef()) { - RemainderLiveRegs.addReg(Val->getReg()); - LoopBB.addLiveIn(Val->getReg()); - } - } - - for (unsigned Reg : RemainderLiveRegs) { - if (MRI.isAllocatable(Reg)) - RemainderBB.addLiveIn(Reg); - } - - const MachineOperand *Src = TII->getNamedOperand(MI, AMDGPU::OpName::src); - if (!Src->isUndef()) - LoopBB.addLiveIn(Src->getReg()); - - if (!IdxReg.isUndef()) - LoopBB.addLiveIn(IdxReg.getReg()); - LoopBB.sortUniqueLiveIns(); -} - -void SILowerControlFlow::emitLoadM0FromVGPRLoop(MachineBasicBlock &LoopBB, - DebugLoc DL, - MachineInstr *MovRel, - const MachineOperand &IdxReg, - int Offset) { - MachineBasicBlock::iterator I = LoopBB.begin(); - - // Read the next variant into VCC (lower 32 bits) <- also loop target - BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), AMDGPU::VCC_LO) - .addReg(IdxReg.getReg(), getUndefRegState(IdxReg.isUndef())); - - // Move index from VCC into M0 - BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0) - .addReg(AMDGPU::VCC_LO); - - // Compare the just read M0 value to all possible Idx values - BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e32)) - .addReg(AMDGPU::M0) - .addReg(IdxReg.getReg(), getUndefRegState(IdxReg.isUndef())); - - // Update EXEC, save the original EXEC value to VCC - BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_AND_SAVEEXEC_B64), AMDGPU::VCC) - .addReg(AMDGPU::VCC); - - if (Offset != 0) { - BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0) - .addReg(AMDGPU::M0) - .addImm(Offset); - } - - // Do the actual move - LoopBB.insert(I, MovRel); - - // Update EXEC, switch all done bits to 0 and all todo bits to 1 - BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_XOR_B64), AMDGPU::EXEC) - .addReg(AMDGPU::EXEC) - .addReg(AMDGPU::VCC); - - // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover - BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ)) - .addMBB(&LoopBB); -} - MachineBasicBlock *SILowerControlFlow::insertSkipBlock( MachineBasicBlock &MBB, MachineBasicBlock::iterator I) const { MachineFunction *MF = MBB.getParent(); @@ -522,166 +428,6 @@ return std::make_pair(LoopBB, RemainderBB); } -// Returns true if a new block was inserted. -bool SILowerControlFlow::loadM0(MachineInstr &MI, MachineInstr *MovRel, int Offset) { - MachineBasicBlock &MBB = *MI.getParent(); - DebugLoc DL = MI.getDebugLoc(); - MachineBasicBlock::iterator I(&MI); - - const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx); - - if (AMDGPU::SReg_32RegClass.contains(Idx->getReg())) { - if (Offset != 0) { - BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0) - .addReg(Idx->getReg(), getUndefRegState(Idx->isUndef())) - .addImm(Offset); - } else { - BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0) - .addReg(Idx->getReg(), getUndefRegState(Idx->isUndef())); - } - - MBB.insert(I, MovRel); - MI.eraseFromParent(); - return false; - } - - MachineOperand *SaveOp = TII->getNamedOperand(MI, AMDGPU::OpName::sdst); - SaveOp->setIsDead(false); - unsigned Save = SaveOp->getReg(); - - // Reading from a VGPR requires looping over all workitems in the wavefront. 
- assert(AMDGPU::SReg_64RegClass.contains(Save) && - AMDGPU::VGPR_32RegClass.contains(Idx->getReg())); - - // Save the EXEC mask - BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B64), Save) - .addReg(AMDGPU::EXEC); - - LivePhysRegs RemainderLiveRegs(TRI); - - RemainderLiveRegs.addLiveOuts(MBB); - - MachineBasicBlock *LoopBB; - MachineBasicBlock *RemainderBB; - - std::tie(LoopBB, RemainderBB) = splitBlock(MBB, I); - - for (const MachineInstr &Inst : reverse(*RemainderBB)) - RemainderLiveRegs.stepBackward(Inst); - - MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); - LoopBB->addSuccessor(RemainderBB); - LoopBB->addSuccessor(LoopBB); - - splitLoadM0BlockLiveIns(RemainderLiveRegs, MRI, MI, *LoopBB, - *RemainderBB, Save, *Idx); - - emitLoadM0FromVGPRLoop(*LoopBB, DL, MovRel, *Idx, Offset); - - MachineBasicBlock::iterator First = RemainderBB->begin(); - BuildMI(*RemainderBB, First, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC) - .addReg(Save); - - MI.eraseFromParent(); - return true; -} - -/// \param @VecReg The register which holds element zero of the vector being -/// addressed into. -// -/// \param[in] @Idx The index operand from the movrel instruction. This must be -// a register, but may be NoRegister. -/// -/// \param[in] @Offset As an input, this is the constant offset part of the -// indirect Index. e.g. v0 = v[VecReg + Offset] As an output, this is a constant -// value that needs to be added to the value stored in M0. -std::pair -SILowerControlFlow::computeIndirectRegAndOffset(unsigned VecReg, int Offset) const { - unsigned SubReg = TRI->getSubReg(VecReg, AMDGPU::sub0); - if (!SubReg) - SubReg = VecReg; - - const TargetRegisterClass *SuperRC = TRI->getPhysRegClass(VecReg); - const TargetRegisterClass *RC = TRI->getPhysRegClass(SubReg); - int NumElts = SuperRC->getSize() / RC->getSize(); - - int BaseRegIdx = TRI->getHWRegIndex(SubReg); - - // Skip out of bounds offsets, or else we would end up using an undefined - // register. - if (Offset >= NumElts) - return std::make_pair(RC->getRegister(BaseRegIdx), Offset); - - int RegIdx = BaseRegIdx + Offset; - if (RegIdx < 0) { - Offset = RegIdx; - RegIdx = 0; - } else { - Offset = 0; - } - - unsigned Reg = RC->getRegister(RegIdx); - return std::make_pair(Reg, Offset); -} - -// Return true if a new block was inserted. -bool SILowerControlFlow::indirectSrc(MachineInstr &MI) { - MachineBasicBlock &MBB = *MI.getParent(); - const DebugLoc &DL = MI.getDebugLoc(); - - unsigned Dst = MI.getOperand(0).getReg(); - const MachineOperand *SrcVec = TII->getNamedOperand(MI, AMDGPU::OpName::src); - int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm(); - unsigned Reg; - - std::tie(Reg, Offset) = computeIndirectRegAndOffset(SrcVec->getReg(), Offset); - - const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx); - if (Idx->getReg() == AMDGPU::NoRegister) { - // Only had a constant offset, copy the register directly. - BuildMI(MBB, MI.getIterator(), DL, TII->get(AMDGPU::V_MOV_B32_e32), Dst) - .addReg(Reg, getUndefRegState(SrcVec->isUndef())); - MI.eraseFromParent(); - return false; - } - - MachineInstr *MovRel = - BuildMI(*MBB.getParent(), DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst) - .addReg(Reg, getUndefRegState(SrcVec->isUndef())) - .addReg(SrcVec->getReg(), RegState::Implicit); - - return loadM0(MI, MovRel, Offset); -} - -// Return true if a new block was inserted. 
-bool SILowerControlFlow::indirectDst(MachineInstr &MI) { - MachineBasicBlock &MBB = *MI.getParent(); - const DebugLoc &DL = MI.getDebugLoc(); - - unsigned Dst = MI.getOperand(0).getReg(); - int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm(); - unsigned Reg; - - const MachineOperand *Val = TII->getNamedOperand(MI, AMDGPU::OpName::val); - std::tie(Reg, Offset) = computeIndirectRegAndOffset(Dst, Offset); - - MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx); - if (Idx->getReg() == AMDGPU::NoRegister) { - // Only had a constant offset, copy the register directly. - BuildMI(MBB, MI.getIterator(), DL, TII->get(AMDGPU::V_MOV_B32_e32), Reg) - .addOperand(*Val); - MI.eraseFromParent(); - return false; - } - - MachineInstr *MovRel = - BuildMI(*MBB.getParent(), DL, TII->get(AMDGPU::V_MOVRELD_B32_e32), Reg) - .addReg(Val->getReg(), getUndefRegState(Val->isUndef())) - .addReg(Dst, RegState::Implicit); - - return loadM0(MI, MovRel, Offset); -} - bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) { const SISubtarget &ST = MF.getSubtarget(); TII = ST.getInstrInfo(); @@ -773,38 +519,6 @@ Branch(MI); break; - case AMDGPU::SI_INDIRECT_SRC_V1: - case AMDGPU::SI_INDIRECT_SRC_V2: - case AMDGPU::SI_INDIRECT_SRC_V4: - case AMDGPU::SI_INDIRECT_SRC_V8: - case AMDGPU::SI_INDIRECT_SRC_V16: - if (indirectSrc(MI)) { - // The block was split at this point. We can safely skip the middle - // inserted block to the following which contains the rest of this - // block's instructions. - NextBB = std::next(BI); - BE = MF.end(); - Next = MBB.end(); - } - - break; - - case AMDGPU::SI_INDIRECT_DST_V1: - case AMDGPU::SI_INDIRECT_DST_V2: - case AMDGPU::SI_INDIRECT_DST_V4: - case AMDGPU::SI_INDIRECT_DST_V8: - case AMDGPU::SI_INDIRECT_DST_V16: - if (indirectDst(MI)) { - // The block was split at this point. We can safely skip the middle - // inserted block to the following which contains the rest of this - // block's instructions. - NextBB = std::next(BI); - BE = MF.end(); - Next = MBB.end(); - } - - break; - case AMDGPU::SI_RETURN: { assert(!MF.getInfo()->returnsVoid()); Index: lib/Target/AMDGPU/SIRegisterInfo.td =================================================================== --- lib/Target/AMDGPU/SIRegisterInfo.td +++ lib/Target/AMDGPU/SIRegisterInfo.td @@ -397,6 +397,17 @@ } //===----------------------------------------------------------------------===// +// VSrc_* Operands with an VGPR +//===----------------------------------------------------------------------===// + +// This is for operands with the enum(9), VSrc encoding restriction, +// but only allows VGPRs. +def VRegSrc_32 : RegisterOperand { + //let ParserMatchClass = RegImmMatcher<"VRegSrc32">; + let DecoderMethod = "DecodeVS_32RegisterClass"; +} + +//===----------------------------------------------------------------------===// // VCSrc_* Operands with an SGPR, VGPR or an inline constant //===----------------------------------------------------------------------===// Index: lib/Target/AMDGPU/SIShrinkInstructions.cpp =================================================================== --- lib/Target/AMDGPU/SIShrinkInstructions.cpp +++ lib/Target/AMDGPU/SIShrinkInstructions.cpp @@ -398,9 +398,13 @@ } ++NumInstructionsShrunk; - MI.eraseFromParent(); + // Copy extra operands not present in the instruction definition. 
+ Inst32->copyImplicitOps(MF, MI); + + MI.eraseFromParent(); foldImmediates(*Inst32, TII, MRI); + DEBUG(dbgs() << "e32 MI = " << *Inst32 << '\n'); Index: test/CodeGen/AMDGPU/indirect-addressing-si.ll =================================================================== --- test/CodeGen/AMDGPU/indirect-addressing-si.ll +++ test/CodeGen/AMDGPU/indirect-addressing-si.ll @@ -5,12 +5,13 @@ ; indexing of vectors. ; CHECK-LABEL: {{^}}extract_w_offset: +; CHECK-DAG: s_load_dword [[IN:s[0-9]+]] ; CHECK-DAG: v_mov_b32_e32 v{{[0-9]+}}, 4.0 ; CHECK-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x40400000 -; CHECK-DAG: v_mov_b32_e32 v{{[0-9]+}}, 2.0 +; CHECK-DAG: v_mov_b32_e32 [[BASEREG:v[0-9]+]], 2.0 ; CHECK-DAG: v_mov_b32_e32 v{{[0-9]+}}, 1.0 -; CHECK: s_mov_b32 m0 -; CHECK-NEXT: v_movrels_b32_e32 +; CHECK-DAG: s_mov_b32 m0, [[IN]] +; CHECK: v_movrels_b32_e32 v{{[0-9]+}}, [[BASEREG]] define void @extract_w_offset(float addrspace(1)* %out, i32 %in) { entry: %idx = add i32 %in, 1 @@ -41,12 +42,13 @@ } ; CHECK-LABEL: {{^}}extract_wo_offset: +; CHECK-DAG: s_load_dword [[IN:s[0-9]+]] ; CHECK-DAG: v_mov_b32_e32 v{{[0-9]+}}, 4.0 ; CHECK-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x40400000 ; CHECK-DAG: v_mov_b32_e32 v{{[0-9]+}}, 2.0 -; CHECK-DAG: v_mov_b32_e32 v{{[0-9]+}}, 1.0 -; CHECK: s_mov_b32 m0 -; CHECK-NEXT: v_movrels_b32_e32 +; CHECK-DAG: v_mov_b32_e32 [[BASEREG:v[0-9]+]], 1.0 +; CHECK-DAG: s_mov_b32 m0, [[IN]] +; CHECK: v_movrels_b32_e32 v{{[0-9]+}}, [[BASEREG]] define void @extract_wo_offset(float addrspace(1)* %out, i32 %in) { entry: %elt = extractelement <4 x float> , i32 %in @@ -81,10 +83,17 @@ ; CHECK-LABEL: {{^}}extract_neg_offset_vgpr: ; The offset depends on the register that holds the first element of the vector. -; CHECK: v_readfirstlane_b32 -; CHECK: s_add_i32 m0, m0, 0xfffffe{{[0-9a-z]+}} -; CHECK-NEXT: v_movrels_b32_e32 v{{[0-9]}}, v0 + +; FIXME: The waitcnt for the argument load can go after the loop +; CHECK: s_mov_b64 s{{\[[0-9]+:[0-9]+\]}}, exec +; CHECK: s_waitcnt lgkmcnt(0) + +; CHECK: v_readfirstlane_b32 [[READLANE:s[0-9]+]], v{{[0-9]+}} +; CHECK: s_add_i32 m0, [[READLANE]], 0xfffffe0 +; CHECK: v_movrels_b32_e32 [[RESULT:v[0-9]+]], v1 ; CHECK: s_cbranch_execnz + +; CHECK: buffer_store_dword [[RESULT]] define void @extract_neg_offset_vgpr(i32 addrspace(1)* %out) { entry: %id = call i32 @llvm.amdgcn.workitem.id.x() #1 @@ -104,9 +113,9 @@ } ; CHECK-LABEL: {{^}}insert_undef_offset_sgpr_vector_src: -; CHECK: buffer_load_dwordx4 -; CHECK: s_mov_b32 m0, -; CHECK-NEXT: v_movreld_b32 +; CHECK-DAG: buffer_load_dwordx4 +; CHECK-DAG: s_mov_b32 m0, +; CHECK: v_movreld_b32 define void @insert_undef_offset_sgpr_vector_src(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { entry: %ld = load <4 x i32>, <4 x i32> addrspace(1)* %in @@ -116,8 +125,9 @@ } ; CHECK-LABEL: {{^}}insert_w_offset: -; CHECK: s_mov_b32 m0 -; CHECK-NEXT: v_movreld_b32_e32 +; CHECK: s_load_dword [[IN:s[0-9]+]] +; CHECK: s_mov_b32 m0, [[IN]] +; CHECK: v_movreld_b32_e32 define void @insert_w_offset(float addrspace(1)* %out, i32 %in) { entry: %0 = add i32 %in, 1 @@ -128,8 +138,9 @@ } ; CHECK-LABEL: {{^}}insert_wo_offset: -; CHECK: s_mov_b32 m0 -; CHECK-NEXT: v_movreld_b32_e32 +; CHECK: s_load_dword [[IN:s[0-9]+]] +; CHECK: s_mov_b32 m0, [[IN]] +; CHECK: v_movreld_b32_e32 define void @insert_wo_offset(float addrspace(1)* %out, i32 %in) { entry: %0 = insertelement <4 x float> , float 5.0, i32 %in @@ -141,7 +152,7 @@ ; CHECK-LABEL: {{^}}insert_neg_offset_sgpr: ; The offset depends on the register that holds the first element of the vector. 
; CHECK: s_add_i32 m0, s{{[0-9]+}}, 0xfffffe{{[0-9a-z]+}} -; CHECK: v_movreld_b32_e32 v0, v{{[0-9]}} +; CHECK: v_movreld_b32_e32 v0, 5 define void @insert_neg_offset_sgpr(i32 addrspace(1)* %in, <4 x i32> addrspace(1)* %out, i32 %offset) { entry: %index = add i32 %offset, -512 @@ -156,7 +167,7 @@ ; CHECK-LABEL: {{^}}insert_neg_offset_sgpr_loadreg: ; The offset depends on the register that holds the first element of the vector. ; CHECK: s_add_i32 m0, s{{[0-9]+}}, 0xfffffe{{[0-9a-z]+}} -; CHECK: v_movreld_b32_e32 v0, v{{[0-9]}} +; CHECK: v_movreld_b32_e32 v0, 5 define void @insert_neg_offset_sgpr_loadreg(i32 addrspace(1)* %in, <4 x i32> addrspace(1)* %out, <4 x i32> %vec, i32 %offset) { entry: %index = add i32 %offset, -512 @@ -167,30 +178,53 @@ ; CHECK-LABEL: {{^}}insert_neg_offset_vgpr: ; The offset depends on the register that holds the first element of the vector. -; CHECK: v_readfirstlane_b32 -; CHECK: s_add_i32 m0, m0, 0xfffffe{{[0-9a-z]+}} -; CHECK-NEXT: v_movreld_b32_e32 v0, v{{[0-9]}} -; CHECK: s_cbranch_execnz + +; CHECK-DAG: v_mov_b32_e32 [[VEC_ELT0:v[0-9]+]], 1{{$}} +; CHECK-DAG: v_mov_b32_e32 [[VEC_ELT1:v[0-9]+]], 2{{$}} +; CHECK-DAG: v_mov_b32_e32 [[VEC_ELT2:v[0-9]+]], 3{{$}} +; CHECK-DAG: v_mov_b32_e32 [[VEC_ELT3:v[0-9]+]], 4{{$}} + +; CHECK: s_mov_b64 [[SAVEEXEC:s\[[0-9]+:[0-9]+\]]], exec +; CHECK: s_waitcnt lgkmcnt(0) + +; CHECK: [[LOOPBB:BB[0-9]+_[0-9]+]]: +; CHECK: v_readfirstlane_b32 [[READLANE:s[0-9]+]] +; CHECK: s_add_i32 m0, [[READLANE]], 0xfffffe00 +; CHECK: v_movreld_b32_e32 [[VEC_ELT0]], 5 +; CHECK: s_cbranch_execnz [[LOOPBB]] + +; CHECK: s_mov_b64 exec, [[SAVEEXEC]] +; CHECK: buffer_store_dword define void @insert_neg_offset_vgpr(i32 addrspace(1)* %in, <4 x i32> addrspace(1)* %out) { entry: %id = call i32 @llvm.amdgcn.workitem.id.x() #1 %index = add i32 %id, -512 - %value = insertelement <4 x i32> , i32 5, i32 %index + %value = insertelement <4 x i32> , i32 5, i32 %index store <4 x i32> %value, <4 x i32> addrspace(1)* %out ret void } ; CHECK-LABEL: {{^}}insert_neg_inline_offset_vgpr: + +; CHECK-DAG: v_mov_b32_e32 [[VEC_ELT0:v[0-9]+]], 1{{$}} +; CHECK-DAG: v_mov_b32_e32 [[VEC_ELT1:v[0-9]+]], 2{{$}} +; CHECK-DAG: v_mov_b32_e32 [[VEC_ELT2:v[0-9]+]], 3{{$}} +; CHECK-DAG: v_mov_b32_e32 [[VEC_ELT3:v[0-9]+]], 4{{$}} +; CHECK-DAG: v_mov_b32_e32 [[VAL:v[0-9]+]], 0x1f4{{$}} + +; CHECK: s_mov_b64 [[SAVEEXEC:s\[[0-9]+:[0-9]+\]]], exec +; CHECK: s_waitcnt lgkmcnt(0) + ; The offset depends on the register that holds the first element of the vector. -; CHECK: v_readfirstlane_b32 -; CHECK: s_add_i32 m0, m0, -{{[0-9]+}} -; CHECK-NEXT: v_movreld_b32_e32 v0, v{{[0-9]}} +; CHECK: v_readfirstlane_b32 [[READLANE:s[0-9]+]] +; CHECK: s_add_i32 m0, [[READLANE]], -16 +; CHECK: v_movreld_b32_e32 [[VEC_ELT0]], [[VAL]] ; CHECK: s_cbranch_execnz define void @insert_neg_inline_offset_vgpr(i32 addrspace(1)* %in, <4 x i32> addrspace(1)* %out) { entry: %id = call i32 @llvm.amdgcn.workitem.id.x() #1 %index = add i32 %id, -16 - %value = insertelement <4 x i32> , i32 5, i32 %index + %value = insertelement <4 x i32> , i32 500, i32 %index store <4 x i32> %value, <4 x i32> addrspace(1)* %out ret void } @@ -200,34 +234,37 @@ ; CHECK-LABEL: {{^}}extract_vgpr_offset_multiple_in_block: +; FIXME: Why is vector copied in between? 
+ ; CHECK-DAG: {{buffer|flat}}_load_dword [[IDX0:v[0-9]+]] ; CHECK-DAG: s_mov_b32 [[S_ELT0:s[0-9]+]], 7 ; CHECK-DAG: s_mov_b32 [[S_ELT1:s[0-9]+]], 9 ; CHECK-DAG: v_mov_b32_e32 [[VEC_ELT0:v[0-9]+]], [[S_ELT0]] ; CHECK-DAG: v_mov_b32_e32 [[VEC_ELT1:v[0-9]+]], [[S_ELT1]] -; CHECK: s_waitcnt vmcnt(0) ; CHECK: s_mov_b64 [[MASK:s\[[0-9]+:[0-9]+\]]], exec +; CHECK: s_waitcnt vmcnt(0) lgkmcnt(0) ; CHECK: [[LOOP0:BB[0-9]+_[0-9]+]]: -; CHECK: v_readfirstlane_b32 vcc_lo, [[IDX0]] -; CHECK: s_mov_b32 m0, vcc_lo -; CHECK: v_cmp_eq_u32_e32 vcc, m0, [[IDX0]] +; CHECK-NEXT: v_readfirstlane_b32 [[READLANE:s[0-9]+]], [[IDX0]] +; CHECK: v_cmp_eq_u32_e32 vcc, [[READLANE]], [[IDX0]] +; CHECK: s_mov_b32 m0, [[READLANE]] ; CHECK: s_and_saveexec_b64 vcc, vcc -; CHECK-NEXT: v_movrels_b32_e32 [[MOVREL0:v[0-9]+]], [[VEC_ELT0]] +; CHECK: v_movrels_b32_e32 [[MOVREL0:v[0-9]+]], [[VEC_ELT0]] ; CHECK-NEXT: s_xor_b64 exec, exec, vcc -; CHECK: s_cbranch_execnz [[LOOP0]] +; CHECK-NEXT: s_cbranch_execnz [[LOOP0]] ; FIXME: Redundant copy ; CHECK: s_mov_b64 exec, [[MASK]] +; CHECK: v_mov_b32_e32 [[VEC_ELT1_2:v[0-9]+]], [[S_ELT1]] ; CHECK: s_mov_b64 [[MASK2:s\[[0-9]+:[0-9]+\]]], exec ; CHECK: [[LOOP1:BB[0-9]+_[0-9]+]]: -; CHECK: v_readfirstlane_b32 vcc_lo, [[IDX0]] -; CHECK: s_mov_b32 m0, vcc_lo -; CHECK: v_cmp_eq_u32_e32 vcc, m0, [[IDX0]] +; CHECK-NEXT: v_readfirstlane_b32 [[READLANE:s[0-9]+]], [[IDX0]] +; CHECK: v_cmp_eq_u32_e32 vcc, [[READLANE]], [[IDX0]] +; CHECK: s_mov_b32 m0, [[READLANE]] ; CHECK: s_and_saveexec_b64 vcc, vcc -; CHECK-NEXT: v_movrels_b32_e32 [[MOVREL1:v[0-9]+]], [[VEC_ELT1]] +; CHECK-NEXT: v_movrels_b32_e32 [[MOVREL1:v[0-9]+]], [[VEC_ELT1_2]] ; CHECK-NEXT: s_xor_b64 exec, exec, vcc ; CHECK: s_cbranch_execnz [[LOOP1]] @@ -259,36 +296,34 @@ ; CHECK-LABEL: {{^}}insert_vgpr_offset_multiple_in_block: ; CHECK-DAG: s_load_dwordx4 s{{\[}}[[S_ELT0:[0-9]+]]:[[S_ELT3:[0-9]+]]{{\]}} ; CHECK-DAG: {{buffer|flat}}_load_dword [[IDX0:v[0-9]+]] -; CHECK-DAG: v_mov_b32_e32 [[VEC_ELT0:v[0-9]+]], s[[S_ELT0]] ; CHECK-DAG: v_mov_b32 [[INS0:v[0-9]+]], 62 -; CHECK-DAG: s_waitcnt vmcnt(0) -; CHECK: s_mov_b64 [[MASK:s\[[0-9]+:[0-9]+\]]], exec +; CHECK-DAG: v_mov_b32_e32 v[[VEC_ELT0:[0-9]+]], s[[S_ELT0]] +; CHECK-DAG: v_mov_b32_e32 v[[VEC_ELT3:[0-9]+]], s[[S_ELT3]] ; CHECK: [[LOOP0:BB[0-9]+_[0-9]+]]: -; CHECK: v_readfirstlane_b32 vcc_lo, [[IDX0]] -; CHECK: s_mov_b32 m0, vcc_lo -; CHECK: v_cmp_eq_u32_e32 vcc, m0, [[IDX0]] +; CHECK-NEXT: v_readfirstlane_b32 [[READLANE:s[0-9]+]], [[IDX0]] +; CHECK: v_cmp_eq_u32_e32 vcc, [[READLANE]], [[IDX0]] +; CHECK: s_mov_b32 m0, [[READLANE]] ; CHECK: s_and_saveexec_b64 vcc, vcc -; CHECK-NEXT: v_movreld_b32_e32 v[[MOVREL0:[0-9]+]], [[INS0]] +; CHECK-NEXT: v_movreld_b32_e32 v[[VEC_ELT0]], [[INS0]] ; CHECK-NEXT: s_xor_b64 exec, exec, vcc ; CHECK: s_cbranch_execnz [[LOOP0]] ; FIXME: Redundant copy -; CHECK: s_mov_b64 exec, [[MASK]] -; CHECK: v_mov_b32_e32 [[INS1:v[0-9]+]], 63 +; CHECK: s_mov_b64 exec, [[MASK:s\[[0-9]+:[0-9]+\]]] ; CHECK: s_mov_b64 [[MASK]], exec ; CHECK: [[LOOP1:BB[0-9]+_[0-9]+]]: -; CHECK: v_readfirstlane_b32 vcc_lo, [[IDX0]] -; CHECK: s_mov_b32 m0, vcc_lo -; CHECK: v_cmp_eq_u32_e32 vcc, m0, [[IDX0]] +; CHECK-NEXT: v_readfirstlane_b32 [[READLANE:s[0-9]+]], [[IDX0]] +; CHECK: v_cmp_eq_u32_e32 vcc, [[READLANE]], [[IDX0]] +; CHECK: s_mov_b32 m0, [[READLANE]] ; CHECK: s_and_saveexec_b64 vcc, vcc -; CHECK-NEXT: v_movreld_b32_e32 v[[MOVREL1:[0-9]+]], [[INS1]] +; CHECK-NEXT: v_movreld_b32_e32 [[VEC_ELT1]], 63 ; CHECK-NEXT: s_xor_b64 exec, exec, vcc ; CHECK: s_cbranch_execnz [[LOOP1]] -; 
CHECK: buffer_store_dwordx4 v{{\[}}[[MOVREL0]]: +; CHECK: buffer_store_dwordx4 v{{\[}}[[VEC_ELT0]]: ; CHECK: buffer_store_dword [[INS0]] define void @insert_vgpr_offset_multiple_in_block(<4 x i32> addrspace(1)* %out0, <4 x i32> addrspace(1)* %out1, i32 addrspace(1)* %in, <4 x i32> %vec0) #0 { @@ -394,13 +429,26 @@ ; FIXME: Should be able to fold zero input to movreld to inline imm? ; CHECK-LABEL: {{^}}multi_same_block: -; CHECK: s_load_dword [[ARG:s[0-9]+]] -; CHECK-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}} + +; CHECK-DAG: v_mov_b32_e32 v[[VEC0_ELT0:[0-9]+]], 0x41880000 +; CHECK-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x41900000 +; CHECK-DAG: v_mov_b32_e32 v[[VEC0_ELT2:[0-9]+]], 0x41980000 +; CHECK-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x41a00000 +; CHECK-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x41a80000 +; CHECK-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x41b00000 +; CHECK-DAG: s_load_dword [[ARG:s[0-9]+]] + ; CHECK-DAG: s_add_i32 m0, [[ARG]], -16 -; CHECK: v_movreld_b32_e32 v{{[0-9]+}}, [[ZERO]] +; CHECK: v_movreld_b32_e32 v[[VEC0_ELT0]], 4.0 +; CHECK-NOT: m0 -; CHECK: s_add_i32 m0, [[ARG]], -14 -; CHECK: v_movreld_b32_e32 v{{[0-9]+}}, v{{[0-9]+}} +; CHECK: v_mov_b32_e32 v[[VEC0_ELT2]], 0x4188cccd +; CHECK-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x4190cccd +; CHECK-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x4198cccd +; CHECK-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x41a0cccd +; CHECK-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x41a8cccd +; CHECK-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x41b0cccd +; CHECK: v_movreld_b32_e32 v[[VEC0_ELT2]], -4.0 ; CHECK: s_mov_b32 m0, -1 ; CHECK: ds_write_b32 @@ -409,9 +457,9 @@ define void @multi_same_block(i32 %arg) #0 { bb: %tmp1 = add i32 %arg, -16 - %tmp2 = insertelement <6 x float> , float 0.000000e+00, i32 %tmp1 + %tmp2 = insertelement <6 x float> , float 4.000000e+00, i32 %tmp1 %tmp3 = add i32 %arg, -16 - %tmp4 = insertelement <6 x float> , float 0x3FB99999A0000000, i32 %tmp3 + %tmp4 = insertelement <6 x float> , float -4.0, i32 %tmp3 %tmp5 = bitcast <6 x float> %tmp2 to <6 x i32> %tmp6 = extractelement <6 x i32> %tmp5, i32 1 %tmp7 = bitcast <6 x float> %tmp4 to <6 x i32> @@ -423,10 +471,10 @@ ; offset puts outside of superegister bounaries, so clamp to 1st element. 
; CHECK-LABEL: {{^}}extract_largest_inbounds_offset: -; CHECK: buffer_load_dwordx4 v{{\[}}[[LO_ELT:[0-9]+]]:[[HI_ELT:[0-9]+]]{{\]}} -; CHECK: s_load_dword [[IDX:s[0-9]+]] +; CHECK-DAG: buffer_load_dwordx4 v{{\[}}[[LO_ELT:[0-9]+]]:[[HI_ELT:[0-9]+]]{{\]}} +; CHECK-DAG: s_load_dword [[IDX:s[0-9]+]] ; CHECK: s_mov_b32 m0, [[IDX]] -; CHECK-NEXT: v_movrels_b32_e32 [[EXTRACT:v[0-9]+]], v[[HI_ELT]] +; CHECK: v_movrels_b32_e32 [[EXTRACT:v[0-9]+]], v[[HI_ELT]] ; CHECK: buffer_store_dword [[EXTRACT]] define void @extract_largest_inbounds_offset(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in, i32 %idx) { entry: @@ -437,11 +485,11 @@ ret void } -; CHECK-LABL: {{^}}extract_out_of_bounds_offset: -; CHECK: buffer_load_dwordx4 v{{\[}}[[LO_ELT:[0-9]+]]:[[HI_ELT:[0-9]+]]{{\]}} -; CHECK: s_load_dword [[IDX:s[0-9]+]] +; CHECK-LABEL: {{^}}extract_out_of_bounds_offset: +; CHECK-DAG: buffer_load_dwordx4 v{{\[}}[[LO_ELT:[0-9]+]]:[[HI_ELT:[0-9]+]]{{\]}} +; CHECK-DAG: s_load_dword [[IDX:s[0-9]+]] ; CHECK: s_add_i32 m0, [[IDX]], 4 -; CHECK-NEXT: v_movrels_b32_e32 [[EXTRACT:v[0-9]+]], v[[LO_ELT]] +; CHECK: v_movrels_b32_e32 [[EXTRACT:v[0-9]+]], v[[LO_ELT]] ; CHECK: buffer_store_dword [[EXTRACT]] define void @extract_out_of_bounds_offset(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in, i32 %idx) { entry: Index: test/CodeGen/AMDGPU/indirect-addressing-undef.mir =================================================================== --- test/CodeGen/AMDGPU/indirect-addressing-undef.mir +++ /dev/null @@ -1,327 +0,0 @@ -# RUN: llc -march=amdgcn -verify-machineinstrs -run-pass si-lower-control-flow -o /dev/null %s 2>&1 | FileCheck %s -# Getting an undef that is specifically a VGPR is tricky from IR - -# CHECK-LABEL: name: extract_undef_offset_vgpr{{$}} -# CHECK: bb.1: -# CHECK: successors: %bb.2(0x40000000 / 0x80000000 = 50.00%), %bb.1(0x40000000 / 0x80000000 = 50.00%) -# CHECK: liveins: %vgpr0_vgpr1_vgpr2_vgpr3{{$}} - -# CHECK: V_READFIRSTLANE_B32 undef %vgpr10, implicit %exec -# CHECK: %vgpr0 = V_MOVRELS_B32_e32 %vgpr0, implicit %m0, implicit %exec, implicit %vgpr0_vgpr1_vgpr2_vgpr3 -# CHECK: S_CBRANCH_EXECNZ %bb.1, implicit %exec - -# CHECK: bb.2: -# CHECK: liveins: %sgpr6_sgpr7, %sgpr4_sgpr5_sgpr6_sgpr7, %sgpr4, %sgpr5, %sgpr6, %sgpr7, %sgpr4_sgpr5, %vgpr0_vgpr1_vgpr2_vgpr3, %vgpr0, %vgpr1, %vgpr2, %vgpr3, %vgpr0_vgpr1, %vgpr2_vgpr3, %vgpr0_vgpr1_vgpr2, %vgpr1_vgpr2, %vgpr1_vgpr2_vgpr3, %sgpr0_sgpr1, %sgpr0, %sgpr1{{$}} - - ---- | - target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64" - - define void @extract_undef_offset_vgpr(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { - entry: - %ld = load volatile <4 x i32>, <4 x i32> addrspace(1)* %in - %value = extractelement <4 x i32> %ld, i32 undef - store i32 %value, i32 addrspace(1)* %out - ret void - } - - define void @extract_undef_neg_offset_vgpr(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { - entry: - %ld = load volatile <4 x i32>, <4 x i32> addrspace(1)* %in - %value = extractelement <4 x i32> %ld, i32 undef - store i32 %value, i32 addrspace(1)* %out - ret void - } - - define void @insert_undef_offset_vgpr(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { - entry: - %ld = load <4 x i32>, <4 x i32> addrspace(1)* %in - %value = insertelement <4 x i32> %ld, i32 5, i32 undef - store <4 x i32> %value, <4 x i32> addrspace(1)* %out - ret void - } - - define void @insert_undef_neg_offset_vgpr(<4 x i32> addrspace(1)* %out, <4 
x i32> addrspace(1)* %in) { - entry: - %ld = load <4 x i32>, <4 x i32> addrspace(1)* %in - %value = insertelement <4 x i32> %ld, i32 5, i32 undef - store <4 x i32> %value, <4 x i32> addrspace(1)* %out - ret void - } - - define void @insert_undef_value_offset_vgpr(<4 x i32> addrspace(1)*%out, <4 x i32> addrspace(1)* %in, i32 %idx) { - entry: - %ld = load <4 x i32>, <4 x i32> addrspace(1)* %in - %value = insertelement <4 x i32> %ld, i32 undef, i32 %idx - store <4 x i32> %value, <4 x i32> addrspace(1)* %out - ret void - } - -... ---- -name: extract_undef_offset_vgpr -alignment: 0 -exposesReturnsTwice: false -hasInlineAsm: false -allVRegsAllocated: true -isSSA: false -tracksRegLiveness: true -tracksSubRegLiveness: true -liveins: - - { reg: '%sgpr0_sgpr1' } -frameInfo: - isFrameAddressTaken: false - isReturnAddressTaken: false - hasStackMap: false - hasPatchPoint: false - stackSize: 0 - offsetAdjustment: 0 - maxAlignment: 0 - adjustsStack: false - hasCalls: false - maxCallFrameSize: 0 - hasOpaqueSPAdjustment: false - hasVAStart: false - hasMustTailInVarArgFunc: false -body: | - bb.0.entry: - liveins: %sgpr0_sgpr1 - - %sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM %sgpr0_sgpr1, 11 - %sgpr7 = S_MOV_B32 61440 - %sgpr6 = S_MOV_B32 -1 - S_WAITCNT 127 - %vgpr0_vgpr1_vgpr2_vgpr3 = BUFFER_LOAD_DWORDX4_OFFSET %sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, implicit %exec - %sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM killed %sgpr0_sgpr1, 9 - S_WAITCNT 3952 - %vgpr0, dead %sgpr0_sgpr1 = SI_INDIRECT_SRC_V4 killed %vgpr0_vgpr1_vgpr2_vgpr3, undef %vgpr10, 0, implicit-def dead %exec, implicit-def dead %vcc, implicit-def dead %m0, implicit %exec - S_WAITCNT 127 - BUFFER_STORE_DWORD_OFFSET killed %vgpr0, killed %sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, implicit %exec - S_ENDPGM - -... - -# CHECK-LABEL: name: extract_undef_neg_offset_vgpr{{$}} -# CHECK: bb.1: -# CHECK: successors: %bb.2(0x40000000 / 0x80000000 = 50.00%), %bb.1(0x40000000 / 0x80000000 = 50.00%) -# CHECK: liveins: %vgpr0_vgpr1_vgpr2_vgpr3{{$}} - -# CHECK: %vcc_lo = V_READFIRSTLANE_B32 undef %vgpr10, implicit %exec -# CHECK: %m0 = S_MOV_B32 %vcc_lo -# CHECK: %m0 = S_ADD_I32 %m0, -7, implicit-def %scc -# CHECK: %vgpr0 = V_MOVRELS_B32_e32 %vgpr0, implicit %m0, implicit %exec, implicit %vgpr0_vgpr1_vgpr2_vgpr3 -# CHECK: S_CBRANCH_EXECNZ %bb.1, implicit %exec - -# CHECK: bb.2: -# CHECK: liveins: %sgpr6_sgpr7, %sgpr4_sgpr5_sgpr6_sgpr7, %sgpr4, %sgpr5, %sgpr6, %sgpr7, %sgpr4_sgpr5, %vgpr0_vgpr1_vgpr2_vgpr3, %vgpr0, %vgpr1, %vgpr2, %vgpr3, %vgpr0_vgpr1, %vgpr2_vgpr3, %vgpr0_vgpr1_vgpr2, %vgpr1_vgpr2, %vgpr1_vgpr2_vgpr3, %sgpr0_sgpr1, %sgpr0, %sgpr1 - -name: extract_undef_neg_offset_vgpr -alignment: 0 -exposesReturnsTwice: false -hasInlineAsm: false -allVRegsAllocated: true -isSSA: false -tracksRegLiveness: true -tracksSubRegLiveness: true -liveins: - - { reg: '%sgpr0_sgpr1' } -frameInfo: - isFrameAddressTaken: false - isReturnAddressTaken: false - hasStackMap: false - hasPatchPoint: false - stackSize: 0 - offsetAdjustment: 0 - maxAlignment: 0 - adjustsStack: false - hasCalls: false - maxCallFrameSize: 0 - hasOpaqueSPAdjustment: false - hasVAStart: false - hasMustTailInVarArgFunc: false -body: | - bb.0.entry: - liveins: %sgpr0_sgpr1 - - %sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM %sgpr0_sgpr1, 11 - %sgpr7 = S_MOV_B32 61440 - %sgpr6 = S_MOV_B32 -1 - S_WAITCNT 127 - %vgpr0_vgpr1_vgpr2_vgpr3 = BUFFER_LOAD_DWORDX4_OFFSET %sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, implicit %exec - %sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM killed %sgpr0_sgpr1, 9 - S_WAITCNT 3952 - %vgpr0, dead %sgpr0_sgpr1 = SI_INDIRECT_SRC_V4 
killed %vgpr0_vgpr1_vgpr2_vgpr3, undef %vgpr10, -7, implicit-def dead %exec, implicit-def dead %vcc, implicit-def dead %m0, implicit %exec - S_WAITCNT 127 - BUFFER_STORE_DWORD_OFFSET killed %vgpr0, killed %sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, implicit %exec - S_ENDPGM - -... - -# CHECK-LABEL: name: insert_undef_offset_vgpr{{$}} -# CHECK: bb.1: -# CHECK: successors: %bb.2(0x40000000 / 0x80000000 = 50.00%), %bb.1(0x40000000 / 0x80000000 = 50.00%) -# CHECK: liveins: %vgpr4, %vgpr0_vgpr1_vgpr2_vgpr3{{$}} - -# CHECK: %vcc_lo = V_READFIRSTLANE_B32 undef %vgpr10, implicit %exec -# CHECK: %m0 = S_MOV_B32 %vcc_lo -# CHECK: %vgpr0 = V_MOVRELD_B32_e32 %vgpr4, implicit %m0, implicit %exec, implicit %vgpr0_vgpr1_vgpr2_vgpr3 -# CHECK: S_CBRANCH_EXECNZ %bb.1, implicit %exec - -# CHECK: bb.2: -# CHECK: liveins: %sgpr6_sgpr7, %sgpr7, %sgpr4_sgpr5, %sgpr5, %sgpr4_sgpr5_sgpr6_sgpr7, %sgpr6, %sgpr4, %vgpr0_vgpr1_vgpr2_vgpr3, %vgpr0, %vgpr1, %vgpr2, %vgpr3, %vgpr0_vgpr1, %vgpr2_vgpr3, %vgpr0_vgpr1_vgpr2, %vgpr1_vgpr2, %vgpr1_vgpr2_vgpr3, %vgpr4, %sgpr0_sgpr1, %sgpr0, %sgpr1 - -name: insert_undef_offset_vgpr -alignment: 0 -exposesReturnsTwice: false -hasInlineAsm: false -allVRegsAllocated: true -isSSA: false -tracksRegLiveness: true -tracksSubRegLiveness: true -liveins: - - { reg: '%sgpr0_sgpr1' } -frameInfo: - isFrameAddressTaken: false - isReturnAddressTaken: false - hasStackMap: false - hasPatchPoint: false - stackSize: 0 - offsetAdjustment: 0 - maxAlignment: 0 - adjustsStack: false - hasCalls: false - maxCallFrameSize: 0 - hasOpaqueSPAdjustment: false - hasVAStart: false - hasMustTailInVarArgFunc: false -body: | - bb.0.entry: - liveins: %sgpr0_sgpr1 - - %sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM %sgpr0_sgpr1, 11 :: (non-temporal invariant load 8 from `i64 addrspace(2)* undef`) - %sgpr7 = S_MOV_B32 61440 - %sgpr6 = S_MOV_B32 -1 - %vgpr4 = V_MOV_B32_e32 5, implicit %exec - S_WAITCNT 127 - %vgpr0_vgpr1_vgpr2_vgpr3 = BUFFER_LOAD_DWORDX4_OFFSET %sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, implicit %exec :: (load 16 from %ir.in) - %sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM killed %sgpr0_sgpr1, 9 :: (non-temporal invariant load 8 from `i64 addrspace(2)* undef`) - S_WAITCNT 3952 - %vgpr0_vgpr1_vgpr2_vgpr3, dead %sgpr0_sgpr1 = SI_INDIRECT_DST_V4 %vgpr0_vgpr1_vgpr2_vgpr3, undef %vgpr10, 0, killed %vgpr4, implicit-def dead %exec, implicit-def dead %vcc, implicit-def dead %m0, implicit %exec - S_WAITCNT 127 - BUFFER_STORE_DWORDX4_OFFSET killed %vgpr0_vgpr1_vgpr2_vgpr3, killed %sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, implicit %exec :: (store 16 into %ir.out) - S_ENDPGM - -... 
- -# CHECK-LABEL: name: insert_undef_neg_offset_vgpr{{$}} -# CHECK: bb.1: -# CHECK: successors: %bb.2(0x40000000 / 0x80000000 = 50.00%), %bb.1(0x40000000 / 0x80000000 = 50.00%) -# CHECK: liveins: %vgpr4, %vgpr0_vgpr1_vgpr2_vgpr3{{$}} - -# CHECK: %vcc_lo = V_READFIRSTLANE_B32 undef %vgpr10, implicit %exec -# CHECK: %m0 = S_MOV_B32 %vcc_lo -# CHECK: %m0 = S_ADD_I32 %m0, -7, implicit-def %scc -# CHECK: %vgpr0 = V_MOVRELD_B32_e32 %vgpr4, implicit %m0, implicit %exec, implicit %vgpr0_vgpr1_vgpr2_vgpr3 -# CHECK: S_CBRANCH_EXECNZ %bb.1, implicit %exec - -# CHECK: bb.2: -# CHECK: liveins: %sgpr6_sgpr7, %sgpr7, %sgpr4_sgpr5, %sgpr5, %sgpr4_sgpr5_sgpr6_sgpr7, %sgpr6, %sgpr4, %vgpr0_vgpr1_vgpr2_vgpr3, %vgpr0, %vgpr1, %vgpr2, %vgpr3, %vgpr0_vgpr1, %vgpr2_vgpr3, %vgpr0_vgpr1_vgpr2, %vgpr1_vgpr2, %vgpr1_vgpr2_vgpr3, %vgpr4, %sgpr0_sgpr1, %sgpr0, %sgpr1{{$}} - -name: insert_undef_neg_offset_vgpr -alignment: 0 -exposesReturnsTwice: false -hasInlineAsm: false -allVRegsAllocated: true -isSSA: false -tracksRegLiveness: true -tracksSubRegLiveness: true -liveins: - - { reg: '%sgpr0_sgpr1' } -frameInfo: - isFrameAddressTaken: false - isReturnAddressTaken: false - hasStackMap: false - hasPatchPoint: false - stackSize: 0 - offsetAdjustment: 0 - maxAlignment: 0 - adjustsStack: false - hasCalls: false - maxCallFrameSize: 0 - hasOpaqueSPAdjustment: false - hasVAStart: false - hasMustTailInVarArgFunc: false -body: | - bb.0.entry: - liveins: %sgpr0_sgpr1 - - %sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM %sgpr0_sgpr1, 11 :: (non-temporal invariant load 8 from `i64 addrspace(2)* undef`) - %sgpr7 = S_MOV_B32 61440 - %sgpr6 = S_MOV_B32 -1 - %vgpr4 = V_MOV_B32_e32 5, implicit %exec - S_WAITCNT 127 - %vgpr0_vgpr1_vgpr2_vgpr3 = BUFFER_LOAD_DWORDX4_OFFSET %sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, implicit %exec :: (load 16 from %ir.in) - %sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM killed %sgpr0_sgpr1, 9 :: (non-temporal invariant load 8 from `i64 addrspace(2)* undef`) - S_WAITCNT 3952 - %vgpr0_vgpr1_vgpr2_vgpr3, dead %sgpr0_sgpr1 = SI_INDIRECT_DST_V4 %vgpr0_vgpr1_vgpr2_vgpr3, undef %vgpr10, -7, killed %vgpr4, implicit-def dead %exec, implicit-def dead %vcc, implicit-def dead %m0, implicit %exec - S_WAITCNT 127 - BUFFER_STORE_DWORDX4_OFFSET killed %vgpr0_vgpr1_vgpr2_vgpr3, killed %sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, implicit %exec :: (store 16 into %ir.out) - S_ENDPGM - -... 
- -# CHECK-LABEL: insert_undef_value_offset_vgpr{{$}} -# CHECK: bb.1: -# CHECK: successors: %bb.2(0x40000000 / 0x80000000 = 50.00%), %bb.1(0x40000000 / 0x80000000 = 50.00%) -# CHECK: liveins: %vgpr4, %vgpr0_vgpr1_vgpr2_vgpr3{{$}} - -# CHECK: %vcc_lo = V_READFIRSTLANE_B32 %vgpr4, implicit %exec -# CHECK: %m0 = S_MOV_B32 %vcc_lo -# CHECK: %vgpr0 = V_MOVRELD_B32_e32 undef %vgpr10, implicit %m0, implicit %exec, implicit %vgpr0_vgpr1_vgpr2_vgpr3 -# CHECK: S_CBRANCH_EXECNZ %bb.1, implicit %exec - -# CHECK: bb.2: -# CHECK: liveins: %sgpr6_sgpr7, %sgpr7, %sgpr4_sgpr5, %sgpr5, %sgpr4_sgpr5_sgpr6_sgpr7, %sgpr6, %sgpr4, %vgpr0_vgpr1_vgpr2_vgpr3, %vgpr0, %vgpr1, %vgpr2, %vgpr3, %vgpr0_vgpr1, %vgpr2_vgpr3, %vgpr0_vgpr1_vgpr2, %vgpr1_vgpr2, %vgpr1_vgpr2_vgpr3, %vgpr4, %sgpr0_sgpr1, %sgpr0, %sgpr1{{$}} - -name: insert_undef_value_offset_vgpr -alignment: 0 -exposesReturnsTwice: false -hasInlineAsm: false -allVRegsAllocated: true -isSSA: false -tracksRegLiveness: true -tracksSubRegLiveness: true -liveins: - - { reg: '%sgpr0_sgpr1' } -frameInfo: - isFrameAddressTaken: false - isReturnAddressTaken: false - hasStackMap: false - hasPatchPoint: false - stackSize: 0 - offsetAdjustment: 0 - maxAlignment: 0 - adjustsStack: false - hasCalls: false - maxCallFrameSize: 0 - hasOpaqueSPAdjustment: false - hasVAStart: false - hasMustTailInVarArgFunc: false -body: | - bb.0.entry: - liveins: %sgpr0_sgpr1 - - %sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM %sgpr0_sgpr1, 11 :: (non-temporal invariant load 8 from `i64 addrspace(2)* undef`) - %sgpr7 = S_MOV_B32 61440 - %sgpr6 = S_MOV_B32 -1 - %vgpr4 = V_MOV_B32_e32 2, implicit %exec - S_WAITCNT 127 - %vgpr0_vgpr1_vgpr2_vgpr3 = BUFFER_LOAD_DWORDX4_OFFSET %sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, implicit %exec :: (load 16 from %ir.in) - %sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM killed %sgpr0_sgpr1, 9 :: (non-temporal invariant load 8 from `i64 addrspace(2)* undef`) - S_WAITCNT 3952 - %vgpr0_vgpr1_vgpr2_vgpr3, dead %sgpr0_sgpr1 = SI_INDIRECT_DST_V4 %vgpr0_vgpr1_vgpr2_vgpr3, killed %vgpr4, 0, undef %vgpr10, implicit-def dead %exec, implicit-def dead %vcc, implicit-def dead %m0, implicit %exec - S_WAITCNT 127 - BUFFER_STORE_DWORDX4_OFFSET killed %vgpr0_vgpr1_vgpr2_vgpr3, killed %sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, implicit %exec :: (store 16 into %ir.out) - S_ENDPGM - -... 
Index: test/CodeGen/AMDGPU/insert_vector_elt.ll =================================================================== --- test/CodeGen/AMDGPU/insert_vector_elt.ll +++ test/CodeGen/AMDGPU/insert_vector_elt.ll @@ -145,8 +145,7 @@ } ; GCN-LABEL: {{^}}dynamic_insertelement_v3i32: -; GCN: v_mov_b32_e32 [[CONST:v[0-9]+]], 5 -; GCN: v_movreld_b32_e32 v[[LOW_RESULT_REG:[0-9]+]], [[CONST]] +; GCN: v_movreld_b32_e32 v[[LOW_RESULT_REG:[0-9]+]], 5 ; GCN-DAG: buffer_store_dwordx2 {{v\[}}[[LOW_RESULT_REG]]: ; GCN-DAG: buffer_store_dword v define void @dynamic_insertelement_v3i32(<3 x i32> addrspace(1)* %out, <3 x i32> %a, i32 %b) nounwind { @@ -156,10 +155,12 @@ } ; GCN-LABEL: {{^}}dynamic_insertelement_v4i32: -; GCN: v_movreld_b32 +; GCN: s_load_dword [[SVAL:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0x12|0x48}} +; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[SVAL]] +; GCN: v_movreld_b32_e32 v{{[0-9]+}}, [[VVAL]] ; GCN: buffer_store_dwordx4 -define void @dynamic_insertelement_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, i32 %b) nounwind { - %vecins = insertelement <4 x i32> %a, i32 5, i32 %b +define void @dynamic_insertelement_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, i32 %b, i32 %val) nounwind { + %vecins = insertelement <4 x i32> %a, i32 %val, i32 %b store <4 x i32> %vecins, <4 x i32> addrspace(1)* %out, align 16 ret void } @@ -332,25 +333,25 @@ } ; GCN-LABEL: {{^}}dynamic_insertelement_v2f64: -; GCN: s_load_dword [[IDX:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0x11|0x44}}{{$}} +; GCN-DAG: s_load_dwordx4 s{{\[}}[[A_ELT0:[0-9]+]]:[[A_ELT3:[0-9]+]]{{\]}} +; GCN-DAG: s_load_dword [[IDX:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0x11|0x44}}{{$}} + ; GCN-DAG: s_lshl_b32 [[SCALEDIDX:s[0-9]+]], [[IDX]], 1{{$}} -; GCN-DAG: v_mov_b32_e32 [[ELT0:v[0-9]+]], 0{{$}} ; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}} ; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}} ; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}} ; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}} +; GCN-DAG: v_mov_b32_e32 [[ELT1:v[0-9]+]], 0x40200000 ; GCN: s_mov_b32 m0, [[SCALEDIDX]] -; GCN: v_movreld_b32_e32 v{{[0-9]+}}, [[ELT0]] +; GCN: v_movreld_b32_e32 v{{[0-9]+}}, 0 -; Increment to next element. -; FIXME: Should be able to manipulate m0 directly instead of add and -; copy. +; Increment to next element folded into base register, but FileCheck +; can't do math expressions + +; FIXME: Should be able to manipulate m0 directly instead of s_lshl_b32 + copy to m0 -; FIXME: Should avoid resetting m0 to same value -; GCN-DAG: v_mov_b32_e32 [[ELT1:v[0-9]+]], 0x40200000 -; GCN-DAG: s_mov_b32 m0, [[SCALEDIDX]] ; GCN: v_movreld_b32_e32 v{{[0-9]+}}, [[ELT1]] ; GCN: buffer_store_dwordx4 @@ -361,14 +362,10 @@ ret void } -; FIXME: Inline immediate should be folded into v_movreld_b32. ; GCN-LABEL: {{^}}dynamic_insertelement_v2i64: -; GCN-DAG: v_mov_b32_e32 [[ELT0:v[0-9]+]], 5{{$}} -; GCN-DAG: v_mov_b32_e32 [[ELT1:v[0-9]+]], 0{{$}} - -; GCN-DAG: v_movreld_b32_e32 v{{[0-9]+}}, [[ELT0]] -; GCN-DAG: v_movreld_b32_e32 v{{[0-9]+}}, [[ELT1]] +; GCN-DAG: v_movreld_b32_e32 v{{[0-9]+}}, 5 +; GCN-DAG: v_movreld_b32_e32 v{{[0-9]+}}, 0 ; GCN: buffer_store_dwordx4 ; GCN: s_endpgm
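
Reviewer note (not part of the patch): with SI_INDIRECT_SRC/DST now expanded by a custom inserter before register allocation, a divergent (VGPR) index produces the waterfall loop built in emitLoadM0FromVGPRLoop. The following is a minimal pseudo-MIR sketch of that loop; the virtual register names (%idx, %vec, %result, ...) are illustrative placeholders, not names taken from the patch.

  bb.loop:
    ; PHIs carry the partial result and the loop's exec bookkeeping
    %phi      = PHI %init, %bb.entry, %result, %bb.loop
    %phi_exec = PHI %init_exec, %bb.entry, %new_exec, %bb.loop
    ; read the index of the first still-active lane
    %cur_idx  = V_READFIRSTLANE_B32 %idx, implicit %exec
    ; select every lane whose index matches it
    %cond     = V_CMP_EQ_U32_e64 %cur_idx, %idx, implicit %exec
    %m0       = S_MOV_B32 %cur_idx              ; or S_ADD_I32 %cur_idx, offset
    %new_exec = S_AND_SAVEEXEC_B64 %cond, implicit-def %exec, implicit %exec
    ; the actual indirect move for the selected lanes
    %result   = V_MOVRELS_B32_e32 undef %vec.sub0, implicit %m0, implicit %exec, implicit %vec
    ; clear the handled lanes and loop while any remain
    %exec     = S_XOR_B64 %exec, %new_exec, implicit-def %scc
    S_CBRANCH_EXECNZ %bb.loop, implicit %exec

In the worst case this iterates once per lane (64 times); an index that is uniform but happens to live in a VGPR takes a single iteration, as the comment above emitLoadM0FromVGPRLoop notes.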
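
A second reviewer note, again a sketch rather than code from the patch: v_movreld_b32 writes into whichever lane of a wider register tuple m0 selects, so the VOP_MOVRELD profile turns the vdst operand into a source that only supplies the base register, and the custom inserter appends a tied implicit def/use of the whole vector (per the comment in SIInstructions.td, TableGen offers no way to declare an implicit def of a virtual super-register). The verifier change in SIInstrInfo.cpp checks exactly this shape. With illustrative names, the resulting MachineInstr looks roughly like:

  ; vdst names element 0 of the tuple but is only a base for the m0-relative write;
  ; the trailing implicit-def / implicit pair is tied and models the whole-vector update.
  V_MOVRELD_B32_e32 undef %vec.sub0, %val,
      implicit %m0, implicit %exec,
      implicit-def %vec_out, implicit %vec      ; %vec_out tied to %vec

The tie forces the register allocator to place %vec_out and %vec in the same tuple, and it is what isSubRegOf / verifyInstruction rely on when they require src0 (or vdst) to be a subregister of the implicit vector operand.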