diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp
--- a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp
@@ -74,11 +74,11 @@
   assert(Reg.isVirtual());
   const auto RC = MRI.getRegClass(Reg);
   auto STI = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
-  return STI->isSGPRClass(RC) ?
-    (STI->getRegSizeInBits(*RC) == 32 ? SGPR32 : SGPR_TUPLE) :
-    STI->hasAGPRs(RC) ?
-      (STI->getRegSizeInBits(*RC) == 32 ? AGPR32 : AGPR_TUPLE) :
-      (STI->getRegSizeInBits(*RC) == 32 ? VGPR32 : VGPR_TUPLE);
+  return STI->isSGPRClass(RC)
+             ? (STI->getRegSizeInBits(*RC) == 32 ? SGPR32 : SGPR_TUPLE)
+             : STI->isAGPRClass(RC)
+                   ? (STI->getRegSizeInBits(*RC) == 32 ? AGPR32 : AGPR_TUPLE)
+                   : (STI->getRegSizeInBits(*RC) == 32 ? VGPR32 : VGPR_TUPLE);
 }
 
 void GCNRegPressure::inc(unsigned Reg,
diff --git a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
--- a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
@@ -259,7 +259,7 @@
   // VGPRz = REG_SEQUENCE VGPRx, sub0
 
   MI.getOperand(0).setReg(CopyUse.getOperand(0).getReg());
-  bool IsAGPR = TRI->hasAGPRs(DstRC);
+  bool IsAGPR = TRI->isAGPRClass(DstRC);
 
   for (unsigned I = 1, N = MI.getNumOperands(); I != N; I += 2) {
     Register SrcReg = MI.getOperand(I).getReg();
@@ -840,7 +840,7 @@
     Register PHIRes = MI.getOperand(0).getReg();
     const TargetRegisterClass *RC0 = MRI->getRegClass(PHIRes);
 
-    if (AllAGPRUses && numVGPRUses && !TRI->hasAGPRs(RC0)) {
+    if (AllAGPRUses && numVGPRUses && !TRI->isAGPRClass(RC0)) {
       LLVM_DEBUG(dbgs() << "Moving PHI to AGPR: " << MI);
       MRI->setRegClass(PHIRes, TRI->getEquivalentAGPRClass(RC0));
       for (unsigned I = 1, N = MI.getNumOperands(); I != N; I += 2) {
diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
--- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -726,6 +726,10 @@
   if (MovOp == AMDGPU::COPY)
     return;
 
+  // Use VGPR regclass if it is an AV class.
+  if (TRI->isVectorSuperClass(DestRC))
+    MRI->setRegClass(DestReg, TRI->getEquivalentVGPRClass(DestRC));
+
   UseMI->setDesc(TII->get(MovOp));
   MachineInstr::mop_iterator ImpOpI = UseMI->implicit_operands().begin();
   MachineInstr::mop_iterator ImpOpE = UseMI->implicit_operands().end();
@@ -1572,17 +1576,9 @@
   unsigned OpIdx = Op - &UseMI->getOperand(0);
   const MCInstrDesc &InstDesc = UseMI->getDesc();
-  const MCOperandInfo &OpInfo = InstDesc.OpInfo[OpIdx];
-  switch (OpInfo.RegClass) {
-  case AMDGPU::AV_32RegClassID: LLVM_FALLTHROUGH;
-  case AMDGPU::AV_64RegClassID: LLVM_FALLTHROUGH;
-  case AMDGPU::AV_96RegClassID: LLVM_FALLTHROUGH;
-  case AMDGPU::AV_128RegClassID: LLVM_FALLTHROUGH;
-  case AMDGPU::AV_160RegClassID:
-    break;
-  default:
+  if (!TRI->isVectorSuperClass(
+          TRI->getRegClass(InstDesc.OpInfo[OpIdx].RegClass)))
     return false;
-  }
 
   const auto *NewDstRC = TRI->getEquivalentAGPRClass(MRI->getRegClass(Reg));
   auto Dst = MRI->createVirtualRegister(NewDstRC);
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -11441,15 +11441,15 @@
     if (I == -1)
       break;
     MachineOperand &Op = MI.getOperand(I);
-    if ((OpInfo[I].RegClass != llvm::AMDGPU::AV_64RegClassID &&
-         OpInfo[I].RegClass != llvm::AMDGPU::AV_32RegClassID) ||
-        !Op.getReg().isVirtual() || !TRI->isAGPR(MRI, Op.getReg()))
+    if (!Op.isReg() || !Op.getReg().isVirtual())
+      continue;
+    auto *RC = TRI->getRegClassForReg(MRI, Op.getReg());
+    if (!TRI->hasAGPRs(RC))
       continue;
     auto *Src = MRI.getUniqueVRegDef(Op.getReg());
     if (!Src || !Src->isCopy() ||
         !TRI->isSGPRReg(MRI, Src->getOperand(1).getReg()))
       continue;
-    auto *RC = TRI->getRegClassForReg(MRI, Op.getReg());
     auto *NewRC = TRI->getEquivalentVGPRClass(RC);
     // All uses of agpr64 and agpr32 can also accept vgpr except for
     // v_accvgpr_read, but we do not produce agpr reads during selection,
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -893,10 +893,10 @@
   unsigned EltSize = 4;
   unsigned Opcode = AMDGPU::V_MOV_B32_e32;
-  if (RI.hasAGPRs(RC)) {
+  if (RI.isAGPRClass(RC)) {
     Opcode = (RI.hasVGPRs(SrcRC)) ? AMDGPU::V_ACCVGPR_WRITE_B32_e64
                                   : AMDGPU::INSTRUCTION_LIST_END;
-  } else if (RI.hasVGPRs(RC) && RI.hasAGPRs(SrcRC)) {
+  } else if (RI.hasVGPRs(RC) && RI.isAGPRClass(SrcRC)) {
     Opcode = AMDGPU::V_ACCVGPR_READ_B32_e64;
   } else if ((Size % 64 == 0) && RI.hasVGPRs(RC) &&
              (RI.isProperlyAlignedRC(*RC) &&
@@ -1200,7 +1200,7 @@
 
 unsigned SIInstrInfo::getMovOpcode(const TargetRegisterClass *DstRC) const {
-  if (RI.hasAGPRs(DstRC))
+  if (RI.isAGPRClass(DstRC))
     return AMDGPU::COPY;
   if (RI.getRegSizeInBits(*DstRC) == 32) {
     return RI.isSGPRClass(DstRC) ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
@@ -1458,8 +1458,8 @@
     return;
   }
 
-  unsigned Opcode = RI.hasAGPRs(RC) ? getAGPRSpillSaveOpcode(SpillSize)
-                                    : getVGPRSpillSaveOpcode(SpillSize);
+  unsigned Opcode = RI.isAGPRClass(RC) ? getAGPRSpillSaveOpcode(SpillSize)
+                                       : getVGPRSpillSaveOpcode(SpillSize);
   MFI->setHasSpilledVGPRs();
 
   BuildMI(MBB, MI, DL, get(Opcode))
@@ -1593,8 +1593,8 @@
     return;
   }
 
-  unsigned Opcode = RI.hasAGPRs(RC) ? getAGPRSpillRestoreOpcode(SpillSize)
-                                    : getVGPRSpillRestoreOpcode(SpillSize);
+  unsigned Opcode = RI.isAGPRClass(RC) ? getAGPRSpillRestoreOpcode(SpillSize)
+                                       : getVGPRSpillRestoreOpcode(SpillSize);
   BuildMI(MBB, MI, DL, get(Opcode), DestReg)
       .addFrameIndex(FrameIndex)           // vaddr
       .addReg(MFI->getStackPtrOffsetReg()) // scratch_offset
@@ -2786,26 +2786,29 @@
   if (Opc == AMDGPU::COPY) {
     Register DstReg = UseMI.getOperand(0).getReg();
     bool Is16Bit = getOpSize(UseMI, 0) == 2;
-    bool isVGPRCopy = RI.isVGPR(*MRI, DstReg);
-    unsigned NewOpc = isVGPRCopy ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32;
+    const TargetRegisterClass *RC = RI.getRegClassForReg(*MRI, DstReg);
+    bool IsVectorRegCopy = RI.hasVectorRegisters(RC);
+    unsigned NewOpc =
+        IsVectorRegCopy ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32;
     APInt Imm(32, ImmOp->getImm());
 
     if (UseMI.getOperand(1).getSubReg() == AMDGPU::hi16)
       Imm = Imm.ashr(16);
 
-    if (RI.isAGPR(*MRI, DstReg)) {
+    if (RI.isVectorSuperClass(RC)) {
+      MRI->setRegClass(DstReg, &AMDGPU::VGPR_32RegClass);
+    } else if (RI.isAGPRClass(RC)) {
       if (!isInlineConstant(Imm))
         return false;
       NewOpc = AMDGPU::V_ACCVGPR_WRITE_B32_e64;
     }
 
     if (Is16Bit) {
-      if (isVGPRCopy)
-        return false; // Do not clobber vgpr_hi16
+      if (RI.isVGPRClass(RC))
+        return false; // Do not clobber vgpr_hi16
 
-      if (DstReg.isVirtual() &&
-          UseMI.getOperand(0).getSubReg() != AMDGPU::lo16)
-        return false;
+      if (DstReg.isVirtual() && UseMI.getOperand(0).getSubReg() != AMDGPU::lo16)
+        return false;
 
       UseMI.getOperand(0).setSubReg(0);
       if (DstReg.isPhysical()) {
@@ -3877,9 +3880,7 @@
       // verification is broken anyway
       if (ST.needsAlignedVGPRs()) {
         const TargetRegisterClass *RC = RI.getRegClassForReg(MRI, Reg);
-        const bool IsVGPR = RI.hasVGPRs(RC);
-        const bool IsAGPR = !IsVGPR && RI.hasAGPRs(RC);
-        if ((IsVGPR || IsAGPR) && MO.getSubReg()) {
+        if (RI.hasVectorRegisters(RC) && MO.getSubReg()) {
          const TargetRegisterClass *SubRC =
              RI.getSubRegClass(RC, MO.getSubReg());
          RC = RI.getCompatibleSubRegClass(RC, SubRC, MO.getSubReg());
@@ -4850,10 +4851,10 @@
   }
 
   // No VOP2 instructions support AGPRs.
-  if (Src0.isReg() && RI.isAGPR(MRI, Src0.getReg()))
+  if (Src0.isReg() && RI.hasAGPRs(RI.getRegClassForReg(MRI, Src0.getReg())))
     legalizeOpWithMove(MI, Src0Idx);
 
-  if (Src1.isReg() && RI.isAGPR(MRI, Src1.getReg()))
+  if (Src1.isReg() && RI.hasAGPRs(RI.getRegClassForReg(MRI, Src1.getReg())))
     legalizeOpWithMove(MI, Src1Idx);
 
   // VOP2 src0 instructions support all operand types, so we don't need to check
@@ -5504,13 +5505,13 @@
       if (getOpRegClass(MI, 0) == &AMDGPU::VReg_1RegClass) {
         VRC = &AMDGPU::VReg_1RegClass;
       } else
-        VRC = RI.hasAGPRs(getOpRegClass(MI, 0))
+        VRC = RI.isAGPRClass(getOpRegClass(MI, 0))
                   ? RI.getEquivalentAGPRClass(SRC)
                   : RI.getEquivalentVGPRClass(SRC);
     } else {
-      VRC = RI.hasAGPRs(getOpRegClass(MI, 0))
-                ? RI.getEquivalentAGPRClass(VRC)
-                : RI.getEquivalentVGPRClass(VRC);
+      VRC = RI.isAGPRClass(getOpRegClass(MI, 0))
+                ? RI.getEquivalentAGPRClass(VRC)
+                : RI.getEquivalentVGPRClass(VRC);
     }
     RC = VRC;
   } else {
@@ -7047,8 +7048,8 @@
   case AMDGPU::STRICT_WWM:
   case AMDGPU::STRICT_WQM: {
     const TargetRegisterClass *SrcRC = getOpRegClass(Inst, 1);
-    if (RI.hasAGPRs(SrcRC)) {
-      if (RI.hasAGPRs(NewDstRC))
+    if (RI.isAGPRClass(SrcRC)) {
+      if (RI.isAGPRClass(NewDstRC))
         return nullptr;
 
       switch (Inst.getOpcode()) {
@@ -7064,7 +7065,7 @@
       if (!NewDstRC)
         return nullptr;
     } else {
-      if (RI.hasVGPRs(NewDstRC) || NewDstRC == &AMDGPU::VReg_1RegClass)
+      if (RI.isVGPRClass(NewDstRC) || NewDstRC == &AMDGPU::VReg_1RegClass)
         return nullptr;
 
       NewDstRC = RI.getEquivalentVGPRClass(NewDstRC);
diff --git a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
--- a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
+++ b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
@@ -1609,7 +1609,7 @@
   }
 
   unsigned BitWidth = 32 * (CI.Width + Paired.Width);
-  return TRI->hasAGPRs(getDataRegClass(*CI.I))
+  return TRI->isAGPRClass(getDataRegClass(*CI.I))
             ? TRI->getAGPRClassForBitWidth(BitWidth)
             : TRI->getVGPRClassForBitWidth(BitWidth);
 }
diff --git a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
--- a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
+++ b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
@@ -1170,7 +1170,7 @@
     unsigned I = MI.getOperandNo(&Op);
     if (Desc.OpInfo[I].RegClass == -1 ||
-        !TRI->hasVGPRs(TRI->getRegClass(Desc.OpInfo[I].RegClass)))
+        !TRI->isVGPRClass(TRI->getRegClass(Desc.OpInfo[I].RegClass)))
       continue;
 
     if (ST.hasSDWAScalar() && ConstantBusCount == 0 && Op.isReg() &&
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
@@ -149,6 +149,10 @@
   LLVM_READONLY
   const TargetRegisterClass *getAGPRClassForBitWidth(unsigned BitWidth) const;
 
+  LLVM_READONLY
+  const TargetRegisterClass *
+  getVectorSuperClassForBitWidth(unsigned BitWidth) const;
+
   LLVM_READONLY
   static const TargetRegisterClass *getSGPRClassForBitWidth(unsigned BitWidth);
 
@@ -178,6 +182,11 @@
     return hasAGPRs(RC) && !hasVGPRs(RC);
   }
 
+  /// \returns true only if this class contains both VGPR and AGPR registers
+  bool isVectorSuperClass(const TargetRegisterClass *RC) const {
+    return hasVGPRs(RC) && hasAGPRs(RC);
+  }
+
   /// \returns true if this class contains VGPR registers.
   bool hasVGPRs(const TargetRegisterClass *RC) const;
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -1066,7 +1066,7 @@
   const TargetRegisterClass *RC = getRegClassForReg(MF->getRegInfo(), ValueReg);
 
   // On gfx90a+ AGPR is a regular VGPR acceptable for loads and stores.
-  const bool IsAGPR = !ST.hasGFX90AInsts() && hasAGPRs(RC);
+  const bool IsAGPR = !ST.hasGFX90AInsts() && isAGPRClass(RC);
   const unsigned RegWidth = AMDGPU::getRegBitWidth(RC->getID()) / 8;
 
   // Always use 4 byte operations for AGPRs because we need to scavenge
@@ -2057,6 +2057,65 @@
                                 : getAnyAGPRClassForBitWidth(BitWidth);
 }
 
+static const TargetRegisterClass *
+getAnyVectorSuperClassForBitWidth(unsigned BitWidth) {
+  if (BitWidth <= 64)
+    return &AMDGPU::AV_64RegClass;
+  if (BitWidth <= 96)
+    return &AMDGPU::AV_96RegClass;
+  if (BitWidth <= 128)
+    return &AMDGPU::AV_128RegClass;
+  if (BitWidth <= 160)
+    return &AMDGPU::AV_160RegClass;
+  if (BitWidth <= 192)
+    return &AMDGPU::AV_192RegClass;
+  if (BitWidth <= 224)
+    return &AMDGPU::AV_224RegClass;
+  if (BitWidth <= 256)
+    return &AMDGPU::AV_256RegClass;
+  if (BitWidth <= 512)
+    return &AMDGPU::AV_512RegClass;
+  if (BitWidth <= 1024)
+    return &AMDGPU::AV_1024RegClass;
+
+  return nullptr;
+}
+
+static const TargetRegisterClass *
+getAlignedVectorSuperClassForBitWidth(unsigned BitWidth) {
+  if (BitWidth <= 64)
+    return &AMDGPU::AV_64_Align2RegClass;
+  if (BitWidth <= 96)
+    return &AMDGPU::AV_96_Align2RegClass;
+  if (BitWidth <= 128)
+    return &AMDGPU::AV_128_Align2RegClass;
+  if (BitWidth <= 160)
+    return &AMDGPU::AV_160_Align2RegClass;
+  if (BitWidth <= 192)
+    return &AMDGPU::AV_192_Align2RegClass;
+  if (BitWidth <= 224)
+    return &AMDGPU::AV_224_Align2RegClass;
+  if (BitWidth <= 256)
+    return &AMDGPU::AV_256_Align2RegClass;
+  if (BitWidth <= 512)
+    return &AMDGPU::AV_512_Align2RegClass;
+  if (BitWidth <= 1024)
+    return &AMDGPU::AV_1024_Align2RegClass;
+
+  return nullptr;
+}
+
+const TargetRegisterClass *
+SIRegisterInfo::getVectorSuperClassForBitWidth(unsigned BitWidth) const {
+  if (BitWidth <= 16)
+    return &AMDGPU::VGPR_LO16RegClass;
+  if (BitWidth <= 32)
+    return &AMDGPU::AV_32RegClass;
+  return ST.needsAlignedVGPRs()
+             ? getAlignedVectorSuperClassForBitWidth(BitWidth)
+             : getAnyVectorSuperClassForBitWidth(BitWidth);
+}
+
 const TargetRegisterClass *
 SIRegisterInfo::getSGPRClassForBitWidth(unsigned BitWidth) {
   if (BitWidth <= 16)
@@ -2207,15 +2266,14 @@
   // We can assume that each lane corresponds to one 32-bit register.
   unsigned Size = getNumChannelsFromSubReg(SubIdx) * 32;
-  if (isSGPRClass(RC)) {
-    if (Size == 32)
-      RC = &AMDGPU::SGPR_32RegClass;
-    else
-      RC = getSGPRClassForBitWidth(Size);
-  } else if (hasAGPRs(RC)) {
+  if (isAGPRClass(RC)) {
     RC = getAGPRClassForBitWidth(Size);
-  } else {
+  } else if (isVGPRClass(RC)) {
     RC = getVGPRClassForBitWidth(Size);
+  } else if (isVectorSuperClass(RC)) {
+    RC = getVectorSuperClassForBitWidth(Size);
+  } else {
+    RC = getSGPRClassForBitWidth(Size);
   }
   assert(RC && "Invalid sub-register class size");
   return RC;
@@ -2359,6 +2417,7 @@
   case AMDGPU::VGPR_32RegClassID:
   case AMDGPU::VGPR_LO16RegClassID:
   case AMDGPU::VGPR_HI16RegClassID:
+  case AMDGPU::AV_32RegClassID:
     return std::min(ST.getMaxNumVGPRs(Occupancy), ST.getMaxNumVGPRs(MF));
   case AMDGPU::SGPR_32RegClassID:
   case AMDGPU::SGPR_LO16RegClassID:
@@ -2369,7 +2428,8 @@
 unsigned SIRegisterInfo::getRegPressureSetLimit(const MachineFunction &MF,
                                                 unsigned Idx) const {
   if (Idx == AMDGPU::RegisterPressureSets::VGPR_32 ||
-      Idx == AMDGPU::RegisterPressureSets::AGPR_32)
+      Idx == AMDGPU::RegisterPressureSets::AGPR_32 ||
+      Idx == AMDGPU::RegisterPressureSets::AV_32)
     return getRegPressureLimit(&AMDGPU::VGPR_32RegClass,
                                const_cast<MachineFunction &>(MF));
 
@@ -2526,10 +2586,13 @@
   if (!ST.needsAlignedVGPRs())
     return true;
 
-  if (hasVGPRs(&RC))
+  if (isVGPRClass(&RC))
     return RC.hasSuperClassEq(getVGPRClassForBitWidth(getRegSizeInBits(RC)));
-  if (hasAGPRs(&RC))
+  if (isAGPRClass(&RC))
     return RC.hasSuperClassEq(getAGPRClassForBitWidth(getRegSizeInBits(RC)));
+  if (isVectorSuperClass(&RC))
+    return RC.hasSuperClassEq(
+        getVectorSuperClassForBitWidth(getRegSizeInBits(RC)));
 
   return true;
 }
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
@@ -853,38 +853,37 @@
     let isAllocatable = 0;
     let HasVGPR = 1;
   }
+} // End GeneratePressureSet = 0
 
-def AV_32 : SIRegisterClass<"AMDGPU", VGPR_32.RegTypes, 32,
-                            (add AGPR_32, VGPR_32)> {
-  let isAllocatable = 0;
-  let HasVGPR = 1;
-  let HasAGPR = 1;
-}
-
-def AV_64 : SIRegisterClass<"AMDGPU", VReg_64.RegTypes, 32,
-                            (add AReg_64, VReg_64)> {
-  let isAllocatable = 0;
+def AV_32 : SIRegisterClass<"AMDGPU", VGPR_32.RegTypes, 32, (add VGPR_32, AGPR_32)> {
   let HasVGPR = 1;
   let HasAGPR = 1;
 }
-} // End GeneratePressureSet = 0
 
-let HasVGPR = 1, HasAGPR = 1 in {
-def AV_96 : SIRegisterClass<"AMDGPU", VReg_96.RegTypes, 32,
-                            (add AReg_96, VReg_96)> {
-  let isAllocatable = 0;
-}
+// Define a register tuple class, along with one requiring an even
+// aligned base register.
+multiclass AVRegClass<int numRegs, list<ValueType> regTypes,
+                      dag vregList, dag aregList> {
+  let HasVGPR = 1, HasAGPR = 1 in {
+    // Define the regular class.
+    def "" : VRegClassBase<numRegs, regTypes, (add vregList, aregList)>;
 
-def AV_128 : SIRegisterClass<"AMDGPU", VReg_128.RegTypes, 32,
-                            (add AReg_128, VReg_128)> {
-  let isAllocatable = 0;
+    // Define 2-aligned variant
+    def _Align2 : VRegClassBase<numRegs, regTypes,
+                                (add (decimate vregList, 2),
+                                     (decimate aregList, 2))>;
+  }
 }
 
-def AV_160 : SIRegisterClass<"AMDGPU", VReg_160.RegTypes, 32,
-                            (add AReg_160, VReg_160)> {
-  let isAllocatable = 0;
-}
-} // End HasVGPR = 1, HasAGPR = 1
+defm AV_64 : AVRegClass<2, VReg_64.RegTypes, (add VGPR_64), (add AGPR_64)>;
+defm AV_96 : AVRegClass<3, VReg_96.RegTypes, (add VGPR_96), (add AGPR_96)>;
+defm AV_128 : AVRegClass<4, VReg_128.RegTypes, (add VGPR_128), (add AGPR_128)>;
+defm AV_160 : AVRegClass<5, VReg_160.RegTypes, (add VGPR_160), (add AGPR_160)>;
+defm AV_192 : AVRegClass<6, VReg_160.RegTypes, (add VGPR_192), (add AGPR_192)>;
+defm AV_224 : AVRegClass<7, VReg_160.RegTypes, (add VGPR_224), (add AGPR_224)>;
+defm AV_256 : AVRegClass<8, VReg_160.RegTypes, (add VGPR_256), (add AGPR_256)>;
+defm AV_512 : AVRegClass<16, VReg_160.RegTypes, (add VGPR_512), (add AGPR_512)>;
+defm AV_1024 : AVRegClass<32, VReg_160.RegTypes, (add VGPR_1024), (add AGPR_1024)>;
 
 //===----------------------------------------------------------------------===//
 // Register operands
diff --git a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
--- a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
+++ b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
@@ -1427,7 +1427,7 @@
   const Register Reg = MI->getOperand(0).getReg();
   const unsigned SubReg = MI->getOperand(0).getSubReg();
 
-  if (TRI->isVGPR(*MRI, Reg)) {
+  if (TRI->hasVGPRs(TRI->getRegClassForReg(*MRI, Reg))) {
     const TargetRegisterClass *regClass = Reg.isVirtual()
         ? MRI->getRegClass(Reg) : TRI->getPhysRegClass(Reg);
     if (SubReg)
@@ -1436,6 +1436,10 @@
     const unsigned MovOp = TII->getMovOpcode(regClass);
     MI->setDesc(TII->get(MovOp));
 
+    // Use VGPR regclass if it is an AV class.
+    if (Reg.isVirtual() && TRI->isVectorSuperClass(regClass))
+      MRI->setRegClass(Reg, TRI->getEquivalentVGPRClass(regClass));
+
     // Check that it already implicitly depends on exec (like all VALU movs
     // should do).
     assert(any_of(MI->implicit_operands(), [](const MachineOperand &MO) {
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -1622,13 +1622,14 @@
     return 32;
   case AMDGPU::SGPR_64RegClassID:
   case AMDGPU::VS_64RegClassID:
-  case AMDGPU::AV_64RegClassID:
   case AMDGPU::SReg_64RegClassID:
   case AMDGPU::VReg_64RegClassID:
   case AMDGPU::AReg_64RegClassID:
   case AMDGPU::SReg_64_XEXECRegClassID:
   case AMDGPU::VReg_64_Align2RegClassID:
   case AMDGPU::AReg_64_Align2RegClassID:
+  case AMDGPU::AV_64RegClassID:
+  case AMDGPU::AV_64_Align2RegClassID:
     return 64;
   case AMDGPU::SGPR_96RegClassID:
   case AMDGPU::SReg_96RegClassID:
@@ -1637,6 +1638,7 @@
   case AMDGPU::VReg_96_Align2RegClassID:
   case AMDGPU::AReg_96_Align2RegClassID:
   case AMDGPU::AV_96RegClassID:
+  case AMDGPU::AV_96_Align2RegClassID:
     return 96;
   case AMDGPU::SGPR_128RegClassID:
   case AMDGPU::SReg_128RegClassID:
@@ -1645,6 +1647,7 @@
   case AMDGPU::VReg_128_Align2RegClassID:
   case AMDGPU::AReg_128_Align2RegClassID:
   case AMDGPU::AV_128RegClassID:
+  case AMDGPU::AV_128_Align2RegClassID:
     return 128;
   case AMDGPU::SGPR_160RegClassID:
   case AMDGPU::SReg_160RegClassID:
@@ -1653,6 +1656,7 @@
   case AMDGPU::VReg_160_Align2RegClassID:
   case AMDGPU::AReg_160_Align2RegClassID:
   case AMDGPU::AV_160RegClassID:
+  case AMDGPU::AV_160_Align2RegClassID:
     return 160;
   case AMDGPU::SGPR_192RegClassID:
   case AMDGPU::SReg_192RegClassID:
@@ -1660,6 +1664,8 @@
   case AMDGPU::AReg_192RegClassID:
   case AMDGPU::VReg_192_Align2RegClassID:
   case AMDGPU::AReg_192_Align2RegClassID:
+  case AMDGPU::AV_192RegClassID:
+  case AMDGPU::AV_192_Align2RegClassID:
     return 192;
   case AMDGPU::SGPR_224RegClassID:
   case AMDGPU::SReg_224RegClassID:
@@ -1667,6 +1673,8 @@
   case AMDGPU::AReg_224RegClassID:
   case AMDGPU::VReg_224_Align2RegClassID:
   case AMDGPU::AReg_224_Align2RegClassID:
+  case AMDGPU::AV_224RegClassID:
+  case AMDGPU::AV_224_Align2RegClassID:
     return 224;
   case AMDGPU::SGPR_256RegClassID:
   case AMDGPU::SReg_256RegClassID:
@@ -1674,6 +1682,8 @@
   case AMDGPU::AReg_256RegClassID:
   case AMDGPU::VReg_256_Align2RegClassID:
   case AMDGPU::AReg_256_Align2RegClassID:
+  case AMDGPU::AV_256RegClassID:
+  case AMDGPU::AV_256_Align2RegClassID:
     return 256;
   case AMDGPU::SGPR_512RegClassID:
   case AMDGPU::SReg_512RegClassID:
@@ -1681,6 +1691,8 @@
   case AMDGPU::AReg_512RegClassID:
   case AMDGPU::VReg_512_Align2RegClassID:
   case AMDGPU::AReg_512_Align2RegClassID:
+  case AMDGPU::AV_512RegClassID:
+  case AMDGPU::AV_512_Align2RegClassID:
     return 512;
   case AMDGPU::SGPR_1024RegClassID:
   case AMDGPU::SReg_1024RegClassID:
@@ -1688,6 +1700,8 @@
   case AMDGPU::AReg_1024RegClassID:
   case AMDGPU::VReg_1024_Align2RegClassID:
   case AMDGPU::AReg_1024_Align2RegClassID:
+  case AMDGPU::AV_1024RegClassID:
+  case AMDGPU::AV_1024_Align2RegClassID:
     return 1024;
   default:
     llvm_unreachable("Unexpected register class");
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-inline-asm.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-inline-asm.ll
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-inline-asm.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-inline-asm.ll
@@ -138,7 +138,7 @@
   ; CHECK: bb.1 (%ir-block.0):
   ; CHECK:   liveins: $sgpr30_sgpr31
   ; CHECK:   [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
-  ; CHECK:   INLINEASM &"v_mov_b32 $0, 0; v_add_f64 $1, 0, 0", 0 /* attdialect */, 1835018 /* regdef:VGPR_32 */, def %1, 2883594 /* regdef:VReg_64 */, def %2
+  ; CHECK:   INLINEASM &"v_mov_b32 $0, 0; v_add_f64 $1, 0, 0", 0 /* attdialect */, 1835018 /* regdef:VGPR_32 */, def %1, 2949130 /* regdef:VReg_64 */, def %2
   ; CHECK:   [[COPY1:%[0-9]+]]:_(s32) = COPY %1
   ; CHECK:   [[COPY2:%[0-9]+]]:_(s64) = COPY %2
   ; CHECK:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY2]](s64)
diff --git a/llvm/test/CodeGen/AMDGPU/inline-asm.i128.ll b/llvm/test/CodeGen/AMDGPU/inline-asm.i128.ll
--- a/llvm/test/CodeGen/AMDGPU/inline-asm.i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/inline-asm.i128.ll
@@ -8,15 +8,15 @@
 define amdgpu_kernel void @s_input_output_i128() {
   ; GFX908-LABEL: name: s_input_output_i128
   ; GFX908: bb.0 (%ir-block.0):
-  ; GFX908:   INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 5111818 /* regdef:SGPR_128 */, def %4
+  ; GFX908:   INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 5439498 /* regdef:SGPR_128 */, def %4
   ; GFX908:   [[COPY:%[0-9]+]]:sgpr_128 = COPY %4
-  ; GFX908:   INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 5111817 /* reguse:SGPR_128 */, [[COPY]]
+  ; GFX908:   INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 5439497 /* reguse:SGPR_128 */, [[COPY]]
   ; GFX908:   S_ENDPGM 0
   ; GFX90A-LABEL: name: s_input_output_i128
   ; GFX90A: bb.0 (%ir-block.0):
-  ; GFX90A:   INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 5111818 /* regdef:SGPR_128 */, def %4
+  ; GFX90A:   INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 5439498 /* regdef:SGPR_128 */, def %4
   ; GFX90A:   [[COPY:%[0-9]+]]:sgpr_128 = COPY %4
-  ; GFX90A:   INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 5111817 /* reguse:SGPR_128 */, [[COPY]]
+  ; GFX90A:   INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 5439497 /* reguse:SGPR_128 */, [[COPY]]
   ; GFX90A:   S_ENDPGM 0
   %val = tail call i128 asm sideeffect "; def $0", "=s"()
   call void asm sideeffect "; use $0", "s"(i128 %val)
@@ -26,15 +26,15 @@
 define amdgpu_kernel void @v_input_output_i128() {
   ; GFX908-LABEL: name: v_input_output_i128
   ; GFX908: bb.0 (%ir-block.0):
-  ; GFX908:   INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 4718602 /* regdef:VReg_128 */, def %4
+  ; GFX908:   INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 4980746 /* regdef:VReg_128 */, def %4
   ; GFX908:   [[COPY:%[0-9]+]]:vreg_128 = COPY %4
-  ; GFX908:   INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 4718601 /* reguse:VReg_128 */, [[COPY]]
+  ; GFX908:   INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 4980745 /* reguse:VReg_128 */, [[COPY]]
   ; GFX908:   S_ENDPGM 0
   ; GFX90A-LABEL: name: v_input_output_i128
   ; GFX90A: bb.0 (%ir-block.0):
-  ; GFX90A:   INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 4849674 /* regdef:VReg_128_Align2 */, def %4
+  ; GFX90A:   INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 5177354 /* regdef:VReg_128_Align2 */, def %4
   ; GFX90A:   [[COPY:%[0-9]+]]:vreg_128_align2 = COPY %4
-  ; GFX90A:   INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 4849673 /* reguse:VReg_128_Align2 */, [[COPY]]
+  ; GFX90A:   INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 5177353 /* reguse:VReg_128_Align2 */, [[COPY]]
   ; GFX90A:   S_ENDPGM 0
   %val = tail call i128 asm sideeffect "; def $0", "=v"()
   call void asm sideeffect "; use $0", "v"(i128 %val)
@@ -44,15 +44,15 @@
 define amdgpu_kernel void @a_input_output_i128() {
   ; GFX908-LABEL: name: a_input_output_i128
   ; GFX908: bb.0 (%ir-block.0):
-  ; GFX908:   INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 4653066 /* regdef:AReg_128 */, def %4
+  ; GFX908:   INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 4915210 /* regdef:AReg_128 */, def %4
   ; GFX908:   [[COPY:%[0-9]+]]:areg_128 = COPY %4
-  ; GFX908:   INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 4653065 /* reguse:AReg_128 */, [[COPY]]
+  ; GFX908:   INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 4915209 /* reguse:AReg_128 */, [[COPY]]
   ; GFX908:   S_ENDPGM 0
   ; GFX90A-LABEL: name: a_input_output_i128
   ; GFX90A: bb.0 (%ir-block.0):
-  ; GFX90A:   INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 4784138 /* regdef:AReg_128_Align2 */, def %4
+  ; GFX90A:   INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 5111818 /* regdef:AReg_128_Align2 */, def %4
   ; GFX90A:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY %4
-  ; GFX90A:   INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 4784137 /* reguse:AReg_128_Align2 */, [[COPY]]
+  ; GFX90A:   INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 5111817 /* reguse:AReg_128_Align2 */, [[COPY]]
   ; GFX90A:   S_ENDPGM 0
   %val = call i128 asm sideeffect "; def $0", "=a"()
   call void asm sideeffect "; use $0", "a"(i128 %val)