Index: llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -2969,7 +2969,7 @@ if (Opcode == AMDGPU::BUFFER_ATOMIC_ADD_F32_BOTHEN || Opcode == AMDGPU::BUFFER_ATOMIC_PK_ADD_F16_BOTHEN) { - Register IdxReg = MRI->createVirtualRegister(&AMDGPU::VReg_64RegClass); + Register IdxReg = MRI->createVirtualRegister(TRI.getVGPR64Class()); BuildMI(*MBB, &*I, DL, TII.get(AMDGPU::REG_SEQUENCE), IdxReg) .addReg(VIndex.getReg()) .addImm(AMDGPU::sub0) Index: llvm/lib/Target/AMDGPU/GCNSubtarget.h =================================================================== --- llvm/lib/Target/AMDGPU/GCNSubtarget.h +++ llvm/lib/Target/AMDGPU/GCNSubtarget.h @@ -930,6 +930,9 @@ bool hasGFX90AInsts() const { return GFX90AInsts; } + /// Return if operations acting on VGPR tuples require even alignment. + bool needsAlignedVGPRs() const { return GFX90AInsts; } + bool hasPackedTID() const { return HasPackedTID; } /// Return the maximum number of waves per SIMD for kernels using \p SGPRs Index: llvm/lib/Target/AMDGPU/SIISelLowering.cpp =================================================================== --- llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -80,36 +80,40 @@ addRegisterClass(MVT::i32, &AMDGPU::SReg_32RegClass); addRegisterClass(MVT::f32, &AMDGPU::VGPR_32RegClass); - addRegisterClass(MVT::f64, &AMDGPU::VReg_64RegClass); addRegisterClass(MVT::v2i32, &AMDGPU::SReg_64RegClass); - addRegisterClass(MVT::v2f32, &AMDGPU::VReg_64RegClass); + + const SIRegisterInfo *TRI = STI.getRegisterInfo(); + const TargetRegisterClass *V64RegClass = TRI->getVGPR64Class(); + + addRegisterClass(MVT::f64, V64RegClass); + addRegisterClass(MVT::v2f32, V64RegClass); addRegisterClass(MVT::v3i32, &AMDGPU::SGPR_96RegClass); - addRegisterClass(MVT::v3f32, &AMDGPU::VReg_96RegClass); + addRegisterClass(MVT::v3f32, TRI->getVGPRClassForBitWidth(96)); addRegisterClass(MVT::v2i64, &AMDGPU::SGPR_128RegClass); addRegisterClass(MVT::v2f64, &AMDGPU::SGPR_128RegClass); addRegisterClass(MVT::v4i32, &AMDGPU::SGPR_128RegClass); - addRegisterClass(MVT::v4f32, &AMDGPU::VReg_128RegClass); + addRegisterClass(MVT::v4f32, TRI->getVGPRClassForBitWidth(128)); addRegisterClass(MVT::v5i32, &AMDGPU::SGPR_160RegClass); - addRegisterClass(MVT::v5f32, &AMDGPU::VReg_160RegClass); + addRegisterClass(MVT::v5f32, TRI->getVGPRClassForBitWidth(160)); addRegisterClass(MVT::v8i32, &AMDGPU::SGPR_256RegClass); - addRegisterClass(MVT::v8f32, &AMDGPU::VReg_256RegClass); + addRegisterClass(MVT::v8f32, TRI->getVGPRClassForBitWidth(256)); addRegisterClass(MVT::v4i64, &AMDGPU::SGPR_256RegClass); - addRegisterClass(MVT::v4f64, &AMDGPU::VReg_256RegClass); + addRegisterClass(MVT::v4f64, TRI->getVGPRClassForBitWidth(256)); addRegisterClass(MVT::v16i32, &AMDGPU::SGPR_512RegClass); - addRegisterClass(MVT::v16f32, &AMDGPU::VReg_512RegClass); + addRegisterClass(MVT::v16f32, TRI->getVGPRClassForBitWidth(512)); addRegisterClass(MVT::v8i64, &AMDGPU::SGPR_512RegClass); - addRegisterClass(MVT::v8f64, &AMDGPU::VReg_512RegClass); + addRegisterClass(MVT::v8f64, TRI->getVGPRClassForBitWidth(512)); addRegisterClass(MVT::v16i64, &AMDGPU::SGPR_1024RegClass); - addRegisterClass(MVT::v16f64, &AMDGPU::VReg_1024RegClass); + addRegisterClass(MVT::v16f64, TRI->getVGPRClassForBitWidth(1024)); if (Subtarget->has16BitInsts()) { addRegisterClass(MVT::i16, &AMDGPU::SReg_32RegClass); @@ -123,7 
+127,7 @@ } addRegisterClass(MVT::v32i32, &AMDGPU::VReg_1024RegClass); - addRegisterClass(MVT::v32f32, &AMDGPU::VReg_1024RegClass); + addRegisterClass(MVT::v32f32, TRI->getVGPRClassForBitWidth(1024)); computeRegisterProperties(Subtarget->getRegisterInfo()); @@ -11334,9 +11338,11 @@ //===----------------------------------------------------------------------===// std::pair<unsigned, const TargetRegisterClass *> -SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, +SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI_, StringRef Constraint, MVT VT) const { + const SIRegisterInfo *TRI = static_cast<const SIRegisterInfo *>(TRI_); + const TargetRegisterClass *RC = nullptr; if (Constraint.size() == 1) { const unsigned BitWidth = VT.getSizeInBits(); @@ -11365,7 +11371,7 @@ RC = &AMDGPU::VGPR_32RegClass; break; default: - RC = SIRegisterInfo::getVGPRClassForBitWidth(BitWidth); + RC = TRI->getVGPRClassForBitWidth(BitWidth); if (!RC) return std::make_pair(0U, nullptr); break; @@ -11379,7 +11385,7 @@ RC = &AMDGPU::AGPR_32RegClass; break; default: - RC = SIRegisterInfo::getAGPRClassForBitWidth(BitWidth); + RC = TRI->getAGPRClassForBitWidth(BitWidth); if (!RC) return std::make_pair(0U, nullptr); break; @@ -11552,6 +11558,35 @@ return false; } +static int getAlignedAGPRClassID(unsigned UnalignedClassID) { + switch (UnalignedClassID) { + case AMDGPU::VReg_64RegClassID: + return AMDGPU::VReg_64_Align2RegClassID; + case AMDGPU::VReg_96RegClassID: + return AMDGPU::VReg_96_Align2RegClassID; + case AMDGPU::VReg_128RegClassID: + return AMDGPU::VReg_128_Align2RegClassID; + case AMDGPU::VReg_256RegClassID: + return AMDGPU::VReg_256_Align2RegClassID; + case AMDGPU::VReg_512RegClassID: + return AMDGPU::VReg_512_Align2RegClassID; + case AMDGPU::AReg_64RegClassID: + return AMDGPU::AReg_64_Align2RegClassID; + case AMDGPU::AReg_96RegClassID: + return AMDGPU::AReg_96_Align2RegClassID; + case AMDGPU::AReg_128RegClassID: + return AMDGPU::AReg_128_Align2RegClassID; + case AMDGPU::AReg_256RegClassID: + return AMDGPU::AReg_256_Align2RegClassID; + case AMDGPU::AReg_512RegClassID: + return AMDGPU::AReg_512_Align2RegClassID; + case AMDGPU::AReg_1024RegClassID: + return AMDGPU::AReg_1024_Align2RegClassID; + default: + return -1; + } +} + // Figure out which registers should be reserved for stack access. Only after // the function is legalized do we know all of the non-spill stack objects or if // calls are present. @@ -11560,6 +11595,7 @@ SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); const SIRegisterInfo *TRI = Subtarget->getRegisterInfo(); + const SIInstrInfo *TII = ST.getInstrInfo(); if (Info->isEntryFunction()) { // Callable functions have fixed registers used for stack access. @@ -11582,7 +11618,6 @@ Info->limitOccupancy(MF); if (ST.isWave32() && !MF.empty()) { - const SIInstrInfo *TII = ST.getInstrInfo(); for (auto &MBB : MF) { for (auto &MI : MBB) { TII->fixImplicitOperands(MI); @@ -11590,6 +11625,23 @@ } } + // FIXME: This is a hack to fixup AGPR classes to use the properly aligned + // classes if required. Ideally the register class constraints would differ + // per-subtarget, but there's no easy way to achieve that right now. This is + // not a problem for VGPRs because the correctly aligned VGPR class is implied + // from using them as the register class for legal types.
+ if (ST.needsAlignedVGPRs()) { + for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) { + const Register Reg = Register::index2VirtReg(I); + const TargetRegisterClass *RC = MRI.getRegClassOrNull(Reg); + if (!RC) + continue; + int NewClassID = getAlignedAGPRClassID(RC->getID()); + if (NewClassID != -1) + MRI.setRegClass(Reg, TRI->getRegClass(NewClassID)); + } + } + TargetLoweringBase::finalizeLowering(MF); // Allocate a VGPR for future SGPR Spill if Index: llvm/lib/Target/AMDGPU/SIInstrInfo.cpp =================================================================== --- llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -872,7 +872,7 @@ return; } - if (RC == &AMDGPU::VReg_64RegClass && + if (RC->hasSuperClassEq(&AMDGPU::VReg_64RegClass) && !RI.hasAGPRs(RI.getPhysRegClass(SrcReg))) { if (ST.hasPackedFP32Ops()) { BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), DestReg) @@ -1021,7 +1021,7 @@ .addImm(Value); return; } - if (RegClass == &AMDGPU::VReg_64RegClass) { + if (RegClass->hasSuperClassEq(&AMDGPU::VReg_64RegClass)) { BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_PSEUDO), DestReg) .addImm(Value); return; @@ -3776,7 +3776,8 @@ // Make sure the register classes are correct. for (int i = 0, e = Desc.getNumOperands(); i != e; ++i) { - if (MI.getOperand(i).isFPImm()) { + const MachineOperand &MO = MI.getOperand(i); + if (MO.isFPImm()) { ErrInfo = "FPImm Machine Operands are not supported. ISel should bitcast " "all fp values to integers."; return false; @@ -3805,7 +3806,6 @@ case AMDGPU::OPERAND_REG_INLINE_AC_INT16: case AMDGPU::OPERAND_REG_INLINE_AC_FP16: case AMDGPU::OPERAND_REG_INLINE_AC_FP64: { - const MachineOperand &MO = MI.getOperand(i); if (!MO.isReg() && (!MO.isImm() || !isInlineConstant(MI, i))) { ErrInfo = "Illegal immediate value for operand."; return false; @@ -3826,12 +3826,40 @@ continue; } - if (!MI.getOperand(i).isReg()) + if (!MO.isReg()) + continue; + Register Reg = MO.getReg(); + if (!Reg) continue; + // FIXME: Ideally we would have separate instruction definitions with the + // aligned register constraint. + // FIXME: We do not verify inline asm operands, but custom inline asm + // verification is broken anyway + if (ST.needsAlignedVGPRs()) { + const TargetRegisterClass *RC = RI.getRegClassForReg(MRI, Reg); + const bool IsVGPR = RI.hasVGPRs(RC); + const bool IsAGPR = !IsVGPR && RI.hasAGPRs(RC); + if ((IsVGPR || IsAGPR) && MO.getSubReg()) { + const TargetRegisterClass *SubRC = + RI.getSubRegClass(RC, MO.getSubReg()); + RC = RI.getCompatibleSubRegClass(RC, SubRC, MO.getSubReg()); + if (RC) + RC = SubRC; + } + + // Check that this is the aligned version of the class. 
+ if (!RC || ((IsVGPR && !RC->hasSuperClassEq(RI.getVGPRClassForBitWidth( + RI.getRegSizeInBits(*RC)))) || + (IsAGPR && !RC->hasSuperClassEq(RI.getAGPRClassForBitWidth( + RI.getRegSizeInBits(*RC)))))) { + ErrInfo = "Subtarget requires even aligned vector registers"; + return false; + } + } + if (RegClass != -1) { - Register Reg = MI.getOperand(i).getReg(); - if (Reg == AMDGPU::NoRegister || Reg.isVirtual()) + if (Reg.isVirtual()) continue; const TargetRegisterClass *RC = RI.getRegClass(RegClass); @@ -4320,9 +4348,12 @@ if (Opcode != AMDGPU::V_MOV_B64_DPP_PSEUDO && ((DstIdx >= 0 && - Desc.OpInfo[DstIdx].RegClass == AMDGPU::VReg_64RegClassID) || - ((Src0Idx >= 0 && - Desc.OpInfo[Src0Idx].RegClass == AMDGPU::VReg_64RegClassID))) && + (Desc.OpInfo[DstIdx].RegClass == AMDGPU::VReg_64RegClassID || + Desc.OpInfo[DstIdx].RegClass == AMDGPU::VReg_64_Align2RegClassID)) || + ((Src0Idx >= 0 && + (Desc.OpInfo[Src0Idx].RegClass == AMDGPU::VReg_64RegClassID || + Desc.OpInfo[Src0Idx].RegClass == + AMDGPU::VReg_64_Align2RegClassID)))) && !AMDGPU::isLegal64BitDPPControl(DC)) { ErrInfo = "Invalid dpp_ctrl value: " "64 bit dpp only support row_newbcast"; @@ -4533,8 +4564,9 @@ Opcode = (Size == 64) ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32; const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(RC); - if (RI.getCommonSubClass(&AMDGPU::VReg_64RegClass, VRC)) - VRC = &AMDGPU::VReg_64RegClass; + const TargetRegisterClass *VRC64 = RI.getVGPR64Class(); + if (RI.getCommonSubClass(VRC64, VRC)) + VRC = VRC64; else VRC = &AMDGPU::VGPR_32RegClass; Index: llvm/lib/Target/AMDGPU/SIInstructions.td =================================================================== --- llvm/lib/Target/AMDGPU/SIInstructions.td +++ llvm/lib/Target/AMDGPU/SIInstructions.td @@ -2299,7 +2299,7 @@ def : GCNPat < (i64 (int_amdgcn_mov_dpp i64:$src, timm:$dpp_ctrl, timm:$row_mask, timm:$bank_mask, timm:$bound_ctrl)), - (V_MOV_B64_DPP_PSEUDO VReg_64:$src, VReg_64:$src, + (V_MOV_B64_DPP_PSEUDO VReg_64_Align2:$src, VReg_64_Align2:$src, (as_i32timm $dpp_ctrl), (as_i32timm $row_mask), (as_i32timm $bank_mask), (as_i1timm $bound_ctrl)) @@ -2308,7 +2308,7 @@ def : GCNPat < (i64 (int_amdgcn_update_dpp i64:$old, i64:$src, timm:$dpp_ctrl, timm:$row_mask, timm:$bank_mask, timm:$bound_ctrl)), - (V_MOV_B64_DPP_PSEUDO VReg_64:$old, VReg_64:$src, (as_i32timm $dpp_ctrl), + (V_MOV_B64_DPP_PSEUDO VReg_64_Align2:$old, VReg_64_Align2:$src, (as_i32timm $dpp_ctrl), (as_i32timm $row_mask), (as_i32timm $bank_mask), (as_i1timm $bound_ctrl)) >; Index: llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp =================================================================== --- llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp +++ llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp @@ -1612,26 +1612,11 @@ return &AMDGPU::SGPR_512RegClass; } } - const TargetRegisterClass *RC = nullptr; - switch (CI.Width + Paired.Width) { - default: - return nullptr; - case 2: - RC = &AMDGPU::VReg_64RegClass; - break; - case 3: - RC = &AMDGPU::VReg_96RegClass; - break; - case 4: - RC = &AMDGPU::VReg_128RegClass; - break; - } - - if (TRI->hasAGPRs(getDataRegClass(*CI.I))) - return TRI->getEquivalentAGPRClass(RC); - - return RC; + unsigned BitWidth = 32 * (CI.Width + Paired.Width); + return TRI->hasAGPRs(getDataRegClass(*CI.I)) + ? 
TRI->getAGPRClassForBitWidth(BitWidth) + : TRI->getVGPRClassForBitWidth(BitWidth); } MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferStorePair( Index: llvm/lib/Target/AMDGPU/SIRegisterInfo.h =================================================================== --- llvm/lib/Target/AMDGPU/SIRegisterInfo.h +++ llvm/lib/Target/AMDGPU/SIRegisterInfo.h @@ -134,8 +134,13 @@ return getEncodingValue(Reg) & 0xff; } - static const TargetRegisterClass *getVGPRClassForBitWidth(unsigned BitWidth); - static const TargetRegisterClass *getAGPRClassForBitWidth(unsigned BitWidth); + LLVM_READONLY + const TargetRegisterClass *getVGPRClassForBitWidth(unsigned BitWidth) const; + + LLVM_READONLY + const TargetRegisterClass *getAGPRClassForBitWidth(unsigned BitWidth) const; + + LLVM_READONLY static const TargetRegisterClass *getSGPRClassForBitWidth(unsigned BitWidth); /// Return the 'base' register class for this register. @@ -182,12 +187,21 @@ const TargetRegisterClass * getEquivalentSGPRClass(const TargetRegisterClass *VRC) const; - /// \returns The register class that is used for a sub-register of \p RC for - /// the given \p SubIdx. If \p SubIdx equals NoSubRegister, \p RC will - /// be returned. + /// \returns The canonical register class that is used for a sub-register of + /// \p RC for the given \p SubIdx. If \p SubIdx equals NoSubRegister, \p RC + /// will be returned. const TargetRegisterClass *getSubRegClass(const TargetRegisterClass *RC, unsigned SubIdx) const; + /// Returns a register class which is compatible with \p SuperRC, such that a + /// subregister exists with class \p SubRC with subregister index \p + /// SubIdx. If this is impossible (e.g., an unaligned subregister index within + /// a register tuple), return null. + const TargetRegisterClass * + getCompatibleSubRegClass(const TargetRegisterClass *SuperRC, + const TargetRegisterClass *SubRC, + unsigned SubIdx) const; + bool shouldRewriteCopySrc(const TargetRegisterClass *DefRC, unsigned DefSubReg, const TargetRegisterClass *SrcRC, @@ -268,6 +282,10 @@ : &AMDGPU::SReg_64_XEXECRegClass; } + // Return the appropriate register class to use for 64-bit VGPRs for the + // subtarget. + const TargetRegisterClass *getVGPR64Class() const; + MCRegister getVCC() const; const TargetRegisterClass *getRegClass(unsigned RCID) const; Index: llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp =================================================================== --- llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -345,13 +345,6 @@ reserveRegisterTuples(Reserved, Reg); } - if (ST.hasGFX90AInsts()) - for (const TargetRegisterClass *RC : this->regclasses()) - if (getRegSizeInBits(*RC) > 32 && hasVectorRegisters(RC)) - for (unsigned Reg : *RC) - if (getEncodingValue(Reg) & 1) - Reserved.set(Reg); - // FIXME: Stop using reserved registers for this. 
for (MCPhysReg Reg : MFI->getAGPRSpillVGPRs()) reserveRegisterTuples(Reserved, Reg); @@ -1763,14 +1756,8 @@ return AMDGPUInstPrinter::getRegisterName(Reg); } -const TargetRegisterClass * -SIRegisterInfo::getVGPRClassForBitWidth(unsigned BitWidth) { - if (BitWidth == 1) - return &AMDGPU::VReg_1RegClass; - if (BitWidth <= 16) - return &AMDGPU::VGPR_LO16RegClass; - if (BitWidth <= 32) - return &AMDGPU::VGPR_32RegClass; +static const TargetRegisterClass * +getAnyVGPRClassForBitWidth(unsigned BitWidth) { if (BitWidth <= 64) return &AMDGPU::VReg_64RegClass; if (BitWidth <= 96) @@ -1791,12 +1778,42 @@ return nullptr; } +static const TargetRegisterClass * +getAlignedVGPRClassForBitWidth(unsigned BitWidth) { + if (BitWidth <= 64) + return &AMDGPU::VReg_64_Align2RegClass; + if (BitWidth <= 96) + return &AMDGPU::VReg_96_Align2RegClass; + if (BitWidth <= 128) + return &AMDGPU::VReg_128_Align2RegClass; + if (BitWidth <= 160) + return &AMDGPU::VReg_160_Align2RegClass; + if (BitWidth <= 192) + return &AMDGPU::VReg_192_Align2RegClass; + if (BitWidth <= 256) + return &AMDGPU::VReg_256_Align2RegClass; + if (BitWidth <= 512) + return &AMDGPU::VReg_512_Align2RegClass; + if (BitWidth <= 1024) + return &AMDGPU::VReg_1024_Align2RegClass; + + return nullptr; +} + const TargetRegisterClass * -SIRegisterInfo::getAGPRClassForBitWidth(unsigned BitWidth) { +SIRegisterInfo::getVGPRClassForBitWidth(unsigned BitWidth) const { + if (BitWidth == 1) + return &AMDGPU::VReg_1RegClass; if (BitWidth <= 16) - return &AMDGPU::AGPR_LO16RegClass; + return &AMDGPU::VGPR_LO16RegClass; if (BitWidth <= 32) - return &AMDGPU::AGPR_32RegClass; + return &AMDGPU::VGPR_32RegClass; + return ST.needsAlignedVGPRs() ? getAlignedVGPRClassForBitWidth(BitWidth) + : getAnyVGPRClassForBitWidth(BitWidth); +} + +static const TargetRegisterClass * +getAnyAGPRClassForBitWidth(unsigned BitWidth) { if (BitWidth <= 64) return &AMDGPU::AReg_64RegClass; if (BitWidth <= 96) @@ -1817,6 +1834,38 @@ return nullptr; } +static const TargetRegisterClass * +getAlignedAGPRClassForBitWidth(unsigned BitWidth) { + if (BitWidth <= 64) + return &AMDGPU::AReg_64_Align2RegClass; + if (BitWidth <= 96) + return &AMDGPU::AReg_96_Align2RegClass; + if (BitWidth <= 128) + return &AMDGPU::AReg_128_Align2RegClass; + if (BitWidth <= 160) + return &AMDGPU::AReg_160_Align2RegClass; + if (BitWidth <= 192) + return &AMDGPU::AReg_192_Align2RegClass; + if (BitWidth <= 256) + return &AMDGPU::AReg_256_Align2RegClass; + if (BitWidth <= 512) + return &AMDGPU::AReg_512_Align2RegClass; + if (BitWidth <= 1024) + return &AMDGPU::AReg_1024_Align2RegClass; + + return nullptr; +} + +const TargetRegisterClass * +SIRegisterInfo::getAGPRClassForBitWidth(unsigned BitWidth) const { + if (BitWidth <= 16) + return &AMDGPU::AGPR_LO16RegClass; + if (BitWidth <= 32) + return &AMDGPU::AGPR_32RegClass; + return ST.needsAlignedVGPRs() ? 
getAlignedAGPRClassForBitWidth(BitWidth) + : getAnyAGPRClassForBitWidth(BitWidth); +} + const TargetRegisterClass * SIRegisterInfo::getSGPRClassForBitWidth(unsigned BitWidth) { if (BitWidth <= 16) @@ -1855,29 +1904,46 @@ &AMDGPU::VGPR_32RegClass, &AMDGPU::SReg_32RegClass, &AMDGPU::AGPR_32RegClass, + &AMDGPU::AGPR_32RegClass, + &AMDGPU::VReg_64_Align2RegClass, &AMDGPU::VReg_64RegClass, &AMDGPU::SReg_64RegClass, + &AMDGPU::AReg_64_Align2RegClass, &AMDGPU::AReg_64RegClass, + &AMDGPU::VReg_96_Align2RegClass, &AMDGPU::VReg_96RegClass, &AMDGPU::SReg_96RegClass, + &AMDGPU::AReg_96_Align2RegClass, &AMDGPU::AReg_96RegClass, + &AMDGPU::VReg_128_Align2RegClass, &AMDGPU::VReg_128RegClass, &AMDGPU::SReg_128RegClass, + &AMDGPU::AReg_128_Align2RegClass, &AMDGPU::AReg_128RegClass, + &AMDGPU::VReg_160_Align2RegClass, &AMDGPU::VReg_160RegClass, &AMDGPU::SReg_160RegClass, + &AMDGPU::AReg_160_Align2RegClass, &AMDGPU::AReg_160RegClass, + &AMDGPU::VReg_192_Align2RegClass, &AMDGPU::VReg_192RegClass, &AMDGPU::SReg_192RegClass, + &AMDGPU::AReg_192_Align2RegClass, &AMDGPU::AReg_192RegClass, + &AMDGPU::VReg_256_Align2RegClass, &AMDGPU::VReg_256RegClass, &AMDGPU::SReg_256RegClass, + &AMDGPU::AReg_256_Align2RegClass, &AMDGPU::AReg_256RegClass, + &AMDGPU::VReg_512_Align2RegClass, &AMDGPU::VReg_512RegClass, &AMDGPU::SReg_512RegClass, + &AMDGPU::AReg_512_Align2RegClass, &AMDGPU::AReg_512RegClass, &AMDGPU::SReg_1024RegClass, + &AMDGPU::VReg_1024_Align2RegClass, &AMDGPU::VReg_1024RegClass, + &AMDGPU::AReg_1024_Align2RegClass, &AMDGPU::AReg_1024RegClass, &AMDGPU::SCC_CLASSRegClass, &AMDGPU::Pseudo_SReg_32RegClass, @@ -1977,6 +2043,16 @@ return RC; } +const TargetRegisterClass * +SIRegisterInfo::getCompatibleSubRegClass(const TargetRegisterClass *SuperRC, + const TargetRegisterClass *SubRC, + unsigned SubIdx) const { + // Ensure this subregister index is aligned in the super register. + const TargetRegisterClass *MatchRC = + getMatchingSuperRegClass(SuperRC, SubRC, SubIdx); + return MatchRC && MatchRC->hasSubClassEq(SuperRC) ? MatchRC : nullptr; +} + bool SIRegisterInfo::opCanUseInlineConstant(unsigned OpType) const { if (OpType >= AMDGPU::OPERAND_REG_INLINE_AC_FIRST && OpType <= AMDGPU::OPERAND_REG_INLINE_AC_LAST) @@ -2182,6 +2258,12 @@ return isWave32 ? AMDGPU::VCC_LO : AMDGPU::VCC; } +const TargetRegisterClass *SIRegisterInfo::getVGPR64Class() const { + // VGPR tuples have an alignment requirement on gfx90a variants. + return ST.needsAlignedVGPRs() ? 
&AMDGPU::VReg_64_Align2RegClass + : &AMDGPU::VReg_64RegClass; +} + const TargetRegisterClass * SIRegisterInfo::getRegClass(unsigned RCID) const { switch ((int)RCID) { Index: llvm/lib/Target/AMDGPU/SIRegisterInfo.td =================================================================== --- llvm/lib/Target/AMDGPU/SIRegisterInfo.td +++ llvm/lib/Target/AMDGPU/SIRegisterInfo.td @@ -786,7 +786,7 @@ } // Register class for all vector registers (VGPRs + Interpolation Registers) -class VRegClass<int numRegs, list<ValueType> regTypes, dag regList> : +class VRegClassBase<int numRegs, list<ValueType> regTypes, dag regList> : RegisterClass<"AMDGPU", regTypes, 32, regList> { let Size = !mul(numRegs, 32); @@ -796,31 +796,46 @@ let Weight = numRegs; } -def VReg_64 : VRegClass<2, [i64, f64, v2i32, v2f32, v4f16, v4i16, p0, p1, p4], - (add VGPR_64)>; -def VReg_96 : VRegClass<3, [v3i32, v3f32], (add VGPR_96)>; -def VReg_128 : VRegClass<4, [v4i32, v4f32, v2i64, v2f64], (add VGPR_128)>; -def VReg_160 : VRegClass<5, [v5i32, v5f32], (add VGPR_160)>; -def VReg_192 : VRegClass<6, [untyped], (add VGPR_192)>; -def VReg_256 : VRegClass<8, [v8i32, v8f32, v4i64, v4f64], (add VGPR_256)>; -def VReg_512 : VRegClass<16, [v16i32, v16f32, v8i64, v8f64], (add VGPR_512)>; -def VReg_1024 : VRegClass<32, [v32i32, v32f32, v16i64, v16f64], (add VGPR_1024)>; +// Define a register tuple class, along with one requiring an even +// aligned base register. +multiclass VRegClass<int numRegs, list<ValueType> regTypes, dag regList> { + // Define the regular class. + def "" : VRegClassBase<numRegs, regTypes, regList>; -class ARegClass<int numRegs, list<ValueType> regTypes, dag regList> : - VRegClass<numRegs, regTypes, regList> { - // Requires n v_accvgpr_write and n v_accvgpr_read to copy + burn 1 vgpr - let CopyCost = !add(numRegs, numRegs, 1); + // Define 2-aligned variant + def _Align2 : VRegClassBase<numRegs, regTypes, (decimate regList, 2)>; } -def AReg_64 : ARegClass<2, [i64, f64, v2i32, v2f32, v4f16, v4i16], +defm VReg_64 : VRegClass<2, [i64, f64, v2i32, v2f32, v4f16, v4i16, p0, p1, p4], + (add VGPR_64)>; +defm VReg_96 : VRegClass<3, [v3i32, v3f32], (add VGPR_96)>; +defm VReg_128 : VRegClass<4, [v4i32, v4f32, v2i64, v2f64], (add VGPR_128)>; +defm VReg_160 : VRegClass<5, [v5i32, v5f32], (add VGPR_160)>; + +defm VReg_192 : VRegClass<6, [untyped], (add VGPR_192)>; +defm VReg_256 : VRegClass<8, [v8i32, v8f32, v4i64, v4f64], (add VGPR_256)>; +defm VReg_512 : VRegClass<16, [v16i32, v16f32, v8i64, v8f64], (add VGPR_512)>; +defm VReg_1024 : VRegClass<32, [v32i32, v32f32, v16i64, v16f64], (add VGPR_1024)>; + +multiclass ARegClass<int numRegs, list<ValueType> regTypes, dag regList> { + let CopyCost = !add(numRegs, numRegs, 1) in { + // Define the regular class.
+ def "" : VRegClassBase; + + // Define 2-aligned variant + def _Align2 : VRegClassBase; + } +} + +defm AReg_64 : ARegClass<2, [i64, f64, v2i32, v2f32, v4f16, v4i16], (add AGPR_64)>; -def AReg_96 : ARegClass<3, [v3i32, v3f32], (add AGPR_96)>; -def AReg_128 : ARegClass<4, [v4i32, v4f32, v2i64, v2f64], (add AGPR_128)>; -def AReg_160 : ARegClass<5, [v5i32, v5f32], (add AGPR_160)>; -def AReg_192 : ARegClass<6, [untyped], (add AGPR_192)>; -def AReg_256 : ARegClass<8, [v8i32, v8f32, v4i64, v4f64], (add AGPR_256)>; -def AReg_512 : ARegClass<16, [v16i32, v16f32, v8i64, v8f64], (add AGPR_512)>; -def AReg_1024 : ARegClass<32, [v32i32, v32f32, v16i64, v16f64], (add AGPR_1024)>; +defm AReg_96 : ARegClass<3, [v3i32, v3f32], (add AGPR_96)>; +defm AReg_128 : ARegClass<4, [v4i32, v4f32, v2i64, v2f64], (add AGPR_128)>; +defm AReg_160 : ARegClass<5, [v5i32, v5f32], (add AGPR_160)>; +defm AReg_192 : ARegClass<6, [untyped], (add AGPR_192)>; +defm AReg_256 : ARegClass<8, [v8i32, v8f32, v4i64, v4f64], (add AGPR_256)>; +defm AReg_512 : ARegClass<16, [v16i32, v16f32, v8i64, v8f64], (add AGPR_512)>; +defm AReg_1024 : ARegClass<32, [v32i32, v32f32, v16i64, v16f64], (add AGPR_1024)>; } // End GeneratePressureSet = 0 Index: llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp =================================================================== --- llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -1436,44 +1436,60 @@ case AMDGPU::VReg_64RegClassID: case AMDGPU::AReg_64RegClassID: case AMDGPU::SReg_64_XEXECRegClassID: + case AMDGPU::VReg_64_Align2RegClassID: + case AMDGPU::AReg_64_Align2RegClassID: return 64; case AMDGPU::SGPR_96RegClassID: case AMDGPU::SReg_96RegClassID: case AMDGPU::VReg_96RegClassID: case AMDGPU::AReg_96RegClassID: + case AMDGPU::VReg_96_Align2RegClassID: + case AMDGPU::AReg_96_Align2RegClassID: case AMDGPU::AV_96RegClassID: return 96; case AMDGPU::SGPR_128RegClassID: case AMDGPU::SReg_128RegClassID: case AMDGPU::VReg_128RegClassID: case AMDGPU::AReg_128RegClassID: + case AMDGPU::VReg_128_Align2RegClassID: + case AMDGPU::AReg_128_Align2RegClassID: case AMDGPU::AV_128RegClassID: return 128; case AMDGPU::SGPR_160RegClassID: case AMDGPU::SReg_160RegClassID: case AMDGPU::VReg_160RegClassID: case AMDGPU::AReg_160RegClassID: + case AMDGPU::VReg_160_Align2RegClassID: + case AMDGPU::AReg_160_Align2RegClassID: case AMDGPU::AV_160RegClassID: return 160; case AMDGPU::SGPR_192RegClassID: case AMDGPU::SReg_192RegClassID: case AMDGPU::VReg_192RegClassID: case AMDGPU::AReg_192RegClassID: + case AMDGPU::VReg_192_Align2RegClassID: + case AMDGPU::AReg_192_Align2RegClassID: return 192; case AMDGPU::SGPR_256RegClassID: case AMDGPU::SReg_256RegClassID: case AMDGPU::VReg_256RegClassID: case AMDGPU::AReg_256RegClassID: + case AMDGPU::VReg_256_Align2RegClassID: + case AMDGPU::AReg_256_Align2RegClassID: return 256; case AMDGPU::SGPR_512RegClassID: case AMDGPU::SReg_512RegClassID: case AMDGPU::VReg_512RegClassID: case AMDGPU::AReg_512RegClassID: + case AMDGPU::VReg_512_Align2RegClassID: + case AMDGPU::AReg_512_Align2RegClassID: return 512; case AMDGPU::SGPR_1024RegClassID: case AMDGPU::SReg_1024RegClassID: case AMDGPU::VReg_1024RegClassID: case AMDGPU::AReg_1024RegClassID: + case AMDGPU::VReg_1024_Align2RegClassID: + case AMDGPU::AReg_1024_Align2RegClassID: return 1024; default: llvm_unreachable("Unexpected register class"); Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.atomic.fadd.ll =================================================================== --- 
llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.atomic.fadd.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.atomic.fadd.ll @@ -177,13 +177,13 @@ ; GFX90A: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX90A: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX90A: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr3 - ; GFX90A: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; GFX90A: [[REG_SEQUENCE:%[0-9]+]]:vreg_128_align2 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; GFX90A: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; GFX90A: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr4 ; GFX90A: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY]] ; GFX90A: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] - ; GFX90A: [[COPY9:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 - ; GFX90A: [[COPY10:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 + ; GFX90A: [[COPY9:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]].sub0_sub1 + ; GFX90A: [[COPY10:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]].sub2_sub3 ; GFX90A: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec ; GFX90A: bb.2: ; GFX90A: successors: %bb.3(0x40000000), %bb.2(0x40000000) @@ -263,11 +263,11 @@ ; GFX90A: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX90A: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX90A: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr3 - ; GFX90A: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; GFX90A: [[REG_SEQUENCE:%[0-9]+]]:vreg_128_align2 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; GFX90A: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr4 ; GFX90A: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY]] - ; GFX90A: [[COPY7:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 - ; GFX90A: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 + ; GFX90A: [[COPY7:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]].sub0_sub1 + ; GFX90A: [[COPY8:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]].sub2_sub3 ; GFX90A: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec ; GFX90A: bb.2: ; GFX90A: successors: %bb.3(0x40000000), %bb.2(0x40000000) Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.atomic.fadd.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.atomic.fadd.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.atomic.fadd.ll @@ -31,7 +31,7 @@ ; GFX90A: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX90A: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX90A: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; GFX90A: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY6]], %subreg.sub1 + ; GFX90A: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY6]], %subreg.sub1 ; GFX90A: [[BUFFER_ATOMIC_ADD_F32_BOTHEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_BOTHEN_RTN [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 1, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource", align 1, addrspace 4) ; GFX90A: S_ENDPGM 0 %ret = call float @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) @@ -66,7 +66,7 @@ ; GFX90A: [[COPY5:%[0-9]+]]:vgpr_32 
= COPY $vgpr1 ; GFX90A: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX90A: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; GFX90A: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY6]], %subreg.sub1 + ; GFX90A: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY6]], %subreg.sub1 ; GFX90A: [[BUFFER_ATOMIC_ADD_F32_BOTHEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_BOTHEN_RTN [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 4095, 1, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource" + 4095, align 1, addrspace 4) ; GFX90A: S_ENDPGM 0 %voffset.add = add i32 %voffset, 4095 @@ -192,15 +192,15 @@ ; GFX90A: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX90A: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX90A: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr3 - ; GFX90A: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; GFX90A: [[REG_SEQUENCE:%[0-9]+]]:vreg_128_align2 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; GFX90A: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; GFX90A: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; GFX90A: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr4 ; GFX90A: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY]] ; GFX90A: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] ; GFX90A: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY6]] - ; GFX90A: [[COPY11:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 - ; GFX90A: [[COPY12:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 + ; GFX90A: [[COPY11:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]].sub0_sub1 + ; GFX90A: [[COPY12:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]].sub2_sub3 ; GFX90A: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec ; GFX90A: bb.2: ; GFX90A: successors: %bb.3(0x40000000), %bb.2(0x40000000) @@ -217,7 +217,7 @@ ; GFX90A: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec ; GFX90A: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY7]], implicit $exec ; GFX90A: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U32_e64_]], [[S_AND_B64_]], implicit-def $scc - ; GFX90A: [[REG_SEQUENCE4:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY10]], %subreg.sub1 + ; GFX90A: [[REG_SEQUENCE4:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY10]], %subreg.sub1 ; GFX90A: [[BUFFER_ATOMIC_ADD_F32_BOTHEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_BOTHEN_RTN [[COPY8]], [[REG_SEQUENCE4]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 1, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource", align 1, addrspace 4) ; GFX90A: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec ; GFX90A: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc @@ -283,13 +283,13 @@ ; GFX90A: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX90A: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX90A: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr3 - ; GFX90A: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; GFX90A: [[REG_SEQUENCE:%[0-9]+]]:vreg_128_align2 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], 
%subreg.sub3 ; GFX90A: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; GFX90A: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr4 ; GFX90A: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY]] ; GFX90A: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] - ; GFX90A: [[COPY9:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 - ; GFX90A: [[COPY10:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 + ; GFX90A: [[COPY9:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]].sub0_sub1 + ; GFX90A: [[COPY10:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]].sub2_sub3 ; GFX90A: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec ; GFX90A: bb.2: ; GFX90A: successors: %bb.3(0x40000000), %bb.2(0x40000000) @@ -348,7 +348,7 @@ ; GFX90A: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX90A: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX90A: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; GFX90A: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY6]], %subreg.sub1 + ; GFX90A: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY6]], %subreg.sub1 ; GFX90A: [[BUFFER_ATOMIC_ADD_F32_BOTHEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_BOTHEN_RTN [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 1, 1, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource", align 1, addrspace 4) ; GFX90A: S_ENDPGM 0 %ret = call float @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 2) @@ -414,7 +414,7 @@ ; GFX90A: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX90A: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX90A: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; GFX90A: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY6]], %subreg.sub1 + ; GFX90A: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY6]], %subreg.sub1 ; GFX90A: [[BUFFER_ATOMIC_PK_ADD_F16_BOTHEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_PK_ADD_F16_BOTHEN_RTN [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 1, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource", align 1, addrspace 4) ; GFX90A: S_ENDPGM 0 %ret = call <2 x half> @llvm.amdgcn.struct.buffer.atomic.fadd.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) Index: llvm/test/CodeGen/AMDGPU/alloc-aligned-tuples-gfx908.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/alloc-aligned-tuples-gfx908.mir +++ llvm/test/CodeGen/AMDGPU/alloc-aligned-tuples-gfx908.mir @@ -1,10 +1,8 @@ # RUN: llc -march=amdgcn -mcpu=gfx908 -start-before=greedy -stop-after=virtregrewriter -verify-machineinstrs -o - %s | FileCheck --check-prefixes=GCN,GFX908 %s -# RUN: llc -march=amdgcn -mcpu=gfx90a -start-before=greedy -stop-after=virtregrewriter -verify-machineinstrs -o - %s | FileCheck --check-prefixes=GCN,GFX90A %s --- # GCN-LABEL: name: alloc_vgpr_64 # GFX908: $vgpr3_vgpr4 = GLOBAL_LOAD -# GFX90A: $vgpr4_vgpr5 = GLOBAL_LOAD name: alloc_vgpr_64 tracksRegLiveness: true liveins: @@ -24,7 +22,6 @@ --- # GCN-LABEL: name: alloc_vgpr_96 # GFX908: $vgpr3_vgpr4_vgpr5 = GLOBAL_LOAD -# GFX90A: $vgpr4_vgpr5_vgpr6 = GLOBAL_LOAD name: alloc_vgpr_96 tracksRegLiveness: true liveins: @@ -44,7 +41,6 @@ --- # GCN-LABEL: name: alloc_vgpr_128 # GFX908: $vgpr3_vgpr4_vgpr5_vgpr6 = GLOBAL_LOAD -# GFX90A: $vgpr4_vgpr5_vgpr6_vgpr7 = GLOBAL_LOAD name: alloc_vgpr_128 tracksRegLiveness: true liveins: @@ -64,7 +60,6 @@ --- # GCN-LABEL: name: 
alloc_vgpr_160 # GFX908: $vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = IMAGE_LOAD -# GFX90A: $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8 = IMAGE_LOAD name: alloc_vgpr_160 tracksRegLiveness: true liveins: @@ -84,7 +79,6 @@ --- # GCN-LABEL: name: alloc_vgpr_256 # GFX908: $vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10 = COPY -# GFX90A: $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11 = COPY name: alloc_vgpr_256 tracksRegLiveness: true liveins: @@ -106,7 +100,6 @@ --- # GCN-LABEL: name: alloc_vgpr_512 # GFX908: $vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18 = IMPLICIT_DEF -# GFX90A: $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19 = IMPLICIT_DEF name: alloc_vgpr_512 tracksRegLiveness: true liveins: @@ -129,7 +122,6 @@ --- # GCN-LABEL: name: alloc_vgpr_1024 # GFX908: $vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34 = IMPLICIT_DEF -# GFX90A: $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35 = IMPLICIT_DEF name: alloc_vgpr_1024 tracksRegLiveness: true liveins: @@ -156,7 +148,6 @@ --- # GCN-LABEL: name: alloc_agpr_64 # GFX908: $agpr1_agpr2 = IMPLICIT_DEF -# GFX90A: $agpr2_agpr3 = IMPLICIT_DEF name: alloc_agpr_64 tracksRegLiveness: true liveins: @@ -177,7 +168,6 @@ --- # GCN-LABEL: name: alloc_agpr_128 # GFX908: $agpr1_agpr2_agpr3_agpr4 = IMPLICIT_DEF -# GFX90A: $agpr2_agpr3_agpr4_agpr5 = IMPLICIT_DEF name: alloc_agpr_128 tracksRegLiveness: true liveins: @@ -198,7 +188,6 @@ --- # GCN-LABEL: name: alloc_agpr_512 # GFX908: $agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16 = IMPLICIT_DEF -# GFX90A: $agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17 = IMPLICIT_DEF name: alloc_agpr_512 tracksRegLiveness: true liveins: @@ -222,7 +211,6 @@ --- # GCN-LABEL: name: alloc_agpr_1024 # GFX908: $agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31_agpr32 = IMPLICIT_DEF -# GFX90A: $agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31_agpr32_agpr33 = IMPLICIT_DEF name: alloc_agpr_1024 tracksRegLiveness: true liveins: Index: llvm/test/CodeGen/AMDGPU/alloc-aligned-tuples-gfx90a.mir =================================================================== --- /dev/null +++ llvm/test/CodeGen/AMDGPU/alloc-aligned-tuples-gfx90a.mir @@ -0,0 +1,238 @@ +# RUN: llc -march=amdgcn -mcpu=gfx90a -start-before=greedy -stop-after=virtregrewriter -verify-machineinstrs -o - %s | FileCheck --check-prefixes=GCN,GFX90A %s +# Using the unaligned vector tuples are OK as long as they aren't used +# in a real instruction. 
+ +--- +# GCN-LABEL: name: alloc_vgpr_64 +# GFX90A: $vgpr4_vgpr5 = GLOBAL_LOAD +name: alloc_vgpr_64 +tracksRegLiveness: true +liveins: + - { reg: '$vgpr0_vgpr1' } + - { reg: '$vgpr2' } +body: | + bb.0: + liveins: $vgpr0_vgpr1, $vgpr2 + + %0:vreg_64_align2 = COPY $vgpr0_vgpr1 + %1:vgpr_32 = COPY $vgpr2 + %2:vreg_64_align2 = GLOBAL_LOAD_DWORDX2 %0, 0, 0, 0, 0, 0, implicit $exec + GLOBAL_STORE_DWORDX2 %0, %2, 0, 0, 0, 0, 0, implicit $exec + GLOBAL_STORE_DWORD %0, %1, 0, 0, 0, 0, 0, implicit $exec +... + +--- +# GCN-LABEL: name: alloc_vgpr_96 +# GFX90A: $vgpr4_vgpr5_vgpr6 = GLOBAL_LOAD +name: alloc_vgpr_96 +tracksRegLiveness: true +liveins: + - { reg: '$vgpr0_vgpr1' } + - { reg: '$vgpr2' } +body: | + bb.0: + liveins: $vgpr0_vgpr1, $vgpr2 + + %0:vreg_64_align2 = COPY $vgpr0_vgpr1 + %1:vgpr_32 = COPY $vgpr2 + %2:vreg_96_align2 = GLOBAL_LOAD_DWORDX3 %0, 0, 0, 0, 0, 0, implicit $exec + GLOBAL_STORE_DWORDX3 %0, %2, 0, 0, 0, 0, 0, implicit $exec + GLOBAL_STORE_DWORD %0, %1, 0, 0, 0, 0, 0, implicit $exec +... + +--- +# GCN-LABEL: name: alloc_vgpr_128 +# GFX90A: $vgpr4_vgpr5_vgpr6_vgpr7 = GLOBAL_LOAD +name: alloc_vgpr_128 +tracksRegLiveness: true +liveins: + - { reg: '$vgpr0_vgpr1' } + - { reg: '$vgpr2' } +body: | + bb.0: + liveins: $vgpr0_vgpr1, $vgpr2 + + %0:vreg_64_align2 = COPY $vgpr0_vgpr1 + %1:vgpr_32 = COPY $vgpr2 + %2:vreg_128_align2 = GLOBAL_LOAD_DWORDX4 %0, 0, 0, 0, 0, 0, implicit $exec + GLOBAL_STORE_DWORDX4 %0, %2, 0, 0, 0, 0, 0, implicit $exec + GLOBAL_STORE_DWORD %0, %1, 0, 0, 0, 0, 0, implicit $exec +... + +--- +# GCN-LABEL: name: alloc_vgpr_160 +# GFX90A: $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8 = IMAGE_LOAD +name: alloc_vgpr_160 +tracksRegLiveness: true +liveins: + - { reg: '$vgpr0_vgpr1' } + - { reg: '$vgpr2' } +body: | + bb.0: + liveins: $vgpr0_vgpr1, $vgpr2 + + %0:vreg_64_align2 = COPY $vgpr0_vgpr1 + %1:vgpr_32 = COPY $vgpr2 + %2:vreg_160_align2 = IMAGE_LOAD_V5_V1 %1, undef %3:sgpr_256, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) + GLOBAL_STORE_DWORDX4 %0, %2.sub0_sub1_sub2_sub3, 0, 0, 0, 0, 0, implicit $exec + GLOBAL_STORE_DWORD %0, %1, 0, 0, 0, 0, 0, implicit $exec +... + +--- +# GCN-LABEL: name: alloc_vgpr_256 +# GFX90A: $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11 = COPY +name: alloc_vgpr_256 +tracksRegLiveness: true +liveins: + - { reg: '$vgpr0_vgpr1' } + - { reg: '$vgpr2' } +body: | + bb.0: + liveins: $vgpr0_vgpr1, $vgpr2 + + %0:vreg_64_align2 = COPY $vgpr0_vgpr1 + %1:vgpr_32 = COPY $vgpr2 + %3:sgpr_256 = IMPLICIT_DEF + %2:vreg_256_align2 = COPY %3:sgpr_256 + %4:vreg_128_align2 = IMAGE_SAMPLE_C_CL_O_V4_V8 %2, %3:sgpr_256, undef %5:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) + GLOBAL_STORE_DWORDX4 %0, %2.sub0_sub1_sub2_sub3, 0, 0, 0, 0, 0, implicit $exec + GLOBAL_STORE_DWORD %0, %1, 0, 0, 0, 0, 0, implicit $exec +... 
+ +--- +# GCN-LABEL: name: alloc_vgpr_512 +# GFX90A: $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19 = IMPLICIT_DEF +name: alloc_vgpr_512 +tracksRegLiveness: true +liveins: + - { reg: '$vgpr0_vgpr1' } + - { reg: '$vgpr2' } +body: | + bb.0: + liveins: $vgpr0_vgpr1, $vgpr2 + + %0:vreg_64_align2 = COPY $vgpr0_vgpr1 + %1:vgpr_32 = COPY $vgpr2 + %2:vreg_512_align2 = IMPLICIT_DEF + GLOBAL_STORE_DWORDX4 %0, %2.sub0_sub1_sub2_sub3, 0, 0, 0, 0, 0, implicit $exec + GLOBAL_STORE_DWORDX4 %0, %2.sub4_sub5_sub6_sub7, 0, 0, 0, 0, 0, implicit $exec + GLOBAL_STORE_DWORDX4 %0, %2.sub8_sub9_sub10_sub11, 0, 0, 0, 0, 0, implicit $exec + GLOBAL_STORE_DWORDX4 %0, %2.sub12_sub13_sub14_sub15, 0, 0, 0, 0, 0, implicit $exec + GLOBAL_STORE_DWORD %0, %1, 0, 0, 0, 0, 0, implicit $exec +... + +--- +# GCN-LABEL: name: alloc_vgpr_1024 +# GFX90A: $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35 = IMPLICIT_DEF +name: alloc_vgpr_1024 +tracksRegLiveness: true +liveins: + - { reg: '$vgpr0_vgpr1' } + - { reg: '$vgpr2' } +body: | + bb.0: + liveins: $vgpr0_vgpr1, $vgpr2 + + %0:vreg_64_align2 = COPY $vgpr0_vgpr1 + %1:vgpr_32 = COPY $vgpr2 + %2:vreg_1024_align2 = IMPLICIT_DEF + GLOBAL_STORE_DWORDX4 %0, %2.sub0_sub1_sub2_sub3, 0, 0, 0, 0, 0, implicit $exec + GLOBAL_STORE_DWORDX4 %0, %2.sub4_sub5_sub6_sub7, 0, 0, 0, 0, 0, implicit $exec + GLOBAL_STORE_DWORDX4 %0, %2.sub8_sub9_sub10_sub11, 0, 0, 0, 0, 0, implicit $exec + GLOBAL_STORE_DWORDX4 %0, %2.sub12_sub13_sub14_sub15, 0, 0, 0, 0, 0, implicit $exec + GLOBAL_STORE_DWORDX4 %0, %2.sub16_sub17_sub18_sub19, 0, 0, 0, 0, 0, implicit $exec + GLOBAL_STORE_DWORDX4 %0, %2.sub20_sub21_sub22_sub23, 0, 0, 0, 0, 0, implicit $exec + GLOBAL_STORE_DWORDX4 %0, %2.sub24_sub25_sub26_sub27, 0, 0, 0, 0, 0, implicit $exec + GLOBAL_STORE_DWORDX4 %0, %2.sub28_sub29_sub30_sub31, 0, 0, 0, 0, 0, implicit $exec + GLOBAL_STORE_DWORD %0, %1, 0, 0, 0, 0, 0, implicit $exec +... + +--- +# GCN-LABEL: name: alloc_agpr_64 +# GFX90A: $agpr1_agpr2 = IMPLICIT_DEF +name: alloc_agpr_64 +tracksRegLiveness: true +liveins: + - { reg: '$vgpr0_vgpr1' } + - { reg: '$agpr0' } +body: | + bb.0: + liveins: $vgpr0_vgpr1, $agpr0 + + %0:vreg_64_align2 = COPY $vgpr0_vgpr1 + %3:areg_64 = IMPLICIT_DEF + %2:vreg_64_align2 = COPY %3:areg_64 + GLOBAL_STORE_DWORDX2 %0, %2, 0, 0, 0, 0, 0, implicit $exec + %1:vgpr_32 = COPY $agpr0 + GLOBAL_STORE_DWORD %0, %1, 0, 0, 0, 0, 0, implicit $exec +... + +--- +# GCN-LABEL: name: alloc_agpr_128 +# GFX90A: $agpr1_agpr2_agpr3_agpr4 = IMPLICIT_DEF +name: alloc_agpr_128 +tracksRegLiveness: true +liveins: + - { reg: '$vgpr0_vgpr1' } + - { reg: '$agpr0' } +body: | + bb.0: + liveins: $vgpr0_vgpr1, $agpr0 + + %0:vreg_64_align2 = COPY $vgpr0_vgpr1 + %3:areg_128 = IMPLICIT_DEF + %2:vreg_128_align2 = COPY %3:areg_128 + GLOBAL_STORE_DWORDX4 %0, %2, 0, 0, 0, 0, 0, implicit $exec + %1:vgpr_32 = COPY $agpr0 + GLOBAL_STORE_DWORD %0, %1, 0, 0, 0, 0, 0, implicit $exec +... 
+ +--- +# GCN-LABEL: name: alloc_agpr_512 +# GFX90A: $agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16 = IMPLICIT_DEF +name: alloc_agpr_512 +tracksRegLiveness: true +liveins: + - { reg: '$vgpr0_vgpr1' } + - { reg: '$agpr0' } +body: | + bb.0: + liveins: $vgpr0_vgpr1, $agpr0 + + %0:vreg_64_align2 = COPY $vgpr0_vgpr1 + %3:areg_512 = IMPLICIT_DEF + %2:vreg_512_align2 = COPY %3:areg_512 + GLOBAL_STORE_DWORDX4 %0, %2.sub0_sub1_sub2_sub3, 0, 0, 0, 0, 0, implicit $exec + GLOBAL_STORE_DWORDX4 %0, %2.sub4_sub5_sub6_sub7, 0, 0, 0, 0, 0, implicit $exec + GLOBAL_STORE_DWORDX4 %0, %2.sub8_sub9_sub10_sub11, 0, 0, 0, 0, 0, implicit $exec + GLOBAL_STORE_DWORDX4 %0, %2.sub12_sub13_sub14_sub15, 0, 0, 0, 0, 0, implicit $exec + %1:vgpr_32 = COPY $agpr0 + GLOBAL_STORE_DWORD %0, %1, 0, 0, 0, 0, 0, implicit $exec +... + +--- +# GCN-LABEL: name: alloc_agpr_1024 +# GFX90A: $agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31_agpr32 = IMPLICIT_DEF +name: alloc_agpr_1024 +tracksRegLiveness: true +liveins: + - { reg: '$vgpr0_vgpr1' } + - { reg: '$agpr0' } +body: | + bb.0: + liveins: $vgpr0_vgpr1, $agpr0 + + %0:vreg_64_align2 = COPY $vgpr0_vgpr1 + %3:areg_1024 = IMPLICIT_DEF + %2:vreg_1024_align2 = COPY %3:areg_1024 + GLOBAL_STORE_DWORDX4 %0, %2.sub0_sub1_sub2_sub3, 0, 0, 0, 0, 0, implicit $exec + GLOBAL_STORE_DWORDX4 %0, %2.sub4_sub5_sub6_sub7, 0, 0, 0, 0, 0, implicit $exec + GLOBAL_STORE_DWORDX4 %0, %2.sub8_sub9_sub10_sub11, 0, 0, 0, 0, 0, implicit $exec + GLOBAL_STORE_DWORDX4 %0, %2.sub12_sub13_sub14_sub15, 0, 0, 0, 0, 0, implicit $exec + GLOBAL_STORE_DWORDX4 %0, %2.sub16_sub17_sub18_sub19, 0, 0, 0, 0, 0, implicit $exec + GLOBAL_STORE_DWORDX4 %0, %2.sub20_sub21_sub22_sub23, 0, 0, 0, 0, 0, implicit $exec + GLOBAL_STORE_DWORDX4 %0, %2.sub24_sub25_sub26_sub27, 0, 0, 0, 0, 0, implicit $exec + GLOBAL_STORE_DWORDX4 %0, %2.sub28_sub29_sub30_sub31, 0, 0, 0, 0, 0, implicit $exec + %1:vgpr_32 = COPY $agpr0 + GLOBAL_STORE_DWORD %0, %1, 0, 0, 0, 0, 0, implicit $exec +... Index: llvm/test/CodeGen/AMDGPU/dpp64_combine.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/dpp64_combine.mir +++ llvm/test/CodeGen/AMDGPU/dpp64_combine.mir @@ -2,50 +2,50 @@ --- # GCN-LABEL: name: dpp64_old_impdef -# GCN: %3:vreg_64 = V_CEIL_F64_dpp %1, 0, %0, 337, 15, 15, 1, implicit $mode, implicit $exec +# GCN: %3:vreg_64_align2 = V_CEIL_F64_dpp %1, 0, %0, 337, 15, 15, 1, implicit $mode, implicit $exec --- name: dpp64_old_impdef tracksRegLiveness: true body: | bb.0: - %0:vreg_64 = IMPLICIT_DEF - %1:vreg_64 = IMPLICIT_DEF - %2:vreg_64 = V_MOV_B64_DPP_PSEUDO %1:vreg_64, %0:vreg_64, 337, 15, 15, 1, implicit $exec - %3:vreg_64 = V_CEIL_F64_e32 %2, implicit $mode, implicit $exec + %0:vreg_64_align2 = IMPLICIT_DEF + %1:vreg_64_align2 = IMPLICIT_DEF + %2:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO %1, %0, 337, 15, 15, 1, implicit $exec + %3:vreg_64_align2 = V_CEIL_F64_e32 %2, implicit $mode, implicit $exec ... 
# GCN-LABEL: name: dpp64_old_undef -# GCN: %3:vreg_64 = V_CEIL_F64_dpp undef %1:vreg_64, 0, undef %2:vreg_64, 337, 15, 15, 1, implicit $mode, implicit $exec +# GCN: %3:vreg_64_align2 = V_CEIL_F64_dpp undef %1:vreg_64_align2, 0, undef %2:vreg_64_align2, 337, 15, 15, 1, implicit $mode, implicit $exec --- name: dpp64_old_undef tracksRegLiveness: true body: | bb.0: - %2:vreg_64 = V_MOV_B64_DPP_PSEUDO undef %1:vreg_64, undef %0:vreg_64, 337, 15, 15, 1, implicit $exec - %3:vreg_64 = V_CEIL_F64_e32 %2, implicit $mode, implicit $exec + %2:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO undef %1:vreg_64_align2, undef %0:vreg_64_align2, 337, 15, 15, 1, implicit $exec + %3:vreg_64_align2 = V_CEIL_F64_e32 %2, implicit $mode, implicit $exec ... # GCN-LABEL: name: dpp64_old_is_0 -# GCN: %3:vreg_64 = V_CEIL_F64_dpp %4, 0, undef %2:vreg_64, 337, 15, 15, 1, implicit $mode, implicit $exec +# GCN: %3:vreg_64_align2 = V_CEIL_F64_dpp %4, 0, undef %2:vreg_64_align2, 337, 15, 15, 1, implicit $mode, implicit $exec name: dpp64_old_is_0 tracksRegLiveness: true body: | bb.0: - %1:vreg_64 = V_MOV_B64_PSEUDO 0, implicit $exec - %2:vreg_64 = V_MOV_B64_DPP_PSEUDO undef %1, undef %0:vreg_64, 337, 15, 15, 1, implicit $exec - %3:vreg_64 = V_CEIL_F64_e32 %2, implicit $mode, implicit $exec + %1:vreg_64_align2 = V_MOV_B64_PSEUDO 0, implicit $exec + %2:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO undef %1, undef %0:vreg_64_align2, 337, 15, 15, 1, implicit $exec + %3:vreg_64_align2 = V_CEIL_F64_e32 %2, implicit $mode, implicit $exec ... # DPP64 does not support all control values and must be split to become legal # GCN-LABEL: name: dpp64_illegal_ctrl -# GCN: %4:vgpr_32 = V_MOV_B32_dpp undef %1.sub0:vreg_64, undef %2.sub0:vreg_64, 1, 15, 15, 1, implicit $exec -# GCN: %5:vgpr_32 = V_MOV_B32_dpp undef %1.sub1:vreg_64, undef %2.sub1:vreg_64, 1, 15, 15, 1, implicit $exec -# GCN: %0:vreg_64 = REG_SEQUENCE %4, %subreg.sub0, %5, %subreg.sub1 -# GCN: %3:vreg_64 = V_CEIL_F64_e32 %0, implicit $mode, implicit $exec +# GCN: %4:vgpr_32 = V_MOV_B32_dpp undef %1.sub0:vreg_64_align2, undef %2.sub0:vreg_64_align2, 1, 15, 15, 1, implicit $exec +# GCN: %5:vgpr_32 = V_MOV_B32_dpp undef %1.sub1:vreg_64_align2, undef %2.sub1:vreg_64_align2, 1, 15, 15, 1, implicit $exec +# GCN: %0:vreg_64_align2 = REG_SEQUENCE %4, %subreg.sub0, %5, %subreg.sub1 +# GCN: %3:vreg_64_align2 = V_CEIL_F64_e32 %0, implicit $mode, implicit $exec name: dpp64_illegal_ctrl tracksRegLiveness: true body: | bb.0: - %2:vreg_64 = V_MOV_B64_DPP_PSEUDO undef %1:vreg_64, undef %0:vreg_64, 1, 15, 15, 1, implicit $exec - %3:vreg_64 = V_CEIL_F64_e32 %2, implicit $mode, implicit $exec + %2:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO undef %1:vreg_64_align2, undef %0:vreg_64_align2, 1, 15, 15, 1, implicit $exec + %3:vreg_64_align2 = V_CEIL_F64_e32 %2, implicit $mode, implicit $exec ... 
Index: llvm/test/CodeGen/AMDGPU/inline-asm.i128.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/inline-asm.i128.ll
+++ llvm/test/CodeGen/AMDGPU/inline-asm.i128.ll
@@ -1,40 +1,59 @@
 ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -stop-after=finalize-isel -o - %s | FileCheck %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -stop-after=finalize-isel -o - %s | FileCheck -check-prefix=GFX908 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -stop-after=finalize-isel -o - %s | FileCheck -check-prefix=GFX90A %s
 ; Make sure we only use one 128-bit register instead of 2 for i128 asm
 ; constraints
 define amdgpu_kernel void @s_input_output_i128() {
-  ; CHECK-LABEL: name: s_input_output_i128
-  ; CHECK: bb.0 (%ir-block.0):
-  ; CHECK: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 4128778 /* regdef:SGPR_128 */, def %4
-  ; CHECK: [[COPY:%[0-9]+]]:sgpr_128 = COPY %4
-  ; CHECK: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 4128777 /* reguse:SGPR_128 */, [[COPY]]
-  ; CHECK: S_ENDPGM 0
+  ; GFX908-LABEL: name: s_input_output_i128
+  ; GFX908: bb.0 (%ir-block.0):
+  ; GFX908: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 4784138 /* regdef:SGPR_128 */, def %4
+  ; GFX908: [[COPY:%[0-9]+]]:sgpr_128 = COPY %4
+  ; GFX908: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 4784137 /* reguse:SGPR_128 */, [[COPY]]
+  ; GFX908: S_ENDPGM 0
+  ; GFX90A-LABEL: name: s_input_output_i128
+  ; GFX90A: bb.0 (%ir-block.0):
+  ; GFX90A: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 4784138 /* regdef:SGPR_128 */, def %4
+  ; GFX90A: [[COPY:%[0-9]+]]:sgpr_128 = COPY %4
+  ; GFX90A: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 4784137 /* reguse:SGPR_128 */, [[COPY]]
+  ; GFX90A: S_ENDPGM 0
   %val = tail call i128 asm sideeffect "; def $0", "=s"()
   call void asm sideeffect "; use $0", "s"(i128 %val)
   ret void
 }
 define amdgpu_kernel void @v_input_output_i128() {
-  ; CHECK-LABEL: name: v_input_output_i128
-  ; CHECK: bb.0 (%ir-block.0):
-  ; CHECK: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 3997706 /* regdef:VReg_128 */, def %4
-  ; CHECK: [[COPY:%[0-9]+]]:vreg_128 = COPY %4
-  ; CHECK: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 3997705 /* reguse:VReg_128 */, [[COPY]]
-  ; CHECK: S_ENDPGM 0
+  ; GFX908-LABEL: name: v_input_output_i128
+  ; GFX908: bb.0 (%ir-block.0):
+  ; GFX908: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 4390922 /* regdef:VReg_128 */, def %4
+  ; GFX908: [[COPY:%[0-9]+]]:vreg_128 = COPY %4
+  ; GFX908: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 4390921 /* reguse:VReg_128 */, [[COPY]]
+  ; GFX908: S_ENDPGM 0
+  ; GFX90A-LABEL: name: v_input_output_i128
+  ; GFX90A: bb.0 (%ir-block.0):
+  ; GFX90A: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 4521994 /* regdef:VReg_128_Align2 */, def %4
+  ; GFX90A: [[COPY:%[0-9]+]]:vreg_128_align2 = COPY %4
+  ; GFX90A: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 4521993 /* reguse:VReg_128_Align2 */, [[COPY]]
+  ; GFX90A: S_ENDPGM 0
   %val = tail call i128 asm sideeffect "; def $0", "=v"()
   call void asm sideeffect "; use $0", "v"(i128 %val)
   ret void
 }
 define amdgpu_kernel void @a_input_output_i128() {
-  ; CHECK-LABEL: name: a_input_output_i128
-  ; CHECK: bb.0 (%ir-block.0):
-  ; CHECK: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 3932170 /* regdef:AReg_128 */, def %4
-  ; CHECK: [[COPY:%[0-9]+]]:areg_128 = COPY %4
-  ; CHECK: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 3932169 /* reguse:AReg_128 */, [[COPY]]
-  ; CHECK: S_ENDPGM 0
+  ; GFX908-LABEL: name: a_input_output_i128
+  ; GFX908: bb.0 (%ir-block.0):
+  ; GFX908: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 4325386 /* regdef:AReg_128 */, def %4
+  ; GFX908: [[COPY:%[0-9]+]]:areg_128 = COPY %4
+  ; GFX908: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 4325385 /* reguse:AReg_128 */, [[COPY]]
+  ; GFX908: S_ENDPGM 0
+  ; GFX90A-LABEL: name: a_input_output_i128
+  ; GFX90A: bb.0 (%ir-block.0):
+  ; GFX90A: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 4456458 /* regdef:AReg_128_Align2 */, def %4
+  ; GFX90A: [[COPY:%[0-9]+]]:areg_128_align2 = COPY %4
+  ; GFX90A: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 4456457 /* reguse:AReg_128_Align2 */, [[COPY]]
+  ; GFX90A: S_ENDPGM 0
   %val = call i128 asm sideeffect "; def $0", "=a"()
   call void asm sideeffect "; use $0", "a"(i128 %val)
   ret void
Index: llvm/test/CodeGen/AMDGPU/merge-load-store-agpr.mir
===================================================================
--- llvm/test/CodeGen/AMDGPU/merge-load-store-agpr.mir
+++ llvm/test/CodeGen/AMDGPU/merge-load-store-agpr.mir
@@ -1,7 +1,7 @@
 # RUN: llc -march=amdgcn -mcpu=gfx90a -verify-machineinstrs -run-pass si-load-store-opt -o - %s | FileCheck -enable-var-scope -check-prefix=GCN %s
 # GCN-LABEL: name: ds_read_b32_v_v
-# GCN: vreg_64 = DS_READ2_B32
+# GCN: vreg_64_align2 = DS_READ2_B32
 name: ds_read_b32_v_v
 body: |
   bb.0:
@@ -12,7 +12,7 @@
 ...
 # GCN-LABEL: name: ds_read_b32_a_a
-# GCN: areg_64 = DS_READ2_B32
+# GCN: areg_64_align2 = DS_READ2_B32
 name: ds_read_b32_a_a
 body: |
   bb.0:
Index: llvm/test/CodeGen/AMDGPU/reserved-reg-in-clause.mir
===================================================================
--- llvm/test/CodeGen/AMDGPU/reserved-reg-in-clause.mir
+++ llvm/test/CodeGen/AMDGPU/reserved-reg-in-clause.mir
@@ -1,28 +1,27 @@
 # RUN: llc -march=amdgcn -mcpu=gfx90a -verify-machineinstrs -run-pass si-form-memory-clauses %s -o - | FileCheck -check-prefix=GCN %s
 # Make sure we do not produce early-clobber list with odd subregs.
-# Odd vector subregs are reserved on gfx90a and verifier complaints after RA.
 # GCN-LABEL: name: long_reg_clause
-# GCN: dead early-clobber %2.sub0_sub1_sub2_sub3:areg_512, undef early-clobber %4.sub8_sub9_sub10_sub11_sub12_sub13_sub14_sub15:areg_512, dead early-clobber %3:areg_512 = BUNDLE %0, implicit $exec {
+# GCN: dead early-clobber %2.sub0_sub1_sub2_sub3:areg_512_align2, undef early-clobber %4.sub8_sub9_sub10_sub11_sub12_sub13_sub14_sub15:areg_512_align2, dead early-clobber %3:areg_512_align2 = BUNDLE %0, implicit $exec {
 ---
 name: long_reg_clause
 body: |
   bb.0.entry:
-    %0:vreg_64 = IMPLICIT_DEF
-    undef %1.sub12_sub13_sub14_sub15:areg_512 = GLOBAL_LOAD_DWORDX4 %0, -208, 0, 0, 0, 0, implicit $exec
-    %1.sub8_sub9_sub10_sub11:areg_512 = GLOBAL_LOAD_DWORDX4 %0, -224, 0, 0, 0, 0, implicit $exec
-    %1.sub4_sub5_sub6_sub7:areg_512 = GLOBAL_LOAD_DWORDX4 %0, -240, 0, 0, 0, 0, implicit $exec
-    dead %1.sub0_sub1_sub2_sub3:areg_512 = GLOBAL_LOAD_DWORDX4 %0, -256, 0, 0, 0, 0, implicit $exec
-    undef %2.sub12_sub13_sub14_sub15:areg_512 = GLOBAL_LOAD_DWORDX4 %0, -80, 0, 0, 0, 0, implicit $exec
-    %2.sub8_sub9_sub10_sub11:areg_512 = GLOBAL_LOAD_DWORDX4 %0, -96, 0, 0, 0, 0, implicit $exec
-    %2.sub4_sub5_sub6_sub7:areg_512 = GLOBAL_LOAD_DWORDX4 %0, -112, 0, 0, 0, 0, implicit $exec
-    dead %2.sub0_sub1_sub2_sub3:areg_512 = GLOBAL_LOAD_DWORDX4 %0, -128, 0, 0, 0, 0, implicit $exec
-    undef %3.sub12_sub13_sub14_sub15:areg_512 = GLOBAL_LOAD_DWORDX4 %0, 48, 0, 0, 0, 0, implicit $exec
-    %3.sub8_sub9_sub10_sub11:areg_512 = GLOBAL_LOAD_DWORDX4 %0, 32, 0, 0, 0, 0, implicit $exec
-    %3.sub4_sub5_sub6_sub7:areg_512 = GLOBAL_LOAD_DWORDX4 %0, 16, 0, 0, 0, 0, implicit $exec
-    dead %3.sub0_sub1_sub2_sub3:areg_512 = GLOBAL_LOAD_DWORDX4 %0, 0, 0, 0, 0, 0, implicit $exec
-    undef %4.sub12_sub13_sub14_sub15:areg_512 = GLOBAL_LOAD_DWORDX4 %0, 176, 0, 0, 0, 0, implicit $exec
-    %4.sub8_sub9_sub10_sub11:areg_512 = GLOBAL_LOAD_DWORDX4 %0, 160, 0, 0, 0, 0, implicit $exec
-    %4.sub4_sub5_sub6_sub7:areg_512 = GLOBAL_LOAD_DWORDX4 %0, 144, 0, 0, 0, 0, implicit $exec
+    %0:vreg_64_align2 = IMPLICIT_DEF
+    undef %1.sub12_sub13_sub14_sub15:areg_512_align2 = GLOBAL_LOAD_DWORDX4 %0, -208, 0, 0, 0, 0, implicit $exec
+    %1.sub8_sub9_sub10_sub11:areg_512_align2 = GLOBAL_LOAD_DWORDX4 %0, -224, 0, 0, 0, 0, implicit $exec
+    %1.sub4_sub5_sub6_sub7:areg_512_align2 = GLOBAL_LOAD_DWORDX4 %0, -240, 0, 0, 0, 0, implicit $exec
+    dead %1.sub0_sub1_sub2_sub3:areg_512_align2 = GLOBAL_LOAD_DWORDX4 %0, -256, 0, 0, 0, 0, implicit $exec
+    undef %2.sub12_sub13_sub14_sub15:areg_512_align2 = GLOBAL_LOAD_DWORDX4 %0, -80, 0, 0, 0, 0, implicit $exec
+    %2.sub8_sub9_sub10_sub11:areg_512_align2 = GLOBAL_LOAD_DWORDX4 %0, -96, 0, 0, 0, 0, implicit $exec
+    %2.sub4_sub5_sub6_sub7:areg_512_align2 = GLOBAL_LOAD_DWORDX4 %0, -112, 0, 0, 0, 0, implicit $exec
+    dead %2.sub0_sub1_sub2_sub3:areg_512_align2 = GLOBAL_LOAD_DWORDX4 %0, -128, 0, 0, 0, 0, implicit $exec
+    undef %3.sub12_sub13_sub14_sub15:areg_512_align2 = GLOBAL_LOAD_DWORDX4 %0, 48, 0, 0, 0, 0, implicit $exec
+    %3.sub8_sub9_sub10_sub11:areg_512_align2 = GLOBAL_LOAD_DWORDX4 %0, 32, 0, 0, 0, 0, implicit $exec
+    %3.sub4_sub5_sub6_sub7:areg_512_align2 = GLOBAL_LOAD_DWORDX4 %0, 16, 0, 0, 0, 0, implicit $exec
+    dead %3.sub0_sub1_sub2_sub3:areg_512_align2 = GLOBAL_LOAD_DWORDX4 %0, 0, 0, 0, 0, 0, implicit $exec
+    undef %4.sub12_sub13_sub14_sub15:areg_512_align2 = GLOBAL_LOAD_DWORDX4 %0, 176, 0, 0, 0, 0, implicit $exec
+    %4.sub8_sub9_sub10_sub11:areg_512_align2 = GLOBAL_LOAD_DWORDX4 %0, 160, 0, 0, 0, 0, implicit $exec
+    %4.sub4_sub5_sub6_sub7:areg_512_align2 = GLOBAL_LOAD_DWORDX4 %0, 144, 0, 0, 0, 0, implicit $exec
 ...
Index: llvm/test/CodeGen/AMDGPU/twoaddr-fma-f64.mir
===================================================================
--- llvm/test/CodeGen/AMDGPU/twoaddr-fma-f64.mir
+++ llvm/test/CodeGen/AMDGPU/twoaddr-fma-f64.mir
@@ -5,10 +5,10 @@
 ---
 name: test_fmamk_reg_imm_f64
 registers:
-  - { id: 0, class: vreg_64 }
-  - { id: 1, class: vreg_64 }
-  - { id: 2, class: vreg_64 }
-  - { id: 3, class: vreg_64 }
+  - { id: 0, class: vreg_64_align2 }
+  - { id: 1, class: vreg_64_align2 }
+  - { id: 2, class: vreg_64_align2 }
+  - { id: 3, class: vreg_64_align2 }
 body: |
   bb.0:
@@ -24,10 +24,10 @@
 ---
 name: test_fmamk_imm_reg_f64
 registers:
-  - { id: 0, class: vreg_128 }
-  - { id: 1, class: vreg_64 }
-  - { id: 2, class: vreg_64 }
-  - { id: 3, class: vreg_64 }
+  - { id: 0, class: vreg_128_align2 }
+  - { id: 1, class: vreg_64_align2 }
+  - { id: 2, class: vreg_64_align2 }
+  - { id: 3, class: vreg_64_align2 }
 body: |
   bb.0:
@@ -43,9 +43,9 @@
 ---
 name: test_fmaak_f64
 registers:
-  - { id: 0, class: vreg_128 }
-  - { id: 1, class: vreg_64 }
-  - { id: 2, class: vreg_64 }
+  - { id: 0, class: vreg_128_align2 }
+  - { id: 1, class: vreg_64_align2 }
+  - { id: 2, class: vreg_64_align2 }
 body: |
   bb.0:
@@ -56,15 +56,15 @@
 ...
 # GCN-LABEL: name: test_fmaak_sgpr_src0_f64
-# GCN: V_FMA_F64_e64 0, killed %0, 0, %1, 0, %2:vreg_64, 0, 0, implicit $mode, implicit $exec
+# GCN: V_FMA_F64_e64 0, killed %0, 0, %1, 0, %2:vreg_64_align2, 0, 0, implicit $mode, implicit $exec
 ---
 name: test_fmaak_sgpr_src0_f64
 registers:
   - { id: 0, class: sreg_64 }
-  - { id: 1, class: vreg_64 }
-  - { id: 2, class: vreg_64 }
-  - { id: 3, class: vreg_64 }
+  - { id: 1, class: vreg_64_align2 }
+  - { id: 2, class: vreg_64_align2 }
+  - { id: 3, class: vreg_64_align2 }
 body: |
   bb.0:
@@ -75,14 +75,14 @@
 ...
 # GCN-LABEL: name: test_fmaak_inlineimm_src0_f64
-# GCN: V_FMA_F64_e64 0, 4611686018427387904, 0, %0, 0, %1:vreg_64, 0, 0, implicit $mode, implicit $exec
+# GCN: V_FMA_F64_e64 0, 4611686018427387904, 0, %0, 0, %1:vreg_64_align2, 0, 0, implicit $mode, implicit $exec
 ---
 name: test_fmaak_inlineimm_src0_f64
 registers:
-  - { id: 0, class: vreg_64 }
-  - { id: 1, class: vreg_64 }
-  - { id: 2, class: vreg_64 }
+  - { id: 0, class: vreg_64_align2 }
+  - { id: 1, class: vreg_64_align2 }
+  - { id: 2, class: vreg_64_align2 }
 body: |
   bb.0:
@@ -92,14 +92,14 @@
 ...
 # GCN-LABEL: name: test_fmaak_otherimm_src0_f64
-# GCN: V_FMA_F64_e64 0, 4611686018427387904, 0, %0, 0, %1:vreg_64, 0, 0, implicit $mode, implicit $exec
+# GCN: V_FMA_F64_e64 0, 4611686018427387904, 0, %0, 0, %1:vreg_64_align2, 0, 0, implicit $mode, implicit $exec
 ---
 name: test_fmaak_otherimm_src0_f64
 registers:
-  - { id: 0, class: vreg_64 }
-  - { id: 1, class: vreg_64 }
-  - { id: 2, class: vreg_64 }
+  - { id: 0, class: vreg_64_align2 }
+  - { id: 1, class: vreg_64_align2 }
+  - { id: 2, class: vreg_64_align2 }
 body: |
   bb.0:
@@ -113,9 +113,9 @@
 ---
 name: test_fmaak_other_constantlike_src0_f64
 registers:
-  - { id: 0, class: vreg_64 }
-  - { id: 1, class: vreg_64 }
-  - { id: 2, class: vreg_64 }
+  - { id: 0, class: vreg_64_align2 }
+  - { id: 1, class: vreg_64_align2 }
+  - { id: 2, class: vreg_64_align2 }
 stack:
   - { id: 0, name: "", type: default, offset: 0, size: 128, alignment: 8,
       callee-saved-register: '', local-offset: 0, debug-info-variable: '',
@@ -133,10 +133,10 @@
 ---
 name: test_fmamk_reg_unfoldable_literal_src0_f64
 registers:
-  - { id: 0, class: vreg_64 }
-  - { id: 1, class: vreg_64 }
-  - { id: 2, class: vreg_64 }
-  - { id: 3, class: vreg_64 }
+  - { id: 0, class: vreg_64_align2 }
+  - { id: 1, class: vreg_64_align2 }
+  - { id: 2, class: vreg_64_align2 }
+  - { id: 3, class: vreg_64_align2 }
 body: |
   bb.0:
@@ -152,10 +152,10 @@
 ---
 name: test_fmamk_reg_unfoldable_literal_src1_f64
 registers:
-  - { id: 0, class: vreg_64 }
-  - { id: 1, class: vreg_64 }
-  - { id: 2, class: vreg_64 }
-  - { id: 3, class: vreg_64 }
+  - { id: 0, class: vreg_64_align2 }
+  - { id: 1, class: vreg_64_align2 }
+  - { id: 2, class: vreg_64_align2 }
+  - { id: 3, class: vreg_64_align2 }
 body: |
   bb.0:
@@ -171,10 +171,10 @@
 ---
 name: test_fmaak_reg_unfoldable_literal_src2_f64
 registers:
-  - { id: 0, class: vreg_64 }
-  - { id: 1, class: vreg_64 }
-  - { id: 2, class: vreg_64 }
-  - { id: 3, class: vreg_64 }
+  - { id: 0, class: vreg_64_align2 }
+  - { id: 1, class: vreg_64_align2 }
+  - { id: 2, class: vreg_64_align2 }
+  - { id: 3, class: vreg_64_align2 }
 body: |
   bb.0:
Index: llvm/test/CodeGen/AMDGPU/verify-gfx90a-aligned-vgprs.mir
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/AMDGPU/verify-gfx90a-aligned-vgprs.mir
@@ -0,0 +1,121 @@
+# RUN: not --crash llc -march=amdgcn -mcpu=gfx90a -run-pass=machineverifier -o /dev/null %s 2>&1 | FileCheck %s
+
+# Implicit uses are OK.
+---
+name: implicit_use
+body: |
+  bb.0:
+    $vgpr1_vgpr2 = IMPLICIT_DEF
+    S_NOP 0, implicit $vgpr1_vgpr2
+    %0:vreg_64 = IMPLICIT_DEF
+    S_NOP 0, implicit %0
+
+    %1:sreg_64_xexec = IMPLICIT_DEF
+    %2:sreg_64_xexec = SI_CALL %1, 0, csr_amdgpu_highregs, implicit $vgpr1_vgpr2
+
+    ; noreg is OK
+    DS_WRITE_B64_gfx9 $noreg, $noreg, 0, 0, implicit $exec
+...
+
+# The unaligned registers are allowed to exist, just not on any tuple instructions.
+
+---
+name: copy_like_generic
+body: |
+  bb.0:
+    $vgpr1_vgpr2 = IMPLICIT_DEF
+    $vgpr3_vgpr4 = COPY $vgpr1_vgpr2
+    %0:vreg_64 = IMPLICIT_DEF
+    %1:vreg_64 = COPY %0
+...
+
+---
+name: mov_32_unaligned_super
+body: |
+  bb.0:
+    undef %0.sub1:vreg_64 = V_MOV_B32_e32 0, implicit $exec
+    %1:vgpr_32 = V_MOV_B32_e32 undef %2.sub1:vreg_64, implicit $exec
+...
+
+# Well-aligned subregister indexes are OK
+---
+name: aligned_sub_reg
+body: |
+  bb.0:
+    %0:vreg_64_align2 = IMPLICIT_DEF
+    %1:vreg_128_align2 = IMPLICIT_DEF
+    GLOBAL_STORE_DWORDX2 %0, %1.sub0_sub1, 0, 0, 0, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORDX2 %0, %1.sub2_sub3, 0, 0, 0, 0, 0, implicit $exec
+...
+
+---
+name: unaligned_registers
+body: |
+  bb.0:
+    liveins: $vgpr0_vgpr1, $vgpr3_vgpr4_vgpr5_vgpr6
+    %0:vreg_64_align2 = IMPLICIT_DEF
+    %1:vreg_64 = IMPLICIT_DEF
+    %2:vreg_96 = IMPLICIT_DEF
+    %3:vreg_128 = IMPLICIT_DEF
+    %4:areg_64 = IMPLICIT_DEF
+    %5:vreg_128_align2 = IMPLICIT_DEF
+
+    ; Check virtual register uses
+    ; CHECK: *** Bad machine code: Subtarget requires even aligned vector registers ***
+    ; CHECK: *** Bad machine code: Subtarget requires even aligned vector registers ***
+    ; CHECK: *** Bad machine code: Subtarget requires even aligned vector registers ***
+    GLOBAL_STORE_DWORDX2 %0, %1, 0, 0, 0, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORDX3 %0, %2, 0, 0, 0, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORDX4 %0, %3, 0, 0, 0, 0, 0, implicit $exec
+
+    ; Check virtual registers with subregisters
+    ; CHECK: *** Bad machine code: Subtarget requires even aligned vector registers ***
+    ; CHECK: *** Bad machine code: Subtarget requires even aligned vector registers ***
+    ; CHECK: *** Bad machine code: Subtarget requires even aligned vector registers ***
+    ; CHECK: *** Bad machine code: Subtarget requires even aligned vector registers ***
+    GLOBAL_STORE_DWORDX2 %0, %3.sub0_sub1, 0, 0, 0, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORDX2 %0, %3.sub2_sub3, 0, 0, 0, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORDX2 %0, %3.sub1_sub2, 0, 0, 0, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORDX2 %0, %5.sub1_sub2, 0, 0, 0, 0, 0, implicit $exec
+
+    ; Check physical register uses
+    ; CHECK: *** Bad machine code: Subtarget requires even aligned vector registers ***
+    ; CHECK: *** Bad machine code: Subtarget requires even aligned vector registers ***
+    ; CHECK: *** Bad machine code: Subtarget requires even aligned vector registers ***
+    GLOBAL_STORE_DWORDX2 $vgpr0_vgpr1, $vgpr3_vgpr4, 0, 0, 0, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORDX3 $vgpr0_vgpr1, $vgpr3_vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORDX4 $vgpr0_vgpr1, $vgpr3_vgpr4_vgpr5_vgpr6, 0, 0, 0, 0, 0, implicit $exec
+
+    ; Check virtual register defs
+    ; CHECK: *** Bad machine code: Subtarget requires even aligned vector registers ***
+    ; CHECK: *** Bad machine code: Subtarget requires even aligned vector registers ***
+    ; CHECK: *** Bad machine code: Subtarget requires even aligned vector registers ***
+    %6:vreg_64 = GLOBAL_LOAD_DWORDX2 %0, 0, 0, 0, 0, 0, implicit $exec
+    %7:vreg_96 = GLOBAL_LOAD_DWORDX3 %0, 0, 0, 0, 0, 0, implicit $exec
+    %8:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 0, 0, 0, 0, 0, implicit $exec
+
+    ; CHECK: *** Bad machine code: Subtarget requires even aligned vector registers ***
+    ; CHECK: *** Bad machine code: Subtarget requires even aligned vector registers ***
+    ; CHECK: *** Bad machine code: Subtarget requires even aligned vector registers ***
+    $vgpr1_vgpr2 = GLOBAL_LOAD_DWORDX2 %0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr1_vgpr2_vgpr3 = GLOBAL_LOAD_DWORDX3 %0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr1_vgpr2_vgpr3_vgpr4 = GLOBAL_LOAD_DWORDX4 %0, 0, 0, 0, 0, 0, implicit $exec
+
+    ; Check AGPRs
+    ; CHECK: *** Bad machine code: Subtarget requires even aligned vector registers ***
+    ; CHECK: *** Bad machine code: Subtarget requires even aligned vector registers ***
+    %9:vgpr_32 = IMPLICIT_DEF
+    %10:areg_64 = IMPLICIT_DEF
+    %11:areg_128_align2 = IMPLICIT_DEF
+    DS_WRITE_B64_gfx9 %9, %10, 0, 0, implicit $exec
+    DS_WRITE_B64_gfx9 %9, %11.sub1_sub2, 0, 0, implicit $exec
+...
+
+# FIXME: Inline asm is not verified
+# ; Check inline asm
+# ; XCHECK: *** Bad machine code: Subtarget requires even aligned vector registers ***
+# ; XCHECK: *** Bad machine code: Subtarget requires even aligned vector registers ***
+# ; XCHECK: *** Bad machine code: Subtarget requires even aligned vector registers ***
+# INLINEASM &"; use $0 ", 1 /* sideeffect attdialect */, 9 /* reguse */, $vgpr1_vgpr2
+# INLINEASM &"; use $0 ", 1 /* sideeffect attdialect */, 9 /* reguse */, %4
+# INLINEASM &"; use $0 ", 1 /* sideeffect attdialect */, 9 /* reguse */, %5.sub1_sub2