diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
@@ -710,8 +710,6 @@
     TLI.allocateSystemSGPRs(CCInfo, MF, *Info, CC, IsShader);
   } else {
     CCInfo.AllocateReg(Info->getScratchRSrcReg());
-    CCInfo.AllocateReg(Info->getScratchWaveOffsetReg());
-    CCInfo.AllocateReg(Info->getFrameOffsetReg());
     TLI.allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info);
   }
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -1478,6 +1478,7 @@
 }
 
 std::pair<SDValue, SDValue> AMDGPUDAGToDAGISel::foldFrameIndex(SDValue N) const {
+  SDLoc DL(N);
   const MachineFunction &MF = CurDAG->getMachineFunction();
   const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
 
@@ -1486,15 +1487,16 @@
                                               FI->getValueType(0));
 
     // If we can resolve this to a frame index access, this will be relative to
-    // either the stack or frame pointer SGPR.
+    // either the stack or frame pointer SGPR, or 0 in a kernel.
    return std::make_pair(
-        TFI, CurDAG->getRegister(Info->getStackPtrOffsetReg(), MVT::i32));
+        TFI, Info->isEntryFunction()
+                 ? CurDAG->getTargetConstant(0, DL, MVT::i32)
+                 : CurDAG->getRegister(Info->getStackPtrOffsetReg(), MVT::i32));
   }
 
   // If we don't know this private access is a local stack object, it needs to
-  // be relative to the entry point's scratch wave offset register.
-  return std::make_pair(N, CurDAG->getRegister(Info->getScratchWaveOffsetReg(),
-                                               MVT::i32));
+  // be relative to the entry point's scratch wave offset.
+  return std::make_pair(N, CurDAG->getTargetConstant(0, DL, MVT::i32));
 }
 
 bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffen(SDNode *Parent,
@@ -1519,10 +1521,10 @@
   // In a call sequence, stores to the argument stack area are relative to the
   // stack pointer.
   const MachinePointerInfo &PtrInfo = cast<MemSDNode>(Parent)->getPointerInfo();
-  unsigned SOffsetReg = isStackPtrRelative(PtrInfo) ?
-    Info->getStackPtrOffsetReg() : Info->getScratchWaveOffsetReg();
-  SOffset = CurDAG->getRegister(SOffsetReg, MVT::i32);
+  SOffset = isStackPtrRelative(PtrInfo)
+                ? CurDAG->getRegister(Info->getStackPtrOffsetReg(), MVT::i32)
+                : CurDAG->getTargetConstant(0, DL, MVT::i32);
   ImmOffset = CurDAG->getTargetConstant(Imm & 4095, DL, MVT::i16);
   return true;
 }
@@ -1580,12 +1582,12 @@
   SRsrc = CurDAG->getRegister(Info->getScratchRSrcReg(), MVT::v4i32);
 
   const MachinePointerInfo &PtrInfo = cast<MemSDNode>(Parent)->getPointerInfo();
-  unsigned SOffsetReg = isStackPtrRelative(PtrInfo) ?
-    Info->getStackPtrOffsetReg() : Info->getScratchWaveOffsetReg();
 
   // FIXME: Get from MachinePointerInfo? We should only be using the frame
   // offset if we know this is in a call sequence.
-  SOffset = CurDAG->getRegister(SOffsetReg, MVT::i32);
+  SOffset = isStackPtrRelative(PtrInfo)
+                ? CurDAG->getRegister(Info->getStackPtrOffsetReg(), MVT::i32)
+                : CurDAG->getTargetConstant(0, DL, MVT::i32);
   Offset = CurDAG->getTargetConstant(CAddr->getZExtValue(), DL, MVT::i16);
 
   return true;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -2694,10 +2694,10 @@
        const MachineMemOperand *MMO = *MI->memoperands_begin();
        const MachinePointerInfo &PtrInfo = MMO->getPointerInfo();
 
-       Register SOffsetReg = isStackPtrRelative(PtrInfo)
-                                 ? Info->getStackPtrOffsetReg()
-                                 : Info->getScratchWaveOffsetReg();
-       MIB.addReg(SOffsetReg);
+       if (isStackPtrRelative(PtrInfo))
+         MIB.addReg(Info->getStackPtrOffsetReg());
+       else
+         MIB.addImm(0);
      },
      [=](MachineInstrBuilder &MIB) { // offset
        MIB.addImm(Offset & 4095);
@@ -2734,13 +2734,6 @@
     }
   }
 
-  // If we don't know this private access is a local stack object, it needs to
-  // be relative to the entry point's scratch wave offset register.
-  // TODO: Should split large offsets that don't fit like above.
-  // TODO: Don't use scratch wave offset just because the offset didn't fit.
-  Register SOffset = FI.hasValue() ? Info->getStackPtrOffsetReg()
-                                   : Info->getScratchWaveOffsetReg();
-
   return {{[=](MachineInstrBuilder &MIB) { // rsrc
              MIB.addReg(Info->getScratchRSrcReg());
            },
@@ -2751,7 +2744,15 @@
              MIB.addReg(VAddr);
            },
            [=](MachineInstrBuilder &MIB) { // soffset
-             MIB.addReg(SOffset);
+             // If we don't know this private access is a local stack object, it
+             // needs to be relative to the entry point's scratch wave offset.
+             // TODO: Should split large offsets that don't fit like above.
+             // TODO: Don't use scratch wave offset just because the offset
+             // didn't fit.
+             if (!Info->isEntryFunction() && FI.hasValue())
+               MIB.addReg(Info->getStackPtrOffsetReg());
+             else
+               MIB.addImm(0);
            },
            [=](MachineInstrBuilder &MIB) { // offset
              MIB.addImm(Offset);
@@ -2789,15 +2790,17 @@
   const MachineMemOperand *MMO = *MI->memoperands_begin();
   const MachinePointerInfo &PtrInfo = MMO->getPointerInfo();
 
-  Register SOffsetReg = isStackPtrRelative(PtrInfo)
-                            ? Info->getStackPtrOffsetReg()
-                            : Info->getScratchWaveOffsetReg();
   return {{
-      [=](MachineInstrBuilder &MIB) {
+      [=](MachineInstrBuilder &MIB) { // rsrc
         MIB.addReg(Info->getScratchRSrcReg());
-      }, // rsrc
-      [=](MachineInstrBuilder &MIB) { MIB.addReg(SOffsetReg); }, // soffset
-      [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); } // offset
+      },
+      [=](MachineInstrBuilder &MIB) { // soffset
+        if (isStackPtrRelative(PtrInfo))
+          MIB.addReg(Info->getStackPtrOffsetReg());
+        else
+          MIB.addImm(0);
+      },
+      [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); } // offset
   }};
 }
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -1065,7 +1065,6 @@
   };
 
   if (parseRegister(YamlMFI.ScratchRSrcReg, MFI->ScratchRSrcReg) ||
-      parseRegister(YamlMFI.ScratchWaveOffsetReg, MFI->ScratchWaveOffsetReg) ||
       parseRegister(YamlMFI.FrameOffsetReg, MFI->FrameOffsetReg) ||
       parseRegister(YamlMFI.StackPtrOffsetReg, MFI->StackPtrOffsetReg))
     return true;
@@ -1075,11 +1074,6 @@
     return diagnoseRegisterClass(YamlMFI.ScratchRSrcReg);
   }
 
-  if (MFI->ScratchWaveOffsetReg != AMDGPU::SCRATCH_WAVE_OFFSET_REG &&
-      !AMDGPU::SGPR_32RegClass.contains(MFI->ScratchWaveOffsetReg)) {
-    return diagnoseRegisterClass(YamlMFI.ScratchWaveOffsetReg);
-  }
-
   if (MFI->FrameOffsetReg != AMDGPU::FP_REG &&
       !AMDGPU::SGPR_32RegClass.contains(MFI->FrameOffsetReg)) {
     return diagnoseRegisterClass(YamlMFI.FrameOffsetReg);
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
@@ -296,7 +296,6 @@
   switch (RegNo) {
   case AMDGPU::FP_REG:
   case AMDGPU::SP_REG:
-  case AMDGPU::SCRATCH_WAVE_OFFSET_REG:
   case AMDGPU::PRIVATE_RSRC_REG:
     llvm_unreachable("pseudo-register should not ever be emitted");
   case AMDGPU::SCC:
diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
--- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -610,21 +610,35 @@
     return;
 
   if (frameIndexMayFold(TII, *UseMI, UseOpIdx, OpToFold)) {
-    // Sanity check that this is a stack access.
+    // Sanity check that this is a stack access. For both kernels and
+    // non-kernel functions this means the SRSRC is the stack SRSRC. For
+    // kernels the SOffset is always 0 because the scratch wave offset is
+    // already included in the scratch SRSRC, so there is no SP/FP. For
+    // non-kernel functions SOffset is either the StackPtrOffsetReg or 0 (in
+    // which case we must update it to the StackPtrOffsetReg when folding).
     // FIXME: Should probably use stack pseudos before frame lowering.
-    MachineOperand *SOff = TII->getNamedOperand(*UseMI, AMDGPU::OpName::soffset);
-    if (!SOff->isReg() || (SOff->getReg() != MFI->getScratchWaveOffsetReg() &&
-                           SOff->getReg() != MFI->getStackPtrOffsetReg()))
-      return;
 
     if (TII->getNamedOperand(*UseMI, AMDGPU::OpName::srsrc)->getReg() !=
         MFI->getScratchRSrcReg())
       return;
 
+    MachineOperand &SOff =
+        *TII->getNamedOperand(*UseMI, AMDGPU::OpName::soffset);
+    if (MFI->isEntryFunction()) {
+      if (!SOff.isImm() || SOff.getImm() != 0)
+        return;
+    } else {
+      if (!((SOff.isReg() && SOff.getReg() == MFI->getStackPtrOffsetReg()) ||
+            (SOff.isImm() && SOff.getImm() == 0)))
+        return;
+    }
+
     // A frame index will resolve to a positive constant, so it should always be
     // safe to fold the addressing mode, even pre-GFX9.
     UseMI->getOperand(UseOpIdx).ChangeToFrameIndex(OpToFold.getIndex());
-    SOff->setReg(MFI->getStackPtrOffsetReg());
+
+    if (!MFI->isEntryFunction() && SOff.isImm())
+      SOff.ChangeToRegister(MFI->getStackPtrOffsetReg(), false);
+
     return;
   }
diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.h b/llvm/lib/Target/AMDGPU/SIFrameLowering.h
--- a/llvm/lib/Target/AMDGPU/SIFrameLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.h
@@ -61,17 +61,15 @@
                                         const DebugLoc &DL,
                                         Register ScratchWaveOffsetReg) const;
 
-  Register getEntryFunctionReservedScratchRsrcReg(MachineFunction &MF) const;
-
   Register
-  getEntryFunctionReservedScratchWaveOffsetReg(MachineFunction &MF) const;
-
-  void emitEntryFunctionScratchRsrcRegSetup(MachineFunction &MF,
-                                            MachineBasicBlock &MBB,
-                                            MachineBasicBlock::iterator I,
-                                            const DebugLoc &DL,
-                                            Register PreloadedPrivateBufferReg,
-                                            Register ScratchRsrcReg) const;
+  getEntryFunctionReservedScratchRsrcReg(MachineFunction &MF,
+                                         Register ScratchWaveOffsetReg) const;
+
+  void emitEntryFunctionScratchRsrcRegSetup(
+      MachineFunction &MF, MachineBasicBlock &MBB,
+      MachineBasicBlock::iterator I, const DebugLoc &DL,
+      Register PreloadedPrivateBufferReg, Register ScratchRsrcReg,
+      Register ScratchWaveOffsetReg) const;
 
 public:
   bool hasFP(const MachineFunction &MF) const override;
diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
--- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
@@ -30,12 +30,6 @@
                       ST.getMaxNumSGPRs(MF) / 4);
 }
 
-static ArrayRef<MCPhysReg> getAllSGPRs(const GCNSubtarget &ST,
-                                       const MachineFunction &MF) {
-  return makeArrayRef(AMDGPU::SGPR_32RegClass.begin(),
-                      ST.getMaxNumSGPRs(MF));
-}
-
 // Find a scratch register that we can use at the start of the prologue to
 // re-align the stack pointer. We avoid using callee-save registers since they
 // may appear to be free when this is called from canUseAsPrologue (during
@@ -263,7 +257,7 @@
 
 // Shift down registers reserved for the scratch RSRC.
 Register SIFrameLowering::getEntryFunctionReservedScratchRsrcReg(
-    MachineFunction &MF) const {
+    MachineFunction &MF, Register ScratchWaveOffsetReg) const {
 
   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
   const SIInstrInfo *TII = ST.getInstrInfo();
@@ -292,18 +286,31 @@
   // cannot do this for the resources required for scratch access. For now we
   // skip over user SGPRs and may leave unused holes.
 
-  // We find the resource first because it has an alignment requirement.
-
-  unsigned NumPreloaded = (MFI->getNumPreloadedSGPRs() + 3) / 4;
+  unsigned NumPreloadedSGPRs = MFI->getNumPreloadedSGPRs();
+  // FIXME: This is just lifted from AMDGPUAsmPrinter, because I'm not
+  // sure where/if we track InReg SGPR arguments otherwise.
+  for (auto &Arg : MF.getFunction().args()) {
+    unsigned NumRegs = (Arg.getType()->getPrimitiveSizeInBits() + 31) / 32;
+    if (Arg.hasAttribute(Attribute::InReg)) {
+      NumPreloadedSGPRs += NumRegs;
+    }
+  }
+
   ArrayRef<MCPhysReg> AllSGPR128s = getAllSGPR128(ST, MF);
-  AllSGPR128s = AllSGPR128s.slice(std::min(static_cast<unsigned>(AllSGPR128s.size()), NumPreloaded));
+  AllSGPR128s = AllSGPR128s.slice(std::min(
+      static_cast<unsigned>(AllSGPR128s.size()), (NumPreloadedSGPRs + 3) / 4));
 
   // Skip the last N reserved elements because they should have already been
   // reserved for VCC etc.
   for (MCPhysReg Reg : AllSGPR128s) {
     // Pick the first unallocated one. Make sure we don't clobber the other
     // reserved input we needed.
-    if (!MRI.isPhysRegUsed(Reg) && MRI.isAllocatable(Reg)) {
+    //
+    // FIXME: The preloaded SGPR count doesn't seem to be completely accurate,
+    // SITargetLowering::allocateSystemSGPRs just picks the next free SGPR for
+    // the scratch wave offset. To work around this we ask the caller for the
+    // scratch wave offset and explicitly avoid it.
+    if (!MRI.isPhysRegUsed(Reg) && MRI.isAllocatable(Reg) &&
+        !TRI->isSubRegisterEq(Reg, ScratchWaveOffsetReg)) {
       MRI.replaceRegWith(ScratchRsrcReg, Reg);
       MFI->setScratchRSrcReg(Reg);
       return Reg;
@@ -313,76 +320,6 @@
   return ScratchRsrcReg;
 }
 
-// Shift down registers reserved for the scratch wave offset.
-Register SIFrameLowering::getEntryFunctionReservedScratchWaveOffsetReg(
-    MachineFunction &MF) const {
-
-  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
-  const SIInstrInfo *TII = ST.getInstrInfo();
-  const SIRegisterInfo *TRI = &TII->getRegisterInfo();
-  MachineRegisterInfo &MRI = MF.getRegInfo();
-  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
-
-  assert(MFI->isEntryFunction());
-
-  Register ScratchWaveOffsetReg = MFI->getScratchWaveOffsetReg();
-
-  if (ScratchWaveOffsetReg == AMDGPU::NoRegister ||
-      (!MRI.isPhysRegUsed(ScratchWaveOffsetReg) && !hasFP(MF) &&
-       !MFI->hasFlatScratchInit())) {
-    assert(!hasFP(MF) && !MFI->hasFlatScratchInit());
-    return AMDGPU::NoRegister;
-  }
-
-  if (ST.hasSGPRInitBug() ||
-      ScratchWaveOffsetReg != TRI->reservedPrivateSegmentWaveByteOffsetReg(MF))
-    return ScratchWaveOffsetReg;
-
-  unsigned NumPreloaded = MFI->getNumPreloadedSGPRs();
-
-  ArrayRef<MCPhysReg> AllSGPRs = getAllSGPRs(ST, MF);
-  if (NumPreloaded > AllSGPRs.size())
-    return ScratchWaveOffsetReg;
-
-  AllSGPRs = AllSGPRs.slice(NumPreloaded);
-
-  // We need to drop register from the end of the list that we cannot use
-  // for the scratch wave offset.
-  // + 2 s102 and s103 do not exist on VI.
-  // + 2 for vcc
-  // + 2 for xnack_mask
-  // + 2 for flat_scratch
-  // + 4 for registers reserved for scratch resource register
-  // + 1 for register reserved for scratch wave offset.  (By exluding this
-  //     register from the list to consider, it means that when this
-  //     register is being used for the scratch wave offset and there
-  //     are no other free SGPRs, then the value will stay in this register.
-  // + 1 if stack pointer is used.
-  // ----
-  //  13 (+1)
-  unsigned ReservedRegCount = 13;
-
-  if (AllSGPRs.size() < ReservedRegCount)
-    return ScratchWaveOffsetReg;
-
-  for (MCPhysReg Reg : AllSGPRs.drop_back(ReservedRegCount)) {
-    // Pick the first unallocated SGPR. Be careful not to pick an alias of the
-    // scratch descriptor, since we haven't added its uses yet.
-    if (!MRI.isPhysRegUsed(Reg) && MRI.isAllocatable(Reg)) {
-      MRI.replaceRegWith(ScratchWaveOffsetReg, Reg);
-      if (MFI->getScratchWaveOffsetReg() == MFI->getStackPtrOffsetReg()) {
-        assert(!hasFP(MF));
-        MFI->setStackPtrOffsetReg(Reg);
-      }
-      MFI->setScratchWaveOffsetReg(Reg);
-      MFI->setFrameOffsetReg(Reg);
-      return Reg;
-    }
-  }
-
-  return ScratchWaveOffsetReg;
-}
-
 void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF,
                                                 MachineBasicBlock &MBB) const {
   assert(&MF.front() == &MBB && "Shrink-wrapping not yet supported");
@@ -401,128 +338,80 @@
   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
   const SIInstrInfo *TII = ST.getInstrInfo();
-  const SIRegisterInfo *TRI = &TII->getRegisterInfo();
   MachineRegisterInfo &MRI = MF.getRegInfo();
   const Function &F = MF.getFunction();
 
   assert(MFI->isEntryFunction());
 
-  // We need to do the replacement of the private segment buffer and wave offset
-  // register even if there are no stack objects. There could be stores to undef
-  // or a constant without an associated object.
-  //
-  // These calls will return `AMDGPU::NoRegister` in cases where there are no
-  // actual uses of the respective registers.
-  Register ScratchRsrcReg = getEntryFunctionReservedScratchRsrcReg(MF);
-  Register ScratchWaveOffsetReg =
-      getEntryFunctionReservedScratchWaveOffsetReg(MF);
-
-  // Make the selected registers live throughout the function.
-  for (MachineBasicBlock &OtherBB : MF) {
-    if (&OtherBB == &MBB)
-      continue;
-
-    if (ScratchWaveOffsetReg != AMDGPU::NoRegister)
-      OtherBB.addLiveIn(ScratchWaveOffsetReg);
-
-    if (ScratchRsrcReg != AMDGPU::NoRegister)
-      OtherBB.addLiveIn(ScratchRsrcReg);
-  }
-
-  // Now that we have fixed the reserved registers we need to locate the
-  // (potentially) preloaded registers. We should always have a preloaded
-  // scratch wave offset register, but we only have a preloaded scratch rsrc
-  // register for HSA.
-  Register PreloadedScratchWaveOffsetReg = MFI->getPreloadedReg(
+  Register ScratchWaveOffsetReg = MFI->getPreloadedReg(
       AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
   // FIXME: Hack to not crash in situations which emitted an error.
-  if (PreloadedScratchWaveOffsetReg == AMDGPU::NoRegister)
+  if (ScratchWaveOffsetReg == AMDGPU::NoRegister)
     return;
 
-  // We added live-ins during argument lowering, but since they were not used
-  // they were deleted. We're adding the uses now, so add them back.
-  MRI.addLiveIn(PreloadedScratchWaveOffsetReg);
-  MBB.addLiveIn(PreloadedScratchWaveOffsetReg);
-
+  // We need to do the replacement of the private segment buffer register even
+  // if there are no stack objects. There could be stores to undef or a
+  // constant without an associated object.
+  //
+  // This will return `AMDGPU::NoRegister` in cases where there are no actual
+  // uses of the SRSRC.
+  Register ScratchRsrcReg =
+      getEntryFunctionReservedScratchRsrcReg(MF, ScratchWaveOffsetReg);
+
+  // Make the selected register live throughout the function.
+  if (ScratchRsrcReg != AMDGPU::NoRegister)
+    for (MachineBasicBlock &OtherBB : MF)
+      if (&OtherBB != &MBB)
+        OtherBB.addLiveIn(ScratchRsrcReg);
+
+  // Now that we have fixed the reserved SRSRC we need to locate the
+  // (potentially) preloaded SRSRC.
   Register PreloadedScratchRsrcReg = AMDGPU::NoRegister;
   if (ST.isAmdHsaOrMesa(F)) {
     PreloadedScratchRsrcReg =
         MFI->getPreloadedReg(AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_BUFFER);
     if (ScratchRsrcReg != AMDGPU::NoRegister &&
         PreloadedScratchRsrcReg != AMDGPU::NoRegister) {
+      // We added live-ins during argument lowering, but since they were not
+      // used they were deleted. We're adding the uses now, so add them back.
       MRI.addLiveIn(PreloadedScratchRsrcReg);
       MBB.addLiveIn(PreloadedScratchRsrcReg);
     }
   }
 
+  // Debug location must be unknown since the first debug location is used to
+  // determine the end of the prologue.
   DebugLoc DL;
   MachineBasicBlock::iterator I = MBB.begin();
 
-  const bool HasFP = hasFP(MF);
-
-  // If we are not HSA or we happened to reserved the original input registers,
-  // we don't need to copy to the reserved registers.
-  const bool CopyBuffer = ST.isAmdHsaOrMesa(F) &&
-                          ScratchRsrcReg != AMDGPU::NoRegister &&
-                          PreloadedScratchRsrcReg != AMDGPU::NoRegister &&
-                          ScratchRsrcReg != PreloadedScratchRsrcReg;
-
-  // This needs to be careful of the copying order to avoid overwriting one of
-  // the input registers before it's been copied to it's final
-  // destination. Usually the offset should be copied first.
-  const bool CopyBufferFirst =
-      TRI->isSubRegisterEq(PreloadedScratchRsrcReg, ScratchWaveOffsetReg);
-
-  if (CopyBuffer && CopyBufferFirst) {
-    BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), ScratchRsrcReg)
-        .addReg(PreloadedScratchRsrcReg, RegState::Kill);
+  if (MF.getFrameInfo().hasCalls()) {
+    unsigned SPReg = MFI->getStackPtrOffsetReg();
+    assert(SPReg != AMDGPU::SP_REG);
+    BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), SPReg)
+        .addImm(MF.getFrameInfo().getStackSize() * ST.getWavefrontSize());
   }
 
-  if (ScratchWaveOffsetReg != AMDGPU::NoRegister) {
-    BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), ScratchWaveOffsetReg)
-        .addReg(PreloadedScratchWaveOffsetReg, HasFP ? RegState::Kill : 0);
+  if (MFI->hasFlatScratchInit() || ScratchRsrcReg != AMDGPU::NoRegister) {
+    MRI.addLiveIn(ScratchWaveOffsetReg);
+    MBB.addLiveIn(ScratchWaveOffsetReg);
   }
 
-  if (CopyBuffer && !CopyBufferFirst) {
-    BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), ScratchRsrcReg)
-        .addReg(PreloadedScratchRsrcReg, RegState::Kill);
+  if (MFI->hasFlatScratchInit()) {
+    emitEntryFunctionFlatScratchInit(MF, MBB, I, DL, ScratchWaveOffsetReg);
   }
 
-  // FIXME: This should also implement the setup path for HSA.
   if (ScratchRsrcReg != AMDGPU::NoRegister) {
-    emitEntryFunctionScratchRsrcRegSetup(
-        MF, MBB, I, DL, PreloadedScratchRsrcReg, ScratchRsrcReg);
-  }
-
-  if (HasFP) {
-    const MachineFrameInfo &FrameInfo = MF.getFrameInfo();
-    int64_t StackSize = FrameInfo.getStackSize();
-
-    Register SPReg = MFI->getStackPtrOffsetReg();
-    assert(SPReg != AMDGPU::SP_REG);
-
-    // On kernel entry, the private scratch wave offset is the SP value.
-    if (StackSize == 0) {
-      BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), SPReg)
-          .addReg(MFI->getScratchWaveOffsetReg());
-    } else {
-      BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), SPReg)
-          .addReg(MFI->getScratchWaveOffsetReg())
-          .addImm(StackSize * ST.getWavefrontSize());
-    }
-  }
-
-  if (MFI->hasFlatScratchInit()) {
-    emitEntryFunctionFlatScratchInit(MF, MBB, I, DL,
-                                     MFI->getScratchWaveOffsetReg());
+    emitEntryFunctionScratchRsrcRegSetup(MF, MBB, I, DL,
+                                         PreloadedScratchRsrcReg,
+                                         ScratchRsrcReg, ScratchWaveOffsetReg);
   }
 }
 
 // Emit scratch RSRC setup code, assuming `ScratchRsrcReg != AMDGPU::NoRegister`
 void SIFrameLowering::emitEntryFunctionScratchRsrcRegSetup(
     MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
     const DebugLoc &DL, Register PreloadedScratchRsrcReg,
-    Register ScratchRsrcReg) const {
+    Register ScratchRsrcReg, Register ScratchWaveOffsetReg) const {
 
   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
   const SIInstrInfo *TII = ST.getInstrInfo();
@@ -646,7 +535,34 @@
     BuildMI(MBB, I, DL, SMovB32, Rsrc3)
         .addImm(Rsrc23 >> 32)
         .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
+  } else if (ST.isAmdHsaOrMesa(Fn)) {
+    assert(PreloadedScratchRsrcReg != AMDGPU::NoRegister);
+
+    if (ScratchRsrcReg != PreloadedScratchRsrcReg)
+      BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), ScratchRsrcReg)
+          .addReg(PreloadedScratchRsrcReg, RegState::Kill);
   }
+
+  // Add the scratch wave offset into the scratch RSRC.
+  //
+  // We only want to update the first 48 bits, which is the base address
+  // pointer, without touching the adjacent 16 bits of flags. We know this add
+  // cannot carry-out from bit 47, otherwise the scratch allocation would be
+  // impossible to fit in the 48-bit global address space.
+  //
+  // TODO: Evaluate if it is better to just construct an SRD using the flat
+  // scratch init and some constants rather than update the one we are passed.
+  Register ScratchRsrcSub0 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0);
+  Register ScratchRsrcSub1 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub1);
+
+  BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), ScratchRsrcSub0)
+      .addReg(ScratchRsrcSub0)
+      .addReg(ScratchWaveOffsetReg, RegState::Kill)
+      .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
+  BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADDC_U32), ScratchRsrcSub1)
+      .addReg(ScratchRsrcSub1)
+      .addImm(0)
+      .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
 }
 
 bool SIFrameLowering::isSupportedStackID(TargetStackID::Value ID) const {
@@ -1112,19 +1028,22 @@
 bool SIFrameLowering::hasFP(const MachineFunction &MF) const {
   const MachineFrameInfo &MFI = MF.getFrameInfo();
+
+  if (MF.getInfo<SIMachineFunctionInfo>()->isEntryFunction()) {
+    // In an entry function we can always use immediate offsets.
+    // FIXME: Do we need/want to respect DisableFramePointerElim here? It isn't
+    // possible to unwind out of the entry function anyway, so the option
+    // doesn't seem useful in kernels.
+    return false;
+  }
+
   if (MFI.hasCalls()) {
     // All offsets are unsigned, so need to be addressed in the same direction
     // as stack growth.
 
     // FIXME: This function is pretty broken, since it can be called before the
     // frame layout is determined or CSR spills are inserted.
-    if (MFI.getStackSize() != 0)
-      return true;
-
-    // For the entry point, the input wave scratch offset must be copied to the
-    // API SP if there are calls.
-    if (MF.getInfo<SIMachineFunctionInfo>()->isEntryFunction())
-      return true;
+    return MFI.getStackSize() != 0;
   }
 
   return MFI.hasVarSizedObjects() || MFI.isFrameAddressTaken() ||
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -1898,10 +1898,9 @@
     Info.setScratchRSrcReg(ReservedBufferReg);
   }
 
-  // hasFP should be accurate for kernels even before the frame is finalized.
-  if (ST.getFrameLowering()->hasFP(MF)) {
-    MachineRegisterInfo &MRI = MF.getRegInfo();
+  MachineRegisterInfo &MRI = MF.getRegInfo();
 
+  if (MFI.hasCalls()) {
     // Try to use s32 as the SP, but move it if it would interfere with input
     // arguments. This won't work with calls though.
     //
@@ -1925,40 +1924,6 @@
     if (Info.getStackPtrOffsetReg() == AMDGPU::SP_REG)
       report_fatal_error("failed to find register for SP");
   }
-
-  if (MFI.hasCalls()) {
-    Info.setScratchWaveOffsetReg(AMDGPU::SGPR33);
-    Info.setFrameOffsetReg(AMDGPU::SGPR33);
-  } else {
-    unsigned ReservedOffsetReg =
-        TRI.reservedPrivateSegmentWaveByteOffsetReg(MF);
-    Info.setScratchWaveOffsetReg(ReservedOffsetReg);
-    Info.setFrameOffsetReg(ReservedOffsetReg);
-  }
-  } else if (RequiresStackAccess) {
-    assert(!MFI.hasCalls());
-    // We know there are accesses and they will be done relative to SP, so just
-    // pin it to the input.
-    //
-    // FIXME: Should not do this if inline asm is reading/writing these
-    // registers.
-    Register PreloadedSP = Info.getPreloadedReg(
-        AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
-
-    Info.setStackPtrOffsetReg(PreloadedSP);
-    Info.setScratchWaveOffsetReg(PreloadedSP);
-    Info.setFrameOffsetReg(PreloadedSP);
-  } else {
-    assert(!MFI.hasCalls());
-
-    // There may not be stack access at all. There may still be spills, or
-    // access of a constant pointer (in which cases an extra copy will be
-    // emitted in the prolog).
-    unsigned ReservedOffsetReg
-      = TRI.reservedPrivateSegmentWaveByteOffsetReg(MF);
-    Info.setStackPtrOffsetReg(ReservedOffsetReg);
-    Info.setScratchWaveOffsetReg(ReservedOffsetReg);
-    Info.setFrameOffsetReg(ReservedOffsetReg);
-  }
 }
 
@@ -2213,8 +2178,6 @@
     allocateSystemSGPRs(CCInfo, MF, *Info, CallConv, IsShader);
   } else {
     CCInfo.AllocateReg(Info->getScratchRSrcReg());
-    CCInfo.AllocateReg(Info->getScratchWaveOffsetReg());
-    CCInfo.AllocateReg(Info->getFrameOffsetReg());
     allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info);
   }
 
@@ -10612,11 +10575,6 @@
   if (Info->getFrameOffsetReg() != AMDGPU::FP_REG)
     MRI.replaceRegWith(AMDGPU::FP_REG, Info->getFrameOffsetReg());
 
-  if (Info->getScratchWaveOffsetReg() != AMDGPU::SCRATCH_WAVE_OFFSET_REG) {
-    MRI.replaceRegWith(AMDGPU::SCRATCH_WAVE_OFFSET_REG,
-                       Info->getScratchWaveOffsetReg());
-  }
-
   Info->limitOccupancy(MF);
 
   if (ST.isWave32() && !MF.empty()) {
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -1156,15 +1156,19 @@
       MRI.constrainRegClass(SrcReg, &AMDGPU::SReg_32_XM0RegClass);
     }
 
-    BuildMI(MBB, MI, DL, OpDesc)
+    auto MIB = BuildMI(MBB, MI, DL, OpDesc)
       .addReg(SrcReg, getKillRegState(isKill)) // data
      .addFrameIndex(FrameIndex)               // addr
      .addMemOperand(MMO)
-      .addReg(MFI->getScratchRSrcReg(), RegState::Implicit)
-      .addReg(MFI->getStackPtrOffsetReg(), RegState::Implicit);
+      .addReg(MFI->getScratchRSrcReg(), RegState::Implicit);
     // Add the scratch resource registers as implicit uses because we may end up
     // needing them, and need to ensure that the reserved registers are
     // correctly handled.
+
+    // Also add the stack pointer if we have one, for the same reason.
+    if (!MFI->isEntryFunction())
+      MIB.addReg(MFI->getStackPtrOffsetReg(), RegState::Implicit);
+
     if (RI.spillSGPRToVGPR())
       FrameInfo.setStackID(FrameIndex, TargetStackID::SGPRSpill);
     return;
@@ -1181,11 +1185,15 @@
     MIB.addReg(Tmp, RegState::Define);
   }
   MIB.addReg(SrcReg, getKillRegState(isKill)) // data
-     .addFrameIndex(FrameIndex)               // addr
-     .addReg(MFI->getScratchRSrcReg())        // scratch_rsrc
-     .addReg(MFI->getStackPtrOffsetReg())     // scratch_offset
-     .addImm(0)                               // offset
-     .addMemOperand(MMO);
+     .addFrameIndex(FrameIndex)               // addr
+     .addReg(MFI->getScratchRSrcReg());       // scratch_rsrc
+  if (MFI->isEntryFunction()) {
+    MIB.addImm(0); // scratch_offset
+  } else {
+    MIB.addReg(MFI->getStackPtrOffsetReg()); // scratch_offset
+  }
+  MIB.addImm(0) // offset
+     .addMemOperand(MMO);
 }
 
 static unsigned getSGPRSpillRestoreOpcode(unsigned Size) {
@@ -1284,11 +1292,12 @@
     if (RI.spillSGPRToVGPR())
       FrameInfo.setStackID(FrameIndex, TargetStackID::SGPRSpill);
 
-    BuildMI(MBB, MI, DL, OpDesc, DestReg)
+    auto MIB = BuildMI(MBB, MI, DL, OpDesc, DestReg)
       .addFrameIndex(FrameIndex) // addr
      .addMemOperand(MMO)
-      .addReg(MFI->getScratchRSrcReg(), RegState::Implicit)
-      .addReg(MFI->getStackPtrOffsetReg(), RegState::Implicit);
+      .addReg(MFI->getScratchRSrcReg(), RegState::Implicit);
+
+    if (!MFI->isEntryFunction())
+      MIB.addReg(MFI->getStackPtrOffsetReg(), RegState::Implicit);
 
     return;
   }
@@ -1300,11 +1309,15 @@
     Register Tmp = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
     MIB.addReg(Tmp, RegState::Define);
   }
-  MIB.addFrameIndex(FrameIndex)           // vaddr
-     .addReg(MFI->getScratchRSrcReg())    // scratch_rsrc
-     .addReg(MFI->getStackPtrOffsetReg()) // scratch_offset
-     .addImm(0)                           // offset
-     .addMemOperand(MMO);
+  MIB.addFrameIndex(FrameIndex)         // vaddr
+     .addReg(MFI->getScratchRSrcReg()); // scratch_rsrc
+  if (MFI->isEntryFunction()) {
+    MIB.addImm(0); // scratch_offset
+  } else {
+    MIB.addReg(MFI->getStackPtrOffsetReg()); // scratch_offset
+  }
+  MIB.addImm(0) // offset
+     .addMemOperand(MMO);
 }
 
 /// \param @Offset Offset in bytes of the FrameIndex being spilled
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -636,7 +636,7 @@
   def _SAVE : VPseudoInstSI <
     (outs),
     (ins vgpr_class:$vdata, i32imm:$vaddr, SReg_128:$srsrc,
-         SReg_32:$soffset, i32imm:$offset)> {
+         type2:$soffset, i32imm:$offset)> {
     let mayStore = 1;
     let mayLoad = 0;
     // (2 * 4) + (8 * num_subregs) bytes maximum
@@ -647,7 +647,7 @@
 
   def _RESTORE : VPseudoInstSI <
     (outs vgpr_class:$vdata),
-    (ins i32imm:$vaddr, SReg_128:$srsrc, SReg_32:$soffset,
+    (ins i32imm:$vaddr, SReg_128:$srsrc, type2:$soffset,
          i32imm:$offset)> {
     let mayStore = 0;
     let mayLoad = 1;
@@ -676,7 +676,7 @@
   def _SAVE : VPseudoInstSI <
     (outs VGPR_32:$tmp),
     (ins vgpr_class:$vdata, i32imm:$vaddr, SReg_128:$srsrc,
-         SReg_32:$soffset, i32imm:$offset)> {
+         type2:$soffset, i32imm:$offset)> {
     let mayStore = 1;
     let mayLoad = 0;
     // (2 * 4) + (16 * num_subregs) bytes maximum
@@ -687,7 +687,7 @@
 
   def _RESTORE : VPseudoInstSI <
     (outs vgpr_class:$vdata, VGPR_32:$tmp),
-    (ins i32imm:$vaddr, SReg_128:$srsrc, SReg_32:$soffset,
+    (ins i32imm:$vaddr, SReg_128:$srsrc, type2:$soffset,
          i32imm:$offset)> {
     let mayStore = 0;
     let mayLoad = 1;
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
@@ -284,7 +284,6 @@
   uint32_t HighBitsOf32BitAddress = 0;
 
   StringValue ScratchRSrcReg = "$private_rsrc_reg";
-  StringValue ScratchWaveOffsetReg = "$scratch_wave_offset_reg";
   StringValue FrameOffsetReg = "$fp_reg";
   StringValue StackPtrOffsetReg = "$sp_reg";
 
@@ -311,8 +310,6 @@
     YamlIO.mapOptional("waveLimiter", MFI.WaveLimiter, false);
     YamlIO.mapOptional("scratchRSrcReg", MFI.ScratchRSrcReg,
                        StringValue("$private_rsrc_reg"));
-    YamlIO.mapOptional("scratchWaveOffsetReg", MFI.ScratchWaveOffsetReg,
-                       StringValue("$scratch_wave_offset_reg"));
     YamlIO.mapOptional("frameOffsetReg", MFI.FrameOffsetReg,
                        StringValue("$fp_reg"));
     YamlIO.mapOptional("stackPtrOffsetReg", MFI.StackPtrOffsetReg,
@@ -336,14 +333,15 @@
   // Registers that may be reserved for spilling purposes. These may be the same
   // as the input registers.
   unsigned ScratchRSrcReg = AMDGPU::PRIVATE_RSRC_REG;
-  unsigned ScratchWaveOffsetReg = AMDGPU::SCRATCH_WAVE_OFFSET_REG;
 
-  // This is the current function's incremented size from the kernel's scratch
-  // wave offset register. For an entry function, this is exactly the same as
-  // the ScratchWaveOffsetReg.
+  // This is the unswizzled offset from the current dispatch's scratch wave
+  // base to the beginning of the current function's frame. For an entry
+  // function, this is 0.
   unsigned FrameOffsetReg = AMDGPU::FP_REG;
 
-  // Top of the stack SGPR offset derived from the ScratchWaveOffsetReg.
+  // This is an ABI register used in the non-entry calling convention to
+  // communicate the unswizzled offset from the current dispatch's scratch wave
+  // base to the beginning of the new function's frame.
   unsigned StackPtrOffsetReg = AMDGPU::SP_REG;
 
   AMDGPUFunctionArgInfo ArgInfo;
@@ -713,10 +711,6 @@
     ScratchRSrcReg = Reg;
   }
 
-  unsigned getScratchWaveOffsetReg() const {
-    return ScratchWaveOffsetReg;
-  }
-
   unsigned getFrameOffsetReg() const {
     return FrameOffsetReg;
   }
@@ -739,11 +733,6 @@
     return StackPtrOffsetReg;
   }
 
-  void setScratchWaveOffsetReg(unsigned Reg) {
-    assert(Reg != 0 && "Should never be unset");
-    ScratchWaveOffsetReg = Reg;
-  }
-
   unsigned getQueuePtrUserSGPR() const {
     return ArgInfo.QueuePtr.getRegister();
   }
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -68,7 +68,6 @@
     // Non-entry functions have no special inputs for now, other registers
     // required for scratch access.
     ScratchRSrcReg = AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3;
-    ScratchWaveOffsetReg = AMDGPU::SGPR33;
 
     // TODO: Pick a high register, and shift down, similar to a kernel.
     FrameOffsetReg = AMDGPU::SGPR34;
@@ -76,8 +75,6 @@
 
     ArgInfo.PrivateSegmentBuffer =
       ArgDescriptor::createRegister(ScratchRSrcReg);
-    ArgInfo.PrivateSegmentWaveByteOffset =
-      ArgDescriptor::createRegister(ScratchWaveOffsetReg);
 
     if (F.hasFnAttribute("amdgpu-implicitarg-ptr"))
       ImplicitArgPtr = true;
@@ -487,7 +484,6 @@
       WaveLimiter(MFI.needsWaveLimiter()),
       HighBitsOf32BitAddress(MFI.get32BitAddressHighBits()),
       ScratchRSrcReg(regToString(MFI.getScratchRSrcReg(), TRI)),
-      ScratchWaveOffsetReg(regToString(MFI.getScratchWaveOffsetReg(), TRI)),
       FrameOffsetReg(regToString(MFI.getFrameOffsetReg(), TRI)),
       StackPtrOffsetReg(regToString(MFI.getStackPtrOffsetReg(), TRI)),
       ArgInfo(convertArgumentInfo(MFI.getArgInfo(), TRI)),
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
@@ -50,11 +50,6 @@
   /// spilling is needed.
   unsigned reservedPrivateSegmentBufferReg(const MachineFunction &MF) const;
 
-  /// Return the end register initially reserved for the scratch wave offset in
-  /// case spilling is needed.
-  unsigned reservedPrivateSegmentWaveByteOffsetReg(
-      const MachineFunction &MF) const;
-
   BitVector getReservedRegs(const MachineFunction &MF) const override;
   const MCPhysReg *getCalleeSavedRegs(const MachineFunction *MF) const override;
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -91,6 +91,8 @@
   const SIFrameLowering *TFI = MF.getSubtarget<GCNSubtarget>().getFrameLowering();
   const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
+  if (FuncInfo->isEntryFunction())
+    return AMDGPU::NoRegister;
   return TFI->hasFP(MF) ? FuncInfo->getFrameOffsetReg()
                         : FuncInfo->getStackPtrOffsetReg();
 }
@@ -177,29 +179,6 @@
   return getMatchingSuperReg(BaseReg, AMDGPU::sub0, &AMDGPU::SGPR_128RegClass);
 }
 
-static unsigned findPrivateSegmentWaveByteOffsetRegIndex(unsigned RegCount) {
-  unsigned Reg;
-
-  // Try to place it in a hole after PrivateSegmentBufferReg.
-  if (RegCount & 3) {
-    // We cannot put the segment buffer in (Idx - 4) ... (Idx - 1) due to
-    // alignment constraints, so we have a hole where can put the wave offset.
-    Reg = RegCount - 1;
-  } else {
-    // We can put the segment buffer in (Idx - 4) ... (Idx - 1) and put the
-    // wave offset before it.
- Reg = RegCount - 5; - } - - return Reg; -} - -unsigned SIRegisterInfo::reservedPrivateSegmentWaveByteOffsetReg( - const MachineFunction &MF) const { - unsigned Reg = findPrivateSegmentWaveByteOffsetRegIndex(ST.getMaxNumSGPRs(MF)); - return AMDGPU::SGPR_32RegClass.getRegister(Reg); -} - BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const { BitVector Reserved(getNumRegs()); @@ -279,19 +258,12 @@ const SIMachineFunctionInfo *MFI = MF.getInfo(); - unsigned ScratchWaveOffsetReg = MFI->getScratchWaveOffsetReg(); - if (ScratchWaveOffsetReg != AMDGPU::NoRegister) { - // Reserve 1 SGPR for scratch wave offset in case we need to spill. - reserveRegisterTuples(Reserved, ScratchWaveOffsetReg); - } - unsigned ScratchRSrcReg = MFI->getScratchRSrcReg(); if (ScratchRSrcReg != AMDGPU::NoRegister) { // Reserve 4 SGPRs for the scratch buffer resource descriptor in case we need // to spill. // TODO: May need to reserve a VGPR if doing LDS spilling. reserveRegisterTuples(Reserved, ScratchRSrcReg); - assert(!isSubRegister(ScratchRSrcReg, ScratchWaveOffsetReg)); } // We have to assume the SP is needed in case there are calls in the function, @@ -454,12 +426,16 @@ #ifndef NDEBUG MachineBasicBlock *MBB = MI.getParent(); MachineFunction *MF = MBB->getParent(); + auto &SOffset = *TII->getNamedOperand(MI, AMDGPU::OpName::soffset); #endif assert(FIOp && FIOp->isFI() && "frame index must be address operand"); assert(TII->isMUBUF(MI)); - assert(TII->getNamedOperand(MI, AMDGPU::OpName::soffset)->getReg() == - MF->getInfo()->getStackPtrOffsetReg() && - "should only be seeing stack pointer offset relative FrameIndex"); + assert((SOffset.isReg() && + (SOffset.getReg() == + MF->getInfo()->getStackPtrOffsetReg())) || + (SOffset.isImm() && SOffset.getImm() == 0) && + "should only be seeing stack pointer or 0 offset relative " + "FrameIndex"); MachineOperand *OffsetOp = TII->getNamedOperand(MI, AMDGPU::OpName::offset); int64_t NewOffset = OffsetOp->getImm() + Offset; @@ -722,6 +698,9 @@ SOffset = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, MI, 0, false); if (SOffset == AMDGPU::NoRegister) { + if (ScratchOffsetReg == AMDGPU::NoRegister) { + report_fatal_error("could not scavenge SGPR to spill in entry function"); + } // There are no free SGPRs, and since we are in the process of spilling // VGPRs too. 
Since we need a VGPR in order to spill SGPRs (this is true // on SI/CI and on VI it is true until we implement spilling using scalar @@ -735,9 +714,14 @@ Scavenged = true; } - BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_U32), SOffset) - .addReg(ScratchOffsetReg) - .addImm(Offset); + if (ScratchOffsetReg == AMDGPU::NoRegister) { + BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), SOffset) + .addImm(Offset); + } else { + BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_U32), SOffset) + .addReg(ScratchOffsetReg) + .addImm(Offset); + } Offset = 0; } @@ -772,16 +756,21 @@ EltSize, MinAlign(Align, EltSize * i)); MIB = BuildMI(*MBB, MI, DL, Desc) - .addReg(SubReg, getDefRegState(!IsStore) | getKillRegState(IsKill)) - .addReg(ScratchRsrcReg) - .addReg(SOffset, SOffsetRegState) - .addImm(Offset) - .addImm(0) // glc - .addImm(0) // slc - .addImm(0) // tfe - .addImm(0) // dlc - .addImm(0) // swz - .addMemOperand(NewMMO); + .addReg(SubReg, + getDefRegState(!IsStore) | getKillRegState(IsKill)) + .addReg(ScratchRsrcReg); + if (SOffset == AMDGPU::NoRegister) { + MIB.addImm(0); + } else { + MIB.addReg(SOffset, SOffsetRegState); + } + MIB.addImm(Offset) + .addImm(0) // glc + .addImm(0) // slc + .addImm(0) // tfe + .addImm(0) // dlc + .addImm(0) // swz + .addMemOperand(NewMMO); if (!IsStore && TmpReg != AMDGPU::NoRegister) MIB = BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_ACCVGPR_WRITE_B32), @@ -825,8 +814,7 @@ MachineFrameInfo &FrameInfo = MF->getFrameInfo(); assert(SpillToVGPR || (SuperReg != MFI->getStackPtrOffsetReg() && - SuperReg != MFI->getFrameOffsetReg() && - SuperReg != MFI->getScratchWaveOffsetReg())); + SuperReg != MFI->getFrameOffsetReg())); assert(SuperReg != AMDGPU::M0 && "m0 should never spill"); @@ -897,13 +885,17 @@ MachineMemOperand *MMO = MF->getMachineMemOperand(PtrInfo, MachineMemOperand::MOStore, EltSize, MinAlign(Align, EltSize * i)); - BuildMI(*MBB, MI, DL, TII->get(AMDGPU::SI_SPILL_V32_SAVE)) - .addReg(TmpVGPR, RegState::Kill) // src - .addFrameIndex(Index) // vaddr - .addReg(MFI->getScratchRSrcReg()) // srrsrc - .addReg(MFI->getStackPtrOffsetReg()) // soffset - .addImm(i * 4) // offset - .addMemOperand(MMO); + auto MIB = BuildMI(*MBB, MI, DL, TII->get(AMDGPU::SI_SPILL_V32_SAVE)) + .addReg(TmpVGPR, RegState::Kill) // src + .addFrameIndex(Index) // vaddr + .addReg(MFI->getScratchRSrcReg()); // srrsrc + if (MFI->isEntryFunction()) { + MIB.addImm(0); // soffset + } else { + MIB.addReg(MFI->getStackPtrOffsetReg()); // soffset + } + MIB.addImm(i * 4) // offset + .addMemOperand(MMO); } } @@ -974,16 +966,20 @@ MachineMemOperand::MOLoad, EltSize, MinAlign(Align, EltSize * i)); - BuildMI(*MBB, MI, DL, TII->get(AMDGPU::SI_SPILL_V32_RESTORE), TmpVGPR) - .addFrameIndex(Index) // vaddr - .addReg(MFI->getScratchRSrcReg()) // srsrc - .addReg(MFI->getStackPtrOffsetReg()) // soffset - .addImm(i * 4) // offset - .addMemOperand(MMO); - auto MIB = - BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), SubReg) - .addReg(TmpVGPR, RegState::Kill); + BuildMI(*MBB, MI, DL, TII->get(AMDGPU::SI_SPILL_V32_RESTORE), TmpVGPR) + .addFrameIndex(Index) // vaddr + .addReg(MFI->getScratchRSrcReg()); // srsrc + if (MFI->isEntryFunction()) { + MIB.addImm(0); // soffset + } else { + MIB.addReg(MFI->getStackPtrOffsetReg()); // soffset + } + MIB.addImm(i * 4) // offset + .addMemOperand(MMO); + + MIB = BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), SubReg) + .addReg(TmpVGPR, RegState::Kill); if (NumSubRegs > 1) MIB.addReg(MI->getOperand(0).getReg(), RegState::ImplicitDefine); @@ -1085,8 +1081,11 @@ case 
AMDGPU::SI_SPILL_A32_SAVE: { const MachineOperand *VData = TII->getNamedOperand(*MI, AMDGPU::OpName::vdata); - assert(TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg() == - MFI->getStackPtrOffsetReg()); + assert(MFI->isEntryFunction() || + TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg() == + MFI->getStackPtrOffsetReg()); + assert(!MFI->isEntryFunction() || + TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getImm() == 0); buildSpillLoadStore(MI, AMDGPU::BUFFER_STORE_DWORD_OFFSET, Index, @@ -1115,8 +1114,11 @@ case AMDGPU::SI_SPILL_A1024_RESTORE: { const MachineOperand *VData = TII->getNamedOperand(*MI, AMDGPU::OpName::vdata); - assert(TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg() == - MFI->getStackPtrOffsetReg()); + assert(MFI->isEntryFunction() || + TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg() == + MFI->getStackPtrOffsetReg()); + assert(!MFI->isEntryFunction() || + TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getImm() == 0); buildSpillLoadStore(MI, AMDGPU::BUFFER_LOAD_DWORD_OFFSET, Index, @@ -1135,33 +1137,21 @@ bool IsMUBUF = TII->isMUBUF(*MI); if (!IsMUBUF && !MFI->isEntryFunction()) { - // Convert to an absolute stack address by finding the offset from the - // scratch wave base and scaling by the wave size. + // Convert to a swizzled stack address by scaling by the wave size. // - // In an entry function/kernel the offset is already the absolute - // address relative to the frame register. - - Register TmpDiffReg = - RS->scavengeRegister(&AMDGPU::SReg_32_XM0RegClass, MI, 0, false); - - // If there's no free SGPR, in-place modify the FP - Register DiffReg = TmpDiffReg.isValid() ? TmpDiffReg : FrameReg; + // In an entry function/kernel the offset is already swizzled. bool IsCopy = MI->getOpcode() == AMDGPU::V_MOV_B32_e32; - Register ResultReg = IsCopy ? - MI->getOperand(0).getReg() : - RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, 0); - - BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_SUB_U32), DiffReg) - .addReg(FrameReg) - .addReg(MFI->getScratchWaveOffsetReg()); + Register ResultReg = + IsCopy ? MI->getOperand(0).getReg() + : RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, 0); int64_t Offset = FrameInfo.getObjectOffset(Index); if (Offset == 0) { // XXX - This never happens because of emergency scavenging slot at 0? BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64), ResultReg) .addImm(ST.getWavefrontSizeLog2()) - .addReg(DiffReg); + .addReg(FrameReg); } else { if (auto MIB = TII->getAddNoCarry(*MBB, MI, DL, ResultReg, *RS)) { Register ScaledReg = @@ -1170,7 +1160,7 @@ BuildMI(*MBB, *MIB, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64), ScaledReg) .addImm(ST.getWavefrontSizeLog2()) - .addReg(DiffReg, RegState::Kill); + .addReg(FrameReg); const bool IsVOP2 = MIB->getOpcode() == AMDGPU::V_ADD_U32_e32; @@ -1207,10 +1197,10 @@ // unavailable. Only one additional mov is needed. Register TmpScaledReg = RS->scavengeRegister(&AMDGPU::SReg_32_XM0RegClass, MI, 0, false); - Register ScaledReg = TmpScaledReg.isValid() ? TmpScaledReg : DiffReg; + Register ScaledReg = TmpScaledReg.isValid() ? 
TmpScaledReg : FrameReg; BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_LSHR_B32), ScaledReg) - .addReg(DiffReg, RegState::Kill) + .addReg(FrameReg) .addImm(ST.getWavefrontSizeLog2()); BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_U32), ScaledReg) .addReg(ScaledReg, RegState::Kill) @@ -1224,19 +1214,12 @@ .addReg(ScaledReg, RegState::Kill) .addImm(Offset); BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_LSHL_B32), ScaledReg) - .addReg(DiffReg, RegState::Kill) + .addReg(FrameReg) .addImm(ST.getWavefrontSizeLog2()); } } } - if (!TmpDiffReg.isValid()) { - // Restore the FP. - BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_U32), FrameReg) - .addReg(FrameReg) - .addReg(MFI->getScratchWaveOffsetReg()); - } - // Don't introduce an extra copy if we're just materializing in a mov. if (IsCopy) MI->eraseFromParent(); @@ -1251,10 +1234,11 @@ AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::vaddr)); - assert(TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg() == - MFI->getStackPtrOffsetReg()); - - TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->setReg(FrameReg); + auto &SOffset = *TII->getNamedOperand(*MI, AMDGPU::OpName::soffset); + if (SOffset.isReg()) { + assert(SOffset.getReg() == MFI->getStackPtrOffsetReg()); + SOffset.setReg(FrameReg); + } int64_t Offset = FrameInfo.getObjectOffset(Index); int64_t OldImm diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td @@ -101,7 +101,6 @@ def PRIVATE_RSRC_REG : SIReg<"private_rsrc", 0>; def FP_REG : SIReg<"fp", 0>; def SP_REG : SIReg<"sp", 0>; -def SCRATCH_WAVE_OFFSET_REG : SIReg<"scratch_wave_offset", 0>; // Pseudo-register to represent the program-counter DWARF register. def PC_REG : SIReg<"pc", 0>, DwarfRegNum<[16]> { @@ -435,7 +434,7 @@ //===----------------------------------------------------------------------===// def Pseudo_SReg_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32, - (add FP_REG, SP_REG, SCRATCH_WAVE_OFFSET_REG)> { + (add FP_REG, SP_REG)> { let isAllocatable = 0; let CopyCost = -1; } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll @@ -165,7 +165,7 @@ ; CHECK-NEXT: s_cbranch_execz BB4_5 ; CHECK-NEXT: ; %bb.4: ; %bb11 ; CHECK-NEXT: v_mov_b32_e32 v0, 4.0 -; CHECK-NEXT: buffer_store_dword v0, v0, s[0:3], s33 offen +; CHECK-NEXT: buffer_store_dword v0, v0, s[0:3], 0 offen ; CHECK-NEXT: BB4_5: ; %Flow ; CHECK-NEXT: s_or_b64 exec, exec, s[4:5] ; CHECK-NEXT: BB4_6: ; %bb12 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-local.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-local.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-local.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-local.mir @@ -16,12 +16,6 @@ bb.0: liveins: $vgpr0 - ; GFX6-LABEL: name: load_local_s32_from_4 - ; GFX6: liveins: $vgpr0 - ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX6: $m0 = S_MOV_B32 -1 - ; GFX6: [[DS_READ_B32_:%[0-9]+]]:vgpr_32 = DS_READ_B32 [[COPY]], 0, 0, implicit $m0, implicit $exec :: (load 4, addrspace 3) - ; GFX6: $vgpr0 = COPY [[DS_READ_B32_]] ; GFX7-LABEL: name: load_local_s32_from_4 ; GFX7: liveins: $vgpr0 ; GFX7: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 @@ -33,6 +27,12 @@ ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = 
COPY $vgpr0 ; GFX9: [[DS_READ_B32_gfx9_:%[0-9]+]]:vgpr_32 = DS_READ_B32_gfx9 [[COPY]], 0, 0, implicit $exec :: (load 4, addrspace 3) ; GFX9: $vgpr0 = COPY [[DS_READ_B32_gfx9_]] + ; GFX6-LABEL: name: load_local_s32_from_4 + ; GFX6: liveins: $vgpr0 + ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX6: $m0 = S_MOV_B32 -1 + ; GFX6: [[DS_READ_B32_:%[0-9]+]]:vgpr_32 = DS_READ_B32 [[COPY]], 0, 0, implicit $m0, implicit $exec :: (load 4, addrspace 3) + ; GFX6: $vgpr0 = COPY [[DS_READ_B32_]] %0:vgpr(p3) = COPY $vgpr0 %1:vgpr(s32) = G_LOAD %0 :: (load 4, align 4, addrspace 3) $vgpr0 = COPY %1 @@ -50,12 +50,6 @@ bb.0: liveins: $vgpr0 - ; GFX6-LABEL: name: load_local_s32_from_2 - ; GFX6: liveins: $vgpr0 - ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX6: $m0 = S_MOV_B32 -1 - ; GFX6: [[DS_READ_U16_:%[0-9]+]]:vgpr_32 = DS_READ_U16 [[COPY]], 0, 0, implicit $m0, implicit $exec :: (load 2, addrspace 3) - ; GFX6: $vgpr0 = COPY [[DS_READ_U16_]] ; GFX7-LABEL: name: load_local_s32_from_2 ; GFX7: liveins: $vgpr0 ; GFX7: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 @@ -67,6 +61,12 @@ ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX9: [[DS_READ_U16_gfx9_:%[0-9]+]]:vgpr_32 = DS_READ_U16_gfx9 [[COPY]], 0, 0, implicit $exec :: (load 2, addrspace 3) ; GFX9: $vgpr0 = COPY [[DS_READ_U16_gfx9_]] + ; GFX6-LABEL: name: load_local_s32_from_2 + ; GFX6: liveins: $vgpr0 + ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX6: $m0 = S_MOV_B32 -1 + ; GFX6: [[DS_READ_U16_:%[0-9]+]]:vgpr_32 = DS_READ_U16 [[COPY]], 0, 0, implicit $m0, implicit $exec :: (load 2, addrspace 3) + ; GFX6: $vgpr0 = COPY [[DS_READ_U16_]] %0:vgpr(p3) = COPY $vgpr0 %1:vgpr(s32) = G_LOAD %0 :: (load 2, align 2, addrspace 3) $vgpr0 = COPY %1 @@ -81,19 +81,12 @@ tracksRegLiveness: true machineFunctionInfo: scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3 - scratchWaveOffsetReg: $sgpr4 stackPtrOffsetReg: $sgpr32 body: | bb.0: liveins: $vgpr0 - ; GFX6-LABEL: name: load_local_s32_from_1 - ; GFX6: liveins: $vgpr0 - ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX6: $m0 = S_MOV_B32 -1 - ; GFX6: [[DS_READ_U8_:%[0-9]+]]:vgpr_32 = DS_READ_U8 [[COPY]], 0, 0, implicit $m0, implicit $exec :: (load 1, addrspace 3) - ; GFX6: $vgpr0 = COPY [[DS_READ_U8_]] ; GFX7-LABEL: name: load_local_s32_from_1 ; GFX7: liveins: $vgpr0 ; GFX7: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 @@ -105,6 +98,12 @@ ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX9: [[DS_READ_U8_gfx9_:%[0-9]+]]:vgpr_32 = DS_READ_U8_gfx9 [[COPY]], 0, 0, implicit $exec :: (load 1, addrspace 3) ; GFX9: $vgpr0 = COPY [[DS_READ_U8_gfx9_]] + ; GFX6-LABEL: name: load_local_s32_from_1 + ; GFX6: liveins: $vgpr0 + ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX6: $m0 = S_MOV_B32 -1 + ; GFX6: [[DS_READ_U8_:%[0-9]+]]:vgpr_32 = DS_READ_U8 [[COPY]], 0, 0, implicit $m0, implicit $exec :: (load 1, addrspace 3) + ; GFX6: $vgpr0 = COPY [[DS_READ_U8_]] %0:vgpr(p3) = COPY $vgpr0 %1:vgpr(s32) = G_LOAD %0 :: (load 1, align 1, addrspace 3) $vgpr0 = COPY %1 @@ -122,12 +121,6 @@ bb.0: liveins: $vgpr0 - ; GFX6-LABEL: name: load_local_v2s32 - ; GFX6: liveins: $vgpr0 - ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX6: $m0 = S_MOV_B32 -1 - ; GFX6: [[DS_READ_B64_:%[0-9]+]]:vreg_64 = DS_READ_B64 [[COPY]], 0, 0, implicit $m0, implicit $exec :: (load 8, addrspace 3) - ; GFX6: $vgpr0_vgpr1 = COPY [[DS_READ_B64_]] ; GFX7-LABEL: name: load_local_v2s32 ; GFX7: liveins: $vgpr0 ; GFX7: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 @@ -139,6 +132,12 @@ ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX9: 
[[DS_READ_B64_gfx9_:%[0-9]+]]:vreg_64 = DS_READ_B64_gfx9 [[COPY]], 0, 0, implicit $exec :: (load 8, addrspace 3) ; GFX9: $vgpr0_vgpr1 = COPY [[DS_READ_B64_gfx9_]] + ; GFX6-LABEL: name: load_local_v2s32 + ; GFX6: liveins: $vgpr0 + ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX6: $m0 = S_MOV_B32 -1 + ; GFX6: [[DS_READ_B64_:%[0-9]+]]:vreg_64 = DS_READ_B64 [[COPY]], 0, 0, implicit $m0, implicit $exec :: (load 8, addrspace 3) + ; GFX6: $vgpr0_vgpr1 = COPY [[DS_READ_B64_]] %0:vgpr(p3) = COPY $vgpr0 %1:vgpr(<2 x s32>) = G_LOAD %0 :: (load 8, align 8, addrspace 3) $vgpr0_vgpr1 = COPY %1 @@ -156,12 +155,6 @@ bb.0: liveins: $vgpr0 - ; GFX6-LABEL: name: load_local_v2s32_align4 - ; GFX6: liveins: $vgpr0 - ; GFX6: [[COPY:%[0-9]+]]:vgpr(p3) = COPY $vgpr0 - ; GFX6: $m0 = S_MOV_B32 -1 - ; GFX6: [[LOAD:%[0-9]+]]:vreg_64(<2 x s32>) = G_LOAD [[COPY]](p3) :: (load 8, align 4, addrspace 3) - ; GFX6: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x s32>) ; GFX7-LABEL: name: load_local_v2s32_align4 ; GFX7: liveins: $vgpr0 ; GFX7: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 @@ -173,6 +166,12 @@ ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX9: [[DS_READ2_B32_gfx9_:%[0-9]+]]:vreg_64 = DS_READ2_B32_gfx9 [[COPY]], 0, 1, 0, implicit $exec :: (load 8, align 4, addrspace 3) ; GFX9: $vgpr0_vgpr1 = COPY [[DS_READ2_B32_gfx9_]] + ; GFX6-LABEL: name: load_local_v2s32_align4 + ; GFX6: liveins: $vgpr0 + ; GFX6: [[COPY:%[0-9]+]]:vgpr(p3) = COPY $vgpr0 + ; GFX6: $m0 = S_MOV_B32 -1 + ; GFX6: [[LOAD:%[0-9]+]]:vreg_64(<2 x s32>) = G_LOAD [[COPY]](p3) :: (load 8, align 4, addrspace 3) + ; GFX6: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x s32>) %0:vgpr(p3) = COPY $vgpr0 %1:vgpr(<2 x s32>) = G_LOAD %0 :: (load 8, align 4, addrspace 3) $vgpr0_vgpr1 = COPY %1 @@ -190,12 +189,6 @@ bb.0: liveins: $vgpr0 - ; GFX6-LABEL: name: load_local_s64 - ; GFX6: liveins: $vgpr0 - ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX6: $m0 = S_MOV_B32 -1 - ; GFX6: [[DS_READ_B64_:%[0-9]+]]:vreg_64 = DS_READ_B64 [[COPY]], 0, 0, implicit $m0, implicit $exec :: (load 8, addrspace 3) - ; GFX6: $vgpr0_vgpr1 = COPY [[DS_READ_B64_]] ; GFX7-LABEL: name: load_local_s64 ; GFX7: liveins: $vgpr0 ; GFX7: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 @@ -207,6 +200,12 @@ ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX9: [[DS_READ_B64_gfx9_:%[0-9]+]]:vreg_64 = DS_READ_B64_gfx9 [[COPY]], 0, 0, implicit $exec :: (load 8, addrspace 3) ; GFX9: $vgpr0_vgpr1 = COPY [[DS_READ_B64_gfx9_]] + ; GFX6-LABEL: name: load_local_s64 + ; GFX6: liveins: $vgpr0 + ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX6: $m0 = S_MOV_B32 -1 + ; GFX6: [[DS_READ_B64_:%[0-9]+]]:vreg_64 = DS_READ_B64 [[COPY]], 0, 0, implicit $m0, implicit $exec :: (load 8, addrspace 3) + ; GFX6: $vgpr0_vgpr1 = COPY [[DS_READ_B64_]] %0:vgpr(p3) = COPY $vgpr0 %1:vgpr(s64) = G_LOAD %0 :: (load 8, align 8, addrspace 3) $vgpr0_vgpr1 = COPY %1 @@ -224,12 +223,6 @@ bb.0: liveins: $vgpr0 - ; GFX6-LABEL: name: load_local_s64_align4 - ; GFX6: liveins: $vgpr0 - ; GFX6: [[COPY:%[0-9]+]]:vgpr(p3) = COPY $vgpr0 - ; GFX6: $m0 = S_MOV_B32 -1 - ; GFX6: [[LOAD:%[0-9]+]]:vreg_64(s64) = G_LOAD [[COPY]](p3) :: (load 8, align 4, addrspace 3) - ; GFX6: $vgpr0_vgpr1 = COPY [[LOAD]](s64) ; GFX7-LABEL: name: load_local_s64_align4 ; GFX7: liveins: $vgpr0 ; GFX7: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 @@ -241,6 +234,12 @@ ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX9: [[DS_READ2_B32_gfx9_:%[0-9]+]]:vreg_64 = DS_READ2_B32_gfx9 [[COPY]], 0, 1, 0, implicit $exec :: (load 8, align 4, addrspace 3) ; GFX9: $vgpr0_vgpr1 = COPY [[DS_READ2_B32_gfx9_]] + ; 
GFX6-LABEL: name: load_local_s64_align4 + ; GFX6: liveins: $vgpr0 + ; GFX6: [[COPY:%[0-9]+]]:vgpr(p3) = COPY $vgpr0 + ; GFX6: $m0 = S_MOV_B32 -1 + ; GFX6: [[LOAD:%[0-9]+]]:vreg_64(s64) = G_LOAD [[COPY]](p3) :: (load 8, align 4, addrspace 3) + ; GFX6: $vgpr0_vgpr1 = COPY [[LOAD]](s64) %0:vgpr(p3) = COPY $vgpr0 %1:vgpr(s64) = G_LOAD %0 :: (load 8, align 4, addrspace 3) $vgpr0_vgpr1 = COPY %1 @@ -258,12 +257,6 @@ bb.0: liveins: $vgpr0 - ; GFX6-LABEL: name: load_local_p3_from_4 - ; GFX6: liveins: $vgpr0 - ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX6: $m0 = S_MOV_B32 -1 - ; GFX6: [[DS_READ_B32_:%[0-9]+]]:vgpr_32 = DS_READ_B32 [[COPY]], 0, 0, implicit $m0, implicit $exec :: (load 4, addrspace 3) - ; GFX6: $vgpr0 = COPY [[DS_READ_B32_]] ; GFX7-LABEL: name: load_local_p3_from_4 ; GFX7: liveins: $vgpr0 ; GFX7: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 @@ -275,6 +268,12 @@ ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX9: [[DS_READ_B32_gfx9_:%[0-9]+]]:vgpr_32 = DS_READ_B32_gfx9 [[COPY]], 0, 0, implicit $exec :: (load 4, addrspace 3) ; GFX9: $vgpr0 = COPY [[DS_READ_B32_gfx9_]] + ; GFX6-LABEL: name: load_local_p3_from_4 + ; GFX6: liveins: $vgpr0 + ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX6: $m0 = S_MOV_B32 -1 + ; GFX6: [[DS_READ_B32_:%[0-9]+]]:vgpr_32 = DS_READ_B32 [[COPY]], 0, 0, implicit $m0, implicit $exec :: (load 4, addrspace 3) + ; GFX6: $vgpr0 = COPY [[DS_READ_B32_]] %0:vgpr(p3) = COPY $vgpr0 %1:vgpr(p3) = G_LOAD %0 :: (load 4, align 4, addrspace 3) $vgpr0 = COPY %1 @@ -292,12 +291,6 @@ bb.0: liveins: $vgpr0 - ; GFX6-LABEL: name: load_local_p5_from_4 - ; GFX6: liveins: $vgpr0 - ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX6: $m0 = S_MOV_B32 -1 - ; GFX6: [[DS_READ_B32_:%[0-9]+]]:vgpr_32 = DS_READ_B32 [[COPY]], 0, 0, implicit $m0, implicit $exec :: (load 4, addrspace 3) - ; GFX6: $vgpr0 = COPY [[DS_READ_B32_]] ; GFX7-LABEL: name: load_local_p5_from_4 ; GFX7: liveins: $vgpr0 ; GFX7: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 @@ -309,6 +302,12 @@ ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX9: [[DS_READ_B32_gfx9_:%[0-9]+]]:vgpr_32 = DS_READ_B32_gfx9 [[COPY]], 0, 0, implicit $exec :: (load 4, addrspace 3) ; GFX9: $vgpr0 = COPY [[DS_READ_B32_gfx9_]] + ; GFX6-LABEL: name: load_local_p5_from_4 + ; GFX6: liveins: $vgpr0 + ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX6: $m0 = S_MOV_B32 -1 + ; GFX6: [[DS_READ_B32_:%[0-9]+]]:vgpr_32 = DS_READ_B32 [[COPY]], 0, 0, implicit $m0, implicit $exec :: (load 4, addrspace 3) + ; GFX6: $vgpr0 = COPY [[DS_READ_B32_]] %0:vgpr(p3) = COPY $vgpr0 %1:vgpr(p3) = G_LOAD %0 :: (load 4, align 4, addrspace 3) $vgpr0 = COPY %1 @@ -326,12 +325,6 @@ bb.0: liveins: $vgpr0 - ; GFX6-LABEL: name: load_local_p1_align8 - ; GFX6: liveins: $vgpr0 - ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX6: $m0 = S_MOV_B32 -1 - ; GFX6: [[DS_READ_B64_:%[0-9]+]]:vreg_64 = DS_READ_B64 [[COPY]], 0, 0, implicit $m0, implicit $exec :: (load 8, addrspace 3) - ; GFX6: $vgpr0_vgpr1 = COPY [[DS_READ_B64_]] ; GFX7-LABEL: name: load_local_p1_align8 ; GFX7: liveins: $vgpr0 ; GFX7: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 @@ -343,6 +336,12 @@ ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX9: [[DS_READ_B64_gfx9_:%[0-9]+]]:vreg_64 = DS_READ_B64_gfx9 [[COPY]], 0, 0, implicit $exec :: (load 8, addrspace 3) ; GFX9: $vgpr0_vgpr1 = COPY [[DS_READ_B64_gfx9_]] + ; GFX6-LABEL: name: load_local_p1_align8 + ; GFX6: liveins: $vgpr0 + ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX6: $m0 = S_MOV_B32 -1 + ; GFX6: [[DS_READ_B64_:%[0-9]+]]:vreg_64 = DS_READ_B64 
+   ; GFX6: $vgpr0_vgpr1 = COPY [[DS_READ_B64_]]
    %0:vgpr(p3) = COPY $vgpr0
    %1:vgpr(p1) = G_LOAD %0 :: (load 8, align 8, addrspace 3)
    $vgpr0_vgpr1 = COPY %1
@@ -360,12 +359,6 @@
  bb.0:
    liveins: $vgpr0

-   ; GFX6-LABEL: name: load_local_p1_align4
-   ; GFX6: liveins: $vgpr0
-   ; GFX6: [[COPY:%[0-9]+]]:vgpr(p3) = COPY $vgpr0
-   ; GFX6: $m0 = S_MOV_B32 -1
-   ; GFX6: [[LOAD:%[0-9]+]]:vreg_64(p1) = G_LOAD [[COPY]](p3) :: (load 8, align 4, addrspace 3)
-   ; GFX6: $vgpr0_vgpr1 = COPY [[LOAD]](p1)
    ; GFX7-LABEL: name: load_local_p1_align4
    ; GFX7: liveins: $vgpr0
    ; GFX7: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
@@ -377,6 +370,12 @@
    ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
    ; GFX9: [[DS_READ2_B32_gfx9_:%[0-9]+]]:vreg_64 = DS_READ2_B32_gfx9 [[COPY]], 0, 1, 0, implicit $exec :: (load 8, align 4, addrspace 3)
    ; GFX9: $vgpr0_vgpr1 = COPY [[DS_READ2_B32_gfx9_]]
+   ; GFX6-LABEL: name: load_local_p1_align4
+   ; GFX6: liveins: $vgpr0
+   ; GFX6: [[COPY:%[0-9]+]]:vgpr(p3) = COPY $vgpr0
+   ; GFX6: $m0 = S_MOV_B32 -1
+   ; GFX6: [[LOAD:%[0-9]+]]:vreg_64(p1) = G_LOAD [[COPY]](p3) :: (load 8, align 4, addrspace 3)
+   ; GFX6: $vgpr0_vgpr1 = COPY [[LOAD]](p1)
    %0:vgpr(p3) = COPY $vgpr0
    %1:vgpr(p1) = G_LOAD %0 :: (load 8, align 4, addrspace 3)
    $vgpr0_vgpr1 = COPY %1
@@ -394,12 +393,6 @@
  bb.0:
    liveins: $vgpr0

-   ; GFX6-LABEL: name: load_local_p999_from_8
-   ; GFX6: liveins: $vgpr0
-   ; GFX6: [[COPY:%[0-9]+]]:vgpr(p3) = COPY $vgpr0
-   ; GFX6: $m0 = S_MOV_B32 -1
-   ; GFX6: [[LOAD:%[0-9]+]]:vreg_64(p999) = G_LOAD [[COPY]](p3) :: (load 8, addrspace 3)
-   ; GFX6: $vgpr0_vgpr1 = COPY [[LOAD]](p999)
    ; GFX7-LABEL: name: load_local_p999_from_8
    ; GFX7: liveins: $vgpr0
    ; GFX7: [[COPY:%[0-9]+]]:vgpr(p3) = COPY $vgpr0
@@ -411,6 +404,12 @@
    ; GFX9: [[COPY:%[0-9]+]]:vgpr(p3) = COPY $vgpr0
    ; GFX9: [[LOAD:%[0-9]+]]:vreg_64(p999) = G_LOAD [[COPY]](p3) :: (load 8, addrspace 3)
    ; GFX9: $vgpr0_vgpr1 = COPY [[LOAD]](p999)
+   ; GFX6-LABEL: name: load_local_p999_from_8
+   ; GFX6: liveins: $vgpr0
+   ; GFX6: [[COPY:%[0-9]+]]:vgpr(p3) = COPY $vgpr0
+   ; GFX6: $m0 = S_MOV_B32 -1
+   ; GFX6: [[LOAD:%[0-9]+]]:vreg_64(p999) = G_LOAD [[COPY]](p3) :: (load 8, addrspace 3)
+   ; GFX6: $vgpr0_vgpr1 = COPY [[LOAD]](p999)
    %0:vgpr(p3) = COPY $vgpr0
    %1:vgpr(p999) = G_LOAD %0 :: (load 8, align 8, addrspace 3)
    $vgpr0_vgpr1 = COPY %1
@@ -428,12 +427,6 @@
  bb.0:
    liveins: $vgpr0

-   ; GFX6-LABEL: name: load_local_v2p3
-   ; GFX6: liveins: $vgpr0
-   ; GFX6: [[COPY:%[0-9]+]]:vgpr(p3) = COPY $vgpr0
-   ; GFX6: $m0 = S_MOV_B32 -1
-   ; GFX6: [[LOAD:%[0-9]+]]:vreg_64(<2 x p3>) = G_LOAD [[COPY]](p3) :: (load 8, addrspace 3)
-   ; GFX6: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x p3>)
    ; GFX7-LABEL: name: load_local_v2p3
    ; GFX7: liveins: $vgpr0
    ; GFX7: [[COPY:%[0-9]+]]:vgpr(p3) = COPY $vgpr0
@@ -445,6 +438,12 @@
    ; GFX9: [[COPY:%[0-9]+]]:vgpr(p3) = COPY $vgpr0
    ; GFX9: [[LOAD:%[0-9]+]]:vreg_64(<2 x p3>) = G_LOAD [[COPY]](p3) :: (load 8, addrspace 3)
    ; GFX9: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x p3>)
+   ; GFX6-LABEL: name: load_local_v2p3
+   ; GFX6: liveins: $vgpr0
+   ; GFX6: [[COPY:%[0-9]+]]:vgpr(p3) = COPY $vgpr0
+   ; GFX6: $m0 = S_MOV_B32 -1
+   ; GFX6: [[LOAD:%[0-9]+]]:vreg_64(<2 x p3>) = G_LOAD [[COPY]](p3) :: (load 8, addrspace 3)
+   ; GFX6: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x p3>)
    %0:vgpr(p3) = COPY $vgpr0
    %1:vgpr(<2 x p3>) = G_LOAD %0 :: (load 8, align 8, addrspace 3)
    $vgpr0_vgpr1 = COPY %1
@@ -462,12 +461,6 @@
  bb.0:
    liveins: $vgpr0

-   ; GFX6-LABEL: name: load_local_v2s16
-   ; GFX6: liveins: $vgpr0
-   ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-   ; GFX6: $m0 = S_MOV_B32 -1
-   ; GFX6: [[DS_READ_B32_:%[0-9]+]]:vgpr_32 = DS_READ_B32 [[COPY]], 0, 0, implicit $m0, implicit $exec :: (load 4, addrspace 3)
-   ; GFX6: $vgpr0 = COPY [[DS_READ_B32_]]
    ; GFX7-LABEL: name: load_local_v2s16
    ; GFX7: liveins: $vgpr0
    ; GFX7: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
@@ -479,6 +472,12 @@
    ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
    ; GFX9: [[DS_READ_B32_gfx9_:%[0-9]+]]:vgpr_32 = DS_READ_B32_gfx9 [[COPY]], 0, 0, implicit $exec :: (load 4, addrspace 3)
    ; GFX9: $vgpr0 = COPY [[DS_READ_B32_gfx9_]]
+   ; GFX6-LABEL: name: load_local_v2s16
+   ; GFX6: liveins: $vgpr0
+   ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+   ; GFX6: $m0 = S_MOV_B32 -1
+   ; GFX6: [[DS_READ_B32_:%[0-9]+]]:vgpr_32 = DS_READ_B32 [[COPY]], 0, 0, implicit $m0, implicit $exec :: (load 4, addrspace 3)
+   ; GFX6: $vgpr0 = COPY [[DS_READ_B32_]]
    %0:vgpr(p3) = COPY $vgpr0
    %1:vgpr(<2 x s16>) = G_LOAD %0 :: (load 4, align 4, addrspace 3)
    $vgpr0 = COPY %1
@@ -496,12 +495,6 @@
  bb.0:
    liveins: $vgpr0

-   ; GFX6-LABEL: name: load_local_v4s16
-   ; GFX6: liveins: $vgpr0
-   ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-   ; GFX6: $m0 = S_MOV_B32 -1
-   ; GFX6: [[DS_READ_B64_:%[0-9]+]]:vreg_64 = DS_READ_B64 [[COPY]], 0, 0, implicit $m0, implicit $exec :: (load 8, addrspace 3)
-   ; GFX6: $vgpr0_vgpr1 = COPY [[DS_READ_B64_]]
    ; GFX7-LABEL: name: load_local_v4s16
    ; GFX7: liveins: $vgpr0
    ; GFX7: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
@@ -513,6 +506,12 @@
    ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
    ; GFX9: [[DS_READ_B64_gfx9_:%[0-9]+]]:vreg_64 = DS_READ_B64_gfx9 [[COPY]], 0, 0, implicit $exec :: (load 8, addrspace 3)
    ; GFX9: $vgpr0_vgpr1 = COPY [[DS_READ_B64_gfx9_]]
+   ; GFX6-LABEL: name: load_local_v4s16
+   ; GFX6: liveins: $vgpr0
+   ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+   ; GFX6: $m0 = S_MOV_B32 -1
+   ; GFX6: [[DS_READ_B64_:%[0-9]+]]:vreg_64 = DS_READ_B64 [[COPY]], 0, 0, implicit $m0, implicit $exec :: (load 8, addrspace 3)
+   ; GFX6: $vgpr0_vgpr1 = COPY [[DS_READ_B64_]]
    %0:vgpr(p3) = COPY $vgpr0
    %1:vgpr(<4 x s16>) = G_LOAD %0 :: (load 8, align 8, addrspace 3)
    $vgpr0_vgpr1 = COPY %1
@@ -527,7 +526,6 @@
 # tracksRegLiveness: true
 # machineFunctionInfo:
 #   scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3
-#   scratchWaveOffsetReg: $sgpr4
 #   stackPtrOffsetReg: $sgpr32
 # body: |
@@ -555,14 +553,6 @@
  bb.0:
    liveins: $vgpr0

-   ; GFX6-LABEL: name: load_local_s32_from_1_gep_65535
-   ; GFX6: liveins: $vgpr0
-   ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-   ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 65535, implicit $exec
-   ; GFX6: %2:vgpr_32, dead %4:sreg_64_xexec = V_ADD_I32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec
-   ; GFX6: $m0 = S_MOV_B32 -1
-   ; GFX6: [[DS_READ_U8_:%[0-9]+]]:vgpr_32 = DS_READ_U8 %2, 0, 0, implicit $m0, implicit $exec :: (load 1, addrspace 3)
-   ; GFX6: $vgpr0 = COPY [[DS_READ_U8_]]
    ; GFX7-LABEL: name: load_local_s32_from_1_gep_65535
    ; GFX7: liveins: $vgpr0
    ; GFX7: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
@@ -574,6 +564,14 @@
    ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
    ; GFX9: [[DS_READ_U8_gfx9_:%[0-9]+]]:vgpr_32 = DS_READ_U8_gfx9 [[COPY]], 65535, 0, implicit $exec :: (load 1, addrspace 3)
    ; GFX9: $vgpr0 = COPY [[DS_READ_U8_gfx9_]]
+   ; GFX6-LABEL: name: load_local_s32_from_1_gep_65535
+   ; GFX6: liveins: $vgpr0
+   ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+   ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 65535, implicit $exec
+   ; GFX6: %2:vgpr_32, dead %4:sreg_64_xexec = V_ADD_I32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec
+   ; GFX6: $m0 = S_MOV_B32 -1
+   ; GFX6: [[DS_READ_U8_:%[0-9]+]]:vgpr_32 = DS_READ_U8 %2, 0, 0, implicit $m0, implicit $exec :: (load 1, addrspace 3)
+   ; GFX6: $vgpr0 = COPY [[DS_READ_U8_]]
    %0:vgpr(p3) = COPY $vgpr0
    %1:vgpr(s32) = G_CONSTANT i32 65535
    %2:vgpr(p3) = G_PTR_ADD %0, %1
@@ -593,14 +591,6 @@
  bb.0:
    liveins: $vgpr0

-   ; GFX6-LABEL: name: load_local_s32_from_1_gep_65535_known_bits_base_address
-   ; GFX6: liveins: $vgpr0
-   ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-   ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 2147483647, implicit $exec
-   ; GFX6: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY]], [[V_MOV_B32_e32_]], implicit $exec
-   ; GFX6: $m0 = S_MOV_B32 -1
-   ; GFX6: [[DS_READ_U8_:%[0-9]+]]:vgpr_32 = DS_READ_U8 [[V_AND_B32_e64_]], 65535, 0, implicit $m0, implicit $exec :: (load 1, addrspace 3)
-   ; GFX6: $vgpr0 = COPY [[DS_READ_U8_]]
    ; GFX7-LABEL: name: load_local_s32_from_1_gep_65535_known_bits_base_address
    ; GFX7: liveins: $vgpr0
    ; GFX7: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
@@ -616,6 +606,14 @@
    ; GFX9: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY]], [[V_MOV_B32_e32_]], implicit $exec
    ; GFX9: [[DS_READ_U8_gfx9_:%[0-9]+]]:vgpr_32 = DS_READ_U8_gfx9 [[V_AND_B32_e64_]], 65535, 0, implicit $exec :: (load 1, addrspace 3)
    ; GFX9: $vgpr0 = COPY [[DS_READ_U8_gfx9_]]
+   ; GFX6-LABEL: name: load_local_s32_from_1_gep_65535_known_bits_base_address
+   ; GFX6: liveins: $vgpr0
+   ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+   ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 2147483647, implicit $exec
+   ; GFX6: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY]], [[V_MOV_B32_e32_]], implicit $exec
+   ; GFX6: $m0 = S_MOV_B32 -1
+   ; GFX6: [[DS_READ_U8_:%[0-9]+]]:vgpr_32 = DS_READ_U8 [[V_AND_B32_e64_]], 65535, 0, implicit $m0, implicit $exec :: (load 1, addrspace 3)
+   ; GFX6: $vgpr0 = COPY [[DS_READ_U8_]]
    %0:vgpr(s32) = COPY $vgpr0
    %1:vgpr(s32) = G_CONSTANT i32 2147483647
    %2:vgpr(s32) = G_AND %0, %1
@@ -638,14 +636,6 @@
  bb.0:
    liveins: $vgpr0

-   ; GFX6-LABEL: name: load_local_s32_from_1_gep_65536
-   ; GFX6: liveins: $vgpr0
-   ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-   ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 65536, implicit $exec
-   ; GFX6: %2:vgpr_32, dead %4:sreg_64_xexec = V_ADD_I32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec
-   ; GFX6: $m0 = S_MOV_B32 -1
-   ; GFX6: [[DS_READ_U8_:%[0-9]+]]:vgpr_32 = DS_READ_U8 %2, 0, 0, implicit $m0, implicit $exec :: (load 1, addrspace 3)
-   ; GFX6: $vgpr0 = COPY [[DS_READ_U8_]]
    ; GFX7-LABEL: name: load_local_s32_from_1_gep_65536
    ; GFX7: liveins: $vgpr0
    ; GFX7: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
@@ -661,6 +651,14 @@
    ; GFX9: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec
    ; GFX9: [[DS_READ_U8_gfx9_:%[0-9]+]]:vgpr_32 = DS_READ_U8_gfx9 [[V_ADD_U32_e64_]], 0, 0, implicit $exec :: (load 1, addrspace 3)
    ; GFX9: $vgpr0 = COPY [[DS_READ_U8_gfx9_]]
+   ; GFX6-LABEL: name: load_local_s32_from_1_gep_65536
+   ; GFX6: liveins: $vgpr0
+   ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+   ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 65536, implicit $exec
+   ; GFX6: %2:vgpr_32, dead %4:sreg_64_xexec = V_ADD_I32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec
+   ; GFX6: $m0 = S_MOV_B32 -1
+   ; GFX6: [[DS_READ_U8_:%[0-9]+]]:vgpr_32 = DS_READ_U8 %2, 0, 0, implicit $m0, implicit $exec :: (load 1, addrspace 3)
+   ; GFX6: $vgpr0 = COPY [[DS_READ_U8_]]
    %0:vgpr(p3) = COPY $vgpr0
    %1:vgpr(s32) = G_CONSTANT i32 65536
    %2:vgpr(p3) = G_PTR_ADD %0, %1
@@ -680,14 +678,6 @@
  bb.0:
    liveins: $vgpr0
-   ; GFX6-LABEL: name: load_local_s32_from_1_gep_m1
-   ; GFX6: liveins: $vgpr0
-   ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-   ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294967295, implicit $exec
-   ; GFX6: %2:vgpr_32, dead %4:sreg_64_xexec = V_ADD_I32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec
-   ; GFX6: $m0 = S_MOV_B32 -1
-   ; GFX6: [[DS_READ_U8_:%[0-9]+]]:vgpr_32 = DS_READ_U8 %2, 0, 0, implicit $m0, implicit $exec :: (load 1, addrspace 3)
-   ; GFX6: $vgpr0 = COPY [[DS_READ_U8_]]
    ; GFX7-LABEL: name: load_local_s32_from_1_gep_m1
    ; GFX7: liveins: $vgpr0
    ; GFX7: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
@@ -703,6 +693,14 @@
    ; GFX9: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec
    ; GFX9: [[DS_READ_U8_gfx9_:%[0-9]+]]:vgpr_32 = DS_READ_U8_gfx9 [[V_ADD_U32_e64_]], 0, 0, implicit $exec :: (load 1, addrspace 3)
    ; GFX9: $vgpr0 = COPY [[DS_READ_U8_gfx9_]]
+   ; GFX6-LABEL: name: load_local_s32_from_1_gep_m1
+   ; GFX6: liveins: $vgpr0
+   ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+   ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294967295, implicit $exec
+   ; GFX6: %2:vgpr_32, dead %4:sreg_64_xexec = V_ADD_I32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec
+   ; GFX6: $m0 = S_MOV_B32 -1
+   ; GFX6: [[DS_READ_U8_:%[0-9]+]]:vgpr_32 = DS_READ_U8 %2, 0, 0, implicit $m0, implicit $exec :: (load 1, addrspace 3)
+   ; GFX6: $vgpr0 = COPY [[DS_READ_U8_]]
    %0:vgpr(p3) = COPY $vgpr0
    %1:vgpr(s32) = G_CONSTANT i32 -1
    %2:vgpr(p3) = G_PTR_ADD %0, %1
@@ -722,14 +720,6 @@
  bb.0:
    liveins: $vgpr0_vgpr1

-   ; GFX6-LABEL: name: load_local_s64_align4_from_1_gep_1016
-   ; GFX6: liveins: $vgpr0_vgpr1
-   ; GFX6: [[COPY:%[0-9]+]]:vgpr(p3) = COPY $vgpr0
-   ; GFX6: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 1016
-   ; GFX6: [[PTR_ADD:%[0-9]+]]:vgpr(p3) = G_PTR_ADD [[COPY]], [[C]](s32)
-   ; GFX6: $m0 = S_MOV_B32 -1
-   ; GFX6: [[LOAD:%[0-9]+]]:vreg_64(s64) = G_LOAD [[PTR_ADD]](p3) :: (load 8, align 4, addrspace 3)
-   ; GFX6: $vgpr0_vgpr1 = COPY [[LOAD]](s64)
    ; GFX7-LABEL: name: load_local_s64_align4_from_1_gep_1016
    ; GFX7: liveins: $vgpr0_vgpr1
    ; GFX7: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
@@ -741,6 +731,14 @@
    ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
    ; GFX9: [[DS_READ2_B32_gfx9_:%[0-9]+]]:vreg_64 = DS_READ2_B32_gfx9 [[COPY]], 254, 255, 0, implicit $exec :: (load 8, align 4, addrspace 3)
    ; GFX9: $vgpr0_vgpr1 = COPY [[DS_READ2_B32_gfx9_]]
+   ; GFX6-LABEL: name: load_local_s64_align4_from_1_gep_1016
+   ; GFX6: liveins: $vgpr0_vgpr1
+   ; GFX6: [[COPY:%[0-9]+]]:vgpr(p3) = COPY $vgpr0
+   ; GFX6: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 1016
+   ; GFX6: [[PTR_ADD:%[0-9]+]]:vgpr(p3) = G_PTR_ADD [[COPY]], [[C]](s32)
+   ; GFX6: $m0 = S_MOV_B32 -1
+   ; GFX6: [[LOAD:%[0-9]+]]:vreg_64(s64) = G_LOAD [[PTR_ADD]](p3) :: (load 8, align 4, addrspace 3)
+   ; GFX6: $vgpr0_vgpr1 = COPY [[LOAD]](s64)
    %0:vgpr(p3) = COPY $vgpr0
    %1:vgpr(s32) = G_CONSTANT i32 1016
    %2:vgpr(p3) = G_PTR_ADD %0, %1
@@ -760,14 +758,6 @@
  bb.0:
    liveins: $vgpr0_vgpr1

-   ; GFX6-LABEL: name: load_local_s64_align4_from_1_gep_1020
-   ; GFX6: liveins: $vgpr0_vgpr1
-   ; GFX6: [[COPY:%[0-9]+]]:vgpr(p3) = COPY $vgpr0
-   ; GFX6: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 1020
-   ; GFX6: [[PTR_ADD:%[0-9]+]]:vgpr(p3) = G_PTR_ADD [[COPY]], [[C]](s32)
-   ; GFX6: $m0 = S_MOV_B32 -1
-   ; GFX6: [[LOAD:%[0-9]+]]:vreg_64(s64) = G_LOAD [[PTR_ADD]](p3) :: (load 8, align 4, addrspace 3)
-   ; GFX6: $vgpr0_vgpr1 = COPY [[LOAD]](s64)
    ; GFX7-LABEL: name: load_local_s64_align4_from_1_gep_1020
    ; GFX7: liveins: $vgpr0_vgpr1
    ; GFX7: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
@@ -783,6 +773,14 @@
    ; GFX9: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec
    ; GFX9: [[DS_READ2_B32_gfx9_:%[0-9]+]]:vreg_64 = DS_READ2_B32_gfx9 [[V_ADD_U32_e64_]], 0, 1, 0, implicit $exec :: (load 8, align 4, addrspace 3)
    ; GFX9: $vgpr0_vgpr1 = COPY [[DS_READ2_B32_gfx9_]]
+   ; GFX6-LABEL: name: load_local_s64_align4_from_1_gep_1020
+   ; GFX6: liveins: $vgpr0_vgpr1
+   ; GFX6: [[COPY:%[0-9]+]]:vgpr(p3) = COPY $vgpr0
+   ; GFX6: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 1020
+   ; GFX6: [[PTR_ADD:%[0-9]+]]:vgpr(p3) = G_PTR_ADD [[COPY]], [[C]](s32)
+   ; GFX6: $m0 = S_MOV_B32 -1
+   ; GFX6: [[LOAD:%[0-9]+]]:vreg_64(s64) = G_LOAD [[PTR_ADD]](p3) :: (load 8, align 4, addrspace 3)
+   ; GFX6: $vgpr0_vgpr1 = COPY [[LOAD]](s64)
    %0:vgpr(p3) = COPY $vgpr0
    %1:vgpr(s32) = G_CONSTANT i32 1020
    %2:vgpr(p3) = G_PTR_ADD %0, %1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-private.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-private.mir
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-private.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-private.mir
@@ -10,7 +10,6 @@
 tracksRegLiveness: true
 machineFunctionInfo:
   scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3
-  scratchWaveOffsetReg: $sgpr4
   stackPtrOffsetReg: $sgpr32

 body: |
@@ -20,12 +19,12 @@
    ; GFX6-LABEL: name: load_private_s32_from_4
    ; GFX6: liveins: $vgpr0
    ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-   ; GFX6: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 5)
+   ; GFX6: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 5)
    ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
    ; GFX9-LABEL: name: load_private_s32_from_4
    ; GFX9: liveins: $vgpr0
    ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-   ; GFX9: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 5)
+   ; GFX9: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 5)
    ; GFX9: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
    %0:vgpr(p5) = COPY $vgpr0
    %1:vgpr(s32) = G_LOAD %0 :: (load 4, align 4, addrspace 5)
@@ -41,7 +40,6 @@
 tracksRegLiveness: true
 machineFunctionInfo:
   scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3
-  scratchWaveOffsetReg: $sgpr4
   stackPtrOffsetReg: $sgpr32

 body: |
@@ -51,12 +49,12 @@
    ; GFX6-LABEL: name: load_private_s32_from_2
    ; GFX6: liveins: $vgpr0
    ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-   ; GFX6: [[BUFFER_LOAD_USHORT_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_OFFEN [[COPY]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 2, addrspace 5)
+   ; GFX6: [[BUFFER_LOAD_USHORT_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_OFFEN [[COPY]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 2, addrspace 5)
    ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_USHORT_OFFEN]]
    ; GFX9-LABEL: name: load_private_s32_from_2
    ; GFX9: liveins: $vgpr0
    ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-   ; GFX9: [[BUFFER_LOAD_USHORT_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_OFFEN [[COPY]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 2, addrspace 5)
+   ; GFX9: [[BUFFER_LOAD_USHORT_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_OFFEN [[COPY]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 2, addrspace 5)
    ; GFX9: $vgpr0 = COPY [[BUFFER_LOAD_USHORT_OFFEN]]
    %0:vgpr(p5) = COPY $vgpr0
    %1:vgpr(s32) = G_LOAD %0 :: (load 2, align 2, addrspace 5)
@@ -72,7 +70,6 @@
 tracksRegLiveness: true
 machineFunctionInfo:
   scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3
-  scratchWaveOffsetReg: $sgpr4
   stackPtrOffsetReg: $sgpr32

 body: |
@@ -82,12 +79,12 @@
    ; GFX6-LABEL: name: load_private_s32_from_1
    ; GFX6: liveins: $vgpr0
    ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-   ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[COPY]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
+   ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[COPY]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
    ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]]
    ; GFX9-LABEL: name: load_private_s32_from_1
    ; GFX9: liveins: $vgpr0
    ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-   ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[COPY]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
+   ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[COPY]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
    ; GFX9: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]]
    %0:vgpr(p5) = COPY $vgpr0
    %1:vgpr(s32) = G_LOAD %0 :: (load 1, align 1, addrspace 5)
@@ -130,7 +127,6 @@
 tracksRegLiveness: true
 machineFunctionInfo:
   scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3
-  scratchWaveOffsetReg: $sgpr4
   stackPtrOffsetReg: $sgpr32

 body: |
@@ -161,7 +157,6 @@
 tracksRegLiveness: true
 machineFunctionInfo:
   scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3
-  scratchWaveOffsetReg: $sgpr4
   stackPtrOffsetReg: $sgpr32

 body: |
@@ -196,7 +191,6 @@
 tracksRegLiveness: true
 machineFunctionInfo:
   scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3
-  scratchWaveOffsetReg: $sgpr4
   stackPtrOffsetReg: $sgpr32

 body: |
@@ -208,12 +202,12 @@
    ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
    ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 2047, implicit $exec
    ; GFX6: %2:vgpr_32, dead %4:sreg_64_xexec = V_ADD_I32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec
-   ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
+   ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %2, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
    ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]]
    ; GFX9-LABEL: name: load_private_s32_from_1_gep_2047
    ; GFX9: liveins: $vgpr0
    ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-   ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[COPY]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 2047, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
+   ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[COPY]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 2047, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
    ; GFX9: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]]
    %0:vgpr(p5) = COPY $vgpr0
    %1:vgpr(s32) = G_CONSTANT i32 2047
@@ -231,7 +225,6 @@
 tracksRegLiveness: true
 machineFunctionInfo:
   scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3
-  scratchWaveOffsetReg: $sgpr4
   stackPtrOffsetReg: $sgpr32
 body: |
@@ -243,14 +236,14 @@
    ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
    ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 2147483647, implicit $exec
    ; GFX6: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY]], [[V_MOV_B32_e32_]], implicit $exec
-   ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_AND_B32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 2047, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
+   ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_AND_B32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 2047, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
    ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]]
    ; GFX9-LABEL: name: load_private_s32_from_1_gep_2047_known_bits
    ; GFX9: liveins: $vgpr0
    ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
    ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 2147483647, implicit $exec
    ; GFX9: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY]], [[V_MOV_B32_e32_]], implicit $exec
-   ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_AND_B32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 2047, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
+   ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_AND_B32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 2047, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
    ; GFX9: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]]
    %0:vgpr(s32) = COPY $vgpr0
    %1:vgpr(s32) = G_CONSTANT i32 2147483647
@@ -271,7 +264,6 @@
 tracksRegLiveness: true
 machineFunctionInfo:
   scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3
-  scratchWaveOffsetReg: $sgpr4
   stackPtrOffsetReg: $sgpr32

 body: |
@@ -283,12 +275,12 @@
    ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
    ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 2048, implicit $exec
    ; GFX6: %2:vgpr_32, dead %4:sreg_64_xexec = V_ADD_I32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec
-   ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
+   ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %2, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
    ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]]
    ; GFX9-LABEL: name: load_private_s32_from_1_gep_2048
    ; GFX9: liveins: $vgpr0
    ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-   ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[COPY]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 2048, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
+   ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[COPY]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 2048, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
    ; GFX9: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]]
    %0:vgpr(p5) = COPY $vgpr0
    %1:vgpr(s32) = G_CONSTANT i32 2048
@@ -306,7 +298,6 @@
 tracksRegLiveness: true
 machineFunctionInfo:
   scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3
-  scratchWaveOffsetReg: $sgpr4
   stackPtrOffsetReg: $sgpr32

 body: |
@@ -318,14 +309,14 @@
    ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
    ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294965249, implicit $exec
    ; GFX6: %2:vgpr_32, dead %4:sreg_64_xexec = V_ADD_I32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec
-   ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
+   ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %2, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
    ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]]
    ; GFX9-LABEL: name: load_private_s32_from_1_gep_m2047
    ; GFX9: liveins: $vgpr0
    ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
    ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294965249, implicit $exec
    ; GFX9: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec
-   ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_ADD_U32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
+   ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_ADD_U32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
    ; GFX9: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]]
    %0:vgpr(p5) = COPY $vgpr0
    %1:vgpr(s32) = G_CONSTANT i32 -2047
@@ -343,7 +334,6 @@
 tracksRegLiveness: true
 machineFunctionInfo:
   scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3
-  scratchWaveOffsetReg: $sgpr4
   stackPtrOffsetReg: $sgpr32

 body: |
@@ -355,14 +345,14 @@
    ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
    ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294965248, implicit $exec
    ; GFX6: %2:vgpr_32, dead %4:sreg_64_xexec = V_ADD_I32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec
-   ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
+   ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %2, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
    ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]]
    ; GFX9-LABEL: name: load_private_s32_from_1_gep_m2048
    ; GFX9: liveins: $vgpr0
    ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
    ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294965248, implicit $exec
    ; GFX9: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec
-   ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_ADD_U32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
+   ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_ADD_U32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
    ; GFX9: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]]
    %0:vgpr(p5) = COPY $vgpr0
    %1:vgpr(s32) = G_CONSTANT i32 -2048
@@ -380,7 +370,6 @@
 tracksRegLiveness: true
 machineFunctionInfo:
   scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3
-  scratchWaveOffsetReg: $sgpr4
   stackPtrOffsetReg: $sgpr32

 body: |
@@ -392,12 +381,12 @@
    ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
    ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4095, implicit $exec
    ; GFX6: %2:vgpr_32, dead %4:sreg_64_xexec = V_ADD_I32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec
-   ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
+   ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %2, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
    ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]]
    ; GFX9-LABEL: name: load_private_s32_from_1_gep_4095
    ; GFX9: liveins: $vgpr0
    ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-   ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[COPY]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4095, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
+   ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[COPY]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4095, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
    ; GFX9: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]]
    %0:vgpr(p5) = COPY $vgpr0
    %1:vgpr(s32) = G_CONSTANT i32 4095
@@ -415,7 +404,6 @@
 tracksRegLiveness: true
 machineFunctionInfo:
   scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3
-  scratchWaveOffsetReg: $sgpr4
   stackPtrOffsetReg: $sgpr32

 body: |
@@ -427,14 +415,14 @@
    ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
    ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec
    ; GFX6: %2:vgpr_32, dead %4:sreg_64_xexec = V_ADD_I32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec
-   ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
+   ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %2, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
    ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]]
    ; GFX9-LABEL: name: load_private_s32_from_1_gep_4096
    ; GFX9: liveins: $vgpr0
    ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
    ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec
    ; GFX9: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec
-   ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_ADD_U32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
+   ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_ADD_U32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
    ; GFX9: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]]
    %0:vgpr(p5) = COPY $vgpr0
    %1:vgpr(s32) = G_CONSTANT i32 4096
@@ -452,7 +440,6 @@
 tracksRegLiveness: true
 machineFunctionInfo:
   scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3
-  scratchWaveOffsetReg: $sgpr4
   stackPtrOffsetReg: $sgpr32

 body: |
@@ -464,14 +451,14 @@
    ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
    ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294963201, implicit $exec
    ; GFX6: %2:vgpr_32, dead %4:sreg_64_xexec = V_ADD_I32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec
-   ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
+   ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %2, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
    ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]]
    ; GFX9-LABEL: name: load_private_s32_from_1_gep_m4095
    ; GFX9: liveins: $vgpr0
    ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
    ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294963201, implicit $exec
    ; GFX9: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec
-   ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_ADD_U32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
+   ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_ADD_U32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
    ; GFX9: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]]
    %0:vgpr(p5) = COPY $vgpr0
    %1:vgpr(s32) = G_CONSTANT i32 -4095
@@ -489,7 +476,6 @@
 tracksRegLiveness: true
 machineFunctionInfo:
   scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3
-  scratchWaveOffsetReg: $sgpr4
   stackPtrOffsetReg: $sgpr32

 body: |
@@ -501,14 +487,14 @@
    ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
    ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294963200, implicit $exec
    ; GFX6: %2:vgpr_32, dead %4:sreg_64_xexec = V_ADD_I32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec
-   ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
+   ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %2, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
    ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]]
    ; GFX9-LABEL: name: load_private_s32_from_1_gep_m4096
    ; GFX9: liveins: $vgpr0
    ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
    ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294963200, implicit $exec
    ; GFX9: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec
-   ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_ADD_U32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
+   ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_ADD_U32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
    ; GFX9: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]]
    %0:vgpr(p5) = COPY $vgpr0
    %1:vgpr(s32) = G_CONSTANT i32 -4096
@@ -526,7 +512,6 @@
 tracksRegLiveness: true
 machineFunctionInfo:
   scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3
-  scratchWaveOffsetReg: $sgpr4
   stackPtrOffsetReg: $sgpr32

 body: |
@@ -538,14 +523,14 @@
    ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
    ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 8191, implicit $exec
    ; GFX6: %2:vgpr_32, dead %4:sreg_64_xexec = V_ADD_I32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec
-   ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
+   ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %2, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
    ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]]
    ; GFX9-LABEL: name: load_private_s32_from_1_gep_8191
    ; GFX9: liveins: $vgpr0
    ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
    ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 8191, implicit $exec
    ; GFX9: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec
-   ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_ADD_U32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
+   ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_ADD_U32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
    ; GFX9: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]]
    %0:vgpr(p5) = COPY $vgpr0
    %1:vgpr(s32) = G_CONSTANT i32 8191
@@ -563,7 +548,6 @@
 tracksRegLiveness: true
 machineFunctionInfo:
   scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3
-  scratchWaveOffsetReg: $sgpr4
   stackPtrOffsetReg: $sgpr32

 body: |
@@ -575,14 +559,14 @@
    ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
    ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 8192, implicit $exec
    ; GFX6: %2:vgpr_32, dead %4:sreg_64_xexec = V_ADD_I32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec
-   ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
+   ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %2, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
    ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]]
    ; GFX9-LABEL: name: load_private_s32_from_1_gep_8192
    ; GFX9: liveins: $vgpr0
    ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
    ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 8192, implicit $exec
    ; GFX9: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec
-   ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_ADD_U32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
+   ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_ADD_U32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
    ; GFX9: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]]
    %0:vgpr(p5) = COPY $vgpr0
    %1:vgpr(s32) = G_CONSTANT i32 8192
@@ -600,7 +584,6 @@
 tracksRegLiveness: true
 machineFunctionInfo:
   scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3
-  scratchWaveOffsetReg: $sgpr4
   stackPtrOffsetReg: $sgpr32

 body: |
@@ -612,14 +595,14 @@
    ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
    ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294959105, implicit $exec
    ; GFX6: %2:vgpr_32, dead %4:sreg_64_xexec = V_ADD_I32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec
-   ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
+   ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %2, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
    ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]]
    ; GFX9-LABEL: name: load_private_s32_from_1_gep_m8191
    ; GFX9: liveins: $vgpr0
    ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
    ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294959105, implicit $exec
    ; GFX9: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec
-   ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_ADD_U32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
+   ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_ADD_U32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
    ; GFX9: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]]
    %0:vgpr(p5) = COPY $vgpr0
    %1:vgpr(s32) = G_CONSTANT i32 -8191
@@ -637,7 +620,6 @@
 tracksRegLiveness: true
 machineFunctionInfo:
   scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3
-  scratchWaveOffsetReg: $sgpr4
   stackPtrOffsetReg: $sgpr32

 body: |
@@ -649,14 +631,14 @@
    ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
    ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294959104, implicit $exec
    ; GFX6: %2:vgpr_32, dead %4:sreg_64_xexec = V_ADD_I32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec
-   ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
+   ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %2, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
    ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]]
    ; GFX9-LABEL: name: load_private_s32_from_1_gep_m8192
    ; GFX9: liveins: $vgpr0
    ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
    ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294959104, implicit $exec
    ; GFX9: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec
-   ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_ADD_U32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
+   ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_ADD_U32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
    ; GFX9: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]]
    %0:vgpr(p5) = COPY $vgpr0
    %1:vgpr(s32) = G_CONSTANT i32 -8192
@@ -674,17 +656,16 @@
 tracksRegLiveness: true
 machineFunctionInfo:
   scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3
-  scratchWaveOffsetReg: $sgpr4
   stackPtrOffsetReg: $sgpr32

 body: |
  bb.0:

    ; GFX6-LABEL: name: load_private_s32_from_4_constant_0
-   ; GFX6: [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 5)
+   ; GFX6: [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 5)
    ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFSET]]
    ; GFX9-LABEL: name: load_private_s32_from_4_constant_0
-   ; GFX9: [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 5)
+   ; GFX9: [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 5)
    ; GFX9: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFSET]]
    %0:vgpr(p5) = G_CONSTANT i32 0
    %1:vgpr(s32) = G_LOAD %0 :: (load 4, align 4, addrspace 5)
@@ -700,17 +681,16 @@
 tracksRegLiveness: true
 machineFunctionInfo:
   scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3
-  scratchWaveOffsetReg: $sgpr4
   stackPtrOffsetReg: $sgpr32

 body: |
  bb.0:

    ; GFX6-LABEL: name: load_private_s32_from_4_constant_sgpr_16
-   ; GFX6: [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 16, 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 5)
+   ; GFX6: [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 16, 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 5)
    ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFSET]]
    ; GFX9-LABEL: name: load_private_s32_from_4_constant_sgpr_16
-   ; GFX9: [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 16, 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 5)
+   ; GFX9: [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 16, 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 5)
    ; GFX9: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFSET]]
    %0:sgpr(p5) = G_CONSTANT i32 16
    %1:vgpr(s32) = G_LOAD %0 :: (load 4, align 4, addrspace 5)
@@ -726,17 +706,16 @@
 tracksRegLiveness: true
 machineFunctionInfo:
   scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3
-  scratchWaveOffsetReg: $sgpr4
   stackPtrOffsetReg: $sgpr32

 body: |
  bb.0:

    ; GFX6-LABEL: name: load_private_s32_from_1_constant_4095
-   ; GFX6: [[BUFFER_LOAD_UBYTE_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4095, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
+   ; GFX6: [[BUFFER_LOAD_UBYTE_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4095, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
    ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFSET]]
    ; GFX9-LABEL: name: load_private_s32_from_1_constant_4095
-   ; GFX9: [[BUFFER_LOAD_UBYTE_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4095, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
+   ; GFX9: [[BUFFER_LOAD_UBYTE_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4095, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
    ; GFX9: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFSET]]
    %0:vgpr(p5) = G_CONSTANT i32 4095
    %1:vgpr(s32) = G_LOAD %0 :: (load 1, align 1, addrspace 5)
@@ -752,7 +731,6 @@
 tracksRegLiveness: true
 machineFunctionInfo:
   scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3
-  scratchWaveOffsetReg: $sgpr4
   stackPtrOffsetReg: $sgpr32

 body: |
@@ -760,11 +738,11 @@
    ; GFX6-LABEL: name: load_private_s32_from_1_constant_4096
    ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec
-   ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_MOV_B32_e32_]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
+   ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_MOV_B32_e32_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
    ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]]
    ; GFX9-LABEL: name: load_private_s32_from_1_constant_4096
    ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec
-   ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_MOV_B32_e32_]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
+   ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_MOV_B32_e32_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
    ; GFX9: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]]
    %0:vgpr(p5) = G_CONSTANT i32 4096
    %1:vgpr(s32) = G_LOAD %0 :: (load 1, align 1, addrspace 5)
@@ -780,7 +758,6 @@
 tracksRegLiveness: true
 machineFunctionInfo:
   scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3
-  scratchWaveOffsetReg: $sgpr4
   stackPtrOffsetReg: $sgpr32
 stack:
   - { id: 0, size: 4, alignment: 4 }
@@ -808,7 +785,6 @@
 tracksRegLiveness: true
 machineFunctionInfo:
   scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3
-  scratchWaveOffsetReg: $sgpr4
   stackPtrOffsetReg: $sgpr32
 stack:
   - { id: 0, size: 4096, alignment: 4 }
@@ -820,7 +796,7 @@
    ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
    ; GFX6: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4095, implicit $exec
    ; GFX6: %2:vgpr_32, dead %4:sreg_64_xexec = V_ADD_I32_e64 [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_1]], 0, implicit $exec
-   ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
+   ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %2, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
    ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]]
    ; GFX9-LABEL: name: load_private_s32_from_1_fi_offset_4095
    ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4095, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
@@ -841,7 +817,6 @@
 tracksRegLiveness: true
 machineFunctionInfo:
   scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3
-  scratchWaveOffsetReg: $sgpr4
   stackPtrOffsetReg: $sgpr32
 stack:
   - { id: 0, size: 8192, alignment: 4 }
@@ -853,13 +828,13 @@
    ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
    ; GFX6: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec
    ; GFX6: %2:vgpr_32, dead %4:sreg_64_xexec = V_ADD_I32_e64 [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_1]], 0, implicit $exec
-   ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
+   ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %2, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
    ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]]
    ; GFX9-LABEL: name: load_private_s32_from_1_fi_offset_4096
    ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
    ; GFX9: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec
    ; GFX9: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_1]], 0, implicit $exec
-   ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_ADD_U32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
+   ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_ADD_U32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
    ; GFX9: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]]
    %0:vgpr(p5) = G_FRAME_INDEX %stack.0
    %1:vgpr(s32) = G_CONSTANT i32 4096
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-local.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-local.mir
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-local.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-local.mir
@@ -13,19 +13,12 @@
 tracksRegLiveness: true
 machineFunctionInfo:
   scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3
-  scratchWaveOffsetReg: $sgpr4
   stackPtrOffsetReg: $sgpr32

 body: |
  bb.0:
    liveins: $vgpr0, $vgpr1

-   ; GFX6-LABEL: name: store_local_s32_to_4
-   ; GFX6: liveins: $vgpr0, $vgpr1
-   ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-   ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-   ; GFX6: $m0 = S_MOV_B32 -1
-   ; GFX6: DS_WRITE_B32 [[COPY1]], [[COPY]], 0, 0, implicit $m0, implicit $exec :: (store 4, addrspace 3)
    ; GFX7-LABEL: name: store_local_s32_to_4
    ; GFX7: liveins: $vgpr0, $vgpr1
    ; GFX7: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
@@ -37,6 +30,12 @@
    ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
    ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
    ; GFX9: DS_WRITE_B32_gfx9 [[COPY1]], [[COPY]], 0, 0, implicit $exec :: (store 4, addrspace 3)
+   ; GFX6-LABEL: name: store_local_s32_to_4
+   ; GFX6: liveins: $vgpr0, $vgpr1
+   ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+   ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+   ; GFX6: $m0 = S_MOV_B32 -1
+   ; GFX6: DS_WRITE_B32 [[COPY1]], [[COPY]], 0, 0, implicit $m0, implicit $exec :: (store 4, addrspace 3)
    %0:vgpr(s32) = COPY $vgpr0
    %1:vgpr(p3) = COPY $vgpr1
    G_STORE %0, %1 :: (store 4, align 4, addrspace 3)
@@ -51,19 +50,12 @@
 tracksRegLiveness: true
 machineFunctionInfo:
   scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3
-  scratchWaveOffsetReg: $sgpr4
   stackPtrOffsetReg: $sgpr32

 body: |
  bb.0:
    liveins: $vgpr0, $vgpr1

-   ; GFX6-LABEL: name: store_local_s32_to_2
-   ; GFX6: liveins: $vgpr0, $vgpr1
-   ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-   ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-   ; GFX6: $m0 = S_MOV_B32 -1
-   ; GFX6: DS_WRITE_B16 [[COPY1]], [[COPY]], 0, 0, implicit $m0, implicit $exec :: (store 2, addrspace 3)
    ; GFX7-LABEL: name: store_local_s32_to_2
    ; GFX7: liveins: $vgpr0, $vgpr1
    ; GFX7: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
@@ -75,6 +67,12 @@
    ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
    ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
    ; GFX9: DS_WRITE_B16_gfx9 [[COPY1]], [[COPY]], 0, 0, implicit $exec :: (store 2, addrspace 3)
+   ; GFX6-LABEL: name: store_local_s32_to_2
+   ; GFX6: liveins: $vgpr0, $vgpr1
+   ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+   ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+   ; GFX6: $m0 = S_MOV_B32 -1
+   ; GFX6: DS_WRITE_B16 [[COPY1]], [[COPY]], 0, 0, implicit $m0, implicit $exec :: (store 2, addrspace 3)
    %0:vgpr(s32) = COPY $vgpr0
    %1:vgpr(p3) = COPY $vgpr1
    G_STORE %0, %1 :: (store 2, align 2, addrspace 3)
@@ -89,19 +87,12 @@
 tracksRegLiveness: true
 machineFunctionInfo:
   scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3
-  scratchWaveOffsetReg: $sgpr4
   stackPtrOffsetReg: $sgpr32

 body: |
  bb.0:
    liveins: $vgpr0, $vgpr1

-   ; GFX6-LABEL: name: store_local_s32_to_1
-   ; GFX6: liveins: $vgpr0, $vgpr1
-   ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-   ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-   ; GFX6: $m0 = S_MOV_B32 -1
-   ; GFX6: DS_WRITE_B8 [[COPY1]], [[COPY]], 0, 0, implicit $m0, implicit $exec :: (store 1, addrspace 3)
    ; GFX7-LABEL: name: store_local_s32_to_1
    ; GFX7: liveins: $vgpr0, $vgpr1
    ; GFX7: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
@@ -113,6 +104,12 @@
    ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
    ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
    ; GFX9: DS_WRITE_B8_gfx9 [[COPY1]], [[COPY]], 0, 0, implicit $exec :: (store 1, addrspace 3)
+   ; GFX6-LABEL: name: store_local_s32_to_1
+   ; GFX6: liveins: $vgpr0, $vgpr1
+   ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+   ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+   ; GFX6: $m0 = S_MOV_B32 -1
+   ; GFX6: DS_WRITE_B8 [[COPY1]], [[COPY]], 0, 0, implicit $m0, implicit $exec :: (store 1, addrspace 3)
    %0:vgpr(s32) = COPY $vgpr0
    %1:vgpr(p3) = COPY $vgpr1
    G_STORE %0, %1 :: (store 1, align 1, addrspace 3)
@@ -127,19 +124,12 @@
 tracksRegLiveness: true
 machineFunctionInfo:
   scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3
-  scratchWaveOffsetReg: $sgpr4
   stackPtrOffsetReg: $sgpr32

 body: |
  bb.0:
    liveins: $vgpr0, $vgpr1

-   ; GFX6-LABEL: name: store_local_v2s16
-   ; GFX6: liveins: $vgpr0, $vgpr1
-   ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-   ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-   ; GFX6: $m0 = S_MOV_B32 -1
-   ; GFX6: DS_WRITE_B32 [[COPY1]], [[COPY]], 0, 0, implicit $m0, implicit $exec :: (store 4, addrspace 3)
    ; GFX7-LABEL: name: store_local_v2s16
    ; GFX7: liveins: $vgpr0, $vgpr1
    ; GFX7: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
@@ -151,6 +141,12 @@
    ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
    ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
    ; GFX9: DS_WRITE_B32_gfx9 [[COPY1]], [[COPY]], 0, 0, implicit $exec :: (store 4, addrspace 3)
+   ; GFX6-LABEL: name: store_local_v2s16
+   ; GFX6: liveins: $vgpr0, $vgpr1
+   ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+   ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+   ; GFX6: $m0 = S_MOV_B32 -1
+   ; GFX6: DS_WRITE_B32 [[COPY1]], [[COPY]], 0, 0, implicit $m0, implicit $exec :: (store 4, addrspace 3)
    %0:vgpr(<2 x s16>) = COPY $vgpr0
    %1:vgpr(p3) = COPY $vgpr1
    G_STORE %0, %1 :: (store 4, align 4, addrspace 3)
@@ -165,19 +161,12 @@
 tracksRegLiveness: true
 machineFunctionInfo:
   scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3
-  scratchWaveOffsetReg: $sgpr4
   stackPtrOffsetReg: $sgpr32

 body: |
  bb.0:
    liveins: $vgpr0, $vgpr1

-   ; GFX6-LABEL: name: store_local_p3
-   ; GFX6: liveins: $vgpr0, $vgpr1
-   ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-   ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-   ; GFX6: $m0 = S_MOV_B32 -1
-   ; GFX6: DS_WRITE_B32 [[COPY1]], [[COPY]], 0, 0, implicit $m0, implicit $exec :: (store 4, addrspace 3)
    ; GFX7-LABEL: name: store_local_p3
    ; GFX7: liveins: $vgpr0, $vgpr1
    ; GFX7: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
@@ -189,6 +178,12 @@
    ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
    ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
    ; GFX9: DS_WRITE_B32_gfx9 [[COPY1]], [[COPY]], 0, 0, implicit $exec :: (store 4, addrspace 3)
+   ; GFX6-LABEL: name: store_local_p3
+   ; GFX6: liveins: $vgpr0, $vgpr1
+   ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+   ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+   ; GFX6: $m0 = S_MOV_B32 -1
+   ; GFX6: DS_WRITE_B32 [[COPY1]], [[COPY]], 0, 0, implicit $m0, implicit $exec :: (store 4, addrspace 3)
    %0:vgpr(p3) = COPY $vgpr0
    %1:vgpr(p3) = COPY $vgpr1
    G_STORE %0, %1 :: (store 4, align 4, addrspace 3)
@@ -205,11 +200,6 @@
 body: |
  bb.0:

-   ; GFX6-LABEL: name: store_local_s32_to_1_constant_4095
-   ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4095, implicit $exec
-   ; GFX6: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-   ; GFX6: $m0 = S_MOV_B32 -1
-   ; GFX6: DS_WRITE_B8 [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_1]], 0, 0, implicit $m0, implicit $exec :: (store 1, addrspace 3)
    ; GFX7-LABEL: name: store_local_s32_to_1_constant_4095
    ; GFX7: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4095, implicit $exec
    ; GFX7: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
@@ -219,6 +209,11 @@
    ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4095, implicit $exec
    ; GFX9: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
    ; GFX9: DS_WRITE_B8_gfx9 [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_1]], 0, 0, implicit $exec :: (store 1, addrspace 3)
+   ; GFX6-LABEL: name: store_local_s32_to_1_constant_4095
+   ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4095, implicit $exec
+   ; GFX6: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+   ; GFX6: $m0 = S_MOV_B32 -1
+   ; GFX6: DS_WRITE_B8 [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_1]], 0, 0, implicit $m0, implicit $exec :: (store 1, addrspace 3)
    %0:vgpr(p3) = G_CONSTANT i32 4095
    %1:vgpr(s32) = G_CONSTANT i32 0
    G_STORE %1, %0 :: (store 1, align 1, addrspace 3)
@@ -233,7 +228,6 @@
 tracksRegLiveness: true
 machineFunctionInfo:
   scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3
-  scratchWaveOffsetReg: $sgpr4
   stackPtrOffsetReg: $sgpr32
 stack:
   - { id: 0, size: 4096, alignment: 4 }
@@ -241,11 +235,6 @@
 body: |
  bb.0:

-   ; GFX6-LABEL: name: store_local_s32_to_1_constant_4096
-   ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec
-   ; GFX6: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-   ; GFX6: $m0 = S_MOV_B32 -1
-   ; GFX6: DS_WRITE_B8 [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_1]], 0, 0, implicit $m0, implicit $exec :: (store 1, addrspace 3)
    ; GFX7-LABEL: name: store_local_s32_to_1_constant_4096
    ; GFX7: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec
    ; GFX7: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
@@ -255,6 +244,11 @@
    ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec
    ; GFX9: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
    ; GFX9: DS_WRITE_B8_gfx9 [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_1]], 0, 0, implicit $exec :: (store 1, addrspace 3)
+   ; GFX6-LABEL: name: store_local_s32_to_1_constant_4096
+   ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec
+   ; GFX6: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+   ; GFX6: $m0 = S_MOV_B32 -1
+   ; GFX6: DS_WRITE_B8 [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_1]], 0, 0, implicit $m0, implicit $exec :: (store 1, addrspace 3)
    %0:vgpr(p3) = G_CONSTANT i32 4096
    %1:vgpr(s32) = G_CONSTANT i32 0
    G_STORE %1, %0 :: (store 1, align 1, addrspace 3)
@@ -269,19 +263,12 @@
 tracksRegLiveness: true
 machineFunctionInfo:
   scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3
-  scratchWaveOffsetReg: $sgpr4
   stackPtrOffsetReg: $sgpr32

 body: |
  bb.0:
    liveins: $vgpr0_vgpr1, $vgpr2

-   ; GFX6-LABEL: name: store_local_s64_align4
-   ; GFX6: liveins: $vgpr0_vgpr1, $vgpr2
-   ; GFX6: [[COPY:%[0-9]+]]:vgpr(s64) = COPY $vgpr0_vgpr1
-   ; GFX6: [[COPY1:%[0-9]+]]:vgpr(p3) = COPY $vgpr2
-   ; GFX6: $m0 = S_MOV_B32 -1
-   ; GFX6: G_STORE [[COPY]](s64), [[COPY1]](p3) :: (store 8, align 4, addrspace 3)
    ; GFX7-LABEL: name: store_local_s64_align4
    ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2
    ; GFX7: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
@@ -297,6 +284,12 @@
    ; GFX9: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
    ; GFX9: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
    ; GFX9: DS_WRITE2_B32_gfx9 [[COPY1]], [[COPY3]], [[COPY2]], 0, 1, 0, implicit $exec :: (store 8, align 4, addrspace 3)
+   ; GFX6-LABEL: name: store_local_s64_align4
+   ; GFX6: liveins: $vgpr0_vgpr1, $vgpr2
+   ; GFX6: [[COPY:%[0-9]+]]:vgpr(s64) = COPY $vgpr0_vgpr1
+   ; GFX6: [[COPY1:%[0-9]+]]:vgpr(p3) = COPY $vgpr2
+   ; GFX6: $m0 = S_MOV_B32 -1
+   ; GFX6: G_STORE [[COPY]](s64), [[COPY1]](p3) :: (store 8, align 4, addrspace 3)
    %0:vgpr(s64) = COPY $vgpr0_vgpr1
    %1:vgpr(p3) = COPY $vgpr2
    G_STORE %0, %1 :: (store 8, align 4, addrspace 3)
@@ -311,19 +304,12 @@
 tracksRegLiveness: true
 machineFunctionInfo:
   scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3
-  scratchWaveOffsetReg: $sgpr4
   stackPtrOffsetReg: $sgpr32

 body: |
  bb.0:
    liveins: $vgpr0_vgpr1, $vgpr2

-   ; GFX6-LABEL: name: store_local_p1_align4
-   ; GFX6: liveins: $vgpr0_vgpr1, $vgpr2
-   ; GFX6: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
-   ; GFX6: [[COPY1:%[0-9]+]]:vgpr(p3) = COPY $vgpr2
-   ; GFX6: $m0 = S_MOV_B32 -1
-   ; GFX6: G_STORE [[COPY]](p1), [[COPY1]](p3) :: (store 8, align 4, addrspace 3)
    ; GFX7-LABEL: name: store_local_p1_align4
    ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2
    ; GFX7: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
@@ -339,6 +325,12 @@
    ; GFX9: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
    ; GFX9: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
    ; GFX9: DS_WRITE2_B32_gfx9 [[COPY1]], [[COPY3]], [[COPY2]], 0, 1, 0, implicit $exec :: (store 8, align 4, addrspace 3)
+   ; GFX6-LABEL: name: store_local_p1_align4
+   ; GFX6: liveins: $vgpr0_vgpr1, $vgpr2
+   ; GFX6: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
[[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 + ; GFX6: [[COPY1:%[0-9]+]]:vgpr(p3) = COPY $vgpr2 + ; GFX6: $m0 = S_MOV_B32 -1 + ; GFX6: G_STORE [[COPY]](p1), [[COPY1]](p3) :: (store 8, align 4, addrspace 3) %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(p3) = COPY $vgpr2 G_STORE %0, %1 :: (store 8, align 4, addrspace 3) @@ -353,19 +345,12 @@ tracksRegLiveness: true machineFunctionInfo: scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3 - scratchWaveOffsetReg: $sgpr4 stackPtrOffsetReg: $sgpr32 body: | bb.0: liveins: $vgpr0_vgpr1, $vgpr2 - ; GFX6-LABEL: name: store_local_v2s32_align4 - ; GFX6: liveins: $vgpr0_vgpr1, $vgpr2 - ; GFX6: [[COPY:%[0-9]+]]:vgpr(<2 x s32>) = COPY $vgpr0_vgpr1 - ; GFX6: [[COPY1:%[0-9]+]]:vgpr(p3) = COPY $vgpr2 - ; GFX6: $m0 = S_MOV_B32 -1 - ; GFX6: G_STORE [[COPY]](<2 x s32>), [[COPY1]](p3) :: (store 8, align 4, addrspace 3) ; GFX7-LABEL: name: store_local_v2s32_align4 ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX7: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 @@ -381,6 +366,12 @@ ; GFX9: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 ; GFX9: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 ; GFX9: DS_WRITE2_B32_gfx9 [[COPY1]], [[COPY3]], [[COPY2]], 0, 1, 0, implicit $exec :: (store 8, align 4, addrspace 3) + ; GFX6-LABEL: name: store_local_v2s32_align4 + ; GFX6: liveins: $vgpr0_vgpr1, $vgpr2 + ; GFX6: [[COPY:%[0-9]+]]:vgpr(<2 x s32>) = COPY $vgpr0_vgpr1 + ; GFX6: [[COPY1:%[0-9]+]]:vgpr(p3) = COPY $vgpr2 + ; GFX6: $m0 = S_MOV_B32 -1 + ; GFX6: G_STORE [[COPY]](<2 x s32>), [[COPY1]](p3) :: (store 8, align 4, addrspace 3) %0:vgpr(<2 x s32>) = COPY $vgpr0_vgpr1 %1:vgpr(p3) = COPY $vgpr2 G_STORE %0, %1 :: (store 8, align 4, addrspace 3) @@ -395,19 +386,12 @@ tracksRegLiveness: true machineFunctionInfo: scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3 - scratchWaveOffsetReg: $sgpr4 stackPtrOffsetReg: $sgpr32 body: | bb.0: liveins: $vgpr0_vgpr1, $vgpr2 - ; GFX6-LABEL: name: store_local_v4s16_align4 - ; GFX6: liveins: $vgpr0_vgpr1, $vgpr2 - ; GFX6: [[COPY:%[0-9]+]]:vgpr(<4 x s16>) = COPY $vgpr0_vgpr1 - ; GFX6: [[COPY1:%[0-9]+]]:vgpr(p3) = COPY $vgpr2 - ; GFX6: $m0 = S_MOV_B32 -1 - ; GFX6: G_STORE [[COPY]](<4 x s16>), [[COPY1]](p3) :: (store 8, align 4, addrspace 3) ; GFX7-LABEL: name: store_local_v4s16_align4 ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX7: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 @@ -423,6 +407,12 @@ ; GFX9: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 ; GFX9: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 ; GFX9: DS_WRITE2_B32_gfx9 [[COPY1]], [[COPY3]], [[COPY2]], 0, 1, 0, implicit $exec :: (store 8, align 4, addrspace 3) + ; GFX6-LABEL: name: store_local_v4s16_align4 + ; GFX6: liveins: $vgpr0_vgpr1, $vgpr2 + ; GFX6: [[COPY:%[0-9]+]]:vgpr(<4 x s16>) = COPY $vgpr0_vgpr1 + ; GFX6: [[COPY1:%[0-9]+]]:vgpr(p3) = COPY $vgpr2 + ; GFX6: $m0 = S_MOV_B32 -1 + ; GFX6: G_STORE [[COPY]](<4 x s16>), [[COPY1]](p3) :: (store 8, align 4, addrspace 3) %0:vgpr(<4 x s16>) = COPY $vgpr0_vgpr1 %1:vgpr(p3) = COPY $vgpr2 G_STORE %0, %1 :: (store 8, align 4, addrspace 3) @@ -437,19 +427,12 @@ tracksRegLiveness: true machineFunctionInfo: scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3 - scratchWaveOffsetReg: $sgpr4 stackPtrOffsetReg: $sgpr32 body: | bb.0: liveins: $vgpr0_vgpr1, $vgpr2 - ; GFX6-LABEL: name: store_local_s64_align8 - ; GFX6: liveins: $vgpr0_vgpr1, $vgpr2 - ; GFX6: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX6: $m0 = S_MOV_B32 -1 - ; GFX6: DS_WRITE_B64 [[COPY1]], [[COPY]], 0, 0, implicit $m0, implicit $exec :: (store 8, addrspace 3) ; 
GFX7-LABEL: name: store_local_s64_align8 ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX7: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 @@ -461,6 +444,12 @@ ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX9: DS_WRITE_B64_gfx9 [[COPY1]], [[COPY]], 0, 0, implicit $exec :: (store 8, addrspace 3) + ; GFX6-LABEL: name: store_local_s64_align8 + ; GFX6: liveins: $vgpr0_vgpr1, $vgpr2 + ; GFX6: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX6: $m0 = S_MOV_B32 -1 + ; GFX6: DS_WRITE_B64 [[COPY1]], [[COPY]], 0, 0, implicit $m0, implicit $exec :: (store 8, addrspace 3) %0:vgpr(s64) = COPY $vgpr0_vgpr1 %1:vgpr(p3) = COPY $vgpr2 G_STORE %0, %1 :: (store 8, align 8, addrspace 3) @@ -475,19 +464,12 @@ tracksRegLiveness: true machineFunctionInfo: scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3 - scratchWaveOffsetReg: $sgpr4 stackPtrOffsetReg: $sgpr32 body: | bb.0: liveins: $vgpr0_vgpr1, $vgpr2 - ; GFX6-LABEL: name: store_local_p1_align8 - ; GFX6: liveins: $vgpr0_vgpr1, $vgpr2 - ; GFX6: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX6: $m0 = S_MOV_B32 -1 - ; GFX6: DS_WRITE_B64 [[COPY1]], [[COPY]], 0, 0, implicit $m0, implicit $exec :: (store 8, addrspace 3) ; GFX7-LABEL: name: store_local_p1_align8 ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX7: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 @@ -499,6 +481,12 @@ ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX9: DS_WRITE_B64_gfx9 [[COPY1]], [[COPY]], 0, 0, implicit $exec :: (store 8, addrspace 3) + ; GFX6-LABEL: name: store_local_p1_align8 + ; GFX6: liveins: $vgpr0_vgpr1, $vgpr2 + ; GFX6: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX6: $m0 = S_MOV_B32 -1 + ; GFX6: DS_WRITE_B64 [[COPY1]], [[COPY]], 0, 0, implicit $m0, implicit $exec :: (store 8, addrspace 3) %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(p3) = COPY $vgpr2 G_STORE %0, %1 :: (store 8, align 8, addrspace 3) @@ -513,19 +501,12 @@ tracksRegLiveness: true machineFunctionInfo: scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3 - scratchWaveOffsetReg: $sgpr4 stackPtrOffsetReg: $sgpr32 body: | bb.0: liveins: $vgpr0_vgpr1, $vgpr2 - ; GFX6-LABEL: name: store_local_v2s32_align8 - ; GFX6: liveins: $vgpr0_vgpr1, $vgpr2 - ; GFX6: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX6: $m0 = S_MOV_B32 -1 - ; GFX6: DS_WRITE_B64 [[COPY1]], [[COPY]], 0, 0, implicit $m0, implicit $exec :: (store 8, addrspace 3) ; GFX7-LABEL: name: store_local_v2s32_align8 ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX7: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 @@ -537,6 +518,12 @@ ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX9: DS_WRITE_B64_gfx9 [[COPY1]], [[COPY]], 0, 0, implicit $exec :: (store 8, addrspace 3) + ; GFX6-LABEL: name: store_local_v2s32_align8 + ; GFX6: liveins: $vgpr0_vgpr1, $vgpr2 + ; GFX6: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX6: $m0 = S_MOV_B32 -1 + ; GFX6: DS_WRITE_B64 [[COPY1]], [[COPY]], 0, 0, implicit $m0, implicit $exec :: (store 8, addrspace 3) %0:vgpr(<2 x s32>) = COPY $vgpr0_vgpr1 %1:vgpr(p3) = COPY $vgpr2 G_STORE %0, %1 :: (store 8, align 8, addrspace 3) @@ -551,19 +538,12 @@ tracksRegLiveness: true machineFunctionInfo: scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3 - scratchWaveOffsetReg: 
$sgpr4 stackPtrOffsetReg: $sgpr32 body: | bb.0: liveins: $vgpr0_vgpr1, $vgpr2 - ; GFX6-LABEL: name: store_local_v4s16_align8 - ; GFX6: liveins: $vgpr0_vgpr1, $vgpr2 - ; GFX6: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX6: $m0 = S_MOV_B32 -1 - ; GFX6: DS_WRITE_B64 [[COPY1]], [[COPY]], 0, 0, implicit $m0, implicit $exec :: (store 8, addrspace 3) ; GFX7-LABEL: name: store_local_v4s16_align8 ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX7: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 @@ -575,6 +555,12 @@ ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX9: DS_WRITE_B64_gfx9 [[COPY1]], [[COPY]], 0, 0, implicit $exec :: (store 8, addrspace 3) + ; GFX6-LABEL: name: store_local_v4s16_align8 + ; GFX6: liveins: $vgpr0_vgpr1, $vgpr2 + ; GFX6: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX6: $m0 = S_MOV_B32 -1 + ; GFX6: DS_WRITE_B64 [[COPY1]], [[COPY]], 0, 0, implicit $m0, implicit $exec :: (store 8, addrspace 3) %0:vgpr(<4 x s16>) = COPY $vgpr0_vgpr1 %1:vgpr(p3) = COPY $vgpr2 G_STORE %0, %1 :: (store 8, align 8, addrspace 3) @@ -589,21 +575,12 @@ tracksRegLiveness: true machineFunctionInfo: scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3 - scratchWaveOffsetReg: $sgpr4 stackPtrOffsetReg: $sgpr32 body: | bb.0: liveins: $vgpr0_vgpr1, $vgpr2 - ; GFX6-LABEL: name: store_local_s64_align4_from_1_gep_1016 - ; GFX6: liveins: $vgpr0_vgpr1, $vgpr2 - ; GFX6: [[COPY:%[0-9]+]]:vgpr(s64) = COPY $vgpr0_vgpr1 - ; GFX6: [[COPY1:%[0-9]+]]:vgpr(p3) = COPY $vgpr2 - ; GFX6: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 1016 - ; GFX6: [[PTR_ADD:%[0-9]+]]:vgpr(p3) = G_PTR_ADD [[COPY1]], [[C]](s32) - ; GFX6: $m0 = S_MOV_B32 -1 - ; GFX6: G_STORE [[COPY]](s64), [[PTR_ADD]](p3) :: (store 8, align 4, addrspace 3) ; GFX7-LABEL: name: store_local_s64_align4_from_1_gep_1016 ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX7: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 @@ -619,6 +596,14 @@ ; GFX9: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 ; GFX9: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 ; GFX9: DS_WRITE2_B32_gfx9 [[COPY1]], [[COPY3]], [[COPY2]], 254, 255, 0, implicit $exec :: (store 8, align 4, addrspace 3) + ; GFX6-LABEL: name: store_local_s64_align4_from_1_gep_1016 + ; GFX6: liveins: $vgpr0_vgpr1, $vgpr2 + ; GFX6: [[COPY:%[0-9]+]]:vgpr(s64) = COPY $vgpr0_vgpr1 + ; GFX6: [[COPY1:%[0-9]+]]:vgpr(p3) = COPY $vgpr2 + ; GFX6: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 1016 + ; GFX6: [[PTR_ADD:%[0-9]+]]:vgpr(p3) = G_PTR_ADD [[COPY1]], [[C]](s32) + ; GFX6: $m0 = S_MOV_B32 -1 + ; GFX6: G_STORE [[COPY]](s64), [[PTR_ADD]](p3) :: (store 8, align 4, addrspace 3) %0:vgpr(s64) = COPY $vgpr0_vgpr1 %1:vgpr(p3) = COPY $vgpr2 %2:vgpr(s32) = G_CONSTANT i32 1016 @@ -635,21 +620,12 @@ tracksRegLiveness: true machineFunctionInfo: scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3 - scratchWaveOffsetReg: $sgpr4 stackPtrOffsetReg: $sgpr32 body: | bb.0: liveins: $vgpr0_vgpr1, $vgpr2 - ; GFX6-LABEL: name: store_local_s64_align4_from_1_gep_1020 - ; GFX6: liveins: $vgpr0_vgpr1, $vgpr2 - ; GFX6: [[COPY:%[0-9]+]]:vgpr(s64) = COPY $vgpr0_vgpr1 - ; GFX6: [[COPY1:%[0-9]+]]:vgpr(p3) = COPY $vgpr2 - ; GFX6: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 1020 - ; GFX6: [[PTR_ADD:%[0-9]+]]:vgpr(p3) = G_PTR_ADD [[COPY1]], [[C]](s32) - ; GFX6: $m0 = S_MOV_B32 -1 - ; GFX6: G_STORE [[COPY]](s64), [[PTR_ADD]](p3) :: (store 8, align 4, addrspace 3) ; GFX7-LABEL: name: store_local_s64_align4_from_1_gep_1020 ; GFX7: liveins: 
$vgpr0_vgpr1, $vgpr2 ; GFX7: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 @@ -669,6 +645,14 @@ ; GFX9: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 ; GFX9: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 ; GFX9: DS_WRITE2_B32_gfx9 [[V_ADD_U32_e64_]], [[COPY3]], [[COPY2]], 0, 1, 0, implicit $exec :: (store 8, align 4, addrspace 3) + ; GFX6-LABEL: name: store_local_s64_align4_from_1_gep_1020 + ; GFX6: liveins: $vgpr0_vgpr1, $vgpr2 + ; GFX6: [[COPY:%[0-9]+]]:vgpr(s64) = COPY $vgpr0_vgpr1 + ; GFX6: [[COPY1:%[0-9]+]]:vgpr(p3) = COPY $vgpr2 + ; GFX6: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 1020 + ; GFX6: [[PTR_ADD:%[0-9]+]]:vgpr(p3) = G_PTR_ADD [[COPY1]], [[C]](s32) + ; GFX6: $m0 = S_MOV_B32 -1 + ; GFX6: G_STORE [[COPY]](s64), [[PTR_ADD]](p3) :: (store 8, align 4, addrspace 3) %0:vgpr(s64) = COPY $vgpr0_vgpr1 %1:vgpr(p3) = COPY $vgpr2 %2:vgpr(s32) = G_CONSTANT i32 1020 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-private.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-private.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-private.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-private.mir @@ -4,29 +4,29 @@ --- -name: store_private_s32_to_4 +name: function_store_private_s32_to_4 legalized: true regBankSelected: true tracksRegLiveness: true machineFunctionInfo: + isEntryFunction: false scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3 - scratchWaveOffsetReg: $sgpr4 stackPtrOffsetReg: $sgpr32 body: | bb.0: liveins: $vgpr0, $vgpr1 - ; GFX6-LABEL: name: store_private_s32_to_4 + ; GFX6-LABEL: name: function_store_private_s32_to_4 ; GFX6: liveins: $vgpr0, $vgpr1 ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX6: BUFFER_STORE_DWORD_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 5) - ; GFX9-LABEL: name: store_private_s32_to_4 + ; GFX6: BUFFER_STORE_DWORD_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 5) + ; GFX9-LABEL: name: function_store_private_s32_to_4 ; GFX9: liveins: $vgpr0, $vgpr1 ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX9: BUFFER_STORE_DWORD_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 5) + ; GFX9: BUFFER_STORE_DWORD_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 5) %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(p5) = COPY $vgpr1 G_STORE %0, %1 :: (store 4, align 4, addrspace 5) @@ -35,29 +35,29 @@ --- -name: store_private_s32_to_2 +name: function_store_private_s32_to_2 legalized: true regBankSelected: true tracksRegLiveness: true machineFunctionInfo: + isEntryFunction: false scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3 - scratchWaveOffsetReg: $sgpr4 stackPtrOffsetReg: $sgpr32 body: | bb.0: liveins: $vgpr0, $vgpr1 - ; GFX6-LABEL: name: store_private_s32_to_2 + ; GFX6-LABEL: name: function_store_private_s32_to_2 ; GFX6: liveins: $vgpr0, $vgpr1 ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX6: BUFFER_STORE_SHORT_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 2, addrspace 5) - ; GFX9-LABEL: name: store_private_s32_to_2 + ; GFX6: BUFFER_STORE_SHORT_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 2, addrspace 5) + ; 
GFX9-LABEL: name: function_store_private_s32_to_2 ; GFX9: liveins: $vgpr0, $vgpr1 ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX9: BUFFER_STORE_SHORT_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 2, addrspace 5) + ; GFX9: BUFFER_STORE_SHORT_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 2, addrspace 5) %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(p5) = COPY $vgpr1 G_STORE %0, %1 :: (store 2, align 2, addrspace 5) @@ -66,29 +66,29 @@ --- -name: store_private_s32_to_1 +name: function_store_private_s32_to_1 legalized: true regBankSelected: true tracksRegLiveness: true machineFunctionInfo: + isEntryFunction: false scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3 - scratchWaveOffsetReg: $sgpr4 stackPtrOffsetReg: $sgpr32 body: | bb.0: liveins: $vgpr0, $vgpr1 - ; GFX6-LABEL: name: store_private_s32_to_1 + ; GFX6-LABEL: name: function_store_private_s32_to_1 ; GFX6: liveins: $vgpr0, $vgpr1 ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX6: BUFFER_STORE_BYTE_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 1, addrspace 5) - ; GFX9-LABEL: name: store_private_s32_to_1 + ; GFX6: BUFFER_STORE_BYTE_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 1, addrspace 5) + ; GFX9-LABEL: name: function_store_private_s32_to_1 ; GFX9: liveins: $vgpr0, $vgpr1 ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX9: BUFFER_STORE_BYTE_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 1, addrspace 5) + ; GFX9: BUFFER_STORE_BYTE_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 1, addrspace 5) %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(p5) = COPY $vgpr1 G_STORE %0, %1 :: (store 1, align 1, addrspace 5) @@ -97,29 +97,29 @@ --- -name: store_private_v2s16 +name: function_store_private_v2s16 legalized: true regBankSelected: true tracksRegLiveness: true machineFunctionInfo: + isEntryFunction: false scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3 - scratchWaveOffsetReg: $sgpr4 stackPtrOffsetReg: $sgpr32 body: | bb.0: liveins: $vgpr0, $vgpr1 - ; GFX6-LABEL: name: store_private_v2s16 + ; GFX6-LABEL: name: function_store_private_v2s16 ; GFX6: liveins: $vgpr0, $vgpr1 ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX6: BUFFER_STORE_DWORD_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 5) - ; GFX9-LABEL: name: store_private_v2s16 + ; GFX6: BUFFER_STORE_DWORD_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 5) + ; GFX9-LABEL: name: function_store_private_v2s16 ; GFX9: liveins: $vgpr0, $vgpr1 ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX9: BUFFER_STORE_DWORD_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 5) + ; GFX9: BUFFER_STORE_DWORD_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 5) %0:vgpr(<2 x s16>) = COPY $vgpr0 %1:vgpr(p5) = COPY $vgpr1 G_STORE %0, %1 :: (store 4, align 4, addrspace 5) @@ -128,29 +128,29 @@ --- -name: 
store_private_p3 +name: function_store_private_p3 legalized: true regBankSelected: true tracksRegLiveness: true machineFunctionInfo: + isEntryFunction: false scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3 - scratchWaveOffsetReg: $sgpr4 stackPtrOffsetReg: $sgpr32 body: | bb.0: liveins: $vgpr0, $vgpr1 - ; GFX6-LABEL: name: store_private_p3 + ; GFX6-LABEL: name: function_store_private_p3 ; GFX6: liveins: $vgpr0, $vgpr1 ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX6: BUFFER_STORE_DWORD_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 5) - ; GFX9-LABEL: name: store_private_p3 + ; GFX6: BUFFER_STORE_DWORD_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 5) + ; GFX9-LABEL: name: function_store_private_p3 ; GFX9: liveins: $vgpr0, $vgpr1 ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX9: BUFFER_STORE_DWORD_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 5) + ; GFX9: BUFFER_STORE_DWORD_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 5) %0:vgpr(p3) = COPY $vgpr0 %1:vgpr(p5) = COPY $vgpr1 G_STORE %0, %1 :: (store 4, align 4, addrspace 5) @@ -159,29 +159,29 @@ --- -name: store_private_p5 +name: function_store_private_p5 legalized: true regBankSelected: true tracksRegLiveness: true machineFunctionInfo: + isEntryFunction: false scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3 - scratchWaveOffsetReg: $sgpr4 stackPtrOffsetReg: $sgpr32 body: | bb.0: liveins: $vgpr0, $vgpr1 - ; GFX6-LABEL: name: store_private_p5 + ; GFX6-LABEL: name: function_store_private_p5 ; GFX6: liveins: $vgpr0, $vgpr1 ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX6: BUFFER_STORE_DWORD_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 5) - ; GFX9-LABEL: name: store_private_p5 + ; GFX6: BUFFER_STORE_DWORD_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 5) + ; GFX9-LABEL: name: function_store_private_p5 ; GFX9: liveins: $vgpr0, $vgpr1 ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX9: BUFFER_STORE_DWORD_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 5) + ; GFX9: BUFFER_STORE_DWORD_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 5) %0:vgpr(p5) = COPY $vgpr0 %1:vgpr(p5) = COPY $vgpr1 G_STORE %0, %1 :: (store 4, align 4, addrspace 5) @@ -190,13 +190,13 @@ --- -name: store_private_s32_to_1_fi_offset_4095 +name: function_store_private_s32_to_1_fi_offset_4095 legalized: true regBankSelected: true tracksRegLiveness: true machineFunctionInfo: + isEntryFunction: false scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3 - scratchWaveOffsetReg: $sgpr4 stackPtrOffsetReg: $sgpr32 stack: - { id: 0, size: 4096, alignment: 4 } @@ -204,13 +204,13 @@ body: | bb.0: - ; GFX6-LABEL: name: store_private_s32_to_1_fi_offset_4095 + ; GFX6-LABEL: name: function_store_private_s32_to_1_fi_offset_4095 ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec ; GFX6: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4095, 
implicit $exec ; GFX6: %2:vgpr_32, dead %4:sreg_64_xexec = V_ADD_I32_e64 [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_1]], 0, implicit $exec ; GFX6: [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX6: BUFFER_STORE_BYTE_OFFEN [[V_MOV_B32_e32_2]], %2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 1, addrspace 5) - ; GFX9-LABEL: name: store_private_s32_to_1_fi_offset_4095 + ; GFX6: BUFFER_STORE_BYTE_OFFEN [[V_MOV_B32_e32_2]], %2, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 1, addrspace 5) + ; GFX9-LABEL: name: function_store_private_s32_to_1_fi_offset_4095 ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; GFX9: BUFFER_STORE_BYTE_OFFEN [[V_MOV_B32_e32_]], %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4095, 0, 0, 0, 0, 0, implicit $exec :: (store 1, addrspace 5) %0:vgpr(p5) = G_FRAME_INDEX %stack.0 @@ -223,13 +223,13 @@ --- -name: store_private_s32_to_1_constant_4095 +name: function_store_private_s32_to_1_constant_4095 legalized: true regBankSelected: true tracksRegLiveness: true machineFunctionInfo: + isEntryFunction: false scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3 - scratchWaveOffsetReg: $sgpr4 stackPtrOffsetReg: $sgpr32 stack: - { id: 0, size: 4096, alignment: 4 } @@ -237,12 +237,12 @@ body: | bb.0: - ; GFX6-LABEL: name: store_private_s32_to_1_constant_4095 + ; GFX6-LABEL: name: function_store_private_s32_to_1_constant_4095 ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX6: BUFFER_STORE_BYTE_OFFSET [[V_MOV_B32_e32_]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4095, 0, 0, 0, 0, 0, implicit $exec :: (store 1, addrspace 5) - ; GFX9-LABEL: name: store_private_s32_to_1_constant_4095 + ; GFX6: BUFFER_STORE_BYTE_OFFSET [[V_MOV_B32_e32_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4095, 0, 0, 0, 0, 0, implicit $exec :: (store 1, addrspace 5) + ; GFX9-LABEL: name: function_store_private_s32_to_1_constant_4095 ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX9: BUFFER_STORE_BYTE_OFFSET [[V_MOV_B32_e32_]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4095, 0, 0, 0, 0, 0, implicit $exec :: (store 1, addrspace 5) + ; GFX9: BUFFER_STORE_BYTE_OFFSET [[V_MOV_B32_e32_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4095, 0, 0, 0, 0, 0, implicit $exec :: (store 1, addrspace 5) %0:vgpr(p5) = G_CONSTANT i32 4095 %1:vgpr(s32) = G_CONSTANT i32 0 G_STORE %1, %0 :: (store 1, align 1, addrspace 5) @@ -251,13 +251,13 @@ --- -name: store_private_s32_to_1_constant_4096 +name: function_store_private_s32_to_1_constant_4096 legalized: true regBankSelected: true tracksRegLiveness: true machineFunctionInfo: + isEntryFunction: false scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3 - scratchWaveOffsetReg: $sgpr4 stackPtrOffsetReg: $sgpr32 stack: - { id: 0, size: 4096, alignment: 4 } @@ -265,14 +265,291 @@ body: | bb.0: - ; GFX6-LABEL: name: store_private_s32_to_1_constant_4096 + ; GFX6-LABEL: name: function_store_private_s32_to_1_constant_4096 ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; GFX6: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec - ; GFX6: BUFFER_STORE_BYTE_OFFEN [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_1]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 1, addrspace 5) - ; GFX9-LABEL: name: store_private_s32_to_1_constant_4096 + ; GFX6: BUFFER_STORE_BYTE_OFFEN [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 1, addrspace 5) + ; GFX9-LABEL: name: 
function_store_private_s32_to_1_constant_4096 ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; GFX9: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec - ; GFX9: BUFFER_STORE_BYTE_OFFEN [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_1]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 1, addrspace 5) + ; GFX9: BUFFER_STORE_BYTE_OFFEN [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 1, addrspace 5) + %0:vgpr(p5) = G_CONSTANT i32 4096 + %1:vgpr(s32) = G_CONSTANT i32 0 + G_STORE %1, %0 :: (store 1, align 1, addrspace 5) + +... + +--- + +name: kernel_store_private_s32_to_4 +legalized: true +regBankSelected: true +tracksRegLiveness: true +machineFunctionInfo: + isEntryFunction: true + scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3 + +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3 + + ; GFX6-LABEL: name: kernel_store_private_s32_to_4 + ; GFX6: liveins: $vgpr0, $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX6: BUFFER_STORE_DWORD_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 5) + ; GFX9-LABEL: name: kernel_store_private_s32_to_4 + ; GFX9: liveins: $vgpr0, $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX9: BUFFER_STORE_DWORD_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 5) + %0:vgpr(s32) = COPY $vgpr0 + %1:vgpr(p5) = COPY $vgpr1 + G_STORE %0, %1 :: (store 4, align 4, addrspace 5) + +... + +--- + +name: kernel_store_private_s32_to_2 +legalized: true +regBankSelected: true +tracksRegLiveness: true +machineFunctionInfo: + isEntryFunction: true + scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3 + +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3 + + ; GFX6-LABEL: name: kernel_store_private_s32_to_2 + ; GFX6: liveins: $vgpr0, $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX6: BUFFER_STORE_SHORT_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 2, addrspace 5) + ; GFX9-LABEL: name: kernel_store_private_s32_to_2 + ; GFX9: liveins: $vgpr0, $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX9: BUFFER_STORE_SHORT_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 2, addrspace 5) + %0:vgpr(s32) = COPY $vgpr0 + %1:vgpr(p5) = COPY $vgpr1 + G_STORE %0, %1 :: (store 2, align 2, addrspace 5) + +... 
+ +--- + +name: kernel_store_private_s32_to_1 +legalized: true +regBankSelected: true +tracksRegLiveness: true +machineFunctionInfo: + isEntryFunction: true + scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3 + +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3 + + ; GFX6-LABEL: name: kernel_store_private_s32_to_1 + ; GFX6: liveins: $vgpr0, $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX6: BUFFER_STORE_BYTE_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 1, addrspace 5) + ; GFX9-LABEL: name: kernel_store_private_s32_to_1 + ; GFX9: liveins: $vgpr0, $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX9: BUFFER_STORE_BYTE_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 1, addrspace 5) + %0:vgpr(s32) = COPY $vgpr0 + %1:vgpr(p5) = COPY $vgpr1 + G_STORE %0, %1 :: (store 1, align 1, addrspace 5) + +... + +--- + +name: kernel_store_private_v2s16 +legalized: true +regBankSelected: true +tracksRegLiveness: true +machineFunctionInfo: + isEntryFunction: true + scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3 + +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3 + + ; GFX6-LABEL: name: kernel_store_private_v2s16 + ; GFX6: liveins: $vgpr0, $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX6: BUFFER_STORE_DWORD_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 5) + ; GFX9-LABEL: name: kernel_store_private_v2s16 + ; GFX9: liveins: $vgpr0, $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX9: BUFFER_STORE_DWORD_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 5) + %0:vgpr(<2 x s16>) = COPY $vgpr0 + %1:vgpr(p5) = COPY $vgpr1 + G_STORE %0, %1 :: (store 4, align 4, addrspace 5) + +... + +--- + +name: kernel_store_private_p3 +legalized: true +regBankSelected: true +tracksRegLiveness: true +machineFunctionInfo: + isEntryFunction: true + scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3 + +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3 + + ; GFX6-LABEL: name: kernel_store_private_p3 + ; GFX6: liveins: $vgpr0, $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX6: BUFFER_STORE_DWORD_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 5) + ; GFX9-LABEL: name: kernel_store_private_p3 + ; GFX9: liveins: $vgpr0, $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX9: BUFFER_STORE_DWORD_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 5) + %0:vgpr(p3) = COPY $vgpr0 + %1:vgpr(p5) = COPY $vgpr1 + G_STORE %0, %1 :: (store 4, align 4, addrspace 5) + +... 
+ +--- + +name: kernel_store_private_p5 +legalized: true +regBankSelected: true +tracksRegLiveness: true +machineFunctionInfo: + isEntryFunction: true + scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3 + +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3 + + ; GFX6-LABEL: name: kernel_store_private_p5 + ; GFX6: liveins: $vgpr0, $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX6: BUFFER_STORE_DWORD_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 5) + ; GFX9-LABEL: name: kernel_store_private_p5 + ; GFX9: liveins: $vgpr0, $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX9: BUFFER_STORE_DWORD_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 5) + %0:vgpr(p5) = COPY $vgpr0 + %1:vgpr(p5) = COPY $vgpr1 + G_STORE %0, %1 :: (store 4, align 4, addrspace 5) + +... + +--- + +name: kernel_store_private_s32_to_1_fi_offset_4095 +legalized: true +regBankSelected: true +tracksRegLiveness: true +machineFunctionInfo: + isEntryFunction: true + scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3 +stack: + - { id: 0, size: 4096, alignment: 4 } + +body: | + bb.0: + liveins: $sgpr0_sgpr1_sgpr2_sgpr3 + + ; GFX6-LABEL: name: kernel_store_private_s32_to_1_fi_offset_4095 + ; GFX6: liveins: $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec + ; GFX6: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4095, implicit $exec + ; GFX6: %2:vgpr_32, dead %4:sreg_64_xexec = V_ADD_I32_e64 [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_1]], 0, implicit $exec + ; GFX6: [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GFX6: BUFFER_STORE_BYTE_OFFEN [[V_MOV_B32_e32_2]], %2, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 1, addrspace 5) + ; GFX9-LABEL: name: kernel_store_private_s32_to_1_fi_offset_4095 + ; GFX9: liveins: $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GFX9: BUFFER_STORE_BYTE_OFFEN [[V_MOV_B32_e32_]], %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4095, 0, 0, 0, 0, 0, implicit $exec :: (store 1, addrspace 5) + %0:vgpr(p5) = G_FRAME_INDEX %stack.0 + %1:vgpr(s32) = G_CONSTANT i32 4095 + %2:vgpr(p5) = G_PTR_ADD %0, %1 + %3:vgpr(s32) = G_CONSTANT i32 0 + G_STORE %3, %2 :: (store 1, align 1, addrspace 5) + +... 
+ +--- + +name: kernel_store_private_s32_to_1_constant_4095 +legalized: true +regBankSelected: true +tracksRegLiveness: true +machineFunctionInfo: + isEntryFunction: true + scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3 +stack: + - { id: 0, size: 4096, alignment: 4 } + +body: | + bb.0: + liveins: $sgpr0_sgpr1_sgpr2_sgpr3 + + ; GFX6-LABEL: name: kernel_store_private_s32_to_1_constant_4095 + ; GFX6: liveins: $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GFX6: BUFFER_STORE_BYTE_OFFSET [[V_MOV_B32_e32_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4095, 0, 0, 0, 0, 0, implicit $exec :: (store 1, addrspace 5) + ; GFX9-LABEL: name: kernel_store_private_s32_to_1_constant_4095 + ; GFX9: liveins: $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GFX9: BUFFER_STORE_BYTE_OFFSET [[V_MOV_B32_e32_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4095, 0, 0, 0, 0, 0, implicit $exec :: (store 1, addrspace 5) + %0:vgpr(p5) = G_CONSTANT i32 4095 + %1:vgpr(s32) = G_CONSTANT i32 0 + G_STORE %1, %0 :: (store 1, align 1, addrspace 5) + +... + +--- + +name: kernel_store_private_s32_to_1_constant_4096 +legalized: true +regBankSelected: true +tracksRegLiveness: true +machineFunctionInfo: + isEntryFunction: true + scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3 +stack: + - { id: 0, size: 4096, alignment: 4 } + +body: | + bb.0: + liveins: $sgpr0_sgpr1_sgpr2_sgpr3 + + ; GFX6-LABEL: name: kernel_store_private_s32_to_1_constant_4096 + ; GFX6: liveins: $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GFX6: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec + ; GFX6: BUFFER_STORE_BYTE_OFFEN [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 1, addrspace 5) + ; GFX9-LABEL: name: kernel_store_private_s32_to_1_constant_4096 + ; GFX9: liveins: $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GFX9: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec + ; GFX9: BUFFER_STORE_BYTE_OFFEN [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 1, addrspace 5) %0:vgpr(p5) = G_CONSTANT i32 4096 %1:vgpr(s32) = G_CONSTANT i32 0 G_STORE %1, %0 :: (store 1, align 1, addrspace 5) diff --git a/llvm/test/CodeGen/AMDGPU/addrspacecast.ll b/llvm/test/CodeGen/AMDGPU/addrspacecast.ll --- a/llvm/test/CodeGen/AMDGPU/addrspacecast.ll +++ b/llvm/test/CodeGen/AMDGPU/addrspacecast.ll @@ -133,7 +133,7 @@ ; HSA-DAG: v_mov_b32_e32 v[[VPTR_LO:[0-9]+]], s[[PTR_LO]] ; HSA-DAG: v_cndmask_b32_e32 [[CASTPTR:v[0-9]+]], 0, v[[VPTR_LO]] ; HSA-DAG: v_mov_b32_e32 v[[K:[0-9]+]], 0{{$}} -; HSA: buffer_store_dword v[[K]], [[CASTPTR]], s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen{{$}} +; HSA: buffer_store_dword v[[K]], [[CASTPTR]], s{{\[[0-9]+:[0-9]+\]}}, 0 offen{{$}} define amdgpu_kernel void @use_flat_to_private_addrspacecast(i32* %ptr) #0 { %ftos = addrspacecast i32* %ptr to i32 addrspace(5)* store volatile i32 0, i32 addrspace(5)* %ftos @@ -231,7 +231,7 @@ ; HSA-LABEL: {{^}}cast_0_flat_to_private_addrspacecast: ; HSA: v_mov_b32_e32 [[K:v[0-9]+]], 7{{$}} -; HSA: buffer_store_dword [[K]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+$}} +; HSA: buffer_store_dword [[K]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 define amdgpu_kernel void @cast_0_flat_to_private_addrspacecast() #0 { %cast = addrspacecast i32* null to i32 addrspace(5)* store 
volatile i32 7, i32 addrspace(5)* %cast diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu.private-memory.ll b/llvm/test/CodeGen/AMDGPU/amdgpu.private-memory.ll --- a/llvm/test/CodeGen/AMDGPU/amdgpu.private-memory.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu.private-memory.ll @@ -45,8 +45,8 @@ ; HSA-ALLOCA: s_add_u32 s6, s6, s9 ; HSA-ALLOCA: s_lshr_b32 flat_scratch_hi, s6, 8 -; SI-ALLOCA: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen ; encoding: [0x00,0x10,0x70,0xe0 -; SI-ALLOCA: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen ; encoding: [0x00,0x10,0x70,0xe0 +; SI-ALLOCA: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 offen ; encoding: [0x00,0x10,0x70,0xe0 +; SI-ALLOCA: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 offen ; encoding: [0x00,0x10,0x70,0xe0 ; HSAOPT: [[DISPATCH_PTR:%[0-9]+]] = call noalias nonnull dereferenceable(64) i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() @@ -226,10 +226,10 @@ ; R600-VECT: MOVA_INT -; SI-ALLOCA-DAG: buffer_store_short v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:6 ; encoding: [0x06,0x00,0x68,0xe0 -; SI-ALLOCA-DAG: buffer_store_short v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:4 ; encoding: [0x04,0x00,0x68,0xe0 +; SI-ALLOCA-DAG: buffer_store_short v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], 0 offset:6 ; encoding: [0x06,0x00,0x68,0xe0 +; SI-ALLOCA-DAG: buffer_store_short v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], 0 offset:4 ; encoding: [0x04,0x00,0x68,0xe0 ; Loaded value is 0 or 1, so sext will become zext, so we get buffer_load_ushort instead of buffer_load_sshort. -; SI-ALLOCA: buffer_load_sshort v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} +; SI-ALLOCA: buffer_load_sshort v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 ; SI-PROMOTE-VECT: s_load_dword [[IDX:s[0-9]+]] ; SI-PROMOTE-VECT: s_mov_b32 [[SREG:s[0-9]+]], 0x10000 @@ -257,8 +257,8 @@ ; SI-PROMOTE-VECT-DAG: s_lshl_b32 ; SI-PROMOTE-VECT-DAG: v_lshrrev -; SI-ALLOCA-DAG: buffer_store_byte v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:4 ; encoding: [0x04,0x00,0x60,0xe0 -; SI-ALLOCA-DAG: buffer_store_byte v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:5 ; encoding: [0x05,0x00,0x60,0xe0 +; SI-ALLOCA-DAG: buffer_store_byte v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], 0 offset:4 ; encoding: [0x04,0x00,0x60,0xe0 +; SI-ALLOCA-DAG: buffer_store_byte v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], 0 offset:5 ; encoding: [0x05,0x00,0x60,0xe0 define amdgpu_kernel void @char_array(i32 addrspace(1)* %out, i32 %index) #0 { entry: %0 = alloca [2 x i8], addrspace(5) @@ -281,7 +281,7 @@ ; R600-NOT: [[CHAN]]+ ; ; A total of 5 bytes should be allocated and used. 
-; SI: buffer_store_byte v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:4 ; +; SI: buffer_store_byte v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], 0 offset:4 ; define amdgpu_kernel void @no_overlap(i32 addrspace(1)* %out, i32 %in) #0 { entry: %0 = alloca [3 x i8], align 1, addrspace(5) @@ -393,9 +393,9 @@ ; FUNC-LABEL: ptrtoint: ; SI-NOT: ds_write -; SI: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen +; SI: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 offen ; SI: v_add_{{[iu]}}32_e32 [[ADD_OFFSET:v[0-9]+]], vcc, 5, -; SI: buffer_load_dword v{{[0-9]+}}, [[ADD_OFFSET:v[0-9]+]], s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen ; +; SI: buffer_load_dword v{{[0-9]+}}, [[ADD_OFFSET:v[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0 offen ; define amdgpu_kernel void @ptrtoint(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { %alloca = alloca [16 x i32], addrspace(5) %tmp0 = getelementptr [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 %a diff --git a/llvm/test/CodeGen/AMDGPU/array-ptr-calc-i32.ll b/llvm/test/CodeGen/AMDGPU/array-ptr-calc-i32.ll --- a/llvm/test/CodeGen/AMDGPU/array-ptr-calc-i32.ll +++ b/llvm/test/CodeGen/AMDGPU/array-ptr-calc-i32.ll @@ -13,9 +13,9 @@ ; SI-LABEL: {{^}}test_private_array_ptr_calc: ; SI-ALLOCA: v_add_i32_e32 [[PTRREG:v[0-9]+]], vcc, 16, v{{[0-9]+}} -; SI-ALLOCA: buffer_store_dword {{v[0-9]+}}, [[PTRREG]], s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen offset:64 +; SI-ALLOCA: buffer_store_dword {{v[0-9]+}}, [[PTRREG]], s[{{[0-9]+:[0-9]+}}], 0 offen offset:64 ; SI-ALLOCA: s_barrier -; SI-ALLOCA: buffer_load_dword {{v[0-9]+}}, [[PTRREG]], s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen offset:64 +; SI-ALLOCA: buffer_load_dword {{v[0-9]+}}, [[PTRREG]], s[{{[0-9]+:[0-9]+}}], 0 offen offset:64 ; ; FIXME: The AMDGPUPromoteAlloca pass should be able to convert this ; alloca to a vector. It currently fails because it does not know how diff --git a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-num-sgpr.ll b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-num-sgpr.ll --- a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-num-sgpr.ll +++ b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-num-sgpr.ll @@ -2,6 +2,11 @@ ; FIXME: Vectorization can increase required SGPR count beyond limit. +; FIXME: It's not clear what this is testing, but after the calling +; convention change, the spare SGPR available once the scratch wave +; offset dies seems to help GreedyRA avoid a spill, yet somehow one +; more SGPR ends up being used.
+ ; ALL-LABEL: {{^}}max_9_sgprs: ; ALL: SGPRBlocks: 1 diff --git a/llvm/test/CodeGen/AMDGPU/byval-frame-setup.ll b/llvm/test/CodeGen/AMDGPU/byval-frame-setup.ll --- a/llvm/test/CodeGen/AMDGPU/byval-frame-setup.ll +++ b/llvm/test/CodeGen/AMDGPU/byval-frame-setup.ll @@ -2,31 +2,6 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CI %s %struct.ByValStruct = type { [4 x i32] } - -; GCN-LABEL: {{^}}void_func_byval_struct: -; GCN: buffer_load_dword [[LOAD0:v[0-9]+]], off, s[0:3], s32{{$}} -; GCN-NOT: s32 -; GCN: buffer_store_dword [[LOAD0]], off, s[0:3], s32{{$}} -; GCN-NOT: s32 - -; GCN: buffer_load_dword [[LOAD1:v[0-9]+]], off, s[0:3], s32 offset:16{{$}} -; GCN-NOT: s32 -; GCN: buffer_store_dword [[LOAD1]], off, s[0:3], s32 offset:16{{$}} -; GCN-NOT: s32 -define hidden void @void_func_byval_struct(%struct.ByValStruct addrspace(5)* byval noalias nocapture align 4 %arg0, %struct.ByValStruct addrspace(5)* byval noalias nocapture align 4 %arg1) #1 { -entry: - %arrayidx = getelementptr inbounds %struct.ByValStruct, %struct.ByValStruct addrspace(5)* %arg0, i32 0, i32 0, i32 0 - %tmp = load volatile i32, i32 addrspace(5)* %arrayidx, align 4 - %add = add nsw i32 %tmp, 1 - store volatile i32 %add, i32 addrspace(5)* %arrayidx, align 4 - %arrayidx2 = getelementptr inbounds %struct.ByValStruct, %struct.ByValStruct addrspace(5)* %arg1, i32 0, i32 0, i32 0 - %tmp1 = load volatile i32, i32 addrspace(5)* %arrayidx2, align 4 - %add3 = add nsw i32 %tmp1, 2 - store volatile i32 %add3, i32 addrspace(5)* %arrayidx2, align 4 - store volatile i32 9, i32 addrspace(1)* null, align 4 - ret void -} - ; Make sure the offset is folded and function's frame register is used ; rather than the global scratch wave offset. 
; GCN-LABEL: {{^}}void_func_byval_struct_use_outside_entry_block: @@ -67,331 +42,6 @@ bb1: ret void } - -; GCN-LABEL: {{^}}void_func_byval_struct_non_leaf: -; GCN: buffer_store_dword v33, off, s[0:3], s32 offset:36 -; GCN-DAG: v_writelane_b32 v33, s34, -; GCN: s_mov_b32 s34, s32 -; GCN-DAG: buffer_load_dword [[LOAD0:v[0-9]+]], off, s[0:3], s34{{$}} -; GCN-DAG: s_add_u32 s32, s32, 0xc00{{$}} -; GCN-DAG: buffer_store_dword v32, off, s[0:3], s34 offset:32 -; GCN-NOT: v_writelane_b32 v{{[0-9]+}}, s32 - -; GCN-DAG: v_add_{{[iu]}}32_e32 [[ADD0:v[0-9]+]], vcc, 1, [[LOAD0]] -; GCN: buffer_store_dword [[ADD0]], off, s[0:3], s34{{$}} - -; GCN-DAG: buffer_load_dword [[LOAD1:v[0-9]+]], off, s[0:3], s34 offset:16{{$}} -; GCN-DAG: v_add_{{[iu]}}32_e32 [[ADD1:v[0-9]+]], vcc, 2, [[LOAD1]] - -; GCN: s_swappc_b64 - -; GCN: buffer_store_dword [[ADD1]], off, s[0:3], s34 offset:16{{$}} - -; GCN: v_readlane_b32 -; GCN-NOT: v_readlane_b32 s32 -; GCN-DAG: buffer_load_dword v32, off, s[0:3], s34 offset:32 -; GCN: s_sub_u32 s32, s32, 0xc00{{$}} -; GCN: v_readlane_b32 s34, v33, -; GCN-DAG: buffer_load_dword v33, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GCN: s_setpc_b64 -define void @void_func_byval_struct_non_leaf(%struct.ByValStruct addrspace(5)* byval noalias nocapture align 4 %arg0, %struct.ByValStruct addrspace(5)* byval noalias nocapture align 4 %arg1) #1 { -entry: - %arrayidx = getelementptr inbounds %struct.ByValStruct, %struct.ByValStruct addrspace(5)* %arg0, i32 0, i32 0, i32 0 - %tmp = load volatile i32, i32 addrspace(5)* %arrayidx, align 4 - %add = add nsw i32 %tmp, 1 - store volatile i32 %add, i32 addrspace(5)* %arrayidx, align 4 - %arrayidx2 = getelementptr inbounds %struct.ByValStruct, %struct.ByValStruct addrspace(5)* %arg1, i32 0, i32 0, i32 0 - %tmp1 = load volatile i32, i32 addrspace(5)* %arrayidx2, align 4 - %add3 = add nsw i32 %tmp1, 2 - call void @external_void_func_void() - store volatile i32 %add3, i32 addrspace(5)* %arrayidx2, align 4 - store volatile i32 9, i32 addrspace(1)* null, align 4 - ret void -} - -; GCN-LABEL: {{^}}call_void_func_byval_struct_func: -; GCN: s_mov_b32 s34, s32 -; GCN-DAG: s_add_u32 s32, s32, 0xc00{{$}} -; GCN-DAG: v_writelane_b32 - -; GCN-DAG: v_mov_b32_e32 [[NINE:v[0-9]+]], 9 -; GCN-DAG: v_mov_b32_e32 [[THIRTEEN:v[0-9]+]], 13 - -; GCN-DAG: buffer_store_dword [[NINE]], off, s[0:3], s34{{$}} -; GCN-DAG: buffer_store_dword [[THIRTEEN]], off, s[0:3], s34 offset:16 - -; GCN-DAG: buffer_load_dword [[LOAD0:v[0-9]+]], off, s[0:3], s34{{$}} -; GCN-DAG: buffer_load_dword [[LOAD1:v[0-9]+]], off, s[0:3], s34 offset:4 -; GCN-DAG: buffer_load_dword [[LOAD2:v[0-9]+]], off, s[0:3], s34 offset:8 -; GCN-DAG: buffer_load_dword [[LOAD3:v[0-9]+]], off, s[0:3], s34 offset:12 - -; GCN-NOT: s_add_u32 s32, s32, 0x800 - - -; GCN-DAG: buffer_store_dword [[LOAD0]], off, s[0:3], s32{{$}} -; GCN-DAG: buffer_store_dword [[LOAD1]], off, s[0:3], s32 offset:4 -; GCN-DAG: buffer_store_dword [[LOAD2]], off, s[0:3], s32 offset:8 -; GCN-DAG: buffer_store_dword [[LOAD3]], off, s[0:3], s32 offset:12 - -; GCN: buffer_load_dword [[LOAD4:v[0-9]+]], off, s[0:3], s34 offset:16 -; GCN: buffer_load_dword [[LOAD5:v[0-9]+]], off, s[0:3], s34 offset:20 -; GCN: buffer_load_dword [[LOAD6:v[0-9]+]], off, s[0:3], s34 offset:24 -; GCN: buffer_load_dword [[LOAD7:v[0-9]+]], off, s[0:3], s34 offset:28 - -; GCN-DAG: buffer_store_dword [[LOAD4]], off, s[0:3], s32 offset:16 -; GCN-DAG: buffer_store_dword [[LOAD5]], off, s[0:3], s32 offset:20 -; GCN-DAG: buffer_store_dword [[LOAD6]], off, s[0:3], s32 offset:24 -; 
GCN-DAG: buffer_store_dword [[LOAD7]], off, s[0:3], s32 offset:28 - -; GCN: s_swappc_b64 -; GCN-NOT: v_readlane_b32 s32 -; GCN: v_readlane_b32 -; GCN-NOT: v_readlane_b32 s32 - -; GCN-NOT: s_sub_u32 s32, s32, 0x800 - -; GCN: s_sub_u32 s32, s32, 0xc00{{$}} -; GCN: v_readlane_b32 s34, v -; GCN: s_waitcnt -; GCN: s_setpc_b64 -define void @call_void_func_byval_struct_func() #1 { -entry: - %arg0 = alloca %struct.ByValStruct, align 4, addrspace(5) - %arg1 = alloca %struct.ByValStruct, align 4, addrspace(5) - %tmp = bitcast %struct.ByValStruct addrspace(5)* %arg0 to i8 addrspace(5)* - call void @llvm.lifetime.start.p5i8(i64 32, i8 addrspace(5)* %tmp) - %tmp1 = bitcast %struct.ByValStruct addrspace(5)* %arg1 to i8 addrspace(5)* - call void @llvm.lifetime.start.p5i8(i64 32, i8 addrspace(5)* %tmp1) - %arrayidx = getelementptr inbounds %struct.ByValStruct, %struct.ByValStruct addrspace(5)* %arg0, i32 0, i32 0, i32 0 - store volatile i32 9, i32 addrspace(5)* %arrayidx, align 4 - %arrayidx2 = getelementptr inbounds %struct.ByValStruct, %struct.ByValStruct addrspace(5)* %arg1, i32 0, i32 0, i32 0 - store volatile i32 13, i32 addrspace(5)* %arrayidx2, align 4 - call void @void_func_byval_struct(%struct.ByValStruct addrspace(5)* byval nonnull align 4 %arg0, %struct.ByValStruct addrspace(5)* byval nonnull align 4 %arg1) - call void @llvm.lifetime.end.p5i8(i64 32, i8 addrspace(5)* %tmp1) - call void @llvm.lifetime.end.p5i8(i64 32, i8 addrspace(5)* %tmp) - ret void -} - -; GCN-LABEL: {{^}}call_void_func_byval_struct_kernel: -; GCN: s_mov_b32 s33, s7 -; GCN-NOT: s_add_u32 s32, s32, 0x800 - -; GCN: v_mov_b32_e32 [[NINE:v[0-9]+]], 9 -; GCN: buffer_store_dword [[NINE]], off, s[0:3], s33 offset:8 -; GCN: v_mov_b32_e32 [[THIRTEEN:v[0-9]+]], 13 -; GCN: buffer_store_dword [[THIRTEEN]], off, s[0:3], s33 offset:24 - -; GCN-NOT: s_add_u32 s32, s32, 0x800 -; GCN-DAG: buffer_load_dword [[LOAD0:v[0-9]+]], off, s[0:3], s33 offset:8 -; GCN-DAG: buffer_load_dword [[LOAD1:v[0-9]+]], off, s[0:3], s33 offset:12 -; GCN-DAG: s_add_u32 s32, s33, 0xc00{{$}} -; GCN-DAG: buffer_load_dword [[LOAD2:v[0-9]+]], off, s[0:3], s33 offset:16 -; GCN-DAG: buffer_load_dword [[LOAD3:v[0-9]+]], off, s[0:3], s33 offset:20 - -; GCN: s_getpc_b64 - -; GCN-DAG: buffer_store_dword [[LOAD0]], off, s[0:3], s32{{$}} -; GCN-DAG: buffer_store_dword [[LOAD1]], off, s[0:3], s32 offset:4 -; GCN-DAG: buffer_store_dword [[LOAD2]], off, s[0:3], s32 offset:8 -; GCN-DAG: buffer_store_dword [[LOAD3]], off, s[0:3], s32 offset:12 - -; GCN-DAG: buffer_load_dword [[LOAD4:v[0-9]+]], off, s[0:3], s33 offset:24 -; GCN-DAG: buffer_load_dword [[LOAD5:v[0-9]+]], off, s[0:3], s33 offset:28 -; GCN-DAG: buffer_load_dword [[LOAD6:v[0-9]+]], off, s[0:3], s33 offset:32 -; GCN-DAG: buffer_load_dword [[LOAD7:v[0-9]+]], off, s[0:3], s33 offset:36 - -; GCN-DAG: buffer_store_dword [[LOAD4]], off, s[0:3], s32 offset:16 -; GCN-DAG: buffer_store_dword [[LOAD5]], off, s[0:3], s32 offset:20 -; GCN-DAG: buffer_store_dword [[LOAD6]], off, s[0:3], s32 offset:24 -; GCN-DAG: buffer_store_dword [[LOAD7]], off, s[0:3], s32 offset:28 - - -; GCN: s_swappc_b64 -; GCN-NOT: s_sub_u32 s32 -; GCN: s_endpgm -define amdgpu_kernel void @call_void_func_byval_struct_kernel() #1 { -entry: - %arg0 = alloca %struct.ByValStruct, align 4, addrspace(5) - %arg1 = alloca %struct.ByValStruct, align 4, addrspace(5) - %tmp = bitcast %struct.ByValStruct addrspace(5)* %arg0 to i8 addrspace(5)* - call void @llvm.lifetime.start.p5i8(i64 32, i8 addrspace(5)* %tmp) - %tmp1 = bitcast %struct.ByValStruct addrspace(5)* %arg1 to i8 
addrspace(5)* - call void @llvm.lifetime.start.p5i8(i64 32, i8 addrspace(5)* %tmp1) - %arrayidx = getelementptr inbounds %struct.ByValStruct, %struct.ByValStruct addrspace(5)* %arg0, i32 0, i32 0, i32 0 - store volatile i32 9, i32 addrspace(5)* %arrayidx, align 4 - %arrayidx2 = getelementptr inbounds %struct.ByValStruct, %struct.ByValStruct addrspace(5)* %arg1, i32 0, i32 0, i32 0 - store volatile i32 13, i32 addrspace(5)* %arrayidx2, align 4 - call void @void_func_byval_struct(%struct.ByValStruct addrspace(5)* byval nonnull align 4 %arg0, %struct.ByValStruct addrspace(5)* byval nonnull align 4 %arg1) - call void @llvm.lifetime.end.p5i8(i64 32, i8 addrspace(5)* %tmp1) - call void @llvm.lifetime.end.p5i8(i64 32, i8 addrspace(5)* %tmp) - ret void -} - -; GCN-LABEL: {{^}}void_func_byval_struct_align8: -; GCN: buffer_load_dword [[LOAD0:v[0-9]+]], off, s[0:3], s32{{$}} -; GCN-NOT: s32 -; GCN: buffer_store_dword [[LOAD0]], off, s[0:3], s32{{$}} -; GCN-NOT: s32 - -; GCN: buffer_load_dword [[LOAD1:v[0-9]+]], off, s[0:3], s32 offset:16{{$}} -; GCN-NOT: s32 -; GCN: buffer_store_dword [[LOAD1]], off, s[0:3], s32 offset:16{{$}} -; GCN-NOT: s32 -define hidden void @void_func_byval_struct_align8(%struct.ByValStruct addrspace(5)* byval noalias nocapture align 8 %arg0, %struct.ByValStruct addrspace(5)* byval noalias nocapture align 8 %arg1) #1 { -entry: - %arrayidx = getelementptr inbounds %struct.ByValStruct, %struct.ByValStruct addrspace(5)* %arg0, i32 0, i32 0, i32 0 - %tmp = load volatile i32, i32 addrspace(5)* %arrayidx, align 8 - %add = add nsw i32 %tmp, 1 - store volatile i32 %add, i32 addrspace(5)* %arrayidx, align 8 - %arrayidx2 = getelementptr inbounds %struct.ByValStruct, %struct.ByValStruct addrspace(5)* %arg1, i32 0, i32 0, i32 0 - %tmp1 = load volatile i32, i32 addrspace(5)* %arrayidx2, align 8 - %add3 = add nsw i32 %tmp1, 2 - store volatile i32 %add3, i32 addrspace(5)* %arrayidx2, align 8 - store volatile i32 9, i32 addrspace(1)* null, align 4 - ret void -} - -; Make sure the byval alignment is respected in the call frame setup -; GCN-LABEL: {{^}}call_void_func_byval_struct_align8_kernel: -; GCN: s_mov_b32 s33, s7 -; GCN-NOT: s_add_u32 s32, s32, 0x800 - -; GCN: v_mov_b32_e32 [[NINE:v[0-9]+]], 9 -; GCN: buffer_store_dword [[NINE]], off, s[0:3], s33 offset:8 -; GCN: v_mov_b32_e32 [[THIRTEEN:v[0-9]+]], 13 -; GCN: buffer_store_dword [[THIRTEEN]], off, s[0:3], s33 offset:24 - - -; GCN-NOT: s_add_u32 s32, s32, 0x800 - -; GCN: buffer_load_dword [[LOAD0:v[0-9]+]], off, s[0:3], s33 offset:8 -; GCN: buffer_load_dword [[LOAD1:v[0-9]+]], off, s[0:3], s33 offset:12 -; GCN: buffer_load_dword [[LOAD2:v[0-9]+]], off, s[0:3], s33 offset:16 -; GCN: buffer_load_dword [[LOAD3:v[0-9]+]], off, s[0:3], s33 offset:20 - -; GCN-NOT: s_add_u32 s32, s32, 0x800 -; GCN-DAG: s_add_u32 s32, s33, 0xc00{{$}} - -; GCN: buffer_store_dword [[LOAD3]], off, s[0:3], s32 offset:12 -; GCN: buffer_store_dword [[LOAD2]], off, s[0:3], s32 offset:8 -; GCN: buffer_store_dword [[LOAD1]], off, s[0:3], s32 offset:4 -; GCN: buffer_store_dword [[LOAD0]], off, s[0:3], s32{{$}} - - -; GCN-DAG: buffer_load_dword [[LOAD4:v[0-9]+]], off, s[0:3], s33 offset:24 -; GCN-DAG: buffer_load_dword [[LOAD5:v[0-9]+]], off, s[0:3], s33 offset:28 -; GCN-DAG: buffer_load_dword [[LOAD6:v[0-9]+]], off, s[0:3], s33 offset:32 -; GCN-DAG: buffer_load_dword [[LOAD7:v[0-9]+]], off, s[0:3], s33 offset:36 - -; GCN-DAG: buffer_store_dword [[LOAD4]], off, s[0:3], s32 offset:16 -; GCN-DAG: buffer_store_dword [[LOAD5]], off, s[0:3], s32 offset:20 -; GCN-DAG: 
buffer_store_dword [[LOAD6]], off, s[0:3], s32 offset:24 -; GCN-DAG: buffer_store_dword [[LOAD7]], off, s[0:3], s32 offset:28 - - -; GCN: s_swappc_b64 -; GCN-NOT: s_sub_u32 s32 -; GCN: s_endpgm -define amdgpu_kernel void @call_void_func_byval_struct_align8_kernel() #1 { -entry: - %arg0 = alloca %struct.ByValStruct, align 8, addrspace(5) - %arg1 = alloca %struct.ByValStruct, align 8, addrspace(5) - %tmp = bitcast %struct.ByValStruct addrspace(5)* %arg0 to i8 addrspace(5)* - call void @llvm.lifetime.start.p5i8(i64 32, i8 addrspace(5)* %tmp) - %tmp1 = bitcast %struct.ByValStruct addrspace(5)* %arg1 to i8 addrspace(5)* - call void @llvm.lifetime.start.p5i8(i64 32, i8 addrspace(5)* %tmp1) - %arrayidx = getelementptr inbounds %struct.ByValStruct, %struct.ByValStruct addrspace(5)* %arg0, i32 0, i32 0, i32 0 - store volatile i32 9, i32 addrspace(5)* %arrayidx, align 8 - %arrayidx2 = getelementptr inbounds %struct.ByValStruct, %struct.ByValStruct addrspace(5)* %arg1, i32 0, i32 0, i32 0 - store volatile i32 13, i32 addrspace(5)* %arrayidx2, align 8 - call void @void_func_byval_struct_align8(%struct.ByValStruct addrspace(5)* byval nonnull align 8 %arg0, %struct.ByValStruct addrspace(5)* byval nonnull align 8 %arg1) - call void @llvm.lifetime.end.p5i8(i64 32, i8 addrspace(5)* %tmp1) - call void @llvm.lifetime.end.p5i8(i64 32, i8 addrspace(5)* %tmp) - ret void -} - -; GCN-LABEL: {{^}}call_void_func_byval_struct_align8_func: -; GCN: s_mov_b32 s34, s32 -; GCN-DAG: s_add_u32 s32, s32, 0xc00{{$}} -; GCN-DAG: v_writelane_b32 - -; GCN-DAG: v_mov_b32_e32 [[NINE:v[0-9]+]], 9 -; GCN-DAG: v_mov_b32_e32 [[THIRTEEN:v[0-9]+]], 13 - -; GCN-DAG: buffer_store_dword [[NINE]], off, s[0:3], s34{{$}} -; GCN-DAG: buffer_store_dword [[THIRTEEN]], off, s[0:3], s34 offset:16 - -; GCN-DAG: buffer_load_dword [[LOAD0:v[0-9]+]], off, s[0:3], s34{{$}} -; GCN-DAG: buffer_load_dword [[LOAD1:v[0-9]+]], off, s[0:3], s34 offset:4 -; GCN-DAG: buffer_load_dword [[LOAD2:v[0-9]+]], off, s[0:3], s34 offset:8 -; GCN-DAG: buffer_load_dword [[LOAD3:v[0-9]+]], off, s[0:3], s34 offset:12 - -; GCN-NOT: s_add_u32 s32, s32, 0x800 - -; GCN-DAG: buffer_store_dword [[LOAD0]], off, s[0:3], s32{{$}} -; GCN-DAG: buffer_store_dword [[LOAD1]], off, s[0:3], s32 offset:4 -; GCN-DAG: buffer_store_dword [[LOAD2]], off, s[0:3], s32 offset:8 -; GCN-DAG: buffer_store_dword [[LOAD3]], off, s[0:3], s32 offset:12 - -; GCN: buffer_load_dword [[LOAD4:v[0-9]+]], off, s[0:3], s34 offset:16 -; GCN: buffer_load_dword [[LOAD5:v[0-9]+]], off, s[0:3], s34 offset:20 -; GCN: buffer_load_dword [[LOAD6:v[0-9]+]], off, s[0:3], s34 offset:24 -; GCN: buffer_load_dword [[LOAD7:v[0-9]+]], off, s[0:3], s34 offset:28 - -; GCN: s_waitcnt vmcnt(0) -; GCN-DAG: buffer_store_dword [[LOAD4]], off, s[0:3], s32 offset:16 -; GCN-DAG: buffer_store_dword [[LOAD5]], off, s[0:3], s32 offset:20 -; GCN-DAG: buffer_store_dword [[LOAD6]], off, s[0:3], s32 offset:24 -; GCN-DAG: buffer_store_dword [[LOAD7]], off, s[0:3], s32 offset:28 - -; GCN: s_swappc_b64 -; GCN-NOT: v_readlane_b32 s32 -; GCN: v_readlane_b32 -; GCN-NOT: v_readlane_b32 s32 - -; GCN-NOT: s_sub_u32 s32, s32, 0x800 - -; GCN: s_sub_u32 s32, s32, 0xc00{{$}} -; GCN: v_readlane_b32 s34, v -; GCN: s_waitcnt -; GCN-NEXT: s_setpc_b64 -define void @call_void_func_byval_struct_align8_func() #0 { -entry: - %arg0 = alloca %struct.ByValStruct, align 8, addrspace(5) - %arg1 = alloca %struct.ByValStruct, align 8, addrspace(5) - %tmp = bitcast %struct.ByValStruct addrspace(5)* %arg0 to i8 addrspace(5)* - call void @llvm.lifetime.start.p5i8(i64 32, i8 
addrspace(5)* %tmp) - %tmp1 = bitcast %struct.ByValStruct addrspace(5)* %arg1 to i8 addrspace(5)* - call void @llvm.lifetime.start.p5i8(i64 32, i8 addrspace(5)* %tmp1) - %arrayidx = getelementptr inbounds %struct.ByValStruct, %struct.ByValStruct addrspace(5)* %arg0, i32 0, i32 0, i32 0 - store volatile i32 9, i32 addrspace(5)* %arrayidx, align 8 - %arrayidx2 = getelementptr inbounds %struct.ByValStruct, %struct.ByValStruct addrspace(5)* %arg1, i32 0, i32 0, i32 0 - store volatile i32 13, i32 addrspace(5)* %arrayidx2, align 8 - call void @void_func_byval_struct_align8(%struct.ByValStruct addrspace(5)* byval nonnull align 8 %arg0, %struct.ByValStruct addrspace(5)* byval nonnull align 8 %arg1) - call void @llvm.lifetime.end.p5i8(i64 32, i8 addrspace(5)* %tmp1) - call void @llvm.lifetime.end.p5i8(i64 32, i8 addrspace(5)* %tmp) - ret void -} - -; GCN-LABEL: {{^}}call_void_func_byval_struct_kernel_no_frame_pointer_elim: -define amdgpu_kernel void @call_void_func_byval_struct_kernel_no_frame_pointer_elim() #2 { -entry: - %arg0 = alloca %struct.ByValStruct, align 4, addrspace(5) - %arg1 = alloca %struct.ByValStruct, align 4, addrspace(5) - %tmp = bitcast %struct.ByValStruct addrspace(5)* %arg0 to i8 addrspace(5)* - call void @llvm.lifetime.start.p5i8(i64 32, i8 addrspace(5)* %tmp) - %tmp1 = bitcast %struct.ByValStruct addrspace(5)* %arg1 to i8 addrspace(5)* - call void @llvm.lifetime.start.p5i8(i64 32, i8 addrspace(5)* %tmp1) - %arrayidx = getelementptr inbounds %struct.ByValStruct, %struct.ByValStruct addrspace(5)* %arg0, i32 0, i32 0, i32 0 - store volatile i32 9, i32 addrspace(5)* %arrayidx, align 4 - %arrayidx2 = getelementptr inbounds %struct.ByValStruct, %struct.ByValStruct addrspace(5)* %arg1, i32 0, i32 0, i32 0 - store volatile i32 13, i32 addrspace(5)* %arrayidx2, align 4 - call void @void_func_byval_struct(%struct.ByValStruct addrspace(5)* byval nonnull align 4 %arg0, %struct.ByValStruct addrspace(5)* byval nonnull align 4 %arg1) - call void @llvm.lifetime.end.p5i8(i64 32, i8 addrspace(5)* %tmp1) - call void @llvm.lifetime.end.p5i8(i64 32, i8 addrspace(5)* %tmp) - ret void -} - declare hidden void @external_void_func_void() #0 declare void @llvm.lifetime.start.p5i8(i64, i8 addrspace(5)* nocapture) #3 diff --git a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll --- a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll +++ b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll @@ -78,14 +78,11 @@ } ; GCN-LABEL: {{^}}test_call_external_void_func_i1_signext: -; MESA: s_mov_b32 s33, s3{{$}} -; HSA: s_mov_b32 s33, s9{{$}} ; HSA: buffer_load_ubyte [[VAR:v[0-9]+]] -; HSA: s_mov_b32 s32, s33 +; HSA: s_mov_b32 s32, 0 ; MESA-DAG: buffer_load_ubyte [[VAR:v[0-9]+]] -; MESA-DAG: s_mov_b32 s32, s33{{$}} - +; MESA-DAG: s_mov_b32 s32, 0{{$}} ; GCN: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}} ; GCN-NEXT: s_add_u32 s[[PC_LO]], s[[PC_LO]], external_void_func_i1_signext@rel32@lo+4 @@ -103,13 +100,12 @@ ; FIXME: load should be scheduled before getpc ; GCN-LABEL: {{^}}test_call_external_void_func_i1_zeroext: -; MESA: s_mov_b32 s33, s3{{$}} ; HSA: buffer_load_ubyte v0 -; HSA-DAG: s_mov_b32 s32, s33{{$}} +; HSA-DAG: s_mov_b32 s32, 0{{$}} ; MESA: buffer_load_ubyte v0 -; MESA-DAG: s_mov_b32 s32, s33{{$}} +; MESA-DAG: s_mov_b32 s32, 0{{$}} ; GCN: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}} ; GCN-NEXT: s_add_u32 s[[PC_LO]], s[[PC_LO]], external_void_func_i1_zeroext@rel32@lo+4 @@ -127,14 +123,13 @@ } ; GCN-LABEL: 
{{^}}test_call_external_void_func_i8_imm: -; MESA-DAG: s_mov_b32 s33, s3{{$}} ; GCN-DAG: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}} ; GCN-DAG: s_add_u32 s[[PC_LO]], s[[PC_LO]], external_void_func_i8@rel32@lo+4 ; GCN-DAG: s_addc_u32 s[[PC_HI]], s[[PC_HI]], external_void_func_i8@rel32@hi+4 ; GCN-DAG: v_mov_b32_e32 v0, 0x7b -; GCN-DAG: s_mov_b32 s32, s33{{$}} +; GCN-DAG: s_mov_b32 s32, 0{{$}} ; GCN: s_swappc_b64 s[30:31], s{{\[}}[[PC_LO]]:[[PC_HI]]{{\]}} ; GCN-NEXT: s_endpgm @@ -145,15 +140,13 @@ ; FIXME: don't wait before call ; GCN-LABEL: {{^}}test_call_external_void_func_i8_signext: -; HSA-DAG: s_mov_b32 s33, s9{{$}} -; MESA-DAG: s_mov_b32 s33, s3{{$}} ; GCN-DAG: buffer_load_sbyte v0 ; GCN-DAG: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}} ; GCN-DAG: s_add_u32 s[[PC_LO]], s[[PC_LO]], external_void_func_i8_signext@rel32@lo+4 ; GCN-DAG: s_addc_u32 s[[PC_HI]], s[[PC_HI]], external_void_func_i8_signext@rel32@hi+4 -; GCN-DAG: s_mov_b32 s32, s33 +; GCN-DAG: s_mov_b32 s32, 0 ; GCN-NOT: s_waitcnt ; GCN-NEXT: s_swappc_b64 s[30:31], s{{\[}}[[PC_LO]]:[[PC_HI]]{{\]}} @@ -165,15 +158,13 @@ } ; GCN-LABEL: {{^}}test_call_external_void_func_i8_zeroext: -; MESA-DAG: s_mov_b32 s33, s3{{$}} -; HSA-DAG: s_mov_b32 s33, s9{{$}} ; GCN-DAG: buffer_load_ubyte v0 ; GCN-DAG: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}} ; GCN-DAG: s_add_u32 s[[PC_LO]], s[[PC_LO]], external_void_func_i8_zeroext@rel32@lo+4 ; GCN-DAG: s_addc_u32 s[[PC_HI]], s[[PC_HI]], external_void_func_i8_zeroext@rel32@hi+4 -; GCN-DAG: s_mov_b32 s32, s33 +; GCN-DAG: s_mov_b32 s32, 0 ; GCN-NOT: s_waitcnt ; GCN-NEXT: s_swappc_b64 s[30:31], s{{\[}}[[PC_LO]]:[[PC_HI]]{{\]}} @@ -187,7 +178,7 @@ ; GCN-LABEL: {{^}}test_call_external_void_func_i16_imm: ; GCN-DAG: v_mov_b32_e32 v0, 0x7b{{$}} -; GCN-DAG: s_mov_b32 s32, s33 +; GCN-DAG: s_mov_b32 s32, 0 ; GCN: s_swappc_b64 define amdgpu_kernel void @test_call_external_void_func_i16_imm() #0 { @@ -196,14 +187,13 @@ } ; GCN-LABEL: {{^}}test_call_external_void_func_i16_signext: -; MESA-DAG: s_mov_b32 s33, s3{{$}} ; GCN-DAG: buffer_load_sshort v0 ; GCN-DAG: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}} ; GCN-DAG: s_add_u32 s[[PC_LO]], s[[PC_LO]], external_void_func_i16_signext@rel32@lo+4 ; GCN-DAG: s_addc_u32 s[[PC_HI]], s[[PC_HI]], external_void_func_i16_signext@rel32@hi+4 -; GCN-DAG: s_mov_b32 s32, s33 +; GCN-DAG: s_mov_b32 s32, 0 ; GCN-NOT: s_waitcnt ; GCN-NEXT: s_swappc_b64 s[30:31], s{{\[}}[[PC_LO]]:[[PC_HI]]{{\]}} @@ -215,14 +205,12 @@ } ; GCN-LABEL: {{^}}test_call_external_void_func_i16_zeroext: -; MESA-DAG: s_mov_b32 s33, s3{{$}} - ; GCN-DAG: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}} ; GCN-DAG: s_add_u32 s[[PC_LO]], s[[PC_LO]], external_void_func_i16_zeroext@rel32@lo+4 ; GCN-DAG: s_addc_u32 s[[PC_HI]], s[[PC_HI]], external_void_func_i16_zeroext@rel32@hi+4 -; GCN-DAG: s_mov_b32 s32, s33 +; GCN-DAG: s_mov_b32 s32, 0 ; GCN-NOT: s_waitcnt ; GCN-NEXT: s_swappc_b64 s[30:31], s{{\[}}[[PC_LO]]:[[PC_HI]]{{\]}} @@ -234,13 +222,12 @@ } ; GCN-LABEL: {{^}}test_call_external_void_func_i32_imm: -; MESA-DAG: s_mov_b32 s33, s3{{$}} ; GCN-DAG: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}} ; GCN-DAG: s_add_u32 s[[PC_LO]], s[[PC_LO]], external_void_func_i32@rel32@lo+4 ; GCN-DAG: s_addc_u32 s[[PC_HI]], s[[PC_HI]], external_void_func_i32@rel32@hi+4 ; GCN-DAG: v_mov_b32_e32 v0, 42 -; GCN-DAG: s_mov_b32 s32, s33 +; GCN-DAG: s_mov_b32 s32, 0 ; GCN: s_swappc_b64 s[30:31], s{{\[}}[[PC_LO]]:[[PC_HI]]{{\]}} ; GCN-NEXT: s_endpgm @@ -497,9 +484,7 @@ ret 
void } -; GCN-LABEL: {{^}}test_call_external_void_func_v3i32_imm: -; HSA-DAG: s_mov_b32 s33, s9 -; MESA-DAG: s_mov_b32 s33, s3{{$}} +; GCN-LABEL: {{^}}test_call_external_void_func_v3i32_imm: ; GCN-NOT: v3 ; GCN-DAG: v_mov_b32_e32 v0, 3 @@ -616,10 +601,8 @@ } ; GCN-LABEL: {{^}}test_call_external_void_func_v32i32_i32: -; HSA-DAG: s_mov_b32 s33, s9 ; HSA-NOT: s_add_u32 s32 -; MESA-DAG: s_mov_b32 s33, s3{{$}} ; MESA-NOT: s_add_u32 s32 ; GCN-DAG: buffer_load_dword [[VAL1:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}} @@ -670,19 +653,19 @@ ; GCN-LABEL: {{^}}test_call_external_void_func_byval_struct_i8_i32: ; GCN-DAG: v_mov_b32_e32 [[VAL0:v[0-9]+]], 3 ; GCN-DAG: v_mov_b32_e32 [[VAL1:v[0-9]+]], 8 -; MESA-DAG: buffer_store_byte [[VAL0]], off, s[36:39], s33 offset:8 -; MESA-DAG: buffer_store_dword [[VAL1]], off, s[36:39], s33 offset:12 +; MESA-DAG: buffer_store_byte [[VAL0]], off, s[36:39], 0 offset:8 +; MESA-DAG: buffer_store_dword [[VAL1]], off, s[36:39], 0 offset:12 -; HSA-DAG: buffer_store_byte [[VAL0]], off, s[0:3], s33 offset:8 -; HSA-DAG: buffer_store_dword [[VAL1]], off, s[0:3], s33 offset:12 +; HSA-DAG: buffer_store_byte [[VAL0]], off, s[0:3], 0 offset:8 +; HSA-DAG: buffer_store_dword [[VAL1]], off, s[0:3], 0 offset:12 -; HSA: buffer_load_dword [[RELOAD_VAL0:v[0-9]+]], off, s[0:3], s33 offset:8 -; HSA: buffer_load_dword [[RELOAD_VAL1:v[0-9]+]], off, s[0:3], s33 offset:12 +; HSA: buffer_load_dword [[RELOAD_VAL0:v[0-9]+]], off, s[0:3], 0 offset:8 +; HSA: buffer_load_dword [[RELOAD_VAL1:v[0-9]+]], off, s[0:3], 0 offset:12 -; MESA: buffer_load_dword [[RELOAD_VAL0:v[0-9]+]], off, s[36:39], s33 offset:8 -; MESA: buffer_load_dword [[RELOAD_VAL1:v[0-9]+]], off, s[36:39], s33 offset:12 +; MESA: buffer_load_dword [[RELOAD_VAL0:v[0-9]+]], off, s[36:39], 0 offset:8 +; MESA: buffer_load_dword [[RELOAD_VAL1:v[0-9]+]], off, s[36:39], 0 offset:12 -; GCN-DAG: s_add_u32 [[SP:s[0-9]+]], s33, 0x400{{$}} +; GCN-DAG: s_movk_i32 [[SP:s[0-9]+]], 0x400{{$}} ; HSA-DAG: buffer_store_dword [[RELOAD_VAL0]], off, s[0:3], [[SP]]{{$}} ; HSA-DAG: buffer_store_dword [[RELOAD_VAL1]], off, s[0:3], [[SP]] offset:4 @@ -703,23 +686,22 @@ } ; GCN-LABEL: {{^}}test_call_external_void_func_sret_struct_i8_i32_byval_struct_i8_i32: -; MESA-DAG: s_add_u32 [[SP:s[0-9]+]], [[FP_REG:s[0-9]+]], 0x800{{$}} -; HSA-DAG: s_add_u32 [[SP:s[0-9]+]], [[FP_REG:s[0-9]+]], 0x800{{$}} +; GCN-DAG: s_movk_i32 [[SP:s[0-9]+]], 0x800{{$}} ; GCN-DAG: v_mov_b32_e32 [[VAL0:v[0-9]+]], 3 ; GCN-DAG: v_mov_b32_e32 [[VAL1:v[0-9]+]], 8 -; GCN-DAG: buffer_store_byte [[VAL0]], off, s{{\[[0-9]+:[0-9]+\]}}, [[FP_REG]] offset:8 -; GCN-DAG: buffer_store_dword [[VAL1]], off, s{{\[[0-9]+:[0-9]+\]}}, [[FP_REG]] offset:12 +; GCN-DAG: buffer_store_byte [[VAL0]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8 +; GCN-DAG: buffer_store_dword [[VAL1]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:12 -; GCN-DAG: buffer_load_dword [[RELOAD_VAL0:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, [[FP_REG]] offset:8 -; GCN-DAG: buffer_load_dword [[RELOAD_VAL1:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, [[FP_REG]] offset:12 +; GCN-DAG: buffer_load_dword [[RELOAD_VAL0:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8 +; GCN-DAG: buffer_load_dword [[RELOAD_VAL1:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:12 ; GCN-NOT: s_add_u32 [[SP]] ; GCN-DAG: buffer_store_dword [[RELOAD_VAL0]], off, s{{\[[0-9]+:[0-9]+\]}}, [[SP]]{{$}} ; GCN-DAG: buffer_store_dword [[RELOAD_VAL1]], off, s{{\[[0-9]+:[0-9]+\]}}, [[SP]] offset:4 ; GCN: s_swappc_b64 -; GCN-DAG: buffer_load_ubyte [[LOAD_OUT_VAL0:v[0-9]+]], off, 
s{{\[[0-9]+:[0-9]+\]}}, [[FP_REG]] offset:16 -; GCN-DAG: buffer_load_dword [[LOAD_OUT_VAL1:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, [[FP_REG]] offset:20 +; GCN-DAG: buffer_load_ubyte [[LOAD_OUT_VAL0:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:16 +; GCN-DAG: buffer_load_dword [[LOAD_OUT_VAL1:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:20 ; GCN-NOT: s_sub_u32 [[SP]] ; GCN: buffer_store_byte [[LOAD_OUT_VAL0]], off diff --git a/llvm/test/CodeGen/AMDGPU/call-constant.ll b/llvm/test/CodeGen/AMDGPU/call-constant.ll --- a/llvm/test/CodeGen/AMDGPU/call-constant.ll +++ b/llvm/test/CodeGen/AMDGPU/call-constant.ll @@ -3,9 +3,8 @@ ; FIXME: Emitting unnecessary flat_scratch setup ; GCN-LABEL: {{^}}test_call_undef: -; GCN: s_mov_b32 s8, s7 ; GCN: s_mov_b32 flat_scratch_lo, s5 -; GCN: s_add_u32 s4, s4, s8 +; GCN: s_add_u32 s4, s4, s7 ; GCN: s_lshr_b32 ; GCN: s_endpgm define amdgpu_kernel void @test_call_undef() #0 { @@ -24,9 +23,8 @@ } ; GCN-LABEL: {{^}}test_call_null: -; GCN: s_mov_b32 s8, s7 ; GCN: s_mov_b32 flat_scratch_lo, s5 -; GCN: s_add_u32 s4, s4, s8 +; GCN: s_add_u32 s4, s4, s7 ; GCN: s_lshr_b32 ; GCN: s_endpgm define amdgpu_kernel void @test_call_null() #0 { diff --git a/llvm/test/CodeGen/AMDGPU/call-preserved-registers.ll b/llvm/test/CodeGen/AMDGPU/call-preserved-registers.ll --- a/llvm/test/CodeGen/AMDGPU/call-preserved-registers.ll +++ b/llvm/test/CodeGen/AMDGPU/call-preserved-registers.ll @@ -5,11 +5,10 @@ declare hidden void @external_void_func_void() #0 ; GCN-LABEL: {{^}}test_kernel_call_external_void_func_void_clobber_s30_s31_call_external_void_func_void: -; GCN: s_mov_b32 s33, s7 ; GCN: s_getpc_b64 s[34:35] ; GCN-NEXT: s_add_u32 s34, s34, ; GCN-NEXT: s_addc_u32 s35, s35, -; GCN-NEXT: s_mov_b32 s32, s33 +; GCN-NEXT: s_mov_b32 s32, 0 ; GCN: s_swappc_b64 s[30:31], s[34:35] ; GCN-NEXT: #ASMSTART @@ -105,9 +104,9 @@ } ; GCN-LABEL: {{^}}test_call_void_func_void_mayclobber_s31: -; GCN: s_mov_b32 s34, s31 +; GCN: s_mov_b32 s33, s31 ; GCN-NEXT: s_swappc_b64 -; GCN-NEXT: s_mov_b32 s31, s34 +; GCN-NEXT: s_mov_b32 s31, s33 define amdgpu_kernel void @test_call_void_func_void_mayclobber_s31(i32 addrspace(1)* %out) #0 { %s31 = call i32 asm sideeffect "; def $0", "={s31}"() call void @external_void_func_void() @@ -126,14 +125,11 @@ ret void } -; FIXME: What is the expected behavior for reserved registers here? - ; GCN-LABEL: {{^}}test_call_void_func_void_preserves_s33: -; GCN: s_mov_b32 s33, s9 -; GCN: s_mov_b32 s32, s33 ; GCN: s_getpc_b64 s[4:5] ; GCN-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4 ; GCN-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+4 +; GCN: s_mov_b32 s32, 0 ; GCN: #ASMSTART ; GCN-NEXT: ; def s33 ; GCN-NEXT: #ASMEND @@ -150,14 +146,15 @@ ret void } -; GCN-LABEL: {{^}}test_call_void_func_void_preserves_s34: -; GCN: s_mov_b32 s33, s9 -; GCN-NOT: s34 +; FIXME: What is the expected behavior for reserved registers here? 
+ +; GCN-LABEL: {{^}}test_call_void_func_void_preserves_s34: ; GCN-NOT: s34 ; GCN: s_getpc_b64 s[4:5] ; GCN-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4 ; GCN-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+4 +; GCN: s_mov_b32 s32, 0 ; GCN-NOT: s34 ; GCN: ;;#ASMSTART @@ -180,15 +177,14 @@ ret void } -; GCN-LABEL: {{^}}test_call_void_func_void_preserves_v32: -; GCN: s_mov_b32 s33, s9 +; GCN-LABEL: {{^}}test_call_void_func_void_preserves_v32: ; GCN-NOT: v32 ; GCN: s_getpc_b64 s[4:5] ; GCN-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4 ; GCN-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+4 +; GCN: s_mov_b32 s32, 0 ; GCN-NOT: v32 -; GCN-DAG: s_mov_b32 s32, s33 ; GCN: ;;#ASMSTART ; GCN-NEXT: ; def v32 @@ -234,12 +230,10 @@ } ; GCN-LABEL: {{^}}test_call_void_func_void_clobber_s33: -; GCN: s_mov_b32 s33, s7 - ; GCN: s_getpc_b64 ; GCN-NEXT: s_add_u32 ; GCN-NEXT: s_addc_u32 -; GCN-NEXT: s_mov_b32 s32, s33 +; GCN-NEXT: s_mov_b32 s32, 0 ; GCN: s_swappc_b64 ; GCN-NEXT: s_endpgm define amdgpu_kernel void @test_call_void_func_void_clobber_s33() #0 { @@ -248,11 +242,10 @@ } ; GCN-LABEL: {{^}}test_call_void_func_void_clobber_s34: -; GCN: s_mov_b32 s33, s7 ; GCN: s_getpc_b64 ; GCN-NEXT: s_add_u32 ; GCN-NEXT: s_addc_u32 -; GCN-NEXT: s_mov_b32 s32, s33 +; GCN-NEXT: s_mov_b32 s32, 0 ; GCN: s_swappc_b64 ; GCN-NEXT: s_endpgm define amdgpu_kernel void @test_call_void_func_void_clobber_s34() #0 { diff --git a/llvm/test/CodeGen/AMDGPU/call-waitcnt.ll b/llvm/test/CodeGen/AMDGPU/call-waitcnt.ll --- a/llvm/test/CodeGen/AMDGPU/call-waitcnt.ll +++ b/llvm/test/CodeGen/AMDGPU/call-waitcnt.ll @@ -6,16 +6,17 @@ ; GCN-LABEL: call_memory_arg_load: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dword s4, s[4:5], 0x0 -; GCN-NEXT: s_mov_b32 s33, s9 -; GCN-NEXT: s_add_u32 flat_scratch_lo, s6, s33 +; GCN-NEXT: s_add_u32 flat_scratch_lo, s6, s9 ; GCN-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 -; GCN-NEXT: s_mov_b32 s32, s33 +; GCN-NEXT: s_add_u32 s0, s0, s9 +; GCN-NEXT: s_addc_u32 s1, s1, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s4 ; GCN-NEXT: ds_read_b32 v0, v0 ; GCN-NEXT: s_getpc_b64 s[4:5] ; GCN-NEXT: s_add_u32 s4, s4, func@rel32@lo+4 ; GCN-NEXT: s_addc_u32 s5, s5, func@rel32@hi+4 +; GCN-NEXT: s_mov_b32 s32, 0 ; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GCN-NEXT: s_endpgm %vgpr = load volatile i32, i32 addrspace(3)* %ptr @@ -28,19 +29,20 @@ ; GCN-LABEL: call_memory_no_dep: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; GCN-NEXT: s_mov_b32 s33, s9 -; GCN-NEXT: s_add_u32 flat_scratch_lo, s6, s33 -; GCN-NEXT: v_mov_b32_e32 v2, 0 +; GCN-NEXT: s_add_u32 flat_scratch_lo, s6, s9 ; GCN-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 +; GCN-NEXT: s_add_u32 s0, s0, s9 +; GCN-NEXT: v_mov_b32_e32 v2, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s4 ; GCN-NEXT: v_mov_b32_e32 v1, s5 +; GCN-NEXT: s_addc_u32 s1, s1, 0 ; GCN-NEXT: global_store_dword v[0:1], v2, off ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: s_getpc_b64 s[6:7] ; GCN-NEXT: s_add_u32 s6, s6, func@rel32@lo+4 ; GCN-NEXT: s_addc_u32 s7, s7, func@rel32@hi+4 -; GCN-NEXT: s_mov_b32 s32, s33 +; GCN-NEXT: s_mov_b32 s32, 0 ; GCN-NEXT: s_swappc_b64 s[30:31], s[6:7] ; GCN-NEXT: s_endpgm store i32 0, i32 addrspace(1)* %ptr @@ -52,15 +54,16 @@ define amdgpu_kernel void @call_no_wait_after_call(i32 addrspace(1)* %ptr, i32) #0 { ; GCN-LABEL: call_no_wait_after_call: ; GCN: ; %bb.0: +; GCN-NEXT: s_add_u32 flat_scratch_lo, s6, s9 ; GCN-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x0 -; GCN-NEXT: 
s_mov_b32 s33, s9 -; GCN-NEXT: s_add_u32 flat_scratch_lo, s6, s33 ; GCN-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 +; GCN-NEXT: s_add_u32 s0, s0, s9 +; GCN-NEXT: s_addc_u32 s1, s1, 0 ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: s_getpc_b64 s[4:5] ; GCN-NEXT: s_add_u32 s4, s4, func@rel32@lo+4 ; GCN-NEXT: s_addc_u32 s5, s5, func@rel32@hi+4 -; GCN-NEXT: s_mov_b32 s32, s33 +; GCN-NEXT: s_mov_b32 s32, 0 ; GCN-NEXT: v_mov_b32_e32 v32, 0 ; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GCN-NEXT: v_mov_b32_e32 v0, s34 @@ -75,15 +78,16 @@ define amdgpu_kernel void @call_no_wait_after_call_return_val(i32 addrspace(1)* %ptr, i32) #0 { ; GCN-LABEL: call_no_wait_after_call_return_val: ; GCN: ; %bb.0: +; GCN-NEXT: s_add_u32 flat_scratch_lo, s6, s9 ; GCN-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x0 -; GCN-NEXT: s_mov_b32 s33, s9 -; GCN-NEXT: s_add_u32 flat_scratch_lo, s6, s33 ; GCN-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 +; GCN-NEXT: s_add_u32 s0, s0, s9 +; GCN-NEXT: s_addc_u32 s1, s1, 0 ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: s_getpc_b64 s[4:5] ; GCN-NEXT: s_add_u32 s4, s4, func.return@rel32@lo+4 ; GCN-NEXT: s_addc_u32 s5, s5, func.return@rel32@hi+4 -; GCN-NEXT: s_mov_b32 s32, s33 +; GCN-NEXT: s_mov_b32 s32, 0 ; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GCN-NEXT: v_mov_b32_e32 v1, s34 ; GCN-NEXT: v_mov_b32_e32 v2, s35 @@ -98,15 +102,16 @@ define amdgpu_kernel void @call_got_load(i32 addrspace(1)* %ptr, i32) #0 { ; GCN-LABEL: call_got_load: ; GCN: ; %bb.0: -; GCN-NEXT: s_mov_b32 s33, s9 -; GCN-NEXT: s_add_u32 flat_scratch_lo, s6, s33 +; GCN-NEXT: s_add_u32 flat_scratch_lo, s6, s9 ; GCN-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 +; GCN-NEXT: s_add_u32 s0, s0, s9 +; GCN-NEXT: s_addc_u32 s1, s1, 0 ; GCN-NEXT: s_getpc_b64 s[4:5] ; GCN-NEXT: s_add_u32 s4, s4, got.func@gotpcrel32@lo+4 ; GCN-NEXT: s_addc_u32 s5, s5, got.func@gotpcrel32@hi+4 ; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; GCN-NEXT: v_mov_b32_e32 v0, 0 -; GCN-NEXT: s_mov_b32 s32, s33 +; GCN-NEXT: s_mov_b32 s32, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GCN-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs.ll b/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs.ll --- a/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs.ll +++ b/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs.ll @@ -197,6 +197,8 @@ ret void } +; FIXME: Include use of scratch wave offset in these tests? 
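+; (A sketch of why it is currently absent: the kernel prologue now appears to +; fold the wave's scratch offset into the flat scratch init and the scratch +; rsrc base, e.g. the s_add_u32 s0, s0, s9 sequences above, rather than keeping +; it live in a reserved SGPR, so the checks below only cover the workgroup ID +; SGPR inputs.)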
+ ; GCN-LABEL: {{^}}kern_indirect_use_workgroup_id_x: ; GCN: enable_sgpr_workgroup_id_x = 1 ; GCN: enable_sgpr_workgroup_id_y = 0 @@ -207,7 +209,7 @@ ; GCN-NEXT: s_getpc_b64 s[6:7] ; GCN-NEXT: s_add_u32 s6, s6, use_workgroup_id_x@rel32@lo+4 ; GCN-NEXT: s_addc_u32 s7, s7, use_workgroup_id_x@rel32@hi+4 -; GCN: s_mov_b32 s32, s33 +; GCN: s_mov_b32 s32, 0 ; GCN: s_swappc_b64 ; GCN-NEXT: s_endpgm define amdgpu_kernel void @kern_indirect_use_workgroup_id_x() #1 { @@ -220,9 +222,8 @@ ; GCN: enable_sgpr_workgroup_id_y = 1 ; GCN: enable_sgpr_workgroup_id_z = 0 -; GCN: s_mov_b32 s33, s8 -; GCN-DAG: s_mov_b32 s4, s7 -; GCN: s_mov_b32 s32, s33 +; GCN: s_mov_b32 s4, s7 +; GCN: s_mov_b32 s32, 0 ; GCN: s_swappc_b64 define amdgpu_kernel void @kern_indirect_use_workgroup_id_y() #1 { call void @use_workgroup_id_y() @@ -233,9 +234,10 @@ ; GCN: enable_sgpr_workgroup_id_x = 1 ; GCN: enable_sgpr_workgroup_id_y = 0 ; GCN: enable_sgpr_workgroup_id_z = 1 -; GCN: s_mov_b32 s33, s8 + ; GCN: s_mov_b32 s4, s7 +; GCN: s_mov_b32 s32, 0 ; GCN: s_swappc_b64 define amdgpu_kernel void @kern_indirect_use_workgroup_id_z() #1 { call void @use_workgroup_id_z() @@ -247,11 +249,10 @@ ; GCN: enable_sgpr_workgroup_id_y = 1 ; GCN: enable_sgpr_workgroup_id_z = 0 -; GCN: s_mov_b32 s33, s8 - ; GCN: s_mov_b32 s5, s7 ; GCN: s_mov_b32 s4, s6 -; GCN: s_mov_b32 s32, s33 + +; GCN: s_mov_b32 s32, 0 ; GCN: s_swappc_b64 define amdgpu_kernel void @kern_indirect_use_workgroup_id_xy() #1 { call void @use_workgroup_id_xy() @@ -263,13 +264,11 @@ ; GCN: enable_sgpr_workgroup_id_y = 1 ; GCN: enable_sgpr_workgroup_id_z = 1 -; GCN: s_mov_b32 s33, s9 - ; GCN: s_mov_b32 s4, s6 ; GCN: s_mov_b32 s5, s7 ; GCN: s_mov_b32 s6, s8 -; GCN: s_mov_b32 s32, s33 +; GCN: s_mov_b32 s32, 0 ; GCN: s_swappc_b64 define amdgpu_kernel void @kern_indirect_use_workgroup_id_xyz() #1 { call void @use_workgroup_id_xyz() @@ -281,12 +280,10 @@ ; GCN: enable_sgpr_workgroup_id_y = 0 ; GCN: enable_sgpr_workgroup_id_z = 1 -; GCN: s_mov_b32 s33, s8 ; GCN: s_mov_b32 s5, s7 ; GCN: s_mov_b32 s4, s6 -; GCN: s_mov_b32 s32, s33 - +; GCN: s_mov_b32 s32, 0 ; GCN: s_swappc_b64 define amdgpu_kernel void @kern_indirect_use_workgroup_id_xz() #1 { call void @use_workgroup_id_xz() @@ -298,10 +295,10 @@ ; GCN: enable_sgpr_workgroup_id_y = 1 ; GCN: enable_sgpr_workgroup_id_z = 1 -; GCN: s_mov_b32 s33, s9 ; GCN: s_mov_b32 s4, s7 ; GCN: s_mov_b32 s5, s8 -; GCN: s_mov_b32 s32, s33 + +; GCN: s_mov_b32 s32, 0 ; GCN: s_swappc_b64 define amdgpu_kernel void @kern_indirect_use_workgroup_id_yz() #1 { call void @use_workgroup_id_yz() @@ -368,10 +365,10 @@ ; GCN: enable_sgpr_workgroup_id_y = 0 ; GCN: enable_sgpr_workgroup_id_z = 0 -; GCN-DAG: s_mov_b32 s33, s7 ; GCN-DAG: v_mov_b32_e32 v0, 0x22b ; GCN-DAG: s_mov_b32 s4, s6 -; GCN-DAG: s_mov_b32 s32, s33 + +; GCN-DAG: s_mov_b32 s32, 0 ; GCN-NOT: s4 ; GCN: s_swappc_b64 define amdgpu_kernel void @kern_indirect_other_arg_use_workgroup_id_x() #1 { @@ -384,11 +381,10 @@ ; GCN: enable_sgpr_workgroup_id_y = 1 ; GCN: enable_sgpr_workgroup_id_z = 0 -; GCN-DAG: s_mov_b32 s33, s8 ; GCN-DAG: v_mov_b32_e32 v0, 0x22b ; GCN-DAG: s_mov_b32 s4, s7 -; GCN-DAG: s_mov_b32 s32, s33 +; GCN-DAG: s_mov_b32 s32, 0 ; GCN: s_swappc_b64 define amdgpu_kernel void @kern_indirect_other_arg_use_workgroup_id_y() #1 { call void @other_arg_use_workgroup_id_y(i32 555) @@ -400,10 +396,9 @@ ; GCN: enable_sgpr_workgroup_id_y = 0 ; GCN: enable_sgpr_workgroup_id_z = 1 -; GCN-DAG: s_mov_b32 s33, s8 ; GCN-DAG: v_mov_b32_e32 v0, 0x22b -; GCN: s_mov_b32 s32, s33 +; GCN: s_mov_b32 s32, 0 ; GCN: s_swappc_b64 
define amdgpu_kernel void @kern_indirect_other_arg_use_workgroup_id_z() #1 { call void @other_arg_use_workgroup_id_z(i32 555) @@ -469,11 +464,10 @@ ; GCN: enable_sgpr_dispatch_id = 1 ; GCN: enable_sgpr_flat_scratch_init = 1 -; GCN: s_mov_b32 s33, s17 ; GCN: s_mov_b32 s12, s14 ; GCN: s_mov_b32 s13, s15 ; GCN: s_mov_b32 s14, s16 -; GCN: s_mov_b32 s32, s33 +; GCN: s_mov_b32 s32, 0 ; GCN: s_swappc_b64 define amdgpu_kernel void @kern_indirect_use_every_sgpr_input() #1 { call void @use_every_sgpr_input() diff --git a/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll b/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll --- a/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll +++ b/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll @@ -361,8 +361,7 @@ ; GCN-LABEL: {{^}}kern_call_too_many_args_use_workitem_id_x: ; GCN: enable_vgpr_workitem_id = 0 -; GCN: s_mov_b32 s33, s7 -; GCN: s_mov_b32 s32, s33 +; GCN: s_mov_b32 s32, 0 ; GCN: buffer_store_dword v0, off, s[0:3], s32{{$}} ; GCN: s_swappc_b64 define amdgpu_kernel void @kern_call_too_many_args_use_workitem_id_x() #1 { @@ -489,14 +488,13 @@ ; GCN-LABEL: {{^}}kern_call_too_many_args_use_workitem_id_x_byval: ; GCN: enable_vgpr_workitem_id = 0 -; GCN-DAG: s_mov_b32 s33, s7 -; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e7{{$}} -; GCN: buffer_store_dword [[K]], off, s[0:3], s33 offset:4 -; GCN: buffer_load_dword [[RELOAD_BYVAL:v[0-9]+]], off, s[0:3], s33 offset:4 -; GCN: s_add_u32 s32, s33, 0x400{{$}} +; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e7{{$}} +; GCN: s_movk_i32 s32, 0x400{{$}} +; GCN: buffer_store_dword [[K]], off, s[0:3], 0 offset:4 +; GCN: buffer_store_dword v0, off, s[0:3], s32 offset:4 +; GCN: buffer_load_dword [[RELOAD_BYVAL:v[0-9]+]], off, s[0:3], 0 offset:4 ; GCN-NOT: s32 -; GCN: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; GCN: buffer_store_dword [[RELOAD_BYVAL]], off, s[0:3], s32{{$}} ; GCN: v_mov_b32_e32 [[RELOAD_BYVAL]], @@ -618,8 +616,7 @@ ; GCN-LABEL: {{^}}kern_call_too_many_args_use_workitem_id_xyz: ; GCN: enable_vgpr_workitem_id = 2 -; GCN-DAG: s_mov_b32 s33, s7 -; GCN-DAG: s_mov_b32 s32, s33 +; GCN-DAG: s_mov_b32 s32, 0 ; GCN-DAG: v_lshlrev_b32_e32 v1, 10, v1 ; GCN-DAG: v_or_b32_e32 v0, v0, v1 @@ -708,15 +705,13 @@ ; GCN-LABEL: {{^}}kern_call_too_many_args_use_workitem_id_x_stack_yz: ; GCN: enable_vgpr_workitem_id = 2 -; GCN: s_mov_b32 s33, s7 - ; GCN-NOT: v0 ; GCN-DAG: v_lshlrev_b32_e32 v1, 10, v1 ; GCN-DAG: v_or_b32_e32 v0, v0, v1 ; GCN-DAG: v_lshlrev_b32_e32 v2, 20, v2 ; GCN-DAG: v_or_b32_e32 v31, v0, v2 -; GCN: s_mov_b32 s32, s33 +; GCN: s_mov_b32 s32, 0 ; GCN: s_swappc_b64 define amdgpu_kernel void @kern_call_too_many_args_use_workitem_id_x_stack_yz() #1 { call void @too_many_args_use_workitem_id_x_stack_yz( diff --git a/llvm/test/CodeGen/AMDGPU/captured-frame-index.ll b/llvm/test/CodeGen/AMDGPU/captured-frame-index.ll --- a/llvm/test/CodeGen/AMDGPU/captured-frame-index.ll +++ b/llvm/test/CodeGen/AMDGPU/captured-frame-index.ll @@ -28,8 +28,8 @@ ; Offset is applied ; GCN-LABEL: {{^}}stored_fi_to_lds_2_small_objects: ; GCN-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 4{{$}} -; GCN-DAG: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:4{{$}} -; GCN-DAG: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:8{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:4{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8{{$}} ; GCN-DAG: s_load_dword [[LDSPTR:s[0-9]+]] @@ -51,9 +51,9 @@ ; Same frame 
index is used multiple times in the store ; GCN-LABEL: {{^}}stored_fi_to_self: ; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x4d2{{$}} -; GCN: buffer_store_dword [[K]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:4{{$}} +; GCN: buffer_store_dword [[K]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:4{{$}} ; GCN-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 4{{$}} -; GCN: buffer_store_dword [[ZERO]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:4{{$}} +; GCN: buffer_store_dword [[ZERO]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:4{{$}} define amdgpu_kernel void @stored_fi_to_self() #0 { %tmp = alloca i32 addrspace(5)*, addrspace(5) @@ -66,13 +66,13 @@ ; GCN-LABEL: {{^}}stored_fi_to_self_offset: ; GCN-DAG: v_mov_b32_e32 [[K0:v[0-9]+]], 32{{$}} -; GCN: buffer_store_dword [[K0]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:4{{$}} +; GCN: buffer_store_dword [[K0]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:4{{$}} ; GCN-DAG: v_mov_b32_e32 [[K1:v[0-9]+]], 0x4d2{{$}} -; GCN: buffer_store_dword [[K1]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:2052{{$}} +; GCN: buffer_store_dword [[K1]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:2052{{$}} ; GCN: v_mov_b32_e32 [[OFFSETK:v[0-9]+]], 0x804{{$}} -; GCN: buffer_store_dword [[OFFSETK]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:2052{{$}} +; GCN: buffer_store_dword [[OFFSETK]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:2052{{$}} define amdgpu_kernel void @stored_fi_to_self_offset() #0 { %tmp0 = alloca [512 x i32], addrspace(5) %tmp1 = alloca i32 addrspace(5)*, addrspace(5) @@ -89,15 +89,15 @@ } ; GCN-LABEL: {{^}}stored_fi_to_fi: -; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:4{{$}} -; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:8{{$}} -; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:12{{$}} +; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:4{{$}} +; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8{{$}} +; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:12{{$}} ; GCN: v_mov_b32_e32 [[FI1:v[0-9]+]], 8{{$}} -; GCN: buffer_store_dword [[FI1]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:12{{$}} +; GCN: buffer_store_dword [[FI1]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:12{{$}} ; GCN: v_mov_b32_e32 [[FI2:v[0-9]+]], 12{{$}} -; GCN: buffer_store_dword [[FI2]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:8{{$}} +; GCN: buffer_store_dword [[FI2]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8{{$}} define amdgpu_kernel void @stored_fi_to_fi() #0 { %tmp0 = alloca i32 addrspace(5)*, addrspace(5) %tmp1 = alloca i32 addrspace(5)*, addrspace(5) @@ -115,7 +115,7 @@ } ; GCN-LABEL: {{^}}stored_fi_to_global: -; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:4{{$}} +; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:4{{$}} ; GCN: v_mov_b32_e32 [[FI:v[0-9]+]], 4{{$}} ; GCN: buffer_store_dword [[FI]] define amdgpu_kernel void @stored_fi_to_global(float addrspace(5)* addrspace(1)* %ptr) #0 { @@ -127,9 +127,9 @@ ; Offset is applied ; GCN-LABEL: {{^}}stored_fi_to_global_2_small_objects: -; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:4{{$}} -; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:8{{$}} -; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:12{{$}} +; GCN: buffer_store_dword v{{[0-9]+}}, off, 
s{{\[[0-9]+:[0-9]+\]}}, 0 offset:4{{$}} +; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8{{$}} +; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:12{{$}} ; GCN: v_mov_b32_e32 [[FI1:v[0-9]+]], 8{{$}} ; GCN: buffer_store_dword [[FI1]], off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} @@ -150,7 +150,7 @@ ; GCN-LABEL: {{^}}stored_fi_to_global_huge_frame_offset: ; GCN: v_mov_b32_e32 [[BASE_0:v[0-9]+]], 0{{$}} -; GCN: buffer_store_dword [[BASE_0]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:4{{$}} +; GCN: buffer_store_dword [[BASE_0]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:4{{$}} ; FIXME: Re-initialize ; GCN: v_mov_b32_e32 [[BASE_0_1:v[0-9]+]], 4{{$}} @@ -160,7 +160,7 @@ ; GCN: v_add_i32_e32 [[BASE_1_OFF_2:v[0-9]+]], vcc, 56, [[BASE_0_1]] -; GCN: buffer_store_dword [[K]], [[BASE_1_OFF_1]], s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen{{$}} +; GCN: buffer_store_dword [[K]], [[BASE_1_OFF_1]], s{{\[[0-9]+:[0-9]+\]}}, 0 offen{{$}} ; GCN: buffer_store_dword [[BASE_1_OFF_2]], off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} define amdgpu_kernel void @stored_fi_to_global_huge_frame_offset(i32 addrspace(5)* addrspace(1)* %ptr) #0 { diff --git a/llvm/test/CodeGen/AMDGPU/cc-update.ll b/llvm/test/CodeGen/AMDGPU/cc-update.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/cc-update.ll @@ -0,0 +1,249 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 < %s | FileCheck --check-prefix=GFX803 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck --check-prefix=GFX900 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck --check-prefix=GFX1010 %s + +define amdgpu_kernel void @test_kern_empty() local_unnamed_addr #0 { +; GFX803-LABEL: test_kern_empty: +; GFX803: ; %bb.0: ; %entry +; GFX803-NEXT: s_endpgm +; +; GFX900-LABEL: test_kern_empty: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_endpgm +; +; GFX1010-LABEL: test_kern_empty: +; GFX1010: ; %bb.0: ; %entry +; GFX1010-NEXT: s_endpgm +entry: + ret void +} + +define amdgpu_kernel void @test_kern_stack() local_unnamed_addr #0 { +; GFX803-LABEL: test_kern_stack: +; GFX803: ; %bb.0: ; %entry +; GFX803-NEXT: s_add_u32 s4, s4, s7 +; GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 +; GFX803-NEXT: s_add_u32 s0, s0, s7 +; GFX803-NEXT: s_addc_u32 s1, s1, 0 +; GFX803-NEXT: v_mov_b32_e32 v0, 0 +; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s5 +; GFX803-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4 +; GFX803-NEXT: s_endpgm +; +; GFX900-LABEL: test_kern_stack: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_add_u32 flat_scratch_lo, s4, s7 +; GFX900-NEXT: s_addc_u32 flat_scratch_hi, s5, 0 +; GFX900-NEXT: s_add_u32 s0, s0, s7 +; GFX900-NEXT: s_addc_u32 s1, s1, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4 +; GFX900-NEXT: s_endpgm +; +; GFX1010-LABEL: test_kern_stack: +; GFX1010: ; %bb.0: ; %entry +; GFX1010-NEXT: s_add_u32 s4, s4, s7 +; GFX1010-NEXT: s_addc_u32 s5, s5, 0 +; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4 +; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5 +; GFX1010-NEXT: s_add_u32 s0, s0, s7 +; GFX1010-NEXT: s_addc_u32 s1, s1, 0 +; GFX1010-NEXT: v_mov_b32_e32 v0, 0 +; GFX1010-NEXT: ; implicit-def: $vcc_hi +; GFX1010-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4 +; GFX1010-NEXT: s_endpgm +entry: + %x = alloca i32, align 4, addrspace(5) + store volatile i32 0, i32 addrspace(5)* %x, align 4 + ret void +} + +define 
amdgpu_kernel void @test_kern_call() local_unnamed_addr #0 { +; GFX803-LABEL: test_kern_call: +; GFX803: ; %bb.0: ; %entry +; GFX803-NEXT: s_add_u32 s4, s4, s7 +; GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 +; GFX803-NEXT: s_add_u32 s0, s0, s7 +; GFX803-NEXT: s_addc_u32 s1, s1, 0 +; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s5 +; GFX803-NEXT: s_getpc_b64 s[4:5] +; GFX803-NEXT: s_add_u32 s4, s4, ex@rel32@lo+4 +; GFX803-NEXT: s_addc_u32 s5, s5, ex@rel32@hi+4 +; GFX803-NEXT: s_mov_b32 s32, 0 +; GFX803-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX803-NEXT: s_endpgm +; +; GFX900-LABEL: test_kern_call: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_add_u32 flat_scratch_lo, s4, s7 +; GFX900-NEXT: s_addc_u32 flat_scratch_hi, s5, 0 +; GFX900-NEXT: s_add_u32 s0, s0, s7 +; GFX900-NEXT: s_addc_u32 s1, s1, 0 +; GFX900-NEXT: s_getpc_b64 s[4:5] +; GFX900-NEXT: s_add_u32 s4, s4, ex@rel32@lo+4 +; GFX900-NEXT: s_addc_u32 s5, s5, ex@rel32@hi+4 +; GFX900-NEXT: s_mov_b32 s32, 0 +; GFX900-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX900-NEXT: s_endpgm +; +; GFX1010-LABEL: test_kern_call: +; GFX1010: ; %bb.0: ; %entry +; GFX1010-NEXT: s_add_u32 s4, s4, s7 +; GFX1010-NEXT: s_mov_b32 s32, 0 +; GFX1010-NEXT: s_addc_u32 s5, s5, 0 +; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4 +; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5 +; GFX1010-NEXT: s_add_u32 s0, s0, s7 +; GFX1010-NEXT: s_addc_u32 s1, s1, 0 +; GFX1010-NEXT: s_getpc_b64 s[4:5] +; GFX1010-NEXT: s_add_u32 s4, s4, ex@rel32@lo+4 +; GFX1010-NEXT: s_addc_u32 s5, s5, ex@rel32@hi+4 +; GFX1010-NEXT: ; implicit-def: $vcc_hi +; GFX1010-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX1010-NEXT: s_endpgm +entry: + tail call void @ex() #0 + ret void +} + +define amdgpu_kernel void @test_kern_stack_and_call() local_unnamed_addr #0 { +; GFX803-LABEL: test_kern_stack_and_call: +; GFX803: ; %bb.0: ; %entry +; GFX803-NEXT: s_add_u32 s4, s4, s7 +; GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 +; GFX803-NEXT: s_add_u32 s0, s0, s7 +; GFX803-NEXT: s_addc_u32 s1, s1, 0 +; GFX803-NEXT: v_mov_b32_e32 v0, 0 +; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s5 +; GFX803-NEXT: s_getpc_b64 s[4:5] +; GFX803-NEXT: s_add_u32 s4, s4, ex@rel32@lo+4 +; GFX803-NEXT: s_addc_u32 s5, s5, ex@rel32@hi+4 +; GFX803-NEXT: s_movk_i32 s32, 0x400 +; GFX803-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4 +; GFX803-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX803-NEXT: s_endpgm +; +; GFX900-LABEL: test_kern_stack_and_call: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_add_u32 flat_scratch_lo, s4, s7 +; GFX900-NEXT: s_addc_u32 flat_scratch_hi, s5, 0 +; GFX900-NEXT: s_add_u32 s0, s0, s7 +; GFX900-NEXT: s_addc_u32 s1, s1, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: s_getpc_b64 s[4:5] +; GFX900-NEXT: s_add_u32 s4, s4, ex@rel32@lo+4 +; GFX900-NEXT: s_addc_u32 s5, s5, ex@rel32@hi+4 +; GFX900-NEXT: s_movk_i32 s32, 0x400 +; GFX900-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4 +; GFX900-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX900-NEXT: s_endpgm +; +; GFX1010-LABEL: test_kern_stack_and_call: +; GFX1010: ; %bb.0: ; %entry +; GFX1010-NEXT: s_add_u32 s4, s4, s7 +; GFX1010-NEXT: s_movk_i32 s32, 0x200 +; GFX1010-NEXT: s_addc_u32 s5, s5, 0 +; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4 +; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5 +; GFX1010-NEXT: s_add_u32 s0, s0, s7 +; GFX1010-NEXT: s_addc_u32 s1, s1, 0 +; GFX1010-NEXT: v_mov_b32_e32 v0, 0 +; GFX1010-NEXT: s_getpc_b64 s[4:5] +; GFX1010-NEXT: s_add_u32 s4, s4, ex@rel32@lo+4 +; GFX1010-NEXT: s_addc_u32 s5, s5, 
ex@rel32@hi+4 +; GFX1010-NEXT: ; implicit-def: $vcc_hi +; GFX1010-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4 +; GFX1010-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX1010-NEXT: s_endpgm +entry: + %x = alloca i32, align 4, addrspace(5) + store volatile i32 0, i32 addrspace(5)* %x, align 4 + tail call void @ex() #0 + ret void +} + +define amdgpu_kernel void @test_sgpr_offset_kernel() #1 { +; GFX803-LABEL: test_sgpr_offset_kernel: +; GFX803: ; %bb.0: ; %entry +; GFX803-NEXT: s_add_u32 s4, s4, s7 +; GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 +; GFX803-NEXT: s_add_u32 s0, s0, s7 +; GFX803-NEXT: s_addc_u32 s1, s1, 0 +; GFX803-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:8 +; GFX803-NEXT: s_mov_b32 s4, 0x40000 +; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s5 +; GFX803-NEXT: s_waitcnt vmcnt(0) +; GFX803-NEXT: buffer_store_dword v0, off, s[0:3], s4 ; 4-byte Folded Spill +; GFX803-NEXT: ;;#ASMSTART +; GFX803-NEXT: ;;#ASMEND +; GFX803-NEXT: s_mov_b32 s4, 0x40000 +; GFX803-NEXT: buffer_load_dword v0, off, s[0:3], s4 ; 4-byte Folded Reload +; GFX803-NEXT: s_waitcnt vmcnt(0) +; GFX803-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:8 +; GFX803-NEXT: s_endpgm +; +; GFX900-LABEL: test_sgpr_offset_kernel: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_add_u32 flat_scratch_lo, s4, s7 +; GFX900-NEXT: s_addc_u32 flat_scratch_hi, s5, 0 +; GFX900-NEXT: s_add_u32 s0, s0, s7 +; GFX900-NEXT: s_addc_u32 s1, s1, 0 +; GFX900-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:8 +; GFX900-NEXT: s_mov_b32 s6, 0x40000 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: buffer_store_dword v0, off, s[0:3], s6 ; 4-byte Folded Spill +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, 0x40000 +; GFX900-NEXT: buffer_load_dword v0, off, s[0:3], s6 ; 4-byte Folded Reload +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:8 +; GFX900-NEXT: s_endpgm +; +; GFX1010-LABEL: test_sgpr_offset_kernel: +; GFX1010: ; %bb.0: ; %entry +; GFX1010-NEXT: s_add_u32 s4, s4, s7 +; GFX1010-NEXT: s_addc_u32 s5, s5, 0 +; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4 +; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5 +; GFX1010-NEXT: s_add_u32 s0, s0, s7 +; GFX1010-NEXT: s_addc_u32 s1, s1, 0 +; GFX1010-NEXT: s_mov_b32 s6, 0x20000 +; GFX1010-NEXT: ; implicit-def: $vcc_hi +; GFX1010-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:8 +; GFX1010-NEXT: s_waitcnt vmcnt(0) +; GFX1010-NEXT: buffer_store_dword v0, off, s[0:3], s6 ; 4-byte Folded Spill +; GFX1010-NEXT: v_nop +; GFX1010-NEXT: s_mov_b32 s6, 0x20000 +; GFX1010-NEXT: ;;#ASMSTART +; GFX1010-NEXT: ;;#ASMEND +; GFX1010-NEXT: buffer_load_dword v0, off, s[0:3], s6 ; 4-byte Folded Reload +; GFX1010-NEXT: s_waitcnt vmcnt(0) +; GFX1010-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:8 +; GFX1010-NEXT: s_endpgm +entry: + ; Occupy 4096 bytes of scratch, so the offset of the spill of %a does not + ; fit in the instruction, and has to live in the SGPR offset. 
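+ ; (Working the arithmetic out from the checks above: the SGPR soffset is a + ; byte offset for the whole wave into swizzled scratch, so the 4096 byte + ; per-lane offset scales by the wave size, giving 4096 * 64 = 0x40000 for + ; wave64 on GFX803/GFX900 and, presumably for the same reason, 4096 * 32 = + ; 0x20000 for wave32 on GFX1010.)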
+ %alloca = alloca i8, i32 4092, align 4, addrspace(5) + %buf = bitcast i8 addrspace(5)* %alloca to i32 addrspace(5)* + + %aptr = getelementptr i32, i32 addrspace(5)* %buf, i32 1 + ; 0x40000 / 64 = 4096 (for wave64) + ; CHECK: s_add_u32 s6, s7, 0x40000 + ; CHECK: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s6 ; 4-byte Folded Spill + %a = load volatile i32, i32 addrspace(5)* %aptr + + ; Force %a to spill + call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" () + + %outptr = getelementptr i32, i32 addrspace(5)* %buf, i32 1 + store volatile i32 %a, i32 addrspace(5)* %outptr + + ret void +} + +declare hidden void @ex() local_unnamed_addr #0 + +attributes #0 = { nounwind } +attributes #1 = { nounwind "amdgpu-num-vgpr"="8" } diff --git a/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes.ll b/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes.ll --- a/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes.ll +++ b/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes.ll @@ -135,8 +135,8 @@ ; GCN-LABEL: {{^}}test_sink_scratch_small_offset_i32: ; GCN: s_and_saveexec_b64 -; GCN: buffer_store_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offset:4092{{$}} -; GCN: buffer_load_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offset:4092{{$}} +; GCN: buffer_store_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:4092{{$}} +; GCN: buffer_load_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:4092{{$}} ; GCN: {{^}}BB4_2: define amdgpu_kernel void @test_sink_scratch_small_offset_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %arg) { entry: @@ -174,9 +174,9 @@ ; GCN-LABEL: {{^}}test_sink_scratch_small_offset_i32_reserved: ; GCN: s_and_saveexec_b64 ; GCN: v_mov_b32_e32 [[BASE_FI0:v[0-9]+]], 4 -; GCN: buffer_store_dword {{v[0-9]+}}, [[BASE_FI0]], {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen offset:4092{{$}} +; GCN: buffer_store_dword {{v[0-9]+}}, [[BASE_FI0]], {{s\[[0-9]+:[0-9]+\]}}, 0 offen offset:4092{{$}} ; GCN: v_mov_b32_e32 [[BASE_FI1:v[0-9]+]], 4 -; GCN: buffer_load_dword {{v[0-9]+}}, [[BASE_FI1]], {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen offset:4092{{$}} +; GCN: buffer_load_dword {{v[0-9]+}}, [[BASE_FI1]], {{s\[[0-9]+:[0-9]+\]}}, 0 offen offset:4092{{$}} ; GCN: {{^BB[0-9]+}}_2: define amdgpu_kernel void @test_sink_scratch_small_offset_i32_reserved(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %arg) { @@ -213,8 +213,8 @@ ; GCN-LABEL: {{^}}test_no_sink_scratch_large_offset_i32: ; GCN: s_and_saveexec_b64 -; GCN: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen{{$}} -; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen{{$}} +; GCN: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen{{$}} +; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen{{$}} ; GCN: {{^BB[0-9]+}}_2: define amdgpu_kernel void @test_no_sink_scratch_large_offset_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %arg) { entry: diff --git a/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll b/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll --- a/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll +++ b/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll @@ -5,9 +5,9 @@ ; GCN-LABEL: chain_hi_to_lo_private: ; GCN: ; %bb.0: ; %bb ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_load_ushort v0, off, s[0:3], s33 offset:2 +; GCN-NEXT: buffer_load_ushort v0, off, s[0:3], 0 offset:2 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_load_short_d16_hi v0, 
off, s[0:3], s33 +; GCN-NEXT: buffer_load_short_d16_hi v0, off, s[0:3], 0 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] bb: @@ -26,9 +26,9 @@ ; GCN-LABEL: chain_hi_to_lo_private_different_bases: ; GCN: ; %bb.0: ; %bb ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_load_ushort v0, v0, s[0:3], s33 offen +; GCN-NEXT: buffer_load_ushort v0, v0, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_load_short_d16_hi v0, v1, s[0:3], s33 offen +; GCN-NEXT: buffer_load_short_d16_hi v0, v1, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] bb: @@ -46,7 +46,7 @@ ; GCN: ; %bb.0: ; %bb ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_add_f16_e32 v1, 1.0, v1 -; GCN-NEXT: buffer_load_short_d16_hi v1, v0, s[0:3], s33 offen +; GCN-NEXT: buffer_load_short_d16_hi v1, v0, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, v1 ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -196,6 +196,8 @@ ; GCN-NEXT: s_add_u32 flat_scratch_lo, s6, s9 ; GCN-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 ; GCN-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 +; GCN-NEXT: s_add_u32 s0, s0, s9 +; GCN-NEXT: s_addc_u32 s1, s1, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v2, s4 ; GCN-NEXT: v_mov_b32_e32 v3, s5 @@ -203,20 +205,20 @@ ; GCN-NEXT: v_mov_b32_e32 v0, s6 ; GCN-NEXT: v_mov_b32_e32 v1, s7 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_short v4, off, s[0:3], s9 offset:4 +; GCN-NEXT: buffer_store_short v4, off, s[0:3], 0 offset:4 ; GCN-NEXT: global_load_ushort v4, v[2:3], off offset:2 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_short v4, off, s[0:3], s9 offset:6 +; GCN-NEXT: buffer_store_short v4, off, s[0:3], 0 offset:6 ; GCN-NEXT: global_load_ushort v2, v[2:3], off offset:4 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_short v2, off, s[0:3], s9 offset:8 -; GCN-NEXT: buffer_load_ushort v2, off, s[0:3], s9 offset:4 -; GCN-NEXT: buffer_load_ushort v4, off, s[0:3], s9 offset:6 +; GCN-NEXT: buffer_store_short v2, off, s[0:3], 0 offset:8 +; GCN-NEXT: buffer_load_ushort v2, off, s[0:3], 0 offset:4 +; GCN-NEXT: buffer_load_ushort v4, off, s[0:3], 0 offset:6 ; GCN-NEXT: s_waitcnt vmcnt(1) ; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v3, v4 -; GCN-NEXT: buffer_load_short_d16_hi v3, off, s[0:3], s9 offset:8 +; GCN-NEXT: buffer_load_short_d16_hi v3, off, s[0:3], 0 offset:8 ; GCN-NEXT: v_lshl_or_b32 v2, v4, 16, v2 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: global_store_dwordx2 v[0:1], v[2:3], off @@ -298,10 +300,10 @@ ; GCN-LABEL: chain_hi_to_lo_private_other_dep: ; GCN: ; %bb.0: ; %bb ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_load_short_d16_hi v1, v0, s[0:3], s33 offen +; GCN-NEXT: buffer_load_short_d16_hi v1, v0, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_pk_sub_u16 v1, v1, -12 op_sel_hi:[1,0] -; GCN-NEXT: buffer_load_short_d16 v1, v0, s[0:3], s33 offen offset:2 +; GCN-NEXT: buffer_load_short_d16 v1, v0, s[0:3], 0 offen offset:2 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, v1 ; GCN-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll b/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll --- a/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll +++ b/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll @@ -228,7 +228,7 @@ ; GCN: s_andn2_b64 exec, exec, ; GCN-NEXT: s_cbranch_execnz [[BB1_LOOP]] -; GCN: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 
s{{[0-9]+}} offen +; GCN: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0 offen ; GCN: s_and_b64 exec, exec, {{vcc|s\[[0-9:]+\]}} ; GCN-NOT: s_or_b64 exec, exec diff --git a/llvm/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll b/llvm/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll --- a/llvm/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll +++ b/llvm/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll @@ -22,16 +22,16 @@ ; GCN: s_and_b64 s{{\[}}[[ANDEXEC_LO:[0-9]+]]:[[ANDEXEC_HI:[0-9]+]]{{\]}}, s{{\[}}[[SAVEEXEC_LO]]:[[SAVEEXEC_HI]]{{\]}}, [[CMP0]] ; Spill load -; GCN: buffer_store_dword [[LOAD0]], off, s[0:3], s7 offset:[[LOAD0_OFFSET:[0-9]+]] ; 4-byte Folded Spill +; GCN: buffer_store_dword [[LOAD0]], off, s[0:3], 0 offset:[[LOAD0_OFFSET:[0-9]+]] ; 4-byte Folded Spill ; Spill saved exec ; VGPR: v_writelane_b32 [[SPILL_VGPR:v[0-9]+]], s[[SAVEEXEC_LO]], [[SAVEEXEC_LO_LANE:[0-9]+]] ; VGPR: v_writelane_b32 [[SPILL_VGPR]], s[[SAVEEXEC_HI]], [[SAVEEXEC_HI_LANE:[0-9]+]] ; VMEM: v_mov_b32_e32 v[[V_SAVEEXEC_LO:[0-9]+]], s[[SAVEEXEC_LO]] -; VMEM: buffer_store_dword v[[V_SAVEEXEC_LO]], off, s[0:3], s7 offset:20 ; 4-byte Folded Spill +; VMEM: buffer_store_dword v[[V_SAVEEXEC_LO]], off, s[0:3], 0 offset:20 ; 4-byte Folded Spill ; VMEM: v_mov_b32_e32 v[[V_SAVEEXEC_HI:[0-9]+]], s[[SAVEEXEC_HI]] -; VMEM: buffer_store_dword v[[V_SAVEEXEC_HI]], off, s[0:3], s7 offset:24 ; 4-byte Folded Spill +; VMEM: buffer_store_dword v[[V_SAVEEXEC_HI]], off, s[0:3], 0 offset:24 ; 4-byte Folded Spill ; GCN: s_mov_b64 exec, s{{\[}}[[ANDEXEC_LO]]:[[ANDEXEC_HI]]{{\]}} @@ -40,13 +40,13 @@ ; GCN: ; %bb.{{[0-9]+}}: ; %if ; GCN: s_mov_b32 m0, -1 ; GCN: ds_read_b32 [[LOAD1:v[0-9]+]] -; GCN: buffer_load_dword [[RELOAD_LOAD0:v[0-9]+]], off, s[0:3], s7 offset:[[LOAD0_OFFSET]] ; 4-byte Folded Reload +; GCN: buffer_load_dword [[RELOAD_LOAD0:v[0-9]+]], off, s[0:3], 0 offset:[[LOAD0_OFFSET]] ; 4-byte Folded Reload ; GCN: s_waitcnt vmcnt(0) lgkmcnt(0) ; Spill val register ; GCN: v_add_i32_e32 [[VAL:v[0-9]+]], vcc, [[LOAD1]], [[RELOAD_LOAD0]] -; GCN: buffer_store_dword [[VAL]], off, s[0:3], s7 offset:[[VAL_OFFSET:[0-9]+]] ; 4-byte Folded Spill +; GCN: buffer_store_dword [[VAL]], off, s[0:3], 0 offset:[[VAL_OFFSET:[0-9]+]] ; 4-byte Folded Spill ; VMEM: [[ENDIF]]: @@ -56,18 +56,18 @@ -; VMEM: buffer_load_dword v[[V_RELOAD_SAVEEXEC_LO:[0-9]+]], off, s[0:3], s7 offset:20 ; 4-byte Folded Reload +; VMEM: buffer_load_dword v[[V_RELOAD_SAVEEXEC_LO:[0-9]+]], off, s[0:3], 0 offset:20 ; 4-byte Folded Reload ; VMEM: s_waitcnt vmcnt(0) ; VMEM: v_readfirstlane_b32 s[[S_RELOAD_SAVEEXEC_LO:[0-9]+]], v[[V_RELOAD_SAVEEXEC_LO]] -; VMEM: buffer_load_dword v[[V_RELOAD_SAVEEXEC_HI:[0-9]+]], off, s[0:3], s7 offset:24 ; 4-byte Folded Reload +; VMEM: buffer_load_dword v[[V_RELOAD_SAVEEXEC_HI:[0-9]+]], off, s[0:3], 0 offset:24 ; 4-byte Folded Reload ; VMEM: s_waitcnt vmcnt(0) ; VMEM: v_readfirstlane_b32 s[[S_RELOAD_SAVEEXEC_HI:[0-9]+]], v[[V_RELOAD_SAVEEXEC_HI]] ; GCN: s_or_b64 exec, exec, s{{\[}}[[S_RELOAD_SAVEEXEC_LO]]:[[S_RELOAD_SAVEEXEC_HI]]{{\]}} ; Restore val -; GCN: buffer_load_dword [[RELOAD_VAL:v[0-9]+]], off, s[0:3], s7 offset:[[VAL_OFFSET]] ; 4-byte Folded Reload +; GCN: buffer_load_dword [[RELOAD_VAL:v[0-9]+]], off, s[0:3], 0 offset:[[VAL_OFFSET]] ; 4-byte Folded Reload ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RELOAD_VAL]] define amdgpu_kernel void @divergent_if_endif(i32 addrspace(1)* %out) #0 { @@ -102,7 +102,7 @@ ; GCN: s_and_b64 s{{\[}}[[ANDEXEC_LO:[0-9]+]]:[[ANDEXEC_HI:[0-9]+]]{{\]}}, 
s{{\[}}[[SAVEEXEC_LO:[0-9]+]]:[[SAVEEXEC_HI:[0-9]+]]{{\]}}, [[CMP0]] ; Spill load -; GCN: buffer_store_dword [[LOAD0]], off, s[0:3], s7 offset:[[LOAD0_OFFSET:[0-9]+]] ; 4-byte Folded Spill +; GCN: buffer_store_dword [[LOAD0]], off, s[0:3], 0 offset:[[LOAD0_OFFSET:[0-9]+]] ; 4-byte Folded Spill ; Spill saved exec ; VGPR: v_writelane_b32 [[SPILL_VGPR:v[0-9]+]], s[[SAVEEXEC_LO]], [[SAVEEXEC_LO_LANE:[0-9]+]] @@ -110,9 +110,9 @@ ; VMEM: v_mov_b32_e32 v[[V_SAVEEXEC_LO:[0-9]+]], s[[SAVEEXEC_LO]] -; VMEM: buffer_store_dword v[[V_SAVEEXEC_LO]], off, s[0:3], s7 offset:24 ; 4-byte Folded Spill +; VMEM: buffer_store_dword v[[V_SAVEEXEC_LO]], off, s[0:3], 0 offset:24 ; 4-byte Folded Spill ; VMEM: v_mov_b32_e32 v[[V_SAVEEXEC_HI:[0-9]+]], s[[SAVEEXEC_HI]] -; VMEM: buffer_store_dword v[[V_SAVEEXEC_HI]], off, s[0:3], s7 offset:28 ; 4-byte Folded Spill +; VMEM: buffer_store_dword v[[V_SAVEEXEC_HI]], off, s[0:3], 0 offset:28 ; 4-byte Folded Spill ; GCN: s_mov_b64 exec, s{{\[}}[[ANDEXEC_LO]]:[[ANDEXEC_HI]]{{\]}} @@ -120,10 +120,10 @@ ; GCN: [[LOOP:BB[0-9]+_[0-9]+]]: -; GCN: buffer_load_dword v[[VAL_LOOP_RELOAD:[0-9]+]], off, s[0:3], s7 offset:[[LOAD0_OFFSET]] ; 4-byte Folded Reload +; GCN: buffer_load_dword v[[VAL_LOOP_RELOAD:[0-9]+]], off, s[0:3], 0 offset:[[LOAD0_OFFSET]] ; 4-byte Folded Reload ; GCN: v_subrev_i32_e32 [[VAL_LOOP:v[0-9]+]], vcc, v{{[0-9]+}}, v[[VAL_LOOP_RELOAD]] ; GCN: s_cmp_lg_u32 -; GCN: buffer_store_dword [[VAL_LOOP]], off, s[0:3], s7 offset:[[VAL_SUB_OFFSET:[0-9]+]] ; 4-byte Folded Spill +; GCN: buffer_store_dword [[VAL_LOOP]], off, s[0:3], 0 offset:[[VAL_SUB_OFFSET:[0-9]+]] ; 4-byte Folded Spill ; GCN-NEXT: s_cbranch_scc1 [[LOOP]] @@ -131,16 +131,16 @@ ; VGPR: v_readlane_b32 s[[S_RELOAD_SAVEEXEC_LO:[0-9]+]], [[SPILL_VGPR]], [[SAVEEXEC_LO_LANE]] ; VGPR: v_readlane_b32 s[[S_RELOAD_SAVEEXEC_HI:[0-9]+]], [[SPILL_VGPR]], [[SAVEEXEC_HI_LANE]] -; VMEM: buffer_load_dword v[[V_RELOAD_SAVEEXEC_LO:[0-9]+]], off, s[0:3], s7 offset:24 ; 4-byte Folded Reload +; VMEM: buffer_load_dword v[[V_RELOAD_SAVEEXEC_LO:[0-9]+]], off, s[0:3], 0 offset:24 ; 4-byte Folded Reload ; VMEM: s_waitcnt vmcnt(0) ; VMEM: v_readfirstlane_b32 s[[S_RELOAD_SAVEEXEC_LO:[0-9]+]], v[[V_RELOAD_SAVEEXEC_LO]] -; VMEM: buffer_load_dword v[[V_RELOAD_SAVEEXEC_HI:[0-9]+]], off, s[0:3], s7 offset:28 ; 4-byte Folded Reload +; VMEM: buffer_load_dword v[[V_RELOAD_SAVEEXEC_HI:[0-9]+]], off, s[0:3], 0 offset:28 ; 4-byte Folded Reload ; VMEM: s_waitcnt vmcnt(0) ; VMEM: v_readfirstlane_b32 s[[S_RELOAD_SAVEEXEC_HI:[0-9]+]], v[[V_RELOAD_SAVEEXEC_HI]] ; GCN: s_or_b64 exec, exec, s{{\[}}[[S_RELOAD_SAVEEXEC_LO]]:[[S_RELOAD_SAVEEXEC_HI]]{{\]}} -; GCN: buffer_load_dword v[[VAL_END:[0-9]+]], off, s[0:3], s7 offset:[[VAL_SUB_OFFSET]] ; 4-byte Folded Reload +; GCN: buffer_load_dword v[[VAL_END:[0-9]+]], off, s[0:3], 0 offset:[[VAL_SUB_OFFSET]] ; 4-byte Folded Reload ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v[[VAL_END]] define amdgpu_kernel void @divergent_loop(i32 addrspace(1)* %out) #0 { @@ -179,16 +179,16 @@ ; GCN: s_xor_b64 s{{\[}}[[SAVEEXEC_LO]]:[[SAVEEXEC_HI]]{{\]}}, s{{\[}}[[ANDEXEC_LO]]:[[ANDEXEC_HI]]{{\]}}, s{{\[}}[[SAVEEXEC_LO]]:[[SAVEEXEC_HI]]{{\]}} ; Spill load -; GCN: buffer_store_dword [[LOAD0]], off, s[0:3], s7 offset:[[LOAD0_OFFSET:[0-9]+]] ; 4-byte Folded Spill +; GCN: buffer_store_dword [[LOAD0]], off, s[0:3], 0 offset:[[LOAD0_OFFSET:[0-9]+]] ; 4-byte Folded Spill ; Spill saved exec ; VGPR: v_writelane_b32 [[SPILL_VGPR:v[0-9]+]], s[[SAVEEXEC_LO]], [[SAVEEXEC_LO_LANE:[0-9]+]] ; VGPR: v_writelane_b32 [[SPILL_VGPR]], 
s[[SAVEEXEC_HI]], [[SAVEEXEC_HI_LANE:[0-9]+]] ; VMEM: v_mov_b32_e32 v[[V_SAVEEXEC_LO:[0-9]+]], s[[SAVEEXEC_LO]] -; VMEM: buffer_store_dword v[[V_SAVEEXEC_LO]], off, s[0:3], s7 offset:[[SAVEEXEC_LO_OFFSET:[0-9]+]] ; 4-byte Folded Spill +; VMEM: buffer_store_dword v[[V_SAVEEXEC_LO]], off, s[0:3], 0 offset:[[SAVEEXEC_LO_OFFSET:[0-9]+]] ; 4-byte Folded Spill ; VMEM: v_mov_b32_e32 v[[V_SAVEEXEC_HI:[0-9]+]], s[[SAVEEXEC_HI]] -; VMEM: buffer_store_dword v[[V_SAVEEXEC_HI]], off, s[0:3], s7 offset:[[SAVEEXEC_HI_OFFSET:[0-9]+]] ; 4-byte Folded Spill +; VMEM: buffer_store_dword v[[V_SAVEEXEC_HI]], off, s[0:3], 0 offset:[[SAVEEXEC_HI_OFFSET:[0-9]+]] ; 4-byte Folded Spill ; GCN: s_mov_b64 exec, [[CMP0]] @@ -201,18 +201,18 @@ ; VGPR: v_readlane_b32 s[[FLOW_S_RELOAD_SAVEEXEC_HI:[0-9]+]], [[SPILL_VGPR]], [[SAVEEXEC_HI_LANE]] -; VMEM: buffer_load_dword v[[FLOW_V_RELOAD_SAVEEXEC_LO:[0-9]+]], off, s[0:3], s7 offset:[[SAVEEXEC_LO_OFFSET]] +; VMEM: buffer_load_dword v[[FLOW_V_RELOAD_SAVEEXEC_LO:[0-9]+]], off, s[0:3], 0 offset:[[SAVEEXEC_LO_OFFSET]] ; VMEM: s_waitcnt vmcnt(0) ; VMEM: v_readfirstlane_b32 s[[FLOW_S_RELOAD_SAVEEXEC_LO:[0-9]+]], v[[FLOW_V_RELOAD_SAVEEXEC_LO]] -; VMEM: buffer_load_dword v[[FLOW_V_RELOAD_SAVEEXEC_HI:[0-9]+]], off, s[0:3], s7 offset:[[SAVEEXEC_HI_OFFSET]] ; 4-byte Folded Reload +; VMEM: buffer_load_dword v[[FLOW_V_RELOAD_SAVEEXEC_HI:[0-9]+]], off, s[0:3], 0 offset:[[SAVEEXEC_HI_OFFSET]] ; 4-byte Folded Reload ; VMEM: s_waitcnt vmcnt(0) ; VMEM: v_readfirstlane_b32 s[[FLOW_S_RELOAD_SAVEEXEC_HI:[0-9]+]], v[[FLOW_V_RELOAD_SAVEEXEC_HI]] ; GCN: s_or_saveexec_b64 s{{\[}}[[FLOW_S_RELOAD_SAVEEXEC_LO]]:[[FLOW_S_RELOAD_SAVEEXEC_HI]]{{\]}}, s{{\[}}[[FLOW_S_RELOAD_SAVEEXEC_LO]]:[[FLOW_S_RELOAD_SAVEEXEC_HI]]{{\]}} ; Regular spill value restored after exec modification -; GCN: buffer_load_dword [[FLOW_VAL:v[0-9]+]], off, s[0:3], s7 offset:[[FLOW_VAL_OFFSET:[0-9]+]] ; 4-byte Folded Reload +; GCN: buffer_load_dword [[FLOW_VAL:v[0-9]+]], off, s[0:3], 0 offset:[[FLOW_VAL_OFFSET:[0-9]+]] ; 4-byte Folded Reload ; Spill saved exec @@ -221,26 +221,26 @@ ; VMEM: v_mov_b32_e32 v[[FLOW_V_SAVEEXEC_LO:[0-9]+]], s[[FLOW_S_RELOAD_SAVEEXEC_LO]] -; VMEM: buffer_store_dword v[[FLOW_V_SAVEEXEC_LO]], off, s[0:3], s7 offset:[[FLOW_SAVEEXEC_LO_OFFSET:[0-9]+]] ; 4-byte Folded Spill +; VMEM: buffer_store_dword v[[FLOW_V_SAVEEXEC_LO]], off, s[0:3], 0 offset:[[FLOW_SAVEEXEC_LO_OFFSET:[0-9]+]] ; 4-byte Folded Spill ; VMEM: v_mov_b32_e32 v[[FLOW_V_SAVEEXEC_HI:[0-9]+]], s[[FLOW_S_RELOAD_SAVEEXEC_HI]] -; VMEM: buffer_store_dword v[[FLOW_V_SAVEEXEC_HI]], off, s[0:3], s7 offset:[[FLOW_SAVEEXEC_HI_OFFSET:[0-9]+]] ; 4-byte Folded Spill +; VMEM: buffer_store_dword v[[FLOW_V_SAVEEXEC_HI]], off, s[0:3], 0 offset:[[FLOW_SAVEEXEC_HI_OFFSET:[0-9]+]] ; 4-byte Folded Spill -; GCN: buffer_store_dword [[FLOW_VAL]], off, s[0:3], s7 offset:[[RESULT_OFFSET:[0-9]+]] ; 4-byte Folded Spill +; GCN: buffer_store_dword [[FLOW_VAL]], off, s[0:3], 0 offset:[[RESULT_OFFSET:[0-9]+]] ; 4-byte Folded Spill ; GCN: s_xor_b64 exec, exec, s{{\[}}[[FLOW_S_RELOAD_SAVEEXEC_LO]]:[[FLOW_S_RELOAD_SAVEEXEC_HI]]{{\]}} ; GCN-NEXT: s_cbranch_execz [[ENDIF:BB[0-9]+_[0-9]+]] ; GCN: ; %bb.{{[0-9]+}}: ; %if ; GCN: ds_read_b32 -; GCN: buffer_load_dword v[[LOAD0_RELOAD:[0-9]+]], off, s[0:3], s7 offset:[[LOAD0_OFFSET]] ; 4-byte Folded Reload +; GCN: buffer_load_dword v[[LOAD0_RELOAD:[0-9]+]], off, s[0:3], 0 offset:[[LOAD0_OFFSET]] ; 4-byte Folded Reload ; GCN: v_add_i32_e32 [[ADD:v[0-9]+]], vcc, v{{[0-9]+}}, v[[LOAD0_RELOAD]] -; GCN: buffer_store_dword [[ADD]], off, s[0:3], s7 
offset:[[RESULT_OFFSET]] ; 4-byte Folded Spill +; GCN: buffer_store_dword [[ADD]], off, s[0:3], 0 offset:[[RESULT_OFFSET]] ; 4-byte Folded Spill ; GCN-NEXT: s_branch [[ENDIF:BB[0-9]+_[0-9]+]] ; GCN: [[ELSE]]: ; %else -; GCN: buffer_load_dword v[[LOAD0_RELOAD:[0-9]+]], off, s[0:3], s7 offset:[[LOAD0_OFFSET]] ; 4-byte Folded Reload +; GCN: buffer_load_dword v[[LOAD0_RELOAD:[0-9]+]], off, s[0:3], 0 offset:[[LOAD0_OFFSET]] ; 4-byte Folded Reload ; GCN: v_subrev_i32_e32 [[SUB:v[0-9]+]], vcc, v{{[0-9]+}}, v[[LOAD0_RELOAD]] -; GCN: buffer_store_dword [[ADD]], off, s[0:3], s7 offset:[[FLOW_RESULT_OFFSET:[0-9]+]] ; 4-byte Folded Spill +; GCN: buffer_store_dword [[ADD]], off, s[0:3], 0 offset:[[FLOW_RESULT_OFFSET:[0-9]+]] ; 4-byte Folded Spill ; GCN-NEXT: s_branch [[FLOW]] ; GCN: [[ENDIF]]: @@ -248,17 +248,17 @@ ; VGPR: v_readlane_b32 s[[S_RELOAD_SAVEEXEC_HI:[0-9]+]], [[SPILL_VGPR]], [[FLOW_SAVEEXEC_HI_LANE]] -; VMEM: buffer_load_dword v[[V_RELOAD_SAVEEXEC_LO:[0-9]+]], off, s[0:3], s7 offset:[[FLOW_SAVEEXEC_LO_OFFSET]] ; 4-byte Folded Reload +; VMEM: buffer_load_dword v[[V_RELOAD_SAVEEXEC_LO:[0-9]+]], off, s[0:3], 0 offset:[[FLOW_SAVEEXEC_LO_OFFSET]] ; 4-byte Folded Reload ; VMEM: s_waitcnt vmcnt(0) ; VMEM: v_readfirstlane_b32 s[[S_RELOAD_SAVEEXEC_LO:[0-9]+]], v[[V_RELOAD_SAVEEXEC_LO]] -; VMEM: buffer_load_dword v[[V_RELOAD_SAVEEXEC_HI:[0-9]+]], off, s[0:3], s7 offset:[[FLOW_SAVEEXEC_HI_OFFSET]] ; 4-byte Folded Reload +; VMEM: buffer_load_dword v[[V_RELOAD_SAVEEXEC_HI:[0-9]+]], off, s[0:3], 0 offset:[[FLOW_SAVEEXEC_HI_OFFSET]] ; 4-byte Folded Reload ; VMEM: s_waitcnt vmcnt(0) ; VMEM: v_readfirstlane_b32 s[[S_RELOAD_SAVEEXEC_HI:[0-9]+]], v[[V_RELOAD_SAVEEXEC_HI]] ; GCN: s_or_b64 exec, exec, s{{\[}}[[S_RELOAD_SAVEEXEC_LO]]:[[S_RELOAD_SAVEEXEC_HI]]{{\]}} -; GCN: buffer_load_dword v[[RESULT:[0-9]+]], off, s[0:3], s7 offset:[[RESULT_OFFSET]] ; 4-byte Folded Reload +; GCN: buffer_load_dword v[[RESULT:[0-9]+]], off, s[0:3], 0 offset:[[RESULT_OFFSET]] ; 4-byte Folded Reload ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v[[RESULT]] define amdgpu_kernel void @divergent_if_else_endif(i32 addrspace(1)* %out) #0 { entry: diff --git a/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll b/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll --- a/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll +++ b/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll @@ -169,14 +169,15 @@ ; GCN-LABEL: v3i16_registers: ; GCN: ; %bb.0: ; %entry ; GCN-NEXT: s_load_dword s4, s[4:5], 0x0 -; GCN-NEXT: s_mov_b32 s33, s9 -; GCN-NEXT: s_add_u32 flat_scratch_lo, s6, s33 +; GCN-NEXT: s_add_u32 flat_scratch_lo, s6, s9 ; GCN-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 -; GCN-NEXT: s_mov_b32 s32, s33 +; GCN-NEXT: s_add_u32 s0, s0, s9 +; GCN-NEXT: s_addc_u32 s1, s1, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_and_b32 s4, 1, s4 ; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], s4, 1 ; GCN-NEXT: s_and_b64 vcc, exec, s[4:5] +; GCN-NEXT: s_mov_b32 s32, 0 ; GCN-NEXT: s_cbranch_vccz BB4_2 ; GCN-NEXT: ; %bb.1: ; GCN-NEXT: s_mov_b32 s4, 0 @@ -213,14 +214,15 @@ ; GCN-LABEL: v3f16_registers: ; GCN: ; %bb.0: ; %entry ; GCN-NEXT: s_load_dword s4, s[4:5], 0x0 -; GCN-NEXT: s_mov_b32 s33, s9 -; GCN-NEXT: s_add_u32 flat_scratch_lo, s6, s33 +; GCN-NEXT: s_add_u32 flat_scratch_lo, s6, s9 ; GCN-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 -; GCN-NEXT: s_mov_b32 s32, s33 +; GCN-NEXT: s_add_u32 s0, s0, s9 +; GCN-NEXT: s_addc_u32 s1, s1, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_and_b32 s4, 1, s4 ; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], s4, 1 ; 
GCN-NEXT: s_and_b64 vcc, exec, s[4:5] +; GCN-NEXT: s_mov_b32 s32, 0 ; GCN-NEXT: s_cbranch_vccz BB5_2 ; GCN-NEXT: ; %bb.1: ; GCN-NEXT: s_mov_b32 s4, 0 diff --git a/llvm/test/CodeGen/AMDGPU/extload-private.ll b/llvm/test/CodeGen/AMDGPU/extload-private.ll --- a/llvm/test/CodeGen/AMDGPU/extload-private.ll +++ b/llvm/test/CodeGen/AMDGPU/extload-private.ll @@ -2,7 +2,7 @@ ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-promote-alloca -amdgpu-sroa=0 -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s ; FUNC-LABEL: {{^}}load_i8_sext_private: -; SI: buffer_load_sbyte v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:4{{$}} +; SI: buffer_load_sbyte v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], 0 offset:4{{$}} define amdgpu_kernel void @load_i8_sext_private(i32 addrspace(1)* %out) { entry: %tmp0 = alloca i8, addrspace(5) @@ -13,7 +13,7 @@ } ; FUNC-LABEL: {{^}}load_i8_zext_private: -; SI: buffer_load_ubyte v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:4{{$}} +; SI: buffer_load_ubyte v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], 0 offset:4{{$}} define amdgpu_kernel void @load_i8_zext_private(i32 addrspace(1)* %out) { entry: %tmp0 = alloca i8, addrspace(5) @@ -24,7 +24,7 @@ } ; FUNC-LABEL: {{^}}load_i16_sext_private: -; SI: buffer_load_sshort v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:4{{$}} +; SI: buffer_load_sshort v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], 0 offset:4{{$}} define amdgpu_kernel void @load_i16_sext_private(i32 addrspace(1)* %out) { entry: %tmp0 = alloca i16, addrspace(5) @@ -35,7 +35,7 @@ } ; FUNC-LABEL: {{^}}load_i16_zext_private: -; SI: buffer_load_ushort v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:4{{$}} +; SI: buffer_load_ushort v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], 0 offset:4{{$}} define amdgpu_kernel void @load_i16_zext_private(i32 addrspace(1)* %out) { entry: %tmp0 = alloca i16, addrspace(5) diff --git a/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.private.ll b/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.private.ll --- a/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.private.ll +++ b/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.private.ll @@ -9,8 +9,8 @@ ; GFX7-ALIGNED: ; %bb.0: ; GFX7-ALIGNED-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-ALIGNED-NEXT: v_add_i32_e32 v1, vcc, 2, v0 -; GFX7-ALIGNED-NEXT: buffer_load_ushort v0, v0, s[0:3], s33 offen -; GFX7-ALIGNED-NEXT: buffer_load_ushort v1, v1, s[0:3], s33 offen +; GFX7-ALIGNED-NEXT: buffer_load_ushort v0, v0, s[0:3], 0 offen +; GFX7-ALIGNED-NEXT: buffer_load_ushort v1, v1, s[0:3], 0 offen ; GFX7-ALIGNED-NEXT: s_waitcnt vmcnt(0) ; GFX7-ALIGNED-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX7-ALIGNED-NEXT: v_or_b32_e32 v0, v0, v1 @@ -20,8 +20,8 @@ ; GFX7-UNALIGNED: ; %bb.0: ; GFX7-UNALIGNED-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-UNALIGNED-NEXT: v_add_i32_e32 v1, vcc, 2, v0 -; GFX7-UNALIGNED-NEXT: buffer_load_ushort v0, v0, s[0:3], s33 offen -; GFX7-UNALIGNED-NEXT: buffer_load_ushort v1, v1, s[0:3], s33 offen +; GFX7-UNALIGNED-NEXT: buffer_load_ushort v0, v0, s[0:3], 0 offen +; GFX7-UNALIGNED-NEXT: buffer_load_ushort v1, v1, s[0:3], 0 offen ; GFX7-UNALIGNED-NEXT: s_waitcnt vmcnt(0) ; GFX7-UNALIGNED-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX7-UNALIGNED-NEXT: v_or_b32_e32 v0, v0, v1 @@ -30,8 +30,8 @@ ; GFX9-LABEL: private_load_2xi16_align2: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: buffer_load_ushort v1, v0, s[0:3], s33 offen -; GFX9-NEXT: buffer_load_ushort v0, v0, s[0:3], s33 offen offset:2 +; 
GFX9-NEXT: buffer_load_ushort v1, v0, s[0:3], 0 offen +; GFX9-NEXT: buffer_load_ushort v0, v0, s[0:3], 0 offen offset:2 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -53,8 +53,8 @@ ; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v3, 1 ; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v0, 2 ; GFX7-ALIGNED-NEXT: v_add_i32_e32 v2, vcc, 2, v1 -; GFX7-ALIGNED-NEXT: buffer_store_short v3, v1, s[0:3], s33 offen -; GFX7-ALIGNED-NEXT: buffer_store_short v0, v2, s[0:3], s33 offen +; GFX7-ALIGNED-NEXT: buffer_store_short v3, v1, s[0:3], 0 offen +; GFX7-ALIGNED-NEXT: buffer_store_short v0, v2, s[0:3], 0 offen ; GFX7-ALIGNED-NEXT: s_waitcnt vmcnt(0) ; GFX7-ALIGNED-NEXT: s_setpc_b64 s[30:31] ; @@ -64,8 +64,8 @@ ; GFX7-UNALIGNED-NEXT: v_mov_b32_e32 v3, 1 ; GFX7-UNALIGNED-NEXT: v_mov_b32_e32 v0, 2 ; GFX7-UNALIGNED-NEXT: v_add_i32_e32 v2, vcc, 2, v1 -; GFX7-UNALIGNED-NEXT: buffer_store_short v3, v1, s[0:3], s33 offen -; GFX7-UNALIGNED-NEXT: buffer_store_short v0, v2, s[0:3], s33 offen +; GFX7-UNALIGNED-NEXT: buffer_store_short v3, v1, s[0:3], 0 offen +; GFX7-UNALIGNED-NEXT: buffer_store_short v0, v2, s[0:3], 0 offen ; GFX7-UNALIGNED-NEXT: s_waitcnt vmcnt(0) ; GFX7-UNALIGNED-NEXT: s_setpc_b64 s[30:31] ; @@ -73,9 +73,9 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, 1 -; GFX9-NEXT: buffer_store_short v0, v1, s[0:3], s33 offen -; GFX9-NEXT: v_mov_b32_e32 v0, 2 -; GFX9-NEXT: buffer_store_short v0, v1, s[0:3], s33 offen offset:2 +; GFX9-NEXT: v_mov_b32_e32 v2, 2 +; GFX9-NEXT: buffer_store_short v0, v1, s[0:3], 0 offen +; GFX9-NEXT: buffer_store_short v2, v1, s[0:3], 0 offen offset:2 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep.r = getelementptr i16, i16 addrspace(5)* %r, i64 1 @@ -89,36 +89,35 @@ ; GFX7-ALIGNED-LABEL: private_load_2xi16_align1: ; GFX7-ALIGNED: ; %bb.0: ; GFX7-ALIGNED-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-ALIGNED-NEXT: v_add_i32_e32 v2, vcc, 1, v0 -; GFX7-ALIGNED-NEXT: buffer_load_ubyte v2, v2, s[0:3], s33 offen ; GFX7-ALIGNED-NEXT: v_add_i32_e32 v1, vcc, 2, v0 -; GFX7-ALIGNED-NEXT: v_add_i32_e32 v3, vcc, 3, v0 -; GFX7-ALIGNED-NEXT: buffer_load_ubyte v3, v3, s[0:3], s33 offen -; GFX7-ALIGNED-NEXT: buffer_load_ubyte v1, v1, s[0:3], s33 offen -; GFX7-ALIGNED-NEXT: buffer_load_ubyte v0, v0, s[0:3], s33 offen -; GFX7-ALIGNED-NEXT: s_waitcnt vmcnt(3) -; GFX7-ALIGNED-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; GFX7-ALIGNED-NEXT: v_add_i32_e32 v2, vcc, 1, v0 +; GFX7-ALIGNED-NEXT: buffer_load_ubyte v3, v0, s[0:3], 0 offen +; GFX7-ALIGNED-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; GFX7-ALIGNED-NEXT: buffer_load_ubyte v0, v0, s[0:3], 0 offen +; GFX7-ALIGNED-NEXT: buffer_load_ubyte v2, v2, s[0:3], 0 offen +; GFX7-ALIGNED-NEXT: buffer_load_ubyte v1, v1, s[0:3], 0 offen ; GFX7-ALIGNED-NEXT: s_waitcnt vmcnt(2) -; GFX7-ALIGNED-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; GFX7-ALIGNED-NEXT: v_lshlrev_b32_e32 v0, 8, v0 ; GFX7-ALIGNED-NEXT: s_waitcnt vmcnt(1) -; GFX7-ALIGNED-NEXT: v_or_b32_e32 v1, v3, v1 +; GFX7-ALIGNED-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; GFX7-ALIGNED-NEXT: s_waitcnt vmcnt(0) -; GFX7-ALIGNED-NEXT: v_or_b32_e32 v0, v2, v0 -; GFX7-ALIGNED-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX7-ALIGNED-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7-ALIGNED-NEXT: v_or_b32_e32 v2, v2, v3 +; GFX7-ALIGNED-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX7-ALIGNED-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX7-ALIGNED-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-UNALIGNED-LABEL: private_load_2xi16_align1: ; GFX7-UNALIGNED: ; %bb.0: ; 
GFX7-UNALIGNED-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-UNALIGNED-NEXT: buffer_load_dword v0, v0, s[0:3], s33 offen +; GFX7-UNALIGNED-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen ; GFX7-UNALIGNED-NEXT: s_waitcnt vmcnt(0) ; GFX7-UNALIGNED-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: private_load_2xi16_align1: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: buffer_load_dword v0, v0, s[0:3], s33 offen +; GFX9-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen ; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff ; GFX9-NEXT: s_mov_b32 s4, 0xffff ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -141,15 +140,15 @@ ; GFX7-ALIGNED: ; %bb.0: ; GFX7-ALIGNED-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v3, 1 -; GFX7-ALIGNED-NEXT: buffer_store_byte v3, v1, s[0:3], s33 offen ; GFX7-ALIGNED-NEXT: v_add_i32_e32 v2, vcc, 2, v1 -; GFX7-ALIGNED-NEXT: v_add_i32_e32 v3, vcc, 1, v1 -; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v4, 0 +; GFX7-ALIGNED-NEXT: v_add_i32_e32 v4, vcc, 1, v1 +; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v5, 0 +; GFX7-ALIGNED-NEXT: buffer_store_byte v3, v1, s[0:3], 0 offen +; GFX7-ALIGNED-NEXT: buffer_store_byte v5, v4, s[0:3], 0 offen ; GFX7-ALIGNED-NEXT: v_add_i32_e32 v1, vcc, 3, v1 ; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v0, 2 -; GFX7-ALIGNED-NEXT: buffer_store_byte v4, v3, s[0:3], s33 offen -; GFX7-ALIGNED-NEXT: buffer_store_byte v4, v1, s[0:3], s33 offen -; GFX7-ALIGNED-NEXT: buffer_store_byte v0, v2, s[0:3], s33 offen +; GFX7-ALIGNED-NEXT: buffer_store_byte v5, v1, s[0:3], 0 offen +; GFX7-ALIGNED-NEXT: buffer_store_byte v0, v2, s[0:3], 0 offen ; GFX7-ALIGNED-NEXT: s_waitcnt vmcnt(0) ; GFX7-ALIGNED-NEXT: s_setpc_b64 s[30:31] ; @@ -157,7 +156,7 @@ ; GFX7-UNALIGNED: ; %bb.0: ; GFX7-UNALIGNED-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-UNALIGNED-NEXT: v_mov_b32_e32 v0, 0x20001 -; GFX7-UNALIGNED-NEXT: buffer_store_dword v0, v1, s[0:3], s33 offen +; GFX7-UNALIGNED-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX7-UNALIGNED-NEXT: s_waitcnt vmcnt(0) ; GFX7-UNALIGNED-NEXT: s_setpc_b64 s[30:31] ; @@ -165,7 +164,7 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, 0x20001 -; GFX9-NEXT: buffer_store_dword v0, v1, s[0:3], s33 offen +; GFX9-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep.r = getelementptr i16, i16 addrspace(5)* %r, i64 1 @@ -186,21 +185,21 @@ ; GFX7-ALIGNED-LABEL: private_load_2xi16_align4: ; GFX7-ALIGNED: ; %bb.0: ; GFX7-ALIGNED-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-ALIGNED-NEXT: buffer_load_dword v0, v0, s[0:3], s33 offen +; GFX7-ALIGNED-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen ; GFX7-ALIGNED-NEXT: s_waitcnt vmcnt(0) ; GFX7-ALIGNED-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-UNALIGNED-LABEL: private_load_2xi16_align4: ; GFX7-UNALIGNED: ; %bb.0: ; GFX7-UNALIGNED-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-UNALIGNED-NEXT: buffer_load_dword v0, v0, s[0:3], s33 offen +; GFX7-UNALIGNED-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen ; GFX7-UNALIGNED-NEXT: s_waitcnt vmcnt(0) ; GFX7-UNALIGNED-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: private_load_2xi16_align4: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: buffer_load_dword v0, v0, s[0:3], s33 offen +; GFX9-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen ; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff ; GFX9-NEXT: s_mov_b32 s4, 0xffff ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -233,7 +232,7 @@ ; GCN: ; %bb.0: ; GCN-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, 0x20001 -; GCN-NEXT: buffer_store_dword v0, v1, s[0:3], s33 offen +; GCN-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] %gep.r = getelementptr i16, i16 addrspace(5)* %r, i64 1 diff --git a/llvm/test/CodeGen/AMDGPU/fold-fi-mubuf.mir b/llvm/test/CodeGen/AMDGPU/fold-fi-mubuf.mir --- a/llvm/test/CodeGen/AMDGPU/fold-fi-mubuf.mir +++ b/llvm/test/CodeGen/AMDGPU/fold-fi-mubuf.mir @@ -1,8 +1,9 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs -run-pass si-fold-operands,dead-mi-elimination %s -o - | FileCheck -check-prefix=GCN %s +# Kernels have no FP --- -name: no_fold_fi_non_stack_rsrc_soffset +name: kernel_no_fold_fi_non_stack_rsrc_and_soffset tracksRegLiveness: true frameInfo: maxAlignment: 4 @@ -12,14 +13,11 @@ machineFunctionInfo: isEntryFunction: true scratchRSrcReg: '$sgpr96_sgpr97_sgpr98_sgpr99' - scratchWaveOffsetReg: '$sgpr6' - frameOffsetReg: '$sgpr6' - stackPtrOffsetReg: '$sgpr6' body: | bb.0: liveins: $sgpr12_sgpr13_sgpr14_sgpr15 - ; GCN-LABEL: name: no_fold_fi_non_stack_rsrc_soffset + ; GCN-LABEL: name: kernel_no_fold_fi_non_stack_rsrc_and_soffset ; GCN: liveins: $sgpr12_sgpr13_sgpr14_sgpr15 ; GCN: [[COPY:%[0-9]+]]:sgpr_128 = COPY $sgpr12_sgpr13_sgpr14_sgpr15 ; GCN: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec @@ -36,7 +34,7 @@ ... --- -name: no_fold_fi_non_stack_rsrc +name: kernel_no_fold_fi_non_stack_rsrc tracksRegLiveness: true frameInfo: maxAlignment: 4 @@ -46,14 +44,140 @@ machineFunctionInfo: isEntryFunction: true scratchRSrcReg: '$sgpr96_sgpr97_sgpr98_sgpr99' - scratchWaveOffsetReg: '$sgpr6' - frameOffsetReg: '$sgpr6' +body: | + bb.0: + liveins: $sgpr12_sgpr13_sgpr14_sgpr15 + + ; GCN-LABEL: name: kernel_no_fold_fi_non_stack_rsrc + ; GCN: liveins: $sgpr12_sgpr13_sgpr14_sgpr15 + ; GCN: [[COPY:%[0-9]+]]:sgpr_128 = COPY $sgpr12_sgpr13_sgpr14_sgpr15 + ; GCN: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec + ; GCN: [[BUFFER_LOAD_DWORD_IDXEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_IDXEN [[V_MOV_B32_e32_]], [[COPY]], 0, 0, 0, 0, 0, 0, 0, implicit $exec + ; GCN: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_IDXEN]] + ; GCN: SI_RETURN_TO_EPILOG $vgpr0 + %0:sgpr_128 = COPY $sgpr12_sgpr13_sgpr14_sgpr15 + %2:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec + %3:vgpr_32 = BUFFER_LOAD_DWORD_IDXEN %2, %0, 0, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr0 = COPY %3 + SI_RETURN_TO_EPILOG $vgpr0 + +... 
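Note: the kernel MIR cases in this file exercise the fold pass directly, but the same zero-soffset selection they depend on can be reproduced from plain IR, matching the extload-private.ll updates earlier in this patch. A minimal sketch (function and value names are illustrative, not taken from any test file):

  ; Sketch only; assumes the amdgcn llc invocations used elsewhere in this patch.
  define amdgpu_kernel void @kernel_private_store() {
  entry:
    %tmp = alloca i32, addrspace(5)
    ; volatile keeps the stack access from being optimized away
    store volatile i32 7, i32 addrspace(5)* %tmp
    ; expected selection in an entry function (soffset is a literal 0):
    ;   buffer_store_dword v{{[0-9]+}}, off, s[0:3], 0 offset:{{[0-9]+}}
    ret void
  }
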
+ +--- +name: kernel_no_fold_fi_non_stack_soffset +tracksRegLiveness: true +frameInfo: + maxAlignment: 4 + localFrameSize: 4 +stack: + - { id: 0, size: 4, alignment: 4, local-offset: 0 } +machineFunctionInfo: + isEntryFunction: true + scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' +body: | + bb.0: + + ; GCN-LABEL: name: kernel_no_fold_fi_non_stack_soffset + ; GCN: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec + ; GCN: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 7, implicit $exec + ; GCN: BUFFER_STORE_DWORD_OFFEN [[V_MOV_B32_e32_1]], [[V_MOV_B32_e32_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec + ; GCN: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[V_MOV_B32_e32_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec + ; GCN: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] + ; GCN: S_ENDPGM 0, implicit $vgpr0 + %0:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec + %1:vgpr_32 = V_MOV_B32_e32 7, implicit $exec + %2:sreg_32_xm0 = S_MOV_B32 0 + + BUFFER_STORE_DWORD_OFFEN %1:vgpr_32, %0:vgpr_32, $sgpr0_sgpr1_sgpr2_sgpr3, %2, 0, 0, 0, 0, 0, 0, implicit $exec + %3:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %0:vgpr_32, $sgpr0_sgpr1_sgpr2_sgpr3, %2, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr0 = COPY %3 + S_ENDPGM 0, implicit $vgpr0 + +... + +--- +name: kernel_fold_fi_mubuf +tracksRegLiveness: true +frameInfo: + maxAlignment: 4 + localFrameSize: 4 +stack: + - { id: 0, size: 4, alignment: 4, local-offset: 0 } +machineFunctionInfo: + isEntryFunction: true + scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' +body: | + bb.0: + + ; GCN-LABEL: name: kernel_fold_fi_mubuf + ; GCN: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 7, implicit $exec + ; GCN: BUFFER_STORE_DWORD_OFFEN [[V_MOV_B32_e32_]], %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec + ; GCN: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec + ; GCN: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] + ; GCN: S_ENDPGM 0, implicit $vgpr0 + %0:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec + %1:vgpr_32 = V_MOV_B32_e32 7, implicit $exec + + BUFFER_STORE_DWORD_OFFEN %1:vgpr_32, %0:vgpr_32, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec + %2:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %0:vgpr_32, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr0 = COPY %2 + S_ENDPGM 0, implicit $vgpr0 + +... 
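For a non-entry function the analogous IR sketch differs only in the calling convention; per the comment on the function tests below, the soffset should then be the unswizzled stack pointer copy in s32 rather than a literal 0 (compare the s32-based checks in frame-index-elimination.ll later in this patch). Again a hedged sketch with illustrative names:

  define void @func_private_store() {
  entry:
    %tmp = alloca i32, addrspace(5)
    store volatile i32 7, i32 addrspace(5)* %tmp
    ; expected selection in a non-entry function (soffset is the SP copy):
    ;   buffer_store_dword v{{[0-9]+}}, off, s[0:3], s32 offset:{{[0-9]+}}
    ret void
  }
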
+ + +# Functions have an unswizzled SP/FP relative to the wave offset +--- +name: function_no_fold_fi_non_stack_rsrc_and_soffset +tracksRegLiveness: true +frameInfo: + maxAlignment: 4 + localFrameSize: 4 +stack: + - { id: 0, size: 4, alignment: 4, local-offset: 0 } +machineFunctionInfo: + isEntryFunction: false + scratchRSrcReg: '$sgpr96_sgpr97_sgpr98_sgpr99' + frameOffsetReg: '$sgpr32' + stackPtrOffsetReg: '$sgpr32' +body: | + bb.0: + liveins: $sgpr12_sgpr13_sgpr14_sgpr15 + + ; GCN-LABEL: name: function_no_fold_fi_non_stack_rsrc_and_soffset + ; GCN: liveins: $sgpr12_sgpr13_sgpr14_sgpr15 + ; GCN: [[COPY:%[0-9]+]]:sgpr_128 = COPY $sgpr12_sgpr13_sgpr14_sgpr15 + ; GCN: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec + ; GCN: [[BUFFER_LOAD_DWORD_IDXEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_IDXEN [[V_MOV_B32_e32_]], [[COPY]], 0, 0, 0, 0, 0, 0, 0, implicit $exec + ; GCN: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_IDXEN]] + ; GCN: SI_RETURN_TO_EPILOG $vgpr0 + %0:sgpr_128 = COPY $sgpr12_sgpr13_sgpr14_sgpr15 + %1:sreg_32_xm0 = S_MOV_B32 0 + %2:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec + %3:vgpr_32 = BUFFER_LOAD_DWORD_IDXEN %2, %0, %1, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr0 = COPY %3 + SI_RETURN_TO_EPILOG $vgpr0 + +... + +--- +name: function_no_fold_fi_non_stack_rsrc +tracksRegLiveness: true +frameInfo: + maxAlignment: 4 + localFrameSize: 4 +stack: + - { id: 0, size: 4, alignment: 4, local-offset: 0 } +machineFunctionInfo: + isEntryFunction: false + scratchRSrcReg: '$sgpr96_sgpr97_sgpr98_sgpr99' + frameOffsetReg: '$sgpr32' stackPtrOffsetReg: '$sgpr32' body: | bb.0: liveins: $sgpr12_sgpr13_sgpr14_sgpr15 - ; GCN-LABEL: name: no_fold_fi_non_stack_rsrc + ; GCN-LABEL: name: function_no_fold_fi_non_stack_rsrc ; GCN: liveins: $sgpr12_sgpr13_sgpr14_sgpr15 ; GCN: [[COPY:%[0-9]+]]:sgpr_128 = COPY $sgpr12_sgpr13_sgpr14_sgpr15 ; GCN: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec @@ -68,9 +192,8 @@ ... -# Offset is from global scratch wave offset. 
--- -name: fold_fi_mubuf_scratch_scratch_wave_offset +name: function_no_fold_fi_non_stack_soffset tracksRegLiveness: true frameInfo: maxAlignment: 4 @@ -78,14 +201,14 @@ stack: - { id: 0, size: 4, alignment: 4, local-offset: 0 } machineFunctionInfo: - isEntryFunction: true + isEntryFunction: false scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' - scratchWaveOffsetReg: '$sgpr33' + frameOffsetReg: '$sgpr32' stackPtrOffsetReg: '$sgpr32' body: | bb.0: - ; GCN-LABEL: name: fold_fi_mubuf_scratch_scratch_wave_offset + ; GCN-LABEL: name: function_no_fold_fi_non_stack_soffset ; GCN: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 7, implicit $exec ; GCN: BUFFER_STORE_DWORD_OFFEN [[V_MOV_B32_e32_]], %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, implicit $exec ; GCN: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, implicit $exec @@ -94,15 +217,15 @@ %0:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec %1:vgpr_32 = V_MOV_B32_e32 7, implicit $exec - BUFFER_STORE_DWORD_OFFEN %1:vgpr_32, %0:vgpr_32, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr33, 0, 0, 0, 0, 0, 0, implicit $exec - %2:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %0:vgpr_32, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr33, 0, 0, 0, 0, 0, 0, implicit $exec + BUFFER_STORE_DWORD_OFFEN %1:vgpr_32, %0:vgpr_32, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec + %2:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %0:vgpr_32, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec $vgpr0 = COPY %2 S_ENDPGM 0, implicit $vgpr0 ... --- -name: no_fold_fi_mubuf_scratch_sp_offset +name: function_fold_fi_mubuf_wave_relative tracksRegLiveness: true frameInfo: maxAlignment: 4 @@ -110,14 +233,46 @@ stack: - { id: 0, size: 4, alignment: 4, local-offset: 0 } machineFunctionInfo: - isEntryFunction: true + isEntryFunction: false + scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' + frameOffsetReg: '$sgpr32' + stackPtrOffsetReg: '$sgpr32' +body: | + bb.0: + + ; GCN-LABEL: name: function_fold_fi_mubuf_wave_relative + ; GCN: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 7, implicit $exec + ; GCN: BUFFER_STORE_DWORD_OFFEN [[V_MOV_B32_e32_]], %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, implicit $exec + ; GCN: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, implicit $exec + ; GCN: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] + ; GCN: S_ENDPGM 0, implicit $vgpr0 + %0:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec + %1:vgpr_32 = V_MOV_B32_e32 7, implicit $exec + + BUFFER_STORE_DWORD_OFFEN %1:vgpr_32, %0:vgpr_32, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec + %2:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %0:vgpr_32, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr0 = COPY %2 + S_ENDPGM 0, implicit $vgpr0 + +... 
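To iterate on any one of these MIR cases in isolation, the RUN line at the top of this file can be applied to a single copied-out test body (reduced.mir is an illustrative file name, not one from the tree):

  llc -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs \
    -run-pass si-fold-operands,dead-mi-elimination reduced.mir -o -
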
+ +--- +name: function_fold_fi_mubuf_stack_relative +tracksRegLiveness: true +frameInfo: + maxAlignment: 4 + localFrameSize: 4 +stack: + - { id: 0, size: 4, alignment: 4, local-offset: 0 } +machineFunctionInfo: + isEntryFunction: false scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' - scratchWaveOffsetReg: '$sgpr33' + frameOffsetReg: '$sgpr32' stackPtrOffsetReg: '$sgpr32' body: | bb.0: - ; GCN-LABEL: name: no_fold_fi_mubuf_scratch_sp_offset + ; GCN-LABEL: name: function_fold_fi_mubuf_stack_relative ; GCN: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 7, implicit $exec ; GCN: BUFFER_STORE_DWORD_OFFEN [[V_MOV_B32_e32_]], %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, implicit $exec ; GCN: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll b/llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll --- a/llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll +++ b/llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll @@ -7,10 +7,9 @@ ; Materialize into a mov. Make sure there isn't an unnecessary copy. ; GCN-LABEL: {{^}}func_mov_fi_i32: ; GCN: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN: s_sub_u32 [[SUB:s[0-9]+|vcc_lo|vcc_hi]], s32, s33 -; CI-NEXT: v_lshr_b32_e64 v0, [[SUB]], 6 -; GFX9-NEXT: v_lshrrev_b32_e64 v0, 6, [[SUB]] +; CI-NEXT: v_lshr_b32_e64 v0, s32, 6 +; GFX9-NEXT: v_lshrrev_b32_e64 v0, 6, s32 ; GCN-NOT: v_mov ; GCN: ds_write_b32 v0, v0 @@ -24,19 +23,15 @@ ; GCN-LABEL: {{^}}func_mov_fi_i32_offset: ; GCN: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI: s_sub_u32 [[SUB0:s[0-9]+|vcc_lo|vcc_hi]], s32, s33 -; CI-NEXT: s_sub_u32 [[SUB1:s[0-9]+|vcc_lo|vcc_hi]], s32, s33 -; CI-DAG: v_lshr_b32_e64 v0, [[SUB0]], 6 -; CI-DAG: v_lshr_b32_e64 [[SCALED:v[0-9]+]], [[SUB1]], 6 +; CI-DAG: v_lshr_b32_e64 v0, s32, 6 +; CI-DAG: v_lshr_b32_e64 [[SCALED:v[0-9]+]], s32, 6 ; CI-NOT: v_mov ; CI: ds_write_b32 v0, v0 -; CI-NEXT: v_add_i32_e64 v0, s{{\[[0-9]+:[0-9]+\]}}, 4, [[SCALED]] +; CI-NEXT: v_add_i32_e{{32|64}} v0, {{s\[[0-9]+:[0-9]+\]|vcc}}, 4, [[SCALED]] ; CI-NEXT: ds_write_b32 v0, v0 -; GFX9: s_sub_u32 [[SUB0:s[0-9]+|vcc_lo|vcc_hi]], s32, s33 -; GFX9-NEXT: s_sub_u32 [[SUB1:s[0-9]+|vcc_lo|vcc_hi]], s32, s33 -; GFX9-NEXT: v_lshrrev_b32_e64 v0, 6, [[SUB0]] -; GFX9-NEXT: v_lshrrev_b32_e64 [[SCALED:v[0-9]+]], 6, [[SUB1]] +; GFX9: v_lshrrev_b32_e64 v0, 6, s32 +; GFX9-NEXT: v_lshrrev_b32_e64 [[SCALED:v[0-9]+]], 6, s32 ; GFX9-DAG: ds_write_b32 v0, v0 ; GFX9-NEXT: v_add_u32_e32 v0, 4, [[SCALED]] ; GFX9-NEXT: ds_write_b32 v0, v0 @@ -53,15 +48,13 @@ ; GCN-LABEL: {{^}}func_add_constant_to_fi_i32: ; GCN: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN: s_sub_u32 [[SUB:s[0-9]+|vcc_lo|vcc_hi]], s32, s33 -; CI-NEXT: v_lshr_b32_e64 [[SCALED:v[0-9]+]], [[SUB]], 6 +; CI: v_lshr_b32_e64 [[SCALED:v[0-9]+]], s32, 6 ; CI-NEXT: v_add_i32_e32 v0, vcc, 4, [[SCALED]] -; GFX9-NEXT: v_lshrrev_b32_e64 [[SCALED:v[0-9]+]], 6, [[SUB]] +; GFX9: v_lshrrev_b32_e64 [[SCALED:v[0-9]+]], 6, s32 ; GFX9-NEXT: v_add_u32_e32 v0, 4, [[SCALED]] - ; GCN-NOT: v_mov ; GCN: ds_write_b32 v0, v0 define void @func_add_constant_to_fi_i32() #0 { @@ -75,11 +68,10 @@ ; into. 
; GCN-LABEL: {{^}}func_other_fi_user_i32: -; GCN: s_sub_u32 [[SUB:s[0-9]+|vcc_lo|vcc_hi]], s32, s33 -; CI-NEXT: v_lshr_b32_e64 v0, [[SUB]], 6 +; CI: v_lshr_b32_e64 v0, s32, 6 -; GFX9-NEXT: v_lshrrev_b32_e64 v0, 6, [[SUB]] +; GFX9: v_lshrrev_b32_e64 v0, 6, s32 ; GCN-NEXT: v_mul_u32_u24_e32 v0, 9, v0 ; GCN-NOT: v_mov @@ -94,7 +86,7 @@ ; GCN-LABEL: {{^}}func_store_private_arg_i32_ptr: ; GCN: v_mov_b32_e32 v1, 15{{$}} -; GCN: buffer_store_dword v1, v0, s[0:3], s33 offen{{$}} +; GCN: buffer_store_dword v1, v0, s[0:3], 0 offen{{$}} define void @func_store_private_arg_i32_ptr(i32 addrspace(5)* %ptr) #0 { store volatile i32 15, i32 addrspace(5)* %ptr ret void @@ -102,7 +94,7 @@ ; GCN-LABEL: {{^}}func_load_private_arg_i32_ptr: ; GCN: s_waitcnt -; GCN-NEXT: buffer_load_dword v0, v0, s[0:3], s33 offen{{$}} +; GCN-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen{{$}} define void @func_load_private_arg_i32_ptr(i32 addrspace(5)* %ptr) #0 { %val = load volatile i32, i32 addrspace(5)* %ptr ret void @@ -110,12 +102,11 @@ ; GCN-LABEL: {{^}}void_func_byval_struct_i8_i32_ptr: ; GCN: s_waitcnt -; GCN-NEXT: s_sub_u32 [[SUB_OFFSET:s[0-9]+|vcc_lo|vcc_hi]], s32, s33 -; CI-NEXT: v_lshr_b32_e64 [[SHIFT:v[0-9]+]], [[SUB_OFFSET]], 6 +; CI: v_lshr_b32_e64 [[SHIFT:v[0-9]+]], s32, 6 ; CI-NEXT: v_or_b32_e32 v0, 4, [[SHIFT]] -; GFX9-NEXT: v_lshrrev_b32_e64 [[SHIFT:v[0-9]+]], 6, [[SUB_OFFSET]] +; GFX9: v_lshrrev_b32_e64 [[SHIFT:v[0-9]+]], 6, s32 ; GFX9-NEXT: v_or_b32_e32 v0, 4, [[SHIFT]] ; GCN-NOT: v_mov @@ -143,11 +134,10 @@ } ; GCN-LABEL: {{^}}void_func_byval_struct_i8_i32_ptr_nonentry_block: -; GCN: s_sub_u32 [[SUB_OFFSET:s[0-9]+]], s32, s33 -; CI: v_lshr_b32_e64 [[SHIFT:v[0-9]+]], [[SUB_OFFSET]], 6 +; CI: v_lshr_b32_e64 [[SHIFT:v[0-9]+]], s32, 6 -; GFX9: v_lshrrev_b32_e64 [[SHIFT:v[0-9]+]], 6, [[SUB_OFFSET]] +; GFX9: v_lshrrev_b32_e64 [[SHIFT:v[0-9]+]], 6, s32 ; GCN: s_and_saveexec_b64 @@ -175,13 +165,12 @@ ; Added offset can't be used with VOP3 add ; GCN-LABEL: {{^}}func_other_fi_user_non_inline_imm_offset_i32: -; GCN: s_sub_u32 [[SUB:s[0-9]+|vcc_lo|vcc_hi]], s32, s33 -; CI-DAG: s_movk_i32 [[K:s[0-9]+|vcc_lo|vcc_hi]], 0x200 -; CI-DAG: v_lshr_b32_e64 [[SCALED:v[0-9]+]], [[SUB]], 6 +; CI-DAG: s_movk_i32 [[K:s[0-9]+|vcc_lo|vcc_hi]], 0x200 +; CI-DAG: v_lshr_b32_e64 [[SCALED:v[0-9]+]], s32, 6 ; CI: v_add_i32_e32 [[VZ:v[0-9]+]], vcc, [[K]], [[SCALED]] -; GFX9-DAG: v_lshrrev_b32_e64 [[SCALED:v[0-9]+]], 6, [[SUB]] +; GFX9-DAG: v_lshrrev_b32_e64 [[SCALED:v[0-9]+]], 6, s32 ; GFX9: v_add_u32_e32 [[VZ:v[0-9]+]], 0x200, [[SCALED]] ; GCN: v_mul_u32_u24_e32 [[VZ]], 9, [[VZ]] @@ -199,13 +188,12 @@ } ; GCN-LABEL: {{^}}func_other_fi_user_non_inline_imm_offset_i32_vcc_live: -; GCN: s_sub_u32 [[DIFF:s[0-9]+]], s32, s33 -; CI-DAG: s_movk_i32 [[OFFSET:s[0-9]+]], 0x200 -; CI-DAG: v_lshr_b32_e64 [[SCALED:v[0-9]+]], [[DIFF]], 6 +; CI-DAG: s_movk_i32 [[OFFSET:s[0-9]+]], 0x200 +; CI-DAG: v_lshr_b32_e64 [[SCALED:v[0-9]+]], s32, 6 ; CI: v_add_i32_e64 [[VZ:v[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, [[OFFSET]], [[SCALED]] -; GFX9-DAG: v_lshrrev_b32_e64 [[SCALED:v[0-9]+]], 6, [[DIFF]] +; GFX9-DAG: v_lshrrev_b32_e64 [[SCALED:v[0-9]+]], 6, s32 ; GFX9: v_add_u32_e32 [[VZ:v[0-9]+]], 0x200, [[SCALED]] ; GCN: v_mul_u32_u24_e32 [[VZ]], 9, [[VZ]] @@ -256,12 +244,11 @@ ; GCN-LABEL: {{^}}alloca_ptr_nonentry_block: ; GCN: s_and_saveexec_b64 ; GCN: buffer_load_dword v{{[0-9]+}}, off, s[0:3], s32 offset:4 -; GCN: s_sub_u32 [[SUB_OFFSET:s[0-9]+|vcc_lo|vcc_hi]], s32, s33 -; CI: v_lshr_b32_e64 [[SHIFT:v[0-9]+]], [[SUB_OFFSET]], 6 +; CI: v_lshr_b32_e64 [[SHIFT:v[0-9]+]], s32, 
6 ; CI-NEXT: v_or_b32_e32 [[PTR:v[0-9]+]], 4, [[SHIFT]] -; GFX9: v_lshrrev_b32_e64 [[SHIFT:v[0-9]+]], 6, [[SUB_OFFSET]] +; GFX9: v_lshrrev_b32_e64 [[SHIFT:v[0-9]+]], 6, s32 ; GFX9-NEXT: v_or_b32_e32 [[PTR:v[0-9]+]], 4, [[SHIFT]] ; GCN: ds_write_b32 v{{[0-9]+}}, [[PTR]] diff --git a/llvm/test/CodeGen/AMDGPU/frame-lowering-entry-all-sgpr-used.mir b/llvm/test/CodeGen/AMDGPU/frame-lowering-entry-all-sgpr-used.mir --- a/llvm/test/CodeGen/AMDGPU/frame-lowering-entry-all-sgpr-used.mir +++ b/llvm/test/CodeGen/AMDGPU/frame-lowering-entry-all-sgpr-used.mir @@ -26,7 +26,6 @@ isEntryFunction: true waveLimiter: true scratchRSrcReg: '$sgpr96_sgpr97_sgpr98_sgpr99' - scratchWaveOffsetReg: '$sgpr101' frameOffsetReg: '$sgpr101' stackPtrOffsetReg: '$sgpr32' argumentInfo: diff --git a/llvm/test/CodeGen/AMDGPU/frame-lowering-fp-adjusted.mir b/llvm/test/CodeGen/AMDGPU/frame-lowering-fp-adjusted.mir --- a/llvm/test/CodeGen/AMDGPU/frame-lowering-fp-adjusted.mir +++ b/llvm/test/CodeGen/AMDGPU/frame-lowering-fp-adjusted.mir @@ -29,7 +29,6 @@ isEntryFunction: true waveLimiter: true scratchRSrcReg: '$sgpr96_sgpr97_sgpr98_sgpr99' - scratchWaveOffsetReg: '$sgpr101' frameOffsetReg: '$sgpr101' stackPtrOffsetReg: '$sgpr32' argumentInfo: @@ -47,4 +46,4 @@ liveins: $sgpr4, $sgpr5, $sgpr9, $sgpr22, $vgpr0, $sgpr6_sgpr7 renamable $vgpr2 = IMPLICIT_DEF - SI_SPILL_V32_SAVE killed $vgpr2, %stack.0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr32, 0, implicit $exec :: (store 4 into %stack.0, addrspace 5) + SI_SPILL_V32_SAVE killed $vgpr2, %stack.0, $sgpr96_sgpr97_sgpr98_sgpr99, 0, 0, implicit $exec :: (store 4 into %stack.0, addrspace 5) diff --git a/llvm/test/CodeGen/AMDGPU/function-returns.ll b/llvm/test/CodeGen/AMDGPU/function-returns.ll --- a/llvm/test/CodeGen/AMDGPU/function-returns.ll +++ b/llvm/test/CodeGen/AMDGPU/function-returns.ll @@ -409,8 +409,8 @@ ; GCN-LABEL: {{^}}void_func_sret_struct_i8_i32: ; GCN: buffer_load_ubyte [[VAL0:v[0-9]+]] ; GCN: buffer_load_dword [[VAL1:v[0-9]+]] -; GCN: buffer_store_byte [[VAL0]], v0, s[0:3], s33 offen{{$}} -; GCN: buffer_store_dword [[VAL1]], v0, s[0:3], s33 offen offset:4{{$}} +; GCN: buffer_store_byte [[VAL0]], v0, s[0:3], 0 offen{{$}} +; GCN: buffer_store_dword [[VAL1]], v0, s[0:3], 0 offen offset:4{{$}} define void @void_func_sret_struct_i8_i32({ i8, i32 } addrspace(5)* sret %arg0) #0 { %val0 = load volatile i8, i8 addrspace(1)* undef %val1 = load volatile i32, i32 addrspace(1)* undef @@ -426,39 +426,39 @@ ; AssertZext inserted. Not using it introduces the spills. 
; GCN-LABEL: {{^}}v33i32_func_void: -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:4{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:8{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:12{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:16{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:20{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:24{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:28{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:32{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:36{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:40{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:44{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:48{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:52{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:56{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:60{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:64{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:68{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:72{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:76{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:80{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:84{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:88{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:92{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:96{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:100{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:104{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:108{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:112{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:116{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:120{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:124{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:128{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:4{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:8{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:12{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:16{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:20{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:24{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:28{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:32{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:36{{$}} +; GFX9-DAG: 
buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:40{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:44{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:48{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:52{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:56{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:60{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:64{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:68{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:72{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:76{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:80{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:84{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:88{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:92{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:96{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:100{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:104{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:108{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:112{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:116{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:120{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:124{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:128{{$}} ; GFX9: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 define <33 x i32> @v33i32_func_void() #0 { @@ -468,39 +468,39 @@ } ; GCN-LABEL: {{^}}struct_v32i32_i32_func_void: -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:4{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:8{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:12{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:16{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:20{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:24{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:28{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:32{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:36{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:40{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:44{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:48{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:52{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:56{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:60{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:64{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:68{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:72{{$}} -; 
GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:76{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:80{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:84{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:88{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:92{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:96{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:100{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:104{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:108{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:112{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:116{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:120{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:124{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:128{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:4{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:8{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:12{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:16{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:20{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:24{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:28{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:32{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:36{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:40{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:44{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:48{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:52{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:56{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:60{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:64{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:68{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:72{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:76{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:80{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:84{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:88{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:92{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:96{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:100{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:104{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:108{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:112{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:116{{$}} +; 
GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:120{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:124{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:128{{$}} ; GFX9: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 define { <32 x i32>, i32 } @struct_v32i32_i32_func_void() #0 { @@ -510,39 +510,39 @@ } ; GCN-LABEL: {{^}}struct_i32_v32i32_func_void: -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:128{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:132{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:136{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:140{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:144{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:148{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:152{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:156{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:160{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:164{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:168{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:172{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:176{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:180{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:184{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:188{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:192{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:196{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:200{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:204{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:208{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:212{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:216{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:220{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:224{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:228{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:232{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:236{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:240{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:244{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:248{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:252{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:128{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:132{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:136{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:140{{$}} +; 
GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:144{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:148{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:152{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:156{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:160{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:164{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:168{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:172{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:176{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:180{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:184{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:188{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:192{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:196{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:200{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:204{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:208{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:212{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:216{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:220{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:224{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:228{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:232{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:236{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:240{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:244{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:248{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:252{{$}} ; GFX9: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 define { i32, <32 x i32> } @struct_i32_v32i32_func_void() #0 { diff --git a/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props-v3.ll b/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props-v3.ll --- a/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props-v3.ll +++ b/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props-v3.ll @@ -48,10 +48,10 @@ } ; CHECK: .name: num_spilled_sgprs -; GFX700: .sgpr_spill_count: 40 -; GFX803: .sgpr_spill_count: 24 -; GFX900: .sgpr_spill_count: 24 -; GFX1010: .sgpr_spill_count: 24 +; GFX700: .sgpr_spill_count: 38 +; GFX803: .sgpr_spill_count: 22 +; GFX900: .sgpr_spill_count: 22 +; GFX1010: .sgpr_spill_count: 22 ; CHECK: .symbol: num_spilled_sgprs.kd define amdgpu_kernel void @num_spilled_sgprs( i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, [8 x i32], diff --git a/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props.ll b/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props.ll --- a/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props.ll +++ b/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props.ll @@ -57,9 +57,9 @@ ; CHECK-LABEL: - Name: num_spilled_sgprs ; CHECK: SymbolName: 'num_spilled_sgprs@kd' ; CHECK: CodeProps: -; GFX700: NumSpilledSGPRs: 40 -; GFX803: 
NumSpilledSGPRs: 24 -; GFX900: NumSpilledSGPRs: 24 +; GFX700: NumSpilledSGPRs: 38 +; GFX803: NumSpilledSGPRs: 22 +; GFX900: NumSpilledSGPRs: 22 define amdgpu_kernel void @num_spilled_sgprs( i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, [8 x i32], i32 addrspace(1)* %out2, i32 addrspace(1)* %out3, [8 x i32], diff --git a/llvm/test/CodeGen/AMDGPU/idot8s.ll b/llvm/test/CodeGen/AMDGPU/idot8s.ll --- a/llvm/test/CodeGen/AMDGPU/idot8s.ll +++ b/llvm/test/CodeGen/AMDGPU/idot8s.ll @@ -9,49 +9,49 @@ define amdgpu_kernel void @idot8_acc32(<8 x i4> addrspace(1)* %src1, ; GFX7-LABEL: idot8_acc32: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; GFX7-NEXT: s_mov_b32 s7, 0xf000 -; GFX7-NEXT: s_mov_b32 s6, -1 +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_load_dword s0, s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s1, s[10:11], 0x0 -; GFX7-NEXT: s_load_dword s21, s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0 +; GFX7-NEXT: s_load_dword s20, s[0:1], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_bfe_i32 s2, s0, 0x40000 -; GFX7-NEXT: s_bfe_i32 s8, s1, 0x40000 -; GFX7-NEXT: s_bfe_i32 s10, s1, 0x40004 -; GFX7-NEXT: v_mov_b32_e32 v0, s8 -; GFX7-NEXT: v_mov_b32_e32 v1, s21 -; GFX7-NEXT: v_mad_i32_i24 v0, s2, v0, v1 -; GFX7-NEXT: s_bfe_i32 s9, s0, 0x40004 -; GFX7-NEXT: v_mov_b32_e32 v1, s10 -; GFX7-NEXT: s_bfe_i32 s12, s1, 0x40008 -; GFX7-NEXT: v_mad_i32_i24 v0, s9, v1, v0 -; GFX7-NEXT: s_bfe_i32 s11, s0, 0x40008 -; GFX7-NEXT: v_mov_b32_e32 v1, s12 -; GFX7-NEXT: s_bfe_i32 s14, s1, 0x4000c -; GFX7-NEXT: v_mad_i32_i24 v0, s11, v1, v0 -; GFX7-NEXT: s_bfe_i32 s13, s0, 0x4000c -; GFX7-NEXT: v_mov_b32_e32 v1, s14 -; GFX7-NEXT: s_bfe_i32 s16, s1, 0x40010 -; GFX7-NEXT: v_mad_i32_i24 v0, s13, v1, v0 -; GFX7-NEXT: s_bfe_i32 s15, s0, 0x40010 -; GFX7-NEXT: v_mov_b32_e32 v1, s16 -; GFX7-NEXT: s_bfe_i32 s18, s1, 0x40014 -; GFX7-NEXT: s_bfe_i32 s20, s1, 0x40018 -; GFX7-NEXT: v_mad_i32_i24 v0, s15, v1, v0 -; GFX7-NEXT: s_bfe_i32 s17, s0, 0x40014 -; GFX7-NEXT: v_mov_b32_e32 v1, s18 -; GFX7-NEXT: s_bfe_i32 s19, s0, 0x40018 -; GFX7-NEXT: v_mad_i32_i24 v0, s17, v1, v0 +; GFX7-NEXT: s_bfe_i32 s6, s4, 0x40000 +; GFX7-NEXT: s_bfe_i32 s7, s5, 0x40000 +; GFX7-NEXT: s_bfe_i32 s9, s5, 0x40004 +; GFX7-NEXT: v_mov_b32_e32 v0, s7 ; GFX7-NEXT: v_mov_b32_e32 v1, s20 -; GFX7-NEXT: s_ashr_i32 s1, s1, 28 -; GFX7-NEXT: v_mad_i32_i24 v0, s19, v1, v0 -; GFX7-NEXT: s_ashr_i32 s0, s0, 28 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: v_mad_i32_i24 v0, s0, v1, v0 -; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX7-NEXT: v_mad_i32_i24 v0, s6, v0, v1 +; GFX7-NEXT: s_bfe_i32 s8, s4, 0x40004 +; GFX7-NEXT: v_mov_b32_e32 v1, s9 +; GFX7-NEXT: s_bfe_i32 s11, s5, 0x40008 +; GFX7-NEXT: v_mad_i32_i24 v0, s8, v1, v0 +; GFX7-NEXT: s_bfe_i32 s10, s4, 0x40008 +; GFX7-NEXT: v_mov_b32_e32 v1, s11 +; GFX7-NEXT: s_bfe_i32 s13, s5, 0x4000c +; GFX7-NEXT: v_mad_i32_i24 v0, s10, v1, v0 +; GFX7-NEXT: s_bfe_i32 s12, s4, 0x4000c +; GFX7-NEXT: v_mov_b32_e32 v1, s13 +; GFX7-NEXT: s_bfe_i32 s15, s5, 0x40010 +; GFX7-NEXT: v_mad_i32_i24 v0, s12, v1, v0 +; GFX7-NEXT: s_bfe_i32 s14, s4, 0x40010 +; GFX7-NEXT: v_mov_b32_e32 v1, s15 +; GFX7-NEXT: s_bfe_i32 s17, s5, 0x40014 +; GFX7-NEXT: s_bfe_i32 s19, s5, 0x40018 +; GFX7-NEXT: v_mad_i32_i24 v0, s14, v1, v0 +; GFX7-NEXT: s_bfe_i32 
s16, s4, 0x40014 +; GFX7-NEXT: v_mov_b32_e32 v1, s17 +; GFX7-NEXT: s_bfe_i32 s18, s4, 0x40018 +; GFX7-NEXT: v_mad_i32_i24 v0, s16, v1, v0 +; GFX7-NEXT: v_mov_b32_e32 v1, s19 +; GFX7-NEXT: s_ashr_i32 s5, s5, 28 +; GFX7-NEXT: v_mad_i32_i24 v0, s18, v1, v0 +; GFX7-NEXT: s_ashr_i32 s4, s4, 28 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mad_i32_i24 v0, s4, v1, v0 +; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: idot8_acc32: @@ -60,41 +60,41 @@ ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX8-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX8-NEXT: s_load_dword s19, s[0:1], 0x0 +; GFX8-NEXT: s_load_dword s3, s[6:7], 0x0 +; GFX8-NEXT: s_load_dword s18, s[0:1], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_bfe_i32 s5, s2, 0x40000 -; GFX8-NEXT: s_bfe_i32 s6, s4, 0x40000 -; GFX8-NEXT: s_bfe_i32 s8, s4, 0x40004 -; GFX8-NEXT: v_mov_b32_e32 v0, s6 -; GFX8-NEXT: v_mov_b32_e32 v1, s19 -; GFX8-NEXT: v_mad_i32_i24 v0, s5, v0, v1 -; GFX8-NEXT: s_bfe_i32 s7, s2, 0x40004 -; GFX8-NEXT: v_mov_b32_e32 v1, s8 -; GFX8-NEXT: s_bfe_i32 s10, s4, 0x40008 -; GFX8-NEXT: v_mad_i32_i24 v0, s7, v1, v0 -; GFX8-NEXT: s_bfe_i32 s9, s2, 0x40008 -; GFX8-NEXT: v_mov_b32_e32 v1, s10 -; GFX8-NEXT: s_bfe_i32 s12, s4, 0x4000c -; GFX8-NEXT: v_mad_i32_i24 v0, s9, v1, v0 -; GFX8-NEXT: s_bfe_i32 s11, s2, 0x4000c -; GFX8-NEXT: v_mov_b32_e32 v1, s12 -; GFX8-NEXT: s_bfe_i32 s14, s4, 0x40010 -; GFX8-NEXT: v_mad_i32_i24 v0, s11, v1, v0 -; GFX8-NEXT: s_bfe_i32 s13, s2, 0x40010 -; GFX8-NEXT: v_mov_b32_e32 v1, s14 -; GFX8-NEXT: s_bfe_i32 s16, s4, 0x40014 -; GFX8-NEXT: s_bfe_i32 s18, s4, 0x40018 -; GFX8-NEXT: v_mad_i32_i24 v0, s13, v1, v0 -; GFX8-NEXT: s_bfe_i32 s15, s2, 0x40014 -; GFX8-NEXT: v_mov_b32_e32 v1, s16 -; GFX8-NEXT: s_bfe_i32 s17, s2, 0x40018 -; GFX8-NEXT: v_mad_i32_i24 v0, s15, v1, v0 +; GFX8-NEXT: s_bfe_i32 s4, s2, 0x40000 +; GFX8-NEXT: s_bfe_i32 s5, s3, 0x40000 +; GFX8-NEXT: s_bfe_i32 s7, s3, 0x40004 +; GFX8-NEXT: v_mov_b32_e32 v0, s5 ; GFX8-NEXT: v_mov_b32_e32 v1, s18 -; GFX8-NEXT: s_ashr_i32 s4, s4, 28 -; GFX8-NEXT: v_mad_i32_i24 v0, s17, v1, v0 +; GFX8-NEXT: v_mad_i32_i24 v0, s4, v0, v1 +; GFX8-NEXT: s_bfe_i32 s6, s2, 0x40004 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: s_bfe_i32 s9, s3, 0x40008 +; GFX8-NEXT: v_mad_i32_i24 v0, s6, v1, v0 +; GFX8-NEXT: s_bfe_i32 s8, s2, 0x40008 +; GFX8-NEXT: v_mov_b32_e32 v1, s9 +; GFX8-NEXT: s_bfe_i32 s11, s3, 0x4000c +; GFX8-NEXT: v_mad_i32_i24 v0, s8, v1, v0 +; GFX8-NEXT: s_bfe_i32 s10, s2, 0x4000c +; GFX8-NEXT: v_mov_b32_e32 v1, s11 +; GFX8-NEXT: s_bfe_i32 s13, s3, 0x40010 +; GFX8-NEXT: v_mad_i32_i24 v0, s10, v1, v0 +; GFX8-NEXT: s_bfe_i32 s12, s2, 0x40010 +; GFX8-NEXT: v_mov_b32_e32 v1, s13 +; GFX8-NEXT: s_bfe_i32 s15, s3, 0x40014 +; GFX8-NEXT: s_bfe_i32 s17, s3, 0x40018 +; GFX8-NEXT: v_mad_i32_i24 v0, s12, v1, v0 +; GFX8-NEXT: s_bfe_i32 s14, s2, 0x40014 +; GFX8-NEXT: v_mov_b32_e32 v1, s15 +; GFX8-NEXT: s_bfe_i32 s16, s2, 0x40018 +; GFX8-NEXT: v_mad_i32_i24 v0, s14, v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v1, s17 +; GFX8-NEXT: s_ashr_i32 s3, s3, 28 +; GFX8-NEXT: v_mad_i32_i24 v0, s16, v1, v0 ; GFX8-NEXT: s_ashr_i32 s2, s2, 28 -; GFX8-NEXT: v_mov_b32_e32 v1, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: v_mad_i32_i24 v2, s2, v1, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 @@ -107,41 +107,41 @@ ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-NEXT: 
s_load_dword s4, s[6:7], 0x0 -; GFX9-NEXT: s_load_dword s19, s[0:1], 0x0 +; GFX9-NEXT: s_load_dword s3, s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s18, s[0:1], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_bfe_i32 s5, s2, 0x40000 -; GFX9-NEXT: s_bfe_i32 s6, s4, 0x40000 -; GFX9-NEXT: s_bfe_i32 s8, s4, 0x40004 -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s19 -; GFX9-NEXT: v_mad_i32_i24 v0, s5, v0, v1 -; GFX9-NEXT: s_bfe_i32 s7, s2, 0x40004 -; GFX9-NEXT: v_mov_b32_e32 v1, s8 -; GFX9-NEXT: s_bfe_i32 s10, s4, 0x40008 -; GFX9-NEXT: v_mad_i32_i24 v0, s7, v1, v0 -; GFX9-NEXT: s_bfe_i32 s9, s2, 0x40008 -; GFX9-NEXT: v_mov_b32_e32 v1, s10 -; GFX9-NEXT: s_bfe_i32 s12, s4, 0x4000c -; GFX9-NEXT: v_mad_i32_i24 v0, s9, v1, v0 -; GFX9-NEXT: s_bfe_i32 s11, s2, 0x4000c -; GFX9-NEXT: v_mov_b32_e32 v1, s12 -; GFX9-NEXT: s_bfe_i32 s14, s4, 0x40010 -; GFX9-NEXT: v_mad_i32_i24 v0, s11, v1, v0 -; GFX9-NEXT: s_bfe_i32 s13, s2, 0x40010 -; GFX9-NEXT: v_mov_b32_e32 v1, s14 -; GFX9-NEXT: s_bfe_i32 s16, s4, 0x40014 -; GFX9-NEXT: s_bfe_i32 s18, s4, 0x40018 -; GFX9-NEXT: v_mad_i32_i24 v0, s13, v1, v0 -; GFX9-NEXT: s_bfe_i32 s15, s2, 0x40014 -; GFX9-NEXT: v_mov_b32_e32 v1, s16 -; GFX9-NEXT: s_bfe_i32 s17, s2, 0x40018 -; GFX9-NEXT: v_mad_i32_i24 v0, s15, v1, v0 +; GFX9-NEXT: s_bfe_i32 s4, s2, 0x40000 +; GFX9-NEXT: s_bfe_i32 s5, s3, 0x40000 +; GFX9-NEXT: s_bfe_i32 s7, s3, 0x40004 +; GFX9-NEXT: v_mov_b32_e32 v0, s5 ; GFX9-NEXT: v_mov_b32_e32 v1, s18 -; GFX9-NEXT: s_ashr_i32 s4, s4, 28 -; GFX9-NEXT: v_mad_i32_i24 v0, s17, v1, v0 +; GFX9-NEXT: v_mad_i32_i24 v0, s4, v0, v1 +; GFX9-NEXT: s_bfe_i32 s6, s2, 0x40004 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: s_bfe_i32 s9, s3, 0x40008 +; GFX9-NEXT: v_mad_i32_i24 v0, s6, v1, v0 +; GFX9-NEXT: s_bfe_i32 s8, s2, 0x40008 +; GFX9-NEXT: v_mov_b32_e32 v1, s9 +; GFX9-NEXT: s_bfe_i32 s11, s3, 0x4000c +; GFX9-NEXT: v_mad_i32_i24 v0, s8, v1, v0 +; GFX9-NEXT: s_bfe_i32 s10, s2, 0x4000c +; GFX9-NEXT: v_mov_b32_e32 v1, s11 +; GFX9-NEXT: s_bfe_i32 s13, s3, 0x40010 +; GFX9-NEXT: v_mad_i32_i24 v0, s10, v1, v0 +; GFX9-NEXT: s_bfe_i32 s12, s2, 0x40010 +; GFX9-NEXT: v_mov_b32_e32 v1, s13 +; GFX9-NEXT: s_bfe_i32 s15, s3, 0x40014 +; GFX9-NEXT: s_bfe_i32 s17, s3, 0x40018 +; GFX9-NEXT: v_mad_i32_i24 v0, s12, v1, v0 +; GFX9-NEXT: s_bfe_i32 s14, s2, 0x40014 +; GFX9-NEXT: v_mov_b32_e32 v1, s15 +; GFX9-NEXT: s_bfe_i32 s16, s2, 0x40018 +; GFX9-NEXT: v_mad_i32_i24 v0, s14, v1, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: s_ashr_i32 s3, s3, 28 +; GFX9-NEXT: v_mad_i32_i24 v0, s16, v1, v0 ; GFX9-NEXT: s_ashr_i32 s2, s2, 28 -; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_mad_i32_i24 v2, s2, v1, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 @@ -154,11 +154,11 @@ ; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX9-DL-NEXT: s_load_dword s6, s[0:1], 0x0 +; GFX9-DL-NEXT: s_load_dword s3, s[0:1], 0x0 ; GFX9-DL-NEXT: s_load_dword s4, s[4:5], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-DL-NEXT: v_dot8_i32_i4 v2, s4, v0, v1 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 @@ -167,18 +167,18 @@ ; ; GFX10-DL-LABEL: idot8_acc32: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 
s[4:5], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s0, s[8:9], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s2, s[6:7], 0x0 +; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-DL-NEXT: v_dot8_i32_i4 v2, s1, s2, v0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s8 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s9 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-DL-NEXT: v_dot8_i32_i4 v2, s0, s1, v0 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <8 x i4> addrspace(1)* %src2, @@ -254,66 +254,66 @@ define amdgpu_kernel void @idot8_acc16(<8 x i4> addrspace(1)* %src1, ; GFX7-LABEL: idot8_acc16: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; GFX7-NEXT: s_mov_b32 s7, 0xf000 -; GFX7-NEXT: s_mov_b32 s6, -1 -; GFX7-NEXT: s_mov_b32 s0, 0xffff +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: s_mov_b32 s8, 0xffff ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: buffer_load_ushort v0, off, s[4:7], 0 -; GFX7-NEXT: s_load_dword s1, s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s2, s[10:11], 0x0 +; GFX7-NEXT: buffer_load_ushort v0, off, s[0:3], 0 +; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_bfe_i32 s8, s1, 0x40000 -; GFX7-NEXT: s_bfe_i32 s9, s2, 0x40000 -; GFX7-NEXT: s_bfe_i32 s11, s2, 0x40004 -; GFX7-NEXT: s_and_b32 s9, s9, s0 -; GFX7-NEXT: s_bfe_i32 s10, s1, 0x40004 -; GFX7-NEXT: s_bfe_i32 s13, s2, 0x40008 -; GFX7-NEXT: s_and_b32 s11, s11, s0 -; GFX7-NEXT: s_and_b32 s8, s8, s0 -; GFX7-NEXT: v_mov_b32_e32 v1, s9 -; GFX7-NEXT: s_bfe_i32 s12, s1, 0x40008 -; GFX7-NEXT: s_bfe_i32 s15, s2, 0x4000c -; GFX7-NEXT: s_and_b32 s13, s13, s0 -; GFX7-NEXT: s_and_b32 s10, s10, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s11 -; GFX7-NEXT: s_bfe_i32 s14, s1, 0x4000c -; GFX7-NEXT: s_bfe_i32 s17, s2, 0x40010 -; GFX7-NEXT: s_and_b32 s15, s15, s0 -; GFX7-NEXT: s_and_b32 s12, s12, s0 -; GFX7-NEXT: v_mov_b32_e32 v3, s13 -; GFX7-NEXT: s_bfe_i32 s16, s1, 0x40010 -; GFX7-NEXT: s_bfe_i32 s19, s2, 0x40014 -; GFX7-NEXT: s_and_b32 s17, s17, s0 -; GFX7-NEXT: s_and_b32 s14, s14, s0 -; GFX7-NEXT: v_mov_b32_e32 v4, s15 -; GFX7-NEXT: s_bfe_i32 s21, s2, 0x40018 -; GFX7-NEXT: s_bfe_i32 s18, s1, 0x40014 -; GFX7-NEXT: s_and_b32 s19, s19, s0 -; GFX7-NEXT: s_and_b32 s16, s16, s0 -; GFX7-NEXT: v_mov_b32_e32 v5, s17 -; GFX7-NEXT: s_bfe_i32 s20, s1, 0x40018 -; GFX7-NEXT: s_ashr_i32 s2, s2, 28 -; GFX7-NEXT: s_and_b32 s21, s21, s0 -; GFX7-NEXT: s_and_b32 s18, s18, s0 -; GFX7-NEXT: v_mov_b32_e32 v6, s19 -; GFX7-NEXT: s_ashr_i32 s1, s1, 28 -; GFX7-NEXT: s_and_b32 s20, s20, s0 -; GFX7-NEXT: s_and_b32 s2, s2, s0 -; GFX7-NEXT: v_mov_b32_e32 v7, s21 -; GFX7-NEXT: s_and_b32 s0, s1, s0 +; GFX7-NEXT: s_bfe_i32 s6, s4, 0x40000 +; GFX7-NEXT: s_bfe_i32 s7, s5, 0x40000 +; GFX7-NEXT: s_bfe_i32 s10, s5, 0x40004 +; GFX7-NEXT: s_and_b32 s7, s7, s8 +; GFX7-NEXT: s_bfe_i32 s9, s4, 0x40004 +; GFX7-NEXT: s_bfe_i32 s12, s5, 0x40008 +; GFX7-NEXT: s_and_b32 s10, s10, s8 +; 
GFX7-NEXT: s_and_b32 s6, s6, s8 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_bfe_i32 s11, s4, 0x40008 +; GFX7-NEXT: s_bfe_i32 s14, s5, 0x4000c +; GFX7-NEXT: s_and_b32 s12, s12, s8 +; GFX7-NEXT: s_and_b32 s9, s9, s8 +; GFX7-NEXT: v_mov_b32_e32 v2, s10 +; GFX7-NEXT: s_bfe_i32 s13, s4, 0x4000c +; GFX7-NEXT: s_bfe_i32 s16, s5, 0x40010 +; GFX7-NEXT: s_and_b32 s14, s14, s8 +; GFX7-NEXT: s_and_b32 s11, s11, s8 +; GFX7-NEXT: v_mov_b32_e32 v3, s12 +; GFX7-NEXT: s_bfe_i32 s15, s4, 0x40010 +; GFX7-NEXT: s_bfe_i32 s18, s5, 0x40014 +; GFX7-NEXT: s_and_b32 s16, s16, s8 +; GFX7-NEXT: s_and_b32 s13, s13, s8 +; GFX7-NEXT: v_mov_b32_e32 v4, s14 +; GFX7-NEXT: s_bfe_i32 s20, s5, 0x40018 +; GFX7-NEXT: s_bfe_i32 s17, s4, 0x40014 +; GFX7-NEXT: s_and_b32 s18, s18, s8 +; GFX7-NEXT: s_and_b32 s15, s15, s8 +; GFX7-NEXT: v_mov_b32_e32 v5, s16 +; GFX7-NEXT: s_bfe_i32 s19, s4, 0x40018 +; GFX7-NEXT: s_ashr_i32 s5, s5, 28 +; GFX7-NEXT: s_and_b32 s20, s20, s8 +; GFX7-NEXT: s_and_b32 s17, s17, s8 +; GFX7-NEXT: v_mov_b32_e32 v6, s18 +; GFX7-NEXT: s_ashr_i32 s4, s4, 28 +; GFX7-NEXT: s_and_b32 s19, s19, s8 +; GFX7-NEXT: s_and_b32 s5, s5, s8 +; GFX7-NEXT: v_mov_b32_e32 v7, s20 +; GFX7-NEXT: s_and_b32 s4, s4, s8 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mad_u32_u24 v0, s8, v1, v0 -; GFX7-NEXT: v_mad_u32_u24 v0, s10, v2, v0 -; GFX7-NEXT: v_mad_u32_u24 v0, s12, v3, v0 -; GFX7-NEXT: v_mad_u32_u24 v0, s14, v4, v0 -; GFX7-NEXT: v_mad_u32_u24 v0, s16, v5, v0 -; GFX7-NEXT: v_mad_u32_u24 v0, s18, v6, v0 -; GFX7-NEXT: v_mad_u32_u24 v0, s20, v7, v0 -; GFX7-NEXT: v_mov_b32_e32 v1, s2 -; GFX7-NEXT: v_mad_u32_u24 v0, s0, v1, v0 -; GFX7-NEXT: buffer_store_short v0, off, s[4:7], 0 +; GFX7-NEXT: v_mad_u32_u24 v0, s6, v1, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, s9, v2, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, s11, v3, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, s13, v4, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, s15, v5, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, s17, v6, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, s19, v7, v0 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mad_u32_u24 v0, s4, v1, v0 +; GFX7-NEXT: buffer_store_short v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: idot8_acc16: @@ -327,41 +327,41 @@ ; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX8-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_bfe_i32 s5, s0, 0x40000 -; GFX8-NEXT: s_bfe_i32 s6, s1, 0x40000 -; GFX8-NEXT: s_bfe_i32 s8, s1, 0x40004 -; GFX8-NEXT: s_bfe_i32 s10, s1, 0x40008 -; GFX8-NEXT: v_mov_b32_e32 v6, s6 +; GFX8-NEXT: s_bfe_i32 s4, s0, 0x40000 +; GFX8-NEXT: s_bfe_i32 s5, s1, 0x40000 +; GFX8-NEXT: s_bfe_i32 s7, s1, 0x40004 +; GFX8-NEXT: s_bfe_i32 s9, s1, 0x40008 +; GFX8-NEXT: v_mov_b32_e32 v6, s5 ; GFX8-NEXT: s_lshr_b32 s2, s0, 12 -; GFX8-NEXT: s_lshr_b32 s4, s1, 12 -; GFX8-NEXT: s_bfe_i32 s7, s0, 0x40004 -; GFX8-NEXT: s_bfe_i32 s9, s0, 0x40008 -; GFX8-NEXT: v_mov_b32_e32 v3, s10 -; GFX8-NEXT: v_mov_b32_e32 v7, s8 +; GFX8-NEXT: s_lshr_b32 s3, s1, 12 +; GFX8-NEXT: s_bfe_i32 s6, s0, 0x40004 +; GFX8-NEXT: s_bfe_i32 s8, s0, 0x40008 +; GFX8-NEXT: v_mov_b32_e32 v3, s9 +; GFX8-NEXT: v_mov_b32_e32 v7, s7 ; GFX8-NEXT: v_lshlrev_b16_e64 v4, 12, s2 -; GFX8-NEXT: v_lshlrev_b16_e64 v5, 12, s4 -; GFX8-NEXT: v_mul_i32_i24_e32 v3, s9, v3 -; GFX8-NEXT: s_bfe_i32 s12, s1, 0x40010 +; GFX8-NEXT: v_lshlrev_b16_e64 v5, 12, s3 +; GFX8-NEXT: v_mul_i32_i24_e32 v3, s8, v3 +; GFX8-NEXT: s_bfe_i32 s11, s1, 0x40010 ; GFX8-NEXT: v_ashrrev_i16_e32 v4, 12, v4 ; GFX8-NEXT: v_ashrrev_i16_e32 v5, 12, v5 -; GFX8-NEXT: s_bfe_i32 s14, s1, 0x40014 -; GFX8-NEXT: s_bfe_i32 s11, s0, 
0x40010 -; GFX8-NEXT: v_mov_b32_e32 v8, s12 -; GFX8-NEXT: s_bfe_i32 s16, s1, 0x40018 -; GFX8-NEXT: s_bfe_i32 s13, s0, 0x40014 -; GFX8-NEXT: v_mov_b32_e32 v9, s14 -; GFX8-NEXT: s_bfe_i32 s15, s0, 0x40018 +; GFX8-NEXT: s_bfe_i32 s13, s1, 0x40014 +; GFX8-NEXT: s_bfe_i32 s10, s0, 0x40010 +; GFX8-NEXT: v_mov_b32_e32 v8, s11 +; GFX8-NEXT: s_bfe_i32 s15, s1, 0x40018 +; GFX8-NEXT: s_bfe_i32 s12, s0, 0x40014 +; GFX8-NEXT: v_mov_b32_e32 v9, s13 +; GFX8-NEXT: s_bfe_i32 s14, s0, 0x40018 ; GFX8-NEXT: s_ashr_i32 s1, s1, 28 -; GFX8-NEXT: v_mov_b32_e32 v10, s16 +; GFX8-NEXT: v_mov_b32_e32 v10, s15 ; GFX8-NEXT: s_ashr_i32 s0, s0, 28 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mad_i32_i24 v2, s5, v6, v2 -; GFX8-NEXT: v_mad_i32_i24 v2, s7, v7, v2 +; GFX8-NEXT: v_mad_i32_i24 v2, s4, v6, v2 +; GFX8-NEXT: v_mad_i32_i24 v2, s6, v7, v2 ; GFX8-NEXT: v_add_u32_sdwa v2, vcc, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 ; GFX8-NEXT: v_mad_u32_u24 v2, v4, v5, v2 -; GFX8-NEXT: v_mad_i32_i24 v2, s11, v8, v2 -; GFX8-NEXT: v_mad_i32_i24 v2, s13, v9, v2 -; GFX8-NEXT: v_mad_i32_i24 v2, s15, v10, v2 +; GFX8-NEXT: v_mad_i32_i24 v2, s10, v8, v2 +; GFX8-NEXT: v_mad_i32_i24 v2, s12, v9, v2 +; GFX8-NEXT: v_mad_i32_i24 v2, s14, v10, v2 ; GFX8-NEXT: v_mov_b32_e32 v3, s1 ; GFX8-NEXT: v_mad_i32_i24 v2, s0, v3, v2 ; GFX8-NEXT: flat_store_short v[0:1], v2 @@ -378,41 +378,41 @@ ; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX9-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_bfe_i32 s5, s0, 0x40000 -; GFX9-NEXT: s_bfe_i32 s6, s1, 0x40000 -; GFX9-NEXT: s_bfe_i32 s8, s1, 0x40004 -; GFX9-NEXT: s_bfe_i32 s10, s1, 0x40008 -; GFX9-NEXT: v_mov_b32_e32 v6, s6 +; GFX9-NEXT: s_bfe_i32 s4, s0, 0x40000 +; GFX9-NEXT: s_bfe_i32 s5, s1, 0x40000 +; GFX9-NEXT: s_bfe_i32 s7, s1, 0x40004 +; GFX9-NEXT: s_bfe_i32 s9, s1, 0x40008 +; GFX9-NEXT: v_mov_b32_e32 v6, s5 ; GFX9-NEXT: s_lshr_b32 s2, s0, 12 -; GFX9-NEXT: s_lshr_b32 s4, s1, 12 -; GFX9-NEXT: s_bfe_i32 s7, s0, 0x40004 -; GFX9-NEXT: s_bfe_i32 s9, s0, 0x40008 -; GFX9-NEXT: v_mov_b32_e32 v3, s10 -; GFX9-NEXT: v_mov_b32_e32 v7, s8 +; GFX9-NEXT: s_lshr_b32 s3, s1, 12 +; GFX9-NEXT: s_bfe_i32 s6, s0, 0x40004 +; GFX9-NEXT: s_bfe_i32 s8, s0, 0x40008 +; GFX9-NEXT: v_mov_b32_e32 v3, s9 +; GFX9-NEXT: v_mov_b32_e32 v7, s7 ; GFX9-NEXT: v_lshlrev_b16_e64 v4, 12, s2 -; GFX9-NEXT: v_lshlrev_b16_e64 v5, 12, s4 -; GFX9-NEXT: v_mul_i32_i24_e32 v3, s9, v3 -; GFX9-NEXT: s_bfe_i32 s12, s1, 0x40010 +; GFX9-NEXT: v_lshlrev_b16_e64 v5, 12, s3 +; GFX9-NEXT: v_mul_i32_i24_e32 v3, s8, v3 +; GFX9-NEXT: s_bfe_i32 s11, s1, 0x40010 ; GFX9-NEXT: v_ashrrev_i16_e32 v4, 12, v4 ; GFX9-NEXT: v_ashrrev_i16_e32 v5, 12, v5 -; GFX9-NEXT: s_bfe_i32 s14, s1, 0x40014 -; GFX9-NEXT: s_bfe_i32 s11, s0, 0x40010 -; GFX9-NEXT: v_mov_b32_e32 v8, s12 -; GFX9-NEXT: s_bfe_i32 s16, s1, 0x40018 -; GFX9-NEXT: s_bfe_i32 s13, s0, 0x40014 -; GFX9-NEXT: v_mov_b32_e32 v9, s14 -; GFX9-NEXT: s_bfe_i32 s15, s0, 0x40018 +; GFX9-NEXT: s_bfe_i32 s13, s1, 0x40014 +; GFX9-NEXT: s_bfe_i32 s10, s0, 0x40010 +; GFX9-NEXT: v_mov_b32_e32 v8, s11 +; GFX9-NEXT: s_bfe_i32 s15, s1, 0x40018 +; GFX9-NEXT: s_bfe_i32 s12, s0, 0x40014 +; GFX9-NEXT: v_mov_b32_e32 v9, s13 +; GFX9-NEXT: s_bfe_i32 s14, s0, 0x40018 ; GFX9-NEXT: s_ashr_i32 s1, s1, 28 -; GFX9-NEXT: v_mov_b32_e32 v10, s16 +; GFX9-NEXT: v_mov_b32_e32 v10, s15 ; GFX9-NEXT: s_ashr_i32 s0, s0, 28 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mad_i32_i24 v2, s5, v6, v2 -; GFX9-NEXT: v_mad_i32_i24 v2, s7, v7, v2 +; GFX9-NEXT: v_mad_i32_i24 v2, s4, v6, v2 +; 
GFX9-NEXT: v_mad_i32_i24 v2, s6, v7, v2 ; GFX9-NEXT: v_add_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 ; GFX9-NEXT: v_mad_u32_u24 v2, v4, v5, v2 -; GFX9-NEXT: v_mad_i32_i24 v2, s11, v8, v2 -; GFX9-NEXT: v_mad_i32_i24 v2, s13, v9, v2 -; GFX9-NEXT: v_mad_i32_i24 v2, s15, v10, v2 +; GFX9-NEXT: v_mad_i32_i24 v2, s10, v8, v2 +; GFX9-NEXT: v_mad_i32_i24 v2, s12, v9, v2 +; GFX9-NEXT: v_mad_i32_i24 v2, s14, v10, v2 ; GFX9-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NEXT: v_mad_i32_i24 v2, s0, v3, v2 ; GFX9-NEXT: global_store_short v[0:1], v2, off @@ -429,41 +429,41 @@ ; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_bfe_i32 s5, s0, 0x40000 -; GFX9-DL-NEXT: s_bfe_i32 s6, s1, 0x40000 -; GFX9-DL-NEXT: s_bfe_i32 s8, s1, 0x40004 -; GFX9-DL-NEXT: s_bfe_i32 s10, s1, 0x40008 -; GFX9-DL-NEXT: v_mov_b32_e32 v6, s6 +; GFX9-DL-NEXT: s_bfe_i32 s4, s0, 0x40000 +; GFX9-DL-NEXT: s_bfe_i32 s5, s1, 0x40000 +; GFX9-DL-NEXT: s_bfe_i32 s7, s1, 0x40004 +; GFX9-DL-NEXT: s_bfe_i32 s9, s1, 0x40008 +; GFX9-DL-NEXT: v_mov_b32_e32 v6, s5 ; GFX9-DL-NEXT: s_lshr_b32 s2, s0, 12 -; GFX9-DL-NEXT: s_lshr_b32 s4, s1, 12 -; GFX9-DL-NEXT: s_bfe_i32 s7, s0, 0x40004 -; GFX9-DL-NEXT: s_bfe_i32 s9, s0, 0x40008 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s10 -; GFX9-DL-NEXT: v_mov_b32_e32 v7, s8 +; GFX9-DL-NEXT: s_lshr_b32 s3, s1, 12 +; GFX9-DL-NEXT: s_bfe_i32 s6, s0, 0x40004 +; GFX9-DL-NEXT: s_bfe_i32 s8, s0, 0x40008 +; GFX9-DL-NEXT: v_mov_b32_e32 v3, s9 +; GFX9-DL-NEXT: v_mov_b32_e32 v7, s7 ; GFX9-DL-NEXT: v_lshlrev_b16_e64 v4, 12, s2 -; GFX9-DL-NEXT: v_lshlrev_b16_e64 v5, 12, s4 -; GFX9-DL-NEXT: v_mul_i32_i24_e32 v3, s9, v3 -; GFX9-DL-NEXT: s_bfe_i32 s12, s1, 0x40010 +; GFX9-DL-NEXT: v_lshlrev_b16_e64 v5, 12, s3 +; GFX9-DL-NEXT: v_mul_i32_i24_e32 v3, s8, v3 +; GFX9-DL-NEXT: s_bfe_i32 s11, s1, 0x40010 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v4, 12, v4 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v5, 12, v5 -; GFX9-DL-NEXT: s_bfe_i32 s14, s1, 0x40014 -; GFX9-DL-NEXT: s_bfe_i32 s11, s0, 0x40010 -; GFX9-DL-NEXT: v_mov_b32_e32 v8, s12 -; GFX9-DL-NEXT: s_bfe_i32 s16, s1, 0x40018 -; GFX9-DL-NEXT: s_bfe_i32 s13, s0, 0x40014 -; GFX9-DL-NEXT: v_mov_b32_e32 v9, s14 -; GFX9-DL-NEXT: s_bfe_i32 s15, s0, 0x40018 +; GFX9-DL-NEXT: s_bfe_i32 s13, s1, 0x40014 +; GFX9-DL-NEXT: s_bfe_i32 s10, s0, 0x40010 +; GFX9-DL-NEXT: v_mov_b32_e32 v8, s11 +; GFX9-DL-NEXT: s_bfe_i32 s15, s1, 0x40018 +; GFX9-DL-NEXT: s_bfe_i32 s12, s0, 0x40014 +; GFX9-DL-NEXT: v_mov_b32_e32 v9, s13 +; GFX9-DL-NEXT: s_bfe_i32 s14, s0, 0x40018 ; GFX9-DL-NEXT: s_ashr_i32 s1, s1, 28 -; GFX9-DL-NEXT: v_mov_b32_e32 v10, s16 +; GFX9-DL-NEXT: v_mov_b32_e32 v10, s15 ; GFX9-DL-NEXT: s_ashr_i32 s0, s0, 28 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_mad_i32_i24 v2, s5, v6, v2 -; GFX9-DL-NEXT: v_mad_i32_i24 v2, s7, v7, v2 +; GFX9-DL-NEXT: v_mad_i32_i24 v2, s4, v6, v2 +; GFX9-DL-NEXT: v_mad_i32_i24 v2, s6, v7, v2 ; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 ; GFX9-DL-NEXT: v_mad_u32_u24 v2, v4, v5, v2 -; GFX9-DL-NEXT: v_mad_i32_i24 v2, s11, v8, v2 -; GFX9-DL-NEXT: v_mad_i32_i24 v2, s13, v9, v2 -; GFX9-DL-NEXT: v_mad_i32_i24 v2, s15, v10, v2 +; GFX9-DL-NEXT: v_mad_i32_i24 v2, s10, v8, v2 +; GFX9-DL-NEXT: v_mad_i32_i24 v2, s12, v9, v2 +; GFX9-DL-NEXT: v_mad_i32_i24 v2, s14, v10, v2 ; GFX9-DL-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-DL-NEXT: v_mad_i32_i24 v2, s0, v3, v2 ; GFX9-DL-NEXT: global_store_short v[0:1], v2, off @@ -471,49 
+471,49 @@ ; ; GFX10-DL-LABEL: idot8_acc16: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-DL-NEXT: global_load_ushort v2, v[0:1], off ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_lshr_b32 s2, s0, 12 -; GFX10-DL-NEXT: s_lshr_b32 s4, s1, 12 -; GFX10-DL-NEXT: s_bfe_i32 s5, s0, 0x40000 -; GFX10-DL-NEXT: s_bfe_i32 s6, s1, 0x40000 -; GFX10-DL-NEXT: s_bfe_i32 s7, s0, 0x40004 +; GFX10-DL-NEXT: s_lshr_b32 s3, s1, 12 +; GFX10-DL-NEXT: s_bfe_i32 s4, s0, 0x40000 +; GFX10-DL-NEXT: s_bfe_i32 s5, s1, 0x40000 +; GFX10-DL-NEXT: s_bfe_i32 s6, s0, 0x40004 ; GFX10-DL-NEXT: v_lshlrev_b16_e64 v3, 12, s2 -; GFX10-DL-NEXT: v_lshlrev_b16_e64 v4, 12, s4 -; GFX10-DL-NEXT: s_bfe_i32 s8, s0, 0x40008 -; GFX10-DL-NEXT: s_bfe_i32 s9, s1, 0x40008 +; GFX10-DL-NEXT: v_lshlrev_b16_e64 v4, 12, s3 +; GFX10-DL-NEXT: s_bfe_i32 s7, s0, 0x40008 +; GFX10-DL-NEXT: s_bfe_i32 s8, s1, 0x40008 ; GFX10-DL-NEXT: s_bfe_i32 s2, s1, 0x40004 ; GFX10-DL-NEXT: v_ashrrev_i16_e64 v3, 12, v3 -; GFX10-DL-NEXT: s_mov_b32 s4, 0xffff +; GFX10-DL-NEXT: s_mov_b32 s3, 0xffff ; GFX10-DL-NEXT: v_ashrrev_i16_e64 v4, 12, v4 -; GFX10-DL-NEXT: v_mul_i32_i24_e64 v5, s8, s9 -; GFX10-DL-NEXT: v_and_b32_e32 v3, s4, v3 -; GFX10-DL-NEXT: v_and_b32_e32 v4, s4, v4 -; GFX10-DL-NEXT: s_bfe_i32 s4, s1, 0x40010 +; GFX10-DL-NEXT: v_mul_i32_i24_e64 v5, s7, s8 +; GFX10-DL-NEXT: v_and_b32_e32 v3, s3, v3 +; GFX10-DL-NEXT: v_and_b32_e32 v4, s3, v4 +; GFX10-DL-NEXT: s_bfe_i32 s3, s1, 0x40010 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-NEXT: v_mad_i32_i24 v2, s5, s6, v2 -; GFX10-DL-NEXT: s_bfe_i32 s5, s0, 0x40014 -; GFX10-DL-NEXT: s_bfe_i32 s6, s1, 0x40014 -; GFX10-DL-NEXT: v_mad_i32_i24 v2, s7, s2, v2 +; GFX10-DL-NEXT: v_mad_i32_i24 v2, s4, s5, v2 +; GFX10-DL-NEXT: s_bfe_i32 s4, s0, 0x40014 +; GFX10-DL-NEXT: s_bfe_i32 s5, s1, 0x40014 +; GFX10-DL-NEXT: v_mad_i32_i24 v2, s6, s2, v2 ; GFX10-DL-NEXT: s_bfe_i32 s2, s0, 0x40010 ; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 ; GFX10-DL-NEXT: v_mad_u32_u24 v2, v3, v4, v2 -; GFX10-DL-NEXT: v_mad_i32_i24 v2, s2, s4, v2 +; GFX10-DL-NEXT: v_mad_i32_i24 v2, s2, s3, v2 ; GFX10-DL-NEXT: s_bfe_i32 s2, s0, 0x40018 -; GFX10-DL-NEXT: s_bfe_i32 s4, s1, 0x40018 +; GFX10-DL-NEXT: s_bfe_i32 s3, s1, 0x40018 ; GFX10-DL-NEXT: s_ashr_i32 s0, s0, 28 ; GFX10-DL-NEXT: s_ashr_i32 s1, s1, 28 -; GFX10-DL-NEXT: v_mad_i32_i24 v2, s5, s6, v2 -; GFX10-DL-NEXT: v_mad_i32_i24 v2, s2, s4, v2 +; GFX10-DL-NEXT: v_mad_i32_i24 v2, s4, s5, v2 +; GFX10-DL-NEXT: v_mad_i32_i24 v2, s2, s3, v2 ; GFX10-DL-NEXT: v_mad_i32_i24 v2, s0, s1, v2 ; GFX10-DL-NEXT: global_store_short v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm @@ -589,66 +589,66 @@ define amdgpu_kernel void @idot8_acc8(<8 x i4> addrspace(1)* %src1, ; GFX7-LABEL: idot8_acc8: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[4:5], 
s[0:1], 0xd -; GFX7-NEXT: s_mov_b32 s7, 0xf000 -; GFX7-NEXT: s_mov_b32 s6, -1 -; GFX7-NEXT: s_movk_i32 s0, 0xff +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: s_movk_i32 s8, 0xff ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: buffer_load_ubyte v0, off, s[4:7], 0 -; GFX7-NEXT: s_load_dword s1, s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s2, s[10:11], 0x0 +; GFX7-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 +; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_bfe_i32 s8, s1, 0x40000 -; GFX7-NEXT: s_bfe_i32 s9, s2, 0x40000 -; GFX7-NEXT: s_bfe_i32 s11, s2, 0x40004 -; GFX7-NEXT: s_and_b32 s9, s9, s0 -; GFX7-NEXT: s_bfe_i32 s10, s1, 0x40004 -; GFX7-NEXT: s_bfe_i32 s13, s2, 0x40008 -; GFX7-NEXT: s_and_b32 s11, s11, s0 -; GFX7-NEXT: s_and_b32 s8, s8, s0 -; GFX7-NEXT: v_mov_b32_e32 v1, s9 -; GFX7-NEXT: s_bfe_i32 s12, s1, 0x40008 -; GFX7-NEXT: s_bfe_i32 s15, s2, 0x4000c -; GFX7-NEXT: s_and_b32 s13, s13, s0 -; GFX7-NEXT: s_and_b32 s10, s10, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s11 -; GFX7-NEXT: s_bfe_i32 s14, s1, 0x4000c -; GFX7-NEXT: s_bfe_i32 s17, s2, 0x40010 -; GFX7-NEXT: s_and_b32 s15, s15, s0 -; GFX7-NEXT: s_and_b32 s12, s12, s0 -; GFX7-NEXT: v_mov_b32_e32 v3, s13 -; GFX7-NEXT: s_bfe_i32 s16, s1, 0x40010 -; GFX7-NEXT: s_bfe_i32 s19, s2, 0x40014 -; GFX7-NEXT: s_and_b32 s17, s17, s0 -; GFX7-NEXT: s_and_b32 s14, s14, s0 -; GFX7-NEXT: v_mov_b32_e32 v4, s15 -; GFX7-NEXT: s_bfe_i32 s21, s2, 0x40018 -; GFX7-NEXT: s_bfe_i32 s18, s1, 0x40014 -; GFX7-NEXT: s_and_b32 s19, s19, s0 -; GFX7-NEXT: s_and_b32 s16, s16, s0 -; GFX7-NEXT: v_mov_b32_e32 v5, s17 -; GFX7-NEXT: s_bfe_i32 s20, s1, 0x40018 -; GFX7-NEXT: s_ashr_i32 s2, s2, 28 -; GFX7-NEXT: s_and_b32 s21, s21, s0 -; GFX7-NEXT: s_and_b32 s18, s18, s0 -; GFX7-NEXT: v_mov_b32_e32 v6, s19 -; GFX7-NEXT: s_ashr_i32 s1, s1, 28 -; GFX7-NEXT: s_and_b32 s20, s20, s0 -; GFX7-NEXT: s_and_b32 s2, s2, s0 -; GFX7-NEXT: v_mov_b32_e32 v7, s21 -; GFX7-NEXT: s_and_b32 s0, s1, s0 +; GFX7-NEXT: s_bfe_i32 s6, s4, 0x40000 +; GFX7-NEXT: s_bfe_i32 s7, s5, 0x40000 +; GFX7-NEXT: s_bfe_i32 s10, s5, 0x40004 +; GFX7-NEXT: s_and_b32 s7, s7, s8 +; GFX7-NEXT: s_bfe_i32 s9, s4, 0x40004 +; GFX7-NEXT: s_bfe_i32 s12, s5, 0x40008 +; GFX7-NEXT: s_and_b32 s10, s10, s8 +; GFX7-NEXT: s_and_b32 s6, s6, s8 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_bfe_i32 s11, s4, 0x40008 +; GFX7-NEXT: s_bfe_i32 s14, s5, 0x4000c +; GFX7-NEXT: s_and_b32 s12, s12, s8 +; GFX7-NEXT: s_and_b32 s9, s9, s8 +; GFX7-NEXT: v_mov_b32_e32 v2, s10 +; GFX7-NEXT: s_bfe_i32 s13, s4, 0x4000c +; GFX7-NEXT: s_bfe_i32 s16, s5, 0x40010 +; GFX7-NEXT: s_and_b32 s14, s14, s8 +; GFX7-NEXT: s_and_b32 s11, s11, s8 +; GFX7-NEXT: v_mov_b32_e32 v3, s12 +; GFX7-NEXT: s_bfe_i32 s15, s4, 0x40010 +; GFX7-NEXT: s_bfe_i32 s18, s5, 0x40014 +; GFX7-NEXT: s_and_b32 s16, s16, s8 +; GFX7-NEXT: s_and_b32 s13, s13, s8 +; GFX7-NEXT: v_mov_b32_e32 v4, s14 +; GFX7-NEXT: s_bfe_i32 s20, s5, 0x40018 +; GFX7-NEXT: s_bfe_i32 s17, s4, 0x40014 +; GFX7-NEXT: s_and_b32 s18, s18, s8 +; GFX7-NEXT: s_and_b32 s15, s15, s8 +; GFX7-NEXT: v_mov_b32_e32 v5, s16 +; GFX7-NEXT: s_bfe_i32 s19, s4, 0x40018 +; GFX7-NEXT: s_ashr_i32 s5, s5, 28 +; GFX7-NEXT: s_and_b32 s20, s20, s8 +; GFX7-NEXT: s_and_b32 s17, s17, s8 +; GFX7-NEXT: v_mov_b32_e32 v6, s18 +; GFX7-NEXT: s_ashr_i32 s4, s4, 28 +; GFX7-NEXT: s_and_b32 s19, s19, s8 +; GFX7-NEXT: s_and_b32 s5, s5, s8 +; GFX7-NEXT: 
v_mov_b32_e32 v7, s20 +; GFX7-NEXT: s_and_b32 s4, s4, s8 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mad_u32_u24 v0, s8, v1, v0 -; GFX7-NEXT: v_mad_u32_u24 v0, s10, v2, v0 -; GFX7-NEXT: v_mad_u32_u24 v0, s12, v3, v0 -; GFX7-NEXT: v_mad_u32_u24 v0, s14, v4, v0 -; GFX7-NEXT: v_mad_u32_u24 v0, s16, v5, v0 -; GFX7-NEXT: v_mad_u32_u24 v0, s18, v6, v0 -; GFX7-NEXT: v_mad_u32_u24 v0, s20, v7, v0 -; GFX7-NEXT: v_mov_b32_e32 v1, s2 -; GFX7-NEXT: v_mad_u32_u24 v0, s0, v1, v0 -; GFX7-NEXT: buffer_store_byte v0, off, s[4:7], 0 +; GFX7-NEXT: v_mad_u32_u24 v0, s6, v1, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, s9, v2, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, s11, v3, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, s13, v4, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, s15, v5, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, s17, v6, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, s19, v7, v0 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mad_u32_u24 v0, s4, v1, v0 +; GFX7-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: idot8_acc8: @@ -657,50 +657,50 @@ ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: s_movk_i32 s2, 0xff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s6, s[6:7], 0x0 +; GFX8-NEXT: s_load_dword s3, s[6:7], 0x0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_load_ubyte v2, v[0:1] ; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_bfe_i32 s7, s6, 0x40000 -; GFX8-NEXT: s_lshr_b32 s4, s6, 12 -; GFX8-NEXT: s_bfe_i32 s9, s6, 0x40004 -; GFX8-NEXT: s_bfe_i32 s11, s6, 0x40008 +; GFX8-NEXT: s_bfe_i32 s6, s3, 0x40000 +; GFX8-NEXT: s_lshr_b32 s4, s3, 12 +; GFX8-NEXT: s_bfe_i32 s8, s3, 0x40004 +; GFX8-NEXT: s_bfe_i32 s10, s3, 0x40008 ; GFX8-NEXT: s_lshr_b32 s1, s0, 12 ; GFX8-NEXT: s_bfe_i32 s5, s0, 0x40000 -; GFX8-NEXT: v_mov_b32_e32 v6, s7 +; GFX8-NEXT: v_mov_b32_e32 v6, s6 ; GFX8-NEXT: v_lshlrev_b16_e64 v4, 12, s1 ; GFX8-NEXT: v_lshlrev_b16_e64 v5, 12, s4 -; GFX8-NEXT: s_bfe_i32 s8, s0, 0x40004 -; GFX8-NEXT: s_bfe_i32 s10, s0, 0x40008 -; GFX8-NEXT: v_mov_b32_e32 v3, s11 -; GFX8-NEXT: v_mov_b32_e32 v7, s9 +; GFX8-NEXT: s_bfe_i32 s7, s0, 0x40004 +; GFX8-NEXT: s_bfe_i32 s9, s0, 0x40008 +; GFX8-NEXT: v_mov_b32_e32 v3, s10 +; GFX8-NEXT: v_mov_b32_e32 v7, s8 ; GFX8-NEXT: v_ashrrev_i16_e32 v4, 12, v4 ; GFX8-NEXT: v_ashrrev_i16_e32 v5, 12, v5 -; GFX8-NEXT: v_mul_i32_i24_e32 v3, s10, v3 -; GFX8-NEXT: s_bfe_i32 s13, s6, 0x40010 +; GFX8-NEXT: v_mul_i32_i24_e32 v3, s9, v3 +; GFX8-NEXT: s_bfe_i32 s12, s3, 0x40010 ; GFX8-NEXT: v_and_b32_e32 v4, s2, v4 ; GFX8-NEXT: v_and_b32_e32 v5, s2, v5 -; GFX8-NEXT: s_bfe_i32 s15, s6, 0x40014 -; GFX8-NEXT: s_bfe_i32 s12, s0, 0x40010 -; GFX8-NEXT: v_mov_b32_e32 v8, s13 -; GFX8-NEXT: s_bfe_i32 s17, s6, 0x40018 -; GFX8-NEXT: s_bfe_i32 s14, s0, 0x40014 -; GFX8-NEXT: v_mov_b32_e32 v9, s15 -; GFX8-NEXT: s_bfe_i32 s16, s0, 0x40018 -; GFX8-NEXT: s_ashr_i32 s6, s6, 28 -; GFX8-NEXT: v_mov_b32_e32 v10, s17 +; GFX8-NEXT: s_bfe_i32 s14, s3, 0x40014 +; GFX8-NEXT: s_bfe_i32 s11, s0, 0x40010 +; GFX8-NEXT: v_mov_b32_e32 v8, s12 +; GFX8-NEXT: s_bfe_i32 s16, s3, 0x40018 +; GFX8-NEXT: s_bfe_i32 s13, s0, 0x40014 +; GFX8-NEXT: v_mov_b32_e32 v9, s14 +; GFX8-NEXT: s_bfe_i32 s15, s0, 0x40018 +; GFX8-NEXT: s_ashr_i32 s3, s3, 28 +; GFX8-NEXT: v_mov_b32_e32 v10, s16 ; GFX8-NEXT: s_ashr_i32 s0, s0, 28 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mad_i32_i24 v2, s5, v6, v2 -; GFX8-NEXT: v_mad_i32_i24 v2, s8, v7, v2 +; GFX8-NEXT: v_mad_i32_i24 v2, s7, v7, v2 ; GFX8-NEXT: v_add_u32_sdwa v2, vcc, v3, v2 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 ; GFX8-NEXT: v_mad_u32_u24 v2, v4, v5, v2 -; GFX8-NEXT: v_mad_i32_i24 v2, s12, v8, v2 -; GFX8-NEXT: v_mad_i32_i24 v2, s14, v9, v2 -; GFX8-NEXT: v_mad_i32_i24 v2, s16, v10, v2 -; GFX8-NEXT: v_mov_b32_e32 v3, s6 +; GFX8-NEXT: v_mad_i32_i24 v2, s11, v8, v2 +; GFX8-NEXT: v_mad_i32_i24 v2, s13, v9, v2 +; GFX8-NEXT: v_mad_i32_i24 v2, s15, v10, v2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 ; GFX8-NEXT: v_mad_i32_i24 v2, s0, v3, v2 ; GFX8-NEXT: flat_store_byte v[0:1], v2 ; GFX8-NEXT: s_endpgm @@ -711,50 +711,50 @@ ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-NEXT: s_movk_i32 s2, 0xff ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s6, s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s3, s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: global_load_ubyte v2, v[0:1], off ; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_bfe_i32 s7, s6, 0x40000 -; GFX9-NEXT: s_lshr_b32 s4, s6, 12 -; GFX9-NEXT: s_bfe_i32 s9, s6, 0x40004 -; GFX9-NEXT: s_bfe_i32 s11, s6, 0x40008 +; GFX9-NEXT: s_bfe_i32 s6, s3, 0x40000 +; GFX9-NEXT: s_lshr_b32 s4, s3, 12 +; GFX9-NEXT: s_bfe_i32 s8, s3, 0x40004 +; GFX9-NEXT: s_bfe_i32 s10, s3, 0x40008 ; GFX9-NEXT: s_lshr_b32 s1, s0, 12 ; GFX9-NEXT: s_bfe_i32 s5, s0, 0x40000 -; GFX9-NEXT: v_mov_b32_e32 v6, s7 +; GFX9-NEXT: v_mov_b32_e32 v6, s6 ; GFX9-NEXT: v_lshlrev_b16_e64 v4, 12, s1 ; GFX9-NEXT: v_lshlrev_b16_e64 v5, 12, s4 -; GFX9-NEXT: s_bfe_i32 s8, s0, 0x40004 -; GFX9-NEXT: s_bfe_i32 s10, s0, 0x40008 -; GFX9-NEXT: v_mov_b32_e32 v3, s11 -; GFX9-NEXT: v_mov_b32_e32 v7, s9 +; GFX9-NEXT: s_bfe_i32 s7, s0, 0x40004 +; GFX9-NEXT: s_bfe_i32 s9, s0, 0x40008 +; GFX9-NEXT: v_mov_b32_e32 v3, s10 +; GFX9-NEXT: v_mov_b32_e32 v7, s8 ; GFX9-NEXT: v_ashrrev_i16_e32 v4, 12, v4 ; GFX9-NEXT: v_ashrrev_i16_e32 v5, 12, v5 -; GFX9-NEXT: v_mul_i32_i24_e32 v3, s10, v3 -; GFX9-NEXT: s_bfe_i32 s13, s6, 0x40010 +; GFX9-NEXT: v_mul_i32_i24_e32 v3, s9, v3 +; GFX9-NEXT: s_bfe_i32 s12, s3, 0x40010 ; GFX9-NEXT: v_and_b32_e32 v4, s2, v4 ; GFX9-NEXT: v_and_b32_e32 v5, s2, v5 -; GFX9-NEXT: s_bfe_i32 s15, s6, 0x40014 -; GFX9-NEXT: s_bfe_i32 s12, s0, 0x40010 -; GFX9-NEXT: v_mov_b32_e32 v8, s13 -; GFX9-NEXT: s_bfe_i32 s17, s6, 0x40018 -; GFX9-NEXT: s_bfe_i32 s14, s0, 0x40014 -; GFX9-NEXT: v_mov_b32_e32 v9, s15 -; GFX9-NEXT: s_bfe_i32 s16, s0, 0x40018 -; GFX9-NEXT: s_ashr_i32 s6, s6, 28 -; GFX9-NEXT: v_mov_b32_e32 v10, s17 +; GFX9-NEXT: s_bfe_i32 s14, s3, 0x40014 +; GFX9-NEXT: s_bfe_i32 s11, s0, 0x40010 +; GFX9-NEXT: v_mov_b32_e32 v8, s12 +; GFX9-NEXT: s_bfe_i32 s16, s3, 0x40018 +; GFX9-NEXT: s_bfe_i32 s13, s0, 0x40014 +; GFX9-NEXT: v_mov_b32_e32 v9, s14 +; GFX9-NEXT: s_bfe_i32 s15, s0, 0x40018 +; GFX9-NEXT: s_ashr_i32 s3, s3, 28 +; GFX9-NEXT: v_mov_b32_e32 v10, s16 ; GFX9-NEXT: s_ashr_i32 s0, s0, 28 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mad_i32_i24 v2, s5, v6, v2 -; GFX9-NEXT: v_mad_i32_i24 v2, s8, v7, v2 +; GFX9-NEXT: v_mad_i32_i24 v2, s7, v7, v2 ; GFX9-NEXT: v_add_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 ; GFX9-NEXT: v_mad_u32_u24 v2, v4, v5, v2 -; GFX9-NEXT: v_mad_i32_i24 v2, s12, v8, v2 -; GFX9-NEXT: v_mad_i32_i24 v2, s14, v9, v2 -; GFX9-NEXT: v_mad_i32_i24 v2, s16, v10, v2 -; GFX9-NEXT: v_mov_b32_e32 v3, s6 +; GFX9-NEXT: v_mad_i32_i24 v2, s11, v8, v2 +; GFX9-NEXT: v_mad_i32_i24 v2, s13, v9, v2 +; GFX9-NEXT: v_mad_i32_i24 v2, s15, v10, v2 +; GFX9-NEXT: v_mov_b32_e32 v3, s3 ; GFX9-NEXT: v_mad_i32_i24 v2, s0, v3, v2 ; 
GFX9-NEXT: global_store_byte v[0:1], v2, off ; GFX9-NEXT: s_endpgm @@ -765,99 +765,99 @@ ; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-DL-NEXT: s_movk_i32 s2, 0xff ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s6, s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-DL-NEXT: global_load_ubyte v2, v[0:1], off ; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_bfe_i32 s7, s6, 0x40000 -; GFX9-DL-NEXT: s_lshr_b32 s4, s6, 12 -; GFX9-DL-NEXT: s_bfe_i32 s9, s6, 0x40004 -; GFX9-DL-NEXT: s_bfe_i32 s11, s6, 0x40008 +; GFX9-DL-NEXT: s_bfe_i32 s6, s3, 0x40000 +; GFX9-DL-NEXT: s_lshr_b32 s4, s3, 12 +; GFX9-DL-NEXT: s_bfe_i32 s8, s3, 0x40004 +; GFX9-DL-NEXT: s_bfe_i32 s10, s3, 0x40008 ; GFX9-DL-NEXT: s_lshr_b32 s1, s0, 12 ; GFX9-DL-NEXT: s_bfe_i32 s5, s0, 0x40000 -; GFX9-DL-NEXT: v_mov_b32_e32 v6, s7 +; GFX9-DL-NEXT: v_mov_b32_e32 v6, s6 ; GFX9-DL-NEXT: v_lshlrev_b16_e64 v4, 12, s1 ; GFX9-DL-NEXT: v_lshlrev_b16_e64 v5, 12, s4 -; GFX9-DL-NEXT: s_bfe_i32 s8, s0, 0x40004 -; GFX9-DL-NEXT: s_bfe_i32 s10, s0, 0x40008 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s11 -; GFX9-DL-NEXT: v_mov_b32_e32 v7, s9 +; GFX9-DL-NEXT: s_bfe_i32 s7, s0, 0x40004 +; GFX9-DL-NEXT: s_bfe_i32 s9, s0, 0x40008 +; GFX9-DL-NEXT: v_mov_b32_e32 v3, s10 +; GFX9-DL-NEXT: v_mov_b32_e32 v7, s8 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v4, 12, v4 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v5, 12, v5 -; GFX9-DL-NEXT: v_mul_i32_i24_e32 v3, s10, v3 -; GFX9-DL-NEXT: s_bfe_i32 s13, s6, 0x40010 +; GFX9-DL-NEXT: v_mul_i32_i24_e32 v3, s9, v3 +; GFX9-DL-NEXT: s_bfe_i32 s12, s3, 0x40010 ; GFX9-DL-NEXT: v_and_b32_e32 v4, s2, v4 ; GFX9-DL-NEXT: v_and_b32_e32 v5, s2, v5 -; GFX9-DL-NEXT: s_bfe_i32 s15, s6, 0x40014 -; GFX9-DL-NEXT: s_bfe_i32 s12, s0, 0x40010 -; GFX9-DL-NEXT: v_mov_b32_e32 v8, s13 -; GFX9-DL-NEXT: s_bfe_i32 s17, s6, 0x40018 -; GFX9-DL-NEXT: s_bfe_i32 s14, s0, 0x40014 -; GFX9-DL-NEXT: v_mov_b32_e32 v9, s15 -; GFX9-DL-NEXT: s_bfe_i32 s16, s0, 0x40018 -; GFX9-DL-NEXT: s_ashr_i32 s6, s6, 28 -; GFX9-DL-NEXT: v_mov_b32_e32 v10, s17 +; GFX9-DL-NEXT: s_bfe_i32 s14, s3, 0x40014 +; GFX9-DL-NEXT: s_bfe_i32 s11, s0, 0x40010 +; GFX9-DL-NEXT: v_mov_b32_e32 v8, s12 +; GFX9-DL-NEXT: s_bfe_i32 s16, s3, 0x40018 +; GFX9-DL-NEXT: s_bfe_i32 s13, s0, 0x40014 +; GFX9-DL-NEXT: v_mov_b32_e32 v9, s14 +; GFX9-DL-NEXT: s_bfe_i32 s15, s0, 0x40018 +; GFX9-DL-NEXT: s_ashr_i32 s3, s3, 28 +; GFX9-DL-NEXT: v_mov_b32_e32 v10, s16 ; GFX9-DL-NEXT: s_ashr_i32 s0, s0, 28 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) ; GFX9-DL-NEXT: v_mad_i32_i24 v2, s5, v6, v2 -; GFX9-DL-NEXT: v_mad_i32_i24 v2, s8, v7, v2 +; GFX9-DL-NEXT: v_mad_i32_i24 v2, s7, v7, v2 ; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 ; GFX9-DL-NEXT: v_mad_u32_u24 v2, v4, v5, v2 -; GFX9-DL-NEXT: v_mad_i32_i24 v2, s12, v8, v2 -; GFX9-DL-NEXT: v_mad_i32_i24 v2, s14, v9, v2 -; GFX9-DL-NEXT: v_mad_i32_i24 v2, s16, v10, v2 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s6 +; GFX9-DL-NEXT: v_mad_i32_i24 v2, s11, v8, v2 +; GFX9-DL-NEXT: v_mad_i32_i24 v2, s13, v9, v2 +; GFX9-DL-NEXT: v_mad_i32_i24 v2, s15, v10, v2 +; GFX9-DL-NEXT: v_mov_b32_e32 v3, s3 ; GFX9-DL-NEXT: v_mad_i32_i24 v2, s0, v3, v2 ; GFX9-DL-NEXT: global_store_byte v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: idot8_acc8: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; 
GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-DL-NEXT: global_load_ubyte v2, v[0:1], off ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_lshr_b32 s2, s0, 12 -; GFX10-DL-NEXT: s_lshr_b32 s4, s1, 12 -; GFX10-DL-NEXT: s_bfe_i32 s5, s0, 0x40000 -; GFX10-DL-NEXT: s_bfe_i32 s6, s1, 0x40000 -; GFX10-DL-NEXT: s_bfe_i32 s7, s0, 0x40004 +; GFX10-DL-NEXT: s_lshr_b32 s3, s1, 12 +; GFX10-DL-NEXT: s_bfe_i32 s4, s0, 0x40000 +; GFX10-DL-NEXT: s_bfe_i32 s5, s1, 0x40000 +; GFX10-DL-NEXT: s_bfe_i32 s6, s0, 0x40004 ; GFX10-DL-NEXT: v_lshlrev_b16_e64 v3, 12, s2 -; GFX10-DL-NEXT: v_lshlrev_b16_e64 v4, 12, s4 -; GFX10-DL-NEXT: s_bfe_i32 s8, s0, 0x40008 -; GFX10-DL-NEXT: s_bfe_i32 s9, s1, 0x40008 +; GFX10-DL-NEXT: v_lshlrev_b16_e64 v4, 12, s3 +; GFX10-DL-NEXT: s_bfe_i32 s7, s0, 0x40008 +; GFX10-DL-NEXT: s_bfe_i32 s8, s1, 0x40008 ; GFX10-DL-NEXT: s_bfe_i32 s2, s1, 0x40004 ; GFX10-DL-NEXT: v_ashrrev_i16_e64 v3, 12, v3 -; GFX10-DL-NEXT: s_movk_i32 s4, 0xff +; GFX10-DL-NEXT: s_movk_i32 s3, 0xff ; GFX10-DL-NEXT: v_ashrrev_i16_e64 v4, 12, v4 -; GFX10-DL-NEXT: v_mul_i32_i24_e64 v5, s8, s9 -; GFX10-DL-NEXT: v_and_b32_e32 v3, s4, v3 -; GFX10-DL-NEXT: v_and_b32_e32 v4, s4, v4 -; GFX10-DL-NEXT: s_bfe_i32 s4, s1, 0x40010 +; GFX10-DL-NEXT: v_mul_i32_i24_e64 v5, s7, s8 +; GFX10-DL-NEXT: v_and_b32_e32 v3, s3, v3 +; GFX10-DL-NEXT: v_and_b32_e32 v4, s3, v4 +; GFX10-DL-NEXT: s_bfe_i32 s3, s1, 0x40010 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-NEXT: v_mad_i32_i24 v2, s5, s6, v2 -; GFX10-DL-NEXT: s_bfe_i32 s5, s0, 0x40014 -; GFX10-DL-NEXT: s_bfe_i32 s6, s1, 0x40014 -; GFX10-DL-NEXT: v_mad_i32_i24 v2, s7, s2, v2 +; GFX10-DL-NEXT: v_mad_i32_i24 v2, s4, s5, v2 +; GFX10-DL-NEXT: s_bfe_i32 s4, s0, 0x40014 +; GFX10-DL-NEXT: s_bfe_i32 s5, s1, 0x40014 +; GFX10-DL-NEXT: v_mad_i32_i24 v2, s6, s2, v2 ; GFX10-DL-NEXT: s_bfe_i32 s2, s0, 0x40010 ; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 ; GFX10-DL-NEXT: v_mad_u32_u24 v2, v3, v4, v2 -; GFX10-DL-NEXT: v_mad_i32_i24 v2, s2, s4, v2 +; GFX10-DL-NEXT: v_mad_i32_i24 v2, s2, s3, v2 ; GFX10-DL-NEXT: s_bfe_i32 s2, s0, 0x40018 -; GFX10-DL-NEXT: s_bfe_i32 s4, s1, 0x40018 +; GFX10-DL-NEXT: s_bfe_i32 s3, s1, 0x40018 ; GFX10-DL-NEXT: s_ashr_i32 s0, s0, 28 ; GFX10-DL-NEXT: s_ashr_i32 s1, s1, 28 -; GFX10-DL-NEXT: v_mad_i32_i24 v2, s5, s6, v2 -; GFX10-DL-NEXT: v_mad_i32_i24 v2, s2, s4, v2 +; GFX10-DL-NEXT: v_mad_i32_i24 v2, s4, s5, v2 +; GFX10-DL-NEXT: v_mad_i32_i24 v2, s2, s3, v2 ; GFX10-DL-NEXT: v_mad_i32_i24 v2, s0, s1, v2 ; GFX10-DL-NEXT: global_store_byte v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm @@ -934,51 +934,51 @@ define amdgpu_kernel void @idot8_multiuses_mul1(<8 x i4> addrspace(1)* %src1, ; GFX7-LABEL: idot8_multiuses_mul1: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; GFX7-NEXT: s_mov_b32 s7, 0xf000 -; GFX7-NEXT: s_mov_b32 s6, -1 +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], 
s[0:1], 0xd +; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_load_dword s0, s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s1, s[10:11], 0x0 -; GFX7-NEXT: s_load_dword s21, s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0 +; GFX7-NEXT: s_load_dword s20, s[0:1], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_bfe_i32 s2, s0, 0x40000 -; GFX7-NEXT: s_bfe_i32 s8, s1, 0x40000 -; GFX7-NEXT: v_mov_b32_e32 v0, s8 -; GFX7-NEXT: v_mov_b32_e32 v1, s21 -; GFX7-NEXT: v_mad_i32_i24 v1, s2, v0, v1 -; GFX7-NEXT: s_bfe_i32 s10, s1, 0x40004 -; GFX7-NEXT: s_bfe_i32 s9, s0, 0x40004 -; GFX7-NEXT: s_bfe_i32 s12, s1, 0x40008 -; GFX7-NEXT: v_mad_i32_i24 v0, s2, v0, v1 -; GFX7-NEXT: v_mov_b32_e32 v2, s10 -; GFX7-NEXT: v_mad_i32_i24 v0, s9, v2, v0 -; GFX7-NEXT: s_bfe_i32 s11, s0, 0x40008 -; GFX7-NEXT: v_mov_b32_e32 v2, s12 -; GFX7-NEXT: s_bfe_i32 s14, s1, 0x4000c -; GFX7-NEXT: v_mad_i32_i24 v0, s11, v2, v0 -; GFX7-NEXT: s_bfe_i32 s13, s0, 0x4000c -; GFX7-NEXT: v_mov_b32_e32 v2, s14 -; GFX7-NEXT: s_bfe_i32 s16, s1, 0x40010 -; GFX7-NEXT: v_mad_i32_i24 v0, s13, v2, v0 -; GFX7-NEXT: s_bfe_i32 s15, s0, 0x40010 -; GFX7-NEXT: v_mov_b32_e32 v2, s16 -; GFX7-NEXT: s_bfe_i32 s18, s1, 0x40014 -; GFX7-NEXT: s_bfe_i32 s20, s1, 0x40018 -; GFX7-NEXT: v_mad_i32_i24 v0, s15, v2, v0 -; GFX7-NEXT: s_bfe_i32 s17, s0, 0x40014 -; GFX7-NEXT: v_mov_b32_e32 v2, s18 -; GFX7-NEXT: s_bfe_i32 s19, s0, 0x40018 -; GFX7-NEXT: v_mad_i32_i24 v0, s17, v2, v0 -; GFX7-NEXT: v_mov_b32_e32 v2, s20 -; GFX7-NEXT: s_ashr_i32 s1, s1, 28 -; GFX7-NEXT: v_mad_i32_i24 v0, s19, v2, v0 -; GFX7-NEXT: s_ashr_i32 s0, s0, 28 -; GFX7-NEXT: v_mov_b32_e32 v2, s1 -; GFX7-NEXT: v_mad_i32_i24 v0, s0, v2, v0 +; GFX7-NEXT: s_bfe_i32 s6, s4, 0x40000 +; GFX7-NEXT: s_bfe_i32 s7, s5, 0x40000 +; GFX7-NEXT: v_mov_b32_e32 v0, s7 +; GFX7-NEXT: v_mov_b32_e32 v1, s20 +; GFX7-NEXT: v_mad_i32_i24 v1, s6, v0, v1 +; GFX7-NEXT: s_bfe_i32 s9, s5, 0x40004 +; GFX7-NEXT: s_bfe_i32 s8, s4, 0x40004 +; GFX7-NEXT: s_bfe_i32 s11, s5, 0x40008 +; GFX7-NEXT: v_mad_i32_i24 v0, s6, v0, v1 +; GFX7-NEXT: v_mov_b32_e32 v2, s9 +; GFX7-NEXT: v_mad_i32_i24 v0, s8, v2, v0 +; GFX7-NEXT: s_bfe_i32 s10, s4, 0x40008 +; GFX7-NEXT: v_mov_b32_e32 v2, s11 +; GFX7-NEXT: s_bfe_i32 s13, s5, 0x4000c +; GFX7-NEXT: v_mad_i32_i24 v0, s10, v2, v0 +; GFX7-NEXT: s_bfe_i32 s12, s4, 0x4000c +; GFX7-NEXT: v_mov_b32_e32 v2, s13 +; GFX7-NEXT: s_bfe_i32 s15, s5, 0x40010 +; GFX7-NEXT: v_mad_i32_i24 v0, s12, v2, v0 +; GFX7-NEXT: s_bfe_i32 s14, s4, 0x40010 +; GFX7-NEXT: v_mov_b32_e32 v2, s15 +; GFX7-NEXT: s_bfe_i32 s17, s5, 0x40014 +; GFX7-NEXT: s_bfe_i32 s19, s5, 0x40018 +; GFX7-NEXT: v_mad_i32_i24 v0, s14, v2, v0 +; GFX7-NEXT: s_bfe_i32 s16, s4, 0x40014 +; GFX7-NEXT: v_mov_b32_e32 v2, s17 +; GFX7-NEXT: s_bfe_i32 s18, s4, 0x40018 +; GFX7-NEXT: v_mad_i32_i24 v0, s16, v2, v0 +; GFX7-NEXT: v_mov_b32_e32 v2, s19 +; GFX7-NEXT: s_ashr_i32 s5, s5, 28 +; GFX7-NEXT: v_mad_i32_i24 v0, s18, v2, v0 +; GFX7-NEXT: s_ashr_i32 s4, s4, 28 +; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: v_mad_i32_i24 v0, s4, v2, v0 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: idot8_multiuses_mul1: @@ -987,42 +987,42 @@ ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX8-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX8-NEXT: s_load_dword s19, 
s[0:1], 0x0 +; GFX8-NEXT: s_load_dword s3, s[6:7], 0x0 +; GFX8-NEXT: s_load_dword s18, s[0:1], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_bfe_i32 s5, s2, 0x40000 -; GFX8-NEXT: s_bfe_i32 s6, s4, 0x40000 -; GFX8-NEXT: v_mov_b32_e32 v0, s6 -; GFX8-NEXT: v_mov_b32_e32 v1, s19 -; GFX8-NEXT: v_mad_i32_i24 v1, s5, v0, v1 -; GFX8-NEXT: s_bfe_i32 s8, s4, 0x40004 -; GFX8-NEXT: s_bfe_i32 s7, s2, 0x40004 -; GFX8-NEXT: s_bfe_i32 s10, s4, 0x40008 -; GFX8-NEXT: v_mad_i32_i24 v0, s5, v0, v1 -; GFX8-NEXT: v_mov_b32_e32 v2, s8 -; GFX8-NEXT: v_mad_i32_i24 v0, s7, v2, v0 -; GFX8-NEXT: s_bfe_i32 s9, s2, 0x40008 -; GFX8-NEXT: v_mov_b32_e32 v2, s10 -; GFX8-NEXT: s_bfe_i32 s12, s4, 0x4000c -; GFX8-NEXT: v_mad_i32_i24 v0, s9, v2, v0 -; GFX8-NEXT: s_bfe_i32 s11, s2, 0x4000c -; GFX8-NEXT: v_mov_b32_e32 v2, s12 -; GFX8-NEXT: s_bfe_i32 s14, s4, 0x40010 -; GFX8-NEXT: v_mad_i32_i24 v0, s11, v2, v0 -; GFX8-NEXT: s_bfe_i32 s13, s2, 0x40010 -; GFX8-NEXT: v_mov_b32_e32 v2, s14 -; GFX8-NEXT: s_bfe_i32 s16, s4, 0x40014 -; GFX8-NEXT: s_bfe_i32 s18, s4, 0x40018 -; GFX8-NEXT: v_mad_i32_i24 v0, s13, v2, v0 -; GFX8-NEXT: s_bfe_i32 s15, s2, 0x40014 -; GFX8-NEXT: v_mov_b32_e32 v2, s16 -; GFX8-NEXT: s_bfe_i32 s17, s2, 0x40018 -; GFX8-NEXT: v_mad_i32_i24 v0, s15, v2, v0 -; GFX8-NEXT: v_mov_b32_e32 v2, s18 -; GFX8-NEXT: s_ashr_i32 s4, s4, 28 -; GFX8-NEXT: v_mad_i32_i24 v0, s17, v2, v0 +; GFX8-NEXT: s_bfe_i32 s4, s2, 0x40000 +; GFX8-NEXT: s_bfe_i32 s5, s3, 0x40000 +; GFX8-NEXT: v_mov_b32_e32 v0, s5 +; GFX8-NEXT: v_mov_b32_e32 v1, s18 +; GFX8-NEXT: v_mad_i32_i24 v1, s4, v0, v1 +; GFX8-NEXT: s_bfe_i32 s7, s3, 0x40004 +; GFX8-NEXT: s_bfe_i32 s6, s2, 0x40004 +; GFX8-NEXT: s_bfe_i32 s9, s3, 0x40008 +; GFX8-NEXT: v_mad_i32_i24 v0, s4, v0, v1 +; GFX8-NEXT: v_mov_b32_e32 v2, s7 +; GFX8-NEXT: v_mad_i32_i24 v0, s6, v2, v0 +; GFX8-NEXT: s_bfe_i32 s8, s2, 0x40008 +; GFX8-NEXT: v_mov_b32_e32 v2, s9 +; GFX8-NEXT: s_bfe_i32 s11, s3, 0x4000c +; GFX8-NEXT: v_mad_i32_i24 v0, s8, v2, v0 +; GFX8-NEXT: s_bfe_i32 s10, s2, 0x4000c +; GFX8-NEXT: v_mov_b32_e32 v2, s11 +; GFX8-NEXT: s_bfe_i32 s13, s3, 0x40010 +; GFX8-NEXT: v_mad_i32_i24 v0, s10, v2, v0 +; GFX8-NEXT: s_bfe_i32 s12, s2, 0x40010 +; GFX8-NEXT: v_mov_b32_e32 v2, s13 +; GFX8-NEXT: s_bfe_i32 s15, s3, 0x40014 +; GFX8-NEXT: s_bfe_i32 s17, s3, 0x40018 +; GFX8-NEXT: v_mad_i32_i24 v0, s12, v2, v0 +; GFX8-NEXT: s_bfe_i32 s14, s2, 0x40014 +; GFX8-NEXT: v_mov_b32_e32 v2, s15 +; GFX8-NEXT: s_bfe_i32 s16, s2, 0x40018 +; GFX8-NEXT: v_mad_i32_i24 v0, s14, v2, v0 +; GFX8-NEXT: v_mov_b32_e32 v2, s17 +; GFX8-NEXT: s_ashr_i32 s3, s3, 28 +; GFX8-NEXT: v_mad_i32_i24 v0, s16, v2, v0 ; GFX8-NEXT: s_ashr_i32 s2, s2, 28 -; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: v_mov_b32_e32 v2, s3 ; GFX8-NEXT: v_mad_i32_i24 v0, s2, v2, v0 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v0, v1 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 @@ -1036,42 +1036,42 @@ ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX9-NEXT: s_load_dword s19, s[0:1], 0x0 +; GFX9-NEXT: s_load_dword s3, s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s18, s[0:1], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_bfe_i32 s5, s2, 0x40000 -; GFX9-NEXT: s_bfe_i32 s6, s4, 0x40000 -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s19 -; GFX9-NEXT: v_mad_i32_i24 v1, s5, v0, v1 -; GFX9-NEXT: s_bfe_i32 s8, s4, 0x40004 -; GFX9-NEXT: s_bfe_i32 s7, s2, 0x40004 -; GFX9-NEXT: s_bfe_i32 s10, s4, 0x40008 -; GFX9-NEXT: v_mad_i32_i24 v0, s5, v0, v1 -; 
GFX9-NEXT: v_mov_b32_e32 v2, s8 -; GFX9-NEXT: v_mad_i32_i24 v0, s7, v2, v0 -; GFX9-NEXT: s_bfe_i32 s9, s2, 0x40008 -; GFX9-NEXT: v_mov_b32_e32 v2, s10 -; GFX9-NEXT: s_bfe_i32 s12, s4, 0x4000c -; GFX9-NEXT: v_mad_i32_i24 v0, s9, v2, v0 -; GFX9-NEXT: s_bfe_i32 s11, s2, 0x4000c -; GFX9-NEXT: v_mov_b32_e32 v2, s12 -; GFX9-NEXT: s_bfe_i32 s14, s4, 0x40010 -; GFX9-NEXT: v_mad_i32_i24 v0, s11, v2, v0 -; GFX9-NEXT: s_bfe_i32 s13, s2, 0x40010 -; GFX9-NEXT: v_mov_b32_e32 v2, s14 -; GFX9-NEXT: s_bfe_i32 s16, s4, 0x40014 -; GFX9-NEXT: s_bfe_i32 s18, s4, 0x40018 -; GFX9-NEXT: v_mad_i32_i24 v0, s13, v2, v0 -; GFX9-NEXT: s_bfe_i32 s15, s2, 0x40014 -; GFX9-NEXT: v_mov_b32_e32 v2, s16 -; GFX9-NEXT: s_bfe_i32 s17, s2, 0x40018 -; GFX9-NEXT: v_mad_i32_i24 v0, s15, v2, v0 -; GFX9-NEXT: v_mov_b32_e32 v2, s18 -; GFX9-NEXT: s_ashr_i32 s4, s4, 28 -; GFX9-NEXT: v_mad_i32_i24 v0, s17, v2, v0 +; GFX9-NEXT: s_bfe_i32 s4, s2, 0x40000 +; GFX9-NEXT: s_bfe_i32 s5, s3, 0x40000 +; GFX9-NEXT: v_mov_b32_e32 v0, s5 +; GFX9-NEXT: v_mov_b32_e32 v1, s18 +; GFX9-NEXT: v_mad_i32_i24 v1, s4, v0, v1 +; GFX9-NEXT: s_bfe_i32 s7, s3, 0x40004 +; GFX9-NEXT: s_bfe_i32 s6, s2, 0x40004 +; GFX9-NEXT: s_bfe_i32 s9, s3, 0x40008 +; GFX9-NEXT: v_mad_i32_i24 v0, s4, v0, v1 +; GFX9-NEXT: v_mov_b32_e32 v2, s7 +; GFX9-NEXT: v_mad_i32_i24 v0, s6, v2, v0 +; GFX9-NEXT: s_bfe_i32 s8, s2, 0x40008 +; GFX9-NEXT: v_mov_b32_e32 v2, s9 +; GFX9-NEXT: s_bfe_i32 s11, s3, 0x4000c +; GFX9-NEXT: v_mad_i32_i24 v0, s8, v2, v0 +; GFX9-NEXT: s_bfe_i32 s10, s2, 0x4000c +; GFX9-NEXT: v_mov_b32_e32 v2, s11 +; GFX9-NEXT: s_bfe_i32 s13, s3, 0x40010 +; GFX9-NEXT: v_mad_i32_i24 v0, s10, v2, v0 +; GFX9-NEXT: s_bfe_i32 s12, s2, 0x40010 +; GFX9-NEXT: v_mov_b32_e32 v2, s13 +; GFX9-NEXT: s_bfe_i32 s15, s3, 0x40014 +; GFX9-NEXT: s_bfe_i32 s17, s3, 0x40018 +; GFX9-NEXT: v_mad_i32_i24 v0, s12, v2, v0 +; GFX9-NEXT: s_bfe_i32 s14, s2, 0x40014 +; GFX9-NEXT: v_mov_b32_e32 v2, s15 +; GFX9-NEXT: s_bfe_i32 s16, s2, 0x40018 +; GFX9-NEXT: v_mad_i32_i24 v0, s14, v2, v0 +; GFX9-NEXT: v_mov_b32_e32 v2, s17 +; GFX9-NEXT: s_ashr_i32 s3, s3, 28 +; GFX9-NEXT: v_mad_i32_i24 v0, s16, v2, v0 ; GFX9-NEXT: s_ashr_i32 s2, s2, 28 -; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_mov_b32_e32 v2, s3 ; GFX9-NEXT: v_mad_i32_i24 v0, s2, v2, v0 ; GFX9-NEXT: v_add_u32_e32 v2, v1, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 @@ -1085,42 +1085,42 @@ ; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX9-DL-NEXT: s_load_dword s19, s[0:1], 0x0 +; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dword s18, s[0:1], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_bfe_i32 s5, s2, 0x40000 -; GFX9-DL-NEXT: s_bfe_i32 s6, s4, 0x40000 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s19 -; GFX9-DL-NEXT: v_mad_i32_i24 v1, s5, v0, v1 -; GFX9-DL-NEXT: s_bfe_i32 s8, s4, 0x40004 -; GFX9-DL-NEXT: s_bfe_i32 s7, s2, 0x40004 -; GFX9-DL-NEXT: s_bfe_i32 s10, s4, 0x40008 -; GFX9-DL-NEXT: v_mad_i32_i24 v0, s5, v0, v1 -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s8 -; GFX9-DL-NEXT: v_mad_i32_i24 v0, s7, v2, v0 -; GFX9-DL-NEXT: s_bfe_i32 s9, s2, 0x40008 -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s10 -; GFX9-DL-NEXT: s_bfe_i32 s12, s4, 0x4000c -; GFX9-DL-NEXT: v_mad_i32_i24 v0, s9, v2, v0 -; GFX9-DL-NEXT: s_bfe_i32 s11, s2, 0x4000c -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s12 -; GFX9-DL-NEXT: s_bfe_i32 s14, s4, 0x40010 -; GFX9-DL-NEXT: v_mad_i32_i24 v0, s11, v2, v0 -; GFX9-DL-NEXT: 
s_bfe_i32 s13, s2, 0x40010 -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s14 -; GFX9-DL-NEXT: s_bfe_i32 s16, s4, 0x40014 -; GFX9-DL-NEXT: s_bfe_i32 s18, s4, 0x40018 -; GFX9-DL-NEXT: v_mad_i32_i24 v0, s13, v2, v0 -; GFX9-DL-NEXT: s_bfe_i32 s15, s2, 0x40014 -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s16 -; GFX9-DL-NEXT: s_bfe_i32 s17, s2, 0x40018 -; GFX9-DL-NEXT: v_mad_i32_i24 v0, s15, v2, v0 -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s18 -; GFX9-DL-NEXT: s_ashr_i32 s4, s4, 28 -; GFX9-DL-NEXT: v_mad_i32_i24 v0, s17, v2, v0 +; GFX9-DL-NEXT: s_bfe_i32 s4, s2, 0x40000 +; GFX9-DL-NEXT: s_bfe_i32 s5, s3, 0x40000 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s5 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s18 +; GFX9-DL-NEXT: v_mad_i32_i24 v1, s4, v0, v1 +; GFX9-DL-NEXT: s_bfe_i32 s7, s3, 0x40004 +; GFX9-DL-NEXT: s_bfe_i32 s6, s2, 0x40004 +; GFX9-DL-NEXT: s_bfe_i32 s9, s3, 0x40008 +; GFX9-DL-NEXT: v_mad_i32_i24 v0, s4, v0, v1 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s7 +; GFX9-DL-NEXT: v_mad_i32_i24 v0, s6, v2, v0 +; GFX9-DL-NEXT: s_bfe_i32 s8, s2, 0x40008 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s9 +; GFX9-DL-NEXT: s_bfe_i32 s11, s3, 0x4000c +; GFX9-DL-NEXT: v_mad_i32_i24 v0, s8, v2, v0 +; GFX9-DL-NEXT: s_bfe_i32 s10, s2, 0x4000c +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s11 +; GFX9-DL-NEXT: s_bfe_i32 s13, s3, 0x40010 +; GFX9-DL-NEXT: v_mad_i32_i24 v0, s10, v2, v0 +; GFX9-DL-NEXT: s_bfe_i32 s12, s2, 0x40010 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s13 +; GFX9-DL-NEXT: s_bfe_i32 s15, s3, 0x40014 +; GFX9-DL-NEXT: s_bfe_i32 s17, s3, 0x40018 +; GFX9-DL-NEXT: v_mad_i32_i24 v0, s12, v2, v0 +; GFX9-DL-NEXT: s_bfe_i32 s14, s2, 0x40014 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s15 +; GFX9-DL-NEXT: s_bfe_i32 s16, s2, 0x40018 +; GFX9-DL-NEXT: v_mad_i32_i24 v0, s14, v2, v0 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s17 +; GFX9-DL-NEXT: s_ashr_i32 s3, s3, 28 +; GFX9-DL-NEXT: v_mad_i32_i24 v0, s16, v2, v0 ; GFX9-DL-NEXT: s_ashr_i32 s2, s2, 28 -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s3 ; GFX9-DL-NEXT: v_mad_i32_i24 v0, s2, v2, v0 ; GFX9-DL-NEXT: v_add_u32_e32 v2, v1, v0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 @@ -1135,35 +1135,35 @@ ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-DL-NEXT: s_load_dword s5, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dword s3, s[6:7], 0x0 +; GFX10-DL-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_bfe_i32 s6, s2, 0x40000 -; GFX10-DL-NEXT: s_bfe_i32 s7, s4, 0x40000 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s5 -; GFX10-DL-NEXT: s_bfe_i32 s5, s2, 0x40004 -; GFX10-DL-NEXT: s_bfe_i32 s8, s4, 0x40004 -; GFX10-DL-NEXT: v_mad_i32_i24 v0, s6, s7, v0 -; GFX10-DL-NEXT: v_mad_i32_i24 v1, s6, s7, v0 -; GFX10-DL-NEXT: s_bfe_i32 s6, s2, 0x40008 -; GFX10-DL-NEXT: s_bfe_i32 s7, s4, 0x40008 -; GFX10-DL-NEXT: v_mad_i32_i24 v1, s5, s8, v1 -; GFX10-DL-NEXT: s_bfe_i32 s5, s2, 0x4000c -; GFX10-DL-NEXT: s_bfe_i32 s8, s4, 0x4000c -; GFX10-DL-NEXT: v_mad_i32_i24 v1, s6, s7, v1 -; GFX10-DL-NEXT: s_bfe_i32 s6, s2, 0x40010 -; GFX10-DL-NEXT: s_bfe_i32 s7, s4, 0x40010 -; GFX10-DL-NEXT: v_mad_i32_i24 v1, s5, s8, v1 -; GFX10-DL-NEXT: s_bfe_i32 s5, s2, 0x40014 -; GFX10-DL-NEXT: s_bfe_i32 s8, s4, 0x40014 -; GFX10-DL-NEXT: v_mad_i32_i24 v1, s6, s7, v1 -; GFX10-DL-NEXT: s_bfe_i32 s6, s2, 0x40018 -; GFX10-DL-NEXT: s_bfe_i32 s7, s4, 0x40018 +; GFX10-DL-NEXT: s_bfe_i32 s5, s2, 0x40000 +; GFX10-DL-NEXT: s_bfe_i32 s6, s3, 0x40000 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-DL-NEXT: s_bfe_i32 s4, s2, 0x40004 +; 
GFX10-DL-NEXT: s_bfe_i32 s7, s3, 0x40004 +; GFX10-DL-NEXT: v_mad_i32_i24 v0, s5, s6, v0 +; GFX10-DL-NEXT: v_mad_i32_i24 v1, s5, s6, v0 +; GFX10-DL-NEXT: s_bfe_i32 s5, s2, 0x40008 +; GFX10-DL-NEXT: s_bfe_i32 s6, s3, 0x40008 +; GFX10-DL-NEXT: v_mad_i32_i24 v1, s4, s7, v1 +; GFX10-DL-NEXT: s_bfe_i32 s4, s2, 0x4000c +; GFX10-DL-NEXT: s_bfe_i32 s7, s3, 0x4000c +; GFX10-DL-NEXT: v_mad_i32_i24 v1, s5, s6, v1 +; GFX10-DL-NEXT: s_bfe_i32 s5, s2, 0x40010 +; GFX10-DL-NEXT: s_bfe_i32 s6, s3, 0x40010 +; GFX10-DL-NEXT: v_mad_i32_i24 v1, s4, s7, v1 +; GFX10-DL-NEXT: s_bfe_i32 s4, s2, 0x40014 +; GFX10-DL-NEXT: s_bfe_i32 s7, s3, 0x40014 +; GFX10-DL-NEXT: v_mad_i32_i24 v1, s5, s6, v1 +; GFX10-DL-NEXT: s_bfe_i32 s5, s2, 0x40018 +; GFX10-DL-NEXT: s_bfe_i32 s6, s3, 0x40018 ; GFX10-DL-NEXT: s_ashr_i32 s2, s2, 28 -; GFX10-DL-NEXT: s_ashr_i32 s4, s4, 28 -; GFX10-DL-NEXT: v_mad_i32_i24 v1, s5, s8, v1 -; GFX10-DL-NEXT: v_mad_i32_i24 v1, s6, s7, v1 -; GFX10-DL-NEXT: v_mad_i32_i24 v1, s2, s4, v1 +; GFX10-DL-NEXT: s_ashr_i32 s3, s3, 28 +; GFX10-DL-NEXT: v_mad_i32_i24 v1, s4, s7, v1 +; GFX10-DL-NEXT: v_mad_i32_i24 v1, s5, s6, v1 +; GFX10-DL-NEXT: v_mad_i32_i24 v1, s2, s3, v1 ; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v0, v1 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 @@ -1243,64 +1243,64 @@ define amdgpu_kernel void @idot8_acc32_vecMul(<8 x i4> addrspace(1)* %src1, ; GFX7-LABEL: idot8_acc32_vecMul: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; GFX7-NEXT: s_mov_b32 s7, 0xf000 -; GFX7-NEXT: s_mov_b32 s6, -1 +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_load_dword s1, s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s9, s[10:11], 0x0 +; GFX7-NEXT: s_load_dword s5, s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s7, s[6:7], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_ashr_i64 s[10:11], s[0:1], 60 -; GFX7-NEXT: s_lshl_b32 s11, s1, 4 -; GFX7-NEXT: s_ashr_i64 s[16:17], s[10:11], 60 -; GFX7-NEXT: s_lshl_b32 s11, s1, 16 -; GFX7-NEXT: s_ashr_i64 s[18:19], s[10:11], 60 -; GFX7-NEXT: s_lshl_b32 s11, s1, 20 -; GFX7-NEXT: s_lshl_b32 s13, s1, 8 -; GFX7-NEXT: s_lshl_b32 s15, s1, 12 -; GFX7-NEXT: s_ashr_i64 s[20:21], s[10:11], 60 -; GFX7-NEXT: s_lshl_b32 s11, s1, 24 -; GFX7-NEXT: s_lshl_b32 s1, s1, 28 -; GFX7-NEXT: s_ashr_i64 s[0:1], s[0:1], 60 -; GFX7-NEXT: s_lshl_b32 s1, s9, 4 -; GFX7-NEXT: s_ashr_i64 s[26:27], s[0:1], 60 -; GFX7-NEXT: s_lshl_b32 s1, s9, 8 -; GFX7-NEXT: s_ashr_i64 s[28:29], s[0:1], 60 -; GFX7-NEXT: s_lshl_b32 s1, s9, 12 -; GFX7-NEXT: s_ashr_i64 s[30:31], s[0:1], 60 -; GFX7-NEXT: s_lshl_b32 s1, s9, 16 -; GFX7-NEXT: s_ashr_i64 s[32:33], s[0:1], 60 -; GFX7-NEXT: s_lshl_b32 s1, s9, 20 -; GFX7-NEXT: s_ashr_i64 s[34:35], s[0:1], 60 -; GFX7-NEXT: s_lshl_b32 s1, s9, 24 -; GFX7-NEXT: s_ashr_i64 s[36:37], s[0:1], 60 -; GFX7-NEXT: s_lshl_b32 s1, s9, 28 -; GFX7-NEXT: s_ashr_i64 s[24:25], s[8:9], 60 -; GFX7-NEXT: s_ashr_i64 s[8:9], s[0:1], 60 -; GFX7-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX7-NEXT: v_mov_b32_e32 v0, s8 -; GFX7-NEXT: s_ashr_i64 s[22:23], s[10:11], 60 -; GFX7-NEXT: s_ashr_i64 s[14:15], s[14:15], 60 +; GFX7-NEXT: s_ashr_i64 s[8:9], s[4:5], 60 +; GFX7-NEXT: s_lshl_b32 s9, s5, 4 +; GFX7-NEXT: s_ashr_i64 s[14:15], s[8:9], 60 +; GFX7-NEXT: s_lshl_b32 s9, s5, 16 +; GFX7-NEXT: s_ashr_i64 s[16:17], s[8:9], 60 +; GFX7-NEXT: s_lshl_b32 s9, s5, 20 +; GFX7-NEXT: 
s_lshl_b32 s11, s5, 8 +; GFX7-NEXT: s_lshl_b32 s13, s5, 12 +; GFX7-NEXT: s_ashr_i64 s[18:19], s[8:9], 60 +; GFX7-NEXT: s_lshl_b32 s9, s5, 24 +; GFX7-NEXT: s_lshl_b32 s5, s5, 28 +; GFX7-NEXT: s_ashr_i64 s[4:5], s[4:5], 60 +; GFX7-NEXT: s_lshl_b32 s5, s7, 4 +; GFX7-NEXT: s_ashr_i64 s[24:25], s[4:5], 60 +; GFX7-NEXT: s_lshl_b32 s5, s7, 8 +; GFX7-NEXT: s_ashr_i64 s[26:27], s[4:5], 60 +; GFX7-NEXT: s_lshl_b32 s5, s7, 12 +; GFX7-NEXT: s_ashr_i64 s[28:29], s[4:5], 60 +; GFX7-NEXT: s_lshl_b32 s5, s7, 16 +; GFX7-NEXT: s_ashr_i64 s[30:31], s[4:5], 60 +; GFX7-NEXT: s_lshl_b32 s5, s7, 20 +; GFX7-NEXT: s_ashr_i64 s[32:33], s[4:5], 60 +; GFX7-NEXT: s_lshl_b32 s5, s7, 24 +; GFX7-NEXT: s_ashr_i64 s[34:35], s[4:5], 60 +; GFX7-NEXT: s_lshl_b32 s5, s7, 28 +; GFX7-NEXT: s_ashr_i64 s[22:23], s[6:7], 60 +; GFX7-NEXT: s_ashr_i64 s[6:7], s[4:5], 60 +; GFX7-NEXT: s_load_dword s5, s[0:1], 0x0 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_ashr_i64 s[20:21], s[8:9], 60 ; GFX7-NEXT: s_ashr_i64 s[12:13], s[12:13], 60 +; GFX7-NEXT: s_ashr_i64 s[10:11], s[10:11], 60 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: v_mad_i32_i24 v0, s0, v0, v1 -; GFX7-NEXT: v_mov_b32_e32 v1, s36 -; GFX7-NEXT: v_mad_i32_i24 v0, s22, v1, v0 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mad_i32_i24 v0, s4, v0, v1 ; GFX7-NEXT: v_mov_b32_e32 v1, s34 ; GFX7-NEXT: v_mad_i32_i24 v0, s20, v1, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, s32 ; GFX7-NEXT: v_mad_i32_i24 v0, s18, v1, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, s30 -; GFX7-NEXT: v_mad_i32_i24 v0, s14, v1, v0 +; GFX7-NEXT: v_mad_i32_i24 v0, s16, v1, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, s28 ; GFX7-NEXT: v_mad_i32_i24 v0, s12, v1, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, s26 -; GFX7-NEXT: v_mad_i32_i24 v0, s16, v1, v0 -; GFX7-NEXT: v_mov_b32_e32 v1, s24 ; GFX7-NEXT: v_mad_i32_i24 v0, s10, v1, v0 -; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX7-NEXT: v_mov_b32_e32 v1, s24 +; GFX7-NEXT: v_mad_i32_i24 v0, s14, v1, v0 +; GFX7-NEXT: v_mov_b32_e32 v1, s22 +; GFX7-NEXT: v_mad_i32_i24 v0, s8, v1, v0 +; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: idot8_acc32_vecMul: @@ -1308,57 +1308,58 @@ ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s5, s[4:5], 0x0 -; GFX8-NEXT: s_load_dword s7, s[6:7], 0x0 -; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX8-NEXT: s_load_dword s3, s[4:5], 0x0 +; GFX8-NEXT: s_load_dword s5, s[6:7], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_ashr_i64 s[8:9], s[4:5], 60 -; GFX8-NEXT: s_lshl_b32 s9, s5, 4 -; GFX8-NEXT: s_ashr_i64 s[16:17], s[8:9], 60 -; GFX8-NEXT: s_lshl_b32 s9, s5, 20 -; GFX8-NEXT: s_lshl_b32 s11, s5, 8 -; GFX8-NEXT: s_lshl_b32 s13, s5, 12 -; GFX8-NEXT: s_lshl_b32 s15, s5, 16 -; GFX8-NEXT: s_ashr_i64 s[18:19], s[8:9], 60 -; GFX8-NEXT: s_lshl_b32 s9, s5, 24 -; GFX8-NEXT: s_lshl_b32 s5, s5, 28 -; GFX8-NEXT: s_ashr_i64 s[4:5], s[4:5], 60 -; GFX8-NEXT: s_lshl_b32 s5, s7, 4 -; GFX8-NEXT: s_ashr_i64 s[24:25], s[4:5], 60 -; GFX8-NEXT: s_lshl_b32 s5, s7, 8 -; GFX8-NEXT: s_ashr_i64 s[26:27], s[4:5], 60 -; GFX8-NEXT: s_lshl_b32 s5, s7, 12 -; GFX8-NEXT: s_ashr_i64 s[28:29], s[4:5], 60 -; GFX8-NEXT: s_lshl_b32 s5, s7, 16 -; GFX8-NEXT: s_ashr_i64 s[30:31], s[4:5], 60 -; GFX8-NEXT: s_lshl_b32 s5, s7, 20 -; GFX8-NEXT: s_ashr_i64 s[32:33], s[4:5], 60 -; GFX8-NEXT: s_lshl_b32 s5, s7, 24 -; GFX8-NEXT: s_ashr_i64 s[34:35], s[4:5], 60 -; GFX8-NEXT: s_lshl_b32 s5, s7, 28 -; GFX8-NEXT: s_ashr_i64 
s[22:23], s[6:7], 60 -; GFX8-NEXT: s_ashr_i64 s[6:7], s[4:5], 60 -; GFX8-NEXT: v_mov_b32_e32 v0, s6 -; GFX8-NEXT: v_mov_b32_e32 v1, s2 -; GFX8-NEXT: v_mad_i32_i24 v0, s4, v0, v1 -; GFX8-NEXT: s_ashr_i64 s[20:21], s[8:9], 60 -; GFX8-NEXT: v_mov_b32_e32 v1, s34 -; GFX8-NEXT: v_mad_i32_i24 v0, s20, v1, v0 +; GFX8-NEXT: s_ashr_i64 s[6:7], s[2:3], 60 +; GFX8-NEXT: s_lshl_b32 s7, s3, 4 +; GFX8-NEXT: s_ashr_i64 s[14:15], s[6:7], 60 +; GFX8-NEXT: s_lshl_b32 s7, s3, 20 +; GFX8-NEXT: s_lshl_b32 s9, s3, 8 +; GFX8-NEXT: s_lshl_b32 s11, s3, 12 +; GFX8-NEXT: s_lshl_b32 s13, s3, 16 +; GFX8-NEXT: s_ashr_i64 s[16:17], s[6:7], 60 +; GFX8-NEXT: s_lshl_b32 s7, s3, 24 +; GFX8-NEXT: s_lshl_b32 s3, s3, 28 +; GFX8-NEXT: s_ashr_i64 s[2:3], s[2:3], 60 +; GFX8-NEXT: s_lshl_b32 s3, s5, 4 +; GFX8-NEXT: s_ashr_i64 s[22:23], s[2:3], 60 +; GFX8-NEXT: s_lshl_b32 s3, s5, 8 +; GFX8-NEXT: s_ashr_i64 s[24:25], s[2:3], 60 +; GFX8-NEXT: s_lshl_b32 s3, s5, 12 +; GFX8-NEXT: s_ashr_i64 s[26:27], s[2:3], 60 +; GFX8-NEXT: s_lshl_b32 s3, s5, 16 +; GFX8-NEXT: s_ashr_i64 s[28:29], s[2:3], 60 +; GFX8-NEXT: s_lshl_b32 s3, s5, 20 +; GFX8-NEXT: s_ashr_i64 s[30:31], s[2:3], 60 +; GFX8-NEXT: s_lshl_b32 s3, s5, 24 +; GFX8-NEXT: s_ashr_i64 s[32:33], s[2:3], 60 +; GFX8-NEXT: s_lshl_b32 s3, s5, 28 +; GFX8-NEXT: s_ashr_i64 s[20:21], s[4:5], 60 +; GFX8-NEXT: s_ashr_i64 s[4:5], s[2:3], 60 +; GFX8-NEXT: s_load_dword s3, s[0:1], 0x0 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: s_ashr_i64 s[18:19], s[6:7], 60 +; GFX8-NEXT: s_ashr_i64 s[12:13], s[12:13], 60 +; GFX8-NEXT: s_ashr_i64 s[10:11], s[10:11], 60 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_mad_i32_i24 v0, s2, v0, v1 ; GFX8-NEXT: v_mov_b32_e32 v1, s32 ; GFX8-NEXT: v_mad_i32_i24 v0, s18, v1, v0 -; GFX8-NEXT: s_ashr_i64 s[14:15], s[14:15], 60 ; GFX8-NEXT: v_mov_b32_e32 v1, s30 -; GFX8-NEXT: v_mad_i32_i24 v0, s14, v1, v0 -; GFX8-NEXT: s_ashr_i64 s[12:13], s[12:13], 60 +; GFX8-NEXT: v_mad_i32_i24 v0, s16, v1, v0 ; GFX8-NEXT: v_mov_b32_e32 v1, s28 ; GFX8-NEXT: v_mad_i32_i24 v0, s12, v1, v0 -; GFX8-NEXT: s_ashr_i64 s[10:11], s[10:11], 60 ; GFX8-NEXT: v_mov_b32_e32 v1, s26 ; GFX8-NEXT: v_mad_i32_i24 v0, s10, v1, v0 +; GFX8-NEXT: s_ashr_i64 s[8:9], s[8:9], 60 ; GFX8-NEXT: v_mov_b32_e32 v1, s24 -; GFX8-NEXT: v_mad_i32_i24 v0, s16, v1, v0 +; GFX8-NEXT: v_mad_i32_i24 v0, s8, v1, v0 ; GFX8-NEXT: v_mov_b32_e32 v1, s22 -; GFX8-NEXT: v_mad_i32_i24 v2, s8, v1, v0 +; GFX8-NEXT: v_mad_i32_i24 v0, s14, v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v1, s20 +; GFX8-NEXT: v_mad_i32_i24 v2, s6, v1, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_store_dword v[0:1], v2 @@ -1369,57 +1370,58 @@ ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s5, s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s7, s[6:7], 0x0 -; GFX9-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-NEXT: s_load_dword s3, s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s5, s[6:7], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_ashr_i64 s[8:9], s[4:5], 60 -; GFX9-NEXT: s_lshl_b32 s9, s5, 4 -; GFX9-NEXT: s_ashr_i64 s[16:17], s[8:9], 60 -; GFX9-NEXT: s_lshl_b32 s9, s5, 20 -; GFX9-NEXT: s_lshl_b32 s11, s5, 8 -; GFX9-NEXT: s_lshl_b32 s13, s5, 12 -; GFX9-NEXT: s_lshl_b32 s15, s5, 16 -; GFX9-NEXT: s_ashr_i64 s[18:19], s[8:9], 60 -; GFX9-NEXT: s_lshl_b32 s9, s5, 24 -; GFX9-NEXT: s_lshl_b32 s5, s5, 28 -; GFX9-NEXT: s_ashr_i64 s[4:5], s[4:5], 60 -; GFX9-NEXT: s_lshl_b32 s5, s7, 4 -; GFX9-NEXT: 
s_ashr_i64 s[24:25], s[4:5], 60 -; GFX9-NEXT: s_lshl_b32 s5, s7, 8 -; GFX9-NEXT: s_ashr_i64 s[26:27], s[4:5], 60 -; GFX9-NEXT: s_lshl_b32 s5, s7, 12 -; GFX9-NEXT: s_ashr_i64 s[28:29], s[4:5], 60 -; GFX9-NEXT: s_lshl_b32 s5, s7, 16 -; GFX9-NEXT: s_ashr_i64 s[30:31], s[4:5], 60 -; GFX9-NEXT: s_lshl_b32 s5, s7, 20 -; GFX9-NEXT: s_ashr_i64 s[32:33], s[4:5], 60 -; GFX9-NEXT: s_lshl_b32 s5, s7, 24 -; GFX9-NEXT: s_ashr_i64 s[34:35], s[4:5], 60 -; GFX9-NEXT: s_lshl_b32 s5, s7, 28 -; GFX9-NEXT: s_ashr_i64 s[22:23], s[6:7], 60 -; GFX9-NEXT: s_ashr_i64 s[6:7], s[4:5], 60 -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NEXT: v_mad_i32_i24 v0, s4, v0, v1 -; GFX9-NEXT: s_ashr_i64 s[20:21], s[8:9], 60 -; GFX9-NEXT: v_mov_b32_e32 v1, s34 -; GFX9-NEXT: v_mad_i32_i24 v0, s20, v1, v0 +; GFX9-NEXT: s_ashr_i64 s[6:7], s[2:3], 60 +; GFX9-NEXT: s_lshl_b32 s7, s3, 4 +; GFX9-NEXT: s_ashr_i64 s[14:15], s[6:7], 60 +; GFX9-NEXT: s_lshl_b32 s7, s3, 20 +; GFX9-NEXT: s_lshl_b32 s9, s3, 8 +; GFX9-NEXT: s_lshl_b32 s11, s3, 12 +; GFX9-NEXT: s_lshl_b32 s13, s3, 16 +; GFX9-NEXT: s_ashr_i64 s[16:17], s[6:7], 60 +; GFX9-NEXT: s_lshl_b32 s7, s3, 24 +; GFX9-NEXT: s_lshl_b32 s3, s3, 28 +; GFX9-NEXT: s_ashr_i64 s[2:3], s[2:3], 60 +; GFX9-NEXT: s_lshl_b32 s3, s5, 4 +; GFX9-NEXT: s_ashr_i64 s[22:23], s[2:3], 60 +; GFX9-NEXT: s_lshl_b32 s3, s5, 8 +; GFX9-NEXT: s_ashr_i64 s[24:25], s[2:3], 60 +; GFX9-NEXT: s_lshl_b32 s3, s5, 12 +; GFX9-NEXT: s_ashr_i64 s[26:27], s[2:3], 60 +; GFX9-NEXT: s_lshl_b32 s3, s5, 16 +; GFX9-NEXT: s_ashr_i64 s[28:29], s[2:3], 60 +; GFX9-NEXT: s_lshl_b32 s3, s5, 20 +; GFX9-NEXT: s_ashr_i64 s[30:31], s[2:3], 60 +; GFX9-NEXT: s_lshl_b32 s3, s5, 24 +; GFX9-NEXT: s_ashr_i64 s[32:33], s[2:3], 60 +; GFX9-NEXT: s_lshl_b32 s3, s5, 28 +; GFX9-NEXT: s_ashr_i64 s[20:21], s[4:5], 60 +; GFX9-NEXT: s_ashr_i64 s[4:5], s[2:3], 60 +; GFX9-NEXT: s_load_dword s3, s[0:1], 0x0 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: s_ashr_i64 s[18:19], s[6:7], 60 +; GFX9-NEXT: s_ashr_i64 s[12:13], s[12:13], 60 +; GFX9-NEXT: s_ashr_i64 s[10:11], s[10:11], 60 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_mad_i32_i24 v0, s2, v0, v1 ; GFX9-NEXT: v_mov_b32_e32 v1, s32 ; GFX9-NEXT: v_mad_i32_i24 v0, s18, v1, v0 -; GFX9-NEXT: s_ashr_i64 s[14:15], s[14:15], 60 ; GFX9-NEXT: v_mov_b32_e32 v1, s30 -; GFX9-NEXT: v_mad_i32_i24 v0, s14, v1, v0 -; GFX9-NEXT: s_ashr_i64 s[12:13], s[12:13], 60 +; GFX9-NEXT: v_mad_i32_i24 v0, s16, v1, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, s28 ; GFX9-NEXT: v_mad_i32_i24 v0, s12, v1, v0 -; GFX9-NEXT: s_ashr_i64 s[10:11], s[10:11], 60 ; GFX9-NEXT: v_mov_b32_e32 v1, s26 ; GFX9-NEXT: v_mad_i32_i24 v0, s10, v1, v0 +; GFX9-NEXT: s_ashr_i64 s[8:9], s[8:9], 60 ; GFX9-NEXT: v_mov_b32_e32 v1, s24 -; GFX9-NEXT: v_mad_i32_i24 v0, s16, v1, v0 +; GFX9-NEXT: v_mad_i32_i24 v0, s8, v1, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, s22 -; GFX9-NEXT: v_mad_i32_i24 v2, s8, v1, v0 +; GFX9-NEXT: v_mad_i32_i24 v0, s14, v1, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, s20 +; GFX9-NEXT: v_mad_i32_i24 v2, s6, v1, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: global_store_dword v[0:1], v2, off @@ -1430,57 +1432,58 @@ ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s5, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s7, s[6:7], 0x0 -; GFX9-DL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-DL-NEXT: s_load_dword s3, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s5, 
s[6:7], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_ashr_i64 s[8:9], s[4:5], 60 -; GFX9-DL-NEXT: s_lshl_b32 s9, s5, 4 -; GFX9-DL-NEXT: s_ashr_i64 s[16:17], s[8:9], 60 -; GFX9-DL-NEXT: s_lshl_b32 s9, s5, 20 -; GFX9-DL-NEXT: s_lshl_b32 s11, s5, 8 -; GFX9-DL-NEXT: s_lshl_b32 s13, s5, 12 -; GFX9-DL-NEXT: s_lshl_b32 s15, s5, 16 -; GFX9-DL-NEXT: s_ashr_i64 s[18:19], s[8:9], 60 -; GFX9-DL-NEXT: s_lshl_b32 s9, s5, 24 -; GFX9-DL-NEXT: s_lshl_b32 s5, s5, 28 -; GFX9-DL-NEXT: s_ashr_i64 s[4:5], s[4:5], 60 -; GFX9-DL-NEXT: s_lshl_b32 s5, s7, 4 -; GFX9-DL-NEXT: s_ashr_i64 s[24:25], s[4:5], 60 -; GFX9-DL-NEXT: s_lshl_b32 s5, s7, 8 -; GFX9-DL-NEXT: s_ashr_i64 s[26:27], s[4:5], 60 -; GFX9-DL-NEXT: s_lshl_b32 s5, s7, 12 -; GFX9-DL-NEXT: s_ashr_i64 s[28:29], s[4:5], 60 -; GFX9-DL-NEXT: s_lshl_b32 s5, s7, 16 -; GFX9-DL-NEXT: s_ashr_i64 s[30:31], s[4:5], 60 -; GFX9-DL-NEXT: s_lshl_b32 s5, s7, 20 -; GFX9-DL-NEXT: s_ashr_i64 s[32:33], s[4:5], 60 -; GFX9-DL-NEXT: s_lshl_b32 s5, s7, 24 -; GFX9-DL-NEXT: s_ashr_i64 s[34:35], s[4:5], 60 -; GFX9-DL-NEXT: s_lshl_b32 s5, s7, 28 -; GFX9-DL-NEXT: s_ashr_i64 s[22:23], s[6:7], 60 -; GFX9-DL-NEXT: s_ashr_i64 s[6:7], s[4:5], 60 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-DL-NEXT: v_mad_i32_i24 v0, s4, v0, v1 -; GFX9-DL-NEXT: s_ashr_i64 s[20:21], s[8:9], 60 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s34 -; GFX9-DL-NEXT: v_mad_i32_i24 v0, s20, v1, v0 +; GFX9-DL-NEXT: s_ashr_i64 s[6:7], s[2:3], 60 +; GFX9-DL-NEXT: s_lshl_b32 s7, s3, 4 +; GFX9-DL-NEXT: s_ashr_i64 s[14:15], s[6:7], 60 +; GFX9-DL-NEXT: s_lshl_b32 s7, s3, 20 +; GFX9-DL-NEXT: s_lshl_b32 s9, s3, 8 +; GFX9-DL-NEXT: s_lshl_b32 s11, s3, 12 +; GFX9-DL-NEXT: s_lshl_b32 s13, s3, 16 +; GFX9-DL-NEXT: s_ashr_i64 s[16:17], s[6:7], 60 +; GFX9-DL-NEXT: s_lshl_b32 s7, s3, 24 +; GFX9-DL-NEXT: s_lshl_b32 s3, s3, 28 +; GFX9-DL-NEXT: s_ashr_i64 s[2:3], s[2:3], 60 +; GFX9-DL-NEXT: s_lshl_b32 s3, s5, 4 +; GFX9-DL-NEXT: s_ashr_i64 s[22:23], s[2:3], 60 +; GFX9-DL-NEXT: s_lshl_b32 s3, s5, 8 +; GFX9-DL-NEXT: s_ashr_i64 s[24:25], s[2:3], 60 +; GFX9-DL-NEXT: s_lshl_b32 s3, s5, 12 +; GFX9-DL-NEXT: s_ashr_i64 s[26:27], s[2:3], 60 +; GFX9-DL-NEXT: s_lshl_b32 s3, s5, 16 +; GFX9-DL-NEXT: s_ashr_i64 s[28:29], s[2:3], 60 +; GFX9-DL-NEXT: s_lshl_b32 s3, s5, 20 +; GFX9-DL-NEXT: s_ashr_i64 s[30:31], s[2:3], 60 +; GFX9-DL-NEXT: s_lshl_b32 s3, s5, 24 +; GFX9-DL-NEXT: s_ashr_i64 s[32:33], s[2:3], 60 +; GFX9-DL-NEXT: s_lshl_b32 s3, s5, 28 +; GFX9-DL-NEXT: s_ashr_i64 s[20:21], s[4:5], 60 +; GFX9-DL-NEXT: s_ashr_i64 s[4:5], s[2:3], 60 +; GFX9-DL-NEXT: s_load_dword s3, s[0:1], 0x0 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-DL-NEXT: s_ashr_i64 s[18:19], s[6:7], 60 +; GFX9-DL-NEXT: s_ashr_i64 s[12:13], s[12:13], 60 +; GFX9-DL-NEXT: s_ashr_i64 s[10:11], s[10:11], 60 +; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-DL-NEXT: v_mad_i32_i24 v0, s2, v0, v1 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s32 ; GFX9-DL-NEXT: v_mad_i32_i24 v0, s18, v1, v0 -; GFX9-DL-NEXT: s_ashr_i64 s[14:15], s[14:15], 60 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s30 -; GFX9-DL-NEXT: v_mad_i32_i24 v0, s14, v1, v0 -; GFX9-DL-NEXT: s_ashr_i64 s[12:13], s[12:13], 60 +; GFX9-DL-NEXT: v_mad_i32_i24 v0, s16, v1, v0 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s28 ; GFX9-DL-NEXT: v_mad_i32_i24 v0, s12, v1, v0 -; GFX9-DL-NEXT: s_ashr_i64 s[10:11], s[10:11], 60 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s26 ; GFX9-DL-NEXT: v_mad_i32_i24 v0, s10, v1, v0 +; GFX9-DL-NEXT: s_ashr_i64 s[8:9], s[8:9], 60 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s24 -; GFX9-DL-NEXT: 
v_mad_i32_i24 v0, s16, v1, v0 +; GFX9-DL-NEXT: v_mad_i32_i24 v0, s8, v1, v0 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s22 -; GFX9-DL-NEXT: v_mad_i32_i24 v2, s8, v1, v0 +; GFX9-DL-NEXT: v_mad_i32_i24 v0, s14, v1, v0 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s20 +; GFX9-DL-NEXT: v_mad_i32_i24 v2, s6, v1, v0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off @@ -1492,49 +1495,49 @@ ; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s5, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s7, s[6:7], 0x0 +; GFX10-DL-NEXT: s_load_dword s3, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s5, s[6:7], 0x0 ; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DL-NEXT: s_lshl_b32 s7, s3, 28 ; GFX10-DL-NEXT: s_lshl_b32 s9, s5, 28 -; GFX10-DL-NEXT: s_lshl_b32 s11, s7, 28 +; GFX10-DL-NEXT: s_lshl_b32 s11, s3, 24 ; GFX10-DL-NEXT: s_lshl_b32 s13, s5, 24 -; GFX10-DL-NEXT: s_lshl_b32 s15, s7, 24 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-DL-NEXT: s_ashr_i64 s[6:7], s[6:7], 60 ; GFX10-DL-NEXT: s_ashr_i64 s[8:9], s[8:9], 60 ; GFX10-DL-NEXT: s_ashr_i64 s[10:11], s[10:11], 60 ; GFX10-DL-NEXT: s_ashr_i64 s[12:13], s[12:13], 60 -; GFX10-DL-NEXT: s_ashr_i64 s[14:15], s[14:15], 60 +; GFX10-DL-NEXT: s_lshl_b32 s7, s3, 20 ; GFX10-DL-NEXT: s_lshl_b32 s9, s5, 20 -; GFX10-DL-NEXT: s_lshl_b32 s11, s7, 20 -; GFX10-DL-NEXT: v_mad_i32_i24 v0, s8, s10, v0 +; GFX10-DL-NEXT: v_mad_i32_i24 v0, s6, s8, v0 +; GFX10-DL-NEXT: s_lshl_b32 s11, s3, 16 ; GFX10-DL-NEXT: s_lshl_b32 s13, s5, 16 -; GFX10-DL-NEXT: s_lshl_b32 s15, s7, 16 +; GFX10-DL-NEXT: s_ashr_i64 s[6:7], s[6:7], 60 ; GFX10-DL-NEXT: s_ashr_i64 s[8:9], s[8:9], 60 +; GFX10-DL-NEXT: v_mad_i32_i24 v0, s10, s12, v0 ; GFX10-DL-NEXT: s_ashr_i64 s[10:11], s[10:11], 60 -; GFX10-DL-NEXT: v_mad_i32_i24 v0, s12, s14, v0 ; GFX10-DL-NEXT: s_ashr_i64 s[12:13], s[12:13], 60 -; GFX10-DL-NEXT: s_ashr_i64 s[14:15], s[14:15], 60 +; GFX10-DL-NEXT: s_lshl_b32 s7, s3, 12 ; GFX10-DL-NEXT: s_lshl_b32 s9, s5, 12 -; GFX10-DL-NEXT: s_lshl_b32 s11, s7, 12 -; GFX10-DL-NEXT: v_mad_i32_i24 v0, s8, s10, v0 +; GFX10-DL-NEXT: v_mad_i32_i24 v0, s6, s8, v0 +; GFX10-DL-NEXT: s_lshl_b32 s11, s3, 8 ; GFX10-DL-NEXT: s_lshl_b32 s13, s5, 8 -; GFX10-DL-NEXT: s_lshl_b32 s15, s7, 8 +; GFX10-DL-NEXT: s_ashr_i64 s[6:7], s[6:7], 60 ; GFX10-DL-NEXT: s_ashr_i64 s[8:9], s[8:9], 60 -; GFX10-DL-NEXT: s_ashr_i64 s[10:11], s[10:11], 60 -; GFX10-DL-NEXT: v_mad_i32_i24 v0, s12, s14, v0 +; GFX10-DL-NEXT: v_mad_i32_i24 v0, s10, s12, v0 +; GFX10-DL-NEXT: s_lshl_b32 s7, s3, 4 ; GFX10-DL-NEXT: s_lshl_b32 s9, s5, 4 -; GFX10-DL-NEXT: s_lshl_b32 s11, s7, 4 +; GFX10-DL-NEXT: s_ashr_i64 s[10:11], s[10:11], 60 ; GFX10-DL-NEXT: s_ashr_i64 s[12:13], s[12:13], 60 -; GFX10-DL-NEXT: s_ashr_i64 s[14:15], s[14:15], 60 -; GFX10-DL-NEXT: v_mad_i32_i24 v0, s8, s10, v0 +; GFX10-DL-NEXT: v_mad_i32_i24 v0, s6, s8, v0 +; GFX10-DL-NEXT: s_ashr_i64 s[6:7], s[6:7], 60 ; GFX10-DL-NEXT: s_ashr_i64 s[8:9], s[8:9], 60 -; GFX10-DL-NEXT: s_ashr_i64 s[10:11], s[10:11], 60 +; GFX10-DL-NEXT: s_ashr_i64 s[2:3], s[2:3], 60 ; GFX10-DL-NEXT: s_ashr_i64 s[4:5], s[4:5], 60 -; GFX10-DL-NEXT: s_ashr_i64 s[6:7], s[6:7], 60 -; GFX10-DL-NEXT: v_mad_i32_i24 v0, s12, s14, v0 -; GFX10-DL-NEXT: v_mad_i32_i24 v0, s8, s10, v0 -; GFX10-DL-NEXT: v_mad_i32_i24 v2, s4, s6, v0 +; GFX10-DL-NEXT: v_mad_i32_i24 v0, s10, s12, v0 +; GFX10-DL-NEXT: v_mad_i32_i24 v0, s6, s8, v0 +; GFX10-DL-NEXT: 
v_mad_i32_i24 v2, s2, s4, v0 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off @@ -1576,62 +1579,62 @@ define amdgpu_kernel void @idot8_acc16_vecMul(<8 x i4> addrspace(1)* %src1, ; GFX7-LABEL: idot8_acc16_vecMul: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; GFX7-NEXT: s_mov_b32 s7, 0xf000 -; GFX7-NEXT: s_mov_b32 s6, -1 -; GFX7-NEXT: s_mov_b32 s2, 0xffff +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: s_mov_b32 s8, 0xffff ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_load_dword s0, s[10:11], 0x0 -; GFX7-NEXT: buffer_load_ushort v0, off, s[4:7], 0 -; GFX7-NEXT: s_load_dword s1, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s6, s[6:7], 0x0 +; GFX7-NEXT: buffer_load_ushort v0, off, s[0:3], 0 +; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_bfe_i32 s16, s0, 0x40018 -; GFX7-NEXT: s_bfe_i32 s17, s0, 0x40014 -; GFX7-NEXT: s_bfe_i32 s18, s0, 0x40010 -; GFX7-NEXT: s_bfe_i32 s19, s0, 0x40000 -; GFX7-NEXT: s_bfe_i32 s20, s0, 0x40004 -; GFX7-NEXT: s_bfe_i32 s21, s0, 0x40008 -; GFX7-NEXT: s_ashr_i32 s15, s0, 28 -; GFX7-NEXT: s_bfe_i32 s0, s0, 0x4000c -; GFX7-NEXT: s_ashr_i32 s8, s1, 28 -; GFX7-NEXT: s_bfe_i32 s9, s1, 0x40018 -; GFX7-NEXT: s_bfe_i32 s10, s1, 0x40014 -; GFX7-NEXT: s_bfe_i32 s11, s1, 0x40010 -; GFX7-NEXT: s_bfe_i32 s12, s1, 0x40000 -; GFX7-NEXT: v_mov_b32_e32 v4, s19 -; GFX7-NEXT: s_bfe_i32 s13, s1, 0x40004 -; GFX7-NEXT: v_mov_b32_e32 v3, s20 -; GFX7-NEXT: s_bfe_i32 s14, s1, 0x40008 -; GFX7-NEXT: v_mov_b32_e32 v2, s21 -; GFX7-NEXT: s_bfe_i32 s1, s1, 0x4000c -; GFX7-NEXT: v_mov_b32_e32 v1, s0 -; GFX7-NEXT: v_mul_i32_i24_e32 v1, s1, v1 -; GFX7-NEXT: v_mul_i32_i24_e32 v2, s14, v2 -; GFX7-NEXT: v_mul_i32_i24_e32 v3, s13, v3 -; GFX7-NEXT: v_mul_i32_i24_e32 v4, s12, v4 +; GFX7-NEXT: s_bfe_i32 s15, s6, 0x40018 +; GFX7-NEXT: s_bfe_i32 s16, s6, 0x40014 +; GFX7-NEXT: s_bfe_i32 s17, s6, 0x40010 +; GFX7-NEXT: s_bfe_i32 s18, s6, 0x40000 +; GFX7-NEXT: s_bfe_i32 s19, s6, 0x40004 +; GFX7-NEXT: s_bfe_i32 s20, s6, 0x40008 +; GFX7-NEXT: s_ashr_i32 s14, s6, 28 +; GFX7-NEXT: s_bfe_i32 s6, s6, 0x4000c +; GFX7-NEXT: s_ashr_i32 s5, s4, 28 +; GFX7-NEXT: s_bfe_i32 s7, s4, 0x40018 +; GFX7-NEXT: s_bfe_i32 s9, s4, 0x40014 +; GFX7-NEXT: s_bfe_i32 s10, s4, 0x40010 +; GFX7-NEXT: s_bfe_i32 s11, s4, 0x40000 +; GFX7-NEXT: v_mov_b32_e32 v4, s18 +; GFX7-NEXT: s_bfe_i32 s12, s4, 0x40004 +; GFX7-NEXT: v_mov_b32_e32 v3, s19 +; GFX7-NEXT: s_bfe_i32 s13, s4, 0x40008 +; GFX7-NEXT: v_mov_b32_e32 v2, s20 +; GFX7-NEXT: s_bfe_i32 s4, s4, 0x4000c +; GFX7-NEXT: v_mov_b32_e32 v1, s6 +; GFX7-NEXT: v_mul_i32_i24_e32 v1, s4, v1 +; GFX7-NEXT: v_mul_i32_i24_e32 v2, s13, v2 +; GFX7-NEXT: v_mul_i32_i24_e32 v3, s12, v3 +; GFX7-NEXT: v_mul_i32_i24_e32 v4, s11, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_and_b32_e32 v2, s2, v2 +; GFX7-NEXT: v_and_b32_e32 v2, s8, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_and_b32_e32 v4, s2, v4 +; GFX7-NEXT: v_and_b32_e32 v4, s8, v4 ; GFX7-NEXT: v_or_b32_e32 v1, v2, v1 ; GFX7-NEXT: v_or_b32_e32 v2, v4, v3 ; GFX7-NEXT: v_alignbit_b32 v3, v1, v2, 16 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v1 -; GFX7-NEXT: v_mov_b32_e32 v5, s18 -; GFX7-NEXT: v_mov_b32_e32 v6, s17 -; GFX7-NEXT: v_mov_b32_e32 v7, s16 +; GFX7-NEXT: v_mov_b32_e32 v5, s17 +; GFX7-NEXT: 
v_mov_b32_e32 v6, s16 +; GFX7-NEXT: v_mov_b32_e32 v7, s15 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, v3, v0 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, v4, v0 -; GFX7-NEXT: v_mad_i32_i24 v0, s11, v5, v0 -; GFX7-NEXT: v_mad_i32_i24 v0, s10, v6, v0 -; GFX7-NEXT: v_mad_i32_i24 v0, s9, v7, v0 -; GFX7-NEXT: v_mov_b32_e32 v1, s15 -; GFX7-NEXT: v_mad_i32_i24 v0, s8, v1, v0 -; GFX7-NEXT: buffer_store_short v0, off, s[4:7], 0 +; GFX7-NEXT: v_mad_i32_i24 v0, s10, v5, v0 +; GFX7-NEXT: v_mad_i32_i24 v0, s9, v6, v0 +; GFX7-NEXT: v_mad_i32_i24 v0, s7, v7, v0 +; GFX7-NEXT: v_mov_b32_e32 v1, s14 +; GFX7-NEXT: v_mad_i32_i24 v0, s5, v1, v0 +; GFX7-NEXT: buffer_store_short v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: idot8_acc16_vecMul: @@ -1639,59 +1642,59 @@ ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s7, s[6:7], 0x0 +; GFX8-NEXT: s_load_dword s3, s[6:7], 0x0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_load_ushort v2, v[0:1] ; GFX8-NEXT: s_load_dword s1, s[4:5], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_lshl_b32 s29, s7, 28 -; GFX8-NEXT: s_ashr_i64 s[18:19], s[6:7], 60 -; GFX8-NEXT: s_lshl_b32 s21, s7, 8 -; GFX8-NEXT: s_lshl_b32 s23, s7, 12 -; GFX8-NEXT: s_lshl_b32 s17, s1, 28 -; GFX8-NEXT: s_lshl_b32 s25, s7, 16 -; GFX8-NEXT: s_lshl_b32 s27, s7, 24 -; GFX8-NEXT: s_lshl_b32 s19, s7, 4 -; GFX8-NEXT: s_lshl_b32 s7, s7, 20 +; GFX8-NEXT: s_lshl_b32 s27, s3, 28 +; GFX8-NEXT: s_ashr_i64 s[16:17], s[2:3], 60 +; GFX8-NEXT: s_lshl_b32 s19, s3, 8 +; GFX8-NEXT: s_lshl_b32 s21, s3, 12 +; GFX8-NEXT: s_lshl_b32 s15, s1, 28 +; GFX8-NEXT: s_lshl_b32 s23, s3, 16 +; GFX8-NEXT: s_lshl_b32 s25, s3, 24 +; GFX8-NEXT: s_lshl_b32 s17, s3, 4 +; GFX8-NEXT: s_lshl_b32 s3, s3, 20 ; GFX8-NEXT: s_ashr_i64 s[4:5], s[0:1], 60 -; GFX8-NEXT: s_ashr_i64 s[28:29], s[28:29], 60 -; GFX8-NEXT: s_lshl_b32 s9, s1, 8 -; GFX8-NEXT: s_lshl_b32 s11, s1, 12 -; GFX8-NEXT: s_lshl_b32 s13, s1, 16 -; GFX8-NEXT: s_lshl_b32 s15, s1, 24 +; GFX8-NEXT: s_ashr_i64 s[26:27], s[26:27], 60 +; GFX8-NEXT: s_lshl_b32 s7, s1, 8 +; GFX8-NEXT: s_lshl_b32 s9, s1, 12 +; GFX8-NEXT: s_lshl_b32 s11, s1, 16 +; GFX8-NEXT: s_lshl_b32 s13, s1, 24 ; GFX8-NEXT: s_lshl_b32 s5, s1, 4 ; GFX8-NEXT: s_lshl_b32 s1, s1, 20 -; GFX8-NEXT: s_ashr_i64 s[26:27], s[26:27], 60 -; GFX8-NEXT: s_ashr_i64 s[6:7], s[6:7], 60 -; GFX8-NEXT: s_ashr_i64 s[16:17], s[16:17], 60 -; GFX8-NEXT: v_mov_b32_e32 v4, s28 +; GFX8-NEXT: s_ashr_i64 s[24:25], s[24:25], 60 +; GFX8-NEXT: s_ashr_i64 s[2:3], s[2:3], 60 ; GFX8-NEXT: s_ashr_i64 s[14:15], s[14:15], 60 +; GFX8-NEXT: v_mov_b32_e32 v4, s26 +; GFX8-NEXT: s_ashr_i64 s[12:13], s[12:13], 60 ; GFX8-NEXT: s_ashr_i64 s[0:1], s[0:1], 60 -; GFX8-NEXT: v_mov_b32_e32 v3, s6 -; GFX8-NEXT: v_mov_b32_e32 v5, s26 -; GFX8-NEXT: s_ashr_i64 s[24:25], s[24:25], 60 -; GFX8-NEXT: v_mul_i32_i24_e32 v3, s0, v3 +; GFX8-NEXT: v_mov_b32_e32 v3, s2 +; GFX8-NEXT: v_mov_b32_e32 v5, s24 ; GFX8-NEXT: s_ashr_i64 s[22:23], s[22:23], 60 -; GFX8-NEXT: s_ashr_i64 s[12:13], s[12:13], 60 -; GFX8-NEXT: v_mov_b32_e32 v6, s24 +; GFX8-NEXT: v_mul_i32_i24_e32 v3, s0, v3 ; GFX8-NEXT: s_ashr_i64 s[20:21], s[20:21], 60 ; GFX8-NEXT: s_ashr_i64 s[10:11], s[10:11], 60 -; GFX8-NEXT: v_mov_b32_e32 v7, s22 -; GFX8-NEXT: s_ashr_i64 s[32:33], s[18:19], 60 +; GFX8-NEXT: v_mov_b32_e32 v6, s22 +; GFX8-NEXT: s_ashr_i64 s[18:19], s[18:19], 
60 ; GFX8-NEXT: s_ashr_i64 s[8:9], s[8:9], 60 -; GFX8-NEXT: v_mov_b32_e32 v8, s20 -; GFX8-NEXT: s_ashr_i64 s[30:31], s[4:5], 60 -; GFX8-NEXT: v_mov_b32_e32 v9, s32 +; GFX8-NEXT: v_mov_b32_e32 v7, s20 +; GFX8-NEXT: s_ashr_i64 s[30:31], s[16:17], 60 +; GFX8-NEXT: s_ashr_i64 s[6:7], s[6:7], 60 +; GFX8-NEXT: v_mov_b32_e32 v8, s18 +; GFX8-NEXT: s_ashr_i64 s[28:29], s[4:5], 60 +; GFX8-NEXT: v_mov_b32_e32 v9, s30 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mad_i32_i24 v2, s16, v4, v2 -; GFX8-NEXT: v_mad_i32_i24 v2, s14, v5, v2 +; GFX8-NEXT: v_mad_i32_i24 v2, s14, v4, v2 +; GFX8-NEXT: v_mad_i32_i24 v2, s12, v5, v2 ; GFX8-NEXT: v_add_u32_sdwa v2, vcc, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 -; GFX8-NEXT: v_mad_i32_i24 v2, s12, v6, v2 -; GFX8-NEXT: v_mad_i32_i24 v2, s10, v7, v2 -; GFX8-NEXT: v_mad_i32_i24 v2, s8, v8, v2 -; GFX8-NEXT: v_mad_i32_i24 v2, s30, v9, v2 -; GFX8-NEXT: v_mov_b32_e32 v3, s18 +; GFX8-NEXT: v_mad_i32_i24 v2, s10, v6, v2 +; GFX8-NEXT: v_mad_i32_i24 v2, s8, v7, v2 +; GFX8-NEXT: v_mad_i32_i24 v2, s6, v8, v2 +; GFX8-NEXT: v_mad_i32_i24 v2, s28, v9, v2 +; GFX8-NEXT: v_mov_b32_e32 v3, s16 ; GFX8-NEXT: v_mad_i32_i24 v2, s4, v3, v2 ; GFX8-NEXT: flat_store_short v[0:1], v2 ; GFX8-NEXT: s_endpgm @@ -1704,35 +1707,35 @@ ; GFX9-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX9-NEXT: s_load_dword s6, s[6:7], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_bfe_u32 s4, s2, 0x40018 -; GFX9-NEXT: s_lshr_b32 s5, s2, 28 -; GFX9-NEXT: s_bfe_u32 s8, s2, 0x40010 -; GFX9-NEXT: s_bfe_u32 s9, s2, 0x40014 -; GFX9-NEXT: s_bfe_u32 s10, s2, 0x40008 -; GFX9-NEXT: s_bfe_u32 s11, s2, 0x4000c -; GFX9-NEXT: s_and_b32 s12, s2, 15 +; GFX9-NEXT: s_bfe_u32 s3, s2, 0x40018 +; GFX9-NEXT: s_lshr_b32 s4, s2, 28 +; GFX9-NEXT: s_bfe_u32 s5, s2, 0x40010 +; GFX9-NEXT: s_bfe_u32 s8, s2, 0x40014 +; GFX9-NEXT: s_bfe_u32 s9, s2, 0x40008 +; GFX9-NEXT: s_bfe_u32 s10, s2, 0x4000c +; GFX9-NEXT: s_and_b32 s11, s2, 15 ; GFX9-NEXT: s_bfe_u32 s2, s2, 0x40004 -; GFX9-NEXT: s_pack_ll_b32_b16 s2, s12, s2 +; GFX9-NEXT: s_pack_ll_b32_b16 s2, s11, s2 ; GFX9-NEXT: v_pk_lshlrev_b16 v0, 12, s2 op_sel_hi:[0,1] -; GFX9-NEXT: s_pack_ll_b32_b16 s2, s10, s11 +; GFX9-NEXT: s_pack_ll_b32_b16 s2, s9, s10 ; GFX9-NEXT: v_pk_lshlrev_b16 v1, 12, s2 op_sel_hi:[0,1] -; GFX9-NEXT: s_pack_ll_b32_b16 s2, s8, s9 +; GFX9-NEXT: s_pack_ll_b32_b16 s2, s5, s8 ; GFX9-NEXT: v_pk_lshlrev_b16 v2, 12, s2 op_sel_hi:[0,1] -; GFX9-NEXT: s_pack_ll_b32_b16 s2, s4, s5 +; GFX9-NEXT: s_pack_ll_b32_b16 s2, s3, s4 ; GFX9-NEXT: s_bfe_u32 s7, s6, 0x40018 -; GFX9-NEXT: s_lshr_b32 s13, s6, 28 -; GFX9-NEXT: s_bfe_u32 s14, s6, 0x40010 -; GFX9-NEXT: s_bfe_u32 s15, s6, 0x40014 -; GFX9-NEXT: s_bfe_u32 s16, s6, 0x40008 -; GFX9-NEXT: s_bfe_u32 s17, s6, 0x4000c -; GFX9-NEXT: s_and_b32 s18, s6, 15 +; GFX9-NEXT: s_lshr_b32 s12, s6, 28 +; GFX9-NEXT: s_bfe_u32 s13, s6, 0x40010 +; GFX9-NEXT: s_bfe_u32 s14, s6, 0x40014 +; GFX9-NEXT: s_bfe_u32 s15, s6, 0x40008 +; GFX9-NEXT: s_bfe_u32 s16, s6, 0x4000c +; GFX9-NEXT: s_and_b32 s17, s6, 15 ; GFX9-NEXT: s_bfe_u32 s6, s6, 0x40004 ; GFX9-NEXT: v_pk_lshlrev_b16 v3, 12, s2 op_sel_hi:[0,1] -; GFX9-NEXT: s_pack_ll_b32_b16 s2, s18, s6 +; GFX9-NEXT: s_pack_ll_b32_b16 s2, s17, s6 ; GFX9-NEXT: v_pk_lshlrev_b16 v4, 12, s2 op_sel_hi:[0,1] -; GFX9-NEXT: s_pack_ll_b32_b16 s2, s16, s17 +; GFX9-NEXT: s_pack_ll_b32_b16 s2, s15, s16 ; GFX9-NEXT: v_pk_lshlrev_b16 v5, 12, s2 op_sel_hi:[0,1] -; GFX9-NEXT: s_pack_ll_b32_b16 s2, s14, s15 +; GFX9-NEXT: s_pack_ll_b32_b16 s2, s13, s14 ; GFX9-NEXT: v_pk_ashrrev_i16 v0, 12, v0 op_sel_hi:[0,1] ; 
GFX9-NEXT: v_pk_ashrrev_i16 v4, 12, v4 op_sel_hi:[0,1] ; GFX9-NEXT: v_pk_ashrrev_i16 v1, 12, v1 op_sel_hi:[0,1] @@ -1746,7 +1749,7 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_pk_mul_lo_u16 v2, v2, v6 ; GFX9-NEXT: global_load_ushort v6, v[0:1], off -; GFX9-NEXT: s_pack_ll_b32_b16 s2, s7, s13 +; GFX9-NEXT: s_pack_ll_b32_b16 s2, s7, s12 ; GFX9-NEXT: v_pk_lshlrev_b16 v7, 12, s2 op_sel_hi:[0,1] ; GFX9-NEXT: v_pk_ashrrev_i16 v3, 12, v3 op_sel_hi:[0,1] ; GFX9-NEXT: v_pk_ashrrev_i16 v7, 12, v7 op_sel_hi:[0,1] @@ -1771,35 +1774,35 @@ ; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX9-DL-NEXT: s_load_dword s6, s[6:7], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_bfe_u32 s4, s2, 0x40018 -; GFX9-DL-NEXT: s_lshr_b32 s5, s2, 28 -; GFX9-DL-NEXT: s_bfe_u32 s8, s2, 0x40010 -; GFX9-DL-NEXT: s_bfe_u32 s9, s2, 0x40014 -; GFX9-DL-NEXT: s_bfe_u32 s10, s2, 0x40008 -; GFX9-DL-NEXT: s_bfe_u32 s11, s2, 0x4000c -; GFX9-DL-NEXT: s_and_b32 s12, s2, 15 +; GFX9-DL-NEXT: s_bfe_u32 s3, s2, 0x40018 +; GFX9-DL-NEXT: s_lshr_b32 s4, s2, 28 +; GFX9-DL-NEXT: s_bfe_u32 s5, s2, 0x40010 +; GFX9-DL-NEXT: s_bfe_u32 s8, s2, 0x40014 +; GFX9-DL-NEXT: s_bfe_u32 s9, s2, 0x40008 +; GFX9-DL-NEXT: s_bfe_u32 s10, s2, 0x4000c +; GFX9-DL-NEXT: s_and_b32 s11, s2, 15 ; GFX9-DL-NEXT: s_bfe_u32 s2, s2, 0x40004 -; GFX9-DL-NEXT: s_pack_ll_b32_b16 s2, s12, s2 +; GFX9-DL-NEXT: s_pack_ll_b32_b16 s2, s11, s2 ; GFX9-DL-NEXT: v_pk_lshlrev_b16 v0, 12, s2 op_sel_hi:[0,1] -; GFX9-DL-NEXT: s_pack_ll_b32_b16 s2, s10, s11 +; GFX9-DL-NEXT: s_pack_ll_b32_b16 s2, s9, s10 ; GFX9-DL-NEXT: v_pk_lshlrev_b16 v1, 12, s2 op_sel_hi:[0,1] -; GFX9-DL-NEXT: s_pack_ll_b32_b16 s2, s8, s9 +; GFX9-DL-NEXT: s_pack_ll_b32_b16 s2, s5, s8 ; GFX9-DL-NEXT: v_pk_lshlrev_b16 v2, 12, s2 op_sel_hi:[0,1] -; GFX9-DL-NEXT: s_pack_ll_b32_b16 s2, s4, s5 +; GFX9-DL-NEXT: s_pack_ll_b32_b16 s2, s3, s4 ; GFX9-DL-NEXT: s_bfe_u32 s7, s6, 0x40018 -; GFX9-DL-NEXT: s_lshr_b32 s13, s6, 28 -; GFX9-DL-NEXT: s_bfe_u32 s14, s6, 0x40010 -; GFX9-DL-NEXT: s_bfe_u32 s15, s6, 0x40014 -; GFX9-DL-NEXT: s_bfe_u32 s16, s6, 0x40008 -; GFX9-DL-NEXT: s_bfe_u32 s17, s6, 0x4000c -; GFX9-DL-NEXT: s_and_b32 s18, s6, 15 +; GFX9-DL-NEXT: s_lshr_b32 s12, s6, 28 +; GFX9-DL-NEXT: s_bfe_u32 s13, s6, 0x40010 +; GFX9-DL-NEXT: s_bfe_u32 s14, s6, 0x40014 +; GFX9-DL-NEXT: s_bfe_u32 s15, s6, 0x40008 +; GFX9-DL-NEXT: s_bfe_u32 s16, s6, 0x4000c +; GFX9-DL-NEXT: s_and_b32 s17, s6, 15 ; GFX9-DL-NEXT: s_bfe_u32 s6, s6, 0x40004 ; GFX9-DL-NEXT: v_pk_lshlrev_b16 v3, 12, s2 op_sel_hi:[0,1] -; GFX9-DL-NEXT: s_pack_ll_b32_b16 s2, s18, s6 +; GFX9-DL-NEXT: s_pack_ll_b32_b16 s2, s17, s6 ; GFX9-DL-NEXT: v_pk_lshlrev_b16 v4, 12, s2 op_sel_hi:[0,1] -; GFX9-DL-NEXT: s_pack_ll_b32_b16 s2, s16, s17 +; GFX9-DL-NEXT: s_pack_ll_b32_b16 s2, s15, s16 ; GFX9-DL-NEXT: v_pk_lshlrev_b16 v5, 12, s2 op_sel_hi:[0,1] -; GFX9-DL-NEXT: s_pack_ll_b32_b16 s2, s14, s15 +; GFX9-DL-NEXT: s_pack_ll_b32_b16 s2, s13, s14 ; GFX9-DL-NEXT: v_pk_ashrrev_i16 v0, 12, v0 op_sel_hi:[0,1] ; GFX9-DL-NEXT: v_pk_ashrrev_i16 v4, 12, v4 op_sel_hi:[0,1] ; GFX9-DL-NEXT: v_pk_ashrrev_i16 v1, 12, v1 op_sel_hi:[0,1] @@ -1813,7 +1816,7 @@ ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-DL-NEXT: v_pk_mul_lo_u16 v2, v2, v6 ; GFX9-DL-NEXT: global_load_ushort v6, v[0:1], off -; GFX9-DL-NEXT: s_pack_ll_b32_b16 s2, s7, s13 +; GFX9-DL-NEXT: s_pack_ll_b32_b16 s2, s7, s12 ; GFX9-DL-NEXT: v_pk_lshlrev_b16 v7, 12, s2 op_sel_hi:[0,1] ; GFX9-DL-NEXT: v_pk_ashrrev_i16 v3, 12, v3 op_sel_hi:[0,1] ; GFX9-DL-NEXT: v_pk_ashrrev_i16 v7, 12, v7 op_sel_hi:[0,1] @@ -1832,53 +1835,53 @@ ; ; 
GFX10-DL-LABEL: idot8_acc16_vecMul: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-DL-NEXT: global_load_ushort v2, v[0:1], off ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_and_b32 s5, s0, 15 -; GFX10-DL-NEXT: s_bfe_u32 s6, s0, 0x40004 -; GFX10-DL-NEXT: s_and_b32 s7, s1, 15 -; GFX10-DL-NEXT: s_bfe_u32 s8, s1, 0x40004 +; GFX10-DL-NEXT: s_and_b32 s4, s0, 15 +; GFX10-DL-NEXT: s_bfe_u32 s5, s0, 0x40004 +; GFX10-DL-NEXT: s_and_b32 s6, s1, 15 +; GFX10-DL-NEXT: s_bfe_u32 s7, s1, 0x40004 ; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40018 -; GFX10-DL-NEXT: s_lshr_b32 s4, s0, 28 -; GFX10-DL-NEXT: s_pack_ll_b32_b16 s5, s5, s6 -; GFX10-DL-NEXT: s_bfe_u32 s9, s0, 0x40010 -; GFX10-DL-NEXT: s_pack_ll_b32_b16 s7, s7, s8 -; GFX10-DL-NEXT: s_bfe_u32 s10, s0, 0x40014 -; GFX10-DL-NEXT: s_bfe_u32 s6, s0, 0x40008 -; GFX10-DL-NEXT: v_pk_lshlrev_b16 v3, 12, s5 op_sel_hi:[0,1] +; GFX10-DL-NEXT: s_lshr_b32 s3, s0, 28 +; GFX10-DL-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX10-DL-NEXT: s_bfe_u32 s8, s0, 0x40010 +; GFX10-DL-NEXT: s_pack_ll_b32_b16 s6, s6, s7 +; GFX10-DL-NEXT: s_bfe_u32 s9, s0, 0x40014 +; GFX10-DL-NEXT: s_bfe_u32 s5, s0, 0x40008 +; GFX10-DL-NEXT: v_pk_lshlrev_b16 v3, 12, s4 op_sel_hi:[0,1] ; GFX10-DL-NEXT: s_bfe_u32 s0, s0, 0x4000c -; GFX10-DL-NEXT: v_pk_lshlrev_b16 v4, 12, s7 op_sel_hi:[0,1] -; GFX10-DL-NEXT: s_bfe_u32 s8, s1, 0x40008 -; GFX10-DL-NEXT: s_bfe_u32 s5, s1, 0x4000c +; GFX10-DL-NEXT: v_pk_lshlrev_b16 v4, 12, s6 op_sel_hi:[0,1] +; GFX10-DL-NEXT: s_bfe_u32 s7, s1, 0x40008 +; GFX10-DL-NEXT: s_bfe_u32 s4, s1, 0x4000c ; GFX10-DL-NEXT: v_pk_ashrrev_i16 v3, 12, v3 op_sel_hi:[0,1] -; GFX10-DL-NEXT: s_pack_ll_b32_b16 s0, s6, s0 +; GFX10-DL-NEXT: s_pack_ll_b32_b16 s0, s5, s0 ; GFX10-DL-NEXT: v_pk_ashrrev_i16 v4, 12, v4 op_sel_hi:[0,1] -; GFX10-DL-NEXT: s_bfe_u32 s6, s1, 0x40010 -; GFX10-DL-NEXT: s_pack_ll_b32_b16 s5, s8, s5 -; GFX10-DL-NEXT: s_bfe_u32 s7, s1, 0x40018 +; GFX10-DL-NEXT: s_bfe_u32 s5, s1, 0x40010 +; GFX10-DL-NEXT: s_pack_ll_b32_b16 s4, s7, s4 +; GFX10-DL-NEXT: s_bfe_u32 s6, s1, 0x40018 ; GFX10-DL-NEXT: v_pk_lshlrev_b16 v5, 12, s0 op_sel_hi:[0,1] ; GFX10-DL-NEXT: s_bfe_u32 s0, s1, 0x40014 ; GFX10-DL-NEXT: v_pk_mul_lo_u16 v3, v3, v4 -; GFX10-DL-NEXT: v_pk_lshlrev_b16 v6, 12, s5 op_sel_hi:[0,1] -; GFX10-DL-NEXT: s_pack_ll_b32_b16 s5, s9, s10 +; GFX10-DL-NEXT: v_pk_lshlrev_b16 v6, 12, s4 op_sel_hi:[0,1] +; GFX10-DL-NEXT: s_pack_ll_b32_b16 s4, s8, s9 ; GFX10-DL-NEXT: v_pk_ashrrev_i16 v4, 12, v5 op_sel_hi:[0,1] -; GFX10-DL-NEXT: s_pack_ll_b32_b16 s0, s6, s0 +; GFX10-DL-NEXT: s_pack_ll_b32_b16 s0, s5, s0 ; GFX10-DL-NEXT: s_lshr_b32 s1, s1, 28 ; GFX10-DL-NEXT: v_pk_ashrrev_i16 v5, 12, v6 op_sel_hi:[0,1] -; GFX10-DL-NEXT: v_pk_lshlrev_b16 v6, 12, s5 op_sel_hi:[0,1] +; GFX10-DL-NEXT: v_pk_lshlrev_b16 v6, 12, s4 op_sel_hi:[0,1] ; GFX10-DL-NEXT: v_pk_lshlrev_b16 v7, 12, s0 op_sel_hi:[0,1] -; GFX10-DL-NEXT: s_pack_ll_b32_b16 s0, s2, s4 +; 
GFX10-DL-NEXT: s_pack_ll_b32_b16 s0, s2, s3 ; GFX10-DL-NEXT: v_pk_mul_lo_u16 v4, v4, v5 -; GFX10-DL-NEXT: s_pack_ll_b32_b16 s1, s7, s1 +; GFX10-DL-NEXT: s_pack_ll_b32_b16 s1, s6, s1 ; GFX10-DL-NEXT: v_pk_ashrrev_i16 v5, 12, v7 op_sel_hi:[0,1] ; GFX10-DL-NEXT: v_pk_lshlrev_b16 v7, 12, s1 op_sel_hi:[0,1] ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) @@ -1935,65 +1938,65 @@ define amdgpu_kernel void @idot8_acc8_vecMul(<8 x i4> addrspace(1)* %src1, ; GFX7-LABEL: idot8_acc8_vecMul: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; GFX7-NEXT: s_mov_b32 s7, 0xf000 -; GFX7-NEXT: s_mov_b32 s6, -1 -; GFX7-NEXT: s_movk_i32 s0, 0xff -; GFX7-NEXT: s_mov_b32 s1, 0xffff +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: s_movk_i32 s8, 0xff +; GFX7-NEXT: s_mov_b32 s9, 0xffff ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: buffer_load_ubyte v0, off, s[4:7], 0 -; GFX7-NEXT: s_load_dword s2, s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s8, s[10:11], 0x0 +; GFX7-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 +; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_bfe_i32 s9, s2, 0x40000 -; GFX7-NEXT: s_bfe_i32 s16, s8, 0x40000 -; GFX7-NEXT: s_bfe_i32 s17, s8, 0x40004 -; GFX7-NEXT: s_bfe_i32 s18, s8, 0x40008 -; GFX7-NEXT: s_bfe_i32 s19, s8, 0x4000c -; GFX7-NEXT: s_bfe_i32 s20, s8, 0x40010 -; GFX7-NEXT: s_bfe_i32 s21, s8, 0x40014 -; GFX7-NEXT: s_bfe_i32 s22, s8, 0x40018 -; GFX7-NEXT: s_ashr_i32 s8, s8, 28 -; GFX7-NEXT: v_mov_b32_e32 v8, s16 -; GFX7-NEXT: s_bfe_i32 s10, s2, 0x40004 -; GFX7-NEXT: v_mov_b32_e32 v7, s17 -; GFX7-NEXT: s_bfe_i32 s11, s2, 0x40008 -; GFX7-NEXT: v_mov_b32_e32 v6, s18 -; GFX7-NEXT: s_bfe_i32 s12, s2, 0x4000c -; GFX7-NEXT: v_mov_b32_e32 v5, s19 -; GFX7-NEXT: s_bfe_i32 s13, s2, 0x40010 -; GFX7-NEXT: v_mov_b32_e32 v4, s20 -; GFX7-NEXT: s_bfe_i32 s14, s2, 0x40014 -; GFX7-NEXT: v_mov_b32_e32 v3, s21 -; GFX7-NEXT: s_bfe_i32 s15, s2, 0x40018 -; GFX7-NEXT: v_mov_b32_e32 v2, s22 -; GFX7-NEXT: s_ashr_i32 s2, s2, 28 -; GFX7-NEXT: v_mov_b32_e32 v1, s8 -; GFX7-NEXT: v_mul_i32_i24_e32 v1, s2, v1 -; GFX7-NEXT: v_mul_i32_i24_e32 v2, s15, v2 -; GFX7-NEXT: v_mul_i32_i24_e32 v3, s14, v3 -; GFX7-NEXT: v_mul_i32_i24_e32 v9, s13, v4 -; GFX7-NEXT: v_mul_i32_i24_e32 v5, s12, v5 -; GFX7-NEXT: v_mul_i32_i24_e32 v6, s11, v6 -; GFX7-NEXT: v_mul_i32_i24_e32 v7, s10, v7 -; GFX7-NEXT: v_mul_i32_i24_e32 v8, s9, v8 +; GFX7-NEXT: s_bfe_i32 s6, s4, 0x40000 +; GFX7-NEXT: s_bfe_i32 s15, s5, 0x40000 +; GFX7-NEXT: s_bfe_i32 s16, s5, 0x40004 +; GFX7-NEXT: s_bfe_i32 s17, s5, 0x40008 +; GFX7-NEXT: s_bfe_i32 s18, s5, 0x4000c +; GFX7-NEXT: s_bfe_i32 s19, s5, 0x40010 +; GFX7-NEXT: s_bfe_i32 s20, s5, 0x40014 +; GFX7-NEXT: s_bfe_i32 s21, s5, 0x40018 +; GFX7-NEXT: s_ashr_i32 s5, s5, 28 +; GFX7-NEXT: v_mov_b32_e32 v8, s15 +; GFX7-NEXT: s_bfe_i32 s7, s4, 0x40004 +; GFX7-NEXT: v_mov_b32_e32 v7, s16 +; GFX7-NEXT: s_bfe_i32 s10, s4, 0x40008 +; GFX7-NEXT: v_mov_b32_e32 v6, s17 +; GFX7-NEXT: s_bfe_i32 s11, s4, 0x4000c +; GFX7-NEXT: v_mov_b32_e32 v5, s18 +; GFX7-NEXT: s_bfe_i32 s12, s4, 0x40010 +; GFX7-NEXT: v_mov_b32_e32 v4, s19 +; GFX7-NEXT: s_bfe_i32 s13, s4, 0x40014 +; GFX7-NEXT: v_mov_b32_e32 v3, s20 +; GFX7-NEXT: s_bfe_i32 s14, s4, 0x40018 +; GFX7-NEXT: v_mov_b32_e32 v2, s21 +; GFX7-NEXT: s_ashr_i32 s4, s4, 28 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mul_i32_i24_e32 
v1, s4, v1 +; GFX7-NEXT: v_mul_i32_i24_e32 v2, s14, v2 +; GFX7-NEXT: v_mul_i32_i24_e32 v3, s13, v3 +; GFX7-NEXT: v_mul_i32_i24_e32 v9, s12, v4 +; GFX7-NEXT: v_mul_i32_i24_e32 v5, s11, v5 +; GFX7-NEXT: v_mul_i32_i24_e32 v6, s10, v6 +; GFX7-NEXT: v_mul_i32_i24_e32 v7, s7, v7 +; GFX7-NEXT: v_mul_i32_i24_e32 v8, s6, v8 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; GFX7-NEXT: v_and_b32_e32 v2, s0, v2 +; GFX7-NEXT: v_and_b32_e32 v2, s8, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GFX7-NEXT: v_and_b32_e32 v9, s0, v9 +; GFX7-NEXT: v_and_b32_e32 v9, s8, v9 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 8, v5 -; GFX7-NEXT: v_and_b32_e32 v6, s0, v6 +; GFX7-NEXT: v_and_b32_e32 v6, s8, v6 ; GFX7-NEXT: v_lshlrev_b32_e32 v7, 8, v7 -; GFX7-NEXT: v_and_b32_e32 v8, s0, v8 +; GFX7-NEXT: v_and_b32_e32 v8, s8, v8 ; GFX7-NEXT: v_or_b32_e32 v1, v2, v1 ; GFX7-NEXT: v_or_b32_e32 v2, v9, v3 ; GFX7-NEXT: v_or_b32_e32 v3, v6, v5 ; GFX7-NEXT: v_or_b32_e32 v5, v8, v7 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_and_b32_e32 v2, s1, v2 +; GFX7-NEXT: v_and_b32_e32 v2, s9, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_and_b32_e32 v5, s1, v5 +; GFX7-NEXT: v_and_b32_e32 v5, s9, v5 ; GFX7-NEXT: v_or_b32_e32 v1, v2, v1 ; GFX7-NEXT: v_or_b32_e32 v2, v5, v3 ; GFX7-NEXT: v_alignbit_b32 v3, v1, v2, 8 @@ -2007,83 +2010,83 @@ ; GFX7-NEXT: v_add_i32_e32 v0, vcc, v3, v0 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, v5, v0 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, v6, v0 -; GFX7-NEXT: v_mad_i32_i24 v0, s13, v4, v0 +; GFX7-NEXT: v_mad_i32_i24 v0, s12, v4, v0 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v7 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v8 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; GFX7-NEXT: buffer_store_byte v0, off, s[4:7], 0 +; GFX7-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: idot8_acc8_vecMul: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX8-NEXT: s_mov_b32 s2, 0xffff +; GFX8-NEXT: s_mov_b32 s32, 0xffff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_load_ubyte v2, v[0:1] ; GFX8-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX8-NEXT: s_load_dword s5, s[6:7], 0x0 +; GFX8-NEXT: s_load_dword s3, s[6:7], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_lshl_b32 s13, s1, 24 -; GFX8-NEXT: s_lshl_b32 s17, s1, 16 -; GFX8-NEXT: s_ashr_i64 s[22:23], s[4:5], 60 -; GFX8-NEXT: s_lshl_b32 s25, s5, 24 -; GFX8-NEXT: s_lshl_b32 s27, s5, 28 -; GFX8-NEXT: s_lshl_b32 s29, s5, 16 -; GFX8-NEXT: s_ashr_i64 s[10:11], s[0:1], 60 -; GFX8-NEXT: s_lshl_b32 s15, s1, 28 -; GFX8-NEXT: s_lshl_b32 s19, s5, 8 -; GFX8-NEXT: s_lshl_b32 s21, s5, 12 -; GFX8-NEXT: s_lshl_b32 s23, s5, 4 -; GFX8-NEXT: s_lshl_b32 s5, s5, 20 -; GFX8-NEXT: s_ashr_i64 s[12:13], s[12:13], 60 -; GFX8-NEXT: s_ashr_i64 s[16:17], s[16:17], 60 +; GFX8-NEXT: s_lshl_b32 s11, s1, 24 +; GFX8-NEXT: s_lshl_b32 s15, s1, 16 +; GFX8-NEXT: s_ashr_i64 s[20:21], s[2:3], 60 +; GFX8-NEXT: s_lshl_b32 s23, s3, 24 +; GFX8-NEXT: s_lshl_b32 s25, s3, 28 +; GFX8-NEXT: s_lshl_b32 s27, s3, 16 +; GFX8-NEXT: s_ashr_i64 s[8:9], s[0:1], 60 +; GFX8-NEXT: s_lshl_b32 s13, s1, 28 +; GFX8-NEXT: s_lshl_b32 s17, s3, 8 +; GFX8-NEXT: s_lshl_b32 s19, s3, 12 +; GFX8-NEXT: s_lshl_b32 s21, s3, 4 +; GFX8-NEXT: s_lshl_b32 s3, s3, 20 +; GFX8-NEXT: s_ashr_i64 s[10:11], s[10:11], 60 +; GFX8-NEXT: s_ashr_i64 s[14:15], s[14:15], 60 +; GFX8-NEXT: s_ashr_i64 s[22:23], s[22:23], 60 ; GFX8-NEXT: s_ashr_i64 s[24:25], s[24:25], 60 ; GFX8-NEXT: s_ashr_i64 
s[26:27], s[26:27], 60 -; GFX8-NEXT: s_ashr_i64 s[28:29], s[28:29], 60 -; GFX8-NEXT: s_lshl_b32 s7, s1, 8 -; GFX8-NEXT: s_lshl_b32 s9, s1, 12 -; GFX8-NEXT: s_lshl_b32 s11, s1, 4 +; GFX8-NEXT: s_lshl_b32 s5, s1, 8 +; GFX8-NEXT: s_lshl_b32 s7, s1, 12 +; GFX8-NEXT: s_lshl_b32 s9, s1, 4 ; GFX8-NEXT: s_lshl_b32 s1, s1, 20 -; GFX8-NEXT: s_ashr_i64 s[4:5], s[4:5], 60 -; GFX8-NEXT: s_ashr_i64 s[14:15], s[14:15], 60 -; GFX8-NEXT: v_mov_b32_e32 v6, s28 -; GFX8-NEXT: v_mov_b32_e32 v7, s16 -; GFX8-NEXT: v_mov_b32_e32 v8, s26 -; GFX8-NEXT: v_mov_b32_e32 v9, s24 -; GFX8-NEXT: v_mov_b32_e32 v10, s12 +; GFX8-NEXT: s_ashr_i64 s[2:3], s[2:3], 60 +; GFX8-NEXT: s_ashr_i64 s[12:13], s[12:13], 60 +; GFX8-NEXT: v_mov_b32_e32 v6, s26 +; GFX8-NEXT: v_mov_b32_e32 v7, s14 +; GFX8-NEXT: v_mov_b32_e32 v8, s24 +; GFX8-NEXT: v_mov_b32_e32 v9, s22 +; GFX8-NEXT: v_mov_b32_e32 v10, s10 ; GFX8-NEXT: v_mul_i32_i24_sdwa v6, v7, v6 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_mul_i32_i24_e32 v7, s14, v8 +; GFX8-NEXT: v_mul_i32_i24_e32 v7, s12, v8 ; GFX8-NEXT: v_mul_i32_i24_sdwa v8, v10, v9 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-NEXT: s_ashr_i64 s[0:1], s[0:1], 60 -; GFX8-NEXT: v_mov_b32_e32 v5, s4 +; GFX8-NEXT: v_mov_b32_e32 v5, s2 ; GFX8-NEXT: v_mul_i32_i24_e32 v5, s0, v5 ; GFX8-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: s_ashr_i64 s[6:7], s[6:7], 60 -; GFX8-NEXT: s_ashr_i64 s[18:19], s[18:19], 60 +; GFX8-NEXT: s_ashr_i64 s[4:5], s[4:5], 60 +; GFX8-NEXT: s_ashr_i64 s[16:17], s[16:17], 60 ; GFX8-NEXT: v_or_b32_sdwa v5, v5, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_e32 v6, s2, v7 -; GFX8-NEXT: s_ashr_i64 s[20:21], s[20:21], 60 -; GFX8-NEXT: v_mov_b32_e32 v3, s22 -; GFX8-NEXT: v_mov_b32_e32 v4, s10 -; GFX8-NEXT: s_ashr_i64 s[32:33], s[22:23], 60 +; GFX8-NEXT: v_and_b32_e32 v6, s32, v7 +; GFX8-NEXT: s_ashr_i64 s[18:19], s[18:19], 60 +; GFX8-NEXT: v_mov_b32_e32 v3, s20 +; GFX8-NEXT: v_mov_b32_e32 v4, s8 +; GFX8-NEXT: s_ashr_i64 s[30:31], s[20:21], 60 ; GFX8-NEXT: v_mul_i32_i24_sdwa v3, v4, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-NEXT: v_or_b32_e32 v5, v6, v5 -; GFX8-NEXT: s_ashr_i64 s[8:9], s[8:9], 60 -; GFX8-NEXT: v_mov_b32_e32 v4, s20 -; GFX8-NEXT: v_mov_b32_e32 v12, s18 -; GFX8-NEXT: v_mov_b32_e32 v13, s6 -; GFX8-NEXT: s_ashr_i64 s[30:31], s[10:11], 60 -; GFX8-NEXT: v_mov_b32_e32 v11, s32 -; GFX8-NEXT: v_mul_i32_i24_e32 v4, s8, v4 +; GFX8-NEXT: s_ashr_i64 s[6:7], s[6:7], 60 +; GFX8-NEXT: v_mov_b32_e32 v4, s18 +; GFX8-NEXT: v_mov_b32_e32 v12, s16 +; GFX8-NEXT: v_mov_b32_e32 v13, s4 +; GFX8-NEXT: s_ashr_i64 s[28:29], s[8:9], 60 +; GFX8-NEXT: v_mov_b32_e32 v11, s30 +; GFX8-NEXT: v_mul_i32_i24_e32 v4, s6, v4 ; GFX8-NEXT: v_mul_i32_i24_sdwa v10, v13, v12 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-NEXT: v_lshrrev_b32_e32 v7, 8, v5 ; GFX8-NEXT: v_or_b32_sdwa v4, v4, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_mul_i32_i24_e32 v9, s30, v11 +; GFX8-NEXT: v_mul_i32_i24_e32 v9, s28, v11 ; GFX8-NEXT: v_or_b32_sdwa v3, v9, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_e32 v4, s2, v4 +; GFX8-NEXT: v_and_b32_e32 v4, s32, v4 ; GFX8-NEXT: v_or_b32_e32 v3, v4, v3 ; GFX8-NEXT: v_lshrrev_b32_e32 v8, 8, v3 ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -2110,20 +2113,20 @@ ; GFX9-NEXT: s_load_dword s0, s[4:5], 
0x0 ; GFX9-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshr_b32 s8, s0, 4 -; GFX9-NEXT: s_lshr_b32 s15, s1, 4 +; GFX9-NEXT: s_lshr_b32 s7, s0, 4 +; GFX9-NEXT: s_lshr_b32 s14, s1, 4 ; GFX9-NEXT: v_lshlrev_b16_e64 v3, 12, s0 ; GFX9-NEXT: v_lshlrev_b16_e64 v4, 12, s1 -; GFX9-NEXT: v_lshlrev_b16_e64 v7, 12, s8 -; GFX9-NEXT: v_lshlrev_b16_e64 v14, 12, s15 -; GFX9-NEXT: s_lshr_b32 s9, s0, 12 -; GFX9-NEXT: s_lshr_b32 s10, s0, 8 -; GFX9-NEXT: s_lshr_b32 s16, s1, 12 -; GFX9-NEXT: s_lshr_b32 s17, s1, 8 -; GFX9-NEXT: v_lshlrev_b16_e64 v5, 12, s10 -; GFX9-NEXT: v_lshlrev_b16_e64 v6, 12, s9 -; GFX9-NEXT: v_lshlrev_b16_e64 v12, 12, s17 -; GFX9-NEXT: v_lshlrev_b16_e64 v13, 12, s16 +; GFX9-NEXT: v_lshlrev_b16_e64 v7, 12, s7 +; GFX9-NEXT: v_lshlrev_b16_e64 v14, 12, s14 +; GFX9-NEXT: s_lshr_b32 s8, s0, 12 +; GFX9-NEXT: s_lshr_b32 s9, s0, 8 +; GFX9-NEXT: s_lshr_b32 s15, s1, 12 +; GFX9-NEXT: s_lshr_b32 s16, s1, 8 +; GFX9-NEXT: v_lshlrev_b16_e64 v5, 12, s9 +; GFX9-NEXT: v_lshlrev_b16_e64 v6, 12, s8 +; GFX9-NEXT: v_lshlrev_b16_e64 v12, 12, s16 +; GFX9-NEXT: v_lshlrev_b16_e64 v13, 12, s15 ; GFX9-NEXT: v_ashrrev_i16_e32 v3, 12, v3 ; GFX9-NEXT: v_ashrrev_i16_e32 v4, 12, v4 ; GFX9-NEXT: v_ashrrev_i16_e32 v7, 12, v7 @@ -2135,26 +2138,26 @@ ; GFX9-NEXT: v_mul_lo_u16_e32 v3, v3, v4 ; GFX9-NEXT: v_mul_lo_u16_sdwa v7, v7, v14 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v3, v3, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_lshr_b32 s4, s0, 20 -; GFX9-NEXT: s_lshr_b32 s5, s0, 16 -; GFX9-NEXT: s_lshr_b32 s11, s1, 20 -; GFX9-NEXT: s_lshr_b32 s12, s1, 16 +; GFX9-NEXT: s_lshr_b32 s3, s0, 20 +; GFX9-NEXT: s_lshr_b32 s4, s0, 16 +; GFX9-NEXT: s_lshr_b32 s10, s1, 20 +; GFX9-NEXT: s_lshr_b32 s11, s1, 16 ; GFX9-NEXT: v_mul_lo_u16_sdwa v6, v6, v13 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_mul_lo_u16_e32 v5, v5, v12 -; GFX9-NEXT: v_lshlrev_b16_e64 v10, 12, s5 -; GFX9-NEXT: v_lshlrev_b16_e64 v11, 12, s4 -; GFX9-NEXT: v_lshlrev_b16_e64 v17, 12, s12 -; GFX9-NEXT: v_lshlrev_b16_e64 v18, 12, s11 -; GFX9-NEXT: s_lshr_b32 s6, s0, 28 -; GFX9-NEXT: s_lshr_b32 s7, s0, 24 -; GFX9-NEXT: s_lshr_b32 s13, s1, 28 -; GFX9-NEXT: s_lshr_b32 s14, s1, 24 +; GFX9-NEXT: v_lshlrev_b16_e64 v10, 12, s4 +; GFX9-NEXT: v_lshlrev_b16_e64 v11, 12, s3 +; GFX9-NEXT: v_lshlrev_b16_e64 v17, 12, s11 +; GFX9-NEXT: v_lshlrev_b16_e64 v18, 12, s10 +; GFX9-NEXT: s_lshr_b32 s5, s0, 28 +; GFX9-NEXT: s_lshr_b32 s6, s0, 24 +; GFX9-NEXT: s_lshr_b32 s12, s1, 28 +; GFX9-NEXT: s_lshr_b32 s13, s1, 24 ; GFX9-NEXT: v_and_b32_e32 v3, s2, v3 ; GFX9-NEXT: v_or_b32_sdwa v5, v5, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b16_e64 v8, 12, s7 -; GFX9-NEXT: v_lshlrev_b16_e64 v9, 12, s6 -; GFX9-NEXT: v_lshlrev_b16_e64 v15, 12, s14 -; GFX9-NEXT: v_lshlrev_b16_e64 v16, 12, s13 +; GFX9-NEXT: v_lshlrev_b16_e64 v8, 12, s6 +; GFX9-NEXT: v_lshlrev_b16_e64 v9, 12, s5 +; GFX9-NEXT: v_lshlrev_b16_e64 v15, 12, s13 +; GFX9-NEXT: v_lshlrev_b16_e64 v16, 12, s12 ; GFX9-NEXT: v_or_b32_e32 v5, v3, v5 ; GFX9-NEXT: v_ashrrev_i16_e32 v10, 12, v10 ; GFX9-NEXT: v_ashrrev_i16_e32 v17, 12, v17 @@ -2198,20 +2201,20 @@ ; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_lshr_b32 s8, s0, 4 -; GFX9-DL-NEXT: s_lshr_b32 s15, s1, 4 +; GFX9-DL-NEXT: s_lshr_b32 s7, s0, 4 +; GFX9-DL-NEXT: s_lshr_b32 s14, s1, 4 ; 
GFX9-DL-NEXT: v_lshlrev_b16_e64 v3, 12, s0 ; GFX9-DL-NEXT: v_lshlrev_b16_e64 v4, 12, s1 -; GFX9-DL-NEXT: v_lshlrev_b16_e64 v7, 12, s8 -; GFX9-DL-NEXT: v_lshlrev_b16_e64 v14, 12, s15 -; GFX9-DL-NEXT: s_lshr_b32 s9, s0, 12 -; GFX9-DL-NEXT: s_lshr_b32 s10, s0, 8 -; GFX9-DL-NEXT: s_lshr_b32 s16, s1, 12 -; GFX9-DL-NEXT: s_lshr_b32 s17, s1, 8 -; GFX9-DL-NEXT: v_lshlrev_b16_e64 v5, 12, s10 -; GFX9-DL-NEXT: v_lshlrev_b16_e64 v6, 12, s9 -; GFX9-DL-NEXT: v_lshlrev_b16_e64 v12, 12, s17 -; GFX9-DL-NEXT: v_lshlrev_b16_e64 v13, 12, s16 +; GFX9-DL-NEXT: v_lshlrev_b16_e64 v7, 12, s7 +; GFX9-DL-NEXT: v_lshlrev_b16_e64 v14, 12, s14 +; GFX9-DL-NEXT: s_lshr_b32 s8, s0, 12 +; GFX9-DL-NEXT: s_lshr_b32 s9, s0, 8 +; GFX9-DL-NEXT: s_lshr_b32 s15, s1, 12 +; GFX9-DL-NEXT: s_lshr_b32 s16, s1, 8 +; GFX9-DL-NEXT: v_lshlrev_b16_e64 v5, 12, s9 +; GFX9-DL-NEXT: v_lshlrev_b16_e64 v6, 12, s8 +; GFX9-DL-NEXT: v_lshlrev_b16_e64 v12, 12, s16 +; GFX9-DL-NEXT: v_lshlrev_b16_e64 v13, 12, s15 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v3, 12, v3 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v4, 12, v4 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v7, 12, v7 @@ -2223,26 +2226,26 @@ ; GFX9-DL-NEXT: v_mul_lo_u16_e32 v3, v3, v4 ; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v7, v7, v14 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-DL-NEXT: v_or_b32_sdwa v3, v3, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-DL-NEXT: s_lshr_b32 s4, s0, 20 -; GFX9-DL-NEXT: s_lshr_b32 s5, s0, 16 -; GFX9-DL-NEXT: s_lshr_b32 s11, s1, 20 -; GFX9-DL-NEXT: s_lshr_b32 s12, s1, 16 +; GFX9-DL-NEXT: s_lshr_b32 s3, s0, 20 +; GFX9-DL-NEXT: s_lshr_b32 s4, s0, 16 +; GFX9-DL-NEXT: s_lshr_b32 s10, s1, 20 +; GFX9-DL-NEXT: s_lshr_b32 s11, s1, 16 ; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v6, v6, v13 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-DL-NEXT: v_mul_lo_u16_e32 v5, v5, v12 -; GFX9-DL-NEXT: v_lshlrev_b16_e64 v10, 12, s5 -; GFX9-DL-NEXT: v_lshlrev_b16_e64 v11, 12, s4 -; GFX9-DL-NEXT: v_lshlrev_b16_e64 v17, 12, s12 -; GFX9-DL-NEXT: v_lshlrev_b16_e64 v18, 12, s11 -; GFX9-DL-NEXT: s_lshr_b32 s6, s0, 28 -; GFX9-DL-NEXT: s_lshr_b32 s7, s0, 24 -; GFX9-DL-NEXT: s_lshr_b32 s13, s1, 28 -; GFX9-DL-NEXT: s_lshr_b32 s14, s1, 24 +; GFX9-DL-NEXT: v_lshlrev_b16_e64 v10, 12, s4 +; GFX9-DL-NEXT: v_lshlrev_b16_e64 v11, 12, s3 +; GFX9-DL-NEXT: v_lshlrev_b16_e64 v17, 12, s11 +; GFX9-DL-NEXT: v_lshlrev_b16_e64 v18, 12, s10 +; GFX9-DL-NEXT: s_lshr_b32 s5, s0, 28 +; GFX9-DL-NEXT: s_lshr_b32 s6, s0, 24 +; GFX9-DL-NEXT: s_lshr_b32 s12, s1, 28 +; GFX9-DL-NEXT: s_lshr_b32 s13, s1, 24 ; GFX9-DL-NEXT: v_and_b32_e32 v3, s2, v3 ; GFX9-DL-NEXT: v_or_b32_sdwa v5, v5, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-DL-NEXT: v_lshlrev_b16_e64 v8, 12, s7 -; GFX9-DL-NEXT: v_lshlrev_b16_e64 v9, 12, s6 -; GFX9-DL-NEXT: v_lshlrev_b16_e64 v15, 12, s14 -; GFX9-DL-NEXT: v_lshlrev_b16_e64 v16, 12, s13 +; GFX9-DL-NEXT: v_lshlrev_b16_e64 v8, 12, s6 +; GFX9-DL-NEXT: v_lshlrev_b16_e64 v9, 12, s5 +; GFX9-DL-NEXT: v_lshlrev_b16_e64 v15, 12, s13 +; GFX9-DL-NEXT: v_lshlrev_b16_e64 v16, 12, s12 ; GFX9-DL-NEXT: v_or_b32_e32 v5, v3, v5 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v10, 12, v10 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v17, 12, v17 @@ -2276,70 +2279,70 @@ ; ; GFX10-DL-LABEL: idot8_acc8_vecMul: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; GFX10-DL-NEXT: s_mov_b32 s2, 0xffff +; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; 
GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-DL-NEXT: global_load_ubyte v2, v[0:1], off ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX10-DL-NEXT: s_mov_b32 s2, 0xffff ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_lshr_b32 s8, s0, 4 -; GFX10-DL-NEXT: s_lshr_b32 s15, s1, 4 -; GFX10-DL-NEXT: s_lshr_b32 s9, s0, 12 -; GFX10-DL-NEXT: s_lshr_b32 s16, s1, 12 +; GFX10-DL-NEXT: s_lshr_b32 s7, s0, 4 +; GFX10-DL-NEXT: s_lshr_b32 s14, s1, 4 +; GFX10-DL-NEXT: s_lshr_b32 s8, s0, 12 +; GFX10-DL-NEXT: s_lshr_b32 s15, s1, 12 ; GFX10-DL-NEXT: v_lshlrev_b16_e64 v3, 12, s0 -; GFX10-DL-NEXT: v_lshlrev_b16_e64 v7, 12, s8 -; GFX10-DL-NEXT: v_lshlrev_b16_e64 v12, 12, s15 +; GFX10-DL-NEXT: v_lshlrev_b16_e64 v7, 12, s7 +; GFX10-DL-NEXT: v_lshlrev_b16_e64 v12, 12, s14 ; GFX10-DL-NEXT: v_lshlrev_b16_e64 v4, 12, s1 -; GFX10-DL-NEXT: v_lshlrev_b16_e64 v14, 12, s16 -; GFX10-DL-NEXT: s_lshr_b32 s10, s0, 8 -; GFX10-DL-NEXT: s_lshr_b32 s17, s1, 8 -; GFX10-DL-NEXT: v_lshlrev_b16_e64 v6, 12, s9 +; GFX10-DL-NEXT: v_lshlrev_b16_e64 v14, 12, s15 +; GFX10-DL-NEXT: s_lshr_b32 s9, s0, 8 +; GFX10-DL-NEXT: s_lshr_b32 s16, s1, 8 +; GFX10-DL-NEXT: v_lshlrev_b16_e64 v6, 12, s8 ; GFX10-DL-NEXT: v_ashrrev_i16_e64 v7, 12, v7 ; GFX10-DL-NEXT: v_ashrrev_i16_e64 v12, 12, v12 -; GFX10-DL-NEXT: v_lshlrev_b16_e64 v5, 12, s10 +; GFX10-DL-NEXT: v_lshlrev_b16_e64 v5, 12, s9 ; GFX10-DL-NEXT: v_ashrrev_i16_e64 v3, 12, v3 ; GFX10-DL-NEXT: v_ashrrev_i16_e64 v4, 12, v4 -; GFX10-DL-NEXT: v_lshlrev_b16_e64 v13, 12, s17 +; GFX10-DL-NEXT: v_lshlrev_b16_e64 v13, 12, s16 ; GFX10-DL-NEXT: v_mul_lo_u16_e64 v7, v7, v12 ; GFX10-DL-NEXT: v_ashrrev_i16_e64 v19, 12, v6 ; GFX10-DL-NEXT: v_ashrrev_i16_e64 v14, 12, v14 -; GFX10-DL-NEXT: s_lshr_b32 s4, s0, 20 -; GFX10-DL-NEXT: s_lshr_b32 s5, s0, 16 -; GFX10-DL-NEXT: s_lshr_b32 s6, s0, 28 -; GFX10-DL-NEXT: s_lshr_b32 s7, s0, 24 +; GFX10-DL-NEXT: s_lshr_b32 s3, s0, 20 +; GFX10-DL-NEXT: s_lshr_b32 s4, s0, 16 +; GFX10-DL-NEXT: s_lshr_b32 s5, s0, 28 +; GFX10-DL-NEXT: s_lshr_b32 s6, s0, 24 ; GFX10-DL-NEXT: v_mul_lo_u16_e64 v3, v3, v4 ; GFX10-DL-NEXT: v_mul_lo_u16_e64 v4, v19, v14 ; GFX10-DL-NEXT: v_lshlrev_b16_e64 v6, 8, v7 -; GFX10-DL-NEXT: s_lshr_b32 s11, s1, 20 +; GFX10-DL-NEXT: s_lshr_b32 s10, s1, 20 ; GFX10-DL-NEXT: v_ashrrev_i16_e64 v12, 12, v13 ; GFX10-DL-NEXT: v_ashrrev_i16_e64 v5, 12, v5 -; GFX10-DL-NEXT: s_lshr_b32 s12, s1, 16 +; GFX10-DL-NEXT: s_lshr_b32 s11, s1, 16 ; GFX10-DL-NEXT: v_or_b32_sdwa v3, v3, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX10-DL-NEXT: s_lshr_b32 s13, s1, 28 -; GFX10-DL-NEXT: v_lshlrev_b16_e64 v8, 12, s7 -; GFX10-DL-NEXT: v_lshlrev_b16_e64 v9, 12, s6 -; GFX10-DL-NEXT: v_lshlrev_b16_e64 v10, 12, s5 -; GFX10-DL-NEXT: v_lshlrev_b16_e64 v11, 12, s4 -; GFX10-DL-NEXT: v_lshlrev_b16_e64 v13, 12, s11 +; GFX10-DL-NEXT: s_lshr_b32 s12, s1, 28 +; GFX10-DL-NEXT: v_lshlrev_b16_e64 v8, 12, s6 +; GFX10-DL-NEXT: v_lshlrev_b16_e64 v9, 12, s5 +; GFX10-DL-NEXT: v_lshlrev_b16_e64 v10, 12, s4 +; GFX10-DL-NEXT: v_lshlrev_b16_e64 v11, 12, s3 +; GFX10-DL-NEXT: v_lshlrev_b16_e64 v13, 12, s10 ; GFX10-DL-NEXT: v_mul_lo_u16_e64 v5, v5, v12 ; GFX10-DL-NEXT: v_lshlrev_b16_e64 
v4, 8, v4 -; GFX10-DL-NEXT: v_lshlrev_b16_e64 v7, 12, s12 -; GFX10-DL-NEXT: s_lshr_b32 s14, s1, 24 +; GFX10-DL-NEXT: v_lshlrev_b16_e64 v7, 12, s11 +; GFX10-DL-NEXT: s_lshr_b32 s13, s1, 24 ; GFX10-DL-NEXT: v_ashrrev_i16_e64 v6, 12, v8 ; GFX10-DL-NEXT: v_ashrrev_i16_e64 v8, 12, v9 ; GFX10-DL-NEXT: v_ashrrev_i16_e64 v9, 12, v10 ; GFX10-DL-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-DL-NEXT: v_and_b32_e32 v3, s2, v3 -; GFX10-DL-NEXT: v_lshlrev_b16_e64 v16, 12, s13 +; GFX10-DL-NEXT: v_lshlrev_b16_e64 v16, 12, s12 ; GFX10-DL-NEXT: v_ashrrev_i16_e64 v5, 12, v11 ; GFX10-DL-NEXT: v_ashrrev_i16_e64 v10, 12, v13 -; GFX10-DL-NEXT: v_lshlrev_b16_e64 v15, 12, s14 +; GFX10-DL-NEXT: v_lshlrev_b16_e64 v15, 12, s13 ; GFX10-DL-NEXT: v_ashrrev_i16_e64 v7, 12, v7 ; GFX10-DL-NEXT: v_ashrrev_i16_e64 v11, 12, v16 ; GFX10-DL-NEXT: v_or_b32_e32 v4, v3, v4 diff --git a/llvm/test/CodeGen/AMDGPU/idot8u.ll b/llvm/test/CodeGen/AMDGPU/idot8u.ll --- a/llvm/test/CodeGen/AMDGPU/idot8u.ll +++ b/llvm/test/CodeGen/AMDGPU/idot8u.ll @@ -9,49 +9,49 @@ define amdgpu_kernel void @udot8_acc32(<8 x i4> addrspace(1)* %src1, ; GFX7-LABEL: udot8_acc32: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; GFX7-NEXT: s_mov_b32 s7, 0xf000 -; GFX7-NEXT: s_mov_b32 s6, -1 +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_load_dword s10, s[10:11], 0x0 -; GFX7-NEXT: s_load_dword s0, s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s21, s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s6, s[6:7], 0x0 +; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s20, s[0:1], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_lshr_b32 s11, s10, 28 -; GFX7-NEXT: s_bfe_u32 s15, s10, 0x40018 -; GFX7-NEXT: s_bfe_u32 s16, s10, 0x40014 -; GFX7-NEXT: s_bfe_u32 s17, s10, 0x40010 -; GFX7-NEXT: s_bfe_u32 s18, s10, 0x4000c -; GFX7-NEXT: s_bfe_u32 s19, s10, 0x40008 -; GFX7-NEXT: s_bfe_u32 s20, s10, 0x40004 -; GFX7-NEXT: s_and_b32 s10, s10, 15 -; GFX7-NEXT: s_lshr_b32 s1, s0, 28 -; GFX7-NEXT: s_bfe_u32 s2, s0, 0x40018 -; GFX7-NEXT: s_bfe_u32 s8, s0, 0x40014 -; GFX7-NEXT: s_bfe_u32 s9, s0, 0x40010 -; GFX7-NEXT: s_bfe_u32 s12, s0, 0x4000c -; GFX7-NEXT: s_bfe_u32 s13, s0, 0x40008 -; GFX7-NEXT: s_bfe_u32 s14, s0, 0x40004 -; GFX7-NEXT: s_and_b32 s0, s0, 15 -; GFX7-NEXT: v_mov_b32_e32 v0, s10 -; GFX7-NEXT: v_mov_b32_e32 v1, s21 -; GFX7-NEXT: v_mad_u32_u24 v0, s0, v0, v1 +; GFX7-NEXT: s_lshr_b32 s7, s6, 28 +; GFX7-NEXT: s_bfe_u32 s14, s6, 0x40018 +; GFX7-NEXT: s_bfe_u32 s15, s6, 0x40014 +; GFX7-NEXT: s_bfe_u32 s16, s6, 0x40010 +; GFX7-NEXT: s_bfe_u32 s17, s6, 0x4000c +; GFX7-NEXT: s_bfe_u32 s18, s6, 0x40008 +; GFX7-NEXT: s_bfe_u32 s19, s6, 0x40004 +; GFX7-NEXT: s_and_b32 s6, s6, 15 +; GFX7-NEXT: s_lshr_b32 s5, s4, 28 +; GFX7-NEXT: s_bfe_u32 s8, s4, 0x40018 +; GFX7-NEXT: s_bfe_u32 s9, s4, 0x40014 +; GFX7-NEXT: s_bfe_u32 s10, s4, 0x40010 +; GFX7-NEXT: s_bfe_u32 s11, s4, 0x4000c +; GFX7-NEXT: s_bfe_u32 s12, s4, 0x40008 +; GFX7-NEXT: s_bfe_u32 s13, s4, 0x40004 +; GFX7-NEXT: s_and_b32 s4, s4, 15 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s20 -; GFX7-NEXT: v_mad_u32_u24 v0, s14, v1, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, s4, v0, v1 ; GFX7-NEXT: v_mov_b32_e32 v1, s19 ; GFX7-NEXT: v_mad_u32_u24 v0, s13, v1, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, s18 ; GFX7-NEXT: 
v_mad_u32_u24 v0, s12, v1, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, s17 -; GFX7-NEXT: v_mad_u32_u24 v0, s9, v1, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, s11, v1, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, s16 -; GFX7-NEXT: v_mad_u32_u24 v0, s8, v1, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, s10, v1, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, s15 -; GFX7-NEXT: v_mad_u32_u24 v0, s2, v1, v0 -; GFX7-NEXT: v_mov_b32_e32 v1, s11 -; GFX7-NEXT: v_mad_u32_u24 v0, s1, v1, v0 -; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX7-NEXT: v_mad_u32_u24 v0, s9, v1, v0 +; GFX7-NEXT: v_mov_b32_e32 v1, s14 +; GFX7-NEXT: v_mad_u32_u24 v0, s8, v1, v0 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mad_u32_u24 v0, s5, v1, v0 +; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: udot8_acc32: @@ -61,29 +61,27 @@ ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_load_dword s6, s[6:7], 0x0 ; GFX8-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX8-NEXT: s_load_dword s19, s[0:1], 0x0 +; GFX8-NEXT: s_load_dword s18, s[0:1], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_lshr_b32 s7, s6, 28 -; GFX8-NEXT: s_bfe_u32 s13, s6, 0x40018 -; GFX8-NEXT: s_bfe_u32 s14, s6, 0x40014 -; GFX8-NEXT: s_bfe_u32 s15, s6, 0x40010 -; GFX8-NEXT: s_bfe_u32 s16, s6, 0x4000c -; GFX8-NEXT: s_bfe_u32 s17, s6, 0x40008 -; GFX8-NEXT: s_bfe_u32 s18, s6, 0x40004 +; GFX8-NEXT: s_bfe_u32 s12, s6, 0x40018 +; GFX8-NEXT: s_bfe_u32 s13, s6, 0x40014 +; GFX8-NEXT: s_bfe_u32 s14, s6, 0x40010 +; GFX8-NEXT: s_bfe_u32 s15, s6, 0x4000c +; GFX8-NEXT: s_bfe_u32 s16, s6, 0x40008 +; GFX8-NEXT: s_bfe_u32 s17, s6, 0x40004 ; GFX8-NEXT: s_and_b32 s6, s6, 15 -; GFX8-NEXT: s_lshr_b32 s4, s2, 28 -; GFX8-NEXT: s_bfe_u32 s5, s2, 0x40018 -; GFX8-NEXT: s_bfe_u32 s8, s2, 0x40014 -; GFX8-NEXT: s_bfe_u32 s9, s2, 0x40010 -; GFX8-NEXT: s_bfe_u32 s10, s2, 0x4000c -; GFX8-NEXT: s_bfe_u32 s11, s2, 0x40008 -; GFX8-NEXT: s_bfe_u32 s12, s2, 0x40004 +; GFX8-NEXT: s_lshr_b32 s3, s2, 28 +; GFX8-NEXT: s_bfe_u32 s4, s2, 0x40018 +; GFX8-NEXT: s_bfe_u32 s5, s2, 0x40014 +; GFX8-NEXT: s_bfe_u32 s8, s2, 0x40010 +; GFX8-NEXT: s_bfe_u32 s9, s2, 0x4000c +; GFX8-NEXT: s_bfe_u32 s10, s2, 0x40008 +; GFX8-NEXT: s_bfe_u32 s11, s2, 0x40004 ; GFX8-NEXT: s_and_b32 s2, s2, 15 ; GFX8-NEXT: v_mov_b32_e32 v0, s6 -; GFX8-NEXT: v_mov_b32_e32 v1, s19 -; GFX8-NEXT: v_mad_u32_u24 v0, s2, v0, v1 ; GFX8-NEXT: v_mov_b32_e32 v1, s18 -; GFX8-NEXT: v_mad_u32_u24 v0, s12, v1, v0 +; GFX8-NEXT: v_mad_u32_u24 v0, s2, v0, v1 ; GFX8-NEXT: v_mov_b32_e32 v1, s17 ; GFX8-NEXT: v_mad_u32_u24 v0, s11, v1, v0 ; GFX8-NEXT: v_mov_b32_e32 v1, s16 @@ -94,8 +92,10 @@ ; GFX8-NEXT: v_mad_u32_u24 v0, s8, v1, v0 ; GFX8-NEXT: v_mov_b32_e32 v1, s13 ; GFX8-NEXT: v_mad_u32_u24 v0, s5, v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v1, s12 +; GFX8-NEXT: v_mad_u32_u24 v0, s4, v1, v0 ; GFX8-NEXT: v_mov_b32_e32 v1, s7 -; GFX8-NEXT: v_mad_u32_u24 v2, s4, v1, v0 +; GFX8-NEXT: v_mad_u32_u24 v2, s3, v1, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_store_dword v[0:1], v2 @@ -108,29 +108,27 @@ ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dword s6, s[6:7], 0x0 ; GFX9-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s19, s[0:1], 0x0 +; GFX9-NEXT: s_load_dword s18, s[0:1], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshr_b32 s7, s6, 28 -; GFX9-NEXT: s_bfe_u32 s13, s6, 0x40018 -; GFX9-NEXT: s_bfe_u32 s14, s6, 0x40014 -; GFX9-NEXT: s_bfe_u32 s15, s6, 0x40010 -; GFX9-NEXT: s_bfe_u32 s16, s6, 0x4000c -; GFX9-NEXT: s_bfe_u32 s17, s6, 0x40008 -; GFX9-NEXT: s_bfe_u32 s18, s6, 0x40004 +; GFX9-NEXT: 
s_bfe_u32 s12, s6, 0x40018 +; GFX9-NEXT: s_bfe_u32 s13, s6, 0x40014 +; GFX9-NEXT: s_bfe_u32 s14, s6, 0x40010 +; GFX9-NEXT: s_bfe_u32 s15, s6, 0x4000c +; GFX9-NEXT: s_bfe_u32 s16, s6, 0x40008 +; GFX9-NEXT: s_bfe_u32 s17, s6, 0x40004 ; GFX9-NEXT: s_and_b32 s6, s6, 15 -; GFX9-NEXT: s_lshr_b32 s4, s2, 28 -; GFX9-NEXT: s_bfe_u32 s5, s2, 0x40018 -; GFX9-NEXT: s_bfe_u32 s8, s2, 0x40014 -; GFX9-NEXT: s_bfe_u32 s9, s2, 0x40010 -; GFX9-NEXT: s_bfe_u32 s10, s2, 0x4000c -; GFX9-NEXT: s_bfe_u32 s11, s2, 0x40008 -; GFX9-NEXT: s_bfe_u32 s12, s2, 0x40004 +; GFX9-NEXT: s_lshr_b32 s3, s2, 28 +; GFX9-NEXT: s_bfe_u32 s4, s2, 0x40018 +; GFX9-NEXT: s_bfe_u32 s5, s2, 0x40014 +; GFX9-NEXT: s_bfe_u32 s8, s2, 0x40010 +; GFX9-NEXT: s_bfe_u32 s9, s2, 0x4000c +; GFX9-NEXT: s_bfe_u32 s10, s2, 0x40008 +; GFX9-NEXT: s_bfe_u32 s11, s2, 0x40004 ; GFX9-NEXT: s_and_b32 s2, s2, 15 ; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s19 -; GFX9-NEXT: v_mad_u32_u24 v0, s2, v0, v1 ; GFX9-NEXT: v_mov_b32_e32 v1, s18 -; GFX9-NEXT: v_mad_u32_u24 v0, s12, v1, v0 +; GFX9-NEXT: v_mad_u32_u24 v0, s2, v0, v1 ; GFX9-NEXT: v_mov_b32_e32 v1, s17 ; GFX9-NEXT: v_mad_u32_u24 v0, s11, v1, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, s16 @@ -141,8 +139,10 @@ ; GFX9-NEXT: v_mad_u32_u24 v0, s8, v1, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, s13 ; GFX9-NEXT: v_mad_u32_u24 v0, s5, v1, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, s12 +; GFX9-NEXT: v_mad_u32_u24 v0, s4, v1, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_mad_u32_u24 v2, s4, v1, v0 +; GFX9-NEXT: v_mad_u32_u24 v2, s3, v1, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: global_store_dword v[0:1], v2, off @@ -154,11 +154,11 @@ ; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX9-DL-NEXT: s_load_dword s6, s[0:1], 0x0 +; GFX9-DL-NEXT: s_load_dword s3, s[0:1], 0x0 ; GFX9-DL-NEXT: s_load_dword s4, s[4:5], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-DL-NEXT: v_dot8_u32_u4 v2, s4, v0, v1 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 @@ -167,18 +167,18 @@ ; ; GFX10-DL-LABEL: udot8_acc32: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s0, s[8:9], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s2, s[6:7], 0x0 +; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-DL-NEXT: v_dot8_u32_u4 v2, s1, s2, v0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s8 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s9 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-DL-NEXT: v_dot8_u32_u4 v2, s0, s1, v0 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <8 x i4> addrspace(1)* %src2, @@ -254,49 +254,49 @@ define amdgpu_kernel void @udot8_acc16(<8 x i4> addrspace(1)* %src1, ; GFX7-LABEL: udot8_acc16: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 -; 
GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; GFX7-NEXT: s_mov_b32 s7, 0xf000 -; GFX7-NEXT: s_mov_b32 s6, -1 +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: buffer_load_ushort v0, off, s[4:7], 0 -; GFX7-NEXT: s_load_dword s0, s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s1, s[10:11], 0x0 +; GFX7-NEXT: buffer_load_ushort v0, off, s[0:3], 0 +; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_lshr_b32 s2, s0, 28 -; GFX7-NEXT: s_bfe_u32 s15, s1, 0x40018 -; GFX7-NEXT: s_bfe_u32 s16, s1, 0x40014 -; GFX7-NEXT: s_bfe_u32 s17, s1, 0x40010 -; GFX7-NEXT: s_bfe_u32 s18, s1, 0x4000c -; GFX7-NEXT: s_bfe_u32 s19, s1, 0x40008 -; GFX7-NEXT: s_bfe_u32 s20, s1, 0x40004 -; GFX7-NEXT: s_lshr_b32 s14, s1, 28 -; GFX7-NEXT: s_and_b32 s1, s1, 15 -; GFX7-NEXT: s_bfe_u32 s8, s0, 0x40018 -; GFX7-NEXT: s_bfe_u32 s9, s0, 0x40014 -; GFX7-NEXT: s_bfe_u32 s10, s0, 0x40010 -; GFX7-NEXT: s_bfe_u32 s11, s0, 0x4000c -; GFX7-NEXT: s_bfe_u32 s12, s0, 0x40008 -; GFX7-NEXT: s_bfe_u32 s13, s0, 0x40004 -; GFX7-NEXT: s_and_b32 s0, s0, 15 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: v_mov_b32_e32 v2, s20 -; GFX7-NEXT: v_mov_b32_e32 v3, s19 -; GFX7-NEXT: v_mov_b32_e32 v4, s18 -; GFX7-NEXT: v_mov_b32_e32 v5, s17 -; GFX7-NEXT: v_mov_b32_e32 v6, s16 -; GFX7-NEXT: v_mov_b32_e32 v7, s15 +; GFX7-NEXT: s_lshr_b32 s6, s4, 28 +; GFX7-NEXT: s_bfe_u32 s14, s5, 0x40018 +; GFX7-NEXT: s_bfe_u32 s15, s5, 0x40014 +; GFX7-NEXT: s_bfe_u32 s16, s5, 0x40010 +; GFX7-NEXT: s_bfe_u32 s17, s5, 0x4000c +; GFX7-NEXT: s_bfe_u32 s18, s5, 0x40008 +; GFX7-NEXT: s_bfe_u32 s19, s5, 0x40004 +; GFX7-NEXT: s_lshr_b32 s13, s5, 28 +; GFX7-NEXT: s_and_b32 s5, s5, 15 +; GFX7-NEXT: s_bfe_u32 s7, s4, 0x40018 +; GFX7-NEXT: s_bfe_u32 s8, s4, 0x40014 +; GFX7-NEXT: s_bfe_u32 s9, s4, 0x40010 +; GFX7-NEXT: s_bfe_u32 s10, s4, 0x4000c +; GFX7-NEXT: s_bfe_u32 s11, s4, 0x40008 +; GFX7-NEXT: s_bfe_u32 s12, s4, 0x40004 +; GFX7-NEXT: s_and_b32 s4, s4, 15 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s19 +; GFX7-NEXT: v_mov_b32_e32 v3, s18 +; GFX7-NEXT: v_mov_b32_e32 v4, s17 +; GFX7-NEXT: v_mov_b32_e32 v5, s16 +; GFX7-NEXT: v_mov_b32_e32 v6, s15 +; GFX7-NEXT: v_mov_b32_e32 v7, s14 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mad_u32_u24 v0, s0, v1, v0 -; GFX7-NEXT: v_mad_u32_u24 v0, s13, v2, v0 -; GFX7-NEXT: v_mad_u32_u24 v0, s12, v3, v0 -; GFX7-NEXT: v_mad_u32_u24 v0, s11, v4, v0 -; GFX7-NEXT: v_mad_u32_u24 v0, s10, v5, v0 -; GFX7-NEXT: v_mad_u32_u24 v0, s9, v6, v0 -; GFX7-NEXT: v_mad_u32_u24 v0, s8, v7, v0 -; GFX7-NEXT: v_mov_b32_e32 v1, s14 -; GFX7-NEXT: v_mad_u32_u24 v0, s2, v1, v0 -; GFX7-NEXT: buffer_store_short v0, off, s[4:7], 0 +; GFX7-NEXT: v_mad_u32_u24 v0, s4, v1, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, s12, v2, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, s11, v3, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, s10, v4, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, s9, v5, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, s8, v6, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, s7, v7, v0 +; GFX7-NEXT: v_mov_b32_e32 v1, s13 +; GFX7-NEXT: v_mad_u32_u24 v0, s6, v1, v0 +; GFX7-NEXT: buffer_store_short v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: udot8_acc16: @@ -311,38 +311,38 @@ ; GFX8-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_lshr_b32 s2, s0, 28 -; GFX8-NEXT: s_bfe_u32 s11, s1, 0x40018 -; GFX8-NEXT: s_bfe_u32 s12, s1, 
0x40014 -; GFX8-NEXT: s_bfe_u32 s13, s1, 0x40010 -; GFX8-NEXT: s_bfe_u32 s14, s1, 0x4000c -; GFX8-NEXT: s_bfe_u32 s15, s1, 0x40008 -; GFX8-NEXT: s_bfe_u32 s16, s1, 0x40004 -; GFX8-NEXT: s_lshr_b32 s10, s1, 28 +; GFX8-NEXT: s_bfe_u32 s10, s1, 0x40018 +; GFX8-NEXT: s_bfe_u32 s11, s1, 0x40014 +; GFX8-NEXT: s_bfe_u32 s12, s1, 0x40010 +; GFX8-NEXT: s_bfe_u32 s13, s1, 0x4000c +; GFX8-NEXT: s_bfe_u32 s14, s1, 0x40008 +; GFX8-NEXT: s_bfe_u32 s15, s1, 0x40004 +; GFX8-NEXT: s_lshr_b32 s9, s1, 28 ; GFX8-NEXT: s_and_b32 s1, s1, 15 -; GFX8-NEXT: s_bfe_u32 s4, s0, 0x40018 -; GFX8-NEXT: s_bfe_u32 s5, s0, 0x40014 -; GFX8-NEXT: s_bfe_u32 s6, s0, 0x40010 -; GFX8-NEXT: s_bfe_u32 s7, s0, 0x4000c -; GFX8-NEXT: s_bfe_u32 s8, s0, 0x40008 -; GFX8-NEXT: s_bfe_u32 s9, s0, 0x40004 +; GFX8-NEXT: s_bfe_u32 s3, s0, 0x40018 +; GFX8-NEXT: s_bfe_u32 s4, s0, 0x40014 +; GFX8-NEXT: s_bfe_u32 s5, s0, 0x40010 +; GFX8-NEXT: s_bfe_u32 s6, s0, 0x4000c +; GFX8-NEXT: s_bfe_u32 s7, s0, 0x40008 +; GFX8-NEXT: s_bfe_u32 s8, s0, 0x40004 ; GFX8-NEXT: s_and_b32 s0, s0, 15 ; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_mov_b32_e32 v4, s16 -; GFX8-NEXT: v_mov_b32_e32 v5, s15 -; GFX8-NEXT: v_mov_b32_e32 v6, s14 -; GFX8-NEXT: v_mov_b32_e32 v7, s13 -; GFX8-NEXT: v_mov_b32_e32 v8, s12 -; GFX8-NEXT: v_mov_b32_e32 v9, s11 +; GFX8-NEXT: v_mov_b32_e32 v4, s15 +; GFX8-NEXT: v_mov_b32_e32 v5, s14 +; GFX8-NEXT: v_mov_b32_e32 v6, s13 +; GFX8-NEXT: v_mov_b32_e32 v7, s12 +; GFX8-NEXT: v_mov_b32_e32 v8, s11 +; GFX8-NEXT: v_mov_b32_e32 v9, s10 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mad_u32_u24 v2, s0, v3, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s9, v4, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s8, v4, v2 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s8, v5, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s7, v6, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s6, v7, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s5, v8, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s4, v9, v2 -; GFX8-NEXT: v_mov_b32_e32 v3, s10 +; GFX8-NEXT: v_mad_u32_u24 v2, s7, v5, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s6, v6, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s5, v7, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s4, v8, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s3, v9, v2 +; GFX8-NEXT: v_mov_b32_e32 v3, s9 ; GFX8-NEXT: v_mad_u32_u24 v2, s2, v3, v2 ; GFX8-NEXT: flat_store_short v[0:1], v2 ; GFX8-NEXT: s_endpgm @@ -359,38 +359,38 @@ ; GFX9-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshr_b32 s2, s0, 28 -; GFX9-NEXT: s_bfe_u32 s11, s1, 0x40018 -; GFX9-NEXT: s_bfe_u32 s12, s1, 0x40014 -; GFX9-NEXT: s_bfe_u32 s13, s1, 0x40010 -; GFX9-NEXT: s_bfe_u32 s14, s1, 0x4000c -; GFX9-NEXT: s_bfe_u32 s15, s1, 0x40008 -; GFX9-NEXT: s_bfe_u32 s16, s1, 0x40004 -; GFX9-NEXT: s_lshr_b32 s10, s1, 28 +; GFX9-NEXT: s_bfe_u32 s10, s1, 0x40018 +; GFX9-NEXT: s_bfe_u32 s11, s1, 0x40014 +; GFX9-NEXT: s_bfe_u32 s12, s1, 0x40010 +; GFX9-NEXT: s_bfe_u32 s13, s1, 0x4000c +; GFX9-NEXT: s_bfe_u32 s14, s1, 0x40008 +; GFX9-NEXT: s_bfe_u32 s15, s1, 0x40004 +; GFX9-NEXT: s_lshr_b32 s9, s1, 28 ; GFX9-NEXT: s_and_b32 s1, s1, 15 -; GFX9-NEXT: s_bfe_u32 s4, s0, 0x40018 -; GFX9-NEXT: s_bfe_u32 s5, s0, 0x40014 -; GFX9-NEXT: s_bfe_u32 s6, s0, 0x40010 -; GFX9-NEXT: s_bfe_u32 s7, s0, 0x4000c -; GFX9-NEXT: s_bfe_u32 s8, s0, 0x40008 -; GFX9-NEXT: s_bfe_u32 s9, s0, 0x40004 +; GFX9-NEXT: s_bfe_u32 s3, s0, 0x40018 +; GFX9-NEXT: s_bfe_u32 s4, s0, 0x40014 +; GFX9-NEXT: s_bfe_u32 s5, s0, 0x40010 +; GFX9-NEXT: s_bfe_u32 s6, s0, 0x4000c +; GFX9-NEXT: s_bfe_u32 s7, s0, 0x40008 +; GFX9-NEXT: s_bfe_u32 s8, s0, 0x40004 ; GFX9-NEXT: s_and_b32 s0, 
s0, 15 ; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_mov_b32_e32 v4, s16 -; GFX9-NEXT: v_mov_b32_e32 v5, s15 -; GFX9-NEXT: v_mov_b32_e32 v6, s14 -; GFX9-NEXT: v_mov_b32_e32 v7, s13 -; GFX9-NEXT: v_mov_b32_e32 v8, s12 -; GFX9-NEXT: v_mov_b32_e32 v9, s11 +; GFX9-NEXT: v_mov_b32_e32 v4, s15 +; GFX9-NEXT: v_mov_b32_e32 v5, s14 +; GFX9-NEXT: v_mov_b32_e32 v6, s13 +; GFX9-NEXT: v_mov_b32_e32 v7, s12 +; GFX9-NEXT: v_mov_b32_e32 v8, s11 +; GFX9-NEXT: v_mov_b32_e32 v9, s10 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mad_u32_u24 v2, s0, v3, v2 -; GFX9-NEXT: v_mad_u32_u24 v2, s9, v4, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s8, v4, v2 ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX9-NEXT: v_mad_u32_u24 v2, s8, v5, v2 -; GFX9-NEXT: v_mad_u32_u24 v2, s7, v6, v2 -; GFX9-NEXT: v_mad_u32_u24 v2, s6, v7, v2 -; GFX9-NEXT: v_mad_u32_u24 v2, s5, v8, v2 -; GFX9-NEXT: v_mad_u32_u24 v2, s4, v9, v2 -; GFX9-NEXT: v_mov_b32_e32 v3, s10 +; GFX9-NEXT: v_mad_u32_u24 v2, s7, v5, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s6, v6, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s5, v7, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s4, v8, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s3, v9, v2 +; GFX9-NEXT: v_mov_b32_e32 v3, s9 ; GFX9-NEXT: v_mad_u32_u24 v2, s2, v3, v2 ; GFX9-NEXT: global_store_short v[0:1], v2, off ; GFX9-NEXT: s_endpgm @@ -407,80 +407,80 @@ ; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: s_lshr_b32 s2, s0, 28 -; GFX9-DL-NEXT: s_bfe_u32 s11, s1, 0x40018 -; GFX9-DL-NEXT: s_bfe_u32 s12, s1, 0x40014 -; GFX9-DL-NEXT: s_bfe_u32 s13, s1, 0x40010 -; GFX9-DL-NEXT: s_bfe_u32 s14, s1, 0x4000c -; GFX9-DL-NEXT: s_bfe_u32 s15, s1, 0x40008 -; GFX9-DL-NEXT: s_bfe_u32 s16, s1, 0x40004 -; GFX9-DL-NEXT: s_lshr_b32 s10, s1, 28 +; GFX9-DL-NEXT: s_bfe_u32 s10, s1, 0x40018 +; GFX9-DL-NEXT: s_bfe_u32 s11, s1, 0x40014 +; GFX9-DL-NEXT: s_bfe_u32 s12, s1, 0x40010 +; GFX9-DL-NEXT: s_bfe_u32 s13, s1, 0x4000c +; GFX9-DL-NEXT: s_bfe_u32 s14, s1, 0x40008 +; GFX9-DL-NEXT: s_bfe_u32 s15, s1, 0x40004 +; GFX9-DL-NEXT: s_lshr_b32 s9, s1, 28 ; GFX9-DL-NEXT: s_and_b32 s1, s1, 15 -; GFX9-DL-NEXT: s_bfe_u32 s4, s0, 0x40018 -; GFX9-DL-NEXT: s_bfe_u32 s5, s0, 0x40014 -; GFX9-DL-NEXT: s_bfe_u32 s6, s0, 0x40010 -; GFX9-DL-NEXT: s_bfe_u32 s7, s0, 0x4000c -; GFX9-DL-NEXT: s_bfe_u32 s8, s0, 0x40008 -; GFX9-DL-NEXT: s_bfe_u32 s9, s0, 0x40004 +; GFX9-DL-NEXT: s_bfe_u32 s3, s0, 0x40018 +; GFX9-DL-NEXT: s_bfe_u32 s4, s0, 0x40014 +; GFX9-DL-NEXT: s_bfe_u32 s5, s0, 0x40010 +; GFX9-DL-NEXT: s_bfe_u32 s6, s0, 0x4000c +; GFX9-DL-NEXT: s_bfe_u32 s7, s0, 0x40008 +; GFX9-DL-NEXT: s_bfe_u32 s8, s0, 0x40004 ; GFX9-DL-NEXT: s_and_b32 s0, s0, 15 ; GFX9-DL-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-DL-NEXT: v_mov_b32_e32 v4, s16 -; GFX9-DL-NEXT: v_mov_b32_e32 v5, s15 -; GFX9-DL-NEXT: v_mov_b32_e32 v6, s14 -; GFX9-DL-NEXT: v_mov_b32_e32 v7, s13 -; GFX9-DL-NEXT: v_mov_b32_e32 v8, s12 -; GFX9-DL-NEXT: v_mov_b32_e32 v9, s11 +; GFX9-DL-NEXT: v_mov_b32_e32 v4, s15 +; GFX9-DL-NEXT: v_mov_b32_e32 v5, s14 +; GFX9-DL-NEXT: v_mov_b32_e32 v6, s13 +; GFX9-DL-NEXT: v_mov_b32_e32 v7, s12 +; GFX9-DL-NEXT: v_mov_b32_e32 v8, s11 +; GFX9-DL-NEXT: v_mov_b32_e32 v9, s10 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) ; GFX9-DL-NEXT: v_mad_u32_u24 v2, s0, v3, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s9, v4, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s8, v4, v2 ; GFX9-DL-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s8, v5, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s7, v6, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s6, v7, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s5, v8, v2 -; GFX9-DL-NEXT: 
v_mad_u32_u24 v2, s4, v9, v2 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s10 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s7, v5, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s6, v6, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s5, v7, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s4, v8, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s3, v9, v2 +; GFX9-DL-NEXT: v_mov_b32_e32 v3, s9 ; GFX9-DL-NEXT: v_mad_u32_u24 v2, s2, v3, v2 ; GFX9-DL-NEXT: global_store_short v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot8_acc16: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-DL-NEXT: global_load_ushort v2, v[0:1], off ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_and_b32 s2, s0, 15 -; GFX10-DL-NEXT: s_and_b32 s4, s1, 15 -; GFX10-DL-NEXT: s_bfe_u32 s5, s0, 0x40004 -; GFX10-DL-NEXT: s_bfe_u32 s6, s1, 0x40004 +; GFX10-DL-NEXT: s_and_b32 s3, s1, 15 +; GFX10-DL-NEXT: s_bfe_u32 s4, s0, 0x40004 +; GFX10-DL-NEXT: s_bfe_u32 s5, s1, 0x40004 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s4, v2 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s3, v2 ; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40008 -; GFX10-DL-NEXT: s_bfe_u32 s4, s1, 0x40008 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s5, s6, v2 -; GFX10-DL-NEXT: s_bfe_u32 s5, s0, 0x4000c -; GFX10-DL-NEXT: s_bfe_u32 s6, s1, 0x4000c +; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x40008 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s4, s5, v2 +; GFX10-DL-NEXT: s_bfe_u32 s4, s0, 0x4000c +; GFX10-DL-NEXT: s_bfe_u32 s5, s1, 0x4000c ; GFX10-DL-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s4, v2 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s3, v2 ; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40010 -; GFX10-DL-NEXT: s_bfe_u32 s4, s1, 0x40010 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s5, s6, v2 -; GFX10-DL-NEXT: s_bfe_u32 s5, s0, 0x40014 -; GFX10-DL-NEXT: s_bfe_u32 s6, s1, 0x40014 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s4, v2 +; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x40010 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s4, s5, v2 +; GFX10-DL-NEXT: s_bfe_u32 s4, s0, 0x40014 +; GFX10-DL-NEXT: s_bfe_u32 s5, s1, 0x40014 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s3, v2 ; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40018 -; GFX10-DL-NEXT: s_bfe_u32 s4, s1, 0x40018 +; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x40018 ; GFX10-DL-NEXT: s_lshr_b32 s0, s0, 28 ; GFX10-DL-NEXT: s_lshr_b32 s1, s1, 28 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s5, s6, v2 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s4, v2 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s4, s5, v2 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s3, v2 ; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2 ; GFX10-DL-NEXT: global_store_short v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm @@ -557,49 +557,49 @@ define amdgpu_kernel void @udot8_acc8(<8 x i4> addrspace(1)* %src1, ; GFX7-LABEL: udot8_acc8: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; GFX7-NEXT: s_mov_b32 s7, 0xf000 -; 
GFX7-NEXT: s_mov_b32 s6, -1 +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: buffer_load_ubyte v0, off, s[4:7], 0 -; GFX7-NEXT: s_load_dword s0, s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s1, s[10:11], 0x0 +; GFX7-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 +; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_lshr_b32 s2, s0, 28 -; GFX7-NEXT: s_bfe_u32 s15, s1, 0x40018 -; GFX7-NEXT: s_bfe_u32 s16, s1, 0x40014 -; GFX7-NEXT: s_bfe_u32 s17, s1, 0x40010 -; GFX7-NEXT: s_bfe_u32 s18, s1, 0x4000c -; GFX7-NEXT: s_bfe_u32 s19, s1, 0x40008 -; GFX7-NEXT: s_bfe_u32 s20, s1, 0x40004 -; GFX7-NEXT: s_lshr_b32 s14, s1, 28 -; GFX7-NEXT: s_and_b32 s1, s1, 15 -; GFX7-NEXT: s_bfe_u32 s8, s0, 0x40018 -; GFX7-NEXT: s_bfe_u32 s9, s0, 0x40014 -; GFX7-NEXT: s_bfe_u32 s10, s0, 0x40010 -; GFX7-NEXT: s_bfe_u32 s11, s0, 0x4000c -; GFX7-NEXT: s_bfe_u32 s12, s0, 0x40008 -; GFX7-NEXT: s_bfe_u32 s13, s0, 0x40004 -; GFX7-NEXT: s_and_b32 s0, s0, 15 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: v_mov_b32_e32 v2, s20 -; GFX7-NEXT: v_mov_b32_e32 v3, s19 -; GFX7-NEXT: v_mov_b32_e32 v4, s18 -; GFX7-NEXT: v_mov_b32_e32 v5, s17 -; GFX7-NEXT: v_mov_b32_e32 v6, s16 -; GFX7-NEXT: v_mov_b32_e32 v7, s15 +; GFX7-NEXT: s_lshr_b32 s6, s4, 28 +; GFX7-NEXT: s_bfe_u32 s14, s5, 0x40018 +; GFX7-NEXT: s_bfe_u32 s15, s5, 0x40014 +; GFX7-NEXT: s_bfe_u32 s16, s5, 0x40010 +; GFX7-NEXT: s_bfe_u32 s17, s5, 0x4000c +; GFX7-NEXT: s_bfe_u32 s18, s5, 0x40008 +; GFX7-NEXT: s_bfe_u32 s19, s5, 0x40004 +; GFX7-NEXT: s_lshr_b32 s13, s5, 28 +; GFX7-NEXT: s_and_b32 s5, s5, 15 +; GFX7-NEXT: s_bfe_u32 s7, s4, 0x40018 +; GFX7-NEXT: s_bfe_u32 s8, s4, 0x40014 +; GFX7-NEXT: s_bfe_u32 s9, s4, 0x40010 +; GFX7-NEXT: s_bfe_u32 s10, s4, 0x4000c +; GFX7-NEXT: s_bfe_u32 s11, s4, 0x40008 +; GFX7-NEXT: s_bfe_u32 s12, s4, 0x40004 +; GFX7-NEXT: s_and_b32 s4, s4, 15 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s19 +; GFX7-NEXT: v_mov_b32_e32 v3, s18 +; GFX7-NEXT: v_mov_b32_e32 v4, s17 +; GFX7-NEXT: v_mov_b32_e32 v5, s16 +; GFX7-NEXT: v_mov_b32_e32 v6, s15 +; GFX7-NEXT: v_mov_b32_e32 v7, s14 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mad_u32_u24 v0, s0, v1, v0 -; GFX7-NEXT: v_mad_u32_u24 v0, s13, v2, v0 -; GFX7-NEXT: v_mad_u32_u24 v0, s12, v3, v0 -; GFX7-NEXT: v_mad_u32_u24 v0, s11, v4, v0 -; GFX7-NEXT: v_mad_u32_u24 v0, s10, v5, v0 -; GFX7-NEXT: v_mad_u32_u24 v0, s9, v6, v0 -; GFX7-NEXT: v_mad_u32_u24 v0, s8, v7, v0 -; GFX7-NEXT: v_mov_b32_e32 v1, s14 -; GFX7-NEXT: v_mad_u32_u24 v0, s2, v1, v0 -; GFX7-NEXT: buffer_store_byte v0, off, s[4:7], 0 +; GFX7-NEXT: v_mad_u32_u24 v0, s4, v1, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, s12, v2, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, s11, v3, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, s10, v4, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, s9, v5, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, s8, v6, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, s7, v7, v0 +; GFX7-NEXT: v_mov_b32_e32 v1, s13 +; GFX7-NEXT: v_mad_u32_u24 v0, s6, v1, v0 +; GFX7-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: udot8_acc8: @@ -614,38 +614,38 @@ ; GFX8-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_lshr_b32 s2, s0, 28 -; GFX8-NEXT: s_bfe_u32 s11, s1, 0x40018 -; GFX8-NEXT: s_bfe_u32 s12, s1, 0x40014 -; GFX8-NEXT: s_bfe_u32 s13, s1, 0x40010 -; GFX8-NEXT: s_bfe_u32 s14, s1, 0x4000c -; 
GFX8-NEXT: s_bfe_u32 s15, s1, 0x40008 -; GFX8-NEXT: s_bfe_u32 s16, s1, 0x40004 -; GFX8-NEXT: s_lshr_b32 s10, s1, 28 +; GFX8-NEXT: s_bfe_u32 s10, s1, 0x40018 +; GFX8-NEXT: s_bfe_u32 s11, s1, 0x40014 +; GFX8-NEXT: s_bfe_u32 s12, s1, 0x40010 +; GFX8-NEXT: s_bfe_u32 s13, s1, 0x4000c +; GFX8-NEXT: s_bfe_u32 s14, s1, 0x40008 +; GFX8-NEXT: s_bfe_u32 s15, s1, 0x40004 +; GFX8-NEXT: s_lshr_b32 s9, s1, 28 ; GFX8-NEXT: s_and_b32 s1, s1, 15 -; GFX8-NEXT: s_bfe_u32 s4, s0, 0x40018 -; GFX8-NEXT: s_bfe_u32 s5, s0, 0x40014 -; GFX8-NEXT: s_bfe_u32 s6, s0, 0x40010 -; GFX8-NEXT: s_bfe_u32 s7, s0, 0x4000c -; GFX8-NEXT: s_bfe_u32 s8, s0, 0x40008 -; GFX8-NEXT: s_bfe_u32 s9, s0, 0x40004 +; GFX8-NEXT: s_bfe_u32 s3, s0, 0x40018 +; GFX8-NEXT: s_bfe_u32 s4, s0, 0x40014 +; GFX8-NEXT: s_bfe_u32 s5, s0, 0x40010 +; GFX8-NEXT: s_bfe_u32 s6, s0, 0x4000c +; GFX8-NEXT: s_bfe_u32 s7, s0, 0x40008 +; GFX8-NEXT: s_bfe_u32 s8, s0, 0x40004 ; GFX8-NEXT: s_and_b32 s0, s0, 15 ; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_mov_b32_e32 v4, s16 -; GFX8-NEXT: v_mov_b32_e32 v5, s15 -; GFX8-NEXT: v_mov_b32_e32 v6, s14 -; GFX8-NEXT: v_mov_b32_e32 v7, s13 -; GFX8-NEXT: v_mov_b32_e32 v8, s12 -; GFX8-NEXT: v_mov_b32_e32 v9, s11 +; GFX8-NEXT: v_mov_b32_e32 v4, s15 +; GFX8-NEXT: v_mov_b32_e32 v5, s14 +; GFX8-NEXT: v_mov_b32_e32 v6, s13 +; GFX8-NEXT: v_mov_b32_e32 v7, s12 +; GFX8-NEXT: v_mov_b32_e32 v8, s11 +; GFX8-NEXT: v_mov_b32_e32 v9, s10 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mad_u32_u24 v2, s0, v3, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s9, v4, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s8, v4, v2 ; GFX8-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s8, v5, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s7, v6, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s6, v7, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s5, v8, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s4, v9, v2 -; GFX8-NEXT: v_mov_b32_e32 v3, s10 +; GFX8-NEXT: v_mad_u32_u24 v2, s7, v5, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s6, v6, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s5, v7, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s4, v8, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s3, v9, v2 +; GFX8-NEXT: v_mov_b32_e32 v3, s9 ; GFX8-NEXT: v_mad_u32_u24 v2, s2, v3, v2 ; GFX8-NEXT: flat_store_byte v[0:1], v2 ; GFX8-NEXT: s_endpgm @@ -662,38 +662,38 @@ ; GFX9-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshr_b32 s2, s0, 28 -; GFX9-NEXT: s_bfe_u32 s11, s1, 0x40018 -; GFX9-NEXT: s_bfe_u32 s12, s1, 0x40014 -; GFX9-NEXT: s_bfe_u32 s13, s1, 0x40010 -; GFX9-NEXT: s_bfe_u32 s14, s1, 0x4000c -; GFX9-NEXT: s_bfe_u32 s15, s1, 0x40008 -; GFX9-NEXT: s_bfe_u32 s16, s1, 0x40004 -; GFX9-NEXT: s_lshr_b32 s10, s1, 28 +; GFX9-NEXT: s_bfe_u32 s10, s1, 0x40018 +; GFX9-NEXT: s_bfe_u32 s11, s1, 0x40014 +; GFX9-NEXT: s_bfe_u32 s12, s1, 0x40010 +; GFX9-NEXT: s_bfe_u32 s13, s1, 0x4000c +; GFX9-NEXT: s_bfe_u32 s14, s1, 0x40008 +; GFX9-NEXT: s_bfe_u32 s15, s1, 0x40004 +; GFX9-NEXT: s_lshr_b32 s9, s1, 28 ; GFX9-NEXT: s_and_b32 s1, s1, 15 -; GFX9-NEXT: s_bfe_u32 s4, s0, 0x40018 -; GFX9-NEXT: s_bfe_u32 s5, s0, 0x40014 -; GFX9-NEXT: s_bfe_u32 s6, s0, 0x40010 -; GFX9-NEXT: s_bfe_u32 s7, s0, 0x4000c -; GFX9-NEXT: s_bfe_u32 s8, s0, 0x40008 -; GFX9-NEXT: s_bfe_u32 s9, s0, 0x40004 +; GFX9-NEXT: s_bfe_u32 s3, s0, 0x40018 +; GFX9-NEXT: s_bfe_u32 s4, s0, 0x40014 +; GFX9-NEXT: s_bfe_u32 s5, s0, 0x40010 +; GFX9-NEXT: s_bfe_u32 s6, s0, 0x4000c +; GFX9-NEXT: s_bfe_u32 s7, s0, 0x40008 +; GFX9-NEXT: s_bfe_u32 s8, s0, 0x40004 ; GFX9-NEXT: s_and_b32 s0, s0, 15 ; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_mov_b32_e32 v4, s16 -; GFX9-NEXT: 
v_mov_b32_e32 v5, s15 -; GFX9-NEXT: v_mov_b32_e32 v6, s14 -; GFX9-NEXT: v_mov_b32_e32 v7, s13 -; GFX9-NEXT: v_mov_b32_e32 v8, s12 -; GFX9-NEXT: v_mov_b32_e32 v9, s11 +; GFX9-NEXT: v_mov_b32_e32 v4, s15 +; GFX9-NEXT: v_mov_b32_e32 v5, s14 +; GFX9-NEXT: v_mov_b32_e32 v6, s13 +; GFX9-NEXT: v_mov_b32_e32 v7, s12 +; GFX9-NEXT: v_mov_b32_e32 v8, s11 +; GFX9-NEXT: v_mov_b32_e32 v9, s10 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mad_u32_u24 v2, s0, v3, v2 -; GFX9-NEXT: v_mad_u32_u24 v2, s9, v4, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s8, v4, v2 ; GFX9-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GFX9-NEXT: v_mad_u32_u24 v2, s8, v5, v2 -; GFX9-NEXT: v_mad_u32_u24 v2, s7, v6, v2 -; GFX9-NEXT: v_mad_u32_u24 v2, s6, v7, v2 -; GFX9-NEXT: v_mad_u32_u24 v2, s5, v8, v2 -; GFX9-NEXT: v_mad_u32_u24 v2, s4, v9, v2 -; GFX9-NEXT: v_mov_b32_e32 v3, s10 +; GFX9-NEXT: v_mad_u32_u24 v2, s7, v5, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s6, v6, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s5, v7, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s4, v8, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s3, v9, v2 +; GFX9-NEXT: v_mov_b32_e32 v3, s9 ; GFX9-NEXT: v_mad_u32_u24 v2, s2, v3, v2 ; GFX9-NEXT: global_store_byte v[0:1], v2, off ; GFX9-NEXT: s_endpgm @@ -710,80 +710,80 @@ ; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: s_lshr_b32 s2, s0, 28 -; GFX9-DL-NEXT: s_bfe_u32 s11, s1, 0x40018 -; GFX9-DL-NEXT: s_bfe_u32 s12, s1, 0x40014 -; GFX9-DL-NEXT: s_bfe_u32 s13, s1, 0x40010 -; GFX9-DL-NEXT: s_bfe_u32 s14, s1, 0x4000c -; GFX9-DL-NEXT: s_bfe_u32 s15, s1, 0x40008 -; GFX9-DL-NEXT: s_bfe_u32 s16, s1, 0x40004 -; GFX9-DL-NEXT: s_lshr_b32 s10, s1, 28 +; GFX9-DL-NEXT: s_bfe_u32 s10, s1, 0x40018 +; GFX9-DL-NEXT: s_bfe_u32 s11, s1, 0x40014 +; GFX9-DL-NEXT: s_bfe_u32 s12, s1, 0x40010 +; GFX9-DL-NEXT: s_bfe_u32 s13, s1, 0x4000c +; GFX9-DL-NEXT: s_bfe_u32 s14, s1, 0x40008 +; GFX9-DL-NEXT: s_bfe_u32 s15, s1, 0x40004 +; GFX9-DL-NEXT: s_lshr_b32 s9, s1, 28 ; GFX9-DL-NEXT: s_and_b32 s1, s1, 15 -; GFX9-DL-NEXT: s_bfe_u32 s4, s0, 0x40018 -; GFX9-DL-NEXT: s_bfe_u32 s5, s0, 0x40014 -; GFX9-DL-NEXT: s_bfe_u32 s6, s0, 0x40010 -; GFX9-DL-NEXT: s_bfe_u32 s7, s0, 0x4000c -; GFX9-DL-NEXT: s_bfe_u32 s8, s0, 0x40008 -; GFX9-DL-NEXT: s_bfe_u32 s9, s0, 0x40004 +; GFX9-DL-NEXT: s_bfe_u32 s3, s0, 0x40018 +; GFX9-DL-NEXT: s_bfe_u32 s4, s0, 0x40014 +; GFX9-DL-NEXT: s_bfe_u32 s5, s0, 0x40010 +; GFX9-DL-NEXT: s_bfe_u32 s6, s0, 0x4000c +; GFX9-DL-NEXT: s_bfe_u32 s7, s0, 0x40008 +; GFX9-DL-NEXT: s_bfe_u32 s8, s0, 0x40004 ; GFX9-DL-NEXT: s_and_b32 s0, s0, 15 ; GFX9-DL-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-DL-NEXT: v_mov_b32_e32 v4, s16 -; GFX9-DL-NEXT: v_mov_b32_e32 v5, s15 -; GFX9-DL-NEXT: v_mov_b32_e32 v6, s14 -; GFX9-DL-NEXT: v_mov_b32_e32 v7, s13 -; GFX9-DL-NEXT: v_mov_b32_e32 v8, s12 -; GFX9-DL-NEXT: v_mov_b32_e32 v9, s11 +; GFX9-DL-NEXT: v_mov_b32_e32 v4, s15 +; GFX9-DL-NEXT: v_mov_b32_e32 v5, s14 +; GFX9-DL-NEXT: v_mov_b32_e32 v6, s13 +; GFX9-DL-NEXT: v_mov_b32_e32 v7, s12 +; GFX9-DL-NEXT: v_mov_b32_e32 v8, s11 +; GFX9-DL-NEXT: v_mov_b32_e32 v9, s10 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) ; GFX9-DL-NEXT: v_mad_u32_u24 v2, s0, v3, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s9, v4, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s8, v4, v2 ; GFX9-DL-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s8, v5, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s7, v6, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s6, v7, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s5, v8, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s4, v9, v2 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s10 +; GFX9-DL-NEXT: v_mad_u32_u24 
v2, s7, v5, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s6, v6, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s5, v7, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s4, v8, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s3, v9, v2 +; GFX9-DL-NEXT: v_mov_b32_e32 v3, s9 ; GFX9-DL-NEXT: v_mad_u32_u24 v2, s2, v3, v2 ; GFX9-DL-NEXT: global_store_byte v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot8_acc8: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-DL-NEXT: global_load_ubyte v2, v[0:1], off ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_and_b32 s2, s0, 15 -; GFX10-DL-NEXT: s_and_b32 s4, s1, 15 -; GFX10-DL-NEXT: s_bfe_u32 s5, s0, 0x40004 -; GFX10-DL-NEXT: s_bfe_u32 s6, s1, 0x40004 +; GFX10-DL-NEXT: s_and_b32 s3, s1, 15 +; GFX10-DL-NEXT: s_bfe_u32 s4, s0, 0x40004 +; GFX10-DL-NEXT: s_bfe_u32 s5, s1, 0x40004 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s4, v2 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s3, v2 ; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40008 -; GFX10-DL-NEXT: s_bfe_u32 s4, s1, 0x40008 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s5, s6, v2 -; GFX10-DL-NEXT: s_bfe_u32 s5, s0, 0x4000c -; GFX10-DL-NEXT: s_bfe_u32 s6, s1, 0x4000c +; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x40008 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s4, s5, v2 +; GFX10-DL-NEXT: s_bfe_u32 s4, s0, 0x4000c +; GFX10-DL-NEXT: s_bfe_u32 s5, s1, 0x4000c ; GFX10-DL-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s4, v2 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s3, v2 ; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40010 -; GFX10-DL-NEXT: s_bfe_u32 s4, s1, 0x40010 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s5, s6, v2 -; GFX10-DL-NEXT: s_bfe_u32 s5, s0, 0x40014 -; GFX10-DL-NEXT: s_bfe_u32 s6, s1, 0x40014 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s4, v2 +; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x40010 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s4, s5, v2 +; GFX10-DL-NEXT: s_bfe_u32 s4, s0, 0x40014 +; GFX10-DL-NEXT: s_bfe_u32 s5, s1, 0x40014 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s3, v2 ; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40018 -; GFX10-DL-NEXT: s_bfe_u32 s4, s1, 0x40018 +; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x40018 ; GFX10-DL-NEXT: s_lshr_b32 s0, s0, 28 ; GFX10-DL-NEXT: s_lshr_b32 s1, s1, 28 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s5, s6, v2 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s4, v2 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s4, s5, v2 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s3, v2 ; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2 ; GFX10-DL-NEXT: global_store_byte v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm @@ -860,50 +860,50 @@ define amdgpu_kernel void @udot8_acc4(<8 x i4> addrspace(1)* %src1, ; GFX7-LABEL: udot8_acc4: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; GFX7-NEXT: s_mov_b32 s7, 0xf000 -; GFX7-NEXT: s_mov_b32 s6, -1 +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], 
s[0:1], 0xd +; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: buffer_load_ubyte v0, off, s[4:7], 0 -; GFX7-NEXT: s_load_dword s0, s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s1, s[10:11], 0x0 +; GFX7-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 +; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_lshr_b32 s2, s0, 28 -; GFX7-NEXT: s_bfe_u32 s15, s1, 0x40018 -; GFX7-NEXT: s_bfe_u32 s16, s1, 0x40014 -; GFX7-NEXT: s_bfe_u32 s17, s1, 0x40010 -; GFX7-NEXT: s_bfe_u32 s18, s1, 0x4000c -; GFX7-NEXT: s_bfe_u32 s19, s1, 0x40008 -; GFX7-NEXT: s_bfe_u32 s20, s1, 0x40004 -; GFX7-NEXT: s_lshr_b32 s14, s1, 28 -; GFX7-NEXT: s_and_b32 s1, s1, 15 -; GFX7-NEXT: s_bfe_u32 s8, s0, 0x40018 -; GFX7-NEXT: s_bfe_u32 s9, s0, 0x40014 -; GFX7-NEXT: s_bfe_u32 s10, s0, 0x40010 -; GFX7-NEXT: s_bfe_u32 s11, s0, 0x4000c -; GFX7-NEXT: s_bfe_u32 s12, s0, 0x40008 -; GFX7-NEXT: s_bfe_u32 s13, s0, 0x40004 -; GFX7-NEXT: s_and_b32 s0, s0, 15 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: v_mov_b32_e32 v2, s20 -; GFX7-NEXT: v_mov_b32_e32 v3, s19 -; GFX7-NEXT: v_mov_b32_e32 v4, s18 -; GFX7-NEXT: v_mov_b32_e32 v5, s17 -; GFX7-NEXT: v_mov_b32_e32 v6, s16 -; GFX7-NEXT: v_mov_b32_e32 v7, s15 +; GFX7-NEXT: s_lshr_b32 s6, s4, 28 +; GFX7-NEXT: s_bfe_u32 s14, s5, 0x40018 +; GFX7-NEXT: s_bfe_u32 s15, s5, 0x40014 +; GFX7-NEXT: s_bfe_u32 s16, s5, 0x40010 +; GFX7-NEXT: s_bfe_u32 s17, s5, 0x4000c +; GFX7-NEXT: s_bfe_u32 s18, s5, 0x40008 +; GFX7-NEXT: s_bfe_u32 s19, s5, 0x40004 +; GFX7-NEXT: s_lshr_b32 s13, s5, 28 +; GFX7-NEXT: s_and_b32 s5, s5, 15 +; GFX7-NEXT: s_bfe_u32 s7, s4, 0x40018 +; GFX7-NEXT: s_bfe_u32 s8, s4, 0x40014 +; GFX7-NEXT: s_bfe_u32 s9, s4, 0x40010 +; GFX7-NEXT: s_bfe_u32 s10, s4, 0x4000c +; GFX7-NEXT: s_bfe_u32 s11, s4, 0x40008 +; GFX7-NEXT: s_bfe_u32 s12, s4, 0x40004 +; GFX7-NEXT: s_and_b32 s4, s4, 15 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s19 +; GFX7-NEXT: v_mov_b32_e32 v3, s18 +; GFX7-NEXT: v_mov_b32_e32 v4, s17 +; GFX7-NEXT: v_mov_b32_e32 v5, s16 +; GFX7-NEXT: v_mov_b32_e32 v6, s15 +; GFX7-NEXT: v_mov_b32_e32 v7, s14 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mad_u32_u24 v0, s0, v1, v0 -; GFX7-NEXT: v_mad_u32_u24 v0, s13, v2, v0 -; GFX7-NEXT: v_mad_u32_u24 v0, s12, v3, v0 -; GFX7-NEXT: v_mad_u32_u24 v0, s11, v4, v0 -; GFX7-NEXT: v_mad_u32_u24 v0, s10, v5, v0 -; GFX7-NEXT: v_mad_u32_u24 v0, s9, v6, v0 -; GFX7-NEXT: v_mad_u32_u24 v0, s8, v7, v0 -; GFX7-NEXT: v_mov_b32_e32 v1, s14 -; GFX7-NEXT: v_mad_u32_u24 v0, s2, v1, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, s4, v1, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, s12, v2, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, s11, v3, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, s10, v4, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, s9, v5, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, s8, v6, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, s7, v7, v0 +; GFX7-NEXT: v_mov_b32_e32 v1, s13 +; GFX7-NEXT: v_mad_u32_u24 v0, s6, v1, v0 ; GFX7-NEXT: v_and_b32_e32 v0, 15, v0 -; GFX7-NEXT: buffer_store_byte v0, off, s[4:7], 0 +; GFX7-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: udot8_acc4: @@ -917,41 +917,41 @@ ; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX8-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_and_b32 s9, s0, 15 -; GFX8-NEXT: s_and_b32 s16, s1, 15 -; GFX8-NEXT: s_bfe_u32 s15, s1, 0x40004 -; GFX8-NEXT: v_mov_b32_e32 v4, s16 -; GFX8-NEXT: s_bfe_u32 s11, s1, 0x40018 -; GFX8-NEXT: s_bfe_u32 s12, s1, 0x40014 -; 
GFX8-NEXT: s_bfe_u32 s13, s1, 0x40010 -; GFX8-NEXT: s_bfe_u32 s14, s1, 0x40008 -; GFX8-NEXT: s_lshr_b32 s10, s1, 28 +; GFX8-NEXT: s_and_b32 s8, s0, 15 +; GFX8-NEXT: s_and_b32 s15, s1, 15 +; GFX8-NEXT: s_bfe_u32 s14, s1, 0x40004 +; GFX8-NEXT: v_mov_b32_e32 v4, s15 +; GFX8-NEXT: s_bfe_u32 s10, s1, 0x40018 +; GFX8-NEXT: s_bfe_u32 s11, s1, 0x40014 +; GFX8-NEXT: s_bfe_u32 s12, s1, 0x40010 +; GFX8-NEXT: s_bfe_u32 s13, s1, 0x40008 +; GFX8-NEXT: s_lshr_b32 s9, s1, 28 ; GFX8-NEXT: s_bfe_u32 s1, s1, 0x4000c -; GFX8-NEXT: s_bfe_u32 s8, s0, 0x40004 -; GFX8-NEXT: v_mov_b32_e32 v5, s15 +; GFX8-NEXT: s_bfe_u32 s7, s0, 0x40004 +; GFX8-NEXT: v_mov_b32_e32 v5, s14 ; GFX8-NEXT: s_lshr_b32 s2, s0, 28 -; GFX8-NEXT: s_bfe_u32 s4, s0, 0x40018 -; GFX8-NEXT: s_bfe_u32 s5, s0, 0x40014 -; GFX8-NEXT: s_bfe_u32 s6, s0, 0x40010 -; GFX8-NEXT: s_bfe_u32 s7, s0, 0x40008 +; GFX8-NEXT: s_bfe_u32 s3, s0, 0x40018 +; GFX8-NEXT: s_bfe_u32 s4, s0, 0x40014 +; GFX8-NEXT: s_bfe_u32 s5, s0, 0x40010 +; GFX8-NEXT: s_bfe_u32 s6, s0, 0x40008 ; GFX8-NEXT: s_bfe_u32 s0, s0, 0x4000c ; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_mov_b32_e32 v6, s14 +; GFX8-NEXT: v_mov_b32_e32 v6, s13 ; GFX8-NEXT: v_mul_u32_u24_e32 v3, s0, v3 ; GFX8-NEXT: v_and_b32_e32 v3, 15, v3 -; GFX8-NEXT: v_mov_b32_e32 v7, s13 -; GFX8-NEXT: v_mov_b32_e32 v8, s12 -; GFX8-NEXT: v_mov_b32_e32 v9, s11 +; GFX8-NEXT: v_mov_b32_e32 v7, s12 +; GFX8-NEXT: v_mov_b32_e32 v8, s11 +; GFX8-NEXT: v_mov_b32_e32 v9, s10 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mad_u32_u24 v2, s9, v4, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s8, v5, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s7, v6, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s8, v4, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s7, v5, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s6, v6, v2 ; GFX8-NEXT: v_and_b32_e32 v2, 15, v2 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s6, v7, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s5, v8, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s4, v9, v2 -; GFX8-NEXT: v_mov_b32_e32 v3, s10 +; GFX8-NEXT: v_mad_u32_u24 v2, s5, v7, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s4, v8, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s3, v9, v2 +; GFX8-NEXT: v_mov_b32_e32 v3, s9 ; GFX8-NEXT: v_mad_u32_u24 v2, s2, v3, v2 ; GFX8-NEXT: v_and_b32_e32 v2, 15, v2 ; GFX8-NEXT: flat_store_byte v[0:1], v2 @@ -968,41 +968,41 @@ ; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX9-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s9, s0, 15 -; GFX9-NEXT: s_and_b32 s16, s1, 15 -; GFX9-NEXT: s_bfe_u32 s15, s1, 0x40004 -; GFX9-NEXT: v_mov_b32_e32 v4, s16 -; GFX9-NEXT: s_bfe_u32 s11, s1, 0x40018 -; GFX9-NEXT: s_bfe_u32 s12, s1, 0x40014 -; GFX9-NEXT: s_bfe_u32 s13, s1, 0x40010 -; GFX9-NEXT: s_bfe_u32 s14, s1, 0x40008 -; GFX9-NEXT: s_lshr_b32 s10, s1, 28 +; GFX9-NEXT: s_and_b32 s8, s0, 15 +; GFX9-NEXT: s_and_b32 s15, s1, 15 +; GFX9-NEXT: s_bfe_u32 s14, s1, 0x40004 +; GFX9-NEXT: v_mov_b32_e32 v4, s15 +; GFX9-NEXT: s_bfe_u32 s10, s1, 0x40018 +; GFX9-NEXT: s_bfe_u32 s11, s1, 0x40014 +; GFX9-NEXT: s_bfe_u32 s12, s1, 0x40010 +; GFX9-NEXT: s_bfe_u32 s13, s1, 0x40008 +; GFX9-NEXT: s_lshr_b32 s9, s1, 28 ; GFX9-NEXT: s_bfe_u32 s1, s1, 0x4000c -; GFX9-NEXT: s_bfe_u32 s8, s0, 0x40004 -; GFX9-NEXT: v_mov_b32_e32 v5, s15 +; GFX9-NEXT: s_bfe_u32 s7, s0, 0x40004 +; GFX9-NEXT: v_mov_b32_e32 v5, s14 ; GFX9-NEXT: s_lshr_b32 s2, s0, 28 -; GFX9-NEXT: s_bfe_u32 s4, s0, 0x40018 -; GFX9-NEXT: s_bfe_u32 s5, s0, 0x40014 -; GFX9-NEXT: s_bfe_u32 s6, s0, 0x40010 -; GFX9-NEXT: s_bfe_u32 s7, s0, 0x40008 +; GFX9-NEXT: s_bfe_u32 s3, s0, 0x40018 +; GFX9-NEXT: 
s_bfe_u32 s4, s0, 0x40014 +; GFX9-NEXT: s_bfe_u32 s5, s0, 0x40010 +; GFX9-NEXT: s_bfe_u32 s6, s0, 0x40008 ; GFX9-NEXT: s_bfe_u32 s0, s0, 0x4000c ; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_mov_b32_e32 v6, s14 +; GFX9-NEXT: v_mov_b32_e32 v6, s13 ; GFX9-NEXT: v_mul_u32_u24_e32 v3, s0, v3 ; GFX9-NEXT: v_and_b32_e32 v3, 15, v3 -; GFX9-NEXT: v_mov_b32_e32 v7, s13 -; GFX9-NEXT: v_mov_b32_e32 v8, s12 -; GFX9-NEXT: v_mov_b32_e32 v9, s11 +; GFX9-NEXT: v_mov_b32_e32 v7, s12 +; GFX9-NEXT: v_mov_b32_e32 v8, s11 +; GFX9-NEXT: v_mov_b32_e32 v9, s10 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mad_u32_u24 v2, s9, v4, v2 -; GFX9-NEXT: v_mad_u32_u24 v2, s8, v5, v2 -; GFX9-NEXT: v_mad_u32_u24 v2, s7, v6, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s8, v4, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s7, v5, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s6, v6, v2 ; GFX9-NEXT: v_and_b32_e32 v2, 15, v2 ; GFX9-NEXT: v_add_u32_e32 v2, v2, v3 -; GFX9-NEXT: v_mad_u32_u24 v2, s6, v7, v2 -; GFX9-NEXT: v_mad_u32_u24 v2, s5, v8, v2 -; GFX9-NEXT: v_mad_u32_u24 v2, s4, v9, v2 -; GFX9-NEXT: v_mov_b32_e32 v3, s10 +; GFX9-NEXT: v_mad_u32_u24 v2, s5, v7, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s4, v8, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s3, v9, v2 +; GFX9-NEXT: v_mov_b32_e32 v3, s9 ; GFX9-NEXT: v_mad_u32_u24 v2, s2, v3, v2 ; GFX9-NEXT: v_and_b32_e32 v2, 15, v2 ; GFX9-NEXT: global_store_byte v[0:1], v2, off @@ -1019,41 +1019,41 @@ ; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_and_b32 s9, s0, 15 -; GFX9-DL-NEXT: s_and_b32 s16, s1, 15 -; GFX9-DL-NEXT: s_bfe_u32 s15, s1, 0x40004 -; GFX9-DL-NEXT: v_mov_b32_e32 v4, s16 -; GFX9-DL-NEXT: s_bfe_u32 s11, s1, 0x40018 -; GFX9-DL-NEXT: s_bfe_u32 s12, s1, 0x40014 -; GFX9-DL-NEXT: s_bfe_u32 s13, s1, 0x40010 -; GFX9-DL-NEXT: s_bfe_u32 s14, s1, 0x40008 -; GFX9-DL-NEXT: s_lshr_b32 s10, s1, 28 +; GFX9-DL-NEXT: s_and_b32 s8, s0, 15 +; GFX9-DL-NEXT: s_and_b32 s15, s1, 15 +; GFX9-DL-NEXT: s_bfe_u32 s14, s1, 0x40004 +; GFX9-DL-NEXT: v_mov_b32_e32 v4, s15 +; GFX9-DL-NEXT: s_bfe_u32 s10, s1, 0x40018 +; GFX9-DL-NEXT: s_bfe_u32 s11, s1, 0x40014 +; GFX9-DL-NEXT: s_bfe_u32 s12, s1, 0x40010 +; GFX9-DL-NEXT: s_bfe_u32 s13, s1, 0x40008 +; GFX9-DL-NEXT: s_lshr_b32 s9, s1, 28 ; GFX9-DL-NEXT: s_bfe_u32 s1, s1, 0x4000c -; GFX9-DL-NEXT: s_bfe_u32 s8, s0, 0x40004 -; GFX9-DL-NEXT: v_mov_b32_e32 v5, s15 +; GFX9-DL-NEXT: s_bfe_u32 s7, s0, 0x40004 +; GFX9-DL-NEXT: v_mov_b32_e32 v5, s14 ; GFX9-DL-NEXT: s_lshr_b32 s2, s0, 28 -; GFX9-DL-NEXT: s_bfe_u32 s4, s0, 0x40018 -; GFX9-DL-NEXT: s_bfe_u32 s5, s0, 0x40014 -; GFX9-DL-NEXT: s_bfe_u32 s6, s0, 0x40010 -; GFX9-DL-NEXT: s_bfe_u32 s7, s0, 0x40008 +; GFX9-DL-NEXT: s_bfe_u32 s3, s0, 0x40018 +; GFX9-DL-NEXT: s_bfe_u32 s4, s0, 0x40014 +; GFX9-DL-NEXT: s_bfe_u32 s5, s0, 0x40010 +; GFX9-DL-NEXT: s_bfe_u32 s6, s0, 0x40008 ; GFX9-DL-NEXT: s_bfe_u32 s0, s0, 0x4000c ; GFX9-DL-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-DL-NEXT: v_mov_b32_e32 v6, s14 +; GFX9-DL-NEXT: v_mov_b32_e32 v6, s13 ; GFX9-DL-NEXT: v_mul_u32_u24_e32 v3, s0, v3 ; GFX9-DL-NEXT: v_and_b32_e32 v3, 15, v3 -; GFX9-DL-NEXT: v_mov_b32_e32 v7, s13 -; GFX9-DL-NEXT: v_mov_b32_e32 v8, s12 -; GFX9-DL-NEXT: v_mov_b32_e32 v9, s11 +; GFX9-DL-NEXT: v_mov_b32_e32 v7, s12 +; GFX9-DL-NEXT: v_mov_b32_e32 v8, s11 +; GFX9-DL-NEXT: v_mov_b32_e32 v9, s10 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s9, v4, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s8, v5, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s7, v6, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s8, v4, v2 
+; GFX9-DL-NEXT: v_mad_u32_u24 v2, s7, v5, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s6, v6, v2 ; GFX9-DL-NEXT: v_and_b32_e32 v2, 15, v2 ; GFX9-DL-NEXT: v_add_u32_e32 v2, v2, v3 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s6, v7, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s5, v8, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s4, v9, v2 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s10 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s5, v7, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s4, v8, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s3, v9, v2 +; GFX9-DL-NEXT: v_mov_b32_e32 v3, s9 ; GFX9-DL-NEXT: v_mad_u32_u24 v2, s2, v3, v2 ; GFX9-DL-NEXT: v_and_b32_e32 v2, 15, v2 ; GFX9-DL-NEXT: global_store_byte v[0:1], v2, off @@ -1061,44 +1061,44 @@ ; ; GFX10-DL-LABEL: udot8_acc4: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-DL-NEXT: global_load_ubyte v2, v[0:1], off ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_and_b32 s2, s0, 15 -; GFX10-DL-NEXT: s_and_b32 s4, s1, 15 -; GFX10-DL-NEXT: s_bfe_u32 s5, s0, 0x40004 -; GFX10-DL-NEXT: s_bfe_u32 s6, s1, 0x40004 -; GFX10-DL-NEXT: s_bfe_u32 s7, s1, 0x40008 +; GFX10-DL-NEXT: s_and_b32 s3, s1, 15 +; GFX10-DL-NEXT: s_bfe_u32 s4, s0, 0x40004 +; GFX10-DL-NEXT: s_bfe_u32 s5, s1, 0x40004 +; GFX10-DL-NEXT: s_bfe_u32 s6, s1, 0x40008 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s4, v2 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s3, v2 ; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40008 -; GFX10-DL-NEXT: s_bfe_u32 s4, s0, 0x4000c -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s5, s6, v2 -; GFX10-DL-NEXT: s_bfe_u32 s5, s1, 0x4000c -; GFX10-DL-NEXT: s_bfe_u32 s6, s1, 0x40014 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s7, v2 -; GFX10-DL-NEXT: v_mul_u32_u24_e64 v3, s4, s5 +; GFX10-DL-NEXT: s_bfe_u32 s3, s0, 0x4000c +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s4, s5, v2 +; GFX10-DL-NEXT: s_bfe_u32 s4, s1, 0x4000c +; GFX10-DL-NEXT: s_bfe_u32 s5, s1, 0x40014 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s6, v2 +; GFX10-DL-NEXT: v_mul_u32_u24_e64 v3, s3, s4 ; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40010 -; GFX10-DL-NEXT: s_bfe_u32 s4, s1, 0x40010 -; GFX10-DL-NEXT: s_bfe_u32 s5, s0, 0x40014 +; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x40010 +; GFX10-DL-NEXT: s_bfe_u32 s4, s0, 0x40014 ; GFX10-DL-NEXT: v_and_b32_e32 v2, 15, v2 ; GFX10-DL-NEXT: v_and_b32_e32 v3, 15, v3 ; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v2, v3 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s4, v2 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s3, v2 ; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40018 -; GFX10-DL-NEXT: s_bfe_u32 s4, s1, 0x40018 +; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x40018 ; GFX10-DL-NEXT: s_lshr_b32 s0, s0, 28 ; GFX10-DL-NEXT: s_lshr_b32 s1, s1, 28 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s5, s6, v2 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s4, v2 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s4, s5, v2 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s3, v2 ; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2 ; GFX10-DL-NEXT: v_and_b32_e32 v2, 15, v2 ; 
GFX10-DL-NEXT: global_store_byte v[0:1], v2, off @@ -1160,50 +1160,50 @@ define amdgpu_kernel void @udot8_CommutationInsideMAD(<8 x i4> addrspace(1)* %src1, ; GFX7-LABEL: udot8_CommutationInsideMAD: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; GFX7-NEXT: s_mov_b32 s7, 0xf000 -; GFX7-NEXT: s_mov_b32 s6, -1 +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: buffer_load_ubyte v0, off, s[4:7], 0 -; GFX7-NEXT: s_load_dword s0, s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s1, s[10:11], 0x0 +; GFX7-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 +; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_lshr_b32 s2, s0, 28 -; GFX7-NEXT: s_bfe_u32 s15, s1, 0x40018 -; GFX7-NEXT: s_bfe_u32 s16, s1, 0x40014 -; GFX7-NEXT: s_bfe_u32 s17, s1, 0x40010 -; GFX7-NEXT: s_bfe_u32 s18, s1, 0x4000c -; GFX7-NEXT: s_bfe_u32 s19, s1, 0x40008 -; GFX7-NEXT: s_bfe_u32 s20, s1, 0x40004 -; GFX7-NEXT: s_lshr_b32 s14, s1, 28 -; GFX7-NEXT: s_and_b32 s1, s1, 15 -; GFX7-NEXT: s_bfe_u32 s8, s0, 0x40018 -; GFX7-NEXT: s_bfe_u32 s9, s0, 0x40014 -; GFX7-NEXT: s_bfe_u32 s10, s0, 0x40010 -; GFX7-NEXT: s_bfe_u32 s11, s0, 0x4000c -; GFX7-NEXT: s_bfe_u32 s12, s0, 0x40008 -; GFX7-NEXT: s_bfe_u32 s13, s0, 0x40004 -; GFX7-NEXT: s_and_b32 s0, s0, 15 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: v_mov_b32_e32 v2, s20 -; GFX7-NEXT: v_mov_b32_e32 v3, s19 -; GFX7-NEXT: v_mov_b32_e32 v4, s18 -; GFX7-NEXT: v_mov_b32_e32 v5, s17 -; GFX7-NEXT: v_mov_b32_e32 v6, s16 -; GFX7-NEXT: v_mov_b32_e32 v7, s15 +; GFX7-NEXT: s_lshr_b32 s6, s4, 28 +; GFX7-NEXT: s_bfe_u32 s14, s5, 0x40018 +; GFX7-NEXT: s_bfe_u32 s15, s5, 0x40014 +; GFX7-NEXT: s_bfe_u32 s16, s5, 0x40010 +; GFX7-NEXT: s_bfe_u32 s17, s5, 0x4000c +; GFX7-NEXT: s_bfe_u32 s18, s5, 0x40008 +; GFX7-NEXT: s_bfe_u32 s19, s5, 0x40004 +; GFX7-NEXT: s_lshr_b32 s13, s5, 28 +; GFX7-NEXT: s_and_b32 s5, s5, 15 +; GFX7-NEXT: s_bfe_u32 s7, s4, 0x40018 +; GFX7-NEXT: s_bfe_u32 s8, s4, 0x40014 +; GFX7-NEXT: s_bfe_u32 s9, s4, 0x40010 +; GFX7-NEXT: s_bfe_u32 s10, s4, 0x4000c +; GFX7-NEXT: s_bfe_u32 s11, s4, 0x40008 +; GFX7-NEXT: s_bfe_u32 s12, s4, 0x40004 +; GFX7-NEXT: s_and_b32 s4, s4, 15 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s19 +; GFX7-NEXT: v_mov_b32_e32 v3, s18 +; GFX7-NEXT: v_mov_b32_e32 v4, s17 +; GFX7-NEXT: v_mov_b32_e32 v5, s16 +; GFX7-NEXT: v_mov_b32_e32 v6, s15 +; GFX7-NEXT: v_mov_b32_e32 v7, s14 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mad_u32_u24 v0, s0, v1, v0 -; GFX7-NEXT: v_mad_u32_u24 v0, s13, v2, v0 -; GFX7-NEXT: v_mad_u32_u24 v0, s12, v3, v0 -; GFX7-NEXT: v_mad_u32_u24 v0, s11, v4, v0 -; GFX7-NEXT: v_mad_u32_u24 v0, s10, v5, v0 -; GFX7-NEXT: v_mad_u32_u24 v0, s9, v6, v0 -; GFX7-NEXT: v_mad_u32_u24 v0, s8, v7, v0 -; GFX7-NEXT: v_mov_b32_e32 v1, s14 -; GFX7-NEXT: v_mad_u32_u24 v0, s2, v1, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, s4, v1, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, s12, v2, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, s11, v3, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, s10, v4, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, s9, v5, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, s8, v6, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, s7, v7, v0 +; GFX7-NEXT: v_mov_b32_e32 v1, s13 +; GFX7-NEXT: v_mad_u32_u24 v0, s6, v1, v0 ; GFX7-NEXT: v_and_b32_e32 v0, 15, v0 -; GFX7-NEXT: buffer_store_byte v0, off, s[4:7], 0 
+; GFX7-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: udot8_CommutationInsideMAD: @@ -1217,41 +1217,41 @@ ; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX8-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_and_b32 s9, s0, 15 -; GFX8-NEXT: s_and_b32 s16, s1, 15 -; GFX8-NEXT: s_bfe_u32 s15, s1, 0x40004 -; GFX8-NEXT: v_mov_b32_e32 v4, s16 -; GFX8-NEXT: s_bfe_u32 s11, s1, 0x40018 -; GFX8-NEXT: s_bfe_u32 s12, s1, 0x40014 -; GFX8-NEXT: s_bfe_u32 s13, s1, 0x40010 -; GFX8-NEXT: s_bfe_u32 s14, s1, 0x40008 -; GFX8-NEXT: s_lshr_b32 s10, s1, 28 +; GFX8-NEXT: s_and_b32 s8, s0, 15 +; GFX8-NEXT: s_and_b32 s15, s1, 15 +; GFX8-NEXT: s_bfe_u32 s14, s1, 0x40004 +; GFX8-NEXT: v_mov_b32_e32 v4, s15 +; GFX8-NEXT: s_bfe_u32 s10, s1, 0x40018 +; GFX8-NEXT: s_bfe_u32 s11, s1, 0x40014 +; GFX8-NEXT: s_bfe_u32 s12, s1, 0x40010 +; GFX8-NEXT: s_bfe_u32 s13, s1, 0x40008 +; GFX8-NEXT: s_lshr_b32 s9, s1, 28 ; GFX8-NEXT: s_bfe_u32 s1, s1, 0x4000c -; GFX8-NEXT: s_bfe_u32 s8, s0, 0x40004 -; GFX8-NEXT: v_mov_b32_e32 v5, s15 +; GFX8-NEXT: s_bfe_u32 s7, s0, 0x40004 +; GFX8-NEXT: v_mov_b32_e32 v5, s14 ; GFX8-NEXT: s_lshr_b32 s2, s0, 28 -; GFX8-NEXT: s_bfe_u32 s4, s0, 0x40018 -; GFX8-NEXT: s_bfe_u32 s5, s0, 0x40014 -; GFX8-NEXT: s_bfe_u32 s6, s0, 0x40010 -; GFX8-NEXT: s_bfe_u32 s7, s0, 0x40008 +; GFX8-NEXT: s_bfe_u32 s3, s0, 0x40018 +; GFX8-NEXT: s_bfe_u32 s4, s0, 0x40014 +; GFX8-NEXT: s_bfe_u32 s5, s0, 0x40010 +; GFX8-NEXT: s_bfe_u32 s6, s0, 0x40008 ; GFX8-NEXT: s_bfe_u32 s0, s0, 0x4000c ; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_mov_b32_e32 v6, s14 +; GFX8-NEXT: v_mov_b32_e32 v6, s13 ; GFX8-NEXT: v_mul_u32_u24_e32 v3, s0, v3 ; GFX8-NEXT: v_and_b32_e32 v3, 15, v3 -; GFX8-NEXT: v_mov_b32_e32 v7, s13 -; GFX8-NEXT: v_mov_b32_e32 v8, s12 -; GFX8-NEXT: v_mov_b32_e32 v9, s11 +; GFX8-NEXT: v_mov_b32_e32 v7, s12 +; GFX8-NEXT: v_mov_b32_e32 v8, s11 +; GFX8-NEXT: v_mov_b32_e32 v9, s10 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mad_u32_u24 v2, s9, v4, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s8, v5, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s7, v6, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s8, v4, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s7, v5, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s6, v6, v2 ; GFX8-NEXT: v_and_b32_e32 v2, 15, v2 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v3 -; GFX8-NEXT: v_mad_u32_u24 v2, s6, v7, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s5, v8, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s4, v9, v2 -; GFX8-NEXT: v_mov_b32_e32 v3, s10 +; GFX8-NEXT: v_mad_u32_u24 v2, s5, v7, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s4, v8, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s3, v9, v2 +; GFX8-NEXT: v_mov_b32_e32 v3, s9 ; GFX8-NEXT: v_mad_u32_u24 v2, s2, v3, v2 ; GFX8-NEXT: v_and_b32_e32 v2, 15, v2 ; GFX8-NEXT: flat_store_byte v[0:1], v2 @@ -1268,41 +1268,41 @@ ; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX9-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s9, s0, 15 -; GFX9-NEXT: s_and_b32 s16, s1, 15 -; GFX9-NEXT: s_bfe_u32 s15, s1, 0x40004 -; GFX9-NEXT: v_mov_b32_e32 v4, s16 -; GFX9-NEXT: s_bfe_u32 s11, s1, 0x40018 -; GFX9-NEXT: s_bfe_u32 s12, s1, 0x40014 -; GFX9-NEXT: s_bfe_u32 s13, s1, 0x40010 -; GFX9-NEXT: s_bfe_u32 s14, s1, 0x40008 -; GFX9-NEXT: s_lshr_b32 s10, s1, 28 +; GFX9-NEXT: s_and_b32 s8, s0, 15 +; GFX9-NEXT: s_and_b32 s15, s1, 15 +; GFX9-NEXT: s_bfe_u32 s14, s1, 0x40004 +; GFX9-NEXT: v_mov_b32_e32 v4, s15 +; GFX9-NEXT: s_bfe_u32 s10, s1, 0x40018 +; GFX9-NEXT: s_bfe_u32 s11, s1, 0x40014 +; GFX9-NEXT: s_bfe_u32 s12, s1, 0x40010 +; GFX9-NEXT: s_bfe_u32 s13, 
s1, 0x40008 +; GFX9-NEXT: s_lshr_b32 s9, s1, 28 ; GFX9-NEXT: s_bfe_u32 s1, s1, 0x4000c -; GFX9-NEXT: s_bfe_u32 s8, s0, 0x40004 -; GFX9-NEXT: v_mov_b32_e32 v5, s15 +; GFX9-NEXT: s_bfe_u32 s7, s0, 0x40004 +; GFX9-NEXT: v_mov_b32_e32 v5, s14 ; GFX9-NEXT: s_lshr_b32 s2, s0, 28 -; GFX9-NEXT: s_bfe_u32 s4, s0, 0x40018 -; GFX9-NEXT: s_bfe_u32 s5, s0, 0x40014 -; GFX9-NEXT: s_bfe_u32 s6, s0, 0x40010 -; GFX9-NEXT: s_bfe_u32 s7, s0, 0x40008 +; GFX9-NEXT: s_bfe_u32 s3, s0, 0x40018 +; GFX9-NEXT: s_bfe_u32 s4, s0, 0x40014 +; GFX9-NEXT: s_bfe_u32 s5, s0, 0x40010 +; GFX9-NEXT: s_bfe_u32 s6, s0, 0x40008 ; GFX9-NEXT: s_bfe_u32 s0, s0, 0x4000c ; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_mov_b32_e32 v6, s14 +; GFX9-NEXT: v_mov_b32_e32 v6, s13 ; GFX9-NEXT: v_mul_u32_u24_e32 v3, s0, v3 ; GFX9-NEXT: v_and_b32_e32 v3, 15, v3 -; GFX9-NEXT: v_mov_b32_e32 v7, s13 -; GFX9-NEXT: v_mov_b32_e32 v8, s12 -; GFX9-NEXT: v_mov_b32_e32 v9, s11 +; GFX9-NEXT: v_mov_b32_e32 v7, s12 +; GFX9-NEXT: v_mov_b32_e32 v8, s11 +; GFX9-NEXT: v_mov_b32_e32 v9, s10 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mad_u32_u24 v2, s9, v4, v2 -; GFX9-NEXT: v_mad_u32_u24 v2, s8, v5, v2 -; GFX9-NEXT: v_mad_u32_u24 v2, s7, v6, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s8, v4, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s7, v5, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s6, v6, v2 ; GFX9-NEXT: v_and_b32_e32 v2, 15, v2 ; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 -; GFX9-NEXT: v_mad_u32_u24 v2, s6, v7, v2 -; GFX9-NEXT: v_mad_u32_u24 v2, s5, v8, v2 -; GFX9-NEXT: v_mad_u32_u24 v2, s4, v9, v2 -; GFX9-NEXT: v_mov_b32_e32 v3, s10 +; GFX9-NEXT: v_mad_u32_u24 v2, s5, v7, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s4, v8, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s3, v9, v2 +; GFX9-NEXT: v_mov_b32_e32 v3, s9 ; GFX9-NEXT: v_mad_u32_u24 v2, s2, v3, v2 ; GFX9-NEXT: v_and_b32_e32 v2, 15, v2 ; GFX9-NEXT: global_store_byte v[0:1], v2, off @@ -1319,41 +1319,41 @@ ; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_and_b32 s9, s0, 15 -; GFX9-DL-NEXT: s_and_b32 s16, s1, 15 -; GFX9-DL-NEXT: s_bfe_u32 s15, s1, 0x40004 -; GFX9-DL-NEXT: v_mov_b32_e32 v4, s16 -; GFX9-DL-NEXT: s_bfe_u32 s11, s1, 0x40018 -; GFX9-DL-NEXT: s_bfe_u32 s12, s1, 0x40014 -; GFX9-DL-NEXT: s_bfe_u32 s13, s1, 0x40010 -; GFX9-DL-NEXT: s_bfe_u32 s14, s1, 0x40008 -; GFX9-DL-NEXT: s_lshr_b32 s10, s1, 28 +; GFX9-DL-NEXT: s_and_b32 s8, s0, 15 +; GFX9-DL-NEXT: s_and_b32 s15, s1, 15 +; GFX9-DL-NEXT: s_bfe_u32 s14, s1, 0x40004 +; GFX9-DL-NEXT: v_mov_b32_e32 v4, s15 +; GFX9-DL-NEXT: s_bfe_u32 s10, s1, 0x40018 +; GFX9-DL-NEXT: s_bfe_u32 s11, s1, 0x40014 +; GFX9-DL-NEXT: s_bfe_u32 s12, s1, 0x40010 +; GFX9-DL-NEXT: s_bfe_u32 s13, s1, 0x40008 +; GFX9-DL-NEXT: s_lshr_b32 s9, s1, 28 ; GFX9-DL-NEXT: s_bfe_u32 s1, s1, 0x4000c -; GFX9-DL-NEXT: s_bfe_u32 s8, s0, 0x40004 -; GFX9-DL-NEXT: v_mov_b32_e32 v5, s15 +; GFX9-DL-NEXT: s_bfe_u32 s7, s0, 0x40004 +; GFX9-DL-NEXT: v_mov_b32_e32 v5, s14 ; GFX9-DL-NEXT: s_lshr_b32 s2, s0, 28 -; GFX9-DL-NEXT: s_bfe_u32 s4, s0, 0x40018 -; GFX9-DL-NEXT: s_bfe_u32 s5, s0, 0x40014 -; GFX9-DL-NEXT: s_bfe_u32 s6, s0, 0x40010 -; GFX9-DL-NEXT: s_bfe_u32 s7, s0, 0x40008 +; GFX9-DL-NEXT: s_bfe_u32 s3, s0, 0x40018 +; GFX9-DL-NEXT: s_bfe_u32 s4, s0, 0x40014 +; GFX9-DL-NEXT: s_bfe_u32 s5, s0, 0x40010 +; GFX9-DL-NEXT: s_bfe_u32 s6, s0, 0x40008 ; GFX9-DL-NEXT: s_bfe_u32 s0, s0, 0x4000c ; GFX9-DL-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-DL-NEXT: v_mov_b32_e32 v6, s14 +; GFX9-DL-NEXT: v_mov_b32_e32 v6, s13 ; GFX9-DL-NEXT: v_mul_u32_u24_e32 v3, s0, v3 ; 
GFX9-DL-NEXT: v_and_b32_e32 v3, 15, v3 -; GFX9-DL-NEXT: v_mov_b32_e32 v7, s13 -; GFX9-DL-NEXT: v_mov_b32_e32 v8, s12 -; GFX9-DL-NEXT: v_mov_b32_e32 v9, s11 +; GFX9-DL-NEXT: v_mov_b32_e32 v7, s12 +; GFX9-DL-NEXT: v_mov_b32_e32 v8, s11 +; GFX9-DL-NEXT: v_mov_b32_e32 v9, s10 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s9, v4, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s8, v5, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s7, v6, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s8, v4, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s7, v5, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s6, v6, v2 ; GFX9-DL-NEXT: v_and_b32_e32 v2, 15, v2 ; GFX9-DL-NEXT: v_add_u32_e32 v2, v3, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s6, v7, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s5, v8, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s4, v9, v2 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s10 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s5, v7, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s4, v8, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s3, v9, v2 +; GFX9-DL-NEXT: v_mov_b32_e32 v3, s9 ; GFX9-DL-NEXT: v_mad_u32_u24 v2, s2, v3, v2 ; GFX9-DL-NEXT: v_and_b32_e32 v2, 15, v2 ; GFX9-DL-NEXT: global_store_byte v[0:1], v2, off @@ -1361,44 +1361,44 @@ ; ; GFX10-DL-LABEL: udot8_CommutationInsideMAD: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-DL-NEXT: global_load_ubyte v2, v[0:1], off ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_and_b32 s2, s0, 15 -; GFX10-DL-NEXT: s_and_b32 s4, s1, 15 -; GFX10-DL-NEXT: s_bfe_u32 s5, s0, 0x40004 -; GFX10-DL-NEXT: s_bfe_u32 s6, s1, 0x40004 -; GFX10-DL-NEXT: s_bfe_u32 s7, s1, 0x40008 -; GFX10-DL-NEXT: s_bfe_u32 s8, s1, 0x4000c +; GFX10-DL-NEXT: s_and_b32 s3, s1, 15 +; GFX10-DL-NEXT: s_bfe_u32 s4, s0, 0x40004 +; GFX10-DL-NEXT: s_bfe_u32 s5, s1, 0x40004 +; GFX10-DL-NEXT: s_bfe_u32 s6, s1, 0x40008 +; GFX10-DL-NEXT: s_bfe_u32 s7, s1, 0x4000c ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s4, v2 -; GFX10-DL-NEXT: s_bfe_u32 s4, s0, 0x4000c +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s3, v2 +; GFX10-DL-NEXT: s_bfe_u32 s3, s0, 0x4000c ; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40008 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s5, s6, v2 -; GFX10-DL-NEXT: v_mul_u32_u24_e64 v3, s4, s8 -; GFX10-DL-NEXT: s_bfe_u32 s4, s1, 0x40010 -; GFX10-DL-NEXT: s_bfe_u32 s5, s0, 0x40014 -; GFX10-DL-NEXT: s_bfe_u32 s6, s1, 0x40014 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s7, v2 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s4, s5, v2 +; GFX10-DL-NEXT: v_mul_u32_u24_e64 v3, s3, s7 +; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x40010 +; GFX10-DL-NEXT: s_bfe_u32 s4, s0, 0x40014 +; GFX10-DL-NEXT: s_bfe_u32 s5, s1, 0x40014 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s6, v2 ; GFX10-DL-NEXT: v_and_b32_e32 v3, 15, v3 ; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40010 ; GFX10-DL-NEXT: v_and_b32_e32 v2, 15, v2 ; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v3, v2 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s4, v2 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, 
s2, s3, v2 ; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40018 -; GFX10-DL-NEXT: s_bfe_u32 s4, s1, 0x40018 +; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x40018 ; GFX10-DL-NEXT: s_lshr_b32 s0, s0, 28 ; GFX10-DL-NEXT: s_lshr_b32 s1, s1, 28 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s5, s6, v2 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s4, v2 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s4, s5, v2 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s3, v2 ; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2 ; GFX10-DL-NEXT: v_and_b32_e32 v2, 15, v2 ; GFX10-DL-NEXT: global_store_byte v[0:1], v2, off @@ -1458,51 +1458,51 @@ define amdgpu_kernel void @udot8_multiuses_mul1(<8 x i4> addrspace(1)* %src1, ; GFX7-LABEL: udot8_multiuses_mul1: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; GFX7-NEXT: s_mov_b32 s7, 0xf000 -; GFX7-NEXT: s_mov_b32 s6, -1 +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_load_dword s10, s[10:11], 0x0 -; GFX7-NEXT: s_load_dword s0, s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s21, s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s6, s[6:7], 0x0 +; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s20, s[0:1], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_bfe_u32 s20, s10, 0x40004 -; GFX7-NEXT: s_lshr_b32 s11, s10, 28 -; GFX7-NEXT: s_bfe_u32 s15, s10, 0x40018 -; GFX7-NEXT: s_bfe_u32 s16, s10, 0x40014 -; GFX7-NEXT: s_bfe_u32 s17, s10, 0x40010 -; GFX7-NEXT: s_bfe_u32 s18, s10, 0x4000c -; GFX7-NEXT: s_bfe_u32 s19, s10, 0x40008 -; GFX7-NEXT: s_and_b32 s10, s10, 15 -; GFX7-NEXT: s_lshr_b32 s1, s0, 28 -; GFX7-NEXT: s_bfe_u32 s2, s0, 0x40018 -; GFX7-NEXT: s_bfe_u32 s8, s0, 0x40014 -; GFX7-NEXT: s_bfe_u32 s9, s0, 0x40010 -; GFX7-NEXT: s_bfe_u32 s12, s0, 0x4000c -; GFX7-NEXT: s_bfe_u32 s13, s0, 0x40008 -; GFX7-NEXT: s_bfe_u32 s14, s0, 0x40004 -; GFX7-NEXT: s_and_b32 s0, s0, 15 -; GFX7-NEXT: v_mov_b32_e32 v0, s10 -; GFX7-NEXT: v_mov_b32_e32 v1, s21 -; GFX7-NEXT: v_mad_u32_u24 v1, s0, v0, v1 -; GFX7-NEXT: v_mov_b32_e32 v2, s20 -; GFX7-NEXT: v_mad_u32_u24 v0, s0, v0, v1 -; GFX7-NEXT: v_mad_u32_u24 v1, s14, v2, v1 +; GFX7-NEXT: s_bfe_u32 s19, s6, 0x40004 +; GFX7-NEXT: s_lshr_b32 s7, s6, 28 +; GFX7-NEXT: s_bfe_u32 s14, s6, 0x40018 +; GFX7-NEXT: s_bfe_u32 s15, s6, 0x40014 +; GFX7-NEXT: s_bfe_u32 s16, s6, 0x40010 +; GFX7-NEXT: s_bfe_u32 s17, s6, 0x4000c +; GFX7-NEXT: s_bfe_u32 s18, s6, 0x40008 +; GFX7-NEXT: s_and_b32 s6, s6, 15 +; GFX7-NEXT: s_lshr_b32 s5, s4, 28 +; GFX7-NEXT: s_bfe_u32 s8, s4, 0x40018 +; GFX7-NEXT: s_bfe_u32 s9, s4, 0x40014 +; GFX7-NEXT: s_bfe_u32 s10, s4, 0x40010 +; GFX7-NEXT: s_bfe_u32 s11, s4, 0x4000c +; GFX7-NEXT: s_bfe_u32 s12, s4, 0x40008 +; GFX7-NEXT: s_bfe_u32 s13, s4, 0x40004 +; GFX7-NEXT: s_and_b32 s4, s4, 15 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s20 +; GFX7-NEXT: v_mad_u32_u24 v1, s4, v0, v1 ; GFX7-NEXT: v_mov_b32_e32 v2, s19 +; GFX7-NEXT: v_mad_u32_u24 v0, s4, v0, v1 ; GFX7-NEXT: v_mad_u32_u24 v1, s13, v2, v1 ; GFX7-NEXT: v_mov_b32_e32 v2, s18 ; GFX7-NEXT: v_mad_u32_u24 v1, s12, v2, v1 ; GFX7-NEXT: v_mov_b32_e32 v2, s17 -; GFX7-NEXT: v_mad_u32_u24 v1, s9, v2, v1 +; GFX7-NEXT: v_mad_u32_u24 v1, s11, v2, v1 ; GFX7-NEXT: v_mov_b32_e32 v2, s16 -; GFX7-NEXT: v_mad_u32_u24 v1, s8, v2, v1 +; GFX7-NEXT: v_mad_u32_u24 v1, s10, v2, v1 ; GFX7-NEXT: v_mov_b32_e32 v2, s15 -; GFX7-NEXT: v_mad_u32_u24 v1, s2, v2, v1 -; GFX7-NEXT: v_mov_b32_e32 v2, s11 
-; GFX7-NEXT: v_mad_u32_u24 v1, s1, v2, v1 +; GFX7-NEXT: v_mad_u32_u24 v1, s9, v2, v1 +; GFX7-NEXT: v_mov_b32_e32 v2, s14 +; GFX7-NEXT: v_mad_u32_u24 v1, s8, v2, v1 +; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: v_mad_u32_u24 v1, s5, v2, v1 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, v1, v0 -; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: udot8_multiuses_mul1: @@ -1512,31 +1512,29 @@ ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_load_dword s6, s[6:7], 0x0 ; GFX8-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX8-NEXT: s_load_dword s19, s[0:1], 0x0 +; GFX8-NEXT: s_load_dword s18, s[0:1], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_bfe_u32 s18, s6, 0x40004 +; GFX8-NEXT: s_bfe_u32 s17, s6, 0x40004 ; GFX8-NEXT: s_lshr_b32 s7, s6, 28 -; GFX8-NEXT: s_bfe_u32 s13, s6, 0x40018 -; GFX8-NEXT: s_bfe_u32 s14, s6, 0x40014 -; GFX8-NEXT: s_bfe_u32 s15, s6, 0x40010 -; GFX8-NEXT: s_bfe_u32 s16, s6, 0x4000c -; GFX8-NEXT: s_bfe_u32 s17, s6, 0x40008 +; GFX8-NEXT: s_bfe_u32 s12, s6, 0x40018 +; GFX8-NEXT: s_bfe_u32 s13, s6, 0x40014 +; GFX8-NEXT: s_bfe_u32 s14, s6, 0x40010 +; GFX8-NEXT: s_bfe_u32 s15, s6, 0x4000c +; GFX8-NEXT: s_bfe_u32 s16, s6, 0x40008 ; GFX8-NEXT: s_and_b32 s6, s6, 15 -; GFX8-NEXT: s_lshr_b32 s4, s2, 28 -; GFX8-NEXT: s_bfe_u32 s5, s2, 0x40018 -; GFX8-NEXT: s_bfe_u32 s8, s2, 0x40014 -; GFX8-NEXT: s_bfe_u32 s9, s2, 0x40010 -; GFX8-NEXT: s_bfe_u32 s10, s2, 0x4000c -; GFX8-NEXT: s_bfe_u32 s11, s2, 0x40008 -; GFX8-NEXT: s_bfe_u32 s12, s2, 0x40004 +; GFX8-NEXT: s_lshr_b32 s3, s2, 28 +; GFX8-NEXT: s_bfe_u32 s4, s2, 0x40018 +; GFX8-NEXT: s_bfe_u32 s5, s2, 0x40014 +; GFX8-NEXT: s_bfe_u32 s8, s2, 0x40010 +; GFX8-NEXT: s_bfe_u32 s9, s2, 0x4000c +; GFX8-NEXT: s_bfe_u32 s10, s2, 0x40008 +; GFX8-NEXT: s_bfe_u32 s11, s2, 0x40004 ; GFX8-NEXT: s_and_b32 s2, s2, 15 ; GFX8-NEXT: v_mov_b32_e32 v0, s6 -; GFX8-NEXT: v_mov_b32_e32 v1, s19 +; GFX8-NEXT: v_mov_b32_e32 v1, s18 ; GFX8-NEXT: v_mad_u32_u24 v1, s2, v0, v1 -; GFX8-NEXT: v_mov_b32_e32 v2, s18 -; GFX8-NEXT: v_mad_u32_u24 v0, s2, v0, v1 -; GFX8-NEXT: v_mad_u32_u24 v1, s12, v2, v1 ; GFX8-NEXT: v_mov_b32_e32 v2, s17 +; GFX8-NEXT: v_mad_u32_u24 v0, s2, v0, v1 ; GFX8-NEXT: v_mad_u32_u24 v1, s11, v2, v1 ; GFX8-NEXT: v_mov_b32_e32 v2, s16 ; GFX8-NEXT: v_mad_u32_u24 v1, s10, v2, v1 @@ -1546,8 +1544,10 @@ ; GFX8-NEXT: v_mad_u32_u24 v1, s8, v2, v1 ; GFX8-NEXT: v_mov_b32_e32 v2, s13 ; GFX8-NEXT: v_mad_u32_u24 v1, s5, v2, v1 -; GFX8-NEXT: v_mov_b32_e32 v2, s7 +; GFX8-NEXT: v_mov_b32_e32 v2, s12 ; GFX8-NEXT: v_mad_u32_u24 v1, s4, v2, v1 +; GFX8-NEXT: v_mov_b32_e32 v2, s7 +; GFX8-NEXT: v_mad_u32_u24 v1, s3, v2, v1 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v1, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 @@ -1561,31 +1561,29 @@ ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dword s6, s[6:7], 0x0 ; GFX9-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s19, s[0:1], 0x0 +; GFX9-NEXT: s_load_dword s18, s[0:1], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_bfe_u32 s18, s6, 0x40004 +; GFX9-NEXT: s_bfe_u32 s17, s6, 0x40004 ; GFX9-NEXT: s_lshr_b32 s7, s6, 28 -; GFX9-NEXT: s_bfe_u32 s13, s6, 0x40018 -; GFX9-NEXT: s_bfe_u32 s14, s6, 0x40014 -; GFX9-NEXT: s_bfe_u32 s15, s6, 0x40010 -; GFX9-NEXT: s_bfe_u32 s16, s6, 0x4000c -; GFX9-NEXT: s_bfe_u32 s17, s6, 0x40008 +; GFX9-NEXT: s_bfe_u32 s12, s6, 0x40018 +; GFX9-NEXT: s_bfe_u32 s13, s6, 0x40014 +; GFX9-NEXT: s_bfe_u32 s14, s6, 0x40010 +; GFX9-NEXT: s_bfe_u32 s15, s6, 0x4000c +; GFX9-NEXT: s_bfe_u32 
s16, s6, 0x40008 ; GFX9-NEXT: s_and_b32 s6, s6, 15 -; GFX9-NEXT: s_lshr_b32 s4, s2, 28 -; GFX9-NEXT: s_bfe_u32 s5, s2, 0x40018 -; GFX9-NEXT: s_bfe_u32 s8, s2, 0x40014 -; GFX9-NEXT: s_bfe_u32 s9, s2, 0x40010 -; GFX9-NEXT: s_bfe_u32 s10, s2, 0x4000c -; GFX9-NEXT: s_bfe_u32 s11, s2, 0x40008 -; GFX9-NEXT: s_bfe_u32 s12, s2, 0x40004 +; GFX9-NEXT: s_lshr_b32 s3, s2, 28 +; GFX9-NEXT: s_bfe_u32 s4, s2, 0x40018 +; GFX9-NEXT: s_bfe_u32 s5, s2, 0x40014 +; GFX9-NEXT: s_bfe_u32 s8, s2, 0x40010 +; GFX9-NEXT: s_bfe_u32 s9, s2, 0x4000c +; GFX9-NEXT: s_bfe_u32 s10, s2, 0x40008 +; GFX9-NEXT: s_bfe_u32 s11, s2, 0x40004 ; GFX9-NEXT: s_and_b32 s2, s2, 15 ; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s19 +; GFX9-NEXT: v_mov_b32_e32 v1, s18 ; GFX9-NEXT: v_mad_u32_u24 v1, s2, v0, v1 -; GFX9-NEXT: v_mov_b32_e32 v2, s18 -; GFX9-NEXT: v_mad_u32_u24 v0, s2, v0, v1 -; GFX9-NEXT: v_mad_u32_u24 v1, s12, v2, v1 ; GFX9-NEXT: v_mov_b32_e32 v2, s17 +; GFX9-NEXT: v_mad_u32_u24 v0, s2, v0, v1 ; GFX9-NEXT: v_mad_u32_u24 v1, s11, v2, v1 ; GFX9-NEXT: v_mov_b32_e32 v2, s16 ; GFX9-NEXT: v_mad_u32_u24 v1, s10, v2, v1 @@ -1595,8 +1593,10 @@ ; GFX9-NEXT: v_mad_u32_u24 v1, s8, v2, v1 ; GFX9-NEXT: v_mov_b32_e32 v2, s13 ; GFX9-NEXT: v_mad_u32_u24 v1, s5, v2, v1 -; GFX9-NEXT: v_mov_b32_e32 v2, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s12 ; GFX9-NEXT: v_mad_u32_u24 v1, s4, v2, v1 +; GFX9-NEXT: v_mov_b32_e32 v2, s7 +; GFX9-NEXT: v_mad_u32_u24 v1, s3, v2, v1 ; GFX9-NEXT: v_add_u32_e32 v2, v0, v1 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 @@ -1610,31 +1610,29 @@ ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: s_load_dword s6, s[6:7], 0x0 ; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s19, s[0:1], 0x0 +; GFX9-DL-NEXT: s_load_dword s18, s[0:1], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_bfe_u32 s18, s6, 0x40004 +; GFX9-DL-NEXT: s_bfe_u32 s17, s6, 0x40004 ; GFX9-DL-NEXT: s_lshr_b32 s7, s6, 28 -; GFX9-DL-NEXT: s_bfe_u32 s13, s6, 0x40018 -; GFX9-DL-NEXT: s_bfe_u32 s14, s6, 0x40014 -; GFX9-DL-NEXT: s_bfe_u32 s15, s6, 0x40010 -; GFX9-DL-NEXT: s_bfe_u32 s16, s6, 0x4000c -; GFX9-DL-NEXT: s_bfe_u32 s17, s6, 0x40008 +; GFX9-DL-NEXT: s_bfe_u32 s12, s6, 0x40018 +; GFX9-DL-NEXT: s_bfe_u32 s13, s6, 0x40014 +; GFX9-DL-NEXT: s_bfe_u32 s14, s6, 0x40010 +; GFX9-DL-NEXT: s_bfe_u32 s15, s6, 0x4000c +; GFX9-DL-NEXT: s_bfe_u32 s16, s6, 0x40008 ; GFX9-DL-NEXT: s_and_b32 s6, s6, 15 -; GFX9-DL-NEXT: s_lshr_b32 s4, s2, 28 -; GFX9-DL-NEXT: s_bfe_u32 s5, s2, 0x40018 -; GFX9-DL-NEXT: s_bfe_u32 s8, s2, 0x40014 -; GFX9-DL-NEXT: s_bfe_u32 s9, s2, 0x40010 -; GFX9-DL-NEXT: s_bfe_u32 s10, s2, 0x4000c -; GFX9-DL-NEXT: s_bfe_u32 s11, s2, 0x40008 -; GFX9-DL-NEXT: s_bfe_u32 s12, s2, 0x40004 +; GFX9-DL-NEXT: s_lshr_b32 s3, s2, 28 +; GFX9-DL-NEXT: s_bfe_u32 s4, s2, 0x40018 +; GFX9-DL-NEXT: s_bfe_u32 s5, s2, 0x40014 +; GFX9-DL-NEXT: s_bfe_u32 s8, s2, 0x40010 +; GFX9-DL-NEXT: s_bfe_u32 s9, s2, 0x4000c +; GFX9-DL-NEXT: s_bfe_u32 s10, s2, 0x40008 +; GFX9-DL-NEXT: s_bfe_u32 s11, s2, 0x40004 ; GFX9-DL-NEXT: s_and_b32 s2, s2, 15 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s19 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s18 ; GFX9-DL-NEXT: v_mad_u32_u24 v1, s2, v0, v1 -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s18 -; GFX9-DL-NEXT: v_mad_u32_u24 v0, s2, v0, v1 -; GFX9-DL-NEXT: v_mad_u32_u24 v1, s12, v2, v1 ; GFX9-DL-NEXT: v_mov_b32_e32 v2, s17 +; GFX9-DL-NEXT: v_mad_u32_u24 v0, s2, v0, v1 ; GFX9-DL-NEXT: v_mad_u32_u24 v1, s11, v2, v1 ; GFX9-DL-NEXT: v_mov_b32_e32 v2, s16 ; GFX9-DL-NEXT: 
v_mad_u32_u24 v1, s10, v2, v1 @@ -1644,8 +1642,10 @@ ; GFX9-DL-NEXT: v_mad_u32_u24 v1, s8, v2, v1 ; GFX9-DL-NEXT: v_mov_b32_e32 v2, s13 ; GFX9-DL-NEXT: v_mad_u32_u24 v1, s5, v2, v1 -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s7 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s12 ; GFX9-DL-NEXT: v_mad_u32_u24 v1, s4, v2, v1 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s7 +; GFX9-DL-NEXT: v_mad_u32_u24 v1, s3, v2, v1 ; GFX9-DL-NEXT: v_add_u32_e32 v2, v0, v1 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 @@ -1659,35 +1659,35 @@ ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-DL-NEXT: s_load_dword s5, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dword s3, s[6:7], 0x0 +; GFX10-DL-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_and_b32 s6, s2, 15 -; GFX10-DL-NEXT: s_and_b32 s7, s4, 15 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s5 -; GFX10-DL-NEXT: s_bfe_u32 s5, s2, 0x40004 -; GFX10-DL-NEXT: s_bfe_u32 s8, s4, 0x40004 -; GFX10-DL-NEXT: s_bfe_u32 s9, s2, 0x40008 -; GFX10-DL-NEXT: s_bfe_u32 s10, s4, 0x40008 -; GFX10-DL-NEXT: v_mad_u32_u24 v0, s6, s7, v0 -; GFX10-DL-NEXT: v_mad_u32_u24 v1, s5, s8, v0 -; GFX10-DL-NEXT: s_bfe_u32 s5, s2, 0x4000c -; GFX10-DL-NEXT: s_bfe_u32 s8, s4, 0x4000c -; GFX10-DL-NEXT: v_mad_u32_u24 v0, s6, s7, v0 -; GFX10-DL-NEXT: v_mad_u32_u24 v1, s9, s10, v1 -; GFX10-DL-NEXT: s_bfe_u32 s9, s2, 0x40010 -; GFX10-DL-NEXT: s_bfe_u32 s10, s4, 0x40010 -; GFX10-DL-NEXT: v_mad_u32_u24 v1, s5, s8, v1 -; GFX10-DL-NEXT: s_bfe_u32 s5, s2, 0x40014 -; GFX10-DL-NEXT: s_bfe_u32 s8, s4, 0x40014 -; GFX10-DL-NEXT: v_mad_u32_u24 v1, s9, s10, v1 -; GFX10-DL-NEXT: s_bfe_u32 s9, s2, 0x40018 -; GFX10-DL-NEXT: s_bfe_u32 s10, s4, 0x40018 +; GFX10-DL-NEXT: s_and_b32 s5, s2, 15 +; GFX10-DL-NEXT: s_and_b32 s6, s3, 15 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-DL-NEXT: s_bfe_u32 s4, s2, 0x40004 +; GFX10-DL-NEXT: s_bfe_u32 s7, s3, 0x40004 +; GFX10-DL-NEXT: s_bfe_u32 s8, s2, 0x40008 +; GFX10-DL-NEXT: s_bfe_u32 s9, s3, 0x40008 +; GFX10-DL-NEXT: v_mad_u32_u24 v0, s5, s6, v0 +; GFX10-DL-NEXT: v_mad_u32_u24 v1, s4, s7, v0 +; GFX10-DL-NEXT: s_bfe_u32 s4, s2, 0x4000c +; GFX10-DL-NEXT: s_bfe_u32 s7, s3, 0x4000c +; GFX10-DL-NEXT: v_mad_u32_u24 v0, s5, s6, v0 +; GFX10-DL-NEXT: v_mad_u32_u24 v1, s8, s9, v1 +; GFX10-DL-NEXT: s_bfe_u32 s8, s2, 0x40010 +; GFX10-DL-NEXT: s_bfe_u32 s9, s3, 0x40010 +; GFX10-DL-NEXT: v_mad_u32_u24 v1, s4, s7, v1 +; GFX10-DL-NEXT: s_bfe_u32 s4, s2, 0x40014 +; GFX10-DL-NEXT: s_bfe_u32 s7, s3, 0x40014 +; GFX10-DL-NEXT: v_mad_u32_u24 v1, s8, s9, v1 +; GFX10-DL-NEXT: s_bfe_u32 s8, s2, 0x40018 +; GFX10-DL-NEXT: s_bfe_u32 s9, s3, 0x40018 ; GFX10-DL-NEXT: s_lshr_b32 s2, s2, 28 -; GFX10-DL-NEXT: s_lshr_b32 s4, s4, 28 -; GFX10-DL-NEXT: v_mad_u32_u24 v1, s5, s8, v1 -; GFX10-DL-NEXT: v_mad_u32_u24 v1, s9, s10, v1 -; GFX10-DL-NEXT: v_mad_u32_u24 v1, s2, s4, v1 +; GFX10-DL-NEXT: s_lshr_b32 s3, s3, 28 +; GFX10-DL-NEXT: v_mad_u32_u24 v1, s4, s7, v1 +; GFX10-DL-NEXT: v_mad_u32_u24 v1, s8, s9, v1 +; GFX10-DL-NEXT: v_mad_u32_u24 v1, s2, s3, v1 ; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v0, v1 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 @@ -1766,49 +1766,49 @@ define amdgpu_kernel void @udot8_acc32_vecMul(<8 x i4> addrspace(1)* %src1, ; GFX7-LABEL: udot8_acc32_vecMul: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; GFX7-NEXT: s_mov_b32 s7, 0xf000 
-; GFX7-NEXT: s_mov_b32 s6, -1 +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_load_dword s10, s[10:11], 0x0 -; GFX7-NEXT: s_load_dword s0, s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s21, s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s6, s[6:7], 0x0 +; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s20, s[0:1], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_lshr_b32 s11, s10, 28 -; GFX7-NEXT: s_bfe_u32 s15, s10, 0x40018 -; GFX7-NEXT: s_bfe_u32 s16, s10, 0x40014 -; GFX7-NEXT: s_bfe_u32 s17, s10, 0x40010 -; GFX7-NEXT: s_bfe_u32 s18, s10, 0x4000c -; GFX7-NEXT: s_bfe_u32 s19, s10, 0x40008 -; GFX7-NEXT: s_bfe_u32 s20, s10, 0x40004 -; GFX7-NEXT: s_and_b32 s10, s10, 15 -; GFX7-NEXT: s_lshr_b32 s1, s0, 28 -; GFX7-NEXT: s_bfe_u32 s2, s0, 0x40018 -; GFX7-NEXT: s_bfe_u32 s8, s0, 0x40014 -; GFX7-NEXT: s_bfe_u32 s9, s0, 0x40010 -; GFX7-NEXT: s_bfe_u32 s12, s0, 0x4000c -; GFX7-NEXT: s_bfe_u32 s13, s0, 0x40008 -; GFX7-NEXT: s_bfe_u32 s14, s0, 0x40004 -; GFX7-NEXT: s_and_b32 s0, s0, 15 -; GFX7-NEXT: v_mov_b32_e32 v0, s10 -; GFX7-NEXT: v_mov_b32_e32 v1, s21 -; GFX7-NEXT: v_mad_u32_u24 v0, s0, v0, v1 +; GFX7-NEXT: s_lshr_b32 s7, s6, 28 +; GFX7-NEXT: s_bfe_u32 s14, s6, 0x40018 +; GFX7-NEXT: s_bfe_u32 s15, s6, 0x40014 +; GFX7-NEXT: s_bfe_u32 s16, s6, 0x40010 +; GFX7-NEXT: s_bfe_u32 s17, s6, 0x4000c +; GFX7-NEXT: s_bfe_u32 s18, s6, 0x40008 +; GFX7-NEXT: s_bfe_u32 s19, s6, 0x40004 +; GFX7-NEXT: s_and_b32 s6, s6, 15 +; GFX7-NEXT: s_lshr_b32 s5, s4, 28 +; GFX7-NEXT: s_bfe_u32 s8, s4, 0x40018 +; GFX7-NEXT: s_bfe_u32 s9, s4, 0x40014 +; GFX7-NEXT: s_bfe_u32 s10, s4, 0x40010 +; GFX7-NEXT: s_bfe_u32 s11, s4, 0x4000c +; GFX7-NEXT: s_bfe_u32 s12, s4, 0x40008 +; GFX7-NEXT: s_bfe_u32 s13, s4, 0x40004 +; GFX7-NEXT: s_and_b32 s4, s4, 15 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s20 -; GFX7-NEXT: v_mad_u32_u24 v0, s14, v1, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, s4, v0, v1 ; GFX7-NEXT: v_mov_b32_e32 v1, s19 ; GFX7-NEXT: v_mad_u32_u24 v0, s13, v1, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, s18 ; GFX7-NEXT: v_mad_u32_u24 v0, s12, v1, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, s17 -; GFX7-NEXT: v_mad_u32_u24 v0, s9, v1, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, s11, v1, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, s16 -; GFX7-NEXT: v_mad_u32_u24 v0, s8, v1, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, s10, v1, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, s15 -; GFX7-NEXT: v_mad_u32_u24 v0, s2, v1, v0 -; GFX7-NEXT: v_mov_b32_e32 v1, s11 -; GFX7-NEXT: v_mad_u32_u24 v0, s1, v1, v0 -; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX7-NEXT: v_mad_u32_u24 v0, s9, v1, v0 +; GFX7-NEXT: v_mov_b32_e32 v1, s14 +; GFX7-NEXT: v_mad_u32_u24 v0, s8, v1, v0 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mad_u32_u24 v0, s5, v1, v0 +; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: udot8_acc32_vecMul: @@ -1818,29 +1818,27 @@ ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_load_dword s6, s[6:7], 0x0 ; GFX8-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX8-NEXT: s_load_dword s19, s[0:1], 0x0 +; GFX8-NEXT: s_load_dword s18, s[0:1], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_lshr_b32 s7, s6, 28 -; GFX8-NEXT: s_bfe_u32 s13, s6, 0x40018 -; GFX8-NEXT: s_bfe_u32 s14, s6, 0x40014 -; GFX8-NEXT: s_bfe_u32 s15, s6, 0x40010 -; GFX8-NEXT: s_bfe_u32 s16, s6, 0x4000c -; GFX8-NEXT: s_bfe_u32 s17, s6, 0x40008 -; GFX8-NEXT: s_bfe_u32 s18, s6, 0x40004 +; GFX8-NEXT: 
s_bfe_u32 s12, s6, 0x40018 +; GFX8-NEXT: s_bfe_u32 s13, s6, 0x40014 +; GFX8-NEXT: s_bfe_u32 s14, s6, 0x40010 +; GFX8-NEXT: s_bfe_u32 s15, s6, 0x4000c +; GFX8-NEXT: s_bfe_u32 s16, s6, 0x40008 +; GFX8-NEXT: s_bfe_u32 s17, s6, 0x40004 ; GFX8-NEXT: s_and_b32 s6, s6, 15 -; GFX8-NEXT: s_lshr_b32 s4, s2, 28 -; GFX8-NEXT: s_bfe_u32 s5, s2, 0x40018 -; GFX8-NEXT: s_bfe_u32 s8, s2, 0x40014 -; GFX8-NEXT: s_bfe_u32 s9, s2, 0x40010 -; GFX8-NEXT: s_bfe_u32 s10, s2, 0x4000c -; GFX8-NEXT: s_bfe_u32 s11, s2, 0x40008 -; GFX8-NEXT: s_bfe_u32 s12, s2, 0x40004 +; GFX8-NEXT: s_lshr_b32 s3, s2, 28 +; GFX8-NEXT: s_bfe_u32 s4, s2, 0x40018 +; GFX8-NEXT: s_bfe_u32 s5, s2, 0x40014 +; GFX8-NEXT: s_bfe_u32 s8, s2, 0x40010 +; GFX8-NEXT: s_bfe_u32 s9, s2, 0x4000c +; GFX8-NEXT: s_bfe_u32 s10, s2, 0x40008 +; GFX8-NEXT: s_bfe_u32 s11, s2, 0x40004 ; GFX8-NEXT: s_and_b32 s2, s2, 15 ; GFX8-NEXT: v_mov_b32_e32 v0, s6 -; GFX8-NEXT: v_mov_b32_e32 v1, s19 -; GFX8-NEXT: v_mad_u32_u24 v0, s2, v0, v1 ; GFX8-NEXT: v_mov_b32_e32 v1, s18 -; GFX8-NEXT: v_mad_u32_u24 v0, s12, v1, v0 +; GFX8-NEXT: v_mad_u32_u24 v0, s2, v0, v1 ; GFX8-NEXT: v_mov_b32_e32 v1, s17 ; GFX8-NEXT: v_mad_u32_u24 v0, s11, v1, v0 ; GFX8-NEXT: v_mov_b32_e32 v1, s16 @@ -1851,8 +1849,10 @@ ; GFX8-NEXT: v_mad_u32_u24 v0, s8, v1, v0 ; GFX8-NEXT: v_mov_b32_e32 v1, s13 ; GFX8-NEXT: v_mad_u32_u24 v0, s5, v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v1, s12 +; GFX8-NEXT: v_mad_u32_u24 v0, s4, v1, v0 ; GFX8-NEXT: v_mov_b32_e32 v1, s7 -; GFX8-NEXT: v_mad_u32_u24 v2, s4, v1, v0 +; GFX8-NEXT: v_mad_u32_u24 v2, s3, v1, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_store_dword v[0:1], v2 @@ -1865,29 +1865,27 @@ ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dword s6, s[6:7], 0x0 ; GFX9-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s19, s[0:1], 0x0 +; GFX9-NEXT: s_load_dword s18, s[0:1], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshr_b32 s7, s6, 28 -; GFX9-NEXT: s_bfe_u32 s13, s6, 0x40018 -; GFX9-NEXT: s_bfe_u32 s14, s6, 0x40014 -; GFX9-NEXT: s_bfe_u32 s15, s6, 0x40010 -; GFX9-NEXT: s_bfe_u32 s16, s6, 0x4000c -; GFX9-NEXT: s_bfe_u32 s17, s6, 0x40008 -; GFX9-NEXT: s_bfe_u32 s18, s6, 0x40004 +; GFX9-NEXT: s_bfe_u32 s12, s6, 0x40018 +; GFX9-NEXT: s_bfe_u32 s13, s6, 0x40014 +; GFX9-NEXT: s_bfe_u32 s14, s6, 0x40010 +; GFX9-NEXT: s_bfe_u32 s15, s6, 0x4000c +; GFX9-NEXT: s_bfe_u32 s16, s6, 0x40008 +; GFX9-NEXT: s_bfe_u32 s17, s6, 0x40004 ; GFX9-NEXT: s_and_b32 s6, s6, 15 -; GFX9-NEXT: s_lshr_b32 s4, s2, 28 -; GFX9-NEXT: s_bfe_u32 s5, s2, 0x40018 -; GFX9-NEXT: s_bfe_u32 s8, s2, 0x40014 -; GFX9-NEXT: s_bfe_u32 s9, s2, 0x40010 -; GFX9-NEXT: s_bfe_u32 s10, s2, 0x4000c -; GFX9-NEXT: s_bfe_u32 s11, s2, 0x40008 -; GFX9-NEXT: s_bfe_u32 s12, s2, 0x40004 +; GFX9-NEXT: s_lshr_b32 s3, s2, 28 +; GFX9-NEXT: s_bfe_u32 s4, s2, 0x40018 +; GFX9-NEXT: s_bfe_u32 s5, s2, 0x40014 +; GFX9-NEXT: s_bfe_u32 s8, s2, 0x40010 +; GFX9-NEXT: s_bfe_u32 s9, s2, 0x4000c +; GFX9-NEXT: s_bfe_u32 s10, s2, 0x40008 +; GFX9-NEXT: s_bfe_u32 s11, s2, 0x40004 ; GFX9-NEXT: s_and_b32 s2, s2, 15 ; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s19 -; GFX9-NEXT: v_mad_u32_u24 v0, s2, v0, v1 ; GFX9-NEXT: v_mov_b32_e32 v1, s18 -; GFX9-NEXT: v_mad_u32_u24 v0, s12, v1, v0 +; GFX9-NEXT: v_mad_u32_u24 v0, s2, v0, v1 ; GFX9-NEXT: v_mov_b32_e32 v1, s17 ; GFX9-NEXT: v_mad_u32_u24 v0, s11, v1, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, s16 @@ -1898,8 +1896,10 @@ ; GFX9-NEXT: v_mad_u32_u24 v0, s8, v1, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, s13 ; GFX9-NEXT: v_mad_u32_u24 v0, s5, 
v1, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, s12 +; GFX9-NEXT: v_mad_u32_u24 v0, s4, v1, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_mad_u32_u24 v2, s4, v1, v0 +; GFX9-NEXT: v_mad_u32_u24 v2, s3, v1, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: global_store_dword v[0:1], v2, off @@ -1911,11 +1911,11 @@ ; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX9-DL-NEXT: s_load_dword s6, s[0:1], 0x0 +; GFX9-DL-NEXT: s_load_dword s3, s[0:1], 0x0 ; GFX9-DL-NEXT: s_load_dword s4, s[4:5], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-DL-NEXT: v_dot8_u32_u4 v2, s4, v0, v1 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 @@ -1924,18 +1924,18 @@ ; ; GFX10-DL-LABEL: udot8_acc32_vecMul: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s0, s[8:9], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s2, s[6:7], 0x0 +; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-DL-NEXT: v_dot8_u32_u4 v2, s1, s2, v0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s8 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s9 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-DL-NEXT: v_dot8_u32_u4 v2, s0, s1, v0 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <8 x i4> addrspace(1)* %src2, @@ -1976,59 +1976,59 @@ define amdgpu_kernel void @udot8_acc16_vecMul(<8 x i4> addrspace(1)* %src1, ; GFX7-LABEL: udot8_acc16_vecMul: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; GFX7-NEXT: s_mov_b32 s7, 0xf000 -; GFX7-NEXT: s_mov_b32 s6, -1 +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: buffer_load_ushort v0, off, s[4:7], 0 -; GFX7-NEXT: s_load_dword s0, s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s1, s[10:11], 0x0 +; GFX7-NEXT: buffer_load_ushort v0, off, s[0:3], 0 +; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_bfe_u32 s11, s0, 0x40004 -; GFX7-NEXT: s_bfe_u32 s18, s1, 0x40004 -; GFX7-NEXT: s_bfe_u32 s20, s1, 0x4000c -; GFX7-NEXT: v_mov_b32_e32 v4, s18 -; GFX7-NEXT: s_bfe_u32 s15, s1, 0x40018 -; GFX7-NEXT: s_bfe_u32 s16, s1, 0x40014 -; GFX7-NEXT: s_bfe_u32 s17, s1, 0x40010 -; GFX7-NEXT: s_and_b32 s19, s1, 15 -; GFX7-NEXT: s_lshr_b32 s14, s1, 28 -; GFX7-NEXT: s_bfe_u32 s1, s1, 0x40008 -; GFX7-NEXT: s_bfe_u32 s13, s0, 0x4000c -; GFX7-NEXT: v_mov_b32_e32 v2, s20 -; GFX7-NEXT: v_mul_u32_u24_e32 v2, s13, v2 -; GFX7-NEXT: v_mul_u32_u24_e32 v4, s11, v4 -; GFX7-NEXT: s_lshr_b32 s2, s0, 28 -; GFX7-NEXT: s_bfe_u32 s8, s0, 0x40018 -; GFX7-NEXT: s_bfe_u32 s9, s0, 0x40014 -; 
GFX7-NEXT: s_bfe_u32 s10, s0, 0x40010 -; GFX7-NEXT: s_and_b32 s12, s0, 15 -; GFX7-NEXT: v_mov_b32_e32 v3, s19 -; GFX7-NEXT: s_bfe_u32 s0, s0, 0x40008 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: v_mul_u32_u24_e32 v1, s0, v1 +; GFX7-NEXT: s_bfe_u32 s10, s4, 0x40004 +; GFX7-NEXT: s_bfe_u32 s17, s5, 0x40004 +; GFX7-NEXT: s_bfe_u32 s19, s5, 0x4000c +; GFX7-NEXT: v_mov_b32_e32 v4, s17 +; GFX7-NEXT: s_bfe_u32 s14, s5, 0x40018 +; GFX7-NEXT: s_bfe_u32 s15, s5, 0x40014 +; GFX7-NEXT: s_bfe_u32 s16, s5, 0x40010 +; GFX7-NEXT: s_and_b32 s18, s5, 15 +; GFX7-NEXT: s_lshr_b32 s13, s5, 28 +; GFX7-NEXT: s_bfe_u32 s5, s5, 0x40008 +; GFX7-NEXT: s_bfe_u32 s12, s4, 0x4000c +; GFX7-NEXT: v_mov_b32_e32 v2, s19 +; GFX7-NEXT: v_mul_u32_u24_e32 v2, s12, v2 +; GFX7-NEXT: v_mul_u32_u24_e32 v4, s10, v4 +; GFX7-NEXT: s_lshr_b32 s6, s4, 28 +; GFX7-NEXT: s_bfe_u32 s7, s4, 0x40018 +; GFX7-NEXT: s_bfe_u32 s8, s4, 0x40014 +; GFX7-NEXT: s_bfe_u32 s9, s4, 0x40010 +; GFX7-NEXT: s_and_b32 s11, s4, 15 +; GFX7-NEXT: v_mov_b32_e32 v3, s18 +; GFX7-NEXT: s_bfe_u32 s4, s4, 0x40008 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mul_u32_u24_e32 v1, s4, v1 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_mul_u32_u24_e32 v3, s12, v3 +; GFX7-NEXT: v_mul_u32_u24_e32 v3, s11, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX7-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX7-NEXT: v_or_b32_e32 v2, v3, v4 ; GFX7-NEXT: v_alignbit_b32 v3, v1, v2, 16 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v1 -; GFX7-NEXT: v_mov_b32_e32 v5, s17 -; GFX7-NEXT: v_mov_b32_e32 v6, s16 -; GFX7-NEXT: v_mov_b32_e32 v7, s15 +; GFX7-NEXT: v_mov_b32_e32 v5, s16 +; GFX7-NEXT: v_mov_b32_e32 v6, s15 +; GFX7-NEXT: v_mov_b32_e32 v7, s14 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, v3, v0 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, v4, v0 -; GFX7-NEXT: v_mad_u32_u24 v0, s10, v5, v0 -; GFX7-NEXT: v_mad_u32_u24 v0, s9, v6, v0 -; GFX7-NEXT: v_mad_u32_u24 v0, s8, v7, v0 -; GFX7-NEXT: v_mov_b32_e32 v1, s14 -; GFX7-NEXT: v_mad_u32_u24 v0, s2, v1, v0 -; GFX7-NEXT: buffer_store_short v0, off, s[4:7], 0 +; GFX7-NEXT: v_mad_u32_u24 v0, s9, v5, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, s8, v6, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, s7, v7, v0 +; GFX7-NEXT: v_mov_b32_e32 v1, s13 +; GFX7-NEXT: v_mad_u32_u24 v0, s6, v1, v0 +; GFX7-NEXT: buffer_store_short v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: udot8_acc16_vecMul: @@ -2043,38 +2043,38 @@ ; GFX8-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_lshr_b32 s2, s0, 28 -; GFX8-NEXT: s_bfe_u32 s11, s1, 0x40018 -; GFX8-NEXT: s_bfe_u32 s12, s1, 0x40014 -; GFX8-NEXT: s_bfe_u32 s13, s1, 0x40010 -; GFX8-NEXT: s_bfe_u32 s14, s1, 0x4000c -; GFX8-NEXT: s_bfe_u32 s15, s1, 0x40008 -; GFX8-NEXT: s_bfe_u32 s16, s1, 0x40004 -; GFX8-NEXT: s_lshr_b32 s10, s1, 28 +; GFX8-NEXT: s_bfe_u32 s10, s1, 0x40018 +; GFX8-NEXT: s_bfe_u32 s11, s1, 0x40014 +; GFX8-NEXT: s_bfe_u32 s12, s1, 0x40010 +; GFX8-NEXT: s_bfe_u32 s13, s1, 0x4000c +; GFX8-NEXT: s_bfe_u32 s14, s1, 0x40008 +; GFX8-NEXT: s_bfe_u32 s15, s1, 0x40004 +; GFX8-NEXT: s_lshr_b32 s9, s1, 28 ; GFX8-NEXT: s_and_b32 s1, s1, 15 -; GFX8-NEXT: s_bfe_u32 s4, s0, 0x40018 -; GFX8-NEXT: s_bfe_u32 s5, s0, 0x40014 -; GFX8-NEXT: s_bfe_u32 s6, s0, 0x40010 -; GFX8-NEXT: s_bfe_u32 s7, s0, 0x4000c -; GFX8-NEXT: s_bfe_u32 s8, s0, 0x40008 -; GFX8-NEXT: s_bfe_u32 s9, s0, 0x40004 +; GFX8-NEXT: s_bfe_u32 s3, s0, 0x40018 +; GFX8-NEXT: s_bfe_u32 s4, s0, 0x40014 +; GFX8-NEXT: s_bfe_u32 s5, s0, 
0x40010 +; GFX8-NEXT: s_bfe_u32 s6, s0, 0x4000c +; GFX8-NEXT: s_bfe_u32 s7, s0, 0x40008 +; GFX8-NEXT: s_bfe_u32 s8, s0, 0x40004 ; GFX8-NEXT: s_and_b32 s0, s0, 15 ; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_mov_b32_e32 v4, s16 -; GFX8-NEXT: v_mov_b32_e32 v5, s15 -; GFX8-NEXT: v_mov_b32_e32 v6, s14 -; GFX8-NEXT: v_mov_b32_e32 v7, s13 -; GFX8-NEXT: v_mov_b32_e32 v8, s12 -; GFX8-NEXT: v_mov_b32_e32 v9, s11 +; GFX8-NEXT: v_mov_b32_e32 v4, s15 +; GFX8-NEXT: v_mov_b32_e32 v5, s14 +; GFX8-NEXT: v_mov_b32_e32 v6, s13 +; GFX8-NEXT: v_mov_b32_e32 v7, s12 +; GFX8-NEXT: v_mov_b32_e32 v8, s11 +; GFX8-NEXT: v_mov_b32_e32 v9, s10 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mad_u32_u24 v2, s0, v3, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s9, v4, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s8, v4, v2 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s8, v5, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s7, v6, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s6, v7, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s5, v8, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s4, v9, v2 -; GFX8-NEXT: v_mov_b32_e32 v3, s10 +; GFX8-NEXT: v_mad_u32_u24 v2, s7, v5, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s6, v6, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s5, v7, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s4, v8, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s3, v9, v2 +; GFX8-NEXT: v_mov_b32_e32 v3, s9 ; GFX8-NEXT: v_mad_u32_u24 v2, s2, v3, v2 ; GFX8-NEXT: flat_store_short v[0:1], v2 ; GFX8-NEXT: s_endpgm @@ -2088,36 +2088,36 @@ ; GFX9-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_bfe_u32 s7, s6, 0x40018 -; GFX9-NEXT: s_lshr_b32 s13, s6, 28 -; GFX9-NEXT: s_pack_ll_b32_b16 s7, s7, s13 -; GFX9-NEXT: s_bfe_u32 s4, s2, 0x40018 -; GFX9-NEXT: s_lshr_b32 s5, s2, 28 -; GFX9-NEXT: s_bfe_u32 s14, s6, 0x40010 -; GFX9-NEXT: s_bfe_u32 s15, s6, 0x40014 -; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX9-NEXT: s_lshr_b32 s12, s6, 28 +; GFX9-NEXT: s_pack_ll_b32_b16 s7, s7, s12 +; GFX9-NEXT: s_bfe_u32 s3, s2, 0x40018 +; GFX9-NEXT: s_lshr_b32 s4, s2, 28 +; GFX9-NEXT: s_bfe_u32 s13, s6, 0x40010 +; GFX9-NEXT: s_bfe_u32 s14, s6, 0x40014 +; GFX9-NEXT: s_pack_ll_b32_b16 s3, s3, s4 ; GFX9-NEXT: v_mov_b32_e32 v0, s7 -; GFX9-NEXT: v_pk_mul_lo_u16 v2, s4, v0 -; GFX9-NEXT: s_pack_ll_b32_b16 s4, s14, s15 -; GFX9-NEXT: s_bfe_u32 s8, s2, 0x40010 -; GFX9-NEXT: s_bfe_u32 s9, s2, 0x40014 -; GFX9-NEXT: s_bfe_u32 s16, s6, 0x40008 -; GFX9-NEXT: s_bfe_u32 s17, s6, 0x4000c -; GFX9-NEXT: s_and_b32 s18, s6, 15 -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: s_pack_ll_b32_b16 s5, s8, s9 -; GFX9-NEXT: s_bfe_u32 s10, s2, 0x40008 -; GFX9-NEXT: s_bfe_u32 s11, s2, 0x4000c +; GFX9-NEXT: v_pk_mul_lo_u16 v2, s3, v0 +; GFX9-NEXT: s_pack_ll_b32_b16 s3, s13, s14 +; GFX9-NEXT: s_bfe_u32 s5, s2, 0x40010 +; GFX9-NEXT: s_bfe_u32 s8, s2, 0x40014 +; GFX9-NEXT: s_bfe_u32 s15, s6, 0x40008 +; GFX9-NEXT: s_bfe_u32 s16, s6, 0x4000c +; GFX9-NEXT: s_and_b32 s17, s6, 15 +; GFX9-NEXT: v_mov_b32_e32 v0, s3 +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s5, s8 +; GFX9-NEXT: s_bfe_u32 s9, s2, 0x40008 +; GFX9-NEXT: s_bfe_u32 s10, s2, 0x4000c ; GFX9-NEXT: s_bfe_u32 s6, s6, 0x40004 -; GFX9-NEXT: s_pack_ll_b32_b16 s4, s16, s17 -; GFX9-NEXT: v_pk_mul_lo_u16 v3, s5, v0 -; GFX9-NEXT: s_and_b32 s12, s2, 15 +; GFX9-NEXT: s_pack_ll_b32_b16 s3, s15, s16 +; GFX9-NEXT: v_pk_mul_lo_u16 v3, s4, v0 +; GFX9-NEXT: s_and_b32 s11, s2, 15 ; GFX9-NEXT: s_bfe_u32 s2, s2, 0x40004 -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: s_pack_ll_b32_b16 s5, s10, s11 -; GFX9-NEXT: s_pack_ll_b32_b16 s4, s18, s6 -; GFX9-NEXT: v_pk_mul_lo_u16 v4, s5, v0 -; 
GFX9-NEXT: s_pack_ll_b32_b16 s2, s12, s2 -; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v0, s3 +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s9, s10 +; GFX9-NEXT: s_pack_ll_b32_b16 s3, s17, s6 +; GFX9-NEXT: v_pk_mul_lo_u16 v4, s4, v0 +; GFX9-NEXT: s_pack_ll_b32_b16 s2, s11, s2 +; GFX9-NEXT: v_mov_b32_e32 v0, s3 ; GFX9-NEXT: v_pk_mul_lo_u16 v5, s2, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 @@ -2143,36 +2143,36 @@ ; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: s_bfe_u32 s7, s6, 0x40018 -; GFX9-DL-NEXT: s_lshr_b32 s13, s6, 28 -; GFX9-DL-NEXT: s_pack_ll_b32_b16 s7, s7, s13 -; GFX9-DL-NEXT: s_bfe_u32 s4, s2, 0x40018 -; GFX9-DL-NEXT: s_lshr_b32 s5, s2, 28 -; GFX9-DL-NEXT: s_bfe_u32 s14, s6, 0x40010 -; GFX9-DL-NEXT: s_bfe_u32 s15, s6, 0x40014 -; GFX9-DL-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX9-DL-NEXT: s_lshr_b32 s12, s6, 28 +; GFX9-DL-NEXT: s_pack_ll_b32_b16 s7, s7, s12 +; GFX9-DL-NEXT: s_bfe_u32 s3, s2, 0x40018 +; GFX9-DL-NEXT: s_lshr_b32 s4, s2, 28 +; GFX9-DL-NEXT: s_bfe_u32 s13, s6, 0x40010 +; GFX9-DL-NEXT: s_bfe_u32 s14, s6, 0x40014 +; GFX9-DL-NEXT: s_pack_ll_b32_b16 s3, s3, s4 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s7 -; GFX9-DL-NEXT: v_pk_mul_lo_u16 v2, s4, v0 -; GFX9-DL-NEXT: s_pack_ll_b32_b16 s4, s14, s15 -; GFX9-DL-NEXT: s_bfe_u32 s8, s2, 0x40010 -; GFX9-DL-NEXT: s_bfe_u32 s9, s2, 0x40014 -; GFX9-DL-NEXT: s_bfe_u32 s16, s6, 0x40008 -; GFX9-DL-NEXT: s_bfe_u32 s17, s6, 0x4000c -; GFX9-DL-NEXT: s_and_b32 s18, s6, 15 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-DL-NEXT: s_pack_ll_b32_b16 s5, s8, s9 -; GFX9-DL-NEXT: s_bfe_u32 s10, s2, 0x40008 -; GFX9-DL-NEXT: s_bfe_u32 s11, s2, 0x4000c +; GFX9-DL-NEXT: v_pk_mul_lo_u16 v2, s3, v0 +; GFX9-DL-NEXT: s_pack_ll_b32_b16 s3, s13, s14 +; GFX9-DL-NEXT: s_bfe_u32 s5, s2, 0x40010 +; GFX9-DL-NEXT: s_bfe_u32 s8, s2, 0x40014 +; GFX9-DL-NEXT: s_bfe_u32 s15, s6, 0x40008 +; GFX9-DL-NEXT: s_bfe_u32 s16, s6, 0x4000c +; GFX9-DL-NEXT: s_and_b32 s17, s6, 15 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s3 +; GFX9-DL-NEXT: s_pack_ll_b32_b16 s4, s5, s8 +; GFX9-DL-NEXT: s_bfe_u32 s9, s2, 0x40008 +; GFX9-DL-NEXT: s_bfe_u32 s10, s2, 0x4000c ; GFX9-DL-NEXT: s_bfe_u32 s6, s6, 0x40004 -; GFX9-DL-NEXT: s_pack_ll_b32_b16 s4, s16, s17 -; GFX9-DL-NEXT: v_pk_mul_lo_u16 v3, s5, v0 -; GFX9-DL-NEXT: s_and_b32 s12, s2, 15 +; GFX9-DL-NEXT: s_pack_ll_b32_b16 s3, s15, s16 +; GFX9-DL-NEXT: v_pk_mul_lo_u16 v3, s4, v0 +; GFX9-DL-NEXT: s_and_b32 s11, s2, 15 ; GFX9-DL-NEXT: s_bfe_u32 s2, s2, 0x40004 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-DL-NEXT: s_pack_ll_b32_b16 s5, s10, s11 -; GFX9-DL-NEXT: s_pack_ll_b32_b16 s4, s18, s6 -; GFX9-DL-NEXT: v_pk_mul_lo_u16 v4, s5, v0 -; GFX9-DL-NEXT: s_pack_ll_b32_b16 s2, s12, s2 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s3 +; GFX9-DL-NEXT: s_pack_ll_b32_b16 s4, s9, s10 +; GFX9-DL-NEXT: s_pack_ll_b32_b16 s3, s17, s6 +; GFX9-DL-NEXT: v_pk_mul_lo_u16 v4, s4, v0 +; GFX9-DL-NEXT: s_pack_ll_b32_b16 s2, s11, s2 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s3 ; GFX9-DL-NEXT: v_pk_mul_lo_u16 v5, s2, v0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 @@ -2191,47 +2191,47 @@ ; ; GFX10-DL-LABEL: udot8_acc16_vecMul: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-DL-NEXT: s_load_dwordx4 
s[4:7], s[0:1], 0x24 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-DL-NEXT: global_load_ushort v2, v[0:1], off ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_and_b32 s2, s0, 15 -; GFX10-DL-NEXT: s_bfe_u32 s6, s0, 0x40004 -; GFX10-DL-NEXT: s_and_b32 s4, s1, 15 -; GFX10-DL-NEXT: s_bfe_u32 s5, s1, 0x40004 -; GFX10-DL-NEXT: s_bfe_u32 s7, s1, 0x4000c -; GFX10-DL-NEXT: s_bfe_u32 s8, s0, 0x4000c -; GFX10-DL-NEXT: s_pack_ll_b32_b16 s2, s2, s6 -; GFX10-DL-NEXT: s_bfe_u32 s6, s1, 0x40008 -; GFX10-DL-NEXT: s_pack_ll_b32_b16 s4, s4, s5 -; GFX10-DL-NEXT: s_bfe_u32 s5, s0, 0x40008 -; GFX10-DL-NEXT: v_pk_mul_lo_u16 v3, s2, s4 -; GFX10-DL-NEXT: s_pack_ll_b32_b16 s4, s6, s7 -; GFX10-DL-NEXT: s_pack_ll_b32_b16 s5, s5, s8 +; GFX10-DL-NEXT: s_bfe_u32 s5, s0, 0x40004 +; GFX10-DL-NEXT: s_and_b32 s3, s1, 15 +; GFX10-DL-NEXT: s_bfe_u32 s4, s1, 0x40004 +; GFX10-DL-NEXT: s_bfe_u32 s6, s1, 0x4000c +; GFX10-DL-NEXT: s_bfe_u32 s7, s0, 0x4000c +; GFX10-DL-NEXT: s_pack_ll_b32_b16 s2, s2, s5 +; GFX10-DL-NEXT: s_bfe_u32 s5, s1, 0x40008 +; GFX10-DL-NEXT: s_pack_ll_b32_b16 s3, s3, s4 +; GFX10-DL-NEXT: s_bfe_u32 s4, s0, 0x40008 +; GFX10-DL-NEXT: v_pk_mul_lo_u16 v3, s2, s3 +; GFX10-DL-NEXT: s_pack_ll_b32_b16 s3, s5, s6 +; GFX10-DL-NEXT: s_pack_ll_b32_b16 s4, s4, s7 ; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40010 -; GFX10-DL-NEXT: s_bfe_u32 s6, s0, 0x40014 -; GFX10-DL-NEXT: s_bfe_u32 s7, s1, 0x40010 -; GFX10-DL-NEXT: s_bfe_u32 s8, s1, 0x40014 -; GFX10-DL-NEXT: v_pk_mul_lo_u16 v4, s5, s4 -; GFX10-DL-NEXT: s_bfe_u32 s4, s0, 0x40018 -; GFX10-DL-NEXT: s_pack_ll_b32_b16 s2, s2, s6 +; GFX10-DL-NEXT: s_bfe_u32 s5, s0, 0x40014 +; GFX10-DL-NEXT: s_bfe_u32 s6, s1, 0x40010 +; GFX10-DL-NEXT: s_bfe_u32 s7, s1, 0x40014 +; GFX10-DL-NEXT: v_pk_mul_lo_u16 v4, s4, s3 +; GFX10-DL-NEXT: s_bfe_u32 s3, s0, 0x40018 +; GFX10-DL-NEXT: s_pack_ll_b32_b16 s2, s2, s5 ; GFX10-DL-NEXT: s_lshr_b32 s0, s0, 28 -; GFX10-DL-NEXT: s_pack_ll_b32_b16 s5, s7, s8 -; GFX10-DL-NEXT: s_bfe_u32 s6, s1, 0x40018 +; GFX10-DL-NEXT: s_pack_ll_b32_b16 s4, s6, s7 +; GFX10-DL-NEXT: s_bfe_u32 s5, s1, 0x40018 ; GFX10-DL-NEXT: s_lshr_b32 s1, s1, 28 -; GFX10-DL-NEXT: s_pack_ll_b32_b16 s0, s4, s0 -; GFX10-DL-NEXT: s_pack_ll_b32_b16 s1, s6, s1 +; GFX10-DL-NEXT: s_pack_ll_b32_b16 s0, s3, s0 +; GFX10-DL-NEXT: s_pack_ll_b32_b16 s1, s5, s1 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) ; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v3, v2 ; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-DL-NEXT: v_pk_mul_lo_u16 v3, s2, s5 +; GFX10-DL-NEXT: v_pk_mul_lo_u16 v3, s2, s4 ; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:BYTE_0 ; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX10-DL-NEXT: v_pk_mul_lo_u16 v4, s0, s1 @@ -2278,53 +2278,53 @@ define amdgpu_kernel void @udot8_acc8_vecMul(<8 x i4> addrspace(1)* %src1, ; GFX7-LABEL: udot8_acc8_vecMul: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; GFX7-NEXT: s_mov_b32 s7, 0xf000 -; GFX7-NEXT: s_mov_b32 s6, -1 +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: 
s_load_dwordx2 s[0:1], s[0:1], 0xd +; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: buffer_load_ubyte v0, off, s[4:7], 0 -; GFX7-NEXT: s_load_dword s0, s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s1, s[10:11], 0x0 +; GFX7-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 +; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_bfe_u32 s2, s0, 0x4000c -; GFX7-NEXT: s_bfe_u32 s14, s1, 0x4000c -; GFX7-NEXT: s_bfe_u32 s16, s1, 0x40004 -; GFX7-NEXT: s_lshr_b32 s18, s1, 28 -; GFX7-NEXT: v_mov_b32_e32 v8, s14 -; GFX7-NEXT: s_bfe_u32 s15, s1, 0x40008 -; GFX7-NEXT: s_and_b32 s17, s1, 15 -; GFX7-NEXT: s_bfe_u32 s19, s1, 0x40018 -; GFX7-NEXT: s_bfe_u32 s20, s1, 0x40014 -; GFX7-NEXT: s_bfe_u32 s9, s0, 0x40004 -; GFX7-NEXT: v_mov_b32_e32 v6, s16 -; GFX7-NEXT: s_lshr_b32 s11, s0, 28 -; GFX7-NEXT: v_mov_b32_e32 v4, s18 -; GFX7-NEXT: v_mul_u32_u24_e32 v4, s11, v4 -; GFX7-NEXT: v_mul_u32_u24_e32 v6, s9, v6 -; GFX7-NEXT: v_mul_u32_u24_e32 v8, s2, v8 -; GFX7-NEXT: s_bfe_u32 s1, s1, 0x40010 -; GFX7-NEXT: s_bfe_u32 s8, s0, 0x40008 -; GFX7-NEXT: v_mov_b32_e32 v7, s15 -; GFX7-NEXT: s_and_b32 s10, s0, 15 -; GFX7-NEXT: v_mov_b32_e32 v5, s17 -; GFX7-NEXT: s_bfe_u32 s12, s0, 0x40018 -; GFX7-NEXT: v_mov_b32_e32 v3, s19 -; GFX7-NEXT: s_bfe_u32 s13, s0, 0x40014 -; GFX7-NEXT: v_mov_b32_e32 v2, s20 -; GFX7-NEXT: v_mul_u32_u24_e32 v2, s13, v2 -; GFX7-NEXT: s_bfe_u32 s0, s0, 0x40010 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: v_mul_u32_u24_e32 v3, s12, v3 +; GFX7-NEXT: s_bfe_u32 s6, s4, 0x4000c +; GFX7-NEXT: s_bfe_u32 s13, s5, 0x4000c +; GFX7-NEXT: s_bfe_u32 s15, s5, 0x40004 +; GFX7-NEXT: s_lshr_b32 s17, s5, 28 +; GFX7-NEXT: v_mov_b32_e32 v8, s13 +; GFX7-NEXT: s_bfe_u32 s14, s5, 0x40008 +; GFX7-NEXT: s_and_b32 s16, s5, 15 +; GFX7-NEXT: s_bfe_u32 s18, s5, 0x40018 +; GFX7-NEXT: s_bfe_u32 s19, s5, 0x40014 +; GFX7-NEXT: s_bfe_u32 s8, s4, 0x40004 +; GFX7-NEXT: v_mov_b32_e32 v6, s15 +; GFX7-NEXT: s_lshr_b32 s10, s4, 28 +; GFX7-NEXT: v_mov_b32_e32 v4, s17 +; GFX7-NEXT: v_mul_u32_u24_e32 v4, s10, v4 +; GFX7-NEXT: v_mul_u32_u24_e32 v6, s8, v6 +; GFX7-NEXT: v_mul_u32_u24_e32 v8, s6, v8 +; GFX7-NEXT: s_bfe_u32 s5, s5, 0x40010 +; GFX7-NEXT: s_bfe_u32 s7, s4, 0x40008 +; GFX7-NEXT: v_mov_b32_e32 v7, s14 +; GFX7-NEXT: s_and_b32 s9, s4, 15 +; GFX7-NEXT: v_mov_b32_e32 v5, s16 +; GFX7-NEXT: s_bfe_u32 s11, s4, 0x40018 +; GFX7-NEXT: v_mov_b32_e32 v3, s18 +; GFX7-NEXT: s_bfe_u32 s12, s4, 0x40014 +; GFX7-NEXT: v_mov_b32_e32 v2, s19 +; GFX7-NEXT: v_mul_u32_u24_e32 v2, s12, v2 +; GFX7-NEXT: s_bfe_u32 s4, s4, 0x40010 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mul_u32_u24_e32 v3, s11, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 8, v4 -; GFX7-NEXT: v_mul_u32_u24_e32 v5, s10, v5 -; GFX7-NEXT: v_mul_u32_u24_e32 v7, s8, v7 +; GFX7-NEXT: v_mul_u32_u24_e32 v5, s9, v5 +; GFX7-NEXT: v_mul_u32_u24_e32 v7, s7, v7 ; GFX7-NEXT: v_lshlrev_b32_e32 v6, 8, v6 ; GFX7-NEXT: v_lshlrev_b32_e32 v8, 8, v8 ; GFX7-NEXT: v_or_b32_e32 v3, v3, v4 ; GFX7-NEXT: v_or_b32_e32 v4, v5, v6 ; GFX7-NEXT: v_or_b32_e32 v5, v7, v8 -; GFX7-NEXT: v_mul_u32_u24_e32 v9, s0, v1 +; GFX7-NEXT: v_mul_u32_u24_e32 v9, s4, v1 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; GFX7-NEXT: v_or_b32_e32 v2, v9, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 @@ -2342,11 +2342,11 @@ ; GFX7-NEXT: v_add_i32_e32 v0, vcc, v4, v0 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, v5, v0 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, v6, v0 -; GFX7-NEXT: v_mad_u32_u24 v0, s0, v1, v0 +; GFX7-NEXT: 
v_mad_u32_u24 v0, s4, v1, v0 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v7 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v8 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; GFX7-NEXT: buffer_store_byte v0, off, s[4:7], 0 +; GFX7-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: udot8_acc8_vecMul: @@ -2361,42 +2361,42 @@ ; GFX8-NEXT: s_load_dword s2, s[6:7], 0x0 ; GFX8-NEXT: s_mov_b32 s0, 0xffff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_bfe_u32 s8, s1, 0x40004 -; GFX8-NEXT: s_bfe_u32 s10, s1, 0x4000c -; GFX8-NEXT: s_bfe_u32 s15, s2, 0x40004 -; GFX8-NEXT: s_and_b32 s16, s2, 15 -; GFX8-NEXT: s_bfe_u32 s17, s2, 0x4000c -; GFX8-NEXT: s_bfe_u32 s4, s1, 0x40014 -; GFX8-NEXT: s_lshr_b32 s6, s1, 28 -; GFX8-NEXT: s_bfe_u32 s11, s2, 0x40014 -; GFX8-NEXT: s_bfe_u32 s12, s2, 0x40010 -; GFX8-NEXT: s_lshr_b32 s13, s2, 28 -; GFX8-NEXT: s_bfe_u32 s14, s2, 0x40018 +; GFX8-NEXT: s_bfe_u32 s7, s1, 0x40004 +; GFX8-NEXT: s_bfe_u32 s9, s1, 0x4000c +; GFX8-NEXT: s_bfe_u32 s14, s2, 0x40004 +; GFX8-NEXT: s_and_b32 s15, s2, 15 +; GFX8-NEXT: s_bfe_u32 s16, s2, 0x4000c +; GFX8-NEXT: s_bfe_u32 s3, s1, 0x40014 +; GFX8-NEXT: s_lshr_b32 s5, s1, 28 +; GFX8-NEXT: s_bfe_u32 s10, s2, 0x40014 +; GFX8-NEXT: s_bfe_u32 s11, s2, 0x40010 +; GFX8-NEXT: s_lshr_b32 s12, s2, 28 +; GFX8-NEXT: s_bfe_u32 s13, s2, 0x40018 ; GFX8-NEXT: s_bfe_u32 s2, s2, 0x40008 -; GFX8-NEXT: s_and_b32 s9, s1, 15 -; GFX8-NEXT: v_mov_b32_e32 v4, s17 -; GFX8-NEXT: v_mov_b32_e32 v5, s10 -; GFX8-NEXT: v_mov_b32_e32 v6, s16 -; GFX8-NEXT: v_mov_b32_e32 v7, s15 -; GFX8-NEXT: v_mov_b32_e32 v8, s8 +; GFX8-NEXT: s_and_b32 s8, s1, 15 +; GFX8-NEXT: v_mov_b32_e32 v4, s16 +; GFX8-NEXT: v_mov_b32_e32 v5, s9 +; GFX8-NEXT: v_mov_b32_e32 v6, s15 +; GFX8-NEXT: v_mov_b32_e32 v7, s14 +; GFX8-NEXT: v_mov_b32_e32 v8, s7 ; GFX8-NEXT: v_mul_u32_u24_sdwa v4, v5, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_mul_u32_u24_e32 v5, s9, v6 +; GFX8-NEXT: v_mul_u32_u24_e32 v5, s8, v6 ; GFX8-NEXT: v_mul_u32_u24_sdwa v6, v8, v7 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: s_bfe_u32 s5, s1, 0x40010 -; GFX8-NEXT: s_bfe_u32 s7, s1, 0x40018 -; GFX8-NEXT: v_mov_b32_e32 v9, s14 +; GFX8-NEXT: s_bfe_u32 s4, s1, 0x40010 +; GFX8-NEXT: s_bfe_u32 s6, s1, 0x40018 +; GFX8-NEXT: v_mov_b32_e32 v9, s13 ; GFX8-NEXT: s_bfe_u32 s1, s1, 0x40008 ; GFX8-NEXT: v_mov_b32_e32 v3, s2 -; GFX8-NEXT: v_mov_b32_e32 v10, s13 -; GFX8-NEXT: v_mov_b32_e32 v11, s6 -; GFX8-NEXT: v_mov_b32_e32 v12, s12 -; GFX8-NEXT: v_mov_b32_e32 v13, s11 -; GFX8-NEXT: v_mov_b32_e32 v14, s4 +; GFX8-NEXT: v_mov_b32_e32 v10, s12 +; GFX8-NEXT: v_mov_b32_e32 v11, s5 +; GFX8-NEXT: v_mov_b32_e32 v12, s11 +; GFX8-NEXT: v_mov_b32_e32 v13, s10 +; GFX8-NEXT: v_mov_b32_e32 v14, s3 ; GFX8-NEXT: v_mul_u32_u24_e32 v3, s1, v3 ; GFX8-NEXT: v_or_b32_e32 v5, v5, v6 -; GFX8-NEXT: v_mul_u32_u24_e32 v7, s7, v9 +; GFX8-NEXT: v_mul_u32_u24_e32 v7, s6, v9 ; GFX8-NEXT: v_mul_u32_u24_sdwa v8, v11, v10 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_mul_u32_u24_e32 v9, s5, v12 +; GFX8-NEXT: v_mul_u32_u24_e32 v9, s4, v12 ; GFX8-NEXT: v_mul_u32_u24_sdwa v10, v14, v13 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-NEXT: v_and_b32_e32 v5, s0, v5 ; GFX8-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -2431,40 +2431,40 @@ ; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX9-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_bfe_u32 s4, 
s0, 0x40010 -; GFX9-NEXT: s_bfe_u32 s11, s1, 0x40010 -; GFX9-NEXT: s_bfe_u32 s12, s1, 0x40014 -; GFX9-NEXT: s_bfe_u32 s13, s1, 0x40018 -; GFX9-NEXT: s_lshr_b32 s14, s1, 28 -; GFX9-NEXT: s_and_b32 s15, s1, 15 -; GFX9-NEXT: s_bfe_u32 s16, s1, 0x40004 -; GFX9-NEXT: s_bfe_u32 s17, s1, 0x40008 -; GFX9-NEXT: v_mov_b32_e32 v3, s11 +; GFX9-NEXT: s_bfe_u32 s3, s0, 0x40010 +; GFX9-NEXT: s_bfe_u32 s10, s1, 0x40010 +; GFX9-NEXT: s_bfe_u32 s11, s1, 0x40014 +; GFX9-NEXT: s_bfe_u32 s12, s1, 0x40018 +; GFX9-NEXT: s_lshr_b32 s13, s1, 28 +; GFX9-NEXT: s_and_b32 s14, s1, 15 +; GFX9-NEXT: s_bfe_u32 s15, s1, 0x40004 +; GFX9-NEXT: s_bfe_u32 s16, s1, 0x40008 +; GFX9-NEXT: v_mov_b32_e32 v3, s10 ; GFX9-NEXT: s_bfe_u32 s1, s1, 0x4000c -; GFX9-NEXT: s_bfe_u32 s5, s0, 0x40014 -; GFX9-NEXT: v_mov_b32_e32 v4, s12 -; GFX9-NEXT: s_bfe_u32 s6, s0, 0x40018 -; GFX9-NEXT: v_mov_b32_e32 v5, s13 -; GFX9-NEXT: s_lshr_b32 s7, s0, 28 -; GFX9-NEXT: v_mov_b32_e32 v6, s14 -; GFX9-NEXT: s_and_b32 s8, s0, 15 -; GFX9-NEXT: v_mov_b32_e32 v7, s15 -; GFX9-NEXT: s_bfe_u32 s9, s0, 0x40004 -; GFX9-NEXT: v_mov_b32_e32 v8, s16 -; GFX9-NEXT: s_bfe_u32 s10, s0, 0x40008 -; GFX9-NEXT: v_mov_b32_e32 v9, s17 +; GFX9-NEXT: s_bfe_u32 s4, s0, 0x40014 +; GFX9-NEXT: v_mov_b32_e32 v4, s11 +; GFX9-NEXT: s_bfe_u32 s5, s0, 0x40018 +; GFX9-NEXT: v_mov_b32_e32 v5, s12 +; GFX9-NEXT: s_lshr_b32 s6, s0, 28 +; GFX9-NEXT: v_mov_b32_e32 v6, s13 +; GFX9-NEXT: s_and_b32 s7, s0, 15 +; GFX9-NEXT: v_mov_b32_e32 v7, s14 +; GFX9-NEXT: s_bfe_u32 s8, s0, 0x40004 +; GFX9-NEXT: v_mov_b32_e32 v8, s15 +; GFX9-NEXT: s_bfe_u32 s9, s0, 0x40008 +; GFX9-NEXT: v_mov_b32_e32 v9, s16 ; GFX9-NEXT: s_bfe_u32 s0, s0, 0x4000c ; GFX9-NEXT: v_mov_b32_e32 v10, s1 -; GFX9-NEXT: v_mul_lo_u16_e32 v3, s4, v3 -; GFX9-NEXT: v_mul_lo_u16_sdwa v4, s5, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_mul_lo_u16_e32 v5, s6, v5 -; GFX9-NEXT: v_mul_lo_u16_sdwa v6, s7, v6 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_mul_lo_u16_e32 v7, s8, v7 -; GFX9-NEXT: v_mul_lo_u16_sdwa v8, s9, v8 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_mul_lo_u16_e32 v3, s3, v3 +; GFX9-NEXT: v_mul_lo_u16_sdwa v4, s4, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_mul_lo_u16_e32 v5, s5, v5 +; GFX9-NEXT: v_mul_lo_u16_sdwa v6, s6, v6 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_mul_lo_u16_e32 v7, s7, v7 +; GFX9-NEXT: v_mul_lo_u16_sdwa v8, s8, v8 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v3, v3, v4 ; GFX9-NEXT: v_or_b32_sdwa v4, v5, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v5, v7, v8 -; GFX9-NEXT: v_mul_lo_u16_e32 v9, s10, v9 +; GFX9-NEXT: v_mul_lo_u16_e32 v9, s9, v9 ; GFX9-NEXT: v_mul_lo_u16_sdwa v10, s0, v10 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v5, s2, v5 ; GFX9-NEXT: v_or_b32_sdwa v6, v9, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -2497,40 +2497,40 @@ ; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_bfe_u32 s4, s0, 0x40010 -; GFX9-DL-NEXT: s_bfe_u32 s11, s1, 0x40010 -; GFX9-DL-NEXT: s_bfe_u32 s12, s1, 0x40014 -; GFX9-DL-NEXT: s_bfe_u32 s13, s1, 0x40018 -; GFX9-DL-NEXT: s_lshr_b32 s14, s1, 28 -; GFX9-DL-NEXT: s_and_b32 s15, s1, 15 -; GFX9-DL-NEXT: s_bfe_u32 
s16, s1, 0x40004 -; GFX9-DL-NEXT: s_bfe_u32 s17, s1, 0x40008 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s11 +; GFX9-DL-NEXT: s_bfe_u32 s3, s0, 0x40010 +; GFX9-DL-NEXT: s_bfe_u32 s10, s1, 0x40010 +; GFX9-DL-NEXT: s_bfe_u32 s11, s1, 0x40014 +; GFX9-DL-NEXT: s_bfe_u32 s12, s1, 0x40018 +; GFX9-DL-NEXT: s_lshr_b32 s13, s1, 28 +; GFX9-DL-NEXT: s_and_b32 s14, s1, 15 +; GFX9-DL-NEXT: s_bfe_u32 s15, s1, 0x40004 +; GFX9-DL-NEXT: s_bfe_u32 s16, s1, 0x40008 +; GFX9-DL-NEXT: v_mov_b32_e32 v3, s10 ; GFX9-DL-NEXT: s_bfe_u32 s1, s1, 0x4000c -; GFX9-DL-NEXT: s_bfe_u32 s5, s0, 0x40014 -; GFX9-DL-NEXT: v_mov_b32_e32 v4, s12 -; GFX9-DL-NEXT: s_bfe_u32 s6, s0, 0x40018 -; GFX9-DL-NEXT: v_mov_b32_e32 v5, s13 -; GFX9-DL-NEXT: s_lshr_b32 s7, s0, 28 -; GFX9-DL-NEXT: v_mov_b32_e32 v6, s14 -; GFX9-DL-NEXT: s_and_b32 s8, s0, 15 -; GFX9-DL-NEXT: v_mov_b32_e32 v7, s15 -; GFX9-DL-NEXT: s_bfe_u32 s9, s0, 0x40004 -; GFX9-DL-NEXT: v_mov_b32_e32 v8, s16 -; GFX9-DL-NEXT: s_bfe_u32 s10, s0, 0x40008 -; GFX9-DL-NEXT: v_mov_b32_e32 v9, s17 +; GFX9-DL-NEXT: s_bfe_u32 s4, s0, 0x40014 +; GFX9-DL-NEXT: v_mov_b32_e32 v4, s11 +; GFX9-DL-NEXT: s_bfe_u32 s5, s0, 0x40018 +; GFX9-DL-NEXT: v_mov_b32_e32 v5, s12 +; GFX9-DL-NEXT: s_lshr_b32 s6, s0, 28 +; GFX9-DL-NEXT: v_mov_b32_e32 v6, s13 +; GFX9-DL-NEXT: s_and_b32 s7, s0, 15 +; GFX9-DL-NEXT: v_mov_b32_e32 v7, s14 +; GFX9-DL-NEXT: s_bfe_u32 s8, s0, 0x40004 +; GFX9-DL-NEXT: v_mov_b32_e32 v8, s15 +; GFX9-DL-NEXT: s_bfe_u32 s9, s0, 0x40008 +; GFX9-DL-NEXT: v_mov_b32_e32 v9, s16 ; GFX9-DL-NEXT: s_bfe_u32 s0, s0, 0x4000c ; GFX9-DL-NEXT: v_mov_b32_e32 v10, s1 -; GFX9-DL-NEXT: v_mul_lo_u16_e32 v3, s4, v3 -; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v4, s5, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-DL-NEXT: v_mul_lo_u16_e32 v5, s6, v5 -; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v6, s7, v6 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-DL-NEXT: v_mul_lo_u16_e32 v7, s8, v7 -; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v8, s9, v8 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-DL-NEXT: v_mul_lo_u16_e32 v3, s3, v3 +; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v4, s4, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-DL-NEXT: v_mul_lo_u16_e32 v5, s5, v5 +; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v6, s6, v6 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-DL-NEXT: v_mul_lo_u16_e32 v7, s7, v7 +; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v8, s8, v8 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-DL-NEXT: v_or_b32_e32 v3, v3, v4 ; GFX9-DL-NEXT: v_or_b32_sdwa v4, v5, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-DL-NEXT: v_or_b32_e32 v5, v7, v8 -; GFX9-DL-NEXT: v_mul_lo_u16_e32 v9, s10, v9 +; GFX9-DL-NEXT: v_mul_lo_u16_e32 v9, s9, v9 ; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v10, s0, v10 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-DL-NEXT: v_and_b32_e32 v5, s2, v5 ; GFX9-DL-NEXT: v_or_b32_sdwa v6, v9, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -2553,58 +2553,58 @@ ; ; GFX10-DL-LABEL: udot8_acc8_vecMul: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s3 +; 
GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-DL-NEXT: global_load_ubyte v2, v[0:1], off
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0
-; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0
+; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0
+; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40004
-; GFX10-DL-NEXT: s_bfe_u32 s4, s1, 0x40004
-; GFX10-DL-NEXT: s_and_b32 s5, s0, 15
-; GFX10-DL-NEXT: s_and_b32 s7, s1, 15
-; GFX10-DL-NEXT: s_bfe_u32 s6, s0, 0x4000c
-; GFX10-DL-NEXT: s_bfe_u32 s8, s1, 0x4000c
-; GFX10-DL-NEXT: v_mul_lo_u16_e64 v3, s2, s4
+; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x40004
+; GFX10-DL-NEXT: s_and_b32 s4, s0, 15
+; GFX10-DL-NEXT: s_and_b32 s6, s1, 15
+; GFX10-DL-NEXT: s_bfe_u32 s5, s0, 0x4000c
+; GFX10-DL-NEXT: s_bfe_u32 s7, s1, 0x4000c
+; GFX10-DL-NEXT: v_mul_lo_u16_e64 v3, s2, s3
; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40008
-; GFX10-DL-NEXT: v_mul_lo_u16_e64 v4, s5, s7
-; GFX10-DL-NEXT: s_bfe_u32 s4, s1, 0x40008
-; GFX10-DL-NEXT: v_mul_lo_u16_e64 v5, s6, s8
+; GFX10-DL-NEXT: v_mul_lo_u16_e64 v4, s4, s6
+; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x40008
+; GFX10-DL-NEXT: v_mul_lo_u16_e64 v5, s5, s7
; GFX10-DL-NEXT: v_lshlrev_b16_e64 v3, 8, v3
-; GFX10-DL-NEXT: s_mov_b32 s5, 0xffff
-; GFX10-DL-NEXT: s_bfe_u32 s7, s1, 0x40014
-; GFX10-DL-NEXT: v_mul_lo_u16_e64 v6, s2, s4
+; GFX10-DL-NEXT: s_mov_b32 s4, 0xffff
+; GFX10-DL-NEXT: s_bfe_u32 s6, s1, 0x40014
+; GFX10-DL-NEXT: v_mul_lo_u16_e64 v6, s2, s3
; GFX10-DL-NEXT: v_lshlrev_b16_e64 v5, 8, v5
; GFX10-DL-NEXT: v_or_b32_e32 v3, v4, v3
-; GFX10-DL-NEXT: s_bfe_u32 s4, s0, 0x40014
+; GFX10-DL-NEXT: s_bfe_u32 s3, s0, 0x40014
; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40010
-; GFX10-DL-NEXT: s_bfe_u32 s6, s0, 0x40018
+; GFX10-DL-NEXT: s_bfe_u32 s5, s0, 0x40018
; GFX10-DL-NEXT: v_or_b32_sdwa v4, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX10-DL-NEXT: v_and_b32_e32 v3, s5, v3
-; GFX10-DL-NEXT: s_bfe_u32 s8, s1, 0x40010
-; GFX10-DL-NEXT: s_lshr_b32 s9, s1, 28
+; GFX10-DL-NEXT: v_and_b32_e32 v3, s4, v3
+; GFX10-DL-NEXT: s_bfe_u32 s7, s1, 0x40010
+; GFX10-DL-NEXT: s_lshr_b32 s8, s1, 28
; GFX10-DL-NEXT: s_lshr_b32 s0, s0, 28
-; GFX10-DL-NEXT: v_mul_lo_u16_e64 v5, s4, s7
+; GFX10-DL-NEXT: v_mul_lo_u16_e64 v5, s3, s6
; GFX10-DL-NEXT: v_or_b32_e32 v4, v3, v4
; GFX10-DL-NEXT: s_bfe_u32 s1, s1, 0x40018
-; GFX10-DL-NEXT: v_mul_lo_u16_e64 v6, s2, s8
-; GFX10-DL-NEXT: v_mul_lo_u16_e64 v7, s0, s9
+; GFX10-DL-NEXT: v_mul_lo_u16_e64 v6, s2, s7
+; GFX10-DL-NEXT: v_mul_lo_u16_e64 v7, s0, s8
; GFX10-DL-NEXT: v_lshrrev_b32_e32 v8, 8, v4
; GFX10-DL-NEXT: v_lshlrev_b16_e64 v7, 8, v7
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v3, v2
; GFX10-DL-NEXT: v_lshlrev_b16_e64 v3, 8, v5
-; GFX10-DL-NEXT: v_mul_lo_u16_e64 v5, s6, s1
+; GFX10-DL-NEXT: v_mul_lo_u16_e64 v5, s5, s1
; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v2, v8
; GFX10-DL-NEXT: v_or_b32_e32 v3, v6, v3
; GFX10-DL-NEXT: v_or_b32_sdwa v5, v5, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_2
-; GFX10-DL-NEXT: v_and_b32_e32 v3, s5, v3
+; GFX10-DL-NEXT: v_and_b32_e32 v3, s4, v3
; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
; GFX10-DL-NEXT: v_or_b32_e32 v4, v3, v5
; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v2, v3
@@ -2651,50 +2651,50 @@ define amdgpu_kernel void @udot8_acc4_vecMul(<8 x i4> addrspace(1)* %src1,
; GFX7-LABEL: udot8_acc4_vecMul:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9
-; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd
-; GFX7-NEXT: s_mov_b32 s7, 0xf000
-; GFX7-NEXT: s_mov_b32 s6, -1
+; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
+; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
+; GFX7-NEXT: s_mov_b32 s3, 0xf000
+; GFX7-NEXT: s_mov_b32 s2, -1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: buffer_load_ubyte v0, off, s[4:7], 0
-; GFX7-NEXT: s_load_dword s0, s[8:9], 0x0
-; GFX7-NEXT: s_load_dword s1, s[10:11], 0x0
+; GFX7-NEXT: buffer_load_ubyte v0, off, s[0:3], 0
+; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0
+; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: s_lshr_b32 s2, s0, 28
-; GFX7-NEXT: s_bfe_u32 s15, s1, 0x40018
-; GFX7-NEXT: s_bfe_u32 s16, s1, 0x40014
-; GFX7-NEXT: s_bfe_u32 s17, s1, 0x40010
-; GFX7-NEXT: s_bfe_u32 s18, s1, 0x4000c
-; GFX7-NEXT: s_bfe_u32 s19, s1, 0x40008
-; GFX7-NEXT: s_bfe_u32 s20, s1, 0x40004
-; GFX7-NEXT: s_lshr_b32 s14, s1, 28
-; GFX7-NEXT: s_and_b32 s1, s1, 15
-; GFX7-NEXT: s_bfe_u32 s8, s0, 0x40018
-; GFX7-NEXT: s_bfe_u32 s9, s0, 0x40014
-; GFX7-NEXT: s_bfe_u32 s10, s0, 0x40010
-; GFX7-NEXT: s_bfe_u32 s11, s0, 0x4000c
-; GFX7-NEXT: s_bfe_u32 s12, s0, 0x40008
-; GFX7-NEXT: s_bfe_u32 s13, s0, 0x40004
-; GFX7-NEXT: s_and_b32 s0, s0, 15
-; GFX7-NEXT: v_mov_b32_e32 v1, s1
-; GFX7-NEXT: v_mov_b32_e32 v2, s20
-; GFX7-NEXT: v_mov_b32_e32 v3, s19
-; GFX7-NEXT: v_mov_b32_e32 v4, s18
-; GFX7-NEXT: v_mov_b32_e32 v5, s17
-; GFX7-NEXT: v_mov_b32_e32 v6, s16
-; GFX7-NEXT: v_mov_b32_e32 v7, s15
+; GFX7-NEXT: s_lshr_b32 s6, s4, 28
+; GFX7-NEXT: s_bfe_u32 s14, s5, 0x40018
+; GFX7-NEXT: s_bfe_u32 s15, s5, 0x40014
+; GFX7-NEXT: s_bfe_u32 s16, s5, 0x40010
+; GFX7-NEXT: s_bfe_u32 s17, s5, 0x4000c
+; GFX7-NEXT: s_bfe_u32 s18, s5, 0x40008
+; GFX7-NEXT: s_bfe_u32 s19, s5, 0x40004
+; GFX7-NEXT: s_lshr_b32 s13, s5, 28
+; GFX7-NEXT: s_and_b32 s5, s5, 15
+; GFX7-NEXT: s_bfe_u32 s7, s4, 0x40018
+; GFX7-NEXT: s_bfe_u32 s8, s4, 0x40014
+; GFX7-NEXT: s_bfe_u32 s9, s4, 0x40010
+; GFX7-NEXT: s_bfe_u32 s10, s4, 0x4000c
+; GFX7-NEXT: s_bfe_u32 s11, s4, 0x40008
+; GFX7-NEXT: s_bfe_u32 s12, s4, 0x40004
+; GFX7-NEXT: s_and_b32 s4, s4, 15
+; GFX7-NEXT: v_mov_b32_e32 v1, s5
+; GFX7-NEXT: v_mov_b32_e32 v2, s19
+; GFX7-NEXT: v_mov_b32_e32 v3, s18
+; GFX7-NEXT: v_mov_b32_e32 v4, s17
+; GFX7-NEXT: v_mov_b32_e32 v5, s16
+; GFX7-NEXT: v_mov_b32_e32 v6, s15
+; GFX7-NEXT: v_mov_b32_e32 v7, s14
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mad_u32_u24 v0, s0, v1, v0
-; GFX7-NEXT: v_mad_u32_u24 v0, s13, v2, v0
-; GFX7-NEXT: v_mad_u32_u24 v0, s12, v3, v0
-; GFX7-NEXT: v_mad_u32_u24 v0, s11, v4, v0
-; GFX7-NEXT: v_mad_u32_u24 v0, s10, v5, v0
-; GFX7-NEXT: v_mad_u32_u24 v0, s9, v6, v0
-; GFX7-NEXT: v_mad_u32_u24 v0, s8, v7, v0
-; GFX7-NEXT: v_mov_b32_e32 v1, s14
-; GFX7-NEXT: v_mad_u32_u24 v0, s2, v1, v0
+; GFX7-NEXT: v_mad_u32_u24 v0, s4, v1, v0
+; GFX7-NEXT: v_mad_u32_u24 v0, s12, v2, v0
+; GFX7-NEXT: v_mad_u32_u24 v0, s11, v3, v0
+; GFX7-NEXT: v_mad_u32_u24 v0, s10, v4, v0
+; GFX7-NEXT: v_mad_u32_u24 v0, s9, v5, v0
+; GFX7-NEXT: v_mad_u32_u24 v0, s8, v6, v0
+; GFX7-NEXT: v_mad_u32_u24 v0, s7, v7, v0
+; GFX7-NEXT: v_mov_b32_e32 v1, s13
+; GFX7-NEXT: v_mad_u32_u24 v0, s6, v1, v0
; GFX7-NEXT: v_and_b32_e32 v0, 15, v0
-; GFX7-NEXT: buffer_store_byte v0, off, s[4:7], 0
+; GFX7-NEXT: buffer_store_byte v0, off, s[0:3], 0
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: udot8_acc4_vecMul:
@@ -2708,41 +2708,41 @@
; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX8-NEXT: s_load_dword s1, s[6:7], 0x0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_and_b32 s9, s0, 15
-; GFX8-NEXT: s_and_b32 s16, s1, 15
-; GFX8-NEXT: s_bfe_u32 s15, s1, 0x40004
-; GFX8-NEXT: v_mov_b32_e32 v4, s16
-; GFX8-NEXT: s_bfe_u32 s11, s1, 0x40018
-; GFX8-NEXT: s_bfe_u32 s12, s1, 0x40014
-; GFX8-NEXT: s_bfe_u32 s13, s1, 0x40010
-; GFX8-NEXT: s_bfe_u32 s14, s1, 0x40008
-; GFX8-NEXT: s_lshr_b32 s10, s1, 28
+; GFX8-NEXT: s_and_b32 s8, s0, 15
+; GFX8-NEXT: s_and_b32 s15, s1, 15
+; GFX8-NEXT: s_bfe_u32 s14, s1, 0x40004
+; GFX8-NEXT: v_mov_b32_e32 v4, s15
+; GFX8-NEXT: s_bfe_u32 s10, s1, 0x40018
+; GFX8-NEXT: s_bfe_u32 s11, s1, 0x40014
+; GFX8-NEXT: s_bfe_u32 s12, s1, 0x40010
+; GFX8-NEXT: s_bfe_u32 s13, s1, 0x40008
+; GFX8-NEXT: s_lshr_b32 s9, s1, 28
; GFX8-NEXT: s_bfe_u32 s1, s1, 0x4000c
-; GFX8-NEXT: s_bfe_u32 s8, s0, 0x40004
-; GFX8-NEXT: v_mov_b32_e32 v5, s15
+; GFX8-NEXT: s_bfe_u32 s7, s0, 0x40004
+; GFX8-NEXT: v_mov_b32_e32 v5, s14
; GFX8-NEXT: s_lshr_b32 s2, s0, 28
-; GFX8-NEXT: s_bfe_u32 s4, s0, 0x40018
-; GFX8-NEXT: s_bfe_u32 s5, s0, 0x40014
-; GFX8-NEXT: s_bfe_u32 s6, s0, 0x40010
-; GFX8-NEXT: s_bfe_u32 s7, s0, 0x40008
+; GFX8-NEXT: s_bfe_u32 s3, s0, 0x40018
+; GFX8-NEXT: s_bfe_u32 s4, s0, 0x40014
+; GFX8-NEXT: s_bfe_u32 s5, s0, 0x40010
+; GFX8-NEXT: s_bfe_u32 s6, s0, 0x40008
; GFX8-NEXT: s_bfe_u32 s0, s0, 0x4000c
; GFX8-NEXT: v_mov_b32_e32 v3, s1
-; GFX8-NEXT: v_mov_b32_e32 v6, s14
+; GFX8-NEXT: v_mov_b32_e32 v6, s13
; GFX8-NEXT: v_mul_u32_u24_e32 v3, s0, v3
; GFX8-NEXT: v_and_b32_e32 v3, 15, v3
-; GFX8-NEXT: v_mov_b32_e32 v7, s13
-; GFX8-NEXT: v_mov_b32_e32 v8, s12
-; GFX8-NEXT: v_mov_b32_e32 v9, s11
+; GFX8-NEXT: v_mov_b32_e32 v7, s12
+; GFX8-NEXT: v_mov_b32_e32 v8, s11
+; GFX8-NEXT: v_mov_b32_e32 v9, s10
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mad_u32_u24 v2, s9, v4, v2
-; GFX8-NEXT: v_mad_u32_u24 v2, s8, v5, v2
-; GFX8-NEXT: v_mad_u32_u24 v2, s7, v6, v2
+; GFX8-NEXT: v_mad_u32_u24 v2, s8, v4, v2
+; GFX8-NEXT: v_mad_u32_u24 v2, s7, v5, v2
+; GFX8-NEXT: v_mad_u32_u24 v2, s6, v6, v2
; GFX8-NEXT: v_and_b32_e32 v2, 15, v2
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2
-; GFX8-NEXT: v_mad_u32_u24 v2, s6, v7, v2
-; GFX8-NEXT: v_mad_u32_u24 v2, s5, v8, v2
-; GFX8-NEXT: v_mad_u32_u24 v2, s4, v9, v2
-; GFX8-NEXT: v_mov_b32_e32 v3, s10
+; GFX8-NEXT: v_mad_u32_u24 v2, s5, v7, v2
+; GFX8-NEXT: v_mad_u32_u24 v2, s4, v8, v2
+; GFX8-NEXT: v_mad_u32_u24 v2, s3, v9, v2
+; GFX8-NEXT: v_mov_b32_e32 v3, s9
; GFX8-NEXT: v_mad_u32_u24 v2, s2, v3, v2
; GFX8-NEXT: v_and_b32_e32 v2, 15, v2
; GFX8-NEXT: flat_store_byte v[0:1], v2
@@ -2759,41 +2759,41 @@
; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX9-NEXT: s_load_dword s1, s[6:7], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_and_b32 s9, s0, 15
-; GFX9-NEXT: s_and_b32 s16, s1, 15
-; GFX9-NEXT: s_bfe_u32 s15, s1, 0x40004
-; GFX9-NEXT: v_mov_b32_e32 v4, s16
-; GFX9-NEXT: s_bfe_u32 s11, s1, 0x40018
-; GFX9-NEXT: s_bfe_u32 s12, s1, 0x40014
-; GFX9-NEXT: s_bfe_u32 s13, s1, 0x40010
-; GFX9-NEXT: s_bfe_u32 s14, s1, 0x40008
-; GFX9-NEXT: s_lshr_b32 s10, s1, 28
+; GFX9-NEXT: s_and_b32 s8, s0, 15
+; GFX9-NEXT: s_and_b32 s15, s1, 15
+; GFX9-NEXT: s_bfe_u32 s14, s1, 0x40004
+; GFX9-NEXT: v_mov_b32_e32 v4, s15
+; GFX9-NEXT: s_bfe_u32 s10, s1, 0x40018
+; GFX9-NEXT: s_bfe_u32 s11, s1, 0x40014
+; GFX9-NEXT: s_bfe_u32 s12, s1, 0x40010
+; GFX9-NEXT: s_bfe_u32 s13, s1, 0x40008
+; GFX9-NEXT: s_lshr_b32 s9, s1, 28
; GFX9-NEXT: s_bfe_u32 s1, s1, 0x4000c
-; GFX9-NEXT: s_bfe_u32 s8, s0, 0x40004
-; GFX9-NEXT: v_mov_b32_e32 v5, s15
+; GFX9-NEXT: s_bfe_u32 s7, s0, 0x40004
+; GFX9-NEXT: v_mov_b32_e32 v5, s14
; GFX9-NEXT: s_lshr_b32 s2, s0, 28
-; GFX9-NEXT: s_bfe_u32 s4, s0, 0x40018
-; GFX9-NEXT: s_bfe_u32 s5, s0, 0x40014
-; GFX9-NEXT: s_bfe_u32 s6, s0, 0x40010
-; GFX9-NEXT: s_bfe_u32 s7, s0, 0x40008
+; GFX9-NEXT: s_bfe_u32 s3, s0, 0x40018
+; GFX9-NEXT: s_bfe_u32 s4, s0, 0x40014
+; GFX9-NEXT: s_bfe_u32 s5, s0, 0x40010
+; GFX9-NEXT: s_bfe_u32 s6, s0, 0x40008
; GFX9-NEXT: s_bfe_u32 s0, s0, 0x4000c
; GFX9-NEXT: v_mov_b32_e32 v3, s1
-; GFX9-NEXT: v_mov_b32_e32 v6, s14
+; GFX9-NEXT: v_mov_b32_e32 v6, s13
; GFX9-NEXT: v_mul_u32_u24_e32 v3, s0, v3
; GFX9-NEXT: v_and_b32_e32 v3, 15, v3
-; GFX9-NEXT: v_mov_b32_e32 v7, s13
-; GFX9-NEXT: v_mov_b32_e32 v8, s12
-; GFX9-NEXT: v_mov_b32_e32 v9, s11
+; GFX9-NEXT: v_mov_b32_e32 v7, s12
+; GFX9-NEXT: v_mov_b32_e32 v8, s11
+; GFX9-NEXT: v_mov_b32_e32 v9, s10
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_mad_u32_u24 v2, s9, v4, v2
-; GFX9-NEXT: v_mad_u32_u24 v2, s8, v5, v2
-; GFX9-NEXT: v_mad_u32_u24 v2, s7, v6, v2
+; GFX9-NEXT: v_mad_u32_u24 v2, s8, v4, v2
+; GFX9-NEXT: v_mad_u32_u24 v2, s7, v5, v2
+; GFX9-NEXT: v_mad_u32_u24 v2, s6, v6, v2
; GFX9-NEXT: v_and_b32_e32 v2, 15, v2
; GFX9-NEXT: v_add_u32_e32 v2, v2, v3
-; GFX9-NEXT: v_mad_u32_u24 v2, s6, v7, v2
-; GFX9-NEXT: v_mad_u32_u24 v2, s5, v8, v2
-; GFX9-NEXT: v_mad_u32_u24 v2, s4, v9, v2
-; GFX9-NEXT: v_mov_b32_e32 v3, s10
+; GFX9-NEXT: v_mad_u32_u24 v2, s5, v7, v2
+; GFX9-NEXT: v_mad_u32_u24 v2, s4, v8, v2
+; GFX9-NEXT: v_mad_u32_u24 v2, s3, v9, v2
+; GFX9-NEXT: v_mov_b32_e32 v3, s9
; GFX9-NEXT: v_mad_u32_u24 v2, s2, v3, v2
; GFX9-NEXT: v_and_b32_e32 v2, 15, v2
; GFX9-NEXT: global_store_byte v[0:1], v2, off
@@ -2810,41 +2810,41 @@
; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT: s_and_b32 s9, s0, 15
-; GFX9-DL-NEXT: s_and_b32 s16, s1, 15
-; GFX9-DL-NEXT: s_bfe_u32 s15, s1, 0x40004
-; GFX9-DL-NEXT: v_mov_b32_e32 v4, s16
-; GFX9-DL-NEXT: s_bfe_u32 s11, s1, 0x40018
-; GFX9-DL-NEXT: s_bfe_u32 s12, s1, 0x40014
-; GFX9-DL-NEXT: s_bfe_u32 s13, s1, 0x40010
-; GFX9-DL-NEXT: s_bfe_u32 s14, s1, 0x40008
-; GFX9-DL-NEXT: s_lshr_b32 s10, s1, 28
+; GFX9-DL-NEXT: s_and_b32 s8, s0, 15
+; GFX9-DL-NEXT: s_and_b32 s15, s1, 15
+; GFX9-DL-NEXT: s_bfe_u32 s14, s1, 0x40004
+; GFX9-DL-NEXT: v_mov_b32_e32 v4, s15
+; GFX9-DL-NEXT: s_bfe_u32 s10, s1, 0x40018
+; GFX9-DL-NEXT: s_bfe_u32 s11, s1, 0x40014
+; GFX9-DL-NEXT: s_bfe_u32 s12, s1, 0x40010
+; GFX9-DL-NEXT: s_bfe_u32 s13, s1, 0x40008
+; GFX9-DL-NEXT: s_lshr_b32 s9, s1, 28
; GFX9-DL-NEXT: s_bfe_u32 s1, s1, 0x4000c
-; GFX9-DL-NEXT: s_bfe_u32 s8, s0, 0x40004
-; GFX9-DL-NEXT: v_mov_b32_e32 v5, s15
+; GFX9-DL-NEXT: s_bfe_u32 s7, s0, 0x40004
+; GFX9-DL-NEXT: v_mov_b32_e32 v5, s14
; GFX9-DL-NEXT: s_lshr_b32 s2, s0, 28
-; GFX9-DL-NEXT: s_bfe_u32 s4, s0, 0x40018
-; GFX9-DL-NEXT: s_bfe_u32 s5, s0, 0x40014
-; GFX9-DL-NEXT: s_bfe_u32 s6, s0, 0x40010
-; GFX9-DL-NEXT: s_bfe_u32 s7, s0, 0x40008
+; GFX9-DL-NEXT: s_bfe_u32 s3, s0, 0x40018
+; GFX9-DL-NEXT: s_bfe_u32 s4, s0, 0x40014
+; GFX9-DL-NEXT: s_bfe_u32 s5, s0, 0x40010
+; GFX9-DL-NEXT: s_bfe_u32 s6, s0, 0x40008
; GFX9-DL-NEXT: s_bfe_u32 s0, s0, 0x4000c
; GFX9-DL-NEXT: v_mov_b32_e32 v3, s1
-; GFX9-DL-NEXT: v_mov_b32_e32 v6, s14
+; GFX9-DL-NEXT: v_mov_b32_e32 v6, s13
; GFX9-DL-NEXT: v_mul_u32_u24_e32 v3, s0, v3
; GFX9-DL-NEXT: v_and_b32_e32 v3, 15, v3
-; GFX9-DL-NEXT: v_mov_b32_e32 v7, s13
-; GFX9-DL-NEXT: v_mov_b32_e32 v8, s12
-; GFX9-DL-NEXT: v_mov_b32_e32 v9, s11
+; GFX9-DL-NEXT: v_mov_b32_e32 v7, s12
+; GFX9-DL-NEXT: v_mov_b32_e32 v8, s11
+; GFX9-DL-NEXT: v_mov_b32_e32 v9, s10
; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DL-NEXT: v_mad_u32_u24 v2, s9, v4, v2
-; GFX9-DL-NEXT: v_mad_u32_u24 v2, s8, v5, v2
-; GFX9-DL-NEXT: v_mad_u32_u24 v2, s7, v6, v2
+; GFX9-DL-NEXT: v_mad_u32_u24 v2, s8, v4, v2
+; GFX9-DL-NEXT: v_mad_u32_u24 v2, s7, v5, v2
+; GFX9-DL-NEXT: v_mad_u32_u24 v2, s6, v6, v2
; GFX9-DL-NEXT: v_and_b32_e32 v2, 15, v2
; GFX9-DL-NEXT: v_add_u32_e32 v2, v2, v3
-; GFX9-DL-NEXT: v_mad_u32_u24 v2, s6, v7, v2
-; GFX9-DL-NEXT: v_mad_u32_u24 v2, s5, v8, v2
-; GFX9-DL-NEXT: v_mad_u32_u24 v2, s4, v9, v2
-; GFX9-DL-NEXT: v_mov_b32_e32 v3, s10
+; GFX9-DL-NEXT: v_mad_u32_u24 v2, s5, v7, v2
+; GFX9-DL-NEXT: v_mad_u32_u24 v2, s4, v8, v2
+; GFX9-DL-NEXT: v_mad_u32_u24 v2, s3, v9, v2
+; GFX9-DL-NEXT: v_mov_b32_e32 v3, s9
; GFX9-DL-NEXT: v_mad_u32_u24 v2, s2, v3, v2
; GFX9-DL-NEXT: v_and_b32_e32 v2, 15, v2
; GFX9-DL-NEXT: global_store_byte v[0:1], v2, off
@@ -2852,44 +2852,44 @@
;
; GFX10-DL-LABEL: udot8_acc4_vecMul:
; GFX10-DL: ; %bb.0: ; %entry
-; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
+; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4
-; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5
-; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2
+; GFX10-DL-NEXT: v_mov_b32_e32 v1, s3
+; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-DL-NEXT: global_load_ubyte v2, v[0:1], off
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0
-; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0
+; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0
+; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_and_b32 s2, s0, 15
-; GFX10-DL-NEXT: s_and_b32 s4, s1, 15
-; GFX10-DL-NEXT: s_bfe_u32 s5, s0, 0x40004
-; GFX10-DL-NEXT: s_bfe_u32 s6, s1, 0x40004
-; GFX10-DL-NEXT: s_bfe_u32 s7, s1, 0x40008
+; GFX10-DL-NEXT: s_and_b32 s3, s1, 15
+; GFX10-DL-NEXT: s_bfe_u32 s4, s0, 0x40004
+; GFX10-DL-NEXT: s_bfe_u32 s5, s1, 0x40004
+; GFX10-DL-NEXT: s_bfe_u32 s6, s1, 0x40008
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
-; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s4, v2
+; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s3, v2
; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40008
-; GFX10-DL-NEXT: s_bfe_u32 s4, s0, 0x4000c
-; GFX10-DL-NEXT: v_mad_u32_u24 v2, s5, s6, v2
-; GFX10-DL-NEXT: s_bfe_u32 s5, s1, 0x4000c
-; GFX10-DL-NEXT: s_bfe_u32 s6, s1, 0x40014
-; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s7, v2
-; GFX10-DL-NEXT: v_mul_u32_u24_e64 v3, s4, s5
+; GFX10-DL-NEXT: s_bfe_u32 s3, s0, 0x4000c
+; GFX10-DL-NEXT: v_mad_u32_u24 v2, s4, s5, v2
+; GFX10-DL-NEXT: s_bfe_u32 s4, s1, 0x4000c
+; GFX10-DL-NEXT: s_bfe_u32 s5, s1, 0x40014
+; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s6, v2
+; GFX10-DL-NEXT: v_mul_u32_u24_e64 v3, s3, s4
; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40010
-; GFX10-DL-NEXT: s_bfe_u32 s4, s1, 0x40010
-; GFX10-DL-NEXT: s_bfe_u32 s5, s0, 0x40014
+; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x40010
+; GFX10-DL-NEXT: s_bfe_u32 s4, s0, 0x40014
; GFX10-DL-NEXT: v_and_b32_e32 v2, 15, v2
; GFX10-DL-NEXT: v_and_b32_e32 v3, 15, v3
; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v2, v3
-; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s4, v2
+; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s3, v2
; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40018
-; GFX10-DL-NEXT: s_bfe_u32 s4, s1, 0x40018
+; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x40018
; GFX10-DL-NEXT: s_lshr_b32 s0, s0, 28
; GFX10-DL-NEXT: s_lshr_b32 s1, s1, 28
-; GFX10-DL-NEXT: v_mad_u32_u24 v2, s5, s6, v2
-; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s4, v2
+; GFX10-DL-NEXT: v_mad_u32_u24 v2, s4, s5, v2
+; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s3, v2
; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2
; GFX10-DL-NEXT: v_and_b32_e32 v2, 15, v2
; GFX10-DL-NEXT: global_store_byte v[0:1], v2, off
diff --git a/llvm/test/CodeGen/AMDGPU/indirect-addressing-term.ll b/llvm/test/CodeGen/AMDGPU/indirect-addressing-term.ll
--- a/llvm/test/CodeGen/AMDGPU/indirect-addressing-term.ll
+++ b/llvm/test/CodeGen/AMDGPU/indirect-addressing-term.ll
@@ -16,40 +16,40 @@
; GCN: renamable $sgpr2 = COPY renamable $sgpr1
; GCN: renamable $sgpr0 = COPY renamable $sgpr0, implicit killed $sgpr0_sgpr1
; GCN: renamable $sgpr1 = S_MOV_B32 61440
- ; GCN: renamable $sgpr4 = S_MOV_B32 -1
- ; GCN: undef renamable $sgpr8 = COPY killed renamable $sgpr0, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11
- ; GCN: renamable $sgpr9 = COPY killed renamable $sgpr2
- ; GCN: renamable $sgpr10 = COPY killed renamable $sgpr4
- ; GCN: renamable $sgpr11 = COPY killed renamable $sgpr1
+ ; GCN: renamable $sgpr3 = S_MOV_B32 -1
+ ; GCN: undef renamable $sgpr4 = COPY killed renamable $sgpr0, implicit-def $sgpr4_sgpr5_sgpr6_sgpr7
+ ; GCN: renamable $sgpr5 = COPY killed renamable $sgpr2
+ ; GCN: renamable $sgpr6 = COPY killed renamable $sgpr3
+ ; GCN: renamable $sgpr7 = COPY killed renamable $sgpr1
; GCN: renamable $sgpr0 = S_MOV_B32 16
; GCN: renamable $sgpr1 = S_MOV_B32 15
; GCN: renamable $sgpr2 = S_MOV_B32 14
- ; GCN: renamable $sgpr4 = S_MOV_B32 13
- ; GCN: renamable $sgpr5 = S_MOV_B32 12
- ; GCN: renamable $sgpr6 = S_MOV_B32 11
- ; GCN: renamable $sgpr7 = S_MOV_B32 10
- ; GCN: renamable $sgpr12 = S_MOV_B32 9
- ; GCN: renamable $sgpr13 = S_MOV_B32 8
- ; GCN: renamable $sgpr14 = S_MOV_B32 7
- ; GCN: renamable $sgpr15 = S_MOV_B32 6
- ; GCN: renamable $sgpr16 = S_MOV_B32 5
- ; GCN: renamable $sgpr17 = S_MOV_B32 3
- ; GCN: renamable $sgpr18 = S_MOV_B32 2
- ; GCN: renamable $sgpr19 = S_MOV_B32 1
- ; GCN: renamable $sgpr20 = S_MOV_B32 0
- ; GCN: renamable $vgpr1 = COPY killed renamable $sgpr20
- ; GCN: renamable $vgpr2 = COPY killed renamable $sgpr19
- ; GCN: renamable $vgpr3 = COPY killed renamable $sgpr18
- ; GCN: renamable $vgpr4 = COPY killed renamable $sgpr17
- ; GCN: renamable $vgpr5 = COPY killed renamable $sgpr16
- ; GCN: renamable $vgpr6 = COPY killed renamable $sgpr15
- ; GCN: renamable $vgpr7 = COPY killed renamable $sgpr14
- ; GCN: renamable $vgpr8 = COPY killed renamable $sgpr13
- ; GCN: renamable $vgpr9 = COPY killed renamable $sgpr12
- ; GCN: renamable $vgpr10 = COPY killed renamable $sgpr7
- ; GCN: renamable $vgpr11 = COPY killed renamable $sgpr6
- ; GCN: renamable $vgpr12 = COPY killed renamable $sgpr5
- ; GCN: renamable $vgpr13 = COPY killed renamable $sgpr4
+ ; GCN: renamable $sgpr3 = S_MOV_B32 13
+ ; GCN: renamable $sgpr8 = S_MOV_B32 12
+ ; GCN: renamable $sgpr9 = S_MOV_B32 11
+ ; GCN: renamable $sgpr10 = S_MOV_B32 10
+ ; GCN: renamable $sgpr11 = S_MOV_B32 9
+ ; GCN: renamable $sgpr12 = S_MOV_B32 8
+ ; GCN: renamable $sgpr13 = S_MOV_B32 7
+ ; GCN: renamable $sgpr14 = S_MOV_B32 6
+ ; GCN: renamable $sgpr15 = S_MOV_B32 5
+ ; GCN: renamable $sgpr16 = S_MOV_B32 3
+ ; GCN: renamable $sgpr17 = S_MOV_B32 2
+ ; GCN: renamable $sgpr18 = S_MOV_B32 1
+ ; GCN: renamable $sgpr19 = S_MOV_B32 0
+ ; GCN: renamable $vgpr1 = COPY killed renamable $sgpr19
+ ; GCN: renamable $vgpr2 = COPY killed renamable $sgpr18
+ ; GCN: renamable $vgpr3 = COPY killed renamable $sgpr17
+ ; GCN: renamable $vgpr4 = COPY killed renamable $sgpr16
+ ; GCN: renamable $vgpr5 = COPY killed renamable $sgpr15
+ ; GCN: renamable $vgpr6 = COPY killed renamable $sgpr14
+ ; GCN: renamable $vgpr7 = COPY killed renamable $sgpr13
+ ; GCN: renamable $vgpr8 = COPY killed renamable $sgpr12
+ ; GCN: renamable $vgpr9 = COPY killed renamable $sgpr11
+ ; GCN: renamable $vgpr10 = COPY killed renamable $sgpr10
+ ; GCN: renamable $vgpr11 = COPY killed renamable $sgpr9
+ ; GCN: renamable $vgpr12 = COPY killed renamable $sgpr8
+ ; GCN: renamable $vgpr13 = COPY killed renamable $sgpr3
; GCN: renamable $vgpr14 = COPY killed renamable $sgpr2
; GCN: renamable $vgpr15 = COPY killed renamable $sgpr1
; GCN: renamable $vgpr16 = COPY killed renamable $sgpr0
@@ -69,41 +69,41 @@
; GCN: renamable $vgpr30 = COPY killed renamable $vgpr14
; GCN: renamable $vgpr31 = COPY killed renamable $vgpr15
; GCN: renamable $vgpr32 = COPY killed renamable $vgpr16
- ; GCN: renamable $sgpr22_sgpr23 = S_MOV_B64 $exec
+ ; GCN: renamable $sgpr20_sgpr21 = S_MOV_B64 $exec
; GCN: renamable $vgpr1 = IMPLICIT_DEF
- ; GCN: renamable $sgpr24_sgpr25 = IMPLICIT_DEF
- ; GCN: SI_SPILL_V32_SAVE killed $vgpr0, %stack.0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr3, 0, implicit $exec :: (store 4 into %stack.0, addrspace 5)
- ; GCN: SI_SPILL_S128_SAVE killed $sgpr8_sgpr9_sgpr10_sgpr11, %stack.1, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr3 :: (store 16 into %stack.1, align 4, addrspace 5)
- ; GCN: SI_SPILL_V512_SAVE killed $vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32, %stack.2, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr3, 0, implicit $exec :: (store 64 into %stack.2, align 4, addrspace 5)
- ; GCN: SI_SPILL_S64_SAVE killed $sgpr22_sgpr23, %stack.3, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr3 :: (store 8 into %stack.3, align 4, addrspace 5)
- ; GCN: SI_SPILL_V32_SAVE killed $vgpr1, %stack.4, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr3, 0, implicit $exec :: (store 4 into %stack.4, addrspace 5)
- ; GCN: SI_SPILL_S64_SAVE killed $sgpr24_sgpr25, %stack.5, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr3 :: (store 8 into %stack.5, align 4, addrspace 5)
+ ; GCN: renamable $sgpr22_sgpr23 = IMPLICIT_DEF
+ ; GCN: SI_SPILL_V32_SAVE killed $vgpr0, %stack.0, $sgpr96_sgpr97_sgpr98_sgpr99, 0, 0, implicit $exec :: (store 4 into %stack.0, addrspace 5)
+ ; GCN: SI_SPILL_S128_SAVE killed $sgpr4_sgpr5_sgpr6_sgpr7, %stack.1, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99 :: (store 16 into %stack.1, align 4, addrspace 5)
+ ; GCN: SI_SPILL_V512_SAVE killed $vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32, %stack.2, $sgpr96_sgpr97_sgpr98_sgpr99, 0, 0, implicit $exec :: (store 64 into %stack.2, align 4, addrspace 5)
+ ; GCN: SI_SPILL_S64_SAVE killed $sgpr20_sgpr21, %stack.3, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99 :: (store 8 into %stack.3, align 4, addrspace 5)
+ ; GCN: SI_SPILL_V32_SAVE killed $vgpr1, %stack.4, $sgpr96_sgpr97_sgpr98_sgpr99, 0, 0, implicit $exec :: (store 4 into %stack.4, addrspace 5)
+ ; GCN: SI_SPILL_S64_SAVE killed $sgpr22_sgpr23, %stack.5, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99 :: (store 8 into %stack.5, align 4, addrspace 5)
; GCN: bb.1:
; GCN: successors: %bb.1(0x40000000), %bb.2(0x40000000)
- ; GCN: $sgpr0_sgpr1 = SI_SPILL_S64_RESTORE %stack.5, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr3 :: (load 8 from %stack.5, align 4, addrspace 5)
- ; GCN: $vgpr0 = SI_SPILL_V32_RESTORE %stack.4, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr3, 0, implicit $exec :: (load 4 from %stack.4, addrspace 5)
- ; GCN: $vgpr1 = SI_SPILL_V32_RESTORE %stack.0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr3, 0, implicit $exec :: (load 4 from %stack.0, addrspace 5)
+ ; GCN: $sgpr0_sgpr1 = SI_SPILL_S64_RESTORE %stack.5, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99 :: (load 8 from %stack.5, align 4, addrspace 5)
+ ; GCN: $vgpr0 = SI_SPILL_V32_RESTORE %stack.4, $sgpr96_sgpr97_sgpr98_sgpr99, 0, 0, implicit $exec :: (load 4 from %stack.4, addrspace 5)
+ ; GCN: $vgpr1 = SI_SPILL_V32_RESTORE %stack.0, $sgpr96_sgpr97_sgpr98_sgpr99, 0, 0, implicit $exec :: (load 4 from %stack.0, addrspace 5)
; GCN: renamable $sgpr2 = V_READFIRSTLANE_B32 $vgpr1, implicit $exec
; GCN: renamable $sgpr4_sgpr5 = V_CMP_EQ_U32_e64 $sgpr2, killed $vgpr1, implicit $exec
; GCN: renamable $sgpr4_sgpr5 = S_AND_SAVEEXEC_B64 killed renamable $sgpr4_sgpr5, implicit-def $exec, implicit-def $scc, implicit $exec
; GCN: S_SET_GPR_IDX_ON killed renamable $sgpr2, 1, implicit-def $m0, implicit undef $m0
- ; GCN: $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17 = SI_SPILL_V512_RESTORE %stack.2, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr3, 0, implicit $exec :: (load 64 from %stack.2, align 4, addrspace 5)
+ ; GCN: $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17 = SI_SPILL_V512_RESTORE %stack.2, $sgpr96_sgpr97_sgpr98_sgpr99, 0, 0, implicit $exec :: (load 64 from %stack.2, align 4, addrspace 5)
; GCN: renamable $vgpr18 = V_MOV_B32_e32 undef $vgpr3, implicit $exec, implicit killed $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, implicit $m0
; GCN: S_SET_GPR_IDX_OFF
; GCN: renamable $vgpr19 = COPY renamable $vgpr18
; GCN: renamable $sgpr6_sgpr7 = COPY renamable $sgpr4_sgpr5
- ; GCN: SI_SPILL_S64_SAVE killed $sgpr6_sgpr7, %stack.5, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr3 :: (store 8 into %stack.5, align 4, addrspace 5)
- ; GCN: SI_SPILL_S64_SAVE killed $sgpr0_sgpr1, %stack.6, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr3 :: (store 8 into %stack.6, align 4, addrspace 5)
- ; GCN: SI_SPILL_V32_SAVE killed $vgpr19, %stack.4, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr3, 0, implicit $exec :: (store 4 into %stack.4, addrspace 5)
- ; GCN: SI_SPILL_V32_SAVE killed $vgpr0, %stack.7, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr3, 0, implicit $exec :: (store 4 into %stack.7, addrspace 5)
- ; GCN: SI_SPILL_V32_SAVE killed $vgpr18, %stack.8, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr3, 0, implicit $exec :: (store 4 into %stack.8, addrspace 5)
+ ; GCN: SI_SPILL_S64_SAVE killed $sgpr6_sgpr7, %stack.5, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99 :: (store 8 into %stack.5, align 4, addrspace 5)
+ ; GCN: SI_SPILL_S64_SAVE killed $sgpr0_sgpr1, %stack.6, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99 :: (store 8 into %stack.6, align 4, addrspace 5)
+ ; GCN: SI_SPILL_V32_SAVE killed $vgpr19, %stack.4, $sgpr96_sgpr97_sgpr98_sgpr99, 0, 0, implicit $exec :: (store 4 into %stack.4, addrspace 5)
+ ; GCN: SI_SPILL_V32_SAVE killed $vgpr0, %stack.7, $sgpr96_sgpr97_sgpr98_sgpr99, 0, 0, implicit $exec :: (store 4 into %stack.7, addrspace 5)
+ ; GCN: SI_SPILL_V32_SAVE killed $vgpr18, %stack.8, $sgpr96_sgpr97_sgpr98_sgpr99, 0, 0, implicit $exec :: (store 4 into %stack.8, addrspace 5)
; GCN: $exec = S_XOR_B64_term $exec, killed renamable $sgpr4_sgpr5, implicit-def $scc
; GCN: S_CBRANCH_EXECNZ %bb.1, implicit $exec
; GCN: bb.2:
- ; GCN: $sgpr0_sgpr1 = SI_SPILL_S64_RESTORE %stack.3, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr3 :: (load 8 from %stack.3, align 4, addrspace 5)
+ ; GCN: $sgpr0_sgpr1 = SI_SPILL_S64_RESTORE %stack.3, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99 :: (load 8 from %stack.3, align 4, addrspace 5)
; GCN: $exec = S_MOV_B64 renamable $sgpr0_sgpr1
- ; GCN: $vgpr0 = SI_SPILL_V32_RESTORE %stack.8, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr3, 0, implicit $exec :: (load 4 from %stack.8, addrspace 5)
- ; GCN: $sgpr4_sgpr5_sgpr6_sgpr7 = SI_SPILL_S128_RESTORE %stack.1, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr3 :: (load 16 from %stack.1, align 4, addrspace 5)
+ ; GCN: $vgpr0 = SI_SPILL_V32_RESTORE %stack.8, $sgpr96_sgpr97_sgpr98_sgpr99, 0, 0, implicit $exec :: (load 4 from %stack.8, addrspace 5)
+ ; GCN: $sgpr4_sgpr5_sgpr6_sgpr7 = SI_SPILL_S128_RESTORE %stack.1, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99 :: (load 16 from %stack.1, align 4, addrspace 5)
; GCN: BUFFER_STORE_DWORD_OFFSET renamable $vgpr0, renamable $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %ir.out.load, addrspace 1)
; GCN: S_ENDPGM 0
entry:
diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll
--- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll
@@ -1620,9 +1620,9 @@
; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; SI-NEXT: s_load_dwordx16 s[12:27], s[4:5], 0x10
; SI-NEXT: s_load_dword s4, s[4:5], 0x20
+; SI-NEXT: s_add_u32 s0, s0, s7
+; SI-NEXT: s_addc_u32 s1, s1, 0
; SI-NEXT: v_mov_b32_e32 v16, 64
-; SI-NEXT: s_mov_b32 s11, 0x100f000
-; SI-NEXT: s_mov_b32 s10, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s12
; SI-NEXT: s_and_b32 s4, s4, 7
@@ -1642,18 +1642,20 @@
; SI-NEXT: v_mov_b32_e32 v9, s21
; SI-NEXT: v_mov_b32_e32 v10, s22
; SI-NEXT: v_mov_b32_e32 v11, s23
-; SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], s7 offset:112
-; SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], s7 offset:96
-; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], s7 offset:80
-; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], s7 offset:64
+; SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:112
+; SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:96
+; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:80
+; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64
; SI-NEXT: v_or_b32_e32 v16, s4, v16
; SI-NEXT: v_mov_b32_e32 v0, 0
; SI-NEXT: v_mov_b32_e32 v1, 0x40200000
-; SI-NEXT: buffer_store_dwordx2 v[0:1], v16, s[0:3], s7 offen
-; SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], s7 offset:64
-; SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[0:3], s7 offset:80
-; SI-NEXT: buffer_load_dwordx4 v[8:11], off, s[0:3], s7 offset:96
-; SI-NEXT: buffer_load_dwordx4 v[12:15], off, s[0:3], s7 offset:112
+; SI-NEXT: buffer_store_dwordx2 v[0:1], v16, s[0:3], 0 offen
+; SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 offset:64
+; SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[0:3], 0 offset:80
+; SI-NEXT: buffer_load_dwordx4 v[8:11], off, s[0:3], 0 offset:96
+; SI-NEXT: buffer_load_dwordx4 v[12:15], off, s[0:3], 0 offset:112
+; SI-NEXT: s_mov_b32 s11, 0x100f000
+; SI-NEXT: s_mov_b32 s10, -1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[8:11], 0 offset:48
; SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[8:11], 0 offset:32
@@ -1666,9 +1668,9 @@
; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; VI-NEXT: s_load_dwordx16 s[12:27], s[4:5], 0x40
; VI-NEXT: s_load_dword s4, s[4:5], 0x80
+; VI-NEXT: s_add_u32 s0, s0, s7
+; VI-NEXT: s_addc_u32 s1, s1, 0
; VI-NEXT: v_mov_b32_e32 v16, 64
-; VI-NEXT: s_mov_b32 s11, 0x1100f000
-; VI-NEXT: s_mov_b32 s10, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s12
; VI-NEXT: s_and_b32 s4, s4, 7
@@ -1688,18 +1690,20 @@
; VI-NEXT: v_mov_b32_e32 v9, s21
; VI-NEXT: v_mov_b32_e32 v10, s22
; VI-NEXT: v_mov_b32_e32 v11, s23
-; VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], s7 offset:112
-; VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], s7 offset:96
-; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], s7 offset:80
-; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], s7 offset:64
+; VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:112
+; VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:96
+; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:80
+; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64
; VI-NEXT: v_or_b32_e32 v16, s4, v16
; VI-NEXT: v_mov_b32_e32 v0, 0
; VI-NEXT: v_mov_b32_e32 v1, 0x40200000
-; VI-NEXT: buffer_store_dwordx2 v[0:1], v16, s[0:3], s7 offen
-; VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], s7 offset:64
-; VI-NEXT: buffer_load_dwordx4 v[4:7], off, s[0:3], s7 offset:80
-; VI-NEXT: buffer_load_dwordx4 v[8:11], off, s[0:3], s7 offset:96
-; VI-NEXT: buffer_load_dwordx4 v[12:15], off, s[0:3], s7 offset:112
+; VI-NEXT: buffer_store_dwordx2 v[0:1], v16, s[0:3], 0 offen
+; VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 offset:64
+; VI-NEXT: buffer_load_dwordx4 v[4:7], off, s[0:3], 0 offset:80
+; VI-NEXT: buffer_load_dwordx4 v[8:11], off, s[0:3], 0 offset:96
+; VI-NEXT: buffer_load_dwordx4 v[12:15], off, s[0:3], 0 offset:112
+; VI-NEXT: s_mov_b32 s11, 0x1100f000
+; VI-NEXT: s_mov_b32 s10, -1
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[8:11], 0 offset:48
; VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[8:11], 0 offset:32
diff --git a/llvm/test/CodeGen/AMDGPU/ipra.ll b/llvm/test/CodeGen/AMDGPU/ipra.ll
--- a/llvm/test/CodeGen/AMDGPU/ipra.ll
+++ b/llvm/test/CodeGen/AMDGPU/ipra.ll
@@ -30,7 +30,7 @@
; GCN-NOT: writelane
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v8
-; GCN: ; NumSgprs: 38
+; GCN: ; NumSgprs: 37
; GCN: ; NumVgprs: 9
define amdgpu_kernel void @kernel_call() #0 {
%vgpr = load volatile i32, i32 addrspace(1)* undef
diff --git a/llvm/test/CodeGen/AMDGPU/large-alloca-compute.ll b/llvm/test/CodeGen/AMDGPU/large-alloca-compute.ll
--- a/llvm/test/CodeGen/AMDGPU/large-alloca-compute.ll
+++ b/llvm/test/CodeGen/AMDGPU/large-alloca-compute.ll
@@ -48,8 +48,8 @@
; GFX10HSA-DAG: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), [[FLAT_SCR_LO]]
; GFX10HSA-DAG: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), [[FLAT_SCR_HI]]
-; GCNHSA: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, s[0:3], s9 offen
-; GCNHSA: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, s[0:3], s9 offen
+; GCNHSA: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, s[0:3], 0 offen
+; GCNHSA: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, s[0:3], 0 offen
; Scratch size = alloca size + emergency stack slot, align {{.*}}, addrspace(5)
; ALL: ; ScratchSize: 32772
diff --git a/llvm/test/CodeGen/AMDGPU/large-alloca-graphics.ll b/llvm/test/CodeGen/AMDGPU/large-alloca-graphics.ll
--- a/llvm/test/CodeGen/AMDGPU/large-alloca-graphics.ll
+++ b/llvm/test/CodeGen/AMDGPU/large-alloca-graphics.ll
@@ -3,15 +3,19 @@
; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 -check-prefix=ALL %s
; ALL-LABEL: {{^}}large_alloca_pixel_shader:
-; GCN-DAG: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
-; GCN-DAG: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
-; GCN-DAG: s_mov_b32 s10, -1
-; CI-DAG: s_mov_b32 s11, 0xe8f000
-; VI-DAG: s_mov_b32 s11, 0xe80000
-; GFX9-DAG: s_mov_b32 s11, 0xe00000
+; GCN-DAG: s_mov_b32 s4, SCRATCH_RSRC_DWORD0
+; GCN-DAG: s_mov_b32 s5, SCRATCH_RSRC_DWORD1
+; GCN-DAG: s_mov_b32 s6, -1
-; GCN: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, s[8:11], s0 offen
-; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, s[8:11], s0 offen
+; CI-DAG: s_mov_b32 s7, 0xe8f000
+; VI-DAG: s_mov_b32 s7, 0xe80000
+; GFX9-DAG: s_mov_b32 s7, 0xe00000
+
+; GCN: s_add_u32 s4, s4, s0
+; GCN: s_addc_u32 s5, s5, 0
+
+; GCN: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, s[4:7], 0 offen
+; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, s[4:7], 0 offen
; ALL: ; ScratchSize: 32772
define amdgpu_ps void @large_alloca_pixel_shader(i32 %x, i32 %y) #0 {
@@ -25,15 +29,19 @@
}
; ALL-LABEL: {{^}}large_alloca_pixel_shader_inreg:
-; GCN-DAG: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
-; GCN-DAG: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
-; GCN-DAG: s_mov_b32 s10, -1
-; CI-DAG: s_mov_b32 s11, 0xe8f000
-; VI-DAG: s_mov_b32 s11, 0xe80000
-; GFX9-DAG: s_mov_b32 s11, 0xe00000
-
-; GCN: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, s[8:11], s2 offen
-; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, s[8:11], s2 offen
+; GCN-DAG: s_mov_b32 s4, SCRATCH_RSRC_DWORD0
+; GCN-DAG: s_mov_b32 s5, SCRATCH_RSRC_DWORD1
+; GCN-DAG: s_mov_b32 s6, -1
+
+; CI-DAG: s_mov_b32 s7, 0xe8f000
+; VI-DAG: s_mov_b32 s7, 0xe80000
+; GFX9-DAG: s_mov_b32 s7, 0xe00000
+
+; GCN: s_add_u32 s4, s4, s2
+; GCN: s_addc_u32 s5, s5, 0
+
+; GCN: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, s[4:7], 0 offen
+; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, s[4:7], 0 offen
; ALL: ; ScratchSize: 32772
define amdgpu_ps void @large_alloca_pixel_shader_inreg(i32 inreg %x, i32 inreg %y) #0 {
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.implicit.buffer.ptr.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.implicit.buffer.ptr.ll
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.implicit.buffer.ptr.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.implicit.buffer.ptr.ll
@@ -3,7 +3,7 @@
; FIXME: Requires stack object to not assert
; GCN-LABEL: {{^}}test_ps:
; GCN: s_load_dwordx2 s[4:5], s[0:1], 0x0
-; GCN: buffer_store_dword v0, off, s[4:7], s2 offset:4
+; GCN: buffer_store_dword v0, off, s[4:7], 0 offset:4
; GCN: s_load_dword s{{[0-9]+}}, s[0:1], 0x0
; GCN-NEXT: s_waitcnt
; GCN-NEXT: ; return
@@ -18,7 +18,7 @@
; GCN-LABEL: {{^}}test_cs:
; GCN: s_mov_b64 s[4:5], s[0:1]
-; GCN: buffer_store_dword v{{[0-9]+}}, off, s[4:7], s2 offset:4
+; GCN: buffer_store_dword v{{[0-9]+}}, off, s[4:7], 0 offset:4
; GCN: s_load_dword s0, s[0:1], 0x0
define amdgpu_cs i32 @test_cs() #1 {
%alloca = alloca i32, addrspace(5)
diff --git a/llvm/test/CodeGen/AMDGPU/load-hi16.ll b/llvm/test/CodeGen/AMDGPU/load-hi16.ll
--- a/llvm/test/CodeGen/AMDGPU/load-hi16.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-hi16.ll
@@ -531,13 +531,13 @@
; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg_nooff:
; GCN: s_waitcnt
-; GFX900: buffer_load_short_d16_hi v0, off, s[0:3], s33 offset:4094{{$}}
+; GFX900: buffer_load_short_d16_hi v0, off, s[0:3], 0 offset:4094{{$}}
; GFX900: s_waitcnt
; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64
-; NO-D16-HI: buffer_load_ushort v{{[0-9]+}}, off, s[0:3], s33 offset:4094{{$}}
+; NO-D16-HI: buffer_load_ushort v{{[0-9]+}}, off, s[0:3], 0 offset:4094{{$}}
define void @load_private_hi_v2i16_reglo_vreg_nooff(i16 addrspace(5)* byval %in, i16 %reg) #0 {
entry:
%load = load volatile i16, i16 addrspace(5)* inttoptr (i32 4094 to i16 addrspace(5)*)
@@ -549,13 +549,13 @@
; GCN-LABEL: {{^}}load_private_hi_v2f16_reglo_vreg_nooff:
; GCN: s_waitcnt
-; GFX900-NEXT: buffer_load_short_d16_hi v1, off, s[0:3], s33 offset:4094{{$}}
+; GFX900-NEXT: buffer_load_short_d16_hi v1, off, s[0:3], 0 offset:4094{{$}}
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64
-; NO-D16-HI: buffer_load_ushort v{{[0-9]+}}, off, s[0:3], s33 offset:4094{{$}}
+; NO-D16-HI: buffer_load_ushort v{{[0-9]+}}, off, s[0:3], 0 offset:4094{{$}}
define void @load_private_hi_v2f16_reglo_vreg_nooff(half addrspace(5)* %in, half %reg) #0 {
entry:
%load = load volatile half, half addrspace(5)* inttoptr (i32 4094 to half addrspace(5)*)
@@ -649,13 +649,13 @@
; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg_nooff_zexti8:
; GCN: s_waitcnt
-; GFX900-NEXT: buffer_load_ubyte_d16_hi v1, off, s[0:3], s33 offset:4094{{$}}
+; GFX900-NEXT: buffer_load_ubyte_d16_hi v1, off, s[0:3], 0 offset:4094{{$}}
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64
-; NO-D16-HI: buffer_load_ubyte v0, off, s[0:3], s33 offset:4094{{$}}
+; NO-D16-HI: buffer_load_ubyte v0, off, s[0:3], 0 offset:4094{{$}}
define void @load_private_hi_v2i16_reglo_vreg_nooff_zexti8(i8 addrspace(5)* %in, i16 %reg) #0 {
entry:
%load = load volatile i8, i8 addrspace(5)* inttoptr (i32 4094 to i8 addrspace(5)*)
@@ -668,13 +668,13 @@
; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg_nooff_sexti8:
; GCN: s_waitcnt
-; GFX900-NEXT: buffer_load_sbyte_d16_hi v1, off, s[0:3], s33 offset:4094{{$}}
+; GFX900-NEXT: buffer_load_sbyte_d16_hi v1, off, s[0:3], 0 offset:4094{{$}}
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64
-; NO-D16-HI: buffer_load_sbyte v0, off, s[0:3], s33 offset:4094{{$}}
+; NO-D16-HI: buffer_load_sbyte v0, off, s[0:3], 0 offset:4094{{$}}
define void @load_private_hi_v2i16_reglo_vreg_nooff_sexti8(i8 addrspace(5)* %in, i16 %reg) #0 {
entry:
%load = load volatile i8, i8 addrspace(5)* inttoptr (i32 4094 to i8 addrspace(5)*)
@@ -687,13 +687,13 @@
; GCN-LABEL: {{^}}load_private_hi_v2f16_reglo_vreg_nooff_zexti8:
; GCN: s_waitcnt
-; GFX900-NEXT: buffer_load_ubyte_d16_hi v1, off, s[0:3], s33 offset:4094{{$}}
+; GFX900-NEXT: buffer_load_ubyte_d16_hi v1, off, s[0:3], 0 offset:4094{{$}}
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64
-; NO-D16-HI: buffer_load_ubyte v0, off, s[0:3], s33 offset:4094{{$}}
+; NO-D16-HI: buffer_load_ubyte v0, off, s[0:3], 0 offset:4094{{$}}
define void @load_private_hi_v2f16_reglo_vreg_nooff_zexti8(i8 addrspace(5)* %in, half %reg) #0 {
entry:
%load = load volatile i8, i8 addrspace(5)* inttoptr (i32 4094 to i8 addrspace(5)*)
diff --git a/llvm/test/CodeGen/AMDGPU/load-lo16.ll b/llvm/test/CodeGen/AMDGPU/load-lo16.ll
--- a/llvm/test/CodeGen/AMDGPU/load-lo16.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-lo16.ll
@@ -1303,7 +1303,7 @@
; GFX900-LABEL: load_private_lo_v2i16_reglo_vreg_nooff:
; GFX900: ; %bb.0: ; %entry
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT: buffer_load_short_d16 v1, off, s[0:3], s33 offset:4094
+; GFX900-NEXT: buffer_load_short_d16 v1, off, s[0:3], 0 offset:4094
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: global_store_dword v[0:1], v1, off
; GFX900-NEXT: s_waitcnt vmcnt(0)
@@ -1312,7 +1312,7 @@
; GFX906-LABEL: load_private_lo_v2i16_reglo_vreg_nooff:
; GFX906: ; %bb.0: ; %entry
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX906-NEXT: buffer_load_ushort v0, off, s[0:3], s33 offset:4094
+; GFX906-NEXT: buffer_load_ushort v0, off, s[0:3], 0 offset:4094
; GFX906-NEXT: v_mov_b32_e32 v2, 0xffff
; GFX906-NEXT: s_waitcnt vmcnt(0)
; GFX906-NEXT: v_bfi_b32 v0, v2, v0, v1
@@ -1323,7 +1323,7 @@
; GFX803-LABEL: load_private_lo_v2i16_reglo_vreg_nooff:
; GFX803: ; %bb.0: ; %entry
; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX803-NEXT: buffer_load_ushort v0, off, s[0:3], s33 offset:4094
+; GFX803-NEXT: buffer_load_ushort v0, off, s[0:3], 0 offset:4094
; GFX803-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX803-NEXT: s_waitcnt vmcnt(0)
; GFX803-NEXT: v_or_b32_e32 v0, v0, v1
@@ -1342,7 +1342,7 @@
; GFX900-LABEL: load_private_lo_v2i16_reghi_vreg_nooff:
; GFX900: ; %bb.0: ; %entry
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT: buffer_load_short_d16 v1, off, s[0:3], s33 offset:4094
+; GFX900-NEXT: buffer_load_short_d16 v1, off, s[0:3], 0 offset:4094
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: global_store_dword v[0:1], v1, off
; GFX900-NEXT: s_waitcnt vmcnt(0)
@@ -1351,7 +1351,7 @@
; GFX906-LABEL: load_private_lo_v2i16_reghi_vreg_nooff:
; GFX906: ; %bb.0: ; %entry
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX906-NEXT: buffer_load_ushort v0, off, s[0:3], s33 offset:4094
+; GFX906-NEXT: buffer_load_ushort v0, off, s[0:3], 0 offset:4094
; GFX906-NEXT: v_mov_b32_e32 v2, 0xffff
; GFX906-NEXT: s_waitcnt vmcnt(0)
; GFX906-NEXT: v_bfi_b32 v0, v2, v0, v1
@@ -1362,7 +1362,7 @@
; GFX803-LABEL: load_private_lo_v2i16_reghi_vreg_nooff:
; GFX803: ; %bb.0: ; %entry
; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX803-NEXT: buffer_load_ushort v0, off, s[0:3], s33 offset:4094
+; GFX803-NEXT: buffer_load_ushort v0, off, s[0:3], 0 offset:4094
; GFX803-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX803-NEXT: s_waitcnt vmcnt(0)
; GFX803-NEXT: v_or_b32_e32 v0, v0, v1
@@ -1381,7 +1381,7 @@
; GFX900-LABEL: load_private_lo_v2f16_reglo_vreg_nooff:
; GFX900: ; %bb.0: ; %entry
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT: buffer_load_short_d16 v1, off, s[0:3], s33 offset:4094
+; GFX900-NEXT: buffer_load_short_d16 v1, off, s[0:3], 0 offset:4094
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: global_store_dword v[0:1], v1, off
; GFX900-NEXT: s_waitcnt vmcnt(0)
@@ -1390,7 +1390,7 @@
; GFX906-LABEL: load_private_lo_v2f16_reglo_vreg_nooff:
; GFX906: ; %bb.0: ; %entry
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX906-NEXT: buffer_load_ushort v0, off, s[0:3], s33 offset:4094
+; GFX906-NEXT: buffer_load_ushort v0, off, s[0:3], 0 offset:4094
; GFX906-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX906-NEXT: s_waitcnt vmcnt(0)
; GFX906-NEXT: v_and_b32_e32 v0, 0xffff, v0
@@ -1402,7 +1402,7 @@
; GFX803-LABEL: load_private_lo_v2f16_reglo_vreg_nooff:
; GFX803: ; %bb.0: ; %entry
; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX803-NEXT: buffer_load_ushort v0, off, s[0:3], s33 offset:4094
+; GFX803-NEXT: buffer_load_ushort v0, off, s[0:3], 0 offset:4094
; GFX803-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX803-NEXT: s_waitcnt vmcnt(0)
; GFX803-NEXT: v_or_b32_e32 v0, v0, v1
@@ -1504,7 +1504,7 @@
; GFX900-LABEL: load_private_lo_v2i16_reglo_vreg_nooff_zexti8:
; GFX900: ; %bb.0: ; %entry
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT: buffer_load_ubyte_d16 v1, off, s[0:3], s33 offset:4094
+; GFX900-NEXT: buffer_load_ubyte_d16 v1, off, s[0:3], 0 offset:4094
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: global_store_dword v[0:1], v1, off
; GFX900-NEXT: s_waitcnt vmcnt(0)
@@ -1513,7 +1513,7 @@
; GFX906-LABEL: load_private_lo_v2i16_reglo_vreg_nooff_zexti8:
; GFX906: ; %bb.0: ; %entry
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX906-NEXT: buffer_load_ubyte v0, off, s[0:3], s33 offset:4094
+; GFX906-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 offset:4094
; GFX906-NEXT: v_mov_b32_e32 v2, 0xffff
; GFX906-NEXT: s_waitcnt vmcnt(0)
; GFX906-NEXT: v_bfi_b32 v0, v2, v0, v1
@@ -1525,7 +1525,7 @@
; GFX803: ; %bb.0: ; %entry
; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX803-NEXT: v_lshrrev_b32_e32 v0, 16, v1
-; GFX803-NEXT: buffer_load_ubyte v1, off, s[0:3], s33 offset:4094
+; GFX803-NEXT: buffer_load_ubyte v1, off, s[0:3], 0 offset:4094
; GFX803-NEXT: s_mov_b32 s4, 0x5040c00
; GFX803-NEXT: s_waitcnt vmcnt(0)
; GFX803-NEXT: v_perm_b32 v0, v0, v1, s4
@@ -1545,7 +1545,7 @@
; GFX900-LABEL: load_private_lo_v2i16_reglo_vreg_nooff_sexti8:
; GFX900: ; %bb.0: ; %entry
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT: buffer_load_sbyte_d16 v1, off, s[0:3], s33 offset:4094
+; GFX900-NEXT: buffer_load_sbyte_d16 v1, off, s[0:3], 0 offset:4094
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: global_store_dword v[0:1], v1, off
; GFX900-NEXT: s_waitcnt vmcnt(0)
@@ -1554,7 +1554,7 @@
; GFX906-LABEL: load_private_lo_v2i16_reglo_vreg_nooff_sexti8:
; GFX906: ; %bb.0: ; %entry
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX906-NEXT: buffer_load_sbyte v0, off, s[0:3], s33 offset:4094
+; GFX906-NEXT: buffer_load_sbyte v0, off, s[0:3], 0 offset:4094
; GFX906-NEXT: v_mov_b32_e32 v2, 0xffff
; GFX906-NEXT: s_waitcnt vmcnt(0)
; GFX906-NEXT: v_bfi_b32 v0, v2, v0, v1
@@ -1565,7 +1565,7 @@
; GFX803-LABEL: load_private_lo_v2i16_reglo_vreg_nooff_sexti8:
; GFX803: ; %bb.0: ; %entry
; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX803-NEXT: buffer_load_sbyte v0, off, s[0:3], s33 offset:4094
+; GFX803-NEXT: buffer_load_sbyte v0, off, s[0:3], 0 offset:4094
; GFX803-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX803-NEXT: s_waitcnt vmcnt(0)
; GFX803-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
@@ -1585,7 +1585,7 @@
; GFX900-LABEL: load_private_lo_v2f16_reglo_vreg_nooff_zexti8:
; GFX900: ; %bb.0: ; %entry
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT: buffer_load_ubyte_d16 v1, off, s[0:3], s33 offset:4094
+; GFX900-NEXT: buffer_load_ubyte_d16 v1, off, s[0:3], 0 offset:4094
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: global_store_dword v[0:1], v1, off
; GFX900-NEXT: s_waitcnt vmcnt(0)
@@ -1594,7 +1594,7 @@
; GFX906-LABEL: load_private_lo_v2f16_reglo_vreg_nooff_zexti8:
; GFX906: ; %bb.0: ; %entry
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX906-NEXT: buffer_load_ubyte v0, off, s[0:3], s33 offset:4094
+; GFX906-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 offset:4094
; GFX906-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX906-NEXT: s_waitcnt vmcnt(0)
; GFX906-NEXT: v_and_b32_e32 v0, 0xffff, v0
@@ -1607,7 +1607,7 @@
; GFX803: ; %bb.0: ; %entry
; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX803-NEXT: v_lshrrev_b32_e32 v0, 16, v1
-; GFX803-NEXT: buffer_load_ubyte v1, off, s[0:3], s33 offset:4094
+; GFX803-NEXT: buffer_load_ubyte v1, off, s[0:3], 0 offset:4094
; GFX803-NEXT: s_mov_b32 s4, 0x5040c00
; GFX803-NEXT: s_waitcnt vmcnt(0)
; GFX803-NEXT: v_perm_b32 v0, v0, v1, s4
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-load.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-load.ll
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-load.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-load.ll
@@ -447,8 +447,8 @@
}
; GCN-LABEL: {{^}}nontemporal_private_0:
-; GFX89: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], s{{[0-9]+}} offen glc slc{{$}}
-; GFX10: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], s{{[0-9]+}} offen slc{{$}}
+; GFX89: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 offen glc slc{{$}}
+; GFX10: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 offen slc{{$}}
; GFX10: .amdhsa_kernel nontemporal_private_0
; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0
; GFX10CU: .amdhsa_workgroup_processor_mode 0
@@ -462,8 +462,8 @@
}
; GCN-LABEL: {{^}}nontemporal_private_1:
-; GFX89: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], s{{[0-9]+}} offen glc slc{{$}}
-; GFX10: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], s{{[0-9]+}} offen slc{{$}}
+; GFX89: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 offen glc slc{{$}}
+; GFX10: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 offen slc{{$}}
; GFX10: .amdhsa_kernel nontemporal_private_1
; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0
; GFX10CU: .amdhsa_workgroup_processor_mode 0
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-store.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-store.ll
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-store.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-store.ll
@@ -314,8 +314,8 @@
}
; GCN-LABEL: {{^}}nontemporal_private_0:
-; GFX89: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], s{{[0-9]+}} offen glc slc{{$}}
-; GFX10: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], s{{[0-9]+}} offen slc{{$}}
+; GFX89: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 offen glc slc{{$}}
+; GFX10: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 offen slc{{$}}
; GFX10: .amdhsa_kernel nontemporal_private_0
; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0
; GFX10CU: .amdhsa_workgroup_processor_mode 0
@@ -329,8 +329,8 @@
}
; GCN-LABEL: {{^}}nontemporal_private_1:
-; GFX89: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], s{{[0-9]+}} offen glc slc{{$}}
-; GFX10: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], s{{[0-9]+}} offen slc{{$}}
+; GFX89: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 offen glc slc{{$}}
+; GFX10: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 offen slc{{$}}
; GFX10: .amdhsa_kernel nontemporal_private_1
; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0
; GFX10CU: .amdhsa_workgroup_processor_mode 0
diff --git a/llvm/test/CodeGen/AMDGPU/memory_clause.ll b/llvm/test/CodeGen/AMDGPU/memory_clause.ll
--- a/llvm/test/CodeGen/AMDGPU/memory_clause.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory_clause.ll
@@ -115,61 +115,50 @@
; GCN-NEXT: v_and_b32_e32 v2, 0x3ff, v2
; GCN-NEXT: v_lshlrev_b32_e32 v2, 4, v2
; GCN-NEXT: v_add_u32_e32 v0, v0, v2
-; GCN-NEXT: s_nop 0
-; GCN-NEXT: s_nop 0
-; GCN-NEXT: buffer_load_dword v3, v0, s[0:3], s33 offen
-; GCN-NEXT: buffer_load_dword v4, v0, s[0:3], s33 offen offset:4
-; GCN-NEXT: buffer_load_dword v5, v0, s[0:3], s33 offen offset:8
-; GCN-NEXT: buffer_load_dword v6, v0, s[0:3], s33 offen offset:12
-; GCN-NEXT: buffer_load_dword v7, v0, s[0:3], s33 offen offset:16
-; GCN-NEXT: buffer_load_dword v8, v0, s[0:3], s33 offen offset:20
-; GCN-NEXT: buffer_load_dword v9, v0, s[0:3], s33 offen offset:24
-; GCN-NEXT: buffer_load_dword v10, v0, s[0:3], s33 offen offset:28
-; GCN-NEXT: buffer_load_dword v11, v0, s[0:3], s33 offen offset:32
-; GCN-NEXT: buffer_load_dword v12, v0, s[0:3], s33 offen offset:36
-; GCN-NEXT: buffer_load_dword v13, v0, s[0:3], s33 offen offset:40
-; GCN-NEXT: buffer_load_dword v14, v0, s[0:3], s33 offen offset:44
-; GCN-NEXT: buffer_load_dword v15, v0, s[0:3], s33 offen offset:48
-; GCN-NEXT: buffer_load_dword v16, v0, s[0:3], s33 offen offset:52
-; GCN-NEXT: buffer_load_dword v17, v0, s[0:3], s33 offen offset:56
; GCN-NEXT: v_add_u32_e32 v1, v1, v2
; GCN-NEXT: s_nop 0
; GCN-NEXT: s_nop 0
-; GCN-NEXT: buffer_load_dword v0, v0, s[0:3], s33 offen offset:60
-; GCN-NEXT: s_nop 0
-; GCN-NEXT: s_waitcnt vmcnt(15)
-; GCN-NEXT: s_nop 0
-; GCN-NEXT: buffer_store_dword v3, v1, s[0:3], s33 offen
-; GCN-NEXT: s_waitcnt vmcnt(15)
-; GCN-NEXT: buffer_store_dword v4, v1, s[0:3], s33 offen offset:4
-; GCN-NEXT: s_waitcnt vmcnt(15)
-; GCN-NEXT: buffer_store_dword v5, v1, s[0:3], s33 offen offset:8
-; GCN-NEXT: s_waitcnt vmcnt(15)
-; GCN-NEXT: buffer_store_dword v6, v1, s[0:3], s33 offen offset:12
-; GCN-NEXT: s_waitcnt vmcnt(15)
-; GCN-NEXT: buffer_store_dword v7, v1, s[0:3], s33 offen offset:16
-; GCN-NEXT: s_waitcnt vmcnt(15)
-; GCN-NEXT: buffer_store_dword v8, v1, s[0:3], s33 offen offset:20
-; GCN-NEXT: s_waitcnt vmcnt(15)
-; GCN-NEXT: buffer_store_dword v9, v1, s[0:3], s33 offen offset:24
-; GCN-NEXT: s_waitcnt vmcnt(15)
-; GCN-NEXT: buffer_store_dword v10, v1, s[0:3], s33 offen offset:28
-; GCN-NEXT: s_waitcnt vmcnt(15)
-; GCN-NEXT: buffer_store_dword v11, v1, s[0:3], s33 offen offset:32
-; GCN-NEXT: s_waitcnt vmcnt(15)
-; GCN-NEXT: buffer_store_dword v12, v1, s[0:3], s33 offen offset:36
-; GCN-NEXT: s_waitcnt vmcnt(15)
-; GCN-NEXT: buffer_store_dword v13, v1, s[0:3], s33 offen offset:40
-; GCN-NEXT: s_waitcnt vmcnt(15)
-; GCN-NEXT: buffer_store_dword v14, v1, s[0:3], s33 offen offset:44
-; GCN-NEXT: s_waitcnt vmcnt(15)
-; GCN-NEXT: buffer_store_dword v15, v1, s[0:3], s33 offen offset:48
-; GCN-NEXT: s_waitcnt vmcnt(15)
-; GCN-NEXT: buffer_store_dword v16, v1, s[0:3], s33 offen offset:52
-; GCN-NEXT: s_waitcnt vmcnt(15)
-; GCN-NEXT: buffer_store_dword v17, v1, s[0:3], s33 offen offset:56
-; GCN-NEXT: s_waitcnt vmcnt(15)
-; GCN-NEXT: buffer_store_dword v0, v1, s[0:3], s33 offen offset:60
+; GCN-NEXT: buffer_load_dword v6, v0, s[0:3], 0 offen offset:20
+; GCN-NEXT: buffer_load_dword v7, v0, s[0:3], 0 offen offset:24
+; GCN-NEXT: buffer_load_dword v8, v0, s[0:3], 0 offen offset:28
+; GCN-NEXT: buffer_load_dword v9, v0, s[0:3], 0 offen offset:32
+; GCN-NEXT: buffer_load_dword v10, v0, s[0:3], 0 offen offset:36
+; GCN-NEXT: buffer_load_dword v11, v0, s[0:3], 0 offen offset:40
+; GCN-NEXT: buffer_load_dword v12, v0, s[0:3], 0 offen offset:44
+; GCN-NEXT: buffer_load_dword v13, v0, s[0:3], 0 offen offset:48
+; GCN-NEXT: buffer_load_dword v14, v0, s[0:3], 0 offen offset:52
+; GCN-NEXT: buffer_load_dword v15, v0, s[0:3], 0 offen offset:56
+; GCN-NEXT: buffer_load_dword v16, v0, s[0:3], 0 offen offset:60
+; GCN-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen
+; GCN-NEXT: buffer_load_dword v3, v0, s[0:3], 0 offen offset:4
+; GCN-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen offset:8
+; GCN-NEXT: buffer_load_dword v5, v0, s[0:3], 0 offen offset:12
+; GCN-NEXT: s_nop 0
+; GCN-NEXT: s_nop 0
+; GCN-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GCN-NEXT: s_nop 0
+; GCN-NEXT: s_waitcnt vmcnt(4)
+; GCN-NEXT: s_nop 0
+; GCN-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
+; GCN-NEXT: s_waitcnt vmcnt(4)
+; GCN-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen offset:4
+; GCN-NEXT: s_waitcnt vmcnt(4)
+; GCN-NEXT: buffer_store_dword v4, v1, s[0:3], 0 offen offset:8
+; GCN-NEXT: s_waitcnt vmcnt(4)
+; GCN-NEXT: buffer_store_dword v5, v1, s[0:3], 0 offen offset:12
+; GCN-NEXT: s_waitcnt vmcnt(4)
+; GCN-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GCN-NEXT: buffer_store_dword v6, v1, s[0:3], 0 offen offset:20
+; GCN-NEXT: buffer_store_dword v7, v1, s[0:3], 0 offen offset:24
+; GCN-NEXT: buffer_store_dword v8, v1, s[0:3], 0 offen offset:28
+; GCN-NEXT: buffer_store_dword v9, v1, s[0:3], 0 offen offset:32
+; GCN-NEXT: buffer_store_dword v10, v1, s[0:3], 0 offen offset:36
+; GCN-NEXT: buffer_store_dword v11, v1, s[0:3], 0 offen offset:40
+; GCN-NEXT: buffer_store_dword v12, v1, s[0:3], 0 offen offset:44
+; GCN-NEXT: buffer_store_dword v13, v1, s[0:3], 0 offen offset:48
+; GCN-NEXT: buffer_store_dword v14, v1, s[0:3], 0 offen offset:52
+; GCN-NEXT: buffer_store_dword v15, v1, s[0:3], 0 offen offset:56
+; GCN-NEXT: buffer_store_dword v16, v1, s[0:3], 0 offen offset:60
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
bb:
diff --git a/llvm/test/CodeGen/AMDGPU/mesa3d.ll b/llvm/test/CodeGen/AMDGPU/mesa3d.ll
--- a/llvm/test/CodeGen/AMDGPU/mesa3d.ll
+++ b/llvm/test/CodeGen/AMDGPU/mesa3d.ll
@@ -5,7 +5,7 @@
; GCN-DAG: s_mov_b32 s6, -1{{$}}
; GCN-DAG: s_mov_b32 s7, 0xe8f000
; GCN-DAG: v_mov_b32_e32 [[V:v[0-9]+]], 2
-; GCN: buffer_store_dword [[V]], off, s[4:7], s2 offset:4
+; GCN: buffer_store_dword [[V]], off, s[4:7], 0 offset:4
define amdgpu_ps void @scratch_ps(i32 addrspace(1)* %out, i32 %in) {
entry:
%alloca = alloca i32, addrspace(5)
diff --git a/llvm/test/CodeGen/AMDGPU/mir-print-dead-csr-fi.mir b/llvm/test/CodeGen/AMDGPU/mir-print-dead-csr-fi.mir
--- a/llvm/test/CodeGen/AMDGPU/mir-print-dead-csr-fi.mir
+++ b/llvm/test/CodeGen/AMDGPU/mir-print-dead-csr-fi.mir
@@ -15,7 +15,6 @@
maxAlignment: 4
machineFunctionInfo:
scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3'
- scratchWaveOffsetReg: '$sgpr4'
frameOffsetReg: '$sgpr5'
stackPtrOffsetReg: '$sgpr32'
body: |
diff --git a/llvm/test/CodeGen/AMDGPU/misched-killflags.mir b/llvm/test/CodeGen/AMDGPU/misched-killflags.mir
--- a/llvm/test/CodeGen/AMDGPU/misched-killflags.mir
+++ b/llvm/test/CodeGen/AMDGPU/misched-killflags.mir
@@ -6,7 +6,6 @@
machineFunctionInfo:
isEntryFunction: true
scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3'
- scratchWaveOffsetReg: '$sgpr7'
frameOffsetReg: '$sgpr7'
body: |
bb.0:
diff --git a/llvm/test/CodeGen/AMDGPU/mubuf-offset-private.ll b/llvm/test/CodeGen/AMDGPU/mubuf-offset-private.ll
--- a/llvm/test/CodeGen/AMDGPU/mubuf-offset-private.ll
+++ b/llvm/test/CodeGen/AMDGPU/mubuf-offset-private.ll
@@ -5,49 +5,49 @@
; Test addressing modes when the scratch base is not a frame index.
; GCN-LABEL: {{^}}store_private_offset_i8: -; GCN: buffer_store_byte v{{[0-9]+}}, off, s[4:7], s2 offset:8 +; GCN: buffer_store_byte v{{[0-9]+}}, off, s[4:7], 0 offset:8 define amdgpu_kernel void @store_private_offset_i8() #0 { store volatile i8 5, i8 addrspace(5)* inttoptr (i32 8 to i8 addrspace(5)*) ret void } ; GCN-LABEL: {{^}}store_private_offset_i16: -; GCN: buffer_store_short v{{[0-9]+}}, off, s[4:7], s2 offset:8 +; GCN: buffer_store_short v{{[0-9]+}}, off, s[4:7], 0 offset:8 define amdgpu_kernel void @store_private_offset_i16() #0 { store volatile i16 5, i16 addrspace(5)* inttoptr (i32 8 to i16 addrspace(5)*) ret void } ; GCN-LABEL: {{^}}store_private_offset_i32: -; GCN: buffer_store_dword v{{[0-9]+}}, off, s[4:7], s2 offset:8 +; GCN: buffer_store_dword v{{[0-9]+}}, off, s[4:7], 0 offset:8 define amdgpu_kernel void @store_private_offset_i32() #0 { store volatile i32 5, i32 addrspace(5)* inttoptr (i32 8 to i32 addrspace(5)*) ret void } ; GCN-LABEL: {{^}}store_private_offset_v2i32: -; GCN: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s[4:7], s2 offset:8 +; GCN: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s[4:7], 0 offset:8 define amdgpu_kernel void @store_private_offset_v2i32() #0 { store volatile <2 x i32> , <2 x i32> addrspace(5)* inttoptr (i32 8 to <2 x i32> addrspace(5)*) ret void } ; GCN-LABEL: {{^}}store_private_offset_v4i32: -; GCN: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s[4:7], s2 offset:8 +; GCN: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s[4:7], 0 offset:8 define amdgpu_kernel void @store_private_offset_v4i32() #0 { store volatile <4 x i32> , <4 x i32> addrspace(5)* inttoptr (i32 8 to <4 x i32> addrspace(5)*) ret void } ; GCN-LABEL: {{^}}load_private_offset_i8: -; GCN: buffer_load_ubyte v{{[0-9]+}}, off, s[4:7], s2 offset:8 +; GCN: buffer_load_ubyte v{{[0-9]+}}, off, s[4:7], 0 offset:8 define amdgpu_kernel void @load_private_offset_i8() #0 { %load = load volatile i8, i8 addrspace(5)* inttoptr (i32 8 to i8 addrspace(5)*) ret void } ; GCN-LABEL: {{^}}sextload_private_offset_i8: -; GCN: buffer_load_sbyte v{{[0-9]+}}, off, s[4:7], s8 offset:8 +; GCN: buffer_load_sbyte v{{[0-9]+}}, off, s[4:7], 0 offset:8 define amdgpu_kernel void @sextload_private_offset_i8(i32 addrspace(1)* %out) #0 { %load = load volatile i8, i8 addrspace(5)* inttoptr (i32 8 to i8 addrspace(5)*) %sextload = sext i8 %load to i32 @@ -56,7 +56,7 @@ } ; GCN-LABEL: {{^}}zextload_private_offset_i8: -; GCN: buffer_load_ubyte v{{[0-9]+}}, off, s[4:7], s8 offset:8 +; GCN: buffer_load_ubyte v{{[0-9]+}}, off, s[4:7], 0 offset:8 define amdgpu_kernel void @zextload_private_offset_i8(i32 addrspace(1)* %out) #0 { %load = load volatile i8, i8 addrspace(5)* inttoptr (i32 8 to i8 addrspace(5)*) %zextload = zext i8 %load to i32 @@ -65,14 +65,14 @@ } ; GCN-LABEL: {{^}}load_private_offset_i16: -; GCN: buffer_load_ushort v{{[0-9]+}}, off, s[4:7], s2 offset:8 +; GCN: buffer_load_ushort v{{[0-9]+}}, off, s[4:7], 0 offset:8 define amdgpu_kernel void @load_private_offset_i16() #0 { %load = load volatile i16, i16 addrspace(5)* inttoptr (i32 8 to i16 addrspace(5)*) ret void } ; GCN-LABEL: {{^}}sextload_private_offset_i16: -; GCN: buffer_load_sshort v{{[0-9]+}}, off, s[4:7], s8 offset:8 +; GCN: buffer_load_sshort v{{[0-9]+}}, off, s[4:7], 0 offset:8 define amdgpu_kernel void @sextload_private_offset_i16(i32 addrspace(1)* %out) #0 { %load = load volatile i16, i16 addrspace(5)* inttoptr (i32 8 to i16 addrspace(5)*) %sextload = sext i16 %load to i32 @@ -81,7 +81,7 @@ } ; GCN-LABEL: 
{{^}}zextload_private_offset_i16: -; GCN: buffer_load_ushort v{{[0-9]+}}, off, s[4:7], s8 offset:8 +; GCN: buffer_load_ushort v{{[0-9]+}}, off, s[4:7], 0 offset:8 define amdgpu_kernel void @zextload_private_offset_i16(i32 addrspace(1)* %out) #0 { %load = load volatile i16, i16 addrspace(5)* inttoptr (i32 8 to i16 addrspace(5)*) %zextload = zext i16 %load to i32 @@ -90,28 +90,28 @@ } ; GCN-LABEL: {{^}}load_private_offset_i32: -; GCN: buffer_load_dword v{{[0-9]+}}, off, s[4:7], s2 offset:8 +; GCN: buffer_load_dword v{{[0-9]+}}, off, s[4:7], 0 offset:8 define amdgpu_kernel void @load_private_offset_i32() #0 { %load = load volatile i32, i32 addrspace(5)* inttoptr (i32 8 to i32 addrspace(5)*) ret void } ; GCN-LABEL: {{^}}load_private_offset_v2i32: -; GCN: buffer_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s[4:7], s2 offset:8 +; GCN: buffer_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s[4:7], 0 offset:8 define amdgpu_kernel void @load_private_offset_v2i32() #0 { %load = load volatile <2 x i32>, <2 x i32> addrspace(5)* inttoptr (i32 8 to <2 x i32> addrspace(5)*) ret void } ; GCN-LABEL: {{^}}load_private_offset_v4i32: -; GCN: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s[4:7], s2 offset:8 +; GCN: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s[4:7], 0 offset:8 define amdgpu_kernel void @load_private_offset_v4i32() #0 { %load = load volatile <4 x i32>, <4 x i32> addrspace(5)* inttoptr (i32 8 to <4 x i32> addrspace(5)*) ret void } ; GCN-LABEL: {{^}}store_private_offset_i8_max_offset: -; GCN: buffer_store_byte v{{[0-9]+}}, off, s[4:7], s2 offset:4095 +; GCN: buffer_store_byte v{{[0-9]+}}, off, s[4:7], 0 offset:4095 define amdgpu_kernel void @store_private_offset_i8_max_offset() #0 { store volatile i8 5, i8 addrspace(5)* inttoptr (i32 4095 to i8 addrspace(5)*) ret void @@ -119,7 +119,7 @@ ; GCN-LABEL: {{^}}store_private_offset_i8_max_offset_plus1: ; GCN: v_mov_b32_e32 [[OFFSET:v[0-9]+]], 0x1000 -; GCN: buffer_store_byte v{{[0-9]+}}, [[OFFSET]], s[4:7], s2 offen{{$}} +; GCN: buffer_store_byte v{{[0-9]+}}, [[OFFSET]], s[4:7], 0 offen{{$}} define amdgpu_kernel void @store_private_offset_i8_max_offset_plus1() #0 { store volatile i8 5, i8 addrspace(5)* inttoptr (i32 4096 to i8 addrspace(5)*) ret void @@ -127,7 +127,7 @@ ; GCN-LABEL: {{^}}store_private_offset_i8_max_offset_plus2: ; GCN: v_mov_b32_e32 [[OFFSET:v[0-9]+]], 0x1000 -; GCN: buffer_store_byte v{{[0-9]+}}, [[OFFSET]], s[4:7], s2 offen offset:1{{$}} +; GCN: buffer_store_byte v{{[0-9]+}}, [[OFFSET]], s[4:7], 0 offen offset:1{{$}} define amdgpu_kernel void @store_private_offset_i8_max_offset_plus2() #0 { store volatile i8 5, i8 addrspace(5)* inttoptr (i32 4097 to i8 addrspace(5)*) ret void @@ -139,10 +139,10 @@ ; GCN-LABEL: {{^}}store_private_unknown_bits_vaddr: ; SICIVI: v_add_{{i|u}}32_e32 [[ADDR0:v[0-9]+]], vcc, 4 ; SICIVI: v_add_{{i|u}}32_e32 [[ADDR1:v[0-9]+]], vcc, 32, [[ADDR0]] -; SICIVI: buffer_store_dword v{{[0-9]+}}, [[ADDR1]], s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen{{$}} +; SICIVI: buffer_store_dword v{{[0-9]+}}, [[ADDR1]], s{{\[[0-9]+:[0-9]+\]}}, 0 offen{{$}} ; GFX9: v_add_u32_e32 [[ADDR:v[0-9]+]], 4, -; GFX9: buffer_store_dword v{{[0-9]+}}, [[ADDR]], s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen offset:32 +; GFX9: buffer_store_dword v{{[0-9]+}}, [[ADDR]], s{{\[[0-9]+:[0-9]+\]}}, 0 offen offset:32 define amdgpu_kernel void @store_private_unknown_bits_vaddr() #0 { %alloca = alloca [16 x i32], align 4, addrspace(5) %vaddr = load volatile i32, i32 addrspace(1)* undef diff --git a/llvm/test/CodeGen/AMDGPU/optimize-exec-masking-pre-ra.mir 
b/llvm/test/CodeGen/AMDGPU/optimize-exec-masking-pre-ra.mir --- a/llvm/test/CodeGen/AMDGPU/optimize-exec-masking-pre-ra.mir +++ b/llvm/test/CodeGen/AMDGPU/optimize-exec-masking-pre-ra.mir @@ -9,7 +9,6 @@ machineFunctionInfo: isEntryFunction: true scratchRSrcReg: '$sgpr96_sgpr97_sgpr98_sgpr99' - scratchWaveOffsetReg: '$sgpr101' frameOffsetReg: '$sgpr101' body: | ; GCN-LABEL: name: exec_src1_is_not_copy diff --git a/llvm/test/CodeGen/AMDGPU/partial-sgpr-to-vgpr-spills.ll b/llvm/test/CodeGen/AMDGPU/partial-sgpr-to-vgpr-spills.ll --- a/llvm/test/CodeGen/AMDGPU/partial-sgpr-to-vgpr-spills.ll +++ b/llvm/test/CodeGen/AMDGPU/partial-sgpr-to-vgpr-spills.ll @@ -412,22 +412,23 @@ ; GCN-NEXT: v_writelane_b32 v0, s18, 62 ; GCN-NEXT: v_writelane_b32 v0, s19, 63 -; GCN: v_readlane_b32 s4, v0, 48 -; GCN-NEXT: v_readlane_b32 s5, v0, 49 -; GCN-NEXT: v_readlane_b32 s6, v0, 50 -; GCN-NEXT: v_readlane_b32 s7, v0, 51 -; GCN-NEXT: v_readlane_b32 s8, v0, 52 -; GCN-NEXT: v_readlane_b32 s9, v0, 53 -; GCN-NEXT: v_readlane_b32 s10, v0, 54 -; GCN-NEXT: v_readlane_b32 s11, v0, 55 -; GCN-NEXT: v_readlane_b32 s12, v0, 56 -; GCN-NEXT: v_readlane_b32 s13, v0, 57 -; GCN-NEXT: v_readlane_b32 s14, v0, 58 -; GCN-NEXT: v_readlane_b32 s15, v0, 59 -; GCN-NEXT: v_readlane_b32 s16, v0, 60 -; GCN-NEXT: v_readlane_b32 s17, v0, 61 -; GCN-NEXT: v_readlane_b32 s18, v0, 62 -; GCN-NEXT: v_readlane_b32 s19, v0, 63 +; GCN: v_readlane_b32 s0, v0, 48 +; GCN-NEXT: v_readlane_b32 s1, v0, 49 +; GCN-NEXT: v_readlane_b32 s2, v0, 50 +; GCN-NEXT: v_readlane_b32 s3, v0, 51 +; GCN-NEXT: v_readlane_b32 s4, v0, 52 +; GCN-NEXT: v_readlane_b32 s5, v0, 53 +; GCN-NEXT: v_readlane_b32 s6, v0, 54 +; GCN-NEXT: v_readlane_b32 s7, v0, 55 +; GCN-NEXT: v_readlane_b32 s8, v0, 56 +; GCN-NEXT: v_readlane_b32 s9, v0, 57 +; GCN-NEXT: v_readlane_b32 s10, v0, 58 +; GCN-NEXT: v_readlane_b32 s11, v0, 59 +; GCN-NEXT: v_readlane_b32 s12, v0, 60 +; GCN-NEXT: v_readlane_b32 s13, v0, 61 +; GCN-NEXT: v_readlane_b32 s14, v0, 62 +; GCN-NEXT: v_readlane_b32 s15, v0, 63 +; GCN: use s[0:15] define amdgpu_kernel void @split_sgpr_spill_2_vgprs(i32 addrspace(1)* %out, i32 %in) #1 { %wide.sgpr0 = call <16 x i32> asm sideeffect "; def $0", "=s" () #0 %wide.sgpr1 = call <16 x i32> asm sideeffect "; def $0", "=s" () #0 @@ -512,8 +513,8 @@ ; GCN-NEXT: v_writelane_b32 v23, s{{[[0-9]+}}, 48 ; GCN-NEXT: v_writelane_b32 v23, s{{[[0-9]+}}, 49 -; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} -; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} +; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 +; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 ; GCN: s_cbranch_scc1 @@ -572,12 +573,12 @@ ; GCN-NEXT: v_readlane_b32 s[[USE_TMP_HI:[0-9]+]], v23, 31 ; GCN: ; use s{{\[}}[[USE_TMP_LO]]:[[USE_TMP_HI]]{{\]}} -; GCN: buffer_load_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} -; GCN: buffer_load_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} - -; GCN: v_readfirstlane_b32 s1, v0 +; GCN: buffer_load_dword [[V_TMP:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 +; GCN: v_readfirstlane_b32 s[[USE_TMP_LO:[0-9]+]], [[V_TMP]] +; GCN: buffer_load_dword [[V_TMP:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 +; GCN: v_readfirstlane_b32 s[[USE_TMP_HI:[0-9]+]], [[V_TMP]] ; GCN: ;;#ASMSTART -; GCN: ; use s[0:1] +; GCN: ; use s{{\[}}[[USE_TMP_LO]]:[[USE_TMP_HI]]{{\]}} define amdgpu_kernel void @no_vgprs_last_sgpr_spill(i32 addrspace(1)* %out, i32 %in) #1 { call void asm sideeffect "", "~{v[0:7]}" () #0 call 
void asm sideeffect "", "~{v[8:15]}" () #0 diff --git a/llvm/test/CodeGen/AMDGPU/pei-reg-scavenger-position.mir b/llvm/test/CodeGen/AMDGPU/pei-reg-scavenger-position.mir --- a/llvm/test/CodeGen/AMDGPU/pei-reg-scavenger-position.mir +++ b/llvm/test/CodeGen/AMDGPU/pei-reg-scavenger-position.mir @@ -17,28 +17,30 @@ machineFunctionInfo: isEntryFunction: true scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3 - scratchWaveOffsetReg: $sgpr33 - frameOffsetReg: $sgpr5 stackPtrOffsetReg: $sgpr32 + argumentInfo: + privateSegmentWaveByteOffset: { reg: '$sgpr4' } body: | ; CHECK-LABEL: name: scavenge_register_position ; CHECK: bb.0: ; CHECK: successors: %bb.1(0x80000000) - ; CHECK: liveins: $sgpr33, $sgpr0_sgpr1_sgpr2_sgpr3 - ; CHECK: $sgpr4 = S_ADD_U32 $sgpr32, 524288, implicit-def $scc + ; CHECK: liveins: $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4 + ; CHECK: $sgpr0 = S_ADD_U32 $sgpr0, killed $sgpr4, implicit-def $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 + ; CHECK: $sgpr1 = S_ADDC_U32 $sgpr1, 0, implicit-def $scc, implicit $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 + ; CHECK: $sgpr4 = S_MOV_B32 524288 ; CHECK: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, killed $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0, align 8192, addrspace 5) ; CHECK: S_BRANCH %bb.1 ; CHECK: bb.1: ; CHECK: liveins: $sgpr0_sgpr1_sgpr2_sgpr3 - ; CHECK: $sgpr4 = S_ADD_U32 $sgpr32, 524288, implicit-def $scc + ; CHECK: $sgpr4 = S_MOV_B32 524288 ; CHECK: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, killed $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0, align 8192, addrspace 5) ; CHECK: S_ENDPGM 0, implicit $vgpr0 bb.0: - $vgpr0 = SI_SPILL_V32_RESTORE %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, implicit $exec :: (load 4 from %stack.0, addrspace 5) + $vgpr0 = SI_SPILL_V32_RESTORE %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, implicit $exec :: (load 4 from %stack.0, addrspace 5) S_BRANCH %bb.1 bb.1: - $vgpr0 = SI_SPILL_V32_RESTORE %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, implicit $exec :: (load 4 from %stack.0, addrspace 5) + $vgpr0 = SI_SPILL_V32_RESTORE %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, implicit $exec :: (load 4 from %stack.0, addrspace 5) S_ENDPGM 0, implicit $vgpr0 ... 
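The pei-reg-scavenger-position.mir update above captures the new entry-function convention the patch moves to: the wave's scratch offset now arrives as an ordinary SGPR argument (privateSegmentWaveByteOffset, $sgpr4 here) and is added into the 64-bit base of the scratch resource descriptor once in the prologue, so later scratch accesses take a literal 0 soffset instead of a reserved scratch wave offset register. A minimal IR sketch of the pattern these MIR checks correspond to (not part of the patch; the kernel name, layout, and constants are made up for illustration):

; Hedged sketch: in an entry function, a private (scratch) access is expected
; to select to a MUBUF instruction whose soffset operand is the literal 0.
define amdgpu_kernel void @soffset_zero_sketch() {
  %buf = alloca [4 x i32], align 4, addrspace(5)
  %p = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %buf, i32 0, i32 2
  ; Illustrative expectation, in the spirit of the checks above (not a
  ; committed check line):
  ;   buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], 0 offset:{{[0-9]+}}
  store volatile i32 7, i32 addrspace(5)* %p
  ret void
}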
diff --git a/llvm/test/CodeGen/AMDGPU/pei-scavenge-sgpr-carry-out.mir b/llvm/test/CodeGen/AMDGPU/pei-scavenge-sgpr-carry-out.mir --- a/llvm/test/CodeGen/AMDGPU/pei-scavenge-sgpr-carry-out.mir +++ b/llvm/test/CodeGen/AMDGPU/pei-scavenge-sgpr-carry-out.mir @@ -19,8 +19,7 @@ machineFunctionInfo: isEntryFunction: false scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3 - scratchWaveOffsetReg: $sgpr34 - frameOffsetReg: $sgpr33 + frameOffsetReg: $sgpr34 stackPtrOffsetReg: $sgpr32 body: | @@ -29,21 +28,19 @@ ; CHECK-LABEL: name: scavenge_sgpr_pei_no_sgprs ; CHECK: liveins: $vgpr1 - ; CHECK: $sgpr27 = frame-setup COPY $sgpr33 + ; CHECK: $sgpr27 = frame-setup COPY $sgpr34 ; CHECK: $sgpr4 = frame-setup S_ADD_U32 $sgpr32, 524224, implicit-def $scc - ; CHECK: $sgpr33 = frame-setup S_AND_B32 killed $sgpr4, 4294443008, implicit-def $scc + ; CHECK: $sgpr34 = frame-setup S_AND_B32 killed $sgpr4, 4294443008, implicit-def $scc ; CHECK: $sgpr32 = frame-setup S_ADD_U32 $sgpr32, 1572864, implicit-def $scc ; CHECK: S_NOP 0, implicit-def $sgpr4, implicit-def $sgpr5, implicit-def $sgpr6, implicit-def $sgpr7, implicit-def $sgpr8, implicit-def $sgpr9, implicit-def $sgpr10, implicit-def $sgpr11, implicit-def $sgpr12, implicit-def $sgpr13, implicit-def $sgpr14, implicit-def $sgpr15, implicit-def $sgpr16, implicit-def $sgpr17, implicit-def $sgpr18, implicit-def $sgpr19, implicit-def $sgpr20, implicit-def $sgpr21, implicit-def $sgpr22, implicit-def $sgpr23, implicit-def $sgpr24, implicit-def $sgpr25, implicit-def $sgpr26, implicit-def $sgpr17, implicit-def $sgpr28, implicit-def $sgpr29, implicit-def $sgpr30, implicit-def $sgpr31, implicit-def $vcc - ; CHECK: $sgpr33 = S_SUB_U32 $sgpr33, $sgpr34, implicit-def $scc - ; CHECK: $sgpr33 = S_LSHR_B32 killed $sgpr33, 6, implicit-def $scc - ; CHECK: $sgpr33 = S_ADD_U32 killed $sgpr33, 8192, implicit-def $scc - ; CHECK: $vgpr2 = COPY killed $sgpr33 - ; CHECK: $sgpr33 = S_SUB_U32 killed $sgpr33, 8192, implicit-def $scc - ; CHECK: $sgpr33 = S_LSHL_B32 killed $sgpr33, 6, implicit-def $scc - ; CHECK: $sgpr33 = S_ADD_U32 $sgpr33, $sgpr34, implicit-def $scc + ; CHECK: $sgpr34 = S_LSHR_B32 $sgpr34, 6, implicit-def $scc + ; CHECK: $sgpr34 = S_ADD_U32 killed $sgpr34, 8192, implicit-def $scc + ; CHECK: $vgpr2 = COPY killed $sgpr34 + ; CHECK: $sgpr34 = S_SUB_U32 killed $sgpr34, 8192, implicit-def $scc + ; CHECK: $sgpr34 = S_LSHL_B32 $sgpr34, 6, implicit-def $scc ; CHECK: $vgpr0 = V_OR_B32_e32 killed $vgpr2, $vgpr1, implicit $exec, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7, implicit $sgpr8, implicit $sgpr9, implicit $sgpr10, implicit $sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $sgpr16, implicit $sgpr17, implicit $sgpr18, implicit $sgpr19, implicit $sgpr20, implicit $sgpr21, implicit $sgpr22, implicit $sgpr23, implicit $sgpr24, implicit $sgpr25, implicit $sgpr26, implicit $sgpr17, implicit $sgpr28, implicit $sgpr29, implicit $sgpr30, implicit $sgpr31 ; CHECK: $sgpr32 = frame-destroy S_SUB_U32 $sgpr32, 1572864, implicit-def $scc - ; CHECK: $sgpr33 = frame-setup COPY $sgpr27 + ; CHECK: $sgpr34 = frame-setup COPY $sgpr27 ; CHECK: S_ENDPGM 0, implicit $vcc S_NOP 0, implicit-def $sgpr4, implicit-def $sgpr5, implicit-def $sgpr6, implicit-def $sgpr7, implicit-def $sgpr8, implicit-def $sgpr9, implicit-def $sgpr10, implicit-def $sgpr11, implicit-def $sgpr12, implicit-def $sgpr13, implicit-def $sgpr14, implicit-def $sgpr15, implicit-def $sgpr16, implicit-def $sgpr17, implicit-def $sgpr18, implicit-def $sgpr19, implicit-def $sgpr20, 
implicit-def $sgpr21, implicit-def $sgpr22, implicit-def $sgpr23, implicit-def $sgpr24, implicit-def $sgpr25, implicit-def $sgpr26, implicit-def $sgpr17, implicit-def $sgpr28, implicit-def $sgpr29, implicit-def $sgpr30, implicit-def $sgpr31, implicit-def $vcc $vgpr0 = V_OR_B32_e32 %stack.1, $vgpr1, implicit $exec, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7, implicit $sgpr8, implicit $sgpr9, implicit $sgpr10, implicit $sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $sgpr16, implicit $sgpr17, implicit $sgpr18, implicit $sgpr19, implicit $sgpr20, implicit $sgpr21, implicit $sgpr22, implicit $sgpr23, implicit $sgpr24, implicit $sgpr25, implicit $sgpr26, implicit $sgpr17, implicit $sgpr28, implicit $sgpr29, implicit $sgpr30, implicit $sgpr31 @@ -64,8 +61,7 @@ machineFunctionInfo: isEntryFunction: false scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3 - scratchWaveOffsetReg: $sgpr34 - frameOffsetReg: $sgpr33 + frameOffsetReg: $sgpr34 stackPtrOffsetReg: $sgpr32 body: | @@ -74,18 +70,17 @@ ; CHECK-LABEL: name: scavenge_sgpr_pei_one_sgpr ; CHECK: liveins: $vgpr1 - ; CHECK: $sgpr27 = frame-setup COPY $sgpr33 + ; CHECK: $sgpr27 = frame-setup COPY $sgpr34 ; CHECK: $sgpr4 = frame-setup S_ADD_U32 $sgpr32, 524224, implicit-def $scc - ; CHECK: $sgpr33 = frame-setup S_AND_B32 killed $sgpr4, 4294443008, implicit-def $scc + ; CHECK: $sgpr34 = frame-setup S_AND_B32 killed $sgpr4, 4294443008, implicit-def $scc ; CHECK: $sgpr32 = frame-setup S_ADD_U32 $sgpr32, 1572864, implicit-def $scc ; CHECK: S_NOP 0, implicit-def $sgpr4, implicit-def $sgpr5, implicit-def $sgpr6, implicit-def $sgpr7, implicit-def $sgpr8, implicit-def $sgpr9, implicit-def $sgpr10, implicit-def $sgpr11, implicit-def $sgpr12, implicit-def $sgpr13, implicit-def $sgpr14, implicit-def $sgpr15, implicit-def $sgpr16, implicit-def $sgpr17, implicit-def $sgpr18, implicit-def $sgpr19, implicit-def $sgpr20, implicit-def $sgpr21, implicit-def $sgpr22, implicit-def $sgpr23, implicit-def $sgpr24, implicit-def $sgpr25, implicit-def $sgpr26, implicit-def $sgpr17, implicit-def $sgpr28, implicit-def $sgpr30, implicit-def $sgpr31, implicit-def $vcc - ; CHECK: $sgpr29 = S_SUB_U32 $sgpr33, $sgpr34, implicit-def $scc - ; CHECK: $sgpr29 = S_LSHR_B32 killed $sgpr29, 6, implicit-def $scc + ; CHECK: $sgpr29 = S_LSHR_B32 $sgpr34, 6, implicit-def $scc ; CHECK: $sgpr29 = S_ADD_U32 killed $sgpr29, 8192, implicit-def $scc ; CHECK: $vgpr2 = COPY killed $sgpr29 ; CHECK: $vgpr0 = V_OR_B32_e32 killed $vgpr2, $vgpr1, implicit $exec, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7, implicit $sgpr8, implicit $sgpr9, implicit $sgpr10, implicit $sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $sgpr16, implicit $sgpr17, implicit $sgpr18, implicit $sgpr19, implicit $sgpr20, implicit $sgpr21, implicit $sgpr22, implicit $sgpr23, implicit $sgpr24, implicit $sgpr25, implicit $sgpr26, implicit $sgpr17, implicit $sgpr28, implicit $sgpr31 ; CHECK: $sgpr32 = frame-destroy S_SUB_U32 $sgpr32, 1572864, implicit-def $scc - ; CHECK: $sgpr33 = frame-setup COPY $sgpr27 + ; CHECK: $sgpr34 = frame-setup COPY $sgpr27 ; CHECK: S_ENDPGM 0, implicit $vcc S_NOP 0, implicit-def $sgpr4, implicit-def $sgpr5, implicit-def $sgpr6, implicit-def $sgpr7, implicit-def $sgpr8, implicit-def $sgpr9, implicit-def $sgpr10, implicit-def $sgpr11, implicit-def $sgpr12, implicit-def $sgpr13, implicit-def $sgpr14, implicit-def $sgpr15, implicit-def $sgpr16, implicit-def $sgpr17, implicit-def $sgpr18, 
implicit-def $sgpr19, implicit-def $sgpr20, implicit-def $sgpr21, implicit-def $sgpr22, implicit-def $sgpr23, implicit-def $sgpr24, implicit-def $sgpr25, implicit-def $sgpr26, implicit-def $sgpr17, implicit-def $sgpr28, implicit-def $sgpr30, implicit-def $sgpr31, implicit-def $vcc $vgpr0 = V_OR_B32_e32 %stack.1, $vgpr1, implicit $exec, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7, implicit $sgpr8, implicit $sgpr9, implicit $sgpr10, implicit $sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $sgpr16, implicit $sgpr17, implicit $sgpr18, implicit $sgpr19, implicit $sgpr20, implicit $sgpr21, implicit $sgpr22, implicit $sgpr23, implicit $sgpr24, implicit $sgpr25, implicit $sgpr26, implicit $sgpr17, implicit $sgpr28, implicit $sgpr31 @@ -106,8 +101,7 @@ machineFunctionInfo: isEntryFunction: false scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3 - scratchWaveOffsetReg: $sgpr34 - frameOffsetReg: $sgpr33 + frameOffsetReg: $sgpr34 stackPtrOffsetReg: $sgpr32 body: | @@ -116,18 +110,17 @@ ; CHECK-LABEL: name: scavenge_sgpr_pei_one_sgpr_64 ; CHECK: liveins: $vgpr1 - ; CHECK: $sgpr27 = frame-setup COPY $sgpr33 + ; CHECK: $sgpr27 = frame-setup COPY $sgpr34 ; CHECK: $sgpr4 = frame-setup S_ADD_U32 $sgpr32, 524224, implicit-def $scc - ; CHECK: $sgpr33 = frame-setup S_AND_B32 killed $sgpr4, 4294443008, implicit-def $scc + ; CHECK: $sgpr34 = frame-setup S_AND_B32 killed $sgpr4, 4294443008, implicit-def $scc ; CHECK: $sgpr32 = frame-setup S_ADD_U32 $sgpr32, 1572864, implicit-def $scc ; CHECK: S_NOP 0, implicit-def $sgpr4, implicit-def $sgpr5, implicit-def $sgpr6, implicit-def $sgpr7, implicit-def $sgpr8, implicit-def $sgpr9, implicit-def $sgpr10, implicit-def $sgpr11, implicit-def $sgpr12, implicit-def $sgpr13, implicit-def $sgpr14, implicit-def $sgpr15, implicit-def $sgpr16, implicit-def $sgpr17, implicit-def $sgpr18, implicit-def $sgpr19, implicit-def $sgpr20, implicit-def $sgpr21, implicit-def $sgpr22, implicit-def $sgpr23, implicit-def $sgpr24, implicit-def $sgpr25, implicit-def $sgpr26, implicit-def $sgpr17, implicit-def $sgpr30, implicit-def $sgpr31, implicit-def $vcc - ; CHECK: $sgpr28 = S_SUB_U32 $sgpr33, $sgpr34, implicit-def $scc - ; CHECK: $vgpr3 = V_LSHRREV_B32_e64 6, killed $sgpr28, implicit $exec + ; CHECK: $vgpr3 = V_LSHRREV_B32_e64 6, $sgpr34, implicit $exec ; CHECK: $sgpr28 = S_MOV_B32 8192 ; CHECK: $vgpr2, dead $sgpr28_sgpr29 = V_ADD_I32_e64 killed $sgpr28, killed $vgpr3, 0, implicit $exec ; CHECK: $vgpr0 = V_OR_B32_e32 killed $vgpr2, $vgpr1, implicit $exec, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7, implicit $sgpr8, implicit $sgpr9, implicit $sgpr10, implicit $sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $sgpr16, implicit $sgpr17, implicit $sgpr18, implicit $sgpr19, implicit $sgpr20, implicit $sgpr21, implicit $sgpr22, implicit $sgpr23, implicit $sgpr24, implicit $sgpr25, implicit $sgpr26, implicit $sgpr17, implicit $sgpr31 ; CHECK: $sgpr32 = frame-destroy S_SUB_U32 $sgpr32, 1572864, implicit-def $scc - ; CHECK: $sgpr33 = frame-setup COPY $sgpr27 + ; CHECK: $sgpr34 = frame-setup COPY $sgpr27 ; CHECK: S_ENDPGM 0, implicit $vcc S_NOP 0, implicit-def $sgpr4, implicit-def $sgpr5, implicit-def $sgpr6, implicit-def $sgpr7, implicit-def $sgpr8, implicit-def $sgpr9, implicit-def $sgpr10, implicit-def $sgpr11, implicit-def $sgpr12, implicit-def $sgpr13, implicit-def $sgpr14, implicit-def $sgpr15, implicit-def $sgpr16, implicit-def $sgpr17, implicit-def $sgpr18, implicit-def 
$sgpr19, implicit-def $sgpr20, implicit-def $sgpr21, implicit-def $sgpr22, implicit-def $sgpr23, implicit-def $sgpr24, implicit-def $sgpr25, implicit-def $sgpr26, implicit-def $sgpr17, implicit-def $sgpr30, implicit-def $sgpr31, implicit-def $vcc $vgpr0 = V_OR_B32_e32 %stack.1, $vgpr1, implicit $exec, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7, implicit $sgpr8, implicit $sgpr9, implicit $sgpr10, implicit $sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $sgpr16, implicit $sgpr17, implicit $sgpr18, implicit $sgpr19, implicit $sgpr20, implicit $sgpr21, implicit $sgpr22, implicit $sgpr23, implicit $sgpr24, implicit $sgpr25, implicit $sgpr26, implicit $sgpr17, implicit $sgpr31 @@ -147,8 +140,7 @@ machineFunctionInfo: isEntryFunction: false scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3 - scratchWaveOffsetReg: $sgpr34 - frameOffsetReg: $sgpr33 + frameOffsetReg: $sgpr34 stackPtrOffsetReg: $sgpr32 body: | @@ -157,18 +149,17 @@ ; CHECK-LABEL: name: scavenge_sgpr_pei_prefer_vcc ; CHECK: liveins: $vgpr1 - ; CHECK: $sgpr27 = frame-setup COPY $sgpr33 + ; CHECK: $sgpr27 = frame-setup COPY $sgpr34 ; CHECK: $sgpr4 = frame-setup S_ADD_U32 $sgpr32, 524224, implicit-def $scc - ; CHECK: $sgpr33 = frame-setup S_AND_B32 killed $sgpr4, 4294443008, implicit-def $scc + ; CHECK: $sgpr34 = frame-setup S_AND_B32 killed $sgpr4, 4294443008, implicit-def $scc ; CHECK: $sgpr32 = frame-setup S_ADD_U32 $sgpr32, 1572864, implicit-def $scc ; CHECK: S_NOP 0, implicit-def $sgpr4, implicit-def $sgpr5, implicit-def $sgpr6, implicit-def $sgpr7, implicit-def $sgpr8, implicit-def $sgpr9, implicit-def $sgpr10, implicit-def $sgpr11, implicit-def $sgpr12, implicit-def $sgpr13, implicit-def $sgpr14, implicit-def $sgpr15, implicit-def $sgpr16, implicit-def $sgpr17, implicit-def $sgpr18, implicit-def $sgpr19, implicit-def $sgpr20, implicit-def $sgpr21, implicit-def $sgpr22, implicit-def $sgpr23, implicit-def $sgpr24, implicit-def $sgpr25, implicit-def $sgpr26, implicit-def $sgpr17, implicit-def $sgpr30, implicit-def $sgpr31 - ; CHECK: $vcc_hi = S_SUB_U32 $sgpr33, $sgpr34, implicit-def $scc - ; CHECK: $vgpr3 = V_LSHRREV_B32_e64 6, killed $vcc_hi, implicit $exec + ; CHECK: $vgpr3 = V_LSHRREV_B32_e64 6, $sgpr34, implicit $exec ; CHECK: $vcc_lo = S_MOV_B32 8192 ; CHECK: $vgpr2, dead $vcc = V_ADD_I32_e64 killed $vcc_lo, killed $vgpr3, 0, implicit $exec ; CHECK: $vgpr0 = V_OR_B32_e32 killed $vgpr2, $vgpr1, implicit $exec, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7, implicit $sgpr8, implicit $sgpr9, implicit $sgpr10, implicit $sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $sgpr16, implicit $sgpr17, implicit $sgpr18, implicit $sgpr19, implicit $sgpr20, implicit $sgpr21, implicit $sgpr22, implicit $sgpr23, implicit $sgpr24, implicit $sgpr25, implicit $sgpr26, implicit $sgpr17, implicit $sgpr31 ; CHECK: $sgpr32 = frame-destroy S_SUB_U32 $sgpr32, 1572864, implicit-def $scc - ; CHECK: $sgpr33 = frame-setup COPY $sgpr27 + ; CHECK: $sgpr34 = frame-setup COPY $sgpr27 ; CHECK: S_ENDPGM 0 S_NOP 0, implicit-def $sgpr4, implicit-def $sgpr5, implicit-def $sgpr6, implicit-def $sgpr7, implicit-def $sgpr8, implicit-def $sgpr9, implicit-def $sgpr10, implicit-def $sgpr11, implicit-def $sgpr12, implicit-def $sgpr13, implicit-def $sgpr14, implicit-def $sgpr15, implicit-def $sgpr16, implicit-def $sgpr17, implicit-def $sgpr18, implicit-def $sgpr19, implicit-def $sgpr20, implicit-def $sgpr21, implicit-def $sgpr22, implicit-def $sgpr23, 
implicit-def $sgpr24, implicit-def $sgpr25, implicit-def $sgpr26, implicit-def $sgpr17, implicit-def $sgpr30, implicit-def $sgpr31 $vgpr0 = V_OR_B32_e32 %stack.1, $vgpr1, implicit $exec, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7, implicit $sgpr8, implicit $sgpr9, implicit $sgpr10, implicit $sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $sgpr16, implicit $sgpr17, implicit $sgpr18, implicit $sgpr19, implicit $sgpr20, implicit $sgpr21, implicit $sgpr22, implicit $sgpr23, implicit $sgpr24, implicit $sgpr25, implicit $sgpr26, implicit $sgpr17, implicit $sgpr31 diff --git a/llvm/test/CodeGen/AMDGPU/pei-scavenge-sgpr-gfx9.mir b/llvm/test/CodeGen/AMDGPU/pei-scavenge-sgpr-gfx9.mir --- a/llvm/test/CodeGen/AMDGPU/pei-scavenge-sgpr-gfx9.mir +++ b/llvm/test/CodeGen/AMDGPU/pei-scavenge-sgpr-gfx9.mir @@ -14,7 +14,6 @@ machineFunctionInfo: isEntryFunction: false scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3 - scratchWaveOffsetReg: $sgpr34 frameOffsetReg: $sgpr33 stackPtrOffsetReg: $sgpr32 @@ -29,10 +28,8 @@ ; CHECK: $sgpr33 = frame-setup S_AND_B32 killed $sgpr4, 4294443008, implicit-def $scc ; CHECK: $sgpr32 = frame-setup S_ADD_U32 $sgpr32, 1572864, implicit-def $scc ; CHECK: S_NOP 0, implicit-def $sgpr4, implicit-def $sgpr5, implicit-def $sgpr6, implicit-def $sgpr7, implicit-def $sgpr8, implicit-def $sgpr9, implicit-def $sgpr10, implicit-def $sgpr11, implicit-def $sgpr12, implicit-def $sgpr13, implicit-def $sgpr14, implicit-def $sgpr15, implicit-def $sgpr16, implicit-def $sgpr17, implicit-def $sgpr18, implicit-def $sgpr19, implicit-def $sgpr20, implicit-def $sgpr21, implicit-def $sgpr22, implicit-def $sgpr23, implicit-def $sgpr24, implicit-def $sgpr25, implicit-def $sgpr26, implicit-def $sgpr17, implicit-def $sgpr28, implicit-def $sgpr29, implicit-def $sgpr30, implicit-def $sgpr31, implicit-def $vcc - ; CHECK: $sgpr33 = S_SUB_U32 $sgpr33, $sgpr34, implicit-def $scc - ; CHECK: $vgpr3 = V_LSHRREV_B32_e64 6, killed $sgpr33, implicit $exec + ; CHECK: $vgpr3 = V_LSHRREV_B32_e64 6, $sgpr33, implicit $exec ; CHECK: $vgpr2 = V_ADD_U32_e32 8192, killed $vgpr3, implicit $exec - ; CHECK: $sgpr33 = S_ADD_U32 $sgpr33, $sgpr34, implicit-def $scc ; CHECK: $vgpr0 = V_OR_B32_e32 killed $vgpr2, $vgpr1, implicit $exec, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7, implicit $sgpr8, implicit $sgpr9, implicit $sgpr10, implicit $sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $sgpr16, implicit $sgpr17, implicit $sgpr18, implicit $sgpr19, implicit $sgpr20, implicit $sgpr21, implicit $sgpr22, implicit $sgpr23, implicit $sgpr24, implicit $sgpr25, implicit $sgpr26, implicit $sgpr17, implicit $sgpr28, implicit $sgpr29, implicit $sgpr30, implicit $sgpr31 ; CHECK: $sgpr32 = frame-destroy S_SUB_U32 $sgpr32, 1572864, implicit-def $scc ; CHECK: $sgpr33 = frame-setup COPY $sgpr27 diff --git a/llvm/test/CodeGen/AMDGPU/pei-scavenge-sgpr.mir b/llvm/test/CodeGen/AMDGPU/pei-scavenge-sgpr.mir --- a/llvm/test/CodeGen/AMDGPU/pei-scavenge-sgpr.mir +++ b/llvm/test/CodeGen/AMDGPU/pei-scavenge-sgpr.mir @@ -14,7 +14,6 @@ machineFunctionInfo: isEntryFunction: false scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3 - scratchWaveOffsetReg: $sgpr34 frameOffsetReg: $sgpr33 stackPtrOffsetReg: $sgpr32 @@ -29,9 +28,7 @@ ; CHECK: $sgpr33 = frame-setup S_AND_B32 killed $sgpr4, 4294705152, implicit-def $scc ; CHECK: $sgpr32 = frame-setup S_ADD_U32 $sgpr32, 524288, implicit-def $scc ; CHECK: S_NOP 0, implicit-def $sgpr4, implicit-def $sgpr5, 
implicit-def $sgpr6, implicit-def $sgpr7, implicit-def $sgpr8, implicit-def $sgpr9, implicit-def $sgpr10, implicit-def $sgpr11, implicit-def $sgpr12, implicit-def $sgpr13, implicit-def $sgpr14, implicit-def $sgpr15, implicit-def $sgpr16, implicit-def $sgpr17, implicit-def $sgpr18, implicit-def $sgpr19, implicit-def $sgpr20, implicit-def $sgpr21, implicit-def $sgpr22, implicit-def $sgpr23, implicit-def $sgpr24, implicit-def $sgpr25, implicit-def $sgpr26, implicit-def $sgpr17, implicit-def $sgpr28, implicit-def $sgpr29, implicit-def $sgpr30, implicit-def $sgpr31, implicit-def $vcc - ; CHECK: $sgpr33 = S_SUB_U32 $sgpr33, $sgpr34, implicit-def $scc ; CHECK: $vgpr2 = V_LSHRREV_B32_e64 6, $sgpr33, implicit $exec - ; CHECK: $sgpr33 = S_ADD_U32 $sgpr33, $sgpr34, implicit-def $scc ; CHECK: $vgpr0 = V_OR_B32_e32 killed $vgpr2, $vgpr1, implicit $exec, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7, implicit $sgpr8, implicit $sgpr9, implicit $sgpr10, implicit $sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $sgpr16, implicit $sgpr17, implicit $sgpr18, implicit $sgpr19, implicit $sgpr20, implicit $sgpr21, implicit $sgpr22, implicit $sgpr23, implicit $sgpr24, implicit $sgpr25, implicit $sgpr26, implicit $sgpr17, implicit $sgpr28, implicit $sgpr29, implicit $sgpr30, implicit $sgpr31 ; CHECK: $sgpr32 = frame-destroy S_SUB_U32 $sgpr32, 524288, implicit-def $scc ; CHECK: $sgpr33 = frame-setup COPY $sgpr27 diff --git a/llvm/test/CodeGen/AMDGPU/private-access-no-objects.ll --- a/llvm/test/CodeGen/AMDGPU/private-access-no-objects.ll +++ b/llvm/test/CodeGen/AMDGPU/private-access-no-objects.ll @@ -10,14 +10,13 @@ ; GCN-LABEL: {{^}}store_to_undef: ; OPT-DAG: s_mov_b64 s{{\[}}[[RSRC_LO:[0-9]+]]:{{[0-9]+\]}}, s[0:1] ; OPT-DAG: s_mov_b64 s{{\[[0-9]+}}:[[RSRC_HI:[0-9]+]]{{\]}}, s[2:3] -; OPT-DAG: s_mov_b32 [[SOFFSET:s[0-9]+]], s5{{$}} -; OPT: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s{{\[}}[[RSRC_LO]]:[[RSRC_HI]]{{\]}}, [[SOFFSET]] offen{{$}} +; OPT: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s{{\[}}[[RSRC_LO]]:[[RSRC_HI]]{{\]}}, 0 offen{{$}} ; -O0 should assume spilling, so the input scratch resource descriptor ; should be used directly without any copies. 
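; (Hedged editorial note: the same reasoning explains the updated OPTNONE
; lines below; at -O0 the input descriptor is used as-is, and the former s5
; wave offset operand degenerates to the literal 0, since the offset is now
; already folded into the descriptor's base.)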
; OPTNONE-NOT: s_mov_b32 -; OPTNONE: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], s5 offen{{$}} +; OPTNONE: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 offen{{$}} define amdgpu_kernel void @store_to_undef() #0 { store volatile i32 0, i32 addrspace(5)* undef ret void @@ -26,8 +25,7 @@ ; GCN-LABEL: {{^}}store_to_inttoptr: ; OPT-DAG: s_mov_b64 s{{\[}}[[RSRC_LO:[0-9]+]]:{{[0-9]+\]}}, s[0:1] ; OPT-DAG: s_mov_b64 s{{\[[0-9]+}}:[[RSRC_HI:[0-9]+]]{{\]}}, s[2:3] -; OPT-DAG: s_mov_b32 [[SOFFSET:s[0-9]+]], s5{{$}} -; OPT: buffer_store_dword v{{[0-9]+}}, off, s{{\[}}[[RSRC_LO]]:[[RSRC_HI]]{{\]}}, [[SOFFSET]] offset:124{{$}} +; OPT: buffer_store_dword v{{[0-9]+}}, off, s{{\[}}[[RSRC_LO]]:[[RSRC_HI]]{{\]}}, 0 offset:124{{$}} define amdgpu_kernel void @store_to_inttoptr() #0 { store volatile i32 0, i32 addrspace(5)* inttoptr (i32 124 to i32 addrspace(5)*) ret void @@ -36,8 +34,7 @@ ; GCN-LABEL: {{^}}load_from_undef: ; OPT-DAG: s_mov_b64 s{{\[}}[[RSRC_LO:[0-9]+]]:{{[0-9]+\]}}, s[0:1] ; OPT-DAG: s_mov_b64 s{{\[[0-9]+}}:[[RSRC_HI:[0-9]+]]{{\]}}, s[2:3] -; OPT-DAG: s_mov_b32 [[SOFFSET:s[0-9]+]], s5{{$}} -; OPT: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s{{\[}}[[RSRC_LO]]:[[RSRC_HI]]{{\]}}, [[SOFFSET]] offen{{$}} +; OPT: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s{{\[}}[[RSRC_LO]]:[[RSRC_HI]]{{\]}}, 0 offen{{$}} define amdgpu_kernel void @load_from_undef() #0 { %ld = load volatile i32, i32 addrspace(5)* undef ret void @@ -46,8 +43,7 @@ ; GCN-LABEL: {{^}}load_from_inttoptr: ; OPT-DAG: s_mov_b64 s{{\[}}[[RSRC_LO:[0-9]+]]:{{[0-9]+\]}}, s[0:1] ; OPT-DAG: s_mov_b64 s{{\[[0-9]+}}:[[RSRC_HI:[0-9]+]]{{\]}}, s[2:3] -; OPT-DAG: s_mov_b32 [[SOFFSET:s[0-9]+]], s5{{$}} -; OPT: buffer_load_dword v{{[0-9]+}}, off, s{{\[}}[[RSRC_LO]]:[[RSRC_HI]]{{\]}}, [[SOFFSET]] offset:124{{$}} +; OPT: buffer_load_dword v{{[0-9]+}}, off, s{{\[}}[[RSRC_LO]]:[[RSRC_HI]]{{\]}}, 0 offset:124{{$}} define amdgpu_kernel void @load_from_inttoptr() #0 { %ld = load volatile i32, i32 addrspace(5)* inttoptr (i32 124 to i32 addrspace(5)*) ret void diff --git a/llvm/test/CodeGen/AMDGPU/private-element-size.ll b/llvm/test/CodeGen/AMDGPU/private-element-size.ll --- a/llvm/test/CodeGen/AMDGPU/private-element-size.ll +++ b/llvm/test/CodeGen/AMDGPU/private-element-size.ll @@ -10,32 +10,32 @@ ; HSA-ELT4: private_element_size = 1 -; HSA-ELT16-DAG: buffer_store_dwordx4 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:16 -; HSA-ELT16-DAG: buffer_store_dwordx4 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:32 -; HSA-ELT16-DAG: buffer_load_dwordx4 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s9 offen{{$}} - -; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:24{{$}} -; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:16 -; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:32 -; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:40 - -; HSA-ELT8: buffer_load_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s9 offen -; HSA-ELT8: buffer_load_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s9 offen - - -; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:16{{$}} -; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:20{{$}} -; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:24{{$}} -; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:28{{$}} -; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:32{{$}} -; HSA-ELT4-DAG: 
buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:36{{$}} -; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:40{{$}} -; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:44{{$}} - -; HSA-ELT4-DAG: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen{{$}} -; HSA-ELT4-DAG: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:4{{$}} -; HSA-ELT4-DAG: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:8{{$}} -; HSA-ELT4-DAG: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:12{{$}} +; HSA-ELT16-DAG: buffer_store_dwordx4 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], 0 offset:16 +; HSA-ELT16-DAG: buffer_store_dwordx4 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], 0 offset:32 +; HSA-ELT16-DAG: buffer_load_dwordx4 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], 0 offen{{$}} + +; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], 0 offset:24{{$}} +; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], 0 offset:16 +; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], 0 offset:32 +; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], 0 offset:40 + +; HSA-ELT8: buffer_load_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], 0 offen +; HSA-ELT8: buffer_load_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], 0 offen + + +; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], 0 offset:16{{$}} +; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], 0 offset:20{{$}} +; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], 0 offset:24{{$}} +; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], 0 offset:28{{$}} +; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], 0 offset:32{{$}} +; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], 0 offset:36{{$}} +; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], 0 offset:40{{$}} +; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], 0 offset:44{{$}} + +; HSA-ELT4-DAG: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 offen{{$}} +; HSA-ELT4-DAG: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 offen offset:4{{$}} +; HSA-ELT4-DAG: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 offen offset:8{{$}} +; HSA-ELT4-DAG: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 offen offset:12{{$}} define amdgpu_kernel void @private_elt_size_v4i32(<4 x i32> addrspace(1)* %out, i32 addrspace(1)* %index.array) #0 { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -59,53 +59,53 @@ ; HSA-ELT8: private_element_size = 2 ; HSA-ELT4: private_element_size = 1 -; HSA-ELT16-DAG: buffer_store_dwordx4 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:32 -; HSA-ELT16-DAG: buffer_store_dwordx4 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:48 -; HSA-ELT16-DAG: buffer_store_dwordx4 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:64 -; HSA-ELT16-DAG: buffer_store_dwordx4 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:80 - -; HSA-ELT16-DAG: buffer_load_dwordx4 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s9 offen{{$}} -; HSA-ELT16-DAG: buffer_load_dwordx4 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s9 offen{{$}} - - -; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:32 -; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:40 -; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:48 -; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:56 -; 
HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:88 -; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:80 -; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:72 -; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:64 - -; HSA-ELT8: buffer_load_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s9 offen -; HSA-ELT8: buffer_load_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s9 offen - - -; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:32{{$}} -; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:36{{$}} -; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:40{{$}} -; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:44{{$}} -; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:48{{$}} -; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:52{{$}} -; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:56{{$}} -; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:60{{$}} -; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:64{{$}} -; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:68{{$}} -; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:72{{$}} -; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:76{{$}} -; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:80{{$}} -; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:84{{$}} -; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:88{{$}} -; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:92{{$}} - -; HSA-ELT4-DAG: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen{{$}} -; HSA-ELT4-DAG: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:4{{$}} -; HSA-ELT4-DAG: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:8{{$}} -; HSA-ELT4-DAG: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:12{{$}} -; HSA-ELT4-DAG: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:16{{$}} -; HSA-ELT4-DAG: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:20{{$}} -; HSA-ELT4-DAG: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:24{{$}} -; HSA-ELT4-DAG: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:28{{$}} +; HSA-ELT16-DAG: buffer_store_dwordx4 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], 0 offset:32 +; HSA-ELT16-DAG: buffer_store_dwordx4 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], 0 offset:48 +; HSA-ELT16-DAG: buffer_store_dwordx4 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], 0 offset:64 +; HSA-ELT16-DAG: buffer_store_dwordx4 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], 0 offset:80 + +; HSA-ELT16-DAG: buffer_load_dwordx4 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], 0 offen{{$}} +; HSA-ELT16-DAG: buffer_load_dwordx4 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], 0 offen{{$}} + + +; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], 0 offset:32 +; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], 0 offset:40 +; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], 0 offset:48 +; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], 0 offset:56 +; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], 0 offset:88 +; HSA-ELT8-DAG: 
buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], 0 offset:80 +; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], 0 offset:72 +; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], 0 offset:64 + +; HSA-ELT8: buffer_load_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], 0 offen +; HSA-ELT8: buffer_load_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], 0 offen + + +; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], 0 offset:32{{$}} +; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], 0 offset:36{{$}} +; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], 0 offset:40{{$}} +; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], 0 offset:44{{$}} +; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], 0 offset:48{{$}} +; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], 0 offset:52{{$}} +; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], 0 offset:56{{$}} +; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], 0 offset:60{{$}} +; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], 0 offset:64{{$}} +; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], 0 offset:68{{$}} +; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], 0 offset:72{{$}} +; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], 0 offset:76{{$}} +; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], 0 offset:80{{$}} +; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], 0 offset:84{{$}} +; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], 0 offset:88{{$}} +; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], 0 offset:92{{$}} + +; HSA-ELT4-DAG: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 offen{{$}} +; HSA-ELT4-DAG: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 offen offset:4{{$}} +; HSA-ELT4-DAG: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 offen offset:8{{$}} +; HSA-ELT4-DAG: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 offen offset:12{{$}} +; HSA-ELT4-DAG: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 offen offset:16{{$}} +; HSA-ELT4-DAG: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 offen offset:20{{$}} +; HSA-ELT4-DAG: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 offen offset:24{{$}} +; HSA-ELT4-DAG: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 offen offset:28{{$}} define amdgpu_kernel void @private_elt_size_v8i32(<8 x i32> addrspace(1)* %out, i32 addrspace(1)* %index.array) #0 { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -130,19 +130,19 @@ ; HSA-ELT8: private_element_size = 2 ; HSA-ELT4: private_element_size = 1 -; HSA-ELTGE8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, {{off|v[0-9]}}, s[0:3], s9 offset:1 -; HSA-ELTGE8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, {{off|v[0-9]}}, s[0:3], s9 offset:2 +; HSA-ELTGE8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, {{off|v[0-9]}}, s[0:3], 0 offset:1 +; HSA-ELTGE8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, {{off|v[0-9]}}, s[0:3], 0 offset:2 -; HSA-ELTGE8: buffer_load_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s9 offen +; HSA-ELTGE8: buffer_load_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], 0 offen -; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:16{{$}} -; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:20{{$}} -; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:24{{$}} -; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], 
s9 offset:28{{$}} +; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], 0 offset:16{{$}} +; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], 0 offset:20{{$}} +; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], 0 offset:24{{$}} +; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], 0 offset:28{{$}} -; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen{{$}} -; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:4{{$}} +; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 offen{{$}} +; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 offen offset:4{{$}} define amdgpu_kernel void @private_elt_size_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %index.array) #0 { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -166,19 +166,19 @@ ; HSA-ELT8: private_element_size = 2 ; HSA-ELT4: private_element_size = 1 -; HSA-ELTGE8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:16 -; HSA-ELTGE8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:24 +; HSA-ELTGE8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], 0 offset:16 +; HSA-ELTGE8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], 0 offset:24 -; HSA-ELTGE8: buffer_load_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s9 offen +; HSA-ELTGE8: buffer_load_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], 0 offen -; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:16{{$}} -; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:20{{$}} -; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:24{{$}} -; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:28{{$}} +; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], 0 offset:16{{$}} +; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], 0 offset:20{{$}} +; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], 0 offset:24{{$}} +; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], 0 offset:28{{$}} -; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen{{$}} -; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:4{{$}} +; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 offen{{$}} +; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 offen offset:4{{$}} define amdgpu_kernel void @private_elt_size_f64(double addrspace(1)* %out, i32 addrspace(1)* %index.array) #0 { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -202,32 +202,32 @@ ; HSA-ELT8: private_element_size = 2 ; HSA-ELT4: private_element_size = 1 -; HSA-ELT16-DAG: buffer_store_dwordx4 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:16 -; HSA-ELT16-DAG: buffer_store_dwordx4 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:32 -; HSA-ELT16-DAG: buffer_load_dwordx4 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s9 offen{{$}} +; HSA-ELT16-DAG: buffer_store_dwordx4 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], 0 offset:16 +; HSA-ELT16-DAG: buffer_store_dwordx4 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], 0 offset:32 +; HSA-ELT16-DAG: buffer_load_dwordx4 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], 0 offen{{$}} -; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:16{{$}} -; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:24 -; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:40 -; HSA-ELT8-DAG: buffer_store_dwordx2 
{{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:32 +; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], 0 offset:16{{$}} +; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], 0 offset:24 +; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], 0 offset:40 +; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], 0 offset:32 -; HSA-ELT8: buffer_load_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s9 offen -; HSA-ELT8: buffer_load_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s9 offen +; HSA-ELT8: buffer_load_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], 0 offen +; HSA-ELT8: buffer_load_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], 0 offen -; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:16{{$}} -; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:20{{$}} -; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:24{{$}} -; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:28{{$}} -; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:32{{$}} -; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:36{{$}} -; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:40{{$}} -; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:44{{$}} +; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], 0 offset:16{{$}} +; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], 0 offset:20{{$}} +; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], 0 offset:24{{$}} +; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], 0 offset:28{{$}} +; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], 0 offset:32{{$}} +; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], 0 offset:36{{$}} +; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], 0 offset:40{{$}} +; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], 0 offset:44{{$}} -; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen{{$}} -; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:4{{$}} -; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:8{{$}} -; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:12{{$}} +; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 offen{{$}} +; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 offen offset:4{{$}} +; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 offen offset:8{{$}} +; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 offen offset:12{{$}} define amdgpu_kernel void @private_elt_size_v2i64(<2 x i64> addrspace(1)* %out, i32 addrspace(1)* %index.array) #0 { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/rename-independent-subregs-mac-operands.mir b/llvm/test/CodeGen/AMDGPU/rename-independent-subregs-mac-operands.mir --- a/llvm/test/CodeGen/AMDGPU/rename-independent-subregs-mac-operands.mir +++ b/llvm/test/CodeGen/AMDGPU/rename-independent-subregs-mac-operands.mir @@ -13,7 +13,6 @@ tracksRegLiveness: true machineFunctionInfo: scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' - scratchWaveOffsetReg: '$sgpr4' frameOffsetReg: '$sgpr4' registers: @@ -99,7 +98,6 @@ tracksRegLiveness: true machineFunctionInfo: scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' - scratchWaveOffsetReg: '$sgpr4' frameOffsetReg: '$sgpr4' registers: - { id: 0, class: vgpr_32, 
preferred-register: '' } diff --git a/llvm/test/CodeGen/AMDGPU/sched-assert-dead-def-subreg-use-other-subreg.mir --- a/llvm/test/CodeGen/AMDGPU/sched-assert-dead-def-subreg-use-other-subreg.mir +++ b/llvm/test/CodeGen/AMDGPU/sched-assert-dead-def-subreg-use-other-subreg.mir @@ -10,7 +10,6 @@ machineFunctionInfo: isEntryFunction: true scratchRSrcReg: '$sgpr24_sgpr25_sgpr26_sgpr27' - scratchWaveOffsetReg: '$sgpr32' frameOffsetReg: '$sgpr32' stackPtrOffsetReg: '$sgpr32' argumentInfo: diff --git a/llvm/test/CodeGen/AMDGPU/sched-handleMoveUp-subreg-def-across-subreg-def.mir --- a/llvm/test/CodeGen/AMDGPU/sched-handleMoveUp-subreg-def-across-subreg-def.mir +++ b/llvm/test/CodeGen/AMDGPU/sched-handleMoveUp-subreg-def-across-subreg-def.mir @@ -11,7 +11,6 @@ machineFunctionInfo: isEntryFunction: true scratchRSrcReg: '$sgpr96_sgpr97_sgpr98_sgpr99' - scratchWaveOffsetReg: '$sgpr101' frameOffsetReg: '$sgpr101' stackPtrOffsetReg: '$sgpr101' argumentInfo: diff --git a/llvm/test/CodeGen/AMDGPU/scratch-buffer.ll --- a/llvm/test/CodeGen/AMDGPU/scratch-buffer.ll +++ b/llvm/test/CodeGen/AMDGPU/scratch-buffer.ll @@ -9,9 +9,9 @@ ; should be able to reuse the same register for each scratch buffer access. ; GCN-LABEL: {{^}}legal_offset_fi: -; GCN: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], s{{[0-9]+}} offset:4{{$}} +; GCN: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:4{{$}} ; GCN: v_mov_b32_e32 [[OFFSET:v[0-9]+]], 0x8004 -; GCN: buffer_store_dword v{{[0-9]+}}, [[OFFSET]], s[{{[0-9]+}}:{{[0-9]+}}], s{{[0-9]+}} offen{{$}} +; GCN: buffer_store_dword v{{[0-9]+}}, [[OFFSET]], s[{{[0-9]+}}:{{[0-9]+}}], 0 offen{{$}} define amdgpu_kernel void @legal_offset_fi(i32 addrspace(1)* %out, i32 %cond, i32 %if_offset, i32 %else_offset) { entry: @@ -47,11 +47,11 @@ } ; GCN-LABEL: {{^}}legal_offset_fi_offset: -; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], s{{[0-9]+}} offen{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 offen{{$}} ; This constant isn't folded, because it has multiple uses. 
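; (Hedged note: the 0x8004 base below has more than one use, so it is
; materialized once in [[K8000]] and added to each variable index, rather
; than being folded into a single MUBUF immediate offset.)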
; GCN-DAG: v_mov_b32_e32 [[K8000:v[0-9]+]], 0x8004 ; GCN-DAG: v_add_{{[iu]}}32_e32 [[OFFSET:v[0-9]+]], vcc, [[K8000]] -; GCN: buffer_store_dword v{{[0-9]+}}, [[OFFSET]], s[{{[0-9]+}}:{{[0-9]+}}], s{{[0-9]+}} offen{{$}} +; GCN: buffer_store_dword v{{[0-9]+}}, [[OFFSET]], s[{{[0-9]+}}:{{[0-9]+}}], 0 offen{{$}} define amdgpu_kernel void @legal_offset_fi_offset(i32 addrspace(1)* %out, i32 %cond, i32 addrspace(1)* %offsets, i32 %if_offset, i32 %else_offset) { entry: @@ -88,7 +88,7 @@ ; GCN-LABEL: {{^}}neg_vaddr_offset_inbounds: ; GCN: v_add_{{[iu]}}32_e32 [[ADD:v[0-9]+]], vcc, 16, v{{[0-9]+}} -; GCN: buffer_store_dword v{{[0-9]+}}, [[ADD]], s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen{{$}} +; GCN: buffer_store_dword v{{[0-9]+}}, [[ADD]], s[{{[0-9]+:[0-9]+}}], 0 offen{{$}} define amdgpu_kernel void @neg_vaddr_offset_inbounds(i32 %offset) { entry: %array = alloca [8192 x i32], addrspace(5) @@ -100,7 +100,7 @@ ; GCN-LABEL: {{^}}neg_vaddr_offset: ; GCN: v_add_{{[iu]}}32_e32 [[ADD:v[0-9]+]], vcc, 16, v{{[0-9]+}} -; GCN: buffer_store_dword v{{[0-9]+}}, [[ADD]], s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen{{$}} +; GCN: buffer_store_dword v{{[0-9]+}}, [[ADD]], s[{{[0-9]+:[0-9]+}}], 0 offen{{$}} define amdgpu_kernel void @neg_vaddr_offset(i32 %offset) { entry: %array = alloca [8192 x i32], addrspace(5) @@ -111,7 +111,7 @@ } ; GCN-LABEL: {{^}}pos_vaddr_offset: -; GCN: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:20 +; GCN: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], 0 offset:20 define amdgpu_kernel void @pos_vaddr_offset(i32 addrspace(1)* %out, i32 %offset) { entry: %array = alloca [8192 x i32], addrspace(5) diff --git a/llvm/test/CodeGen/AMDGPU/scratch-simple.ll b/llvm/test/CodeGen/AMDGPU/scratch-simple.ll --- a/llvm/test/CodeGen/AMDGPU/scratch-simple.ll +++ b/llvm/test/CodeGen/AMDGPU/scratch-simple.ll @@ -29,8 +29,8 @@ ; GCN-DAG: v_or_b32_e32 [[LO_OFF:v[0-9]+]], 0x200, [[CLAMP_IDX]] ; GCN-DAG: v_or_b32_e32 [[HI_OFF:v[0-9]+]], 0x400, [[CLAMP_IDX]] -; GCN: buffer_load_dword {{v[0-9]+}}, [[LO_OFF]], {{s\[[0-9]+:[0-9]+\]}}, s0 offen -; GCN: buffer_load_dword {{v[0-9]+}}, [[HI_OFF]], {{s\[[0-9]+:[0-9]+\]}}, s0 offen +; GCN: buffer_load_dword {{v[0-9]+}}, [[LO_OFF]], {{s\[[0-9]+:[0-9]+\]}}, 0 offen +; GCN: buffer_load_dword {{v[0-9]+}}, [[HI_OFF]], {{s\[[0-9]+:[0-9]+\]}}, 0 offen define amdgpu_ps float @ps_main(i32 %idx) { %v1 = extractelement <81 x float> , i32 %idx %v2 = extractelement <81 x float> , i32 %idx @@ -41,8 +41,8 @@ ; GCN-LABEL: {{^}}vs_main: ; GCN-DAG: s_mov_b32 s4, SCRATCH_RSRC_DWORD0 ; GCN-NOT: s_mov_b32 s0 -; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, s0 offen -; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, s0 offen +; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen +; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen define amdgpu_vs float @vs_main(i32 %idx) { %v1 = extractelement <81 x float> , i32 %idx %v2 = extractelement <81 x float> , i32 %idx @@ -52,8 +52,8 @@ ; GCN-LABEL: {{^}}cs_main: ; GCN-DAG: s_mov_b32 s4, SCRATCH_RSRC_DWORD0 -; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, s0 offen -; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, s0 offen +; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen +; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen define amdgpu_cs float @cs_main(i32 %idx) { %v1 = extractelement 
<81 x float> , i32 %idx %v2 = extractelement <81 x float> , i32 %idx @@ -64,13 +64,13 @@ ; GCN-LABEL: {{^}}hs_main: ; SIVI: s_mov_b32 s4, SCRATCH_RSRC_DWORD0 ; SIVI-NOT: s_mov_b32 s0 -; SIVI: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, s0 offen -; SIVI: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, s0 offen +; SIVI: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen +; SIVI: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen ; GFX9_10: s_mov_b32 s0, SCRATCH_RSRC_DWORD0 ; GFX9_10-NOT: s_mov_b32 s5 -; GFX9_10: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, s5 offen -; GFX9_10: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, s5 offen +; GFX9_10: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen +; GFX9_10: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen define amdgpu_hs float @hs_main(i32 %idx) { %v1 = extractelement <81 x float> , i32 %idx %v2 = extractelement <81 x float> , i32 %idx @@ -80,12 +80,12 @@ ; GCN-LABEL: {{^}}gs_main: ; SIVI: s_mov_b32 s4, SCRATCH_RSRC_DWORD0 -; SIVI: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, s0 offen -; SIVI: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, s0 offen +; SIVI: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen +; SIVI: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen ; GFX9_10: s_mov_b32 s0, SCRATCH_RSRC_DWORD0 -; GFX9_10: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, s5 offen -; GFX9_10: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, s5 offen +; GFX9_10: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen +; GFX9_10: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen define amdgpu_gs float @gs_main(i32 %idx) { %v1 = extractelement <81 x float> , i32 %idx %v2 = extractelement <81 x float> , i32 %idx @@ -93,42 +93,8 @@ ret float %r } -; GCN-LABEL: {{^}}hs_ir_uses_scratch_offset: -; GCN: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 - -; SIVI-NOT: s_mov_b32 s6 -; SIVI: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, s6 offen -; SIVI: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, s6 offen - -; GFX9_10-NOT: s_mov_b32 s5 -; GFX9_10-DAG: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, s5 offen -; GFX9_10-DAG: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, s5 offen - -; GCN-DAG: s_mov_b32 s2, s5 -define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32 inreg %swo, i32 %idx) { - %v1 = extractelement <81 x float> , i32 %idx - %v2 = extractelement <81 x float> , i32 %idx - %f = fadd float %v1, %v2 - %r1 = insertvalue <{i32, i32, i32, float}> undef, i32 %swo, 2 - %r2 = insertvalue <{i32, i32, i32, float}> %r1, float %f, 3 - ret <{i32, i32, i32, float}> %r2 -} - -; GCN-LABEL: {{^}}gs_ir_uses_scratch_offset: -; GCN: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 - -; SIVI: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, s6 offen -; SIVI: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, s6 offen - -; GFX9_10-DAG: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, s5 offen -; GFX9_10-DAG: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, s5 offen - -; GCN-DAG: s_mov_b32 s2, s5 -define 
amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32 inreg %swo, i32 %idx) { - %v1 = extractelement <81 x float> , i32 %idx - %v2 = extractelement <81 x float> , i32 %idx - %f = fadd float %v1, %v2 - %r1 = insertvalue <{i32, i32, i32, float}> undef, i32 %swo, 2 - %r2 = insertvalue <{i32, i32, i32, float}> %r1, float %f, 3 - ret <{i32, i32, i32, float}> %r2 -} +; FIXME: This change assumes the scratch wave offset is dead after being used +; to update the scratch SRD, but this test previously used `inreg` to refer to +; the scratch wave offset in cases where it has a fixed location (i.e. SGPR5 +; for GFX9). What exactly is the test trying to verify, and is the change to +; mark the scratch wave offset as "killed" by the new setup in the prologue OK? diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-spill-wrong-stack-id.mir b/llvm/test/CodeGen/AMDGPU/sgpr-spill-wrong-stack-id.mir --- a/llvm/test/CodeGen/AMDGPU/sgpr-spill-wrong-stack-id.mir +++ b/llvm/test/CodeGen/AMDGPU/sgpr-spill-wrong-stack-id.mir @@ -35,13 +35,13 @@ # SHARE: SI_SPILL_S32_SAVE $sgpr32, %stack.2, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 :: (store 4 into %stack.2, addrspace 5) # SHARE: SI_SPILL_V32_SAVE killed $vgpr0, %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, implicit $exec :: (store 4 into %stack.0, addrspace 5) -# SHARE: SI_SPILL_S64_SAVE killed renamable $sgpr6_sgpr7, %stack.1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 :: (store 8 into %stack.1, align 4, addrspace 5) -# SHARE: renamable $sgpr6_sgpr7 = SI_SPILL_S64_RESTORE %stack.1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 :: (load 8 from %stack.1, align 4, addrspace 5) -# SHARE: dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr6_sgpr7, @func, csr_amdgpu_highregs, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4, implicit undef $vgpr0 +# SHARE: SI_SPILL_S64_SAVE killed renamable $sgpr4_sgpr5, %stack.1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 :: (store 8 into %stack.1, align 4, addrspace 5) +# SHARE: renamable $sgpr4_sgpr5 = SI_SPILL_S64_RESTORE %stack.1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 :: (load 8 from %stack.1, align 4, addrspace 5) +# SHARE: dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr4_sgpr5, @func, csr_amdgpu_highregs, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit undef $vgpr0 # SHARE: $sgpr32 = SI_SPILL_S32_RESTORE %stack.2, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 :: (load 4 from %stack.2, addrspace 5) # SHARE: $vgpr0 = SI_SPILL_V32_RESTORE %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, implicit $exec :: (load 4 from %stack.0, addrspace 5) -# SHARE: renamable $sgpr6_sgpr7 = SI_SPILL_S64_RESTORE %stack.1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 :: (load 8 from %stack.1, align 4, addrspace 5) -# SHARE: dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr6_sgpr7, @func, csr_amdgpu_highregs, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4, implicit $vgpr0 +# SHARE: renamable $sgpr4_sgpr5 = SI_SPILL_S64_RESTORE %stack.1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 :: (load 8 from %stack.1, align 4, addrspace 5) +# SHARE: dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr4_sgpr5, @func, csr_amdgpu_highregs, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $vgpr0 # SHARE: $sgpr32 = SI_SPILL_S32_RESTORE %stack.2, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 
:: (load 4 from %stack.2, addrspace 5) # NOSHARE: stack: @@ -60,14 +60,14 @@ # NOSHARE: SI_SPILL_S32_SAVE $sgpr32, %stack.2, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 :: (store 4 into %stack.2, addrspace 5) # NOSHARE: SI_SPILL_V32_SAVE killed $vgpr0, %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, implicit $exec :: (store 4 into %stack.0, addrspace 5) -# NOSHARE: SI_SPILL_S64_SAVE killed renamable $sgpr6_sgpr7, %stack.1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 :: (store 8 into %stack.1, align 4, addrspace 5) -# NOSHARE: renamable $sgpr6_sgpr7 = SI_SPILL_S64_RESTORE %stack.1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 :: (load 8 from %stack.1, align 4, addrspace 5) -# NOSHARE: dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr6_sgpr7, @func, csr_amdgpu_highregs, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4, implicit undef $vgpr0 +# NOSHARE: SI_SPILL_S64_SAVE killed renamable $sgpr4_sgpr5, %stack.1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 :: (store 8 into %stack.1, align 4, addrspace 5) +# NOSHARE: renamable $sgpr4_sgpr5 = SI_SPILL_S64_RESTORE %stack.1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 :: (load 8 from %stack.1, align 4, addrspace 5) +# NOSHARE: dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr4_sgpr5, @func, csr_amdgpu_highregs, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit undef $vgpr0 # NOSHARE: $sgpr32 = SI_SPILL_S32_RESTORE %stack.2, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 :: (load 4 from %stack.2, addrspace 5) # NOSHARE: SI_SPILL_S32_SAVE $sgpr32, %stack.3, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 :: (store 4 into %stack.3, addrspace 5) # NOSHARE: $vgpr0 = SI_SPILL_V32_RESTORE %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, implicit $exec :: (load 4 from %stack.0, addrspace 5) -# NOSHARE: renamable $sgpr6_sgpr7 = SI_SPILL_S64_RESTORE %stack.1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 :: (load 8 from %stack.1, align 4, addrspace 5) -# NOSHARE: dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr6_sgpr7, @func, csr_amdgpu_highregs, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4, implicit $vgpr0 +# NOSHARE: renamable $sgpr4_sgpr5 = SI_SPILL_S64_RESTORE %stack.1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 :: (load 8 from %stack.1, align 4, addrspace 5) +# NOSHARE: dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr4_sgpr5, @func, csr_amdgpu_highregs, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $vgpr0 # NOSHARE: $sgpr32 = SI_SPILL_S32_RESTORE %stack.3, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 :: (load 4 from %stack.3, addrspace 5) ... 
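# Editorial sketch (not part of the original patch): why the registers shift
# in this test. With scratchWaveOffsetReg gone from machineFunctionInfo,
# $sgpr4 is no longer reserved, so the allocator is free to assign the
# call-target pair to $sgpr4_sgpr5 where it previously had to use
# $sgpr6_sgpr7, and SI_CALL no longer carries an implicit $sgpr4 use for the
# wave offset:
#
#   dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr4_sgpr5, @func,
#     csr_amdgpu_highregs, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit undef $vgpr0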
@@ -78,7 +78,6 @@ hasCalls: true machineFunctionInfo: scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3 - scratchWaveOffsetReg: $sgpr4 frameOffsetReg: $sgpr32 stackPtrOffsetReg: $sgpr32 body: | @@ -88,13 +87,13 @@ %2:vgpr_32 = FLAT_LOAD_DWORD %1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr %3:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-rel32-lo) @func + 4, target-flags(amdgpu-rel32-hi) @func + 4, implicit-def dead $scc ADJCALLSTACKUP 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32 - dead $sgpr30_sgpr31 = SI_CALL %3, @func, csr_amdgpu_highregs, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4, implicit undef $vgpr0 + dead $sgpr30_sgpr31 = SI_CALL %3, @func, csr_amdgpu_highregs, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit undef $vgpr0 $sgpr32 = COPY %0 %4:sreg_32_xm0 = COPY $sgpr32 ADJCALLSTACKDOWN 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32 ADJCALLSTACKUP 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32 $vgpr0 = COPY %2 - dead $sgpr30_sgpr31 = SI_CALL %3, @func, csr_amdgpu_highregs, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4, implicit killed $vgpr0 + dead $sgpr30_sgpr31 = SI_CALL %3, @func, csr_amdgpu_highregs, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit killed $vgpr0 $sgpr32 = COPY %4 ADJCALLSTACKDOWN 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32 diff --git a/llvm/test/CodeGen/AMDGPU/shl_add_ptr.ll b/llvm/test/CodeGen/AMDGPU/shl_add_ptr.ll --- a/llvm/test/CodeGen/AMDGPU/shl_add_ptr.ll +++ b/llvm/test/CodeGen/AMDGPU/shl_add_ptr.ll @@ -345,10 +345,10 @@ ; GCN-LABEL: {{^}}shl_add_ptr_combine_2use_private: ; GCN: v_lshlrev_b32_e32 [[SCALE0:v[0-9]+]], 2, v0 -; GCN: buffer_store_dword v{{[0-9]+}}, [[SCALE0]], s[0:3], s33 offen offset:16 +; GCN: buffer_store_dword v{{[0-9]+}}, [[SCALE0]], s[0:3], 0 offen offset:16 ; GCN: v_lshlrev_b32_e32 [[SCALE1:v[0-9]+]], 3, v0 -; GCN: buffer_store_dword v{{[0-9]+}}, [[SCALE1]], s[0:3], s33 offen offset:32 +; GCN: buffer_store_dword v{{[0-9]+}}, [[SCALE1]], s[0:3], 0 offen offset:32 define void @shl_add_ptr_combine_2use_private(i16 zeroext %idx.arg) #0 { %idx = zext i16 %idx.arg to i32 %idx.add = add nuw i32 %idx, 4 @@ -364,9 +364,9 @@ ; GCN-LABEL: {{^}}shl_add_ptr_combine_2use_max_private_offset: ; GCN-DAG: v_lshlrev_b32_e32 [[SCALE0:v[0-9]+]], 3, v0 ; GCN-DAG: v_lshlrev_b32_e32 [[SCALE1:v[0-9]+]], 4, v0 -; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[SCALE0]], s[0:3], s33 offen offset:4088 +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[SCALE0]], s[0:3], 0 offen offset:4088 ; GCN-DAG: v_add_{{[iu]}}32_e32 [[ADD:v[0-9]+]], vcc, 0x1ff0, [[SCALE1]] -; GCN: buffer_store_dword v{{[0-9]+}}, [[ADD]], s[0:3], s33 offen{{$}} +; GCN: buffer_store_dword v{{[0-9]+}}, [[ADD]], s[0:3], 0 offen{{$}} define void @shl_add_ptr_combine_2use_max_private_offset(i16 zeroext %idx.arg) #0 { %idx = zext i16 %idx.arg to i32 %idx.add = add nuw i32 %idx, 511 @@ -382,8 +382,8 @@ ; GCN: v_add_{{[iu]}}32_e32 [[ADD:v[0-9]+]], vcc, 0x100, v0 ; GCN-DAG: v_lshlrev_b32_e32 [[SCALE0:v[0-9]+]], 4, [[ADD]] ; GCN-DAG: v_lshlrev_b32_e32 [[SCALE1:v[0-9]+]], 5, [[ADD]] -; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[SCALE0]], s[0:3], s33 offen{{$}} -; GCN: buffer_store_dword v{{[0-9]+}}, [[SCALE1]], s[0:3], s33 offen{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[SCALE0]], s[0:3], 0 offen{{$}} +; GCN: buffer_store_dword v{{[0-9]+}}, [[SCALE1]], s[0:3], 0 offen{{$}} define void @shl_add_ptr_combine_2use_both_max_private_offset(i16 zeroext %idx.arg) #0 { 
%idx = zext i16 %idx.arg to i32 %idx.add = add nuw i32 %idx, 256 diff --git a/llvm/test/CodeGen/AMDGPU/si-spill-sgpr-stack.ll b/llvm/test/CodeGen/AMDGPU/si-spill-sgpr-stack.ll --- a/llvm/test/CodeGen/AMDGPU/si-spill-sgpr-stack.ll +++ b/llvm/test/CodeGen/AMDGPU/si-spill-sgpr-stack.ll @@ -3,11 +3,10 @@ ; Make sure this doesn't crash. ; ALL-LABEL: {{^}}test: ; ALL: s_mov_b32 s[[LO:[0-9]+]], SCRATCH_RSRC_DWORD0 -; ALL: s_mov_b32 s[[OFF:[0-9]+]], s3 ; ALL: s_mov_b32 s[[HI:[0-9]+]], 0xe80000 ; Make sure we are handling hazards correctly. -; SGPR: buffer_load_dword [[VHI:v[0-9]+]], off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:16 +; SGPR: buffer_load_dword [[VHI:v[0-9]+]], off, s[{{[0-9]+:[0-9]+}}], 0 offset:16 ; SGPR-NEXT: s_waitcnt vmcnt(0) ; SGPR-NEXT: v_readfirstlane_b32 s[[HI:[0-9]+]], [[VHI]] ; SGPR-NEXT: s_nop 4 diff --git a/llvm/test/CodeGen/AMDGPU/sibling-call.ll b/llvm/test/CodeGen/AMDGPU/sibling-call.ll --- a/llvm/test/CodeGen/AMDGPU/sibling-call.ll +++ b/llvm/test/CodeGen/AMDGPU/sibling-call.ll @@ -117,7 +117,7 @@ ; GCN-LABEL: {{^}}sibling_call_i32_fastcc_i32_byval_i32: ; GCN-NOT: v0 ; GCN-NOT: s32 -; GCN: buffer_load_dword v1, off, s[0:3], s33 offset:16 +; GCN: buffer_load_dword v1, off, s[0:3], 0 offset:16 ; GCN: buffer_store_dword v1, off, s[0:3], s32{{$}} ; GCN-NEXT: s_setpc_b64 define fastcc i32 @sibling_call_i32_fastcc_i32_byval_i32(i32 %a, [32 x i32] %large) #1 { diff --git a/llvm/test/CodeGen/AMDGPU/sp-too-many-input-sgprs.ll b/llvm/test/CodeGen/AMDGPU/sp-too-many-input-sgprs.ll deleted file mode 100644 --- a/llvm/test/CodeGen/AMDGPU/sp-too-many-input-sgprs.ll +++ /dev/null @@ -1,102 +0,0 @@ -; RUN: llc -mtriple=amdgcn-mesa-mesa3d -verify-machineinstrs < %s | FileCheck -check-prefixes=MESA3D,ALL %s -; RUN: llc -mtriple=amdgcn-- -verify-machineinstrs < %s | FileCheck -check-prefixes=UNKNOWN,ALL %s - -; Make sure shaders pick a workable SP with > 32 input SGPRs. -; FIXME: Doesn't seem to be getting initial value from right register? 
- -; ALL-LABEL: {{^}}too_many_input_sgprs_32: -; MESA3D-NOT: s34 -; MESA3D: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s34 offset:4 - -; Happens to end up in s32 anyway -; UNKNOWN-NOT: s32 -; UNKNOWN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s32 offset:4 -define amdgpu_ps i32 @too_many_input_sgprs_32(i32 inreg %arg, i32 inreg %arg1, i32 inreg %arg2, i32 inreg %arg3, i32 inreg %arg4, i32 inreg %arg5, i32 inreg %arg6, i32 inreg %arg7, - i32 inreg %arg8, i32 inreg %arg9, i32 inreg %arg10, i32 inreg %arg11, i32 inreg %arg12, i32 inreg %arg13, i32 inreg %arg14, i32 inreg %arg15, - i32 inreg %arg16, i32 inreg %arg17, i32 inreg %arg18, i32 inreg %arg19, i32 inreg %arg20, i32 inreg %arg21, i32 inreg %arg22, i32 inreg %arg23, - i32 inreg %arg24, i32 inreg %arg25, i32 inreg %arg26, i32 inreg %arg27, i32 inreg %arg28, i32 inreg %arg29, i32 inreg %arg30, i32 inreg %arg31) { -bb: - %alloca = alloca i32, align 4, addrspace(5) - store volatile i32 0, i32 addrspace(5)* %alloca - %tmp = add i32 %arg, %arg1 - %tmp32 = add i32 %tmp, %arg2 - %tmp33 = add i32 %tmp32, %arg3 - %tmp34 = add i32 %tmp33, %arg4 - %tmp35 = add i32 %tmp34, %arg5 - %tmp36 = add i32 %tmp35, %arg6 - %tmp37 = add i32 %tmp36, %arg7 - %tmp38 = add i32 %tmp37, %arg8 - %tmp39 = add i32 %tmp38, %arg9 - %tmp40 = add i32 %tmp39, %arg10 - %tmp41 = add i32 %tmp40, %arg11 - %tmp42 = add i32 %tmp41, %arg12 - %tmp43 = add i32 %tmp42, %arg13 - %tmp44 = add i32 %tmp43, %arg14 - %tmp45 = add i32 %tmp44, %arg15 - %tmp46 = add i32 %tmp45, %arg16 - %tmp47 = add i32 %tmp46, %arg17 - %tmp48 = add i32 %tmp47, %arg18 - %tmp49 = add i32 %tmp48, %arg19 - %tmp50 = add i32 %tmp49, %arg20 - %tmp51 = add i32 %tmp50, %arg21 - %tmp52 = add i32 %tmp51, %arg22 - %tmp53 = add i32 %tmp52, %arg23 - %tmp54 = add i32 %tmp53, %arg24 - %tmp55 = add i32 %tmp54, %arg25 - %tmp56 = add i32 %tmp55, %arg26 - %tmp57 = add i32 %tmp56, %arg27 - %tmp58 = add i32 %tmp57, %arg28 - %tmp59 = add i32 %tmp58, %arg29 - %tmp60 = add i32 %tmp59, %arg30 - %tmp61 = add i32 %tmp60, %arg31 - ret i32 %tmp61 -} - -; ALL-LABEL: {{^}}too_many_input_sgprs_33: -; MESA3D-NOT: s35 -; MESA3D: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s35 offset:4 - -; UNKNOWN-NOT: s33 -; UNKNOWN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s33 offset:4 -define amdgpu_ps i32 @too_many_input_sgprs_33(i32 inreg %arg, i32 inreg %arg1, i32 inreg %arg2, i32 inreg %arg3, i32 inreg %arg4, i32 inreg %arg5, i32 inreg %arg6, i32 inreg %arg7, - i32 inreg %arg8, i32 inreg %arg9, i32 inreg %arg10, i32 inreg %arg11, i32 inreg %arg12, i32 inreg %arg13, i32 inreg %arg14, i32 inreg %arg15, - i32 inreg %arg16, i32 inreg %arg17, i32 inreg %arg18, i32 inreg %arg19, i32 inreg %arg20, i32 inreg %arg21, i32 inreg %arg22, i32 inreg %arg23, - i32 inreg %arg24, i32 inreg %arg25, i32 inreg %arg26, i32 inreg %arg27, i32 inreg %arg28, i32 inreg %arg29, i32 inreg %arg30, i32 inreg %arg31, - i32 inreg %arg32) { -bb: - %alloca = alloca i32, align 4, addrspace(5) - store volatile i32 0, i32 addrspace(5)* %alloca - %tmp = add i32 %arg, %arg1 - %tmp32 = add i32 %tmp, %arg2 - %tmp33 = add i32 %tmp32, %arg3 - %tmp34 = add i32 %tmp33, %arg4 - %tmp35 = add i32 %tmp34, %arg5 - %tmp36 = add i32 %tmp35, %arg6 - %tmp37 = add i32 %tmp36, %arg7 - %tmp38 = add i32 %tmp37, %arg8 - %tmp39 = add i32 %tmp38, %arg9 - %tmp40 = add i32 %tmp39, %arg10 - %tmp41 = add i32 %tmp40, %arg11 - %tmp42 = add i32 %tmp41, %arg12 - %tmp43 = add i32 %tmp42, %arg13 - %tmp44 = add i32 %tmp43, %arg14 - %tmp45 = add i32 
%tmp44, %arg15 - %tmp46 = add i32 %tmp45, %arg16 - %tmp47 = add i32 %tmp46, %arg17 - %tmp48 = add i32 %tmp47, %arg18 - %tmp49 = add i32 %tmp48, %arg19 - %tmp50 = add i32 %tmp49, %arg20 - %tmp51 = add i32 %tmp50, %arg21 - %tmp52 = add i32 %tmp51, %arg22 - %tmp53 = add i32 %tmp52, %arg23 - %tmp54 = add i32 %tmp53, %arg24 - %tmp55 = add i32 %tmp54, %arg25 - %tmp56 = add i32 %tmp55, %arg26 - %tmp57 = add i32 %tmp56, %arg27 - %tmp58 = add i32 %tmp57, %arg28 - %tmp59 = add i32 %tmp58, %arg29 - %tmp60 = add i32 %tmp59, %arg30 - %tmp61 = add i32 %tmp60, %arg31 - %tmp62 = add i32 %tmp61, %arg32 - ret i32 %tmp62 -} diff --git a/llvm/test/CodeGen/AMDGPU/spill-agpr.ll b/llvm/test/CodeGen/AMDGPU/spill-agpr.ll --- a/llvm/test/CodeGen/AMDGPU/spill-agpr.ll +++ b/llvm/test/CodeGen/AMDGPU/spill-agpr.ll @@ -6,8 +6,8 @@ ; A2M-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1 ; A2V-NOT: SCRATCH_RSRC ; GFX908-DAG: v_accvgpr_read_b32 v[[VSPILL:[0-9]+]], a0 -; A2M: buffer_store_dword v[[VSPILL]], off, s[{{[0-9:]+}}], s{{[0-9]+}} offset:[[FI:[0-9]+]] ; 4-byte Folded Spill -; A2M: buffer_load_dword v[[VSPILL:[0-9]+]], off, s[{{[0-9:]+}}], s{{[0-9]+}} offset:[[FI]] ; 4-byte Folded Reload +; A2M: buffer_store_dword v[[VSPILL]], off, s[{{[0-9:]+}}], 0 offset:[[FI:[0-9]+]] ; 4-byte Folded Spill +; A2M: buffer_load_dword v[[VSPILL:[0-9]+]], off, s[{{[0-9:]+}}], 0 offset:[[FI]] ; 4-byte Folded Reload ; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, v[[VSPILL]] ; A2V: ScratchSize: 0 define amdgpu_kernel void @max_24regs_32a_used(<16 x float> addrspace(1)* %arg, float addrspace(1)* %out) #0 { @@ -35,8 +35,8 @@ ; A2M-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1 ; A2V-NOT: SCRATCH_RSRC ; GFX908-DAG: v_accvgpr_read_b32 v[[VSPILL:[0-9]+]], a4 -; A2M: buffer_store_dword v[[VSPILL]], off, s[{{[0-9:]+}}], s{{[0-9]+}} offset:[[FI:[0-9]+]] ; 4-byte Folded Spill -; A2M: buffer_load_dword v[[VSPILL:[0-9]+]], off, s[{{[0-9:]+}}], s{{[0-9]+}} offset:[[FI]] ; 4-byte Folded Reload +; A2M: buffer_store_dword v[[VSPILL]], off, s[{{[0-9:]+}}], 0 offset:[[FI:[0-9]+]] ; 4-byte Folded Spill +; A2M: buffer_load_dword v[[VSPILL:[0-9]+]], off, s[{{[0-9:]+}}], 0 offset:[[FI]] ; 4-byte Folded Reload ; A2V: v_accvgpr_write_b32 a{{[0-9]+}}, v[[VSPILL]] ; A2V: ScratchSize: 0 define amdgpu_kernel void @max_12regs_13a_used(<4 x float> addrspace(1)* %arg, <4 x float> addrspace(1)* %out) #2 { @@ -64,8 +64,8 @@ ; A2M-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1 ; A2V-NOT: SCRATCH_RSRC ; GFX908-DAG: v_accvgpr_read_b32 v[[VSPILL:[0-9]+]], a0 -; A2M: buffer_store_dword v[[VSPILL]], off, s[{{[0-9:]+}}], s{{[0-9]+}} offset:[[FI:[0-9]+]] ; 4-byte Folded Spill -; A2M: buffer_load_dword v[[VSPILL:[0-9]+]], off, s[{{[0-9:]+}}], s{{[0-9]+}} offset:[[FI]] ; 4-byte Folded Reload +; A2M: buffer_store_dword v[[VSPILL]], off, s[{{[0-9:]+}}], 0 offset:[[FI:[0-9]+]] ; 4-byte Folded Spill +; A2M: buffer_load_dword v[[VSPILL:[0-9]+]], off, s[{{[0-9:]+}}], 0 offset:[[FI]] ; 4-byte Folded Reload ; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, v[[VSPILL]] ; A2V: ScratchSize: 0 define amdgpu_kernel void @max_10_vgprs_used_9a(i32 addrspace(1)* %p) #1 { @@ -80,8 +80,8 @@ ; A2M-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1 ; A2V-NOT: SCRATCH_RSRC ; GFX908-DAG: v_accvgpr_read_b32 v[[VSPILL:[0-9]+]], a0 -; A2M: buffer_store_dword v[[VSPILL]], off, s[{{[0-9:]+}}], s{{[0-9]+}} offset:[[FI:[0-9]+]] ; 4-byte Folded Spill -; A2M: buffer_load_dword v[[VSPILL:[0-9]+]], off, s[{{[0-9:]+}}], s{{[0-9]+}} offset:[[FI]] ; 4-byte Folded Reload +; A2M: buffer_store_dword v[[VSPILL]], off, 
s[{{[0-9:]+}}], 0 offset:[[FI:[0-9]+]] ; 4-byte Folded Spill +; A2M: buffer_load_dword v[[VSPILL:[0-9]+]], off, s[{{[0-9:]+}}], 0 offset:[[FI]] ; 4-byte Folded Reload ; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, v[[VSPILL]] ; A2V: ScratchSize: 0 define amdgpu_kernel void @max_32regs_mfma32(float addrspace(1)* %arg) #3 { diff --git a/llvm/test/CodeGen/AMDGPU/spill-before-exec.mir b/llvm/test/CodeGen/AMDGPU/spill-before-exec.mir --- a/llvm/test/CodeGen/AMDGPU/spill-before-exec.mir +++ b/llvm/test/CodeGen/AMDGPU/spill-before-exec.mir @@ -10,7 +10,6 @@ tracksRegLiveness: true machineFunctionInfo: scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3 - scratchWaveOffsetReg: $sgpr4 stackPtrOffsetReg: $sgpr32 registers: - { id: 0, class: sreg_64 } @@ -36,6 +35,10 @@ - { id: 1119, class: sgpr_128 } - { id: 1120, class: sgpr_128 } - { id: 1121, class: sgpr_128 } + - { id: 1122, class: sgpr_128 } + - { id: 1123, class: sgpr_128 } + - { id: 1124, class: sgpr_128 } + - { id: 1125, class: sgpr_128 } body: | bb.0: successors: %bb.1 @@ -63,6 +66,10 @@ %1119 = COPY %1100 %1120 = COPY %1100 %1121 = COPY %1100 + %1122 = COPY %1100 + %1123 = COPY %1100 + %1124 = COPY %1100 + %1125 = COPY %1100 S_BRANCH %bb.1 bb.1: @@ -97,6 +104,8 @@ S_CMP_EQ_U64 %1116.sub0_sub1, %1117.sub2_sub3, implicit-def $scc S_CMP_EQ_U64 %1118.sub0_sub1, %1119.sub2_sub3, implicit-def $scc S_CMP_EQ_U64 %1120.sub0_sub1, %1121.sub2_sub3, implicit-def $scc + S_CMP_EQ_U64 %1122.sub0_sub1, %1123.sub2_sub3, implicit-def $scc + S_CMP_EQ_U64 %1124.sub0_sub1, %1125.sub2_sub3, implicit-def $scc $vgpr0 = V_MOV_B32_e32 0, implicit $exec S_SETPC_B64_return undef $sgpr30_sgpr31, implicit %0, implicit $vgpr0 diff --git a/llvm/test/CodeGen/AMDGPU/spill-empty-live-interval.mir b/llvm/test/CodeGen/AMDGPU/spill-empty-live-interval.mir --- a/llvm/test/CodeGen/AMDGPU/spill-empty-live-interval.mir +++ b/llvm/test/CodeGen/AMDGPU/spill-empty-live-interval.mir @@ -21,7 +21,6 @@ tracksRegLiveness: true machineFunctionInfo: scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3 - scratchWaveOffsetReg: $sgpr4 stackPtrOffsetReg: $sgpr32 body: | bb.0: @@ -55,7 +54,6 @@ tracksRegLiveness: true machineFunctionInfo: scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3 - scratchWaveOffsetReg: $sgpr4 stackPtrOffsetReg: $sgpr32 body: | bb.0: diff --git a/llvm/test/CodeGen/AMDGPU/spill-m0.ll b/llvm/test/CodeGen/AMDGPU/spill-m0.ll --- a/llvm/test/CodeGen/AMDGPU/spill-m0.ll +++ b/llvm/test/CodeGen/AMDGPU/spill-m0.ll @@ -14,7 +14,7 @@ ; TOVMEM-DAG: s_mov_b32 [[M0_COPY:s[0-9]+]], m0 ; TOVMEM-DAG: v_mov_b32_e32 [[SPILL_VREG:v[0-9]+]], [[M0_COPY]] -; TOVMEM: buffer_store_dword [[SPILL_VREG]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:12 ; 4-byte Folded Spill +; TOVMEM: buffer_store_dword [[SPILL_VREG]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:12 ; 4-byte Folded Spill ; GCN: s_cbranch_scc1 [[ENDIF:BB[0-9]+_[0-9]+]] @@ -22,7 +22,7 @@ ; TOVGPR: v_readlane_b32 [[M0_RESTORE:s[0-9]+]], [[SPILL_VREG]], 2 ; TOVGPR: s_mov_b32 m0, [[M0_RESTORE]] -; TOVMEM: buffer_load_dword [[RELOAD_VREG:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:12 ; 4-byte Folded Reload +; TOVMEM: buffer_load_dword [[RELOAD_VREG:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:12 ; 4-byte Folded Reload ; TOVMEM: s_waitcnt vmcnt(0) ; TOVMEM: v_readfirstlane_b32 [[M0_RESTORE:s[0-9]+]], [[RELOAD_VREG]] ; TOVMEM: s_mov_b32 m0, [[M0_RESTORE]] diff --git a/llvm/test/CodeGen/AMDGPU/spill-offset-calculation.ll b/llvm/test/CodeGen/AMDGPU/spill-offset-calculation.ll --- a/llvm/test/CodeGen/AMDGPU/spill-offset-calculation.ll +++ 
b/llvm/test/CodeGen/AMDGPU/spill-offset-calculation.ll @@ -13,7 +13,7 @@ %buf = bitcast i8 addrspace(5)* %alloca to i32 addrspace(5)* %aptr = getelementptr i32, i32 addrspace(5)* %buf, i32 1 - ; CHECK: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:4092 ; 4-byte Folded Spill + ; CHECK: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], 0 offset:4092 ; 4-byte Folded Spill %a = load volatile i32, i32 addrspace(5)* %aptr ; Force %a to spill. @@ -35,7 +35,7 @@ %aptr = getelementptr i32, i32 addrspace(5)* %buf, i32 1 ; 0x40000 / 64 = 4096 (for wave64) - ; CHECK: s_add_u32 s6, s7, 0x40000 + ; CHECK: s_mov_b32 s6, 0x40000 ; CHECK: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s6 ; 4-byte Folded Spill %a = load volatile i32, i32 addrspace(5)* %aptr @@ -48,39 +48,8 @@ ret void } -; CHECK-LABEL: test_sgpr_offset_kernel_scavenge_fail -define amdgpu_kernel void @test_sgpr_offset_kernel_scavenge_fail() #1 { -entry: - ; Occupy 4096 bytes of scratch, so the offset of the spill of %a does not - ; fit in the instruction, and has to live in the SGPR offset. - %alloca = alloca i8, i32 4092, align 4, addrspace(5) - %buf = bitcast i8 addrspace(5)* %alloca to i32 addrspace(5)* - - %aptr = getelementptr i32, i32 addrspace(5)* %buf, i32 1 - - ; 0x40000 / 64 = 4096 (for wave64) - %a = load volatile i32, i32 addrspace(5)* %aptr - - %asm = call { i32, i32, i32, i32, i32, i32, i32, i32 } asm sideeffect "", "=s,=s,=s,=s,=s,=s,=s,=s"() - %asm0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 0 - %asm1 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 1 - %asm2 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 2 - %asm3 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 3 - %asm4 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 4 - %asm5 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 5 - %asm6 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 6 - %asm7 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 7 - - call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}"() #0 - - ; CHECK: s_add_u32 s7, s7, 0x40000 - ; CHECK: buffer_load_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s7 ; 4-byte Folded Reload - ; CHECK: s_sub_u32 s7, s7, 0x40000 - - ; Force %a to spill with no free SGPRs - call void asm sideeffect "", "s,s,s,s,s,s,s,s,v"(i32 %asm0, i32 %asm1, i32 %asm2, i32 %asm3, i32 %asm4, i32 %asm5, i32 %asm6, i32 %asm7, i32 %a) - ret void -} +; FIXME: If we fail to scavenge an SGPR in a kernel, we don't have a stack +; pointer to temporarily update, so we just crash.
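; Editorial sketch (not part of the original patch): the offset arithmetic in
; this file assumes the usual wave64 scratch swizzling, where the MUBUF SGPR
; soffset counts bytes across the whole wave, so soffset = per-lane bytes * 64:
;   4096 * 64 = 0x40000  ->  s_mov_b32 s6, 0x40000
;   4092 * 64 = 0x3ff00  ->  s_mov_b32 s6, 0x3ff00
; In a kernel the base is now a literal 0 rather than the wave offset in s7,
; which is why the scavenged SGPR is set with s_mov_b32 instead of being
; formed with s_add_u32.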
; CHECK-LABEL: test_sgpr_offset_function_scavenge_fail define void @test_sgpr_offset_function_scavenge_fail() #2 { @@ -141,8 +110,8 @@ %bufv1 = bitcast i8 addrspace(5)* %alloca to i32 addrspace(5)* %bufv2 = bitcast i8 addrspace(5)* %alloca to <2 x i32> addrspace(5)* - ; CHECK: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:4088 ; 4-byte Folded Spill - ; CHECK: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:4092 ; 4-byte Folded Spill + ; CHECK: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], 0 offset:4088 ; 4-byte Folded Spill + ; CHECK: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], 0 offset:4092 ; 4-byte Folded Spill %aptr = getelementptr <2 x i32>, <2 x i32> addrspace(5)* %bufv2, i32 1 %a = load volatile <2 x i32>, <2 x i32> addrspace(5)* %aptr @@ -170,7 +139,7 @@ %bufv2 = bitcast i8 addrspace(5)* %alloca to <2 x i32> addrspace(5)* ; 0x3ff00 / 64 = 4092 (for wave64) - ; CHECK: s_add_u32 s6, s7, 0x3ff00 + ; CHECK: s_mov_b32 s6, 0x3ff00 ; CHECK: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s6 ; 4-byte Folded Spill ; CHECK: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s6 offset:4 ; 4-byte Folded Spill %aptr = getelementptr <2 x i32>, <2 x i32> addrspace(5)* %bufv2, i32 1 diff --git a/llvm/test/CodeGen/AMDGPU/stack-pointer-offset-relative-frameindex.ll b/llvm/test/CodeGen/AMDGPU/stack-pointer-offset-relative-frameindex.ll --- a/llvm/test/CodeGen/AMDGPU/stack-pointer-offset-relative-frameindex.ll +++ b/llvm/test/CodeGen/AMDGPU/stack-pointer-offset-relative-frameindex.ll @@ -10,14 +10,15 @@ ; GCN-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GCN-NEXT: s_mov_b32 s38, -1 ; GCN-NEXT: s_mov_b32 s39, 0x31c16000 -; GCN-NEXT: s_mov_b32 s33, s3 -; GCN-NEXT: s_mov_b64 s[0:1], s[36:37] +; GCN-NEXT: s_add_u32 s36, s36, s3 +; GCN-NEXT: s_addc_u32 s37, s37, 0 ; GCN-NEXT: v_mov_b32_e32 v1, 0x2000 ; GCN-NEXT: v_mov_b32_e32 v2, 0x4000 ; GCN-NEXT: v_mov_b32_e32 v3, 0 -; GCN-NEXT: s_mov_b64 s[2:3], s[38:39] ; GCN-NEXT: v_mov_b32_e32 v4, 0x400000 -; GCN-NEXT: s_add_u32 s32, s33, 0xc0000 +; GCN-NEXT: s_mov_b64 s[0:1], s[36:37] +; GCN-NEXT: s_mov_b64 s[2:3], s[38:39] +; GCN-NEXT: s_mov_b32 s32, 0xc0000 ; GCN-NEXT: v_add_nc_u32_e64 v32, 4, 0x4000 ; GCN-NEXT: ; implicit-def: $vcc_hi ; GCN-NEXT: s_getpc_b64 s[4:5] @@ -30,13 +31,13 @@ ; GCN-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GCN-NEXT: s_cbranch_execz BB0_2 ; GCN-NEXT: ; %bb.1: ; %if.then4.i -; GCN-NEXT: buffer_load_dword v0, v32, s[36:39], s32 offen -; GCN-NEXT: buffer_load_dword v1, v32, s[36:39], s32 offen offset:4 +; GCN-NEXT: buffer_load_dword v0, v32, s[36:39], 0 offen +; GCN-NEXT: buffer_load_dword v1, v32, s[36:39], 0 offen offset:4 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_add_nc_u32_e32 v0, v1, v0 ; GCN-NEXT: v_mul_lo_u32 v0, 0x41c64e6d, v0 ; GCN-NEXT: v_add_nc_u32_e32 v0, 0x3039, v0 -; GCN-NEXT: buffer_store_dword v0, v0, s[36:39], s33 offen +; GCN-NEXT: buffer_store_dword v0, v0, s[36:39], 0 offen ; GCN-NEXT: BB0_2: ; %shader_eval_surface.exit ; GCN-NEXT: s_endpgm entry: diff --git a/llvm/test/CodeGen/AMDGPU/stack-realign-kernel.ll b/llvm/test/CodeGen/AMDGPU/stack-realign-kernel.ll --- a/llvm/test/CodeGen/AMDGPU/stack-realign-kernel.ll +++ b/llvm/test/CodeGen/AMDGPU/stack-realign-kernel.ll @@ -7,10 +7,12 @@ ; VI-LABEL: max_alignment_128: ; VI: ; %bb.0: ; VI-NEXT: s_add_u32 s4, s4, s7 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 +; VI-NEXT: s_add_u32 s0, s0, s7 +; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v0, 9 ; VI-NEXT: 
s_mov_b32 flat_scratch_lo, s5 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s7 offset:128 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:128 ; VI-NEXT: s_endpgm ; VI-NEXT: .section .rodata,#alloc ; VI-NEXT: .p2align 6 @@ -52,9 +54,11 @@ ; GFX9-LABEL: max_alignment_128: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_add_u32 flat_scratch_lo, s4, s7 -; GFX9-NEXT: v_mov_b32_e32 v0, 9 ; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s5, 0 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s7 offset:128 +; GFX9-NEXT: s_add_u32 s0, s0, s7 +; GFX9-NEXT: s_addc_u32 s1, s1, 0 +; GFX9-NEXT: v_mov_b32_e32 v0, 9 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:128 ; GFX9-NEXT: s_endpgm ; GFX9-NEXT: .section .rodata,#alloc ; GFX9-NEXT: .p2align 6 @@ -102,10 +106,12 @@ ; VI-LABEL: stackrealign_attr: ; VI: ; %bb.0: ; VI-NEXT: s_add_u32 s4, s4, s7 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 +; VI-NEXT: s_add_u32 s0, s0, s7 +; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v0, 9 ; VI-NEXT: s_mov_b32 flat_scratch_lo, s5 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s7 offset:4 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4 ; VI-NEXT: s_endpgm ; VI-NEXT: .section .rodata,#alloc ; VI-NEXT: .p2align 6 @@ -147,9 +153,11 @@ ; GFX9-LABEL: stackrealign_attr: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_add_u32 flat_scratch_lo, s4, s7 -; GFX9-NEXT: v_mov_b32_e32 v0, 9 ; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s5, 0 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s7 offset:4 +; GFX9-NEXT: s_add_u32 s0, s0, s7 +; GFX9-NEXT: s_addc_u32 s1, s1, 0 +; GFX9-NEXT: v_mov_b32_e32 v0, 9 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4 ; GFX9-NEXT: s_endpgm ; GFX9-NEXT: .section .rodata,#alloc ; GFX9-NEXT: .p2align 6 @@ -197,10 +205,12 @@ ; VI-LABEL: alignstack_attr: ; VI: ; %bb.0: ; VI-NEXT: s_add_u32 s4, s4, s7 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 +; VI-NEXT: s_add_u32 s0, s0, s7 +; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v0, 9 ; VI-NEXT: s_mov_b32 flat_scratch_lo, s5 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s7 offset:4 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4 ; VI-NEXT: s_endpgm ; VI-NEXT: .section .rodata,#alloc ; VI-NEXT: .p2align 6 @@ -242,9 +252,11 @@ ; GFX9-LABEL: alignstack_attr: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_add_u32 flat_scratch_lo, s4, s7 -; GFX9-NEXT: v_mov_b32_e32 v0, 9 ; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s5, 0 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s7 offset:4 +; GFX9-NEXT: s_add_u32 s0, s0, s7 +; GFX9-NEXT: s_addc_u32 s1, s1, 0 +; GFX9-NEXT: v_mov_b32_e32 v0, 9 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4 ; GFX9-NEXT: s_endpgm ; GFX9-NEXT: .section .rodata,#alloc ; GFX9-NEXT: .p2align 6 diff --git a/llvm/test/CodeGen/AMDGPU/stack-realign.ll b/llvm/test/CodeGen/AMDGPU/stack-realign.ll --- a/llvm/test/CodeGen/AMDGPU/stack-realign.ll +++ b/llvm/test/CodeGen/AMDGPU/stack-realign.ll @@ -9,18 +9,17 @@ ; = 144 bytes with padding between them ; GCN-LABEL: {{^}}needs_align16_default_stack_align: -; GCN: s_sub_u32 [[SUB:s[0-9]+]], s32, s33 ; GCN-DAG: v_lshlrev_b32_e32 [[SCALED_IDX:v[0-9]+]], 4, v0 -; GCN-DAG: v_lshrrev_b32_e64 [[FRAMEDIFF:v[0-9]+]], 6, [[SUB]] +; GCN-DAG: v_lshrrev_b32_e64 [[FRAMEDIFF:v[0-9]+]], 6, s32 ; GCN: v_add_u32_e32 [[FI:v[0-9]+]], vcc, [[FRAMEDIFF]], [[SCALED_IDX]] ; GCN-NOT: s32 -; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], s33 offen +; GCN: 
buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 offen ; GCN: v_or_b32_e32 v{{[0-9]+}}, 12 -; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], s33 offen -; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], s33 offen -; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], s33 offen +; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 offen +; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 offen +; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 offen ; GCN-NOT: s32 @@ -35,13 +34,13 @@ ; GCN-LABEL: {{^}}needs_align16_stack_align4: ; GCN: s_add_u32 [[SCRATCH_REG:s[0-9]+]], s32, 0x3c0{{$}} ; GCN: s_and_b32 s34, [[SCRATCH_REG]], 0xfffffc00 -; GCN: s_add_u32 s32, s32, 0x2800{{$}} -; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], s33 offen +; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 offen ; GCN: v_or_b32_e32 v{{[0-9]+}}, 12 -; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], s33 offen -; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], s33 offen -; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], s33 offen +; GCN: s_add_u32 s32, s32, 0x2800{{$}} +; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 offen +; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 offen +; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 offen ; GCN: s_sub_u32 s32, s32, 0x2800 @@ -56,13 +55,13 @@ ; GCN-LABEL: {{^}}needs_align32: ; GCN: s_add_u32 [[SCRATCH_REG:s[0-9]+]], s32, 0x7c0{{$}} ; GCN: s_and_b32 s34, [[SCRATCH_REG]], 0xfffff800 -; GCN: s_add_u32 s32, s32, 0x3000{{$}} -; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], s33 offen +; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 offen ; GCN: v_or_b32_e32 v{{[0-9]+}}, 12 -; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], s33 offen -; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], s33 offen -; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], s33 offen +; GCN: s_add_u32 s32, s32, 0x3000{{$}} +; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 offen +; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 offen +; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 offen ; GCN: s_sub_u32 s32, s32, 0x3000 @@ -79,7 +78,7 @@ ; GCN: s_and_b32 s34, [[SCRATCH_REG]], 0xffffff00 ; GCN: s_add_u32 s32, s32, 0xd00{{$}} -; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], s33 offen +; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 offen ; GCN: s_sub_u32 s32, s32, 0xd00 ; GCN: ; ScratchSize: 52 @@ -91,8 +90,7 @@ } ; GCN-LABEL: {{^}}kernel_call_align16_from_8: -; GCN: s_mov_b32 s33, s7{{$}} -; GCN-NEXT: s_add_u32 s32, s33, 0x400{{$}} +; GCN: s_movk_i32 s32, 0x400{{$}} ; GCN-NOT: s32 ; GCN: s_swappc_b64 define amdgpu_kernel void @kernel_call_align16_from_8() #0 { @@ -104,8 +102,7 @@ ; The call sequence should keep the stack on call aligned to 4 ; GCN-LABEL: {{^}}kernel_call_align16_from_5: -; GCN: s_mov_b32 s33, s7{{$}} -; GCN-NEXT: s_add_u32 s32, s33, 0x400 +; GCN: s_movk_i32 s32, 0x400 ; GCN: s_swappc_b64 define amdgpu_kernel void @kernel_call_align16_from_5() { %alloca0 = alloca i8, align 1, addrspace(5) @@ -116,8 +113,7 @@ } ; GCN-LABEL: {{^}}kernel_call_align4_from_5: -; GCN: s_mov_b32 s33, s7{{$}} -; GCN: s_add_u32 s32, s33, 0x400 +; GCN: s_movk_i32 s32, 0x400 ; GCN: s_swappc_b64 define amdgpu_kernel void @kernel_call_align4_from_5() { %alloca0 = alloca i8, align 1, addrspace(5) diff --git 
a/llvm/test/CodeGen/AMDGPU/stack-slot-color-sgpr-vgpr-spills.mir b/llvm/test/CodeGen/AMDGPU/stack-slot-color-sgpr-vgpr-spills.mir --- a/llvm/test/CodeGen/AMDGPU/stack-slot-color-sgpr-vgpr-spills.mir +++ b/llvm/test/CodeGen/AMDGPU/stack-slot-color-sgpr-vgpr-spills.mir @@ -12,15 +12,14 @@ # CHECK: SI_SPILL_V32_SAVE killed $vgpr0, %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, implicit $exec :: (store 4 into %stack.0, addrspace 5) # CHECK: $vgpr0 = SI_SPILL_V32_RESTORE %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, implicit $exec :: (load 4 from %stack.0, addrspace 5) -# CHECK: SI_SPILL_S32_SAVE killed renamable $sgpr6, %stack.1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 :: (store 4 into %stack.1, addrspace 5) -# CHECK: $sgpr6 = SI_SPILL_S32_RESTORE %stack.1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 :: (load 4 from %stack.1, addrspace 5) +# CHECK: SI_SPILL_S32_SAVE killed renamable $sgpr5, %stack.1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 :: (store 4 into %stack.1, addrspace 5) +# CHECK: $sgpr5 = SI_SPILL_S32_RESTORE %stack.1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 :: (load 4 from %stack.1, addrspace 5) name: no_merge_sgpr_vgpr_spill_slot tracksRegLiveness: true machineFunctionInfo: scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3 - scratchWaveOffsetReg: $sgpr4 - frameOffsetReg: $sgpr5 + frameOffsetReg: $sgpr4 stackPtrOffsetReg: $sgpr32 body: | bb.0: diff --git a/llvm/test/CodeGen/AMDGPU/store-hi16.ll b/llvm/test/CodeGen/AMDGPU/store-hi16.ll --- a/llvm/test/CodeGen/AMDGPU/store-hi16.ll +++ b/llvm/test/CodeGen/AMDGPU/store-hi16.ll @@ -389,10 +389,10 @@ ; GCN-LABEL: {{^}}store_private_hi_v2i16: ; GCN: s_waitcnt -; GFX900-NEXT: buffer_store_short_d16_hi v1, v0, s[0:3], s33 offen{{$}} +; GFX900-NEXT: buffer_store_short_d16_hi v1, v0, s[0:3], 0 offen{{$}} ; NO-D16-HI: v_lshrrev_b32_e32 v1, 16, v1 -; NO-D16-HI: buffer_store_short v1, v0, s[0:3], s33 offen{{$}} +; NO-D16-HI: buffer_store_short v1, v0, s[0:3], 0 offen{{$}} ; GCN-NEXT: s_waitcnt ; GCN-NEXT: s_setpc_b64 @@ -408,10 +408,10 @@ ; GCN-LABEL: {{^}}store_private_hi_v2f16: ; GCN: s_waitcnt -; GFX900-NEXT: buffer_store_short_d16_hi v1, v0, s[0:3], s33 offen{{$}} +; GFX900-NEXT: buffer_store_short_d16_hi v1, v0, s[0:3], 0 offen{{$}} ; NO-D16-HI: v_lshrrev_b32_e32 v1, 16, v1 -; NO-D16-HI: buffer_store_short v1, v0, s[0:3], s33 offen{{$}} +; NO-D16-HI: buffer_store_short v1, v0, s[0:3], 0 offen{{$}} ; GCN-NEXT: s_waitcnt ; GCN-NEXT: s_setpc_b64 @@ -427,10 +427,10 @@ ; GCN-LABEL: {{^}}store_private_hi_i32_shift: ; GCN: s_waitcnt -; GFX900-NEXT: buffer_store_short_d16_hi v1, v0, s[0:3], s33 offen{{$}} +; GFX900-NEXT: buffer_store_short_d16_hi v1, v0, s[0:3], 0 offen{{$}} ; NO-D16-HI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; NO-D16-HI-NEXT: buffer_store_short v1, v0, s[0:3], s33 offen{{$}} +; NO-D16-HI-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen{{$}} ; GCN-NEXT: s_waitcnt ; GCN-NEXT: s_setpc_b64 @@ -445,10 +445,10 @@ ; GCN-LABEL: {{^}}store_private_hi_v2i16_i8: ; GCN: s_waitcnt -; GFX900-NEXT: buffer_store_byte_d16_hi v1, v0, s[0:3], s33 offen{{$}} +; GFX900-NEXT: buffer_store_byte_d16_hi v1, v0, s[0:3], 0 offen{{$}} ; NO-D16-HI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; NO-D16-HI-NEXT: buffer_store_byte v1, v0, s[0:3], s33 offen{{$}} +; NO-D16-HI-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen{{$}} ; GCN-NEXT: s_waitcnt ; GCN-NEXT: s_setpc_b64 @@ -464,10 +464,10 @@ ; GCN-LABEL: {{^}}store_private_hi_i8_shift: ; GCN: s_waitcnt -; GFX900-NEXT: 
buffer_store_byte_d16_hi v1, v0, s[0:3], s33 offen{{$}} +; GFX900-NEXT: buffer_store_byte_d16_hi v1, v0, s[0:3], 0 offen{{$}} ; NO-D16-HI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; NO-D16-HI-NEXT: buffer_store_byte v1, v0, s[0:3], s33 offen{{$}} +; NO-D16-HI-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen{{$}} ; GCN-NEXT: s_waitcnt ; GCN-NEXT: s_setpc_b64 @@ -502,10 +502,10 @@ ; GCN-LABEL: {{^}}store_private_hi_v2i16_nooff: ; GCN: s_waitcnt -; GFX900-NEXT: buffer_store_short_d16_hi v0, off, s[0:3], s33{{$}} +; GFX900-NEXT: buffer_store_short_d16_hi v0, off, s[0:3], 0{{$}} ; NO-D16-HI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; NO-D16-HI-NEXT: buffer_store_short v0, off, s[0:3], s33{{$}} +; NO-D16-HI-NEXT: buffer_store_short v0, off, s[0:3], 0{{$}} ; GCN-NEXT: s_waitcnt ; GCN-NEXT: s_setpc_b64 @@ -522,10 +522,10 @@ ; GCN-LABEL: {{^}}store_private_hi_v2i16_i8_nooff: ; GCN: s_waitcnt -; GFX900-NEXT: buffer_store_byte_d16_hi v0, off, s[0:3], s33{{$}} +; GFX900-NEXT: buffer_store_byte_d16_hi v0, off, s[0:3], 0{{$}} ; NO-D16-HI: v_lshrrev_b32_e32 v0, 16, v0 -; NO-D16-HI: buffer_store_byte v0, off, s[0:3], s33{{$}} +; NO-D16-HI: buffer_store_byte v0, off, s[0:3], 0{{$}} ; GCN-NEXT: s_waitcnt ; GCN-NEXT: s_setpc_b64 diff --git a/llvm/test/CodeGen/AMDGPU/subreg-split-live-in-error.mir b/llvm/test/CodeGen/AMDGPU/subreg-split-live-in-error.mir --- a/llvm/test/CodeGen/AMDGPU/subreg-split-live-in-error.mir +++ b/llvm/test/CodeGen/AMDGPU/subreg-split-live-in-error.mir @@ -41,7 +41,6 @@ tracksRegLiveness: true machineFunctionInfo: scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3 - scratchWaveOffsetReg: $sgpr4 stackPtrOffsetReg: $sgpr32 liveins: - { reg: '$vgpr2', virtual-reg: '%0' } diff --git a/llvm/test/CodeGen/AMDGPU/subvector-test.mir b/llvm/test/CodeGen/AMDGPU/subvector-test.mir --- a/llvm/test/CodeGen/AMDGPU/subvector-test.mir +++ b/llvm/test/CodeGen/AMDGPU/subvector-test.mir @@ -7,7 +7,6 @@ tracksRegLiveness: true machineFunctionInfo: scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3 - scratchWaveOffsetReg: $sgpr4 frameOffsetReg: $sgpr5 stackPtrOffsetReg: $sgpr32 body: | diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll b/llvm/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll --- a/llvm/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll +++ b/llvm/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll @@ -22,8 +22,8 @@ ; GFX9-DAG: s_mov_b32 s[[DESC3:[0-9]+]], 0xe00000 ; OFFREG is offset system SGPR -; GCN: buffer_store_dword {{v[0-9]+}}, off, s{{\[}}[[DESC0]]:[[DESC3]]], s12 offset:{{[0-9]+}} ; 4-byte Folded Spill -; GCN: buffer_load_dword v{{[0-9]+}}, off, s{{\[}}[[DESC0]]:[[DESC3]]], s12 offset:{{[0-9]+}} ; 4-byte Folded Reload +; GCN: buffer_store_dword {{v[0-9]+}}, off, s{{\[}}[[DESC0]]:[[DESC3]]], 0 offset:{{[0-9]+}} ; 4-byte Folded Spill +; GCN: buffer_load_dword v{{[0-9]+}}, off, s{{\[}}[[DESC0]]:[[DESC3]]], 0 offset:{{[0-9]+}} ; 4-byte Folded Reload ; GCN: NumVgprs: 256 ; GCN: ScratchSize: 1536 diff --git a/llvm/test/CodeGen/AMDGPU/virtregrewrite-undef-identity-copy.mir b/llvm/test/CodeGen/AMDGPU/virtregrewrite-undef-identity-copy.mir --- a/llvm/test/CodeGen/AMDGPU/virtregrewrite-undef-identity-copy.mir +++ b/llvm/test/CodeGen/AMDGPU/virtregrewrite-undef-identity-copy.mir @@ -24,7 +24,6 @@ machineFunctionInfo: isEntryFunction: true scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' - scratchWaveOffsetReg: '$sgpr95' frameOffsetReg: '$sgpr95' stackPtrOffsetReg: '$sgpr32' body: | diff --git a/llvm/test/CodeGen/AMDGPU/wqm.ll b/llvm/test/CodeGen/AMDGPU/wqm.ll --- a/llvm/test/CodeGen/AMDGPU/wqm.ll 
+++ b/llvm/test/CodeGen/AMDGPU/wqm.ll @@ -693,11 +693,11 @@ ; CHECK: s_and_b64 exec, exec, [[LIVE]] ; CHECK: buffer_store_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 ; CHECK: s_wqm_b64 exec, exec -; CHECK: buffer_store_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offset:4{{$}} +; CHECK: buffer_store_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:4{{$}} ; CHECK: s_and_b64 exec, exec, [[LIVE]] ; CHECK: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 idxen ; CHECK: s_wqm_b64 exec, exec -; CHECK: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen +; CHECK: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen ; CHECK: s_and_b64 exec, exec, [[LIVE]] ; CHECK: image_sample diff --git a/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll b/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll --- a/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll +++ b/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll @@ -44,7 +44,7 @@ ; GFX9: v_mov_b32_dpp v[[FIRST_MOV:[0-9]+]], v{{[0-9]+}} row_bcast:31 row_mask:0xc bank_mask:0xf ; GFX9: v_add_u32_e32 v[[FIRST_ADD:[0-9]+]], v{{[0-9]+}}, v[[FIRST_MOV]] ; GFX9: v_mov_b32_e32 v[[FIRST:[0-9]+]], v[[FIRST_ADD]] -; GFX9-O0: buffer_store_dword v[[FIRST]], off, s{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}}, s[[FIRST_SGPR_OFFSET:[0-9]+]] offset:[[FIRST_IMM_OFFSET:[0-9]+]] +; GFX9-O0: buffer_store_dword v[[FIRST]], off, s{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}}, 0 offset:[[FIRST_IMM_OFFSET:[0-9]+]] %tmp120 = tail call i32 @llvm.amdgcn.update.dpp.i32(i32 0, i32 %tmp105, i32 323, i32 12, i32 15, i1 false) %tmp121 = add i32 %tmp105, %tmp120 %tmp122 = tail call i32 @llvm.amdgcn.wwm.i32(i32 %tmp121) @@ -58,7 +58,7 @@ ; GFX9: v_mov_b32_dpp v[[SECOND_MOV:[0-9]+]], v{{[0-9]+}} row_bcast:31 row_mask:0xc bank_mask:0xf ; GFX9: v_add_u32_e32 v[[SECOND_ADD:[0-9]+]], v{{[0-9]+}}, v[[SECOND_MOV]] ; GFX9: v_mov_b32_e32 v[[SECOND:[0-9]+]], v[[SECOND_ADD]] -; GFX9-O0: buffer_store_dword v[[SECOND]], off, s{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}}, s[[SECOND_SGPR_OFFSET:[0-9]+]] offset:[[SECOND_IMM_OFFSET:[0-9]+]] +; GFX9-O0: buffer_store_dword v[[SECOND]], off, s{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}}, 0 offset:[[SECOND_IMM_OFFSET:[0-9]+]] %tmp135 = tail call i32 @llvm.amdgcn.update.dpp.i32(i32 0, i32 %tmp107, i32 323, i32 12, i32 15, i1 false) %tmp136 = add i32 %tmp107, %tmp135 %tmp137 = tail call i32 @llvm.amdgcn.wwm.i32(i32 %tmp136) @@ -67,8 +67,8 @@ merge: %merge_value = phi i32 [ 0, %entry ], [%tmp137, %if ] ; GFX9-O3: v_cmp_eq_u32_e32 vcc, v[[FIRST]], v[[SECOND]] -; GFX9-O0: buffer_load_dword v[[SECOND:[0-9]+]], off, s{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}}, s[[SECOND_SGPR_OFFSET]] offset:[[SECOND_IMM_OFFSET]] -; GFX9-O0: buffer_load_dword v[[FIRST:[0-9]+]], off, s{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}}, s[[FIRST_SGPR_OFFSET]] offset:[[FIRST_IMM_OFFSET]] +; GFX9-O0: buffer_load_dword v[[SECOND:[0-9]+]], off, s{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}}, 0 offset:[[SECOND_IMM_OFFSET]] +; GFX9-O0: buffer_load_dword v[[FIRST:[0-9]+]], off, s{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}}, 0 offset:[[FIRST_IMM_OFFSET]] ; GFX9-O0: v_cmp_eq_u32_e64 s{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}}, v[[FIRST]], v[[SECOND]] %tmp138 = icmp eq i32 %tmp122, %merge_value %tmp139 = sext i1 %tmp138 to i32 diff --git a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-no-ir.mir b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-no-ir.mir --- a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-no-ir.mir +++ b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-no-ir.mir @@ -13,7 +13,6 @@ # FULL-NEXT: 
memoryBound: true # FULL-NEXT: waveLimiter: true # FULL-NEXT: scratchRSrcReg: '$sgpr8_sgpr9_sgpr10_sgpr11' -# FULL-NEXT: scratchWaveOffsetReg: '$sgpr12' # FULL-NEXT: frameOffsetReg: '$sgpr12' # FULL-NEXT: stackPtrOffsetReg: '$sgpr13' # FULL-NEXT: argumentInfo: @@ -40,7 +39,6 @@ # SIMPLE-NEXT: memoryBound: true # SIMPLE-NEXT: waveLimiter: true # SIMPLE-NEXT: scratchRSrcReg: '$sgpr8_sgpr9_sgpr10_sgpr11' -# SIMPLE-NEXT: scratchWaveOffsetReg: '$sgpr12' # SIMPLE-NEXT: frameOffsetReg: '$sgpr12' # SIMPLE-NEXT: stackPtrOffsetReg: '$sgpr13' # SIMPLE-NEXT: argumentInfo: @@ -60,7 +58,6 @@ memoryBound: true waveLimiter: true scratchRSrcReg: '$sgpr8_sgpr9_sgpr10_sgpr11' - scratchWaveOffsetReg: '$sgpr12' frameOffsetReg: '$sgpr12' stackPtrOffsetReg: '$sgpr13' argumentInfo: @@ -87,12 +84,10 @@ # FULL-NEXT: memoryBound: false # FULL-NEXT: waveLimiter: false # FULL-NEXT: scratchRSrcReg: '$private_rsrc_reg' -# FULL-NEXT: scratchWaveOffsetReg: '$scratch_wave_offset_reg' # FULL-NEXT: frameOffsetReg: '$fp_reg' # FULL-NEXT: stackPtrOffsetReg: '$sp_reg' # FULL-NEXT: argumentInfo: # FULL-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } -# FULL-NEXT: privateSegmentWaveByteOffset: { reg: '$sgpr33' } # FULL-NEXT: mode: # FULL-NEXT: ieee: true # FULL-NEXT: dx10-clamp: true @@ -107,7 +102,6 @@ # SIMPLE-NEXT: maxKernArgAlign: 1 # SIMPLE-NEXT: argumentInfo: # SIMPLE-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } -# SIMPLE-NEXT: privateSegmentWaveByteOffset: { reg: '$sgpr33' } # SIMPLE-NEXT: body: name: no_mfi @@ -128,12 +122,10 @@ # FULL-NEXT: memoryBound: false # FULL-NEXT: waveLimiter: false # FULL-NEXT: scratchRSrcReg: '$private_rsrc_reg' -# FULL-NEXT: scratchWaveOffsetReg: '$scratch_wave_offset_reg' # FULL-NEXT: frameOffsetReg: '$fp_reg' # FULL-NEXT: stackPtrOffsetReg: '$sp_reg' # FULL-NEXT: argumentInfo: # FULL-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } -# FULL-NEXT: privateSegmentWaveByteOffset: { reg: '$sgpr33' } # FULL-NEXT: mode: # FULL-NEXT: ieee: true # FULL-NEXT: dx10-clamp: true @@ -148,7 +140,6 @@ # SIMPLE-NEXT: maxKernArgAlign: 1 # SIMPLE-NEXT: argumentInfo: # SIMPLE-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } -# SIMPLE-NEXT: privateSegmentWaveByteOffset: { reg: '$sgpr33' } # SIMPLE-NEXT: body: name: empty_mfi @@ -170,12 +161,10 @@ # FULL-NEXT: memoryBound: false # FULL-NEXT: waveLimiter: false # FULL-NEXT: scratchRSrcReg: '$private_rsrc_reg' -# FULL-NEXT: scratchWaveOffsetReg: '$scratch_wave_offset_reg' # FULL-NEXT: frameOffsetReg: '$fp_reg' # FULL-NEXT: stackPtrOffsetReg: '$sp_reg' # FULL-NEXT: argumentInfo: # FULL-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } -# FULL-NEXT: privateSegmentWaveByteOffset: { reg: '$sgpr33' } # FULL-NEXT: mode: # FULL-NEXT: ieee: true # FULL-NEXT: dx10-clamp: true @@ -191,7 +180,6 @@ # SIMPLE-NEXT: isEntryFunction: true # SIMPLE-NEXT: argumentInfo: # SIMPLE-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } -# SIMPLE-NEXT: privateSegmentWaveByteOffset: { reg: '$sgpr33' } # SIMPLE-NEXT: body: name: empty_mfi_entry_func @@ -207,12 +195,10 @@ # ALL-LABEL: name: default_regs_mfi # FULL: scratchRSrcReg: '$private_rsrc_reg' -# FULL-NEXT: scratchWaveOffsetReg: '$scratch_wave_offset_reg' # FULL-NEXT: frameOffsetReg: '$fp_reg' # FULL-NEXT: stackPtrOffsetReg: '$sp_reg' # SIMPLE-NOT: scratchRSrcReg -# SIMPLE-NOT: scratchWaveOffsetReg # SIMPLE-NOT:: stackPtrOffsetReg name: default_regs_mfi machineFunctionInfo: @@ -230,13 +216,11 @@ # FULL: argumentInfo: # FULL-NEXT: 
privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } # FULL-NEXT: flatScratchInit: { offset: 4 } -# FULL-NEXT: privateSegmentWaveByteOffset: { reg: '$sgpr33' } # FULL-NEXT: workItemIDY: { reg: '$vgpr0', mask: 65280 } # SIMPLE: argumentInfo: # SIMPLE-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } # SIMPLE-NEXT: flatScratchInit: { offset: 4 } -# SIMPLE-NEXT: privateSegmentWaveByteOffset: { reg: '$sgpr33' } # SIMPLE-NEXT: workItemIDY: { reg: '$vgpr0', mask: 65280 } name: fake_stack_arginfo machineFunctionInfo: diff --git a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info.ll b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info.ll deleted file mode 100644 --- a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info.ll +++ /dev/null @@ -1,177 +0,0 @@ -; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -stop-after finalize-isel -o %t.mir %s -; RUN: llc -run-pass=none -verify-machineinstrs %t.mir -o - | FileCheck %s - -; Test that SIMachineFunctionInfo can be round trip serialized through -; MIR. - -@lds = addrspace(3) global [512 x float] undef, align 4 - -; CHECK-LABEL: {{^}}name: kernel -; CHECK: machineFunctionInfo: -; CHECK-NEXT: explicitKernArgSize: 128 -; CHECK-NEXT: maxKernArgAlign: 64 -; CHECK-NEXT: ldsSize: 0 -; CHECK-NEXT: isEntryFunction: true -; CHECK-NEXT: noSignedZerosFPMath: false -; CHECK-NEXT: memoryBound: false -; CHECK-NEXT: waveLimiter: false -; CHECK-NEXT: scratchRSrcReg: '$sgpr96_sgpr97_sgpr98_sgpr99' -; CHECK-NEXT: scratchWaveOffsetReg: '$sgpr101' -; CHECK-NEXT: frameOffsetReg: '$sgpr101' -; CHECK-NEXT: stackPtrOffsetReg: '$sgpr101' -; CHECK-NEXT: argumentInfo: -; CHECK-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } -; CHECK-NEXT: kernargSegmentPtr: { reg: '$sgpr4_sgpr5' } -; CHECK-NEXT: workGroupIDX: { reg: '$sgpr6' } -; CHECK-NEXT: privateSegmentWaveByteOffset: { reg: '$sgpr7' } -; CHECK-NEXT: workItemIDX: { reg: '$vgpr0' } -; CHECK-NEXT: mode: -; CHECK-NEXT: ieee: true -; CHECK-NEXT: dx10-clamp: true -; CHECK-NEXT: fp32-input-denormals: false -; CHECK-NEXT: fp32-output-denormals: false -; CHECK-NEXT: fp64-fp16-input-denormals: true -; CHECK-NEXT: fp64-fp16-output-denormals: true -; CHECK-NEXT: highBitsOf32BitAddress: 0 -; CHECK-NEXT: body: -define amdgpu_kernel void @kernel(i32 %arg0, i64 %arg1, <16 x i32> %arg2) { - %gep = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %arg0 - store float 0.0, float addrspace(3)* %gep, align 4 - ret void -} - -; CHECK-LABEL: {{^}}name: ps_shader -; CHECK: machineFunctionInfo: -; CHECK-NEXT: explicitKernArgSize: 0 -; CHECK-NEXT: maxKernArgAlign: 1 -; CHECK-NEXT: ldsSize: 0 -; CHECK-NEXT: isEntryFunction: true -; CHECK-NEXT: noSignedZerosFPMath: false -; CHECK-NEXT: memoryBound: false -; CHECK-NEXT: waveLimiter: false -; CHECK-NEXT: scratchRSrcReg: '$sgpr96_sgpr97_sgpr98_sgpr99' -; CHECK-NEXT: scratchWaveOffsetReg: '$sgpr101' -; CHECK-NEXT: frameOffsetReg: '$sgpr101' -; CHECK-NEXT: stackPtrOffsetReg: '$sgpr101' -; CHECK-NEXT: argumentInfo: -; CHECK-NEXT: privateSegmentWaveByteOffset: { reg: '$sgpr3' } -; CHECK-NEXT: implicitBufferPtr: { reg: '$sgpr0_sgpr1' } -; CHECK-NEXT: mode: -; CHECK-NEXT: ieee: false -; CHECK-NEXT: dx10-clamp: true -; CHECK-NEXT: fp32-input-denormals: false -; CHECK-NEXT: fp32-output-denormals: false -; CHECK-NEXT: fp64-fp16-input-denormals: true -; CHECK-NEXT: fp64-fp16-output-denormals: true -; CHECK-NEXT: highBitsOf32BitAddress: 0 -; CHECK-NEXT: body: -define amdgpu_ps void @ps_shader(i32 %arg0, i32 inreg %arg1) { - ret void -} - -; 
CHECK-LABEL: {{^}}name: function -; CHECK: machineFunctionInfo: -; CHECK-NEXT: explicitKernArgSize: 0 -; CHECK-NEXT: maxKernArgAlign: 1 -; CHECK-NEXT: ldsSize: 0 -; CHECK-NEXT: isEntryFunction: false -; CHECK-NEXT: noSignedZerosFPMath: false -; CHECK-NEXT: memoryBound: false -; CHECK-NEXT: waveLimiter: false -; CHECK-NEXT: scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' -; CHECK-NEXT: scratchWaveOffsetReg: '$sgpr33' -; CHECK-NEXT: frameOffsetReg: '$sgpr34' -; CHECK-NEXT: stackPtrOffsetReg: '$sgpr32' -; CHECK-NEXT: argumentInfo: -; CHECK-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } -; CHECK-NEXT: privateSegmentWaveByteOffset: { reg: '$sgpr33' } -; CHECK-NEXT: mode: -; CHECK-NEXT: ieee: true -; CHECK-NEXT: dx10-clamp: true -; CHECK-NEXT: fp32-input-denormals: false -; CHECK-NEXT: fp32-output-denormals: false -; CHECK-NEXT: fp64-fp16-input-denormals: true -; CHECK-NEXT: fp64-fp16-output-denormals: true -; CHECK-NEXT: highBitsOf32BitAddress: 0 -; CHECK-NEXT: body: -define void @function() { - ret void -} - -; CHECK-LABEL: {{^}}name: function_nsz -; CHECK: machineFunctionInfo: -; CHECK-NEXT: explicitKernArgSize: 0 -; CHECK-NEXT: maxKernArgAlign: 1 -; CHECK-NEXT: ldsSize: 0 -; CHECK-NEXT: isEntryFunction: false -; CHECK-NEXT: noSignedZerosFPMath: true -; CHECK-NEXT: memoryBound: false -; CHECK-NEXT: waveLimiter: false -; CHECK-NEXT: scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' -; CHECK-NEXT: scratchWaveOffsetReg: '$sgpr33' -; CHECK-NEXT: frameOffsetReg: '$sgpr34' -; CHECK-NEXT: stackPtrOffsetReg: '$sgpr32' -; CHECK-NEXT: argumentInfo: -; CHECK-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } -; CHECK-NEXT: privateSegmentWaveByteOffset: { reg: '$sgpr33' } -; CHECK-NEXT: mode: -; CHECK-NEXT: ieee: true -; CHECK-NEXT: dx10-clamp: true -; CHECK-NEXT: fp32-input-denormals: false -; CHECK-NEXT: fp32-output-denormals: false -; CHECK-NEXT: fp64-fp16-input-denormals: true -; CHECK-NEXT: fp64-fp16-output-denormals: true -; CHECK-NEXT: highBitsOf32BitAddress: 0 -; CHECK-NEXT: body: -define void @function_nsz() #0 { - ret void -} - -; CHECK-LABEL: {{^}}name: function_dx10_clamp_off -; CHECK: mode: -; CHECK-NEXT: ieee: true -; CHECK-NEXT: dx10-clamp: false -; CHECK-NEXT: fp32-input-denormals: false -; CHECK-NEXT: fp32-output-denormals: false -; CHECK-NEXT: fp64-fp16-input-denormals: true -; CHECK-NEXT: fp64-fp16-output-denormals: true -define void @function_dx10_clamp_off() #1 { - ret void -} - -; CHECK-LABEL: {{^}}name: function_ieee_off -; CHECK: mode: -; CHECK-NEXT: ieee: false -; CHECK-NEXT: dx10-clamp: true -; CHECK-NEXT: fp32-input-denormals: false -; CHECK-NEXT: fp32-output-denormals: false -; CHECK-NEXT: fp64-fp16-input-denormals: true -; CHECK-NEXT: fp64-fp16-output-denormals: true -define void @function_ieee_off() #2 { - ret void -} - -; CHECK-LABEL: {{^}}name: function_ieee_off_dx10_clamp_off -; CHECK: mode: -; CHECK-NEXT: ieee: false -; CHECK-NEXT: dx10-clamp: false -; CHECK-NEXT: fp32-input-denormals: false -; CHECK-NEXT: fp32-output-denormals: false -; CHECK-NEXT: fp64-fp16-input-denormals: true -; CHECK-NEXT: fp64-fp16-output-denormals: true -define void @function_ieee_off_dx10_clamp_off() #3 { - ret void -} - -; CHECK-LABEL: {{^}}name: high_address_bits -; CHECK: machineFunctionInfo: -; CHECK: highBitsOf32BitAddress: 4294934528 -define amdgpu_ps void @high_address_bits() #4 { - ret void -} - -attributes #0 = { "no-signed-zeros-fp-math" = "true" } -attributes #1 = { "amdgpu-dx10-clamp" = "false" } -attributes #2 = { "amdgpu-ieee" = "false" } -attributes #3 = { 
"amdgpu-dx10-clamp" = "false" "amdgpu-ieee" = "false" } -attributes #4 = { "amdgpu-32bit-address-high-bits"="0xffff8000" } diff --git a/llvm/test/CodeGen/MIR/AMDGPU/mfi-parse-error-scratch-wave-offset-reg.mir b/llvm/test/CodeGen/MIR/AMDGPU/mfi-parse-error-scratch-wave-offset-reg.mir deleted file mode 100644 --- a/llvm/test/CodeGen/MIR/AMDGPU/mfi-parse-error-scratch-wave-offset-reg.mir +++ /dev/null @@ -1,12 +0,0 @@ -# RUN: not llc -march=amdgcn -run-pass none -o /dev/null %s 2>&1 | FileCheck %s -# CHECK: :7:27: expected a named register -# CHECK: scratchWaveOffsetReg: '' ---- -name: empty_scratch_wave_offset_reg -machineFunctionInfo: - scratchWaveOffsetReg: '' -body: | - bb.0: - - S_ENDPGM -... diff --git a/llvm/test/CodeGen/MIR/AMDGPU/mfi-scratch-wave-offset-reg-class.mir b/llvm/test/CodeGen/MIR/AMDGPU/mfi-scratch-wave-offset-reg-class.mir deleted file mode 100644 --- a/llvm/test/CodeGen/MIR/AMDGPU/mfi-scratch-wave-offset-reg-class.mir +++ /dev/null @@ -1,13 +0,0 @@ -# RUN: not llc -march=amdgcn -run-pass none -o /dev/null %s 2>&1 | FileCheck %s -# CHECK: :8:33: incorrect register class for field -# CHECK: scratchWaveOffsetReg: '$vgpr0' - ---- -name: wrong_reg_class_scratch_wave_offset_reg -machineFunctionInfo: - scratchWaveOffsetReg: '$vgpr0' -body: | - bb.0: - - S_ENDPGM -... diff --git a/llvm/test/CodeGen/MIR/AMDGPU/parse-order-reserved-regs.mir b/llvm/test/CodeGen/MIR/AMDGPU/parse-order-reserved-regs.mir --- a/llvm/test/CodeGen/MIR/AMDGPU/parse-order-reserved-regs.mir +++ b/llvm/test/CodeGen/MIR/AMDGPU/parse-order-reserved-regs.mir @@ -1,6 +1,8 @@ # RUN: llc -march=amdgcn -run-pass=none -verify-machineinstrs -o - %s | FileCheck %s # RUN: llc -march=amdgcn -run-pass mir-canonicalizer -verify-machineinstrs -o - %s +# FIXME: Is this still testing anything? + # Previously getReservedRegs was called before parsing # machineFunctionInfo, but the AMDGPU implementation depends on # setting register fields to reserve there. $sgpr50 would then not be @@ -10,21 +12,19 @@ # CHECK: machineFunctionInfo: # CHECK: isEntryFunction: true # CHECK: scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' -# CHECK: scratchWaveOffsetReg: '$sgpr50' -# CHECK: frameOffsetReg: '$sgpr50' -# CHECK: renamable $vgpr0 = BUFFER_LOAD_DWORD_OFFEN %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr50, 4, 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 5) +# CHECK: renamable $vgpr0 = BUFFER_LOAD_DWORD_OFFEN %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 5) name: reserve_correct_register tracksRegLiveness: true machineFunctionInfo: isEntryFunction: true scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' - scratchWaveOffsetReg: '$sgpr50' - frameOffsetReg: '$sgpr50' + argumentInfo: + privateSegmentWaveByteOffset: { reg: '$sgpr50' } stack: - { id: 0, type: default, offset: 0, size: 4, alignment: 4 } body: | bb.0: - renamable $vgpr0 = BUFFER_LOAD_DWORD_OFFEN %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr50, 4, 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 5) + renamable $vgpr0 = BUFFER_LOAD_DWORD_OFFEN %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 5) S_ENDPGM 0 ... 
diff --git a/llvm/test/DebugInfo/AMDGPU/variable-locations.ll b/llvm/test/DebugInfo/AMDGPU/variable-locations.ll --- a/llvm/test/DebugInfo/AMDGPU/variable-locations.ll +++ b/llvm/test/DebugInfo/AMDGPU/variable-locations.ll @@ -32,7 +32,7 @@ @GlobB = common addrspace(1) global i32 0, align 4, !dbg !6 ; CHECK: {{.*}}DW_TAG_subprogram -; CHECK: DW_AT_frame_base [DW_FORM_block1] (DW_OP_reg{{.*}} SGPR9) +; CHECK-NOT: DW_AT_frame_base define amdgpu_kernel void @kernel1( ; CHECK: {{.*}}DW_TAG_formal_parameter
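; Editorial note: the wave offset no longer lives in an SGPR that could serve
; as a frame base (it happened to be SGPR9 here), so the kernel's subprogram
; now omits DW_AT_frame_base entirely, and the test checks for the attribute's
; absence rather than for a particular register.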